From 319001192d59bc57923ba3838eb83685cb3af014 Mon Sep 17 00:00:00 2001 From: Burdette Lamar Date: Tue, 18 Nov 2025 20:56:14 -0600 Subject: [PATCH] [DOC] Tweaks for String#dump and String#undump --- doc/string.rb | 3 +- doc/string/dump.rdoc | 123 +++++++++++++++++++++++++++++-------------- string.c | 12 ++--- 3 files changed, 88 insertions(+), 50 deletions(-) diff --git a/doc/string.rb b/doc/string.rb index 4304b96aee..b37cb5d324 100644 --- a/doc/string.rb +++ b/doc/string.rb @@ -322,8 +322,7 @@ # _Substitution_ # # - #dump: Returns a printable version of +self+, enclosed in double-quotes. -# - #undump: Returns a copy of +self+ with all \xNN notations replaced by \uNNNN notations -# and all escaped characters unescaped. +# - #undump: Inverse of #dump; returns a copy of +self+ with changes of the kinds made by #dump "undone." # - #sub: Returns a copy of +self+ with the first substring matching a given pattern # replaced with a given replacement string. # - #gsub: Returns a copy of +self+ with each substring that matches a given pattern diff --git a/doc/string/dump.rdoc b/doc/string/dump.rdoc index a5ab0bb42f..2ab9521540 100644 --- a/doc/string/dump.rdoc +++ b/doc/string/dump.rdoc @@ -1,52 +1,97 @@ -Returns a printable version of +self+, enclosed in double-quotes: +For an ordinary string, this method, +String#dump+, +returns a printable ASCII-only version of +self+, enclosed in double-quotes. - 'hello'.dump # => "\"hello\"" +For a dumped string, method String#undump is the inverse of +String#dump+; +it returns a "restored" version of +self+, +where all the dumping changes have been undone. 
-Certain special characters are rendered with escapes: +In the simplest case, the dumped string contains the original string, +enclosed in double-quotes; +this example is done in +irb+ (interactive Ruby), which uses method +inspect+ to render the results: - '"'.dump # => "\"\\\"\"" - '\\'.dump # => "\"\\\\\"" + s = 'hello' # => "hello" + s.dump # => "\"hello\"" + s.dump.undump # => "hello" -Non-printing characters are rendered with escapes: +Keep in mind that in the second line above: + +- The outer double-quotes are put on by +inspect+, + and _are_ _not_ part of the output of #dump. +- The inner double-quotes _are_ part of the output of +dump+, + and are escaped by +inspect+ because they are within the outer double-quotes. + +To avoid confusion, we'll use this helper method to omit the outer double-quotes: + + def dump(s) + print "String: ", s, "\n" + print "Dumped: ", s.dump, "\n" + print "Undumped: ", s.dump.undump, "\n" + end + +So that for string 'hello', we'll see: + + String: hello + Dumped: "hello" + Undumped: hello + +In a dump, certain special characters are escaped: + + String: " + Dumped: "\"" + Undumped: " + + String: \ + Dumped: "\\" + Undumped: \ + +In a dump, unprintable characters are replaced by printable ones; +the unprintable characters are the whitespace characters (other than space itself); +here we see the ordinals for those characters, together with explanatory text: + + h = { + 7 => 'Alert (BEL)', + 8 => 'Backspace (BS)', + 9 => 'Horizontal tab (HT)', + 10 => 'Linefeed (LF)', + 11 => 'Vertical tab (VT)', + 12 => 'Formfeed (FF)', + 13 => 'Carriage return (CR)' + } + +In this example, the dumped output is printed by method #inspect, +and so contains both outer double-quotes and escaped inner double-quotes: s = '' - s << 7 # Alarm (bell). - s << 8 # Back space. - s << 9 # Horizontal tab. - s << 10 # Line feed. - s << 11 # Vertical tab. - s << 12 # Form feed. - s << 13 # Carriage return. 
- s # => "\a\b\t\n\v\f\r" - s.dump # => "\"\\a\\b\\t\\n\\v\\f\\r\"" + h.keys.each {|i| s << i } # => [7, 8, 9, 10, 11, 12, 13] + s # => "\a\b\t\n\v\f\r" + s.dump # => "\"\\a\\b\\t\\n\\v\\f\\r\"" -If +self+ is encoded in UTF-8 and contains Unicode characters, renders Unicode -characters in Unicode escape sequence: +If +self+ is encoded in UTF-8 and contains Unicode characters, +each Unicode character is dumped as a Unicode escape sequence: - 'тест'.dump # => "\"\\u0442\\u0435\\u0441\\u0442\"" - 'こんにちは'.dump # => "\"\\u3053\\u3093\\u306B\\u3061\\u306F\"" + String: тест + Dumped: "\u0442\u0435\u0441\u0442" + Undumped: тест -If the encoding of +self+ is not ASCII-compatible (i.e., +self.encoding.ascii_compatible?+ -returns +false+), renders all ASCII-compatible bytes as ASCII characters and all -other bytes as hexadecimal. Appends .dup.force_encoding(\"encoding\"), where - is +self.encoding.name+: + String: こんにちは + Dumped: "\u3053\u3093\u306B\u3061\u306F" + Undumped: こんにちは - s = 'hello' - s.encoding # => # - s.dump # => "\"hello\"" - s.encode('utf-16').dump # => "\"\\xFE\\xFF\\x00h\\x00e\\x00l\\x00l\\x00o\".dup.force_encoding(\"UTF-16\")" - s.encode('utf-16le').dump # => "\"h\\x00e\\x00l\\x00l\\x00o\\x00\".dup.force_encoding(\"UTF-16LE\")" +If the encoding of +self+ is not ASCII-compatible +(i.e., if self.encoding.ascii_compatible? 
returns +false+), +each ASCII-compatible byte is dumped as an ASCII character, +and all other bytes are dumped as hexadecimal; +also appends .dup.force_encoding(\"encoding\"), +where _encoding_ is self.encoding.name: - s = 'тест' - s.encoding # => # - s.dump # => "\"\\u0442\\u0435\\u0441\\u0442\"" - s.encode('utf-16').dump # => "\"\\xFE\\xFF\\x04B\\x045\\x04A\\x04B\".dup.force_encoding(\"UTF-16\")" - s.encode('utf-16le').dump # => "\"B\\x045\\x04A\\x04B\\x04\".dup.force_encoding(\"UTF-16LE\")" + String: hello + Dumped: "\xFE\xFF\x00h\x00e\x00l\x00l\x00o".dup.force_encoding("UTF-16") + Undumped: hello - s = 'こんにちは' - s.encoding # => # - s.dump # => "\"\\u3053\\u3093\\u306B\\u3061\\u306F\"" - s.encode('utf-16').dump # => "\"\\xFE\\xFF0S0\\x930k0a0o\".dup.force_encoding(\"UTF-16\")" - s.encode('utf-16le').dump # => "\"S0\\x930k0a0o0\".dup.force_encoding(\"UTF-16LE\")" + String: тест + Dumped: "\xFE\xFF\x04B\x045\x04A\x04B".dup.force_encoding("UTF-16") + Undumped: тест -Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String]. + String: こんにちは + Dumped: "\xFE\xFF0S0\x930k0a0o".dup.force_encoding("UTF-16") + Undumped: こんにちは diff --git a/string.c b/string.c index 827555d9e0..f371e185d6 100644 --- a/string.c +++ b/string.c @@ -7628,17 +7628,11 @@ static VALUE rb_str_is_ascii_only_p(VALUE str); /* * call-seq: - * undump -> string + * undump -> new_string * - * Returns an unescaped version of +self+: - * - * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\"" - * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\"" - * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\"" - * s_undumped == s_orig # => true - * - * Related: String#dump (inverse of String#undump). + * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone." * + * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String]. */ static VALUE