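Fix bug in NTriples::Reader.unescape where decoding was overeager: the two sequential gsub passes could decode the same text twice (e.g. an escaped backslash followed by `u0039` came out as `9` rather than a literal `\u0039`). It now uses a StringScanner to walk the input once, appending each decoded piece to a new buffer.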

gkellogg · gkellogg · commit 21711058797d · 2019-01-20T15:45:34.000-08:00
diff --git a/lib/rdf/ntriples/reader.rb b/lib/rdf/ntriples/reader.rb
@@ -135,7 +135,6 @@ def self.parse_node(input, **options)
     # @return [RDF::URI]
     def self.parse_uri(input, intern: false, **options)
       if input =~ URIREF
-        uri_str = unescape($1)
         RDF::URI.send(intern ? :intern : :new, unescape($1))
       end
     end
@@ -178,26 +177,23 @@ def self.parse_literal(input, **options)
     def self.unescape(string)
       # Note: avoiding copying the input string when no escaping is needed
       # greatly reduces the number of allocations and the processing time.
-      unless string.encoding == Encoding::UTF_8
-        string = string.dup.force_encoding(Encoding::UTF_8)
-      end
-
-      has_escape_chars = ESCAPE_CHARS_ESCAPED_REGEXP.match?(string)
-      has_uchar = UCHAR.match?(string)
-
-      string = string.dup if has_escape_chars || has_uchar
+      string = string.dup.force_encoding(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
+      scanner = StringScanner.new(string)
 
-      # Decode \t|\n|\r|\"\'\|\\ character escapes using Regexp:
-      string.gsub!(ESCAPE_CHARS_ESCAPED_REGEXP) do
-        ESCAPE_CHARS_ESCAPED.fetch($~[0])
-      end if has_escape_chars
+      buffer = ""
 
-      # Decode \uXXXX and \UXXXXXXXX code points:
-      string.gsub!(UCHAR) do
-        [($1 || $2).hex].pack('U*')
-      end if has_uchar
+      while !scanner.eos?
+        buffer << if scanner.scan(ESCAPE_CHARS_ESCAPED_REGEXP)
+          ESCAPE_CHARS_ESCAPED[scanner.matched]
+        elsif scanner.scan(UCHAR)
+          scanner.matched.sub(UCHAR) {[($1 || $2).hex].pack('U*')}
+        else
+          # Scan one character
+          scanner.getch
+        end
+      end
 
-      string
+      buffer
     end
 
     ##
@@ -257,15 +253,15 @@ def read_uriref(intern: false, **options)
         uri.canonicalize! if canonicalize?
         uri
       end
-    rescue ArgumentError => e
+    rescue ArgumentError
       log_error("Invalid URI (found: \"<#{uri_str}>\")", lineno: lineno, token: "<#{uri_str}>", exception: RDF::ReaderError)
     end
 
     ##
     # @return [RDF::Node]
     # @see    http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (nodeID)
     def read_node
-       if node_id = match(NODEID)
+      if node_id = match(NODEID)
         @nodes ||= {}
         @nodes[node_id] ||= RDF::Node.new(node_id)
       end
diff --git a/spec/ntriples_spec.rb b/spec/ntriples_spec.rb
@@ -213,12 +213,13 @@
         "_\\u6C34_"                    => "_\xE6\xB0\xB4_",
         "\\u677E\\u672C \\u540E\\u5B50"=> "松本 后子",
         "D\\u00FCrst"                  => "Dürst",
+        "\\u0039"                      => "9",
+        "\\\\u0039"                    => "\\u0039",
       }
       strings.each do |string, unescaped|
         specify string do
           unescaped = unescaped.encode(Encoding::UTF_8)
           expect(reader.unescape(string.freeze)).to eq unescaped
-
         end
       end
     end