Skip to content

Commit 2171105

Browse files
committed
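Fix bug in NTriples::Reader.unescape where it was overeager: escapes were decoded in two separate passes, so an escaped backslash followed by `u0039` (`\\u0039`) was first reduced to `\u0039` and then wrongly decoded again to `9`. It now uses a string scanner to iterate through the string buffer once, so each escape sequence is decoded exactly once.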
1 parent f859e03 commit 2171105

2 files changed

Lines changed: 18 additions & 21 deletions

File tree

lib/rdf/ntriples/reader.rb

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ def self.parse_node(input, **options)
135135
# @return [RDF::URI]
136136
def self.parse_uri(input, intern: false, **options)
  # Yields nil (the value of the unmatched `if`) when `input` does not match
  # URIREF; otherwise builds the URI from the first capture group.
  if input =~ URIREF
    # `intern: true` routes through RDF::URI.intern instead of RDF::URI.new.
    # The captured text is unescaped exactly once; no intermediate local is
    # kept, since nothing else in this method reads it.
    RDF::URI.send(intern ? :intern : :new, unescape($1))
  end
end
@@ -178,26 +177,23 @@ def self.parse_literal(input, **options)
178177
def self.unescape(string)
  # Decodes the escape sequences matched by ESCAPE_CHARS_ESCAPED_REGEXP and
  # UCHAR in a single left-to-right pass and returns the result as a new
  # UTF-8 string. The input is never mutated, so frozen strings are accepted.
  #
  # A single pass matters: once a sequence has been consumed, its output is
  # never re-examined, so an escaped backslash followed by "u0039" stays a
  # literal backslash plus "u0039" instead of being decoded a second time.

  # Re-tag non-UTF-8 input on a private copy so the caller's string keeps its
  # original encoding.
  string = string.dup.force_encoding(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
  scanner = StringScanner.new(string)

  # String.new (rather than a "" literal) guarantees a mutable, UTF-8 buffer
  # even when the file is compiled with `# frozen_string_literal: true`.
  buffer = String.new(encoding: Encoding::UTF_8)

  until scanner.eos?
    buffer <<
      if scanner.scan(ESCAPE_CHARS_ESCAPED_REGEXP)
        # Character escape: look up its replacement in the table.
        ESCAPE_CHARS_ESCAPED[scanner.matched]
      elsif scanner.scan(UCHAR)
        # Code point escape: the hex digits are in capture group 1 or 2,
        # whichever UCHAR alternative matched.
        [(scanner[1] || scanner[2]).hex].pack('U*')
      else
        # Not an escape sequence: copy one character through unchanged.
        scanner.getch
      end
  end

  buffer
end
202198

203199
##
@@ -257,15 +253,15 @@ def read_uriref(intern: false, **options)
257253
uri.canonicalize! if canonicalize?
258254
uri
259255
end
260-
rescue ArgumentError => e
256+
rescue ArgumentError
261257
log_error("Invalid URI (found: \"<#{uri_str}>\")", lineno: lineno, token: "<#{uri_str}>", exception: RDF::ReaderError)
262258
end
263259

264260
##
265261
# @return [RDF::Node]
266262
# @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (nodeID)
267263
def read_node
268-
if node_id = match(NODEID)
264+
if node_id = match(NODEID)
269265
@nodes ||= {}
270266
@nodes[node_id] ||= RDF::Node.new(node_id)
271267
end

spec/ntriples_spec.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,13 @@
213213
"_\\u6C34_" => "_\xE6\xB0\xB4_",
214214
"\\u677E\\u672C \\u540E\\u5B50"=> "松本 后子",
215215
"D\\u00FCrst" => "Dürst",
216+
"\\u0039" => "9",
217+
"\\\\u0039" => "\\u0039",
216218
}
217219
strings.each do |string, unescaped|
218220
specify string do
219221
unescaped = unescaped.encode(Encoding::UTF_8)
220222
expect(reader.unescape(string.freeze)).to eq unescaped
221-
222223
end
223224
end
224225
end

0 commit comments

Comments
 (0)