Skip to content

Commit 2e9ed41

Browse files
committed
Updates to URI normalization. Fixes #439.
1 parent 60363a4 commit 2e9ed41

2 files changed

Lines changed: 76 additions & 50 deletions

File tree

lib/rdf/model/uri.rb

Lines changed: 35 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
# coding: utf-8
2+
# frozen_string_literal: true
23
require 'cgi'
34

45
module RDF
@@ -116,7 +117,17 @@ class URI
116117
# Note: not all reserved characters need to be escaped in SPARQL/Turtle, but they must be unescaped when encountered
117118
PN_ESCAPE_CHARS = /[~\.!\$&'\(\)\*\+,;=\/\?\#@%]/.freeze
118119
PN_ESCAPES = /\\#{Regexp.union(PN_ESCAPE_CHARS, /[\-_]/)}/.freeze
119-
120+
121+
# For URI encoding
122+
ENCODE_USER = Regexp.compile("[^#{IUNRESERVED}#{SUB_DELIMS}]").freeze
123+
ENCODE_PASSWORD = Regexp.compile("[^#{IUNRESERVED}#{SUB_DELIMS}]").freeze
124+
ENCODE_ISEGMENT = Regexp.compile("[^#{IPCHAR}]").freeze
125+
ENCODE_ISEGMENT_NC = Regexp.compile("[^#{IUNRESERVED}|#{PCT_ENCODED}|[#{SUB_DELIMS}]|@]").freeze
126+
ENCODE_IQUERY = Regexp.compile("[^#{IQUERY}]").freeze
127+
ENCODE_IFRAGMENT = Regexp.compile("[^#{IFRAGMENT}]").freeze
128+
ENCODE_PORT = Regexp.compile('[^\d]').freeze
129+
ENCODE_IHOST = Regexp.compile("(?:#{IP_literal})|(?:#{IREG_NAME})").freeze
130+
120131
##
121132
# Cache size may be set through {RDF.config} using `uri_cache_size`.
122133
#
@@ -170,7 +181,7 @@ def self.parse(str)
170181
# @return [String] normalized path
171182
# @see http://tools.ietf.org/html/rfc3986#section-5.2.4
172183
def self.normalize_path(path)
173-
output, input = "", path.to_s
184+
output, input = String.new, path.to_s
174185
if input.encoding != Encoding::ASCII_8BIT
175186
input = input.dup.force_encoding(Encoding::ASCII_8BIT)
176187
end
@@ -353,7 +364,7 @@ def length
353364
# @return [Boolean] `true` or `false`
354365
# @since 0.3.9
355366
def valid?
356-
RDF::URI::IRI.match(to_s) || false
367+
RDF::URI::IRI.match?(to_s) || false
357368
end
358369

359370
##
@@ -920,7 +931,7 @@ def scheme=(value)
920931
# Return normalized version of scheme, if any
921932
# @return [String]
922933
def normalized_scheme
923-
normalize_segment(scheme.strip, SCHEME, true) if scheme
934+
scheme.strip.downcase if scheme
924935
end
925936

926937
##
@@ -946,7 +957,7 @@ def user=(value)
946957
# Normalized version of user
947958
# @return [String]
948959
def normalized_user
949-
URI.encode(CGI.unescape(user), /[^#{IUNRESERVED}|#{SUB_DELIMS}]/).force_encoding(Encoding::UTF_8) if user
960+
URI.encode(CGI.unescape(user), ENCODE_USER).force_encoding(Encoding::UTF_8) if user
950961
end
951962

952963
##
@@ -972,7 +983,7 @@ def password=(value)
972983
# Normalized version of password
973984
# @return [String]
974985
def normalized_password
975-
URI.encode(CGI.unescape(password), /[^#{IUNRESERVED}|#{SUB_DELIMS}]/).force_encoding(Encoding::UTF_8) if password
986+
URI.encode(CGI.unescape(password), ENCODE_PASSWORD).force_encoding(Encoding::UTF_8) if password
976987
end
977988

978989
HOST_FROM_AUTHORITY_RE = /(?:[^@]+@)?([^:]+)(?::.*)?$/.freeze
@@ -1000,7 +1011,7 @@ def host=(value)
10001011
# @return [String]
10011012
def normalized_host
10021013
# Remove trailing '.' characters
1003-
normalize_segment(host, IHOST, true).chomp('.') if host
1014+
host.sub(/\.*$/, '').downcase if host
10041015
end
10051016

10061017
PORT_FROM_AUTHORITY_RE = /:(\d+)$/.freeze
@@ -1028,12 +1039,8 @@ def port=(value)
10281039
# @return [String]
10291040
def normalized_port
10301041
if port
1031-
np = normalize_segment(port.to_s, PORT)
1032-
if PORT_MAPPING[normalized_scheme] == np.to_i
1033-
nil
1034-
else
1035-
np.to_i
1036-
end
1042+
np = port.to_i
1043+
PORT_MAPPING[normalized_scheme] != np ? np : nil
10371044
end
10381045
end
10391046

@@ -1069,25 +1076,25 @@ def normalized_path
10691076
norm_segs = case
10701077
when authority
10711078
# ipath-abempty
1072-
segments.map {|s| normalize_segment(s, ISEGMENT)}
1079+
segments.map {|s| normalize_segment(s, ENCODE_ISEGMENT)}
10731080
when segments[0].nil?
10741081
# ipath-absolute
10751082
res = [nil]
1076-
res << normalize_segment(segments[1], ISEGMENT_NZ) if segments.length > 1
1077-
res += segments[2..-1].map {|s| normalize_segment(s, ISEGMENT)} if segments.length > 2
1083+
res << normalize_segment(segments[1], ENCODE_ISEGMENT) if segments.length > 1
1084+
res += segments[2..-1].map {|s| normalize_segment(s, ENCODE_ISEGMENT)} if segments.length > 2
10781085
res
10791086
when segments[0].to_s.index(':')
10801087
# ipath-noscheme
10811088
res = []
1082-
res << normalize_segment(segments[0], ISEGMENT_NZ_NC)
1083-
res += segments[1..-1].map {|s| normalize_segment(s, ISEGMENT)} if segments.length > 1
1089+
res << normalize_segment(segments[0], ENCODE_ISEGMENT_NC)
1090+
res += segments[1..-1].map {|s| normalize_segment(s, ENCODE_ISEGMENT)} if segments.length > 1
10841091
res
10851092
when segments[0]
10861093
# ipath-rootless
10871094
# ipath-noscheme
10881095
res = []
1089-
res << normalize_segment(segments[0], ISEGMENT_NZ)
1090-
res += segments[1..-1].map {|s| normalize_segment(s, ISEGMENT)} if segments.length > 1
1096+
res << normalize_segment(segments[0], ENCODE_ISEGMENT)
1097+
res += segments[1..-1].map {|s| normalize_segment(s, ENCODE_ISEGMENT)} if segments.length > 1
10911098
res
10921099
else
10931100
# Should be empty
@@ -1096,7 +1103,7 @@ def normalized_path
10961103

10971104
res = self.class.normalize_path(norm_segs.join("/"))
10981105
# Special rules for specific protocols having empty paths
1099-
normalize_segment(res.empty? ? (%w(http https ftp tftp).include?(normalized_scheme) ? '/' : "") : res, IHIER_PART)
1106+
res = (res.empty? && %w(http https ftp tftp).include?(normalized_scheme)) ? '/' : res
11001107
end
11011108

11021109
##
@@ -1120,7 +1127,7 @@ def query=(value)
11201127
# Normalized version of query
11211128
# @return [String]
11221129
def normalized_query
1123-
normalize_segment(query, IQUERY) if query
1130+
normalize_segment(query, ENCODE_IQUERY) if query
11241131
end
11251132

11261133
##
@@ -1144,7 +1151,7 @@ def fragment=(value)
11441151
# Normalized version of fragment
11451152
# @return [String]
11461153
def normalized_fragment
1147-
normalize_segment(fragment, IFRAGMENT) if fragment
1154+
normalize_segment(fragment, ENCODE_IFRAGMENT) if fragment
11481155
end
11491156

11501157
##
@@ -1274,15 +1281,15 @@ def query_values=(value)
12741281
self.query = case value
12751282
when Array, Hash
12761283
value.map do |(k,v)|
1277-
k = normalize_segment(k.to_s, UNRESERVED)
1284+
k = normalize_segment(k.to_s, /[^A-Za-z0-9\._~-]/)
12781285
if v.nil?
12791286
k
12801287
else
12811288
Array(v).map do |vv|
12821289
if vv === TrueClass
12831290
k
12841291
else
1285-
"#{k}=#{normalize_segment(vv.to_s, UNRESERVED)}"
1292+
"#{k}=#{normalize_segment(vv.to_s, /[^A-Za-z0-9\._~-]/)}"
12861293
end
12871294
end.join("&")
12881295
end
@@ -1331,15 +1338,15 @@ def self._load(data)
13311338
# Normalize a segment using a character range
13321339
#
13331340
# @param [String] value
1334-
# @param [Regexp] expr
1341+
# @param [Regexp] expr matches characters to be encoded
13351342
# @param [Boolean] downcase
13361343
# @return [String]
13371344
def normalize_segment(value, expr, downcase = false)
13381345
if value
13391346
value = value.dup.force_encoding(Encoding::UTF_8)
13401347
decoded = CGI.unescape(value)
13411348
decoded.downcase! if downcase
1342-
URI.encode(decoded, /[^(?:#{expr})]/).force_encoding(Encoding::UTF_8)
1349+
URI.encode(decoded, expr).force_encoding(Encoding::UTF_8)
13431350
end
13441351
end
13451352

@@ -1364,7 +1371,7 @@ def format_authority
13641371
def self.encode(str, expr)
13651372
str.gsub(expr) do
13661373
us = $&
1367-
tmp = ''
1374+
tmp = String.new
13681375
us.each_byte do |uc|
13691376
tmp << sprintf('%%%02X', uc)
13701377
end

spec/model_uri_spec.rb

Lines changed: 41 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -317,64 +317,64 @@
317317
%W(\U00000053 Dürst AZazÀÖØöø˿Ͱͽ΄῾‌‍⁰↉Ⰰ⿕、ퟻ﨎ﷇﷰ￯𐀀𪘀)
318318
}
319319
{
320-
"" => "%s",
321-
"and query" => "%s?%s",
322-
"and fragment" => "%s#%s",
323-
"and query and fragment" => "%s?%s#%s",
320+
"" => "%{p1}",
321+
"and query" => "%{p1}?%{p2}",
322+
"and fragment" => "%{p1}#%{p2}",
323+
"and query and fragment" => "%{p1}?%{p2}#%{p3}",
324324
}.each do |mod, fmt|
325325
it "validates IRI with authority and ipath-abempty #{mod}" do
326326
refs.each do |c|
327-
expect(RDF::URI("scheme://auth/#{fmt}" % ["", c, c])).to be_valid
328-
expect(RDF::URI("scheme://auth/#{fmt}" % [c, c, c])).to be_valid
329-
expect(RDF::URI("scheme://auth/#{fmt}" % ["#{c}/#{c}", c, c])).to be_valid
327+
expect(RDF::URI("scheme://auth/#{fmt}" % {p1: "", p2: c, p3: c})).to be_valid
328+
expect(RDF::URI("scheme://auth/#{fmt}" % {p1: c, p2: c, p3: c})).to be_valid
329+
expect(RDF::URI("scheme://auth/#{fmt}" % {p1: "#{c}/#{c}", p2: c, p3: c})).to be_valid
330330
end
331331
end
332332
it "validates IRI with path-absolute #{mod}" do
333333
refs.each do |c|
334-
expect(RDF::URI("scheme:/#{fmt}" % ["", c, c])).to be_valid
335-
expect(RDF::URI("scheme:/#{fmt}" % [c, c, c])).to be_valid
336-
expect(RDF::URI("scheme:/#{fmt}" % ["#{c}/#{c}", c, c])).to be_valid
334+
expect(RDF::URI("scheme:/#{fmt}" % {p1: "", p2: c, p3: c})).to be_valid
335+
expect(RDF::URI("scheme:/#{fmt}" % {p1: c, p2: c, p3: c})).to be_valid
336+
expect(RDF::URI("scheme:/#{fmt}" % {p1: "#{c}/#{c}", p2: c, p3: c})).to be_valid
337337
end
338338
end
339339
it "validates IRI with ipath-rootless #{mod}" do
340340
refs.each do |c|
341-
expect(RDF::URI("scheme:#{fmt}" % [c, c, c])).to be_valid
342-
expect(RDF::URI("scheme:#{fmt}" % ["#{c}/#{c}", c, c])).to be_valid
341+
expect(RDF::URI("scheme:#{fmt}" % {p1: c, p2: c, p3: c})).to be_valid
342+
expect(RDF::URI("scheme:#{fmt}" % {p1: "#{c}/#{c}", p2: c, p3: c})).to be_valid
343343
end
344344
end
345345
it "validates IRI with ipath-empty #{mod}" do
346346
refs.each do |c|
347-
expect(RDF::URI("scheme:#{fmt}" % ["", c, c])).to be_valid
347+
expect(RDF::URI("scheme:#{fmt}" % {p1: "", p2: c, p3: c})).to be_valid
348348
end
349349
end
350350

351351
it "invalidates irelative-ref with authority #{mod}" do
352352
refs.each do |c|
353-
expect(RDF::URI("//auth/#{fmt}" % [c, c, c])).not_to be_valid
353+
expect(RDF::URI("//auth/#{fmt}" % {p1: c, p2: c, p3: c})).not_to be_valid
354354
end
355355
end
356356
it "invalidates irelative-ref with authority and port #{mod}" do
357357
refs.each do |c|
358-
expect(RDF::URI("//auth:123/#{fmt}" % [c, c, c])).not_to be_valid
358+
expect(RDF::URI("//auth:123/#{fmt}" % {p1: c, p2: c, p3: c})).not_to be_valid
359359
end
360360
end
361361
it "invalidates irelative-ref with ipath-absolute #{mod}" do
362362
refs.each do |c|
363-
expect(RDF::URI("/#{fmt}" % [c, c, c])).not_to be_valid
364-
expect(RDF::URI("/#{fmt}" % ["#{c}/", c, c])).not_to be_valid
365-
expect(RDF::URI("/#{fmt}" % ["#{c}/#{c}", c, c])).not_to be_valid
363+
expect(RDF::URI("/#{fmt}" % {p1: c, p2: c, p3: c})).not_to be_valid
364+
expect(RDF::URI("/#{fmt}" % {p1: "#{c}/", p2: c, p3: c})).not_to be_valid
365+
expect(RDF::URI("/#{fmt}" % {p1: "#{c}/#{c}", p2: c, p3: c})).not_to be_valid
366366
end
367367
end
368368
it "invalidates irelative-ref with ipath-noscheme #{mod}" do
369369
refs.each do |c|
370-
expect(RDF::URI("#{fmt}" % [c, c, c])).not_to be_valid
371-
expect(RDF::URI("#{fmt}" % ["#{c}/", c, c])).not_to be_valid
372-
expect(RDF::URI("#{fmt}" % ["#{c}/#{c}", c, c])).not_to be_valid
370+
expect(RDF::URI("#{fmt}" % {p1: c, p2: c, p3: c})).not_to be_valid
371+
expect(RDF::URI("#{fmt}" % {p1: "#{c}/", p2: c, p3: c})).not_to be_valid
372+
expect(RDF::URI("#{fmt}" % {p1: "#{c}/#{c}", p2: c, p3: c})).not_to be_valid
373373
end
374374
end
375375
it "invalidates irelative-ref with ipath-empty #{mod}" do
376376
refs.each do |c|
377-
expect(RDF::URI("#{fmt}" % ["", c, c])).not_to be_valid
377+
expect(RDF::URI("#{fmt}" % {p1: "", p2: c, p3: c})).not_to be_valid
378378
end
379379
end
380380
end
@@ -386,6 +386,25 @@
386386
expect(RDF::URI("http://example/#{c}")).not_to be_valid
387387
end
388388
end
389+
390+
[
391+
'file:///path/to/file with spaces.txt',
392+
'scheme://auth/\u0000',
393+
'scheme://auth/\u005C',
394+
'scheme://auth/\u005E',
395+
'scheme://auth/\u0060',
396+
'scheme://auth/\\u0000',
397+
'scheme://auth/\\u005C',
398+
'scheme://auth/\\u005E',
399+
'scheme://auth/\\u0060',
400+
'scheme://auth/^',
401+
'scheme://auth/`',
402+
'scheme://auth/\\',
403+
].each do |u|
404+
it "does not validate <#{u}>" do
405+
expect(RDF::URI(u)).not_to be_valid
406+
end
407+
end
389408
end
390409

391410
describe "#invalid?" do

0 commit comments

Comments
 (0)