Skip to content

Commit b695f0d

Browse files
committed
add --encoding-compatibility mode to prevent exceptions on operations between utf-8 and ascii-8bit strings
1 parent 28ad07f commit b695f0d

8 files changed

Lines changed: 217 additions & 10 deletions

File tree

enc/trans/single_byte.trans

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
<%
44
us_ascii_map = [["{00-7f}", :nomap]]
5+
binary_map = [["{00-ff}", :nomap]]
56

67
transcode_tblgen "US-ASCII", "UTF-8", us_ascii_map
78
transcode_tblgen "UTF-8", "US-ASCII", us_ascii_map
89
transcode_tblgen "ASCII-8BIT", "UTF-8", us_ascii_map
910
transcode_tblgen "UTF-8", "ASCII-8BIT", us_ascii_map
11+
transcode_tblgen "UTF-8-COMPAT", "ASCII-8BIT", binary_map, '{00-ff}'
1012

1113
CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|c| ["%02X" % c, c] }
1214

@@ -84,8 +86,14 @@
8486

8587
<%= transcode_generated_code %>
8688

89+
extern int rb_encoding_compat;
90+
8791
TRANS_INIT(single_byte)
8892
{
93+
if (rb_encoding_compat) {
94+
((struct rb_transcoder *)&rb_from_ASCII_8BIT)->conv_tree_start = from_UTF_8_COMPAT_to_ASCII_8BIT;
95+
((struct rb_transcoder *) &rb_to_ASCII_8BIT)->conv_tree_start = from_UTF_8_COMPAT_to_ASCII_8BIT;
96+
}
8997
<%= transcode_register_code %>
9098
}
9199

encoding.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ void rb_encdb_set_unicode(int index);
3333
#pragma GCC visibility pop
3434
#endif
3535

36+
int rb_encoding_compat;
37+
3638
static ID id_encoding;
3739
VALUE rb_cEncoding;
3840
static VALUE rb_encoding_list;
@@ -899,6 +901,14 @@ rb_enc_compatible(VALUE str1, VALUE str2)
899901
if (cr2 == ENC_CODERANGE_7BIT) {
900902
return enc1;
901903
}
904+
if (rb_encoding_compat) {
905+
if (idx1 == ENCINDEX_UTF_8 && idx2 == ENCINDEX_ASCII) {
906+
return enc2;
907+
}
908+
else if (idx1 == ENCINDEX_ASCII && idx2 == ENCINDEX_UTF_8) {
909+
return enc1;
910+
}
911+
}
902912
}
903913
if (cr1 == ENC_CODERANGE_7BIT)
904914
return enc2;
@@ -991,7 +1001,10 @@ rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
9911001
rb_raise(rb_eArgError, "empty string");
9921002
r = rb_enc_precise_mbclen(p, e, enc);
9931003
if (!MBCLEN_CHARFOUND_P(r)) {
994-
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1004+
if (rb_encoding_compat && enc == rb_utf8_encoding() && MBCLEN_CHARFOUND_P(r = rb_enc_precise_mbclen(p, e, rb_ascii8bit_encoding())))
1005+
enc = rb_ascii8bit_encoding();
1006+
else
1007+
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
9951008
}
9961009
if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
9971010
return rb_enc_mbc_to_codepoint(p, e, enc);
@@ -1480,6 +1493,12 @@ get_default_internal(VALUE klass)
14801493
return rb_enc_default_internal();
14811494
}
14821495

1496+
static VALUE
1497+
rb_enc_compat_mode_enabled_p(VALUE klass)
1498+
{
1499+
return rb_encoding_compat ? Qtrue : Qfalse;
1500+
}
1501+
14831502
void
14841503
rb_enc_set_default_internal(VALUE encoding)
14851504
{
@@ -1907,6 +1926,9 @@ Init_Encoding(void)
19071926
for (i = 0; i < enc_table.count; ++i) {
19081927
rb_ary_push(list, enc_new(enc_table.list[i].enc));
19091928
}
1929+
1930+
rb_const_set(rb_cEncoding, rb_intern_const("COMPAT_MODE_AVAILABLE"), Qtrue);
1931+
rb_define_singleton_method(rb_cEncoding, "compat_mode_enabled?", rb_enc_compat_mode_enabled_p, 0);
19101932
}
19111933

19121934
/* locale insensitive ctype functions */

include/ruby/encoding.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ RUBY_SYMBOL_EXPORT_BEGIN
6868
ENC_CODERANGE_SET(rb_encoding_coderange_obj, (cr)); \
6969
} while (0)
7070

71+
extern int rb_encoding_compat;
7172
typedef OnigEncodingType rb_encoding;
7273

7374
int rb_char_to_option_kcode(int c, int *option, int *kcode);

re.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,15 +1272,16 @@ static rb_encoding*
12721272
rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
12731273
{
12741274
rb_encoding *enc = 0;
1275+
enc = rb_enc_get(str);
12751276

12761277
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
1277-
rb_raise(rb_eArgError,
1278-
"invalid byte sequence in %s",
1279-
rb_enc_name(rb_enc_get(str)));
1278+
if (!(rb_encoding_compat && enc == rb_utf8_encoding()))
1279+
rb_raise(rb_eArgError,
1280+
"invalid byte sequence in %s",
1281+
rb_enc_name(rb_enc_get(str)));
12801282
}
12811283

12821284
rb_reg_check(re);
1283-
enc = rb_enc_get(str);
12841285
if (!rb_enc_str_asciicompat_p(str)) {
12851286
if (RREGEXP(re)->ptr->enc != enc) {
12861287
reg_enc_error(re, str);
@@ -1290,15 +1291,21 @@ rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
12901291
if (RREGEXP(re)->ptr->enc != enc &&
12911292
(!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
12921293
rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
1294+
if (rb_encoding_compat &&
1295+
((RREGEXP(re)->ptr->enc == rb_ascii8bit_encoding() && enc == rb_utf8_encoding()) ||
1296+
(enc == rb_ascii8bit_encoding() && RREGEXP(re)->ptr->enc == rb_utf8_encoding()))) {
1297+
return rb_ascii8bit_encoding();
1298+
}
12931299
reg_enc_error(re, str);
12941300
}
12951301
enc = RREGEXP(re)->ptr->enc;
12961302
}
12971303
if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
12981304
enc != rb_ascii8bit_encoding() &&
12991305
rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1300-
rb_warn("regexp match /.../n against to %s string",
1301-
rb_enc_name(enc));
1306+
if (!(rb_encoding_compat && enc == rb_utf8_encoding()))
1307+
rb_warn("regexp match /.../n against to %s string",
1308+
rb_enc_name(enc));
13021309
}
13031310
return enc;
13041311
}

ruby.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,6 +1117,9 @@ proc_options(long argc, char **argv, struct cmdline_options *opt, int envopt)
11171117
set_source_encoding_once(opt, s, 0);
11181118
}
11191119
#endif
1120+
else if (strcmp("encoding-compatibility", s) == 0) {
1121+
rb_encoding_compat = 1;
1122+
}
11201123
else if (strcmp("version", s) == 0) {
11211124
if (envopt) goto noenvopt_long;
11221125
opt->dump |= DUMP_BIT(version);

string.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2161,7 +2161,12 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
21612161
if (ptr_cr_ret)
21622162
*ptr_cr_ret = ptr_cr;
21632163

2164-
if (str_encindex != ptr_encindex &&
2164+
if (rb_encoding_compat &&
2165+
((str_encindex == rb_utf8_encindex() && ptr_encindex == rb_ascii8bit_encindex()) ||
2166+
(str_encindex == rb_ascii8bit_encindex() && ptr_encindex == rb_utf8_encindex()))) {
2167+
/* fall through to conditional below */
2168+
}
2169+
else if (str_encindex != ptr_encindex &&
21652170
str_cr != ENC_CODERANGE_7BIT &&
21662171
ptr_cr != ENC_CODERANGE_7BIT) {
21672172
incompatible:
@@ -2170,7 +2175,14 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
21702175
rb_enc_name(rb_enc_from_index(ptr_encindex)));
21712176
}
21722177

2173-
if (str_cr == ENC_CODERANGE_UNKNOWN) {
2178+
if (rb_encoding_compat &&
2179+
str_encindex != ptr_encindex &&
2180+
str_cr != ENC_CODERANGE_7BIT && ptr_cr != ENC_CODERANGE_7BIT) {
2181+
/* from fall through above */
2182+
res_encindex = rb_ascii8bit_encindex();
2183+
res_cr = ENC_CODERANGE_VALID;
2184+
}
2185+
else if (str_cr == ENC_CODERANGE_UNKNOWN) {
21742186
res_encindex = str_encindex;
21752187
res_cr = ENC_CODERANGE_UNKNOWN;
21762188
}
@@ -2383,6 +2395,8 @@ rb_str_hash(VALUE str)
23832395
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
23842396
e = 0;
23852397
}
2398+
if (rb_encoding_compat && (e == rb_utf8_encindex() || e == rb_ascii8bit_encindex()))
2399+
e = 0;
23862400
return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
23872401
}
23882402

@@ -2437,6 +2451,11 @@ rb_str_comparable(VALUE str1, VALUE str2)
24372451
if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
24382452
return TRUE;
24392453
}
2454+
if (rb_encoding_compat &&
2455+
((idx1 == rb_utf8_encindex() && idx2 == rb_ascii8bit_encindex()) ||
2456+
(idx1 == rb_ascii8bit_encindex() && idx2 == rb_utf8_encindex()))) {
2457+
return TRUE;
2458+
}
24402459
return FALSE;
24412460
}
24422461

@@ -6286,7 +6305,8 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
62866305
long slen = RSTRING_LEN(spat);
62876306

62886307
if (is_broken_string(str)) {
6289-
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
6308+
if (!(rb_encoding_compat && STR_ENC_GET(str) == rb_utf8_encoding()))
6309+
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
62906310
}
62916311
if (is_broken_string(spat)) {
62926312
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
test/ruby/enc/compatibility_mode.rb

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# coding: utf-8
2+
require 'test/unit'
3+
4+
class BinaryUTF8CompatTest < Test::Unit::TestCase
5+
def setup
6+
@binary_mb = 'héllø'.force_encoding('binary')
7+
@utf8_mb = 'héllø'.force_encoding('utf-8')
8+
@binary_as = 'hello'.force_encoding('binary')
9+
@utf8_as = 'hello'.force_encoding('utf-8')
10+
end
11+
12+
def test_encode_utf8_to_binary
13+
assert_nothing_raised do
14+
@utf8_mb.encode('binary')
15+
end
16+
end
17+
18+
def test_encode_binary_to_utf8
19+
assert_nothing_raised do
20+
@binary_mb.encode('utf-8')
21+
end
22+
end
23+
24+
def test_invalid_byte_seq
25+
email = "\xD0\xEE\xEC\xE0\xF8\xEA\xE0@MyAcerPC.(none)".force_encoding('UTF-8')
26+
assert_nothing_raised do
27+
email.strip
28+
email.split("\0")
29+
email.split(/\s+/)
30+
end
31+
end
32+
33+
def test_equal_contents
34+
assert_equal @binary_mb, @utf8_mb
35+
end
36+
37+
def test_hash_lookups
38+
hash = {}
39+
hash[@binary_mb] = 1
40+
assert_equal 1, hash[@utf8_mb]
41+
end
42+
43+
def test_match_binary_regexp
44+
assert_nothing_raised do
45+
assert_equal 0, Regexp.new(@binary_mb) =~ @utf8_mb
46+
end
47+
end
48+
49+
def test_match_utf8_regexp
50+
assert_nothing_raised do
51+
assert_equal 0, Regexp.new(@utf8_mb) =~ @binary_mb
52+
end
53+
end
54+
55+
def test_add_binary
56+
ret = @binary_mb + @binary_mb
57+
assert_equal Encoding::ASCII_8BIT, ret.encoding
58+
ret = @binary_mb + @binary_as
59+
assert_equal Encoding::ASCII_8BIT, ret.encoding
60+
ret = @binary_as + @binary_mb
61+
assert_equal Encoding::ASCII_8BIT, ret.encoding
62+
ret = @binary_as + @binary_as
63+
assert_equal Encoding::ASCII_8BIT, ret.encoding
64+
end
65+
66+
def test_add_utf8
67+
ret = @utf8_mb + @utf8_mb
68+
assert_equal Encoding::UTF_8, ret.encoding
69+
ret = @utf8_mb + @utf8_as
70+
assert_equal Encoding::UTF_8, ret.encoding
71+
ret = @utf8_as + @utf8_mb
72+
assert_equal Encoding::UTF_8, ret.encoding
73+
ret = @utf8_as + @utf8_as
74+
assert_equal Encoding::UTF_8, ret.encoding
75+
end
76+
77+
def test_add_utf8_plus_7bit
78+
ret = @binary_as + @utf8_as
79+
assert_equal Encoding::ASCII_8BIT, ret.encoding
80+
ret = @binary_as + @utf8_mb
81+
assert_equal Encoding::UTF_8, ret.encoding
82+
ret = @utf8_as + @binary_as
83+
assert_equal Encoding::UTF_8, ret.encoding
84+
ret = @utf8_mb + @binary_as
85+
assert_equal Encoding::UTF_8, ret.encoding
86+
end
87+
88+
def test_add_8bit_plus_utf8
89+
ret = @binary_mb + @utf8_mb
90+
assert_equal Encoding::ASCII_8BIT, ret.encoding
91+
ret = @binary_mb + @utf8_as
92+
assert_equal Encoding::ASCII_8BIT, ret.encoding
93+
ret = @utf8_mb + @binary_mb
94+
assert_equal Encoding::ASCII_8BIT, ret.encoding
95+
ret = @utf8_as + @binary_mb
96+
assert_equal Encoding::ASCII_8BIT, ret.encoding
97+
end
98+
99+
def test_concat_binary
100+
ret = @binary_mb.dup << @binary_mb
101+
assert_equal Encoding::ASCII_8BIT, ret.encoding
102+
ret = @binary_mb.dup << @binary_as
103+
assert_equal Encoding::ASCII_8BIT, ret.encoding
104+
ret = @binary_as.dup << @binary_mb
105+
assert_equal Encoding::ASCII_8BIT, ret.encoding
106+
ret = @binary_as.dup << @binary_as
107+
assert_equal Encoding::ASCII_8BIT, ret.encoding
108+
end
109+
110+
def test_concat_utf8
111+
ret = @utf8_mb.dup << @utf8_mb
112+
assert_equal Encoding::UTF_8, ret.encoding
113+
ret = @utf8_mb.dup << @utf8_as
114+
assert_equal Encoding::UTF_8, ret.encoding
115+
ret = @utf8_as.dup << @utf8_mb
116+
assert_equal Encoding::UTF_8, ret.encoding
117+
ret = @utf8_as.dup << @utf8_as
118+
assert_equal Encoding::UTF_8, ret.encoding
119+
end
120+
121+
def test_concat_utf8_and_7bit
122+
ret = @binary_as.dup << @utf8_as
123+
assert_equal Encoding::ASCII_8BIT, ret.encoding
124+
ret = @binary_as.dup << @utf8_mb
125+
assert_equal Encoding::UTF_8, ret.encoding
126+
ret = @utf8_as.dup << @binary_as
127+
assert_equal Encoding::UTF_8, ret.encoding
128+
ret = @utf8_mb.dup << @binary_as
129+
assert_equal Encoding::UTF_8, ret.encoding
130+
end
131+
132+
def test_concat_8bit_and_utf8
133+
ret = @binary_mb.dup << @utf8_mb
134+
assert_equal Encoding::ASCII_8BIT, ret.encoding
135+
ret = @binary_mb.dup << @utf8_as
136+
assert_equal Encoding::ASCII_8BIT, ret.encoding
137+
ret = @utf8_mb.dup << @binary_mb
138+
assert_equal Encoding::ASCII_8BIT, ret.encoding
139+
ret = @utf8_as.dup << @binary_mb
140+
assert_equal Encoding::ASCII_8BIT, ret.encoding
141+
end
142+
end

test/ruby/test_encoding.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,8 @@ def test_errinfo_after_autoload
117117
}
118118
end;
119119
end
120+
121+
def test_compatibility_mode
122+
assert_in_out_err(%W[--encoding-compatibility #{File.expand_path('../enc/compatibility_mode.rb', __FILE__)}], "", /15 tests, 41 assertions, 0 failures, 0 errors, 0 skips/, [])
123+
end
120124
end

0 commit comments

Comments
 (0)