Correctly handle celestial plane codepoints in ES5.1. (#3510)

zherczeg · dbatyai · commit 0d7b46118571 · 2020-01-14T15:11:59.000+01:00
Fixes #3498. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c
@@ -223,6 +223,8 @@ lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point
     /* TODO: detect these ranges correctly. */
     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
   }
+#else /* !ENABLED (JERRY_ES2015) */
+  JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
 #endif /* ENABLED (JERRY_ES2015) */
 
   return lit_char_is_unicode_letter ((ecma_char_t) code_point);
@@ -252,6 +254,8 @@ lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point
     /* TODO: detect these ranges correctly. */
     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
   }
+#else /* !ENABLED (JERRY_ES2015) */
+  JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
 #endif /* ENABLED (JERRY_ES2015) */
 
   return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c
@@ -693,12 +693,12 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
 
     if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER))
     {
+#if ENABLED (JERRY_ES2015)
       utf8_length = lit_read_code_point_from_utf8 (source_p,
                                                    (lit_utf8_size_t) (source_end_p - source_p),
                                                    &code_point);
       decoded_length = utf8_length;
 
-#if ENABLED (JERRY_ES2015)
       /* Only ES2015 supports code points outside of the basic plane which can be part of an identifier. */
       if ((code_point >= LIT_UTF16_HIGH_SURROGATE_MIN && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
           && source_p + 3 < source_end_p)
@@ -717,11 +717,23 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
           char_count = 2;
         }
       }
-      else if (source_p[0] >= LEXER_UTF8_4BYTE_START)
+      else if (source_p[0] >= LIT_UTF8_4_BYTE_MARKER)
       {
         decoded_length = 2 * 3;
         has_escape = true;
       }
+#else /* !ENABLED (JERRY_ES2015) */
+      if (code_point < LIT_UTF8_4_BYTE_MARKER)
+      {
+        utf8_length = lit_read_code_point_from_utf8 (source_p,
+                                                     (lit_utf8_size_t) (source_end_p - source_p),
+                                                     &code_point);
+        decoded_length = utf8_length;
+      }
+      else
+      {
+        code_point = 0;
+      }
 #endif /* ENABLED (JERRY_ES2015) */
     }
 
@@ -1091,7 +1103,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
     }
 #endif /* ENABLED (JERRY_ES2015) */
 
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       /* Processing 4 byte unicode sequence (even if it is
        * after a backslash). Always converted to two 3 byte
@@ -1893,7 +1905,7 @@ lexer_convert_ident_to_cesu8 (uint8_t *destination_p, /**< destination string */
     }
 
 #if ENABLED (JERRY_ES2015)
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       lit_four_byte_utf8_char_to_cesu8 (destination_p, source_p);
 
@@ -2113,7 +2125,7 @@ lexer_convert_literal_to_chars (parser_context_t *context_p, /**< context */
     }
 #endif /* ENABLED (JERRY_ES2015) */
 
-    if (*source_p >= LEXER_UTF8_4BYTE_START)
+    if (*source_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       /* Processing 4 byte unicode sequence (even if it is
         * after a backslash). Always converted to two 3 byte
@@ -3028,7 +3040,7 @@ lexer_compare_identifier_to_chars (const uint8_t *left_p, /**< left identifier *
 
       escape_size = lit_code_point_to_cesu8_bytes (utf8_buf, code_point);
     }
-    else if (*left_p >= LEXER_UTF8_4BYTE_START)
+    else if (*left_p >= LIT_UTF8_4_BYTE_MARKER)
     {
       lit_four_byte_utf8_char_to_cesu8 (utf8_buf, left_p);
       escape_size = 3 * 2;
diff --git a/jerry-core/parser/js/js-lexer.h b/jerry-core/parser/js/js-lexer.h
@@ -201,7 +201,6 @@ typedef enum
 #define LEXER_NEWLINE_LS_PS_BYTE_1 0xe2
 #define LEXER_NEWLINE_LS_PS_BYTE_23(source) \
   ((source)[1] == LIT_UTF8_2_BYTE_CODE_POINT_MIN && ((source)[2] | 0x1) == 0xa9)
-#define LEXER_UTF8_4BYTE_START 0xf0
 
 #define LEXER_IS_LEFT_BRACKET(type) \
   ((type) == LEXER_LEFT_BRACE || (type) == LEXER_LEFT_PAREN || (type) == LEXER_LEFT_SQUARE)
diff --git a/tests/unit-core/test-api-errortype.c b/tests/unit-core/test-api-errortype.c
@@ -62,5 +62,17 @@ main (void)
     jerry_release_value (test_values[idx]);
   }
 
+  char test_source[] = "\xF0\x9D\x84\x9E";
+
+  jerry_value_t result = jerry_parse (NULL,
+                                      0,
+                                      (const jerry_char_t *) test_source,
+                                      sizeof (test_source) - 1,
+                                      JERRY_PARSE_NO_OPTS);
+  TEST_ASSERT (jerry_value_is_error (result));
+  TEST_ASSERT (jerry_get_error_type (result) == JERRY_ERROR_SYNTAX);
+
+  jerry_release_value (result);
+
   jerry_cleanup ();
 } /* main */

Original file line number	Diff line number	Diff line change
`@@ -693,12 +693,12 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */`
`693`	`693`
`694`	`694`	`if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER))`
`695`	`695`	`{`
	`696`	`+#if ENABLED (JERRY_ES2015)`
`696`	`697`	`utf8_length = lit_read_code_point_from_utf8 (source_p,`
`697`	`698`	`(lit_utf8_size_t) (source_end_p - source_p),`
`698`	`699`	`&code_point);`
`699`	`700`	`decoded_length = utf8_length;`
`700`	`701`
`701`		`-#if ENABLED (JERRY_ES2015)`
`702`	`702`	`/* Only ES2015 supports code points outside of the basic plane which can be part of an identifier. */`
`703`	`703`	`if ((code_point >= LIT_UTF16_HIGH_SURROGATE_MIN && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)`
`704`	`704`	`&& source_p + 3 < source_end_p)`
`@@ -717,11 +717,23 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */`
`717`	`717`	`char_count = 2;`
`718`	`718`	`}`
`719`	`719`	`}`
`720`		`- else if (source_p[0] >= LEXER_UTF8_4BYTE_START)`
	`720`	`+ else if (source_p[0] >= LIT_UTF8_4_BYTE_MARKER)`
`721`	`721`	`{`
`722`	`722`	`decoded_length = 2 * 3;`
`723`	`723`	`has_escape = true;`
`724`	`724`	`}`
	`725`	`+#else /* !ENABLED (JERRY_ES2015) */`
	`726`	`+ if (code_point < LIT_UTF8_4_BYTE_MARKER)`
	`727`	`+ {`
	`728`	`+ utf8_length = lit_read_code_point_from_utf8 (source_p,`
	`729`	`+ (lit_utf8_size_t) (source_end_p - source_p),`
	`730`	`+ &code_point);`
	`731`	`+ decoded_length = utf8_length;`
	`732`	`+ }`
	`733`	`+ else`
	`734`	`+ {`
	`735`	`+ code_point = 0;`
	`736`	`+ }`
`725`	`737`	`#endif /* ENABLED (JERRY_ES2015) */`
`726`	`738`	`}`
`727`	`739`
`@@ -1091,7 +1103,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */`
`1091`	`1103`	`}`
`1092`	`1104`	`#endif /* ENABLED (JERRY_ES2015) */`
`1093`	`1105`
`1094`		`- if (*source_p >= LEXER_UTF8_4BYTE_START)`
	`1106`	`+ if (*source_p >= LIT_UTF8_4_BYTE_MARKER)`
`1095`	`1107`	`{`
`1096`	`1108`	`/* Processing 4 byte unicode sequence (even if it is`
`1097`	`1109`	`* after a backslash). Always converted to two 3 byte`
`@@ -1893,7 +1905,7 @@ lexer_convert_ident_to_cesu8 (uint8_t *destination_p, /**< destination string */`
`1893`	`1905`	`}`
`1894`	`1906`
`1895`	`1907`	`#if ENABLED (JERRY_ES2015)`
`1896`		`- if (*source_p >= LEXER_UTF8_4BYTE_START)`
	`1908`	`+ if (*source_p >= LIT_UTF8_4_BYTE_MARKER)`
`1897`	`1909`	`{`
`1898`	`1910`	`lit_four_byte_utf8_char_to_cesu8 (destination_p, source_p);`
`1899`	`1911`
`@@ -2113,7 +2125,7 @@ lexer_convert_literal_to_chars (parser_context_t *context_p, /**< context */`
`2113`	`2125`	`}`
`2114`	`2126`	`#endif /* ENABLED (JERRY_ES2015) */`
`2115`	`2127`
`2116`		`- if (*source_p >= LEXER_UTF8_4BYTE_START)`
	`2128`	`+ if (*source_p >= LIT_UTF8_4_BYTE_MARKER)`
`2117`	`2129`	`{`
`2118`	`2130`	`/* Processing 4 byte unicode sequence (even if it is`
`2119`	`2131`	`* after a backslash). Always converted to two 3 byte`
`@@ -3028,7 +3040,7 @@ lexer_compare_identifier_to_chars (const uint8_t *left_p, /**< left identifier *`
`3028`	`3040`
`3029`	`3041`	`escape_size = lit_code_point_to_cesu8_bytes (utf8_buf, code_point);`
`3030`	`3042`	`}`
`3031`		`- else if (*left_p >= LEXER_UTF8_4BYTE_START)`
	`3043`	`+ else if (*left_p >= LIT_UTF8_4_BYTE_MARKER)`
`3032`	`3044`	`{`
`3033`	`3045`	`lit_four_byte_utf8_char_to_cesu8 (utf8_buf, left_p);`
`3034`	`3046`	`escape_size = 3 * 2;`