From f0ef9e159387105da55907fbc8d58a4d1fe920b4 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Thu, 21 Dec 2023 19:37:31 +0100 Subject: [PATCH] Implement RegExp 'v' flag, part 1 (#229) This commit implements the flag itself and teaches the regex engine to reject previously accepted patterns when in unicodeSets mode. Refs: https://github.com/quickjs-ng/quickjs/issues/228 --- libregexp.c | 31 +++++++++ libregexp.h | 2 +- quickjs.c | 13 ++++ test262.conf | 144 +++++++++++++++++++++++++++++++++++++++++- tests/test_builtin.js | 11 ++++ 5 files changed, 199 insertions(+), 2 deletions(-) diff --git a/libregexp.c b/libregexp.c index 9e36540..c2f9a1f 100644 --- a/libregexp.c +++ b/libregexp.c @@ -63,6 +63,7 @@ typedef enum { #define TMP_BUF_SIZE 128 +// invariant: is_unicode ^ unicode_sets (or neither, but not both) typedef struct { DynBuf byte_code; const uint8_t *buf_ptr; @@ -70,6 +71,7 @@ typedef struct { const uint8_t *buf_start; int re_flags; BOOL is_unicode; + BOOL unicode_sets; BOOL ignore_case; BOOL dotall; int capture_count; @@ -853,6 +855,8 @@ static int re_emit_range(REParseState *s, const CharRange *cr) return 0; } +// s->is_unicode turns patterns like []] into syntax errors +// s->unicode_sets turns more patterns into errors, like [a-] or [[] static int re_parse_char_class(REParseState *s, const uint8_t **pp) { const uint8_t *p; @@ -864,17 +868,43 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) cr_init(cr, s->opaque, lre_realloc); p = *pp; p++; /* skip '[' */ + + if (s->unicode_sets) { + static const char verboten[] = + "()[{}/-|" "\0" + "&&!!##$$%%**++,,..::;;<<==>>??@@``~~" "\0" + "^^^_^^"; + const char *s = verboten; + int n = 1; + do { + if (!memcmp(s, p, n)) + if (p[n] == ']') + goto invalid_class_range; + s += n; + if (!*s) { + s++; + n++; + } + } while (n < 4); + } + invert = FALSE; if (*p == '^') { p++; invert = TRUE; } + for(;;) { if (*p == ']') break; c1 = get_class_atom(s, cr1, &p, TRUE); if ((int)c1 < 0) goto fail; + if (*p ==
'-' && p[1] == ']' && s->unicode_sets) { + if (c1 >= CLASS_RANGE_BASE) + cr_free(cr1); + goto invalid_class_range; + } if (*p == '-' && p[1] != ']') { const uint8_t *p0 = p + 1; if (c1 >= CLASS_RANGE_BASE) { @@ -1843,6 +1873,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); + s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0); s->capture_count = 1; s->total_capture_count = -1; s->has_named_captures = -1; diff --git a/libregexp.h b/libregexp.h index efc35a8..f029309 100644 --- a/libregexp.h +++ b/libregexp.h @@ -37,8 +37,8 @@ #define LRE_FLAG_UNICODE (1 << 4) #define LRE_FLAG_STICKY (1 << 5) #define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */ - #define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */ +#define LRE_FLAG_UNICODE_SETS (1 << 8) uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, const char *buf, size_t buf_len, int re_flags, diff --git a/quickjs.c b/quickjs.c index 6e16c7a..c6c3546 100644 --- a/quickjs.c +++ b/quickjs.c @@ -40730,6 +40730,9 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValue pattern, case 'u': mask = LRE_FLAG_UNICODE; break; + case 'v': + mask = LRE_FLAG_UNICODE_SETS; + break; case 'y': mask = LRE_FLAG_STICKY; break; @@ -40746,6 +40749,10 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValue pattern, JS_FreeCString(ctx, str); } + if (re_flags & LRE_FLAG_UNICODE) + if (re_flags & LRE_FLAG_UNICODE_SETS) + return JS_ThrowSyntaxError(ctx, "invalid regular expression flags"); + str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE)); if (!str) return JS_EXCEPTION; @@ -41067,6 +41074,11 @@ static JSValue js_regexp_get_flags(JSContext *ctx, JSValue this_val) goto exception; if (res) *p++ = 'u'; + res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, 
"unicodeSets")); + if (res < 0) + goto exception; + if (res) + *p++ = 'v'; res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "sticky")); if (res < 0) goto exception; @@ -42152,6 +42164,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = { JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ), JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ), JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ), + JS_CGETSET_MAGIC_DEF("unicodeSets", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE_SETS ), JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ), JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ), JS_CFUNC_DEF("exec", 1, js_regexp_exec ), diff --git a/test262.conf b/test262.conf index 7679ebb..1763269 100644 --- a/test262.conf +++ b/test262.conf @@ -157,7 +157,7 @@ regexp-lookbehind regexp-match-indices regexp-named-groups regexp-unicode-property-escapes -regexp-v-flag=skip +regexp-v-flag resizable-arraybuffer=skip rest-parameters Set @@ -223,5 +223,147 @@ test262/test/built-ins/ThrowTypeError/unique-per-realm-function-proto.js #test262/test/built-ins/RegExp/CharacterClassEscapes/ #test262/test/built-ins/RegExp/property-escapes/ +# in progress regexp-v-flag support, see https://github.com/quickjs-ng/quickjs/issues/228 +test262/test/built-ins/RegExp/property-escapes/generated/strings/Basic_Emoji-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Basic_Emoji-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Basic_Emoji-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Basic_Emoji.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Emoji_Keycap_Sequence-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Emoji_Keycap_Sequence-negative-P.js 
+test262/test/built-ins/RegExp/property-escapes/generated/strings/Emoji_Keycap_Sequence-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/Emoji_Keycap_Sequence.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Flag_Sequence-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Flag_Sequence-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Flag_Sequence-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Flag_Sequence.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Modifier_Sequence-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Modifier_Sequence-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Modifier_Sequence-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Modifier_Sequence.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Tag_Sequence-negative-CharacterClass.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Tag_Sequence-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Tag_Sequence-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_Tag_Sequence.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-CharacterClass.js 
+test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-P.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence-negative-u.js +test262/test/built-ins/RegExp/property-escapes/generated/strings/RGI_Emoji_ZWJ_Sequence.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character-property-escape.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-escape-union-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-character.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-class-union-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-union-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/character-union-string-literal.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-property-of-strings-escape.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/property-of-strings-escape-union-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/rgi-emoji-13.1.js +test262/test/built-ins/RegExp/unicodeSets/generated/rgi-emoji-14.0.js +test262/test/built-ins/RegExp/unicodeSets/generated/rgi-emoji-15.0.js +test262/test/built-ins/RegExp/unicodeSets/generated/rgi-emoji-15.1.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-difference-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character-property-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-intersection-string-literal.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-character-class-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-character-class.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-character-property-escape.js 
+test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-character.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-property-of-strings-escape.js +test262/test/built-ins/RegExp/unicodeSets/generated/string-literal-union-string-literal.js + [tests] # list test files or use config.testdir diff --git a/tests/test_builtin.js b/tests/test_builtin.js index a2a657e..c8ba98e 100644 --- a/tests/test_builtin.js +++ b/tests/test_builtin.js @@ -646,6 +646,17 @@ function test_regexp() assert(/{1a}/.toString(), "/{1a}/"); a = /a{1+/.exec("a{11"); assert(a, ["a{11"] ); + + eval("/[a-]/"); // accepted with no flag + eval("/[a-]/u"); // accepted with 'u' flag + + let ex; + try { + eval("/[a-]/v"); // rejected with 'v' flag + } catch (_ex) { + ex = _ex; + } + assert(ex?.message, "invalid class range"); } function test_symbol()