From f7d2169999dcce1c33b794835b799d903c5698c4 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Fri, 8 Dec 2023 10:58:00 +0100 Subject: [PATCH] Rename LRE_FLAG_UTF16 to LRE_FLAG_UNICODE (#186) Prep work for https://github.com/tc39/proposal-regexp-v-flag a.k.a. UnicodeSets. --- libregexp.c | 66 ++++++++++++++++++++++++++--------------------------- libregexp.h | 2 +- quickjs.c | 8 +++---- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/libregexp.c b/libregexp.c index 95e9ce7..b415c7a 100644 --- a/libregexp.c +++ b/libregexp.c @@ -69,7 +69,7 @@ typedef struct { const uint8_t *buf_end; const uint8_t *buf_start; int re_flags; - BOOL is_utf16; + BOOL is_unicode; BOOL ignore_case; BOOL dotall; int capture_count; @@ -122,11 +122,11 @@ static int dbuf_insert(DynBuf *s, int pos, int len) } /* canonicalize with the specific JS regexp rules */ -static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) +static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode) { uint32_t res[LRE_CC_RES_LEN_MAX]; int len; - if (is_utf16) { + if (is_unicode) { if (likely(c < 128)) { if (c >= 'A' && c <= 'Z') c = c - 'A' + 'a'; @@ -751,10 +751,10 @@ static int get_class_atom(REParseState *s, CharRange *cr, if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (((c >= '0' && c <= '9') || c == '_') && - inclass && !s->is_utf16)) { /* Annex B.1.4 */ + inclass && !s->is_unicode)) { /* Annex B.1.4 */ c &= 0x1f; p++; - } else if (s->is_utf16) { + } else if (s->is_unicode) { goto invalid_escape; } else { /* otherwise return '\' and 'c' */ @@ -764,7 +764,7 @@ static int get_class_atom(REParseState *s, CharRange *cr, break; case 'p': case 'P': - if (s->is_utf16) { + if (s->is_unicode) { if (parse_unicode_property(s, cr, &p, (c == 'P'))) return -1; c = CLASS_RANGE_BASE; @@ -773,14 +773,14 @@ static int get_class_atom(REParseState *s, CharRange *cr, /* fall thru */ default: p--; - ret = lre_parse_escape(&p, s->is_utf16 * 2); + ret = lre_parse_escape(&p, s->is_unicode * 2); if (ret >= 0) { c = ret; } else { if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { /* always valid to escape these characters */ goto normal_char; - } else if (s->is_utf16) { + } else if (s->is_unicode) { invalid_escape: return re_parse_error(s, "invalid escape sequence in regular expression"); } else { @@ -802,7 +802,7 @@ static int get_class_atom(REParseState *s, CharRange *cr, /* normal char */ if (c >= 128) { c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if ((unsigned)c > 0xffff && !s->is_utf16) { + if ((unsigned)c > 0xffff && !s->is_unicode) { /* XXX: should handle non BMP-1 code points */ return re_parse_error(s, "malformed unicode char"); } @@ -878,7 +878,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) if (*p == '-' && p[1] != ']') { const uint8_t *p0 = p + 1; if (c1 >= CLASS_RANGE_BASE) { - if (s->is_utf16) { + if (s->is_unicode) { cr_free(cr1); goto invalid_class_range; } @@ -890,7 +890,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) goto fail; if (c2 >= CLASS_RANGE_BASE) { cr_free(cr1); - if (s->is_utf16) { + if (s->is_unicode) { goto invalid_class_range; } /* Annex B: match '-' character */ @@ -1248,7 +1248,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) re_emit_op(s, REOP_prev); break; case '{': - if (s->is_utf16) { + if (s->is_unicode) { return re_parse_error(s, "syntax error"); } else if (!is_digit(p[1])) { /* Annex B: we accept '{' not followed by digits as a @@ -1300,7 +1300,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) lookahead: /* Annex B allows lookahead to be used as an atom for the quantifiers */ - if (!s->is_utf16 && !is_backward_lookahead) { + if (!s->is_unicode && !is_backward_lookahead) { last_atom_start = s->byte_code.size; last_capture_count = s->capture_count; } @@ -1376,7 +1376,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) /* annex B: we tolerate invalid group names in non unicode mode if there is no named capture definition */ - if (s->is_utf16 || re_has_named_captures(s)) + if (s->is_unicode || re_has_named_captures(s)) return re_parse_error(s, "expecting group name"); else goto parse_class_atom; @@ -1384,7 +1384,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) p1 += 3; if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), &p1)) { - if (s->is_utf16 || re_has_named_captures(s)) + if (s->is_unicode || re_has_named_captures(s)) return re_parse_error(s, "invalid group name"); else goto parse_class_atom; @@ -1395,7 +1395,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) after (inefficient, but hopefully not common */ c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); if (c < 0) { - if (s->is_utf16 || re_has_named_captures(s)) + if (s->is_unicode || re_has_named_captures(s)) return re_parse_error(s, "group name not defined"); else goto parse_class_atom; @@ -1407,7 +1407,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) case '0': p += 2; c = 0; - if (s->is_utf16) { + if (s->is_unicode) { if (is_digit(*p)) { return re_parse_error(s, "invalid decimal escape in regular expression"); } @@ -1429,7 +1429,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) c = parse_digits(&p, FALSE); if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { - if (!s->is_utf16) { + if (!s->is_unicode) { /* Annex B.1.4: accept legacy octal */ p = q; if (*p <= '7') { @@ -1471,7 +1471,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) break; case ']': case '}': - if (s->is_utf16) + if (s->is_unicode) return re_parse_error(s, "syntax error"); goto parse_class_atom; default: @@ -1493,7 +1493,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) return -1; } else { if (s->ignore_case) - c = lre_canonicalize(c, s->is_utf16); + c = lre_canonicalize(c, s->is_unicode); if (c <= 0x7f) re_emit_op_u8(s, REOP_char8, c); else if (c <= 0xffff) @@ -1531,7 +1531,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) /* As an extension (see ES6 annex B), we accept '{' not followed by digits as a normal atom */ if (!is_digit(p[1])) { - if (s->is_utf16) + if (s->is_unicode) goto invalid_quant_count; break; } @@ -1550,7 +1550,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) quant_max = INT32_MAX; /* infinity */ } } - if (*p != '}' && !s->is_utf16) { + if (*p != '}' && !s->is_unicode) { /* Annex B: normal atom if invalid '{' syntax */ p = p1; break; @@ -1839,7 +1839,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, s->buf_end = s->buf_ptr + buf_len; s->buf_start = s->buf_ptr; s->re_flags = re_flags; - s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0); + s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0); is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); @@ -2039,7 +2039,7 @@ typedef struct { int stack_size_max; BOOL multi_line; BOOL ignore_case; - BOOL is_utf16; + BOOL is_unicode; void *opaque; /* used for stack overflow check */ size_t state_size; @@ -2189,7 +2189,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, goto no_match; GET_CHAR(c, cptr, cbuf_end); if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); + c = lre_canonicalize(c, s->is_unicode); } if (val != c) goto no_match; @@ -2346,8 +2346,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, GET_CHAR(c1, cptr1, cptr1_end); GET_CHAR(c2, cptr, cbuf_end); if (s->ignore_case) { - c1 = lre_canonicalize(c1, s->is_utf16); - c2 = lre_canonicalize(c2, s->is_utf16); + c1 = lre_canonicalize(c1, s->is_unicode); + c2 = lre_canonicalize(c2, s->is_unicode); } if (c1 != c2) goto no_match; @@ -2360,8 +2360,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, GET_PREV_CHAR(c1, cptr1, cptr1_start); GET_PREV_CHAR(c2, cptr, s->cbuf); if (s->ignore_case) { - c1 = lre_canonicalize(c1, s->is_utf16); - c2 = lre_canonicalize(c2, s->is_utf16); + c1 = lre_canonicalize(c1, s->is_unicode); + c2 = lre_canonicalize(c2, s->is_unicode); } if (c1 != c2) goto no_match; @@ -2380,7 +2380,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, goto no_match; GET_CHAR(c, cptr, cbuf_end); if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); + c = lre_canonicalize(c, s->is_unicode); } idx_min = 0; low = get_u16(pc + 0 * 4); @@ -2420,7 +2420,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, goto no_match; GET_CHAR(c, cptr, cbuf_end); if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); + c = lre_canonicalize(c, s->is_unicode); } idx_min = 0; low = get_u32(pc + 0 * 8); @@ -2512,13 +2512,13 @@ int lre_exec(uint8_t **capture, re_flags = bc_buf[RE_HEADER_FLAGS]; s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; - s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0; + s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; s->cbuf = cbuf; s->cbuf_end = cbuf + (clen << cbuf_type); s->cbuf_type = cbuf_type; - if (s->cbuf_type == 1 && s->is_utf16) + if (s->cbuf_type == 1 && s->is_unicode) s->cbuf_type = 2; s->opaque = opaque; diff --git a/libregexp.h b/libregexp.h index 46e0004..efc35a8 100644 --- a/libregexp.h +++ b/libregexp.h @@ -34,7 +34,7 @@ #define LRE_FLAG_IGNORECASE (1 << 1) #define LRE_FLAG_MULTILINE (1 << 2) #define LRE_FLAG_DOTALL (1 << 3) -#define LRE_FLAG_UTF16 (1 << 4) +#define LRE_FLAG_UNICODE (1 << 4) #define LRE_FLAG_STICKY (1 << 5) #define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */ diff --git a/quickjs.c b/quickjs.c index 110793c..5a4a3a3 100644 --- a/quickjs.c +++ b/quickjs.c @@ -40450,7 +40450,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern, mask = LRE_FLAG_DOTALL; break; case 'u': - mask = LRE_FLAG_UTF16; + mask = LRE_FLAG_UNICODE; break; case 'y': mask = LRE_FLAG_STICKY; @@ -40468,7 +40468,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern, JS_FreeCString(ctx, str); } - str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16)); + str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE)); if (!str) return JS_EXCEPTION; re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg, @@ -41113,7 +41113,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon break; } if (end == start) { - if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) { + if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) { end++; } else { string_getc(str, &end); @@ -41873,7 +41873,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = { JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ), JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ), JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ), - JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ), + JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ), JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ), JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ), JS_CFUNC_DEF("exec", 1, js_regexp_exec ),