Rename LRE_FLAG_UTF16 to LRE_FLAG_UNICODE (#186)

Prep work for https://github.com/tc39/proposal-regexp-v-flag a.k.a. UnicodeSets.
2023-12-08 10:58:00 +01:00 · 2023-12-08 10:58:00 +01:00 · f7d2169999
commit f7d2169999
parent 42b708622c
3 changed files with 38 additions and 38 deletions
--- a/libregexp.c
+++ b/libregexp.c
@ -69,7 +69,7 @@ typedef struct {
    const uint8_t *buf_end;
    const uint8_t *buf_start;
    int re_flags;
-    BOOL is_utf16;
+    BOOL is_unicode;
    BOOL ignore_case;
    BOOL dotall;
    int capture_count;
@ -122,11 +122,11 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
 }

 /* canonicalize with the specific JS regexp rules */
-static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
+static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
 {
    uint32_t res[LRE_CC_RES_LEN_MAX];
    int len;
-    if (is_utf16) {
+    if (is_unicode) {
        if (likely(c < 128)) {
            if (c >= 'A' && c <= 'Z')
                c = c - 'A' + 'a';
@ -751,10 +751,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
            if ((c >= 'a' && c <= 'z') ||
                (c >= 'A' && c <= 'Z') ||
                (((c >= '0' && c <= '9') || c == '_') &&
-                 inclass && !s->is_utf16)) {   /* Annex B.1.4 */
+                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
                c &= 0x1f;
                p++;
-            } else if (s->is_utf16) {
+            } else if (s->is_unicode) {
                goto invalid_escape;
            } else {
                /* otherwise return '\' and 'c' */
@ -764,7 +764,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
            break;
        case 'p':
        case 'P':
-            if (s->is_utf16) {
+            if (s->is_unicode) {
                if (parse_unicode_property(s, cr, &p, (c == 'P')))
                    return -1;
                c = CLASS_RANGE_BASE;
@ -773,14 +773,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
            /* fall thru */
        default:
            p--;
-            ret = lre_parse_escape(&p, s->is_utf16 * 2);
+            ret = lre_parse_escape(&p, s->is_unicode * 2);
            if (ret >= 0) {
                c = ret;
            } else {
                if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
                    /* always valid to escape these characters */
                    goto normal_char;
-                } else if (s->is_utf16) {
+                } else if (s->is_unicode) {
                invalid_escape:
                    return re_parse_error(s, "invalid escape sequence in regular expression");
                } else {
@ -802,7 +802,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
        /* normal char */
        if (c >= 128) {
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-            if ((unsigned)c > 0xffff && !s->is_utf16) {
+            if ((unsigned)c > 0xffff && !s->is_unicode) {
                /* XXX: should handle non BMP-1 code points */
                return re_parse_error(s, "malformed unicode char");
            }
@ -878,7 +878,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
        if (*p == '-' && p[1] != ']') {
            const uint8_t *p0 = p + 1;
            if (c1 >= CLASS_RANGE_BASE) {
-                if (s->is_utf16) {
+                if (s->is_unicode) {
                    cr_free(cr1);
                    goto invalid_class_range;
                }
@ -890,7 +890,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
                goto fail;
            if (c2 >= CLASS_RANGE_BASE) {
                cr_free(cr1);
-                if (s->is_utf16) {
+                if (s->is_unicode) {
                    goto invalid_class_range;
                }
                /* Annex B: match '-' character */
@ -1248,7 +1248,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
            re_emit_op(s, REOP_prev);
        break;
    case '{':
-        if (s->is_utf16) {
+        if (s->is_unicode) {
            return re_parse_error(s, "syntax error");
        } else if (!is_digit(p[1])) {
            /* Annex B: we accept '{' not followed by digits as a
@ -1300,7 +1300,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
            lookahead:
                /* Annex B allows lookahead to be used as an atom for
                   the quantifiers */
-                if (!s->is_utf16 && !is_backward_lookahead)  {
+                if (!s->is_unicode && !is_backward_lookahead)  {
                    last_atom_start = s->byte_code.size;
                    last_capture_count = s->capture_count;
                }
@ -1376,7 +1376,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                    /* annex B: we tolerate invalid group names in non
                       unicode mode if there is no named capture
                       definition */
-                    if (s->is_utf16 || re_has_named_captures(s))
+                    if (s->is_unicode || re_has_named_captures(s))
                        return re_parse_error(s, "expecting group name");
                    else
                        goto parse_class_atom;
@ -1384,7 +1384,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                p1 += 3;
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
                                        &p1)) {
-                    if (s->is_utf16 || re_has_named_captures(s))
+                    if (s->is_unicode || re_has_named_captures(s))
                        return re_parse_error(s, "invalid group name");
                    else
                        goto parse_class_atom;
@ -1395,7 +1395,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                       after (inefficient, but hopefully not common */
                    c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
                    if (c < 0) {
-                        if (s->is_utf16 || re_has_named_captures(s))
+                        if (s->is_unicode || re_has_named_captures(s))
                            return re_parse_error(s, "group name not defined");
                        else
                            goto parse_class_atom;
@ -1407,7 +1407,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
        case '0':
            p += 2;
            c = 0;
-            if (s->is_utf16) {
+            if (s->is_unicode) {
                if (is_digit(*p)) {
                    return re_parse_error(s, "invalid decimal escape in regular expression");
                }
@ -1429,7 +1429,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)

                c = parse_digits(&p, FALSE);
                if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
-                    if (!s->is_utf16) {
+                    if (!s->is_unicode) {
                        /* Annex B.1.4: accept legacy octal */
                        p = q;
                        if (*p <= '7') {
@ -1471,7 +1471,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
        break;
    case ']':
    case '}':
-        if (s->is_utf16)
+        if (s->is_unicode)
            return re_parse_error(s, "syntax error");
        goto parse_class_atom;
    default:
@ -1493,7 +1493,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                return -1;
        } else {
            if (s->ignore_case)
-                c = lre_canonicalize(c, s->is_utf16);
+                c = lre_canonicalize(c, s->is_unicode);
            if (c <= 0x7f)
                re_emit_op_u8(s, REOP_char8, c);
            else if (c <= 0xffff)
@ -1531,7 +1531,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                /* As an extension (see ES6 annex B), we accept '{' not
                   followed by digits as a normal atom */
                if (!is_digit(p[1])) {
-                    if (s->is_utf16)
+                    if (s->is_unicode)
                        goto invalid_quant_count;
                    break;
                }
@ -1550,7 +1550,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                        quant_max = INT32_MAX; /* infinity */
                    }
                }
-                if (*p != '}' && !s->is_utf16) {
+                if (*p != '}' && !s->is_unicode) {
                    /* Annex B: normal atom if invalid '{' syntax */
                    p = p1;
                    break;
@ -1839,7 +1839,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
    s->buf_end = s->buf_ptr + buf_len;
    s->buf_start = s->buf_ptr;
    s->re_flags = re_flags;
-    s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
+    s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
    is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
    s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
    s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
@ -2039,7 +2039,7 @@ typedef struct {
    int stack_size_max;
    BOOL multi_line;
    BOOL ignore_case;
-    BOOL is_utf16;
+    BOOL is_unicode;
    void *opaque; /* used for stack overflow check */

    size_t state_size;
@ -2189,7 +2189,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                goto no_match;
            GET_CHAR(c, cptr, cbuf_end);
            if (s->ignore_case) {
-                c = lre_canonicalize(c, s->is_utf16);
+                c = lre_canonicalize(c, s->is_unicode);
            }
            if (val != c)
                goto no_match;
@ -2346,8 +2346,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                        GET_CHAR(c1, cptr1, cptr1_end);
                        GET_CHAR(c2, cptr, cbuf_end);
                        if (s->ignore_case) {
-                            c1 = lre_canonicalize(c1, s->is_utf16);
-                            c2 = lre_canonicalize(c2, s->is_utf16);
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
                        }
                        if (c1 != c2)
                            goto no_match;
@ -2360,8 +2360,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                        GET_PREV_CHAR(c1, cptr1, cptr1_start);
                        GET_PREV_CHAR(c2, cptr, s->cbuf);
                        if (s->ignore_case) {
-                            c1 = lre_canonicalize(c1, s->is_utf16);
-                            c2 = lre_canonicalize(c2, s->is_utf16);
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
                        }
                        if (c1 != c2)
                            goto no_match;
@ -2380,7 +2380,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                    goto no_match;
                GET_CHAR(c, cptr, cbuf_end);
                if (s->ignore_case) {
-                    c = lre_canonicalize(c, s->is_utf16);
+                    c = lre_canonicalize(c, s->is_unicode);
                }
                idx_min = 0;
                low = get_u16(pc + 0 * 4);
@ -2420,7 +2420,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                    goto no_match;
                GET_CHAR(c, cptr, cbuf_end);
                if (s->ignore_case) {
-                    c = lre_canonicalize(c, s->is_utf16);
+                    c = lre_canonicalize(c, s->is_unicode);
                }
                idx_min = 0;
                low = get_u32(pc + 0 * 8);
@ -2512,13 +2512,13 @@ int lre_exec(uint8_t **capture,
    re_flags = bc_buf[RE_HEADER_FLAGS];
    s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
    s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
-    s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
+    s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
    s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
    s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
    s->cbuf = cbuf;
    s->cbuf_end = cbuf + (clen << cbuf_type);
    s->cbuf_type = cbuf_type;
-    if (s->cbuf_type == 1 && s->is_utf16)
+    if (s->cbuf_type == 1 && s->is_unicode)
        s->cbuf_type = 2;
    s->opaque = opaque;

--- a/libregexp.h
+++ b/libregexp.h
@ -34,7 +34,7 @@
 #define LRE_FLAG_IGNORECASE (1 << 1)
 #define LRE_FLAG_MULTILINE  (1 << 2)
 #define LRE_FLAG_DOTALL     (1 << 3)
-#define LRE_FLAG_UTF16      (1 << 4)
+#define LRE_FLAG_UNICODE    (1 << 4)
 #define LRE_FLAG_STICKY     (1 << 5)
 #define LRE_FLAG_INDICES    (1 << 6) /* Unused by libregexp, just recorded. */

--- a/quickjs.c
+++ b/quickjs.c
@ -40450,7 +40450,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
                mask = LRE_FLAG_DOTALL;
                break;
            case 'u':
-                mask = LRE_FLAG_UTF16;
+                mask = LRE_FLAG_UNICODE;
                break;
            case 'y':
                mask = LRE_FLAG_STICKY;
@ -40468,7 +40468,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
        JS_FreeCString(ctx, str);
    }

-    str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16));
+    str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
    if (!str)
        return JS_EXCEPTION;
    re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
@ -41113,7 +41113,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon
            break;
        }
        if (end == start) {
-            if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) {
+            if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) {
                end++;
            } else {
                string_getc(str, &end);
@ -41873,7 +41873,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
    JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ),
    JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
    JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
-    JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ),
+    JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
    JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
    JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
    JS_CFUNC_DEF("exec", 1, js_regexp_exec ),