Rename LRE_FLAG_UTF16 to LRE_FLAG_UNICODE (#186)

Prep work for https://github.com/tc39/proposal-regexp-v-flag a.k.a.
UnicodeSets.
This commit is contained in:
Ben Noordhuis 2023-12-08 10:58:00 +01:00 committed by GitHub
parent 42b708622c
commit f7d2169999
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 38 additions and 38 deletions

View file

@ -69,7 +69,7 @@ typedef struct {
const uint8_t *buf_end; const uint8_t *buf_end;
const uint8_t *buf_start; const uint8_t *buf_start;
int re_flags; int re_flags;
BOOL is_utf16; BOOL is_unicode;
BOOL ignore_case; BOOL ignore_case;
BOOL dotall; BOOL dotall;
int capture_count; int capture_count;
@ -122,11 +122,11 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
} }
/* canonicalize with the specific JS regexp rules */ /* canonicalize with the specific JS regexp rules */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
{ {
uint32_t res[LRE_CC_RES_LEN_MAX]; uint32_t res[LRE_CC_RES_LEN_MAX];
int len; int len;
if (is_utf16) { if (is_unicode) {
if (likely(c < 128)) { if (likely(c < 128)) {
if (c >= 'A' && c <= 'Z') if (c >= 'A' && c <= 'Z')
c = c - 'A' + 'a'; c = c - 'A' + 'a';
@ -751,10 +751,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
if ((c >= 'a' && c <= 'z') || if ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') || (c >= 'A' && c <= 'Z') ||
(((c >= '0' && c <= '9') || c == '_') && (((c >= '0' && c <= '9') || c == '_') &&
inclass && !s->is_utf16)) { /* Annex B.1.4 */ inclass && !s->is_unicode)) { /* Annex B.1.4 */
c &= 0x1f; c &= 0x1f;
p++; p++;
} else if (s->is_utf16) { } else if (s->is_unicode) {
goto invalid_escape; goto invalid_escape;
} else { } else {
/* otherwise return '\' and 'c' */ /* otherwise return '\' and 'c' */
@ -764,7 +764,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
break; break;
case 'p': case 'p':
case 'P': case 'P':
if (s->is_utf16) { if (s->is_unicode) {
if (parse_unicode_property(s, cr, &p, (c == 'P'))) if (parse_unicode_property(s, cr, &p, (c == 'P')))
return -1; return -1;
c = CLASS_RANGE_BASE; c = CLASS_RANGE_BASE;
@ -773,14 +773,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* fall thru */ /* fall thru */
default: default:
p--; p--;
ret = lre_parse_escape(&p, s->is_utf16 * 2); ret = lre_parse_escape(&p, s->is_unicode * 2);
if (ret >= 0) { if (ret >= 0) {
c = ret; c = ret;
} else { } else {
if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
/* always valid to escape these characters */ /* always valid to escape these characters */
goto normal_char; goto normal_char;
} else if (s->is_utf16) { } else if (s->is_unicode) {
invalid_escape: invalid_escape:
return re_parse_error(s, "invalid escape sequence in regular expression"); return re_parse_error(s, "invalid escape sequence in regular expression");
} else { } else {
@ -802,7 +802,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* normal char */ /* normal char */
if (c >= 128) { if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if ((unsigned)c > 0xffff && !s->is_utf16) { if ((unsigned)c > 0xffff && !s->is_unicode) {
/* XXX: should handle non BMP-1 code points */ /* XXX: should handle non BMP-1 code points */
return re_parse_error(s, "malformed unicode char"); return re_parse_error(s, "malformed unicode char");
} }
@ -878,7 +878,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
if (*p == '-' && p[1] != ']') { if (*p == '-' && p[1] != ']') {
const uint8_t *p0 = p + 1; const uint8_t *p0 = p + 1;
if (c1 >= CLASS_RANGE_BASE) { if (c1 >= CLASS_RANGE_BASE) {
if (s->is_utf16) { if (s->is_unicode) {
cr_free(cr1); cr_free(cr1);
goto invalid_class_range; goto invalid_class_range;
} }
@ -890,7 +890,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
goto fail; goto fail;
if (c2 >= CLASS_RANGE_BASE) { if (c2 >= CLASS_RANGE_BASE) {
cr_free(cr1); cr_free(cr1);
if (s->is_utf16) { if (s->is_unicode) {
goto invalid_class_range; goto invalid_class_range;
} }
/* Annex B: match '-' character */ /* Annex B: match '-' character */
@ -1248,7 +1248,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
re_emit_op(s, REOP_prev); re_emit_op(s, REOP_prev);
break; break;
case '{': case '{':
if (s->is_utf16) { if (s->is_unicode) {
return re_parse_error(s, "syntax error"); return re_parse_error(s, "syntax error");
} else if (!is_digit(p[1])) { } else if (!is_digit(p[1])) {
/* Annex B: we accept '{' not followed by digits as a /* Annex B: we accept '{' not followed by digits as a
@ -1300,7 +1300,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
lookahead: lookahead:
/* Annex B allows lookahead to be used as an atom for /* Annex B allows lookahead to be used as an atom for
the quantifiers */ the quantifiers */
if (!s->is_utf16 && !is_backward_lookahead) { if (!s->is_unicode && !is_backward_lookahead) {
last_atom_start = s->byte_code.size; last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count; last_capture_count = s->capture_count;
} }
@ -1376,7 +1376,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
/* annex B: we tolerate invalid group names in non /* annex B: we tolerate invalid group names in non
unicode mode if there is no named capture unicode mode if there is no named capture
definition */ definition */
if (s->is_utf16 || re_has_named_captures(s)) if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "expecting group name"); return re_parse_error(s, "expecting group name");
else else
goto parse_class_atom; goto parse_class_atom;
@ -1384,7 +1384,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
p1 += 3; p1 += 3;
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
&p1)) { &p1)) {
if (s->is_utf16 || re_has_named_captures(s)) if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "invalid group name"); return re_parse_error(s, "invalid group name");
else else
goto parse_class_atom; goto parse_class_atom;
@ -1395,7 +1395,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
after (inefficient, but hopefully not common */ after (inefficient, but hopefully not common */
c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
if (c < 0) { if (c < 0) {
if (s->is_utf16 || re_has_named_captures(s)) if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "group name not defined"); return re_parse_error(s, "group name not defined");
else else
goto parse_class_atom; goto parse_class_atom;
@ -1407,7 +1407,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
case '0': case '0':
p += 2; p += 2;
c = 0; c = 0;
if (s->is_utf16) { if (s->is_unicode) {
if (is_digit(*p)) { if (is_digit(*p)) {
return re_parse_error(s, "invalid decimal escape in regular expression"); return re_parse_error(s, "invalid decimal escape in regular expression");
} }
@ -1429,7 +1429,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
c = parse_digits(&p, FALSE); c = parse_digits(&p, FALSE);
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
if (!s->is_utf16) { if (!s->is_unicode) {
/* Annex B.1.4: accept legacy octal */ /* Annex B.1.4: accept legacy octal */
p = q; p = q;
if (*p <= '7') { if (*p <= '7') {
@ -1471,7 +1471,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
break; break;
case ']': case ']':
case '}': case '}':
if (s->is_utf16) if (s->is_unicode)
return re_parse_error(s, "syntax error"); return re_parse_error(s, "syntax error");
goto parse_class_atom; goto parse_class_atom;
default: default:
@ -1493,7 +1493,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
return -1; return -1;
} else { } else {
if (s->ignore_case) if (s->ignore_case)
c = lre_canonicalize(c, s->is_utf16); c = lre_canonicalize(c, s->is_unicode);
if (c <= 0x7f) if (c <= 0x7f)
re_emit_op_u8(s, REOP_char8, c); re_emit_op_u8(s, REOP_char8, c);
else if (c <= 0xffff) else if (c <= 0xffff)
@ -1531,7 +1531,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
/* As an extension (see ES6 annex B), we accept '{' not /* As an extension (see ES6 annex B), we accept '{' not
followed by digits as a normal atom */ followed by digits as a normal atom */
if (!is_digit(p[1])) { if (!is_digit(p[1])) {
if (s->is_utf16) if (s->is_unicode)
goto invalid_quant_count; goto invalid_quant_count;
break; break;
} }
@ -1550,7 +1550,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
quant_max = INT32_MAX; /* infinity */ quant_max = INT32_MAX; /* infinity */
} }
} }
if (*p != '}' && !s->is_utf16) { if (*p != '}' && !s->is_unicode) {
/* Annex B: normal atom if invalid '{' syntax */ /* Annex B: normal atom if invalid '{' syntax */
p = p1; p = p1;
break; break;
@ -1839,7 +1839,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
s->buf_end = s->buf_ptr + buf_len; s->buf_end = s->buf_ptr + buf_len;
s->buf_start = s->buf_ptr; s->buf_start = s->buf_ptr;
s->re_flags = re_flags; s->re_flags = re_flags;
s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0); s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
@ -2039,7 +2039,7 @@ typedef struct {
int stack_size_max; int stack_size_max;
BOOL multi_line; BOOL multi_line;
BOOL ignore_case; BOOL ignore_case;
BOOL is_utf16; BOOL is_unicode;
void *opaque; /* used for stack overflow check */ void *opaque; /* used for stack overflow check */
size_t state_size; size_t state_size;
@ -2189,7 +2189,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16); c = lre_canonicalize(c, s->is_unicode);
} }
if (val != c) if (val != c)
goto no_match; goto no_match;
@ -2346,8 +2346,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
GET_CHAR(c1, cptr1, cptr1_end); GET_CHAR(c1, cptr1, cptr1_end);
GET_CHAR(c2, cptr, cbuf_end); GET_CHAR(c2, cptr, cbuf_end);
if (s->ignore_case) { if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_utf16); c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_utf16); c2 = lre_canonicalize(c2, s->is_unicode);
} }
if (c1 != c2) if (c1 != c2)
goto no_match; goto no_match;
@ -2360,8 +2360,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
GET_PREV_CHAR(c1, cptr1, cptr1_start); GET_PREV_CHAR(c1, cptr1, cptr1_start);
GET_PREV_CHAR(c2, cptr, s->cbuf); GET_PREV_CHAR(c2, cptr, s->cbuf);
if (s->ignore_case) { if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_utf16); c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_utf16); c2 = lre_canonicalize(c2, s->is_unicode);
} }
if (c1 != c2) if (c1 != c2)
goto no_match; goto no_match;
@ -2380,7 +2380,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16); c = lre_canonicalize(c, s->is_unicode);
} }
idx_min = 0; idx_min = 0;
low = get_u16(pc + 0 * 4); low = get_u16(pc + 0 * 4);
@ -2420,7 +2420,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16); c = lre_canonicalize(c, s->is_unicode);
} }
idx_min = 0; idx_min = 0;
low = get_u32(pc + 0 * 8); low = get_u32(pc + 0 * 8);
@ -2512,13 +2512,13 @@ int lre_exec(uint8_t **capture,
re_flags = bc_buf[RE_HEADER_FLAGS]; re_flags = bc_buf[RE_HEADER_FLAGS];
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0; s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
s->cbuf = cbuf; s->cbuf = cbuf;
s->cbuf_end = cbuf + (clen << cbuf_type); s->cbuf_end = cbuf + (clen << cbuf_type);
s->cbuf_type = cbuf_type; s->cbuf_type = cbuf_type;
if (s->cbuf_type == 1 && s->is_utf16) if (s->cbuf_type == 1 && s->is_unicode)
s->cbuf_type = 2; s->cbuf_type = 2;
s->opaque = opaque; s->opaque = opaque;

View file

@ -34,7 +34,7 @@
#define LRE_FLAG_IGNORECASE (1 << 1) #define LRE_FLAG_IGNORECASE (1 << 1)
#define LRE_FLAG_MULTILINE (1 << 2) #define LRE_FLAG_MULTILINE (1 << 2)
#define LRE_FLAG_DOTALL (1 << 3) #define LRE_FLAG_DOTALL (1 << 3)
#define LRE_FLAG_UTF16 (1 << 4) #define LRE_FLAG_UNICODE (1 << 4)
#define LRE_FLAG_STICKY (1 << 5) #define LRE_FLAG_STICKY (1 << 5)
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */ #define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */

View file

@ -40450,7 +40450,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
mask = LRE_FLAG_DOTALL; mask = LRE_FLAG_DOTALL;
break; break;
case 'u': case 'u':
mask = LRE_FLAG_UTF16; mask = LRE_FLAG_UNICODE;
break; break;
case 'y': case 'y':
mask = LRE_FLAG_STICKY; mask = LRE_FLAG_STICKY;
@ -40468,7 +40468,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
JS_FreeCString(ctx, str); JS_FreeCString(ctx, str);
} }
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16)); str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
if (!str) if (!str)
return JS_EXCEPTION; return JS_EXCEPTION;
re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg, re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
@ -41113,7 +41113,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon
break; break;
} }
if (end == start) { if (end == start) {
if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) { if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) {
end++; end++;
} else { } else {
string_getc(str, &end); string_getc(str, &end);
@ -41873,7 +41873,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ), JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ),
JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ), JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ), JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ), JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ), JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ), JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
JS_CFUNC_DEF("exec", 1, js_regexp_exec ), JS_CFUNC_DEF("exec", 1, js_regexp_exec ),