From 139b51fe4b49a7f72e5111430955698101cfbf54 Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Sun, 26 May 2024 00:17:04 +0200 Subject: [PATCH] Simplify number parsing (#386) - use single test in `js_strtod` loop. - use more explicit `ATOD_xxx` flags - remove `ATOD_TYPE_MASK`, use `ATOD_WANT_BIG_INT` instead - remove unused arguments `flags` and `pexponent` in `js_string_to_bigint` - merge `js_atof` and `js_atof2`, remove `slimb_t *pexponent` argument - simplify and document `js_atof` parser, remove cumbersome labels, - simplify `js_parseInt` test for zero radix for `ATOD_ACCEPT_HEX_PREFIX` - simplify `next_token` number parsing, handle legacy octal in parser only - simplify `JS_StringToBigInt`, use flags only. - remove unused `slimb_t exponent` token field - add number syntax tests --- quickjs.c | 374 ++++++++++++++++++----------------------- tests/test_language.js | 21 +++ 2 files changed, 189 insertions(+), 206 deletions(-) diff --git a/quickjs.c b/quickjs.c index 54d1778..1d01eb6 100644 --- a/quickjs.c +++ b/quickjs.c @@ -10103,10 +10103,7 @@ static double js_strtod(const char *str, int radix, BOOL is_float) n_max = ((uint64_t)-1 - (radix - 1)) / radix; /* XXX: could be more precise */ int_exp = 0; - while (*p != '\0') { - c = to_digit((uint8_t)*p); - if (c >= radix) - break; + while ((c = to_digit(*p)) < radix) { if (n <= n_max) { n = n * radix + c; } else { @@ -10129,26 +10126,9 @@ static double js_strtod(const char *str, int radix, BOOL is_float) return d; } -#define ATOD_INT_ONLY (1 << 0) -/* accept Oo and Ob prefixes in addition to 0x prefix if radix = 0 */ -#define ATOD_ACCEPT_BIN_OCT (1 << 2) -/* accept O prefix as octal if radix == 0 and properly formed (Annex B) */ -#define ATOD_ACCEPT_LEGACY_OCTAL (1 << 4) -/* accept _ between digits as a digit separator */ -#define ATOD_ACCEPT_UNDERSCORES (1 << 5) -/* allow a suffix to override the type */ -#define ATOD_ACCEPT_SUFFIX (1 << 6) -/* default type */ -#define ATOD_TYPE_MASK (1 << 7) -#define ATOD_TYPE_FLOAT64 (0 << 7) -#define ATOD_TYPE_BIG_INT (1 << 7) -/* accept -0x1 */ -#define ATOD_ACCEPT_PREFIX_AFTER_SIGN (1 << 10) - -static JSValue js_string_to_bigint(JSContext *ctx, const char *buf, - int radix, int flags, slimb_t *pexponent) +static JSValue js_string_to_bigint(JSContext *ctx, const char *buf, int radix) { - bf_t a_s, *a = &a_s; + bf_t *a; int ret; JSValue val; val = JS_NewBigInt(ctx); @@ -10160,136 +10140,134 @@ static JSValue js_string_to_bigint(JSContext *ctx, const char *buf, JS_FreeValue(ctx, val); return JS_ThrowOutOfMemory(ctx); } - val = JS_CompactBigInt1(ctx, val); - return val; + return JS_CompactBigInt1(ctx, val); } -/* return an exception in case of memory error. Return JS_NAN if - invalid syntax */ -static JSValue js_atof2(JSContext *ctx, const char *str, const char **pp, - int radix, int flags, slimb_t *pexponent) +/* `js_atof(ctx, p, end, pp, radix, flags)` + Return an exception in case of memory error. + Return `JS_NAN` if invalid syntax. + - `p` points to a null terminated UTF-8 encoded char array + - `end` points to the end of the array. + - `pp` if not null receives a pointer to the next character + - `radix` must be in range 2 to 36, else return `JS_NAN` + - `flags` is a combination of the flags below + There is a null byte at `*end`, but there might be embedded null bytes + between `p` and `end` which must produce `JS_NAN` if the + `ATOD_NO_TRAILING_CHARS` flag is not present. + */ + +#define ATOD_TRIM_SPACES (1 << 0) /* trim white space */ +#define ATOD_ACCEPT_EMPTY (1 << 1) /* accept an empty string, value is 0 */ +#define ATOD_ACCEPT_FLOAT (1 << 2) /* parse decimal floating point syntax */ +#define ATOD_ACCEPT_INFINITY (1 << 3) /* parse Infinity as a float point number */ +#define ATOD_ACCEPT_BIN_OCT (1 << 4) /* accept 0o and 0b prefixes */ +#define ATOD_ACCEPT_HEX_PREFIX (1 << 5) /* accept 0x prefix for radix 16 */ +#define ATOD_ACCEPT_UNDERSCORES (1 << 6) /* accept _ between digits as a digit separator */ +#define ATOD_ACCEPT_SUFFIX (1 << 7) /* allow 'n' suffix to produce BigInt */ +#define ATOD_WANT_BIG_INT (1 << 8) /* return type must be BigInt */ +#define ATOD_DECIMAL_AFTER_SIGN (1 << 9) /* only accept decimal number after sign */ +#define ATOD_NO_TRAILING_CHARS (1 << 10) /* do not accept trailing characters */ + +static JSValue js_atof(JSContext *ctx, const char *p, const char *end, + const char **pp, int radix, int flags) { - const char *p, *p_start; - int sep, is_neg; - BOOL is_float, has_legacy_octal; - int atod_type = flags & ATOD_TYPE_MASK; - char buf1[64], *buf; - int i, j, len; - BOOL buf_allocated = FALSE; - JSValue val; + const char *p_start; + int sep; + BOOL is_float; + char buf1[64], *buf = buf1; + size_t i, j, len; + JSValue val = JS_NAN; + double d; + char sign; + + if (radix < 2 || radix > 36) + goto done; /* optional separator between digits */ sep = (flags & ATOD_ACCEPT_UNDERSCORES) ? '_' : 256; - has_legacy_octal = FALSE; - - p = str; - p_start = p; - is_neg = 0; - if (p[0] == '+') { + sign = 0; + if (flags & ATOD_TRIM_SPACES) + p += skip_spaces(p); + if (p == end && (flags & ATOD_ACCEPT_EMPTY)) { + if (pp) *pp = p; + if (flags & ATOD_WANT_BIG_INT) + return JS_NewBigInt64(ctx, 0); + else + return js_int32(0); + } + if (*p == '+' || *p == '-') { + sign = *p; p++; - p_start++; - if (!(flags & ATOD_ACCEPT_PREFIX_AFTER_SIGN)) - goto no_radix_prefix; - } else if (p[0] == '-') { - p++; - p_start++; - is_neg = 1; - if (!(flags & ATOD_ACCEPT_PREFIX_AFTER_SIGN)) - goto no_radix_prefix; + if (flags & ATOD_DECIMAL_AFTER_SIGN) + flags &= ~(ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT); } if (p[0] == '0') { if ((p[1] == 'x' || p[1] == 'X') && - (radix == 0 || radix == 16)) { + ((flags & ATOD_ACCEPT_HEX_PREFIX) || radix == 16)) { p += 2; radix = 16; - } else if ((p[1] == 'o' || p[1] == 'O') && - radix == 0 && (flags & ATOD_ACCEPT_BIN_OCT)) { - p += 2; - radix = 8; - } else if ((p[1] == 'b' || p[1] == 'B') && - radix == 0 && (flags & ATOD_ACCEPT_BIN_OCT)) { - p += 2; - radix = 2; - } else if ((p[1] >= '0' && p[1] <= '9') && - radix == 0 && (flags & ATOD_ACCEPT_LEGACY_OCTAL)) { - int i; - has_legacy_octal = TRUE; - sep = 256; - for (i = 1; (p[i] >= '0' && p[i] <= '7'); i++) - continue; - if (p[i] == '8' || p[i] == '9') - goto no_prefix; - p += 1; - radix = 8; - } else { - goto no_prefix; + } else if (flags & ATOD_ACCEPT_BIN_OCT) { + if (p[1] == 'o' || p[1] == 'O') { + p += 2; + radix = 8; + } else if (p[1] == 'b' || p[1] == 'B') { + p += 2; + radix = 2; + } } - /* there must be a digit after the prefix */ - if (to_digit((uint8_t)*p) >= radix) - goto fail; - no_prefix: ; } else { - no_radix_prefix: - if (!(flags & ATOD_INT_ONLY) && - atod_type == ATOD_TYPE_FLOAT64 && - strstart(p, "Infinity", &p)) { - double d = INF; - if (is_neg) + if (*p == 'I' && (flags & ATOD_ACCEPT_INFINITY) && strstart(p, "Infinity", &p)) { + d = INF; + if (sign == '-') d = -d; val = js_float64(d); goto done; } } - if (radix == 0) - radix = 10; is_float = FALSE; p_start = p; - while (to_digit((uint8_t)*p) < radix - || (*p == sep && (radix != 10 || - p != p_start + 1 || p[-1] != '0') && - to_digit((uint8_t)p[1]) < radix)) { + while (to_digit(*p) < radix) { p++; + if (*p == sep && to_digit(p[1]) < radix) + p++; } - if (!(flags & ATOD_INT_ONLY) && radix == 10) { - if (*p == '.' && (p > p_start || to_digit((uint8_t)p[1]) < radix)) { + if ((flags & ATOD_ACCEPT_FLOAT) && radix == 10) { + if (*p == '.' && (p > p_start || to_digit(p[1]) < radix)) { is_float = TRUE; p++; - if (*p == sep) - goto fail; - while (to_digit((uint8_t)*p) < radix || - (*p == sep && to_digit((uint8_t)p[1]) < radix)) + while (to_digit(*p) < radix) { p++; + if (*p == sep && to_digit(p[1]) < radix) + p++; + } } if (p > p_start && (*p == 'e' || *p == 'E')) { - const char *p1 = p + 1; - is_float = TRUE; - if (*p1 == '+') { - p1++; - } else if (*p1 == '-') { - p1++; + i = 1; + if (p[1] == '+' || p[1] == '-') { + i++; } - if (is_digit((uint8_t)*p1)) { - p = p1 + 1; - while (is_digit((uint8_t)*p) || (*p == sep && is_digit((uint8_t)p[1]))) + if (is_digit(p[i])) { + is_float = TRUE; + p += i + 1; + while (is_digit(*p) || (*p == sep && is_digit(p[1]))) p++; } } } if (p == p_start) - goto fail; + goto done; - buf = buf1; - buf_allocated = FALSE; len = p - p_start; if (unlikely((len + 2) > sizeof(buf1))) { buf = js_malloc_rt(ctx->rt, len + 2); /* no exception raised */ - if (!buf) - goto mem_error; - buf_allocated = TRUE; + if (!buf) { + if (pp) *pp = p; + return JS_ThrowOutOfMemory(ctx); + } } - /* remove the separators and the radix prefixes */ + /* remove the separators and the radix prefix */ j = 0; - if (is_neg) + if (sign == '-') buf[j++] = '-'; for (i = 0; i < len; i++) { if (p_start[i] != '_') @@ -10300,46 +10278,31 @@ static JSValue js_atof2(JSContext *ctx, const char *str, const char **pp, if (flags & ATOD_ACCEPT_SUFFIX) { if (*p == 'n') { p++; - atod_type = ATOD_TYPE_BIG_INT; + flags |= ATOD_WANT_BIG_INT; } } - switch(atod_type) { - case ATOD_TYPE_FLOAT64: - { - double d; - d = js_strtod(buf, radix, is_float); - /* return int or float64 */ - val = js_number(d); - } - break; - case ATOD_TYPE_BIG_INT: - if (has_legacy_octal || is_float) - goto fail; - val = js_string_to_bigint(ctx, buf, radix, flags, NULL); - break; - default: - abort(); + if (flags & ATOD_WANT_BIG_INT) { + if (!is_float) + val = js_string_to_bigint(ctx, buf, radix); + } else { + d = js_strtod(buf, radix, is_float); + val = js_number(d); /* return int or float64 */ } -done: - if (buf_allocated) + done: + if (flags & ATOD_NO_TRAILING_CHARS) { + if (flags & ATOD_TRIM_SPACES) + p += skip_spaces(p); + if (p != end) { + JS_FreeValue(ctx, val); + val = JS_NAN; + } + } + if (buf != buf1) js_free_rt(ctx->rt, buf); - if (pp) - *pp = p; + if (pp) *pp = p; return val; - fail: - val = JS_NAN; - goto done; - mem_error: - val = JS_ThrowOutOfMemory(ctx); - goto done; -} - -static JSValue js_atof(JSContext *ctx, const char *str, const char **pp, - int radix, int flags) -{ - return js_atof2(ctx, str, pp, radix, flags, NULL); } typedef enum JSToNumberHintEnum { @@ -10383,28 +10346,18 @@ static JSValue JS_ToNumberHintFree(JSContext *ctx, JSValue val, case JS_TAG_STRING: { const char *str; - const char *p; size_t len; + int flags; str = JS_ToCStringLen(ctx, &len, val); JS_FreeValue(ctx, val); if (!str) return JS_EXCEPTION; - p = str; - p += skip_spaces(p); - if ((p - str) == len) { - ret = js_int32(0); - } else { - int flags = ATOD_ACCEPT_BIN_OCT; - ret = js_atof(ctx, p, &p, 0, flags); - if (!JS_IsException(ret)) { - p += skip_spaces(p); - if ((p - str) != len) { - JS_FreeValue(ctx, ret); - ret = JS_NAN; - } - } - } + flags = ATOD_TRIM_SPACES | ATOD_ACCEPT_EMPTY | + ATOD_ACCEPT_FLOAT | ATOD_ACCEPT_INFINITY | + ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT | + ATOD_DECIMAL_AFTER_SIGN | ATOD_NO_TRAILING_CHARS; + ret = js_atof(ctx, str, str + len, NULL, 10, flags); JS_FreeCString(ctx, str); } break; @@ -11827,7 +11780,7 @@ static bf_t *JS_ToBigInt1(JSContext *ctx, bf_t *buf, JSValue val) /* return NaN if bad bigint literal */ static JSValue JS_StringToBigInt(JSContext *ctx, JSValue val) { - const char *str, *p; + const char *str; size_t len; int flags; @@ -11835,21 +11788,11 @@ static JSValue JS_StringToBigInt(JSContext *ctx, JSValue val) JS_FreeValue(ctx, val); if (!str) return JS_EXCEPTION; - p = str; - p += skip_spaces(p); - if ((p - str) == len) { - val = JS_NewBigInt64(ctx, 0); - } else { - flags = ATOD_INT_ONLY | ATOD_ACCEPT_BIN_OCT | ATOD_TYPE_BIG_INT; - val = js_atof(ctx, p, &p, 0, flags); - p += skip_spaces(p); - if (!JS_IsException(val)) { - if ((p - str) != len) { - JS_FreeValue(ctx, val); - val = JS_NAN; - } - } - } + flags = ATOD_WANT_BIG_INT | + ATOD_TRIM_SPACES | ATOD_ACCEPT_EMPTY | + ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT | + ATOD_DECIMAL_AFTER_SIGN | ATOD_NO_TRAILING_CHARS; + val = js_atof(ctx, str, str + len, NULL, 10, flags); JS_FreeCString(ctx, str); return val; } @@ -18428,7 +18371,6 @@ typedef struct JSToken { } str; struct { JSValue val; - slimb_t exponent; /* may be != 0 only if val is a float */ } num; struct { JSAtom atom; @@ -19030,6 +18972,7 @@ static __exception int next_token(JSParseState *s) int c; BOOL ident_has_escape; JSAtom atom; + int flags, radix; if (js_check_stack_overflow(s->ctx->rt, 1000)) { JS_ThrowStackOverflow(s->ctx); @@ -19251,31 +19194,50 @@ static __exception int next_token(JSParseState *s) break; } if (p[1] >= '0' && p[1] <= '9') { + flags = ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_FLOAT; + radix = 10; goto parse_number; - } else { - goto def_token; } - break; + goto def_token; case '0': - /* in strict mode, octal literals are not accepted */ - if (is_digit(p[1]) && (s->cur_func->js_mode & JS_MODE_STRICT)) { - js_parse_error(s, "octal literals are deprecated in strict mode"); + if (is_digit(p[1])) { /* handle legacy octal */ + if (s->cur_func->js_mode & JS_MODE_STRICT) { + js_parse_error(s, "Octal literals are not allowed in strict mode"); + goto fail; + } + /* Legacy octal: no separators, no suffix, no floats, + base 8 unless non octal digits are detected */ + flags = 0; + radix = 8; + while (is_digit(*p)) { + if (*p >= '8' && *p <= '9') + radix = 10; + p++; + } + p = s->token.ptr; + goto parse_number; + } + if (p[1] == '_') { + js_parse_error(s, "Numeric separator can not be used after leading 0"); goto fail; } + flags = ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT | + ATOD_ACCEPT_FLOAT | ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX; + radix = 10; goto parse_number; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* number */ - parse_number: { JSValue ret; - int flags; - flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL | - ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX; - s->token.u.num.exponent = 0; - ret = js_atof2(s->ctx, (const char *)p, (const char **)&p, 0, - flags, &s->token.u.num.exponent); + const uint8_t *p1; + + flags = ATOD_ACCEPT_FLOAT | ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX; + radix = 10; + parse_number: + ret = js_atof(s->ctx, (const char *)p, (const char *)s->buf_end, + (const char **)&p, radix, flags); if (JS_IsException(ret)) goto fail; /* reject `10instanceof Number` */ @@ -39137,25 +39099,24 @@ static const JSCFunctionListEntry js_number_proto_funcs[] = { static JSValue js_parseInt(JSContext *ctx, JSValue this_val, int argc, JSValue *argv) { - const char *str, *p; + const char *str; int radix, flags; JSValue ret; + size_t len; - str = JS_ToCString(ctx, argv[0]); + str = JS_ToCStringLen(ctx, &len, argv[0]); if (!str) return JS_EXCEPTION; if (JS_ToInt32(ctx, &radix, argv[1])) { JS_FreeCString(ctx, str); return JS_EXCEPTION; } - if (radix != 0 && (radix < 2 || radix > 36)) { - ret = JS_NAN; - } else { - p = str; - p += skip_spaces(p); - flags = ATOD_INT_ONLY | ATOD_ACCEPT_PREFIX_AFTER_SIGN; - ret = js_atof(ctx, p, NULL, radix, flags); + flags = ATOD_TRIM_SPACES; + if (radix == 0) { + flags |= ATOD_ACCEPT_HEX_PREFIX; // Only 0x and 0X are supported + radix = 10; } + ret = js_atof(ctx, str, str + len, NULL, radix, flags); JS_FreeCString(ctx, str); return ret; } @@ -39163,15 +39124,16 @@ static JSValue js_parseInt(JSContext *ctx, JSValue this_val, static JSValue js_parseFloat(JSContext *ctx, JSValue this_val, int argc, JSValue *argv) { - const char *str, *p; + const char *str; JSValue ret; + int flags; + size_t len; - str = JS_ToCString(ctx, argv[0]); + str = JS_ToCStringLen(ctx, &len, argv[0]); if (!str) return JS_EXCEPTION; - p = str; - p += skip_spaces(p); - ret = js_atof(ctx, p, NULL, 10, 0); + flags = ATOD_TRIM_SPACES | ATOD_ACCEPT_FLOAT | ATOD_ACCEPT_INFINITY; + ret = js_atof(ctx, str, str + len, NULL, 10, flags); JS_FreeCString(ctx, str); return ret; } diff --git a/tests/test_language.js b/tests/test_language.js index 717a7f4..eea0161 100644 --- a/tests/test_language.js +++ b/tests/test_language.js @@ -5,6 +5,10 @@ function assert(actual, expected, message) { if (actual === expected) return; + if (typeof actual == 'number' && isNaN(actual) + && typeof expected == 'number' && isNaN(expected)) + return; + if (actual !== null && expected !== null && typeof actual == 'object' && typeof expected == 'object' && actual.toString() === expected.toString()) @@ -616,6 +620,23 @@ function test_number_literals() assert(01.a, undefined); assert(0o1.a, undefined); test_expr('0.a', SyntaxError); + assert(parseInt("0_1"), 0); + assert(parseInt("1_0"), 1); + assert(parseInt("0_1", 8), 0); + assert(parseInt("1_0", 8), 1); + assert(parseFloat("0_1"), 0); + assert(parseFloat("1_0"), 1); + assert(1_0, 10); + assert(parseInt("Infinity"), NaN); + assert(parseFloat("Infinity"), Infinity); + assert(parseFloat("Infinity1"), Infinity); + assert(parseFloat("Infinity_"), Infinity); + assert(parseFloat("Infinity."), Infinity); + test_expr('0_0', SyntaxError); + test_expr('00_0', SyntaxError); + test_expr('01_0', SyntaxError); + test_expr('08_0', SyntaxError); + test_expr('09_0', SyntaxError); } function test_syntax()