Simpler utf8_decode (#414)

- no longer pass the array length to `utf8_decode`
- add `utf8_decode_len` for border cases
- use switch-based dispatch in `utf8_decode_len` to work around
  a gcc 12.2 optimizer bug
2024-05-27 08:15:52 +02:00 · 2024-05-27 08:15:52 +02:00 · 921c1eef50
commit 921c1eef50
parent 9e67b47c0d
5 changed files with 53 additions and 37 deletions
--- a/cutils.c
+++ b/cutils.c
@ -276,7 +276,6 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
 /* Decode a single code point from a UTF-8 encoded array of bytes
   `p` is a valid pointer to an array of bytes
   `max_len` is the number of bytes available in the array
   `pp` is a valid pointer to a `const uint8_t *` to store a pointer
   to the byte following the current sequence.
   Return the code point at `p`, in the range `0..0x10FFFF`
@ -284,9 +283,12 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
   The maximum length for a UTF-8 byte sequence is 4 bytes.
   This implements the algorithm specified in whatwg.org, except it accepts
   UTF-8 encoded surrogates as JavaScript allows them in strings.
   The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
   or be null terminated.
   If `p[0]` is '\0', the return value is `0` and the byte is consumed.
   cf: https://encoding.spec.whatwg.org/#utf-8-decoder
 */
-uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
+uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
 {
    uint32_t c;
    uint8_t lower, upper;
@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
    case 0xD4: case 0xD5: case 0xD6: case 0xD7:
    case 0xD8: case 0xD9: case 0xDA: case 0xDB:
    case 0xDC: case 0xDD: case 0xDE: case 0xDF:
        if (max_len < 2) {
            // need more bytes
            break;
        }
        if (*p >= 0x80 && *p <= 0xBF) {
            *pp = p + 1;
            return ((c - 0xC0) << 6) + (*p - 0x80);
@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
    case 0xEC: case 0xED: case 0xEE: case 0xEF:
        lower = 0x80;
    need2:
        if (max_len < 3) {
            // need more bytes
            break;
        }
        if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
            *pp = p + 2;
            return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
        lower = 0x80;
        upper = 0xBF;
    need3:
        if (max_len < 4) {
            // need more bytes
            break;
        }
        if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
        &&  p[2] >= 0x80 && p[2] <= 0xBF) {
            *pp = p + 3;
@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
    return 0xFFFD;
 }
 /* Decode a single code point from a UTF-8 encoded array of bytes whose
    available length is known.
    `p` is a pointer to the array of bytes
    `max_len` is the number of bytes available at `p`
    `pp` is a pointer to a `const uint8_t *` that receives the position
    following the consumed bytes.
    Intended for callers that cannot guarantee the `utf8_decode`
    requirement of UTF8_CHAR_LEN_MAX readable bytes or a null terminator.
    The lead byte `*p` is compared against `max_len`: `utf8_decode` is
    called only when `max_len` is at least 4, or when the lead byte is
    below the threshold for the given `max_len` (0x80 for 1, 0xE0 for 2,
    0xF0 for 3).
    Otherwise 0xFFFD is returned: nothing is consumed if `max_len` is 0,
    exactly one byte is consumed in the other cases.
    The dispatch is deliberately written as a switch: this form works
    around a gcc 12.2 optimizer bug.
  */
 uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {
    switch (max_len) {
    case 0:
        /* empty input: report an error without consuming anything */
        *pp = p;
        return 0xFFFD;
    case 1:
        /* one byte available: only a lead byte below 0x80 is decoded */
        if (*p < 0x80)
            goto good;
        break;
    case 2:
        /* two bytes available: lead bytes from 0xE0 up are rejected */
        if (*p < 0xE0)
            goto good;
        break;
    case 3:
        /* three bytes available: lead bytes from 0xF0 up are rejected */
        if (*p < 0xF0)
            goto good;
        break;
    default:
        /* four or more bytes available: always delegate */
    good:
        return utf8_decode(p, pp);
    }
    /* lead byte rejected for this `max_len`: consume that single byte */
    *pp = p + 1;
    return 0xFFFD;
 }
 /* Scan a UTF-8 encoded buffer for content type
   `buf` is a valid pointer to a UTF-8 encoded string
   `len` is the number of bytes to scan
@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
            len++;
            if (*p++ >= 0x80) {
                /* parse UTF-8 sequence, check for encoding error */
-                uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
+                uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
                if (p_next == p)
                    kind |= UTF8_HAS_ERRORS;
                p = p_next;
@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
        uint32_t c = *p++;
        if (c >= 0x80) {
            /* parse utf-8 sequence */
-            c = utf8_decode(p - 1, p_end - (p - 1), &p);
+            c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
            /* encoding errors are converted as 0xFFFD and use a single byte */
            if (c > 0xFFFF) {
                if (i < dest_len)
--- a/cutils.h
+++ b/cutils.h
@ -401,7 +401,8 @@ enum {
 int utf8_scan(const char *buf, size_t len, size_t *plen);
 size_t utf8_encode_len(uint32_t c);
 size_t utf8_encode(uint8_t *buf, uint32_t c);
-uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
+uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
 uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
 size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
 size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
 size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
--- a/libregexp.c
+++ b/libregexp.c
@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
    normal_char:
        p++;
        if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p)
                return re_parse_error(s, "invalid UTF-8 sequence");
            p = p_next;
@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
        } else if (c == '>') {
            break;
        } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p)
                return -1;
            p = p_next;
            if (is_hi_surrogate(c)) {
-                d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                d = utf8_decode(p, &p_next);
                if (is_lo_surrogate(d)) {
                    c = from_surrogate(c, d);
                    p = p_next;
--- a/quickjs-libc.c
+++ b/quickjs-libc.c
@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
                    string_arg = JS_ToCString(ctx, argv[i++]);
                    if (!string_arg)
                        goto fail;
-                    int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
+                    int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
                    JS_FreeCString(ctx, string_arg);
                } else {
                    if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
--- a/quickjs.c
+++ b/quickjs.c
@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
            if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
                break;
        } else {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            /* no need to test for invalid UTF-8, 0xFFFD is not a space */
            if (!lre_is_space(c))
                break;
@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
            s->eol = &p[-1];
            s->mark = p;
        } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p) {
                js_parse_error(s, "invalid UTF-8 sequence");
                goto fail;
@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                    }
                    goto fail;
                } else if (c >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                    c = utf8_decode(p, &p_next);
                    if (p_next == p + 1) {
                        goto invalid_utf8;
                    }
@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                break;
            }
        } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p)
                goto invalid_utf8;
            p = p_next;
@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
            else if (c == '\0' && p >= s->buf_end)
                goto eof_error;
            else if (c >= 0x80) {
-                c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+                c = utf8_decode(p - 1, &p_next);
                if (p_next == p) {
                    goto invalid_utf8;
                }
@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
                    goto eol_error;
            }
        } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p) {
            invalid_utf8:
                js_parse_error(s, "invalid UTF-8 sequence");
@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)
    /* flags */
    for(;;) {
-        c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+        c = utf8_decode(p, &p_next);
        /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
        if (!lre_js_is_ident_next(c))
            break;
@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
            c = lre_parse_escape(&p_next, TRUE);
            *pident_has_escape = TRUE;
        } else if (c >= 0x80) {
-            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p, &p_next);
            /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
        }
        if (!lre_js_is_ident_next(c))
@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
                    s->got_lf = TRUE; /* considered as LF for ASI */
                    p++;
                } else if (*p >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, &p);
                    /* ignore invalid UTF-8 in comments */
                    if (c == CP_LS || c == CP_PS) {
                        s->got_lf = TRUE; /* considered as LF for ASI */
@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
                if (*p == '\r' || *p == '\n')
                    break;
                if (*p >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, &p);
                    /* ignore invalid UTF-8 in comments */
                    /* LS or PS are considered as line terminator */
                    if (c == CP_LS || c == CP_PS) {
@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
            if (c == '\\' && *p_next == 'u') {
                c = lre_parse_escape(&p_next, TRUE);
            } else if (c >= 0x80) {
-                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                c = utf8_decode(p, &p_next);
                if (p_next == p + 1)
                    goto invalid_utf8;
            }
@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
                goto fail;
            /* reject `10instanceof Number` */
            if (JS_VALUE_IS_NAN(ret) ||
-                lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
+                lre_js_is_ident_next(utf8_decode(p, &p_next))) {
                JS_FreeValue(s->ctx, ret);
                js_parse_error(s, "invalid number literal");
                goto fail;
@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
        break;
    default:
        if (c >= 0x80) {  /* non-ASCII code-point */
-            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p, &p_next);
            if (p_next == p + 1)
                goto invalid_utf8;
            p = p_next;
@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
            }
        } else
        if (c >= 0x80) {
-            c = utf8_decode(p - 1, s->buf_end - p, &p_next);
+            c = utf8_decode(p - 1, &p_next);
            if (p_next == p) {
                json_parse_error(s, p - 1, "Bad UTF-8 sequence");
                goto fail;
@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
        break;
    default:
        if (c >= 0x80) {
-            c = utf8_decode(p, s->buf_end - p, &p_next);
+            c = utf8_decode(p, &p_next);
            if (p_next == p + 1) {
                js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
            } else {
@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
            if (*p == '\n' || *p == '\r') {
                break;
            } else if (*p >= 0x80) {
-                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                c = utf8_decode(p, &p);
                /* purposely ignore UTF-8 encoding errors in this comment line */
                if (c == CP_LS || c == CP_PS)
                    break;