From 921c1eef509db9b93d86af23165582ab69bfbb2f Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Mon, 27 May 2024 08:15:52 +0200 Subject: [PATCH] Simpler utf8_decode (#414) - no longer pass the array length to `utf8_decode` - add `utf8_decode_len` for border cases - use switch based dispatch in `utf8_decode_len` to work around a gcc 12.2 optimizer bug --- cutils.c | 47 +++++++++++++++++++++++++++++++---------------- cutils.h | 3 ++- libregexp.c | 6 +++--- quickjs-libc.c | 2 +- quickjs.c | 32 ++++++++++++++++---------------- 5 files changed, 53 insertions(+), 37 deletions(-) diff --git a/cutils.c b/cutils.c index bdd1256..56982df 100644 --- a/cutils.c +++ b/cutils.c @@ -276,7 +276,6 @@ size_t utf8_encode(uint8_t *buf, uint32_t c) /* Decode a single code point from a UTF-8 encoded array of bytes `p` is a valid pointer to an array of bytes - `max_len` is the number of bytes available in the array `pp` is a valid pointer to a `const uint8_t *` to store a pointer to the byte following the current sequence. Return the code point at `p`, in the range `0..0x10FFFF` @@ -284,9 +283,12 @@ size_t utf8_encode(uint8_t *buf, uint32_t c) The maximum length for a UTF-8 byte sequence is 4 bytes. This implements the algorithm specified in whatwg.org, except it accepts UTF-8 encoded surrogates as JavaScript allows them in strings. + The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes + or be null terminated. + If `p[0]` is '\0', the return value is `0` and the byte is consumed. cf: https://encoding.spec.whatwg.org/#utf-8-encoder */ -uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) +uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp) { uint32_t c; uint8_t lower, upper; @@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) case 0xD4: case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: case 0xDF: - if (max_len < 2) { - // need more bytes - break; - } if (*p >= 0x80 && *p <= 0xBF) { *pp = p + 1; return ((c - 0xC0) << 6) + (*p - 0x80); @@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) case 0xEC: case 0xED: case 0xEE: case 0xEF: lower = 0x80; need2: - if (max_len < 3) { - // need more bytes - break; - } if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) { *pp = p + 2; return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80); @@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) lower = 0x80; upper = 0xBF; need3: - if (max_len < 4) { - // need more bytes - break; - } if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF && p[2] >= 0x80 && p[2] <= 0xBF) { *pp = p + 3; @@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) return 0xFFFD; } +uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) { + switch (max_len) { + case 0: + *pp = p; + return 0xFFFD; + case 1: + if (*p < 0x80) + goto good; + break; + case 2: + if (*p < 0xE0) + goto good; + break; + case 3: + if (*p < 0xF0) + goto good; + break; + default: + good: + return utf8_decode(p, pp); + } + *pp = p + 1; + return 0xFFFD; +} + /* Scan a UTF-8 encoded buffer for content type `buf` is a valid pointer to a UTF-8 encoded string `len` is the number of bytes to scan @@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen) len++; if (*p++ >= 0x80) { /* parse UTF-8 sequence, check for encoding error */ - uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next); + uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next); if (p_next == p) kind |= UTF8_HAS_ERRORS; p = p_next; @@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_ uint32_t c = *p++; if (c >= 0x80) { /* parse utf-8 sequence */ - c = utf8_decode(p - 1, p_end - (p - 1), &p); + c = utf8_decode_len(p - 1, p_end - (p - 1), &p); /* encoding errors are converted as 0xFFFD and use a single byte */ if (c > 0xFFFF) { if (i < dest_len) diff --git a/cutils.h b/cutils.h index 6c7a0a3..a5da60d 100644 --- a/cutils.h +++ b/cutils.h @@ -401,7 +401,8 @@ enum { int utf8_scan(const char *buf, size_t len, size_t *plen); size_t utf8_encode_len(uint32_t c); size_t utf8_encode(uint8_t *buf, uint32_t c); -uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp); +uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp); +uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp); size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len); size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len); size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len); diff --git a/libregexp.c b/libregexp.c index 4b6aa48..a26b74c 100644 --- a/libregexp.c +++ b/libregexp.c @@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr, normal_char: p++; if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) return re_parse_error(s, "invalid UTF-8 sequence"); p = p_next; @@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) } else if (c == '>') { break; } else if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) return -1; p = p_next; if (is_hi_surrogate(c)) { - d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + d = utf8_decode(p, &p_next); if (is_lo_surrogate(d)) { c = from_surrogate(c, d); p = p_next; diff --git a/quickjs-libc.c b/quickjs-libc.c index d8ed30c..f5c496b 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx, string_arg = JS_ToCString(ctx, argv[i++]); if (!string_arg) goto fail; - int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p); + int32_arg = utf8_decode((const uint8_t *)string_arg, &p); JS_FreeCString(ctx, string_arg); } else { if (JS_ToInt32(ctx, &int32_arg, argv[i++])) diff --git a/quickjs.c b/quickjs.c index 8e49631..9adc713 100644 --- a/quickjs.c +++ b/quickjs.c @@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc) if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20))) break; } else { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); /* no need to test for invalid UTF-8, 0xFFFD is not a space */ if (!lre_is_space(c)) break; @@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s, s->eol = &p[-1]; s->mark = p; } else if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) { js_parse_error(s, "invalid UTF-8 sequence"); goto fail; @@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep, } goto fail; } else if (c >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p, &p_next); if (p_next == p + 1) { goto invalid_utf8; } @@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep, break; } } else if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) goto invalid_utf8; p = p_next; @@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s) else if (c == '\0' && p >= s->buf_end) goto eof_error; else if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) { goto invalid_utf8; } @@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s) goto eol_error; } } else if (c >= 0x80) { - c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) { invalid_utf8: js_parse_error(s, "invalid UTF-8 sequence"); @@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s) /* flags */ for(;;) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p, &p_next); /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */ if (!lre_js_is_ident_next(c)) break; @@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp, c = lre_parse_escape(&p_next, TRUE); *pident_has_escape = TRUE; } else if (c >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p, &p_next); /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */ } if (!lre_js_is_ident_next(c)) @@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s) s->got_lf = TRUE; /* considered as LF for ASI */ p++; } else if (*p >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + c = utf8_decode(p, &p); /* ignore invalid UTF-8 in comments */ if (c == CP_LS || c == CP_PS) { s->got_lf = TRUE; /* considered as LF for ASI */ @@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s) if (*p == '\r' || *p == '\n') break; if (*p >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + c = utf8_decode(p, &p); /* ignore invalid UTF-8 in comments */ /* LS or PS are considered as line terminator */ if (c == CP_LS || c == CP_PS) { @@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s) if (c == '\\' && *p_next == 'u') { c = lre_parse_escape(&p_next, TRUE); } else if (c >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p, &p_next); if (p_next == p + 1) goto invalid_utf8; } @@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s) goto fail; /* reject `10instanceof Number` */ if (JS_VALUE_IS_NAN(ret) || - lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) { + lre_js_is_ident_next(utf8_decode(p, &p_next))) { JS_FreeValue(s->ctx, ret); js_parse_error(s, "invalid number literal"); goto fail; @@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s) break; default: if (c >= 0x80) { /* non-ASCII code-point */ - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p, &p_next); if (p_next == p + 1) goto invalid_utf8; p = p_next; @@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp) } } else if (c >= 0x80) { - c = utf8_decode(p - 1, s->buf_end - p, &p_next); + c = utf8_decode(p - 1, &p_next); if (p_next == p) { json_parse_error(s, p - 1, "Bad UTF-8 sequence"); goto fail; @@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s) break; default: if (c >= 0x80) { - c = utf8_decode(p, s->buf_end - p, &p_next); + c = utf8_decode(p, &p_next); if (p_next == p + 1) { js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p); } else { @@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end) if (*p == '\n' || *p == '\r') { break; } else if (*p >= 0x80) { - c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + c = utf8_decode(p, &p); /* purposely ignore UTF-8 encoding errors in this comment line */ if (c == CP_LS || c == CP_PS) break;