Simpler utf8_decode (#414)

- no longer pass the array length to `utf8_decode`
- add `utf8_decode_len` for border cases
- use switch-based dispatch in `utf8_decode_len` to work around a gcc 12.2 optimizer bug
This commit is contained in:
Charlie Gordon 2024-05-27 08:15:52 +02:00 committed by GitHub
parent 9e67b47c0d
commit 921c1eef50
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 53 additions and 37 deletions

View file

@ -276,7 +276,6 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
/* Decode a single code point from a UTF-8 encoded array of bytes
`p` is a valid pointer to an array of bytes
`max_len` is the number of bytes available in the array
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
to the byte following the current sequence.
Return the code point at `p`, in the range `0..0x10FFFF`
@ -284,9 +283,12 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
The maximum length for a UTF-8 byte sequence is 4 bytes.
This implements the algorithm specified in whatwg.org, except it accepts
UTF-8 encoded surrogates as JavaScript allows them in strings.
The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
or be null terminated.
If `p[0]` is '\0', the return value is `0` and the byte is consumed.
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
*/
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
{
uint32_t c;
uint8_t lower, upper;
@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
if (max_len < 2) {
// need more bytes
break;
}
if (*p >= 0x80 && *p <= 0xBF) {
*pp = p + 1;
return ((c - 0xC0) << 6) + (*p - 0x80);
@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
case 0xEC: case 0xED: case 0xEE: case 0xEF:
lower = 0x80;
need2:
if (max_len < 3) {
// need more bytes
break;
}
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
*pp = p + 2;
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
lower = 0x80;
upper = 0xBF;
need3:
if (max_len < 4) {
// need more bytes
break;
}
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
&& p[2] >= 0x80 && p[2] <= 0xBF) {
*pp = p + 3;
@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
return 0xFFFD;
}
/* Length-checked front end for `utf8_decode`, for callers that cannot
   guarantee the bytes following `p` are readable.
   `p` is a pointer to an array of bytes; it is only dereferenced when
   `max_len` is 1, 2 or 3 (larger values are handed to `utf8_decode`)
   `max_len` is the number of bytes available at `p`
   `pp` is a valid pointer to a `const uint8_t *` that receives the
   position following the consumed bytes.
   If `max_len` is 0, nothing is consumed (`*pp` is set to `p`) and
   0xFFFD is returned.
   If `max_len` is 1, 2 or 3 and the first byte is not below 0x80, 0xE0
   or 0xF0 respectively, a single byte is consumed and 0xFFFD is returned.
   In all other cases the result is that of `utf8_decode(p, pp)`.
   NOTE: the dispatch is deliberately written as a `switch` on `max_len`;
   according to the commit that introduced this function, this works
   around a gcc 12.2 optimizer bug, so the shape should be preserved.
 */
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {
switch (max_len) {
case 0:
/* empty input: report an error without consuming anything */
*pp = p;
return 0xFFFD;
case 1:
/* one byte available: only a single byte (ASCII) code point fits */
if (*p < 0x80)
goto good;
break;
case 2:
/* two bytes available: first bytes below 0xE0 are let through;
   presumably these need at most one trailing byte in `utf8_decode` */
if (*p < 0xE0)
goto good;
break;
case 3:
/* three bytes available: first bytes below 0xF0 are let through;
   presumably these need at most two trailing bytes in `utf8_decode` */
if (*p < 0xF0)
goto good;
break;
default:
/* 4 or more bytes available: no length restriction is applied here */
good:
return utf8_decode(p, pp);
}
/* first byte rejected for this `max_len`: consume one byte, report error */
*pp = p + 1;
return 0xFFFD;
}
/* Scan a UTF-8 encoded buffer for content type
`buf` is a valid pointer to a UTF-8 encoded string
`buf_len` is the number of bytes to scan
@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
len++;
if (*p++ >= 0x80) {
/* parse UTF-8 sequence, check for encoding error */
uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
if (p_next == p)
kind |= UTF8_HAS_ERRORS;
p = p_next;
@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
uint32_t c = *p++;
if (c >= 0x80) {
/* parse utf-8 sequence */
c = utf8_decode(p - 1, p_end - (p - 1), &p);
c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
/* encoding errors are converted as 0xFFFD and use a single byte */
if (c > 0xFFFF) {
if (i < dest_len)

View file

@ -401,7 +401,8 @@ enum {
int utf8_scan(const char *buf, size_t len, size_t *plen);
size_t utf8_encode_len(uint32_t c);
size_t utf8_encode(uint8_t *buf, uint32_t c);
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);

View file

@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
normal_char:
p++;
if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
return re_parse_error(s, "invalid UTF-8 sequence");
p = p_next;
@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
} else if (c == '>') {
break;
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
return -1;
p = p_next;
if (is_hi_surrogate(c)) {
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
d = utf8_decode(p, &p_next);
if (is_lo_surrogate(d)) {
c = from_surrogate(c, d);
p = p_next;

View file

@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
string_arg = JS_ToCString(ctx, argv[i++]);
if (!string_arg)
goto fail;
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
JS_FreeCString(ctx, string_arg);
} else {
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))

View file

@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
break;
} else {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
if (!lre_is_space(c))
break;
@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
s->eol = &p[-1];
s->mark = p;
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
js_parse_error(s, "invalid UTF-8 sequence");
goto fail;
@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
}
goto fail;
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1) {
goto invalid_utf8;
}
@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
break;
}
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
goto invalid_utf8;
p = p_next;
@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
else if (c == '\0' && p >= s->buf_end)
goto eof_error;
else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
goto invalid_utf8;
}
@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
goto eol_error;
}
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence");
@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)
/* flags */
for(;;) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
if (!lre_js_is_ident_next(c))
break;
@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
c = lre_parse_escape(&p_next, TRUE);
*pident_has_escape = TRUE;
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
}
if (!lre_js_is_ident_next(c))
@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
s->got_lf = TRUE; /* considered as LF for ASI */
p++;
} else if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* ignore invalid UTF-8 in comments */
if (c == CP_LS || c == CP_PS) {
s->got_lf = TRUE; /* considered as LF for ASI */
@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
if (*p == '\r' || *p == '\n')
break;
if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* ignore invalid UTF-8 in comments */
/* LS or PS are considered as line terminator */
if (c == CP_LS || c == CP_PS) {
@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p_next, TRUE);
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
}
@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
goto fail;
/* reject `10instanceof Number` */
if (JS_VALUE_IS_NAN(ret) ||
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
lre_js_is_ident_next(utf8_decode(p, &p_next))) {
JS_FreeValue(s->ctx, ret);
js_parse_error(s, "invalid number literal");
goto fail;
@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
break;
default:
if (c >= 0x80) { /* non-ASCII code-point */
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
p = p_next;
@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
}
} else
if (c >= 0x80) {
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
goto fail;
@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
break;
default:
if (c >= 0x80) {
c = utf8_decode(p, s->buf_end - p, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1) {
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else {
@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
if (*p == '\n' || *p == '\r') {
break;
} else if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* purposely ignore UTF-8 encoding errors in this comment line */
if (c == CP_LS || c == CP_PS)
break;