Simpler utf8_decode (#414)
- no longer pass the array length to `utf8_decode`
- add `utf8_decode_len` for border cases
- use switch-based dispatch in `utf8_decode_len` to work around a gcc 12.2 optimizer bug
This commit is contained in:
parent
9e67b47c0d
commit
921c1eef50
5 changed files with 53 additions and 37 deletions
47
cutils.c
47
cutils.c
|
@ -276,7 +276,6 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
|
|||
|
||||
/* Decode a single code point from a UTF-8 encoded array of bytes
|
||||
`p` is a valid pointer to an array of bytes
|
||||
`max_len` is the number of bytes available in the array
|
||||
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
|
||||
to the byte following the current sequence.
|
||||
Return the code point at `p`, in the range `0..0x10FFFF`
|
||||
|
@ -284,9 +283,12 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
|
|||
The maximum length for a UTF-8 byte sequence is 4 bytes.
|
||||
This implements the algorithm specified in whatwg.org, except it accepts
|
||||
UTF-8 encoded surrogates as JavaScript allows them in strings.
|
||||
The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
|
||||
or be null terminated.
|
||||
If `p[0]` is '\0', the return value is `0` and the byte is consumed.
|
||||
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
|
||||
*/
|
||||
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
||||
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
|
||||
{
|
||||
uint32_t c;
|
||||
uint8_t lower, upper;
|
||||
|
@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
|||
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
|
||||
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
||||
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
||||
if (max_len < 2) {
|
||||
// need more bytes
|
||||
break;
|
||||
}
|
||||
if (*p >= 0x80 && *p <= 0xBF) {
|
||||
*pp = p + 1;
|
||||
return ((c - 0xC0) << 6) + (*p - 0x80);
|
||||
|
@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
|||
case 0xEC: case 0xED: case 0xEE: case 0xEF:
|
||||
lower = 0x80;
|
||||
need2:
|
||||
if (max_len < 3) {
|
||||
// need more bytes
|
||||
break;
|
||||
}
|
||||
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
|
||||
*pp = p + 2;
|
||||
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
|
||||
|
@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
|||
lower = 0x80;
|
||||
upper = 0xBF;
|
||||
need3:
|
||||
if (max_len < 4) {
|
||||
// need more bytes
|
||||
break;
|
||||
}
|
||||
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
|
||||
&& p[2] >= 0x80 && p[2] <= 0xBF) {
|
||||
*pp = p + 3;
|
||||
|
@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
|||
return 0xFFFD;
|
||||
}
|
||||
|
||||
/* Decode a single code point from a UTF-8 encoded array of bytes when
   only `max_len` bytes are known to be available.
   `p` is a valid pointer to an array of bytes
   `max_len` is the number of bytes available in the array
   `pp` is a valid pointer to a `const uint8_t *` to store a pointer
   to the byte following the consumed sequence.
   The lead byte `*p` is compared against the number of available bytes:
   if the sequence it announces may extend past `max_len`, `utf8_decode`
   is not called, `0xFFFD` is returned and a single byte is consumed.
   Otherwise the call is forwarded to `utf8_decode` and its result is
   returned unchanged.
   If `max_len` is 0, `0xFFFD` is returned and no byte is consumed
   (`*pp` is set to `p`).
   NOTE: the dispatch is deliberately written as a `switch` with a shared
   `good` label: per the commit that introduced this function, this form
   works around a gcc 12.2 optimizer bug. Do not restructure it casually.
 */
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {
|
||||
switch (max_len) {
|
||||
case 0:
|
||||
/* empty input: report an error without advancing */
|
||||
*pp = p;
|
||||
|
||||
return 0xFFFD;
|
||||
case 1:
|
||||
/* one byte available: only an ASCII byte is a complete sequence */
|
||||
if (*p < 0x80)
|
||||
|
||||
goto good;
|
||||
break;
|
||||
case 2:
|
||||
/* two bytes available: lead bytes below 0xE0 announce at most a
   2-byte sequence (presumably utf8_decode reads no further for
   them -- its full body is not visible here) */
|
||||
if (*p < 0xE0)
|
||||
|
||||
goto good;
|
||||
break;
|
||||
case 3:
|
||||
/* three bytes available: lead bytes below 0xF0 announce at most a
   3-byte sequence */
|
||||
if (*p < 0xF0)
|
||||
|
||||
goto good;
|
||||
break;
|
||||
default:
|
||||
/* four or more bytes available: every lead byte can be handed over */
|
||||
good:
|
||||
|
||||
return utf8_decode(p, pp);
|
||||
}
|
||||
/* the sequence announced by the lead byte does not fit in `max_len`
   bytes: consume the lead byte only and report an encoding error */
*pp = p + 1;
|
||||
return 0xFFFD;
|
||||
}
|
||||
|
||||
/* Scan a UTF-8 encoded buffer for content type
|
||||
`buf` is a valid pointer to a UTF-8 encoded string
|
||||
`len` is the number of bytes to scan
|
||||
|
@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
|
|||
len++;
|
||||
if (*p++ >= 0x80) {
|
||||
/* parse UTF-8 sequence, check for encoding error */
|
||||
uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
|
||||
uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
|
||||
if (p_next == p)
|
||||
kind |= UTF8_HAS_ERRORS;
|
||||
p = p_next;
|
||||
|
@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
|
|||
uint32_t c = *p++;
|
||||
if (c >= 0x80) {
|
||||
/* parse utf-8 sequence */
|
||||
c = utf8_decode(p - 1, p_end - (p - 1), &p);
|
||||
c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
|
||||
/* encoding errors are converted as 0xFFFD and use a single byte */
|
||||
if (c > 0xFFFF) {
|
||||
if (i < dest_len)
|
||||
|
|
3
cutils.h
3
cutils.h
|
@ -401,7 +401,8 @@ enum {
|
|||
int utf8_scan(const char *buf, size_t len, size_t *plen);
|
||||
size_t utf8_encode_len(uint32_t c);
|
||||
size_t utf8_encode(uint8_t *buf, uint32_t c);
|
||||
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
|
||||
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
|
||||
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
|
||||
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
|
||||
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
|
||||
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
|
||||
|
|
|
@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
|||
normal_char:
|
||||
p++;
|
||||
if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p)
|
||||
return re_parse_error(s, "invalid UTF-8 sequence");
|
||||
p = p_next;
|
||||
|
@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
|
|||
} else if (c == '>') {
|
||||
break;
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p)
|
||||
return -1;
|
||||
p = p_next;
|
||||
if (is_hi_surrogate(c)) {
|
||||
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
d = utf8_decode(p, &p_next);
|
||||
if (is_lo_surrogate(d)) {
|
||||
c = from_surrogate(c, d);
|
||||
p = p_next;
|
||||
|
|
|
@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
|
|||
string_arg = JS_ToCString(ctx, argv[i++]);
|
||||
if (!string_arg)
|
||||
goto fail;
|
||||
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
|
||||
int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
|
||||
JS_FreeCString(ctx, string_arg);
|
||||
} else {
|
||||
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
|
||||
|
|
32
quickjs.c
32
quickjs.c
|
@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
|
|||
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
|
||||
break;
|
||||
} else {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
|
||||
if (!lre_is_space(c))
|
||||
break;
|
||||
|
@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
|
|||
s->eol = &p[-1];
|
||||
s->mark = p;
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p) {
|
||||
js_parse_error(s, "invalid UTF-8 sequence");
|
||||
goto fail;
|
||||
|
@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
|
|||
}
|
||||
goto fail;
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
if (p_next == p + 1) {
|
||||
goto invalid_utf8;
|
||||
}
|
||||
|
@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
|
|||
break;
|
||||
}
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p)
|
||||
goto invalid_utf8;
|
||||
p = p_next;
|
||||
|
@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
|
|||
else if (c == '\0' && p >= s->buf_end)
|
||||
goto eof_error;
|
||||
else if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p) {
|
||||
goto invalid_utf8;
|
||||
}
|
||||
|
@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
|
|||
goto eol_error;
|
||||
}
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p) {
|
||||
invalid_utf8:
|
||||
js_parse_error(s, "invalid UTF-8 sequence");
|
||||
|
@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)
|
|||
|
||||
/* flags */
|
||||
for(;;) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
|
||||
if (!lre_js_is_ident_next(c))
|
||||
break;
|
||||
|
@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
|
|||
c = lre_parse_escape(&p_next, TRUE);
|
||||
*pident_has_escape = TRUE;
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
|
||||
}
|
||||
if (!lre_js_is_ident_next(c))
|
||||
|
@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
|
|||
s->got_lf = TRUE; /* considered as LF for ASI */
|
||||
p++;
|
||||
} else if (*p >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||
c = utf8_decode(p, &p);
|
||||
/* ignore invalid UTF-8 in comments */
|
||||
if (c == CP_LS || c == CP_PS) {
|
||||
s->got_lf = TRUE; /* considered as LF for ASI */
|
||||
|
@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
|
|||
if (*p == '\r' || *p == '\n')
|
||||
break;
|
||||
if (*p >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||
c = utf8_decode(p, &p);
|
||||
/* ignore invalid UTF-8 in comments */
|
||||
/* LS or PS are considered as line terminator */
|
||||
if (c == CP_LS || c == CP_PS) {
|
||||
|
@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
|
|||
if (c == '\\' && *p_next == 'u') {
|
||||
c = lre_parse_escape(&p_next, TRUE);
|
||||
} else if (c >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
if (p_next == p + 1)
|
||||
goto invalid_utf8;
|
||||
}
|
||||
|
@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
|
|||
goto fail;
|
||||
/* reject `10instanceof Number` */
|
||||
if (JS_VALUE_IS_NAN(ret) ||
|
||||
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
|
||||
lre_js_is_ident_next(utf8_decode(p, &p_next))) {
|
||||
JS_FreeValue(s->ctx, ret);
|
||||
js_parse_error(s, "invalid number literal");
|
||||
goto fail;
|
||||
|
@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
|
|||
break;
|
||||
default:
|
||||
if (c >= 0x80) { /* non-ASCII code-point */
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
if (p_next == p + 1)
|
||||
goto invalid_utf8;
|
||||
p = p_next;
|
||||
|
@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
|
|||
}
|
||||
} else
|
||||
if (c >= 0x80) {
|
||||
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
|
||||
c = utf8_decode(p - 1, &p_next);
|
||||
if (p_next == p) {
|
||||
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
|
||||
goto fail;
|
||||
|
@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
|
|||
break;
|
||||
default:
|
||||
if (c >= 0x80) {
|
||||
c = utf8_decode(p, s->buf_end - p, &p_next);
|
||||
c = utf8_decode(p, &p_next);
|
||||
if (p_next == p + 1) {
|
||||
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
|
||||
} else {
|
||||
|
@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
|
|||
if (*p == '\n' || *p == '\r') {
|
||||
break;
|
||||
} else if (*p >= 0x80) {
|
||||
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||
c = utf8_decode(p, &p);
|
||||
/* purposely ignore UTF-8 encoding errors in this comment line */
|
||||
if (c == CP_LS || c == CP_PS)
|
||||
break;
|
||||
|
|
Loading…
Reference in a new issue