From 1baa6763f894a59c03d9f5c9e969ea83cf519d81 Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Tue, 21 May 2024 14:08:33 +0200 Subject: [PATCH] Improve UTF-8 decoding and encoding functions (#410) Ensure proper UTF-8 encoding (1 to 4 bytes). Handle invalid encodings (return 0xFFFD and consume a single byte). Individually encoded surrogate code points are accepted. - add `utf8_scan()` to analyze a byte array for UTF-8 contents: it detects invalid encodings and computes the number of code units (a non-BMP1 codepoint counts as two) and the content kind: plain ASCII, 8-bit, 16-bit or larger codepoints. - add `utf8_encode_len(c)` to compute the number of bytes to encode `c` - rename `unicode_to_utf8` as `utf8_encode` - rename `unicode_from_utf8` as `utf8_decode` - add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded byte array known to contain only ASCII and 8-bit codepoints. - add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs for non-BMP1 codepoints. 
- add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit codepoints as a UTF-8 encoded null terminated string - add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string - detect invalid UTF-8 encoding in RegExp parser - simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions - simplify UTF-8 decoding and error testing --- cutils.c | 414 +++++++++++++++++++++++++++++++++++++++---------- cutils.h | 21 ++- libregexp.c | 46 +++--- quickjs-libc.c | 7 +- quickjs.c | 271 ++++++++++++++------------------ 5 files changed, 490 insertions(+), 269 deletions(-) diff --git a/cutils.c b/cutils.c index 4973455..88cd72d 100644 --- a/cutils.c +++ b/cutils.c @@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s) memset(s, 0, sizeof(*s)); } -/*--- Unicode / UTF-8 utility functions --*/ +/*--- UTF-8 utility functions --*/ -/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes - are output. */ -int unicode_to_utf8(uint8_t *buf, unsigned int c) +/* Note: only encode valid codepoints (0x0000..0x10FFFF). + At most UTF8_CHAR_LEN_MAX bytes are output. */ + +/* Compute the number of bytes of the UTF-8 encoding for a codepoint + `c` is a code-point. + Returns the number of bytes. If a codepoint is beyond 0x10FFFF the + return value is 3 as the codepoint would be encoded as 0xFFFD. 
+ */ +size_t utf8_encode_len(uint32_t c) { - uint8_t *q = buf; - - if (c < 0x80) { - *q++ = c; - } else { - if (c < 0x800) { - *q++ = (c >> 6) | 0xc0; - } else { - if (c < 0x10000) { - *q++ = (c >> 12) | 0xe0; - } else { - if (c < 0x00200000) { - *q++ = (c >> 18) | 0xf0; - } else { - if (c < 0x04000000) { - *q++ = (c >> 24) | 0xf8; - } else if (c < 0x80000000) { - *q++ = (c >> 30) | 0xfc; - *q++ = ((c >> 24) & 0x3f) | 0x80; - } else { - return 0; - } - *q++ = ((c >> 18) & 0x3f) | 0x80; - } - *q++ = ((c >> 12) & 0x3f) | 0x80; - } - *q++ = ((c >> 6) & 0x3f) | 0x80; - } - *q++ = (c & 0x3f) | 0x80; - } - return q - buf; + if (c < 0x80) + return 1; + if (c < 0x800) + return 2; + if (c < 0x10000) + return 3; + if (c < 0x110000) + return 4; + return 3; } -static const unsigned int utf8_min_code[5] = { - 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, -}; - -static const unsigned char utf8_first_code_mask[5] = { - 0x1f, 0xf, 0x7, 0x3, 0x1, -}; - -/* return -1 if error. *pp is not updated in this case. max_len must - be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */ -int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp) +/* Encode a codepoint in UTF-8 + `buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes + `c` is a code-point. + Returns the number of bytes. If a codepoint is beyond 0x10FFFF the + return value is 3 and the codepoint is encoded as 0xFFFD. + No null byte is stored after the encoded bytes. 
+ Return value is in range 1..4 + */ +size_t utf8_encode(uint8_t *buf, uint32_t c) { - int l, c, b, i; + if (c < 0x80) { + buf[0] = c; + return 1; + } + if (c < 0x800) { + buf[0] = (c >> 6) | 0xC0; + buf[1] = (c & 0x3F) | 0x80; + return 2; + } + if (c < 0x10000) { + buf[0] = (c >> 12) | 0xE0; + buf[1] = ((c >> 6) & 0x3F) | 0x80; + buf[2] = (c & 0x3F) | 0x80; + return 3; + } + if (c < 0x110000) { + buf[0] = (c >> 18) | 0xF0; + buf[1] = ((c >> 12) & 0x3F) | 0x80; + buf[2] = ((c >> 6) & 0x3F) | 0x80; + buf[3] = (c & 0x3F) | 0x80; + return 4; + } + buf[0] = (0xFFFD >> 12) | 0xE0; + buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80; + buf[2] = (0xFFFD & 0x3F) | 0x80; + return 3; +} + +/* Decode a single code point from a UTF-8 encoded array of bytes + `p` is a valid pointer to an array of bytes + `max_len` is the number of bytes available in the array + `pp` is a valid pointer to a `const uint8_t *` to store a pointer + to the byte following the current sequence. + Return the code point at `p`, in the range `0..0x10FFFF` + Return 0xFFFD on error. Only a single byte is consumed in this case + The maximum length for a UTF-8 byte sequence is 4 bytes. + This implements the algorithm specified in whatwg.org, except it accepts + UTF-8 encoded surrogates as JavaScript allows them in strings. 
+ cf: https://encoding.spec.whatwg.org/#utf-8-encoder + */ +uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp) +{ + uint32_t c; + uint8_t lower, upper; c = *p++; if (c < 0x80) { @@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp) return c; } switch(c) { - case 0xc0: case 0xc1: case 0xc2: case 0xc3: - case 0xc4: case 0xc5: case 0xc6: case 0xc7: - case 0xc8: case 0xc9: case 0xca: case 0xcb: - case 0xcc: case 0xcd: case 0xce: case 0xcf: - case 0xd0: case 0xd1: case 0xd2: case 0xd3: - case 0xd4: case 0xd5: case 0xd6: case 0xd7: - case 0xd8: case 0xd9: case 0xda: case 0xdb: - case 0xdc: case 0xdd: case 0xde: case 0xdf: - l = 1; + case 0xC2: case 0xC3: + case 0xC4: case 0xC5: case 0xC6: case 0xC7: + case 0xC8: case 0xC9: case 0xCA: case 0xCB: + case 0xCC: case 0xCD: case 0xCE: case 0xCF: + case 0xD0: case 0xD1: case 0xD2: case 0xD3: + case 0xD4: case 0xD5: case 0xD6: case 0xD7: + case 0xD8: case 0xD9: case 0xDA: case 0xDB: + case 0xDC: case 0xDD: case 0xDE: case 0xDF: + if (max_len < 2) { + // need more bytes + break; + } + if (*p >= 0x80 && *p <= 0xBF) { + *pp = p + 1; + return ((c - 0xC0) << 6) + (*p - 0x80); + } + // otherwise encoding error break; - case 0xe0: case 0xe1: case 0xe2: case 0xe3: - case 0xe4: case 0xe5: case 0xe6: case 0xe7: - case 0xe8: case 0xe9: case 0xea: case 0xeb: - case 0xec: case 0xed: case 0xee: case 0xef: - l = 2; + case 0xE0: + lower = 0xA0; /* reject invalid encoding */ + goto need2; + case 0xE1: case 0xE2: case 0xE3: + case 0xE4: case 0xE5: case 0xE6: case 0xE7: + case 0xE8: case 0xE9: case 0xEA: case 0xEB: + case 0xEC: case 0xED: case 0xEE: case 0xEF: + lower = 0x80; + need2: + if (max_len < 3) { + // need more bytes + break; + } + if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) { + *pp = p + 2; + return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80); + } + // otherwise encoding error break; - case 0xf0: case 0xf1: case 0xf2: case 0xf3: - case 0xf4: 
case 0xf5: case 0xf6: case 0xf7: - l = 3; - break; - case 0xf8: case 0xf9: case 0xfa: case 0xfb: - l = 4; - break; - case 0xfc: case 0xfd: - l = 5; + case 0xF0: + lower = 0x90; /* reject invalid encoding */ + upper = 0xBF; + goto need3; + case 0xF4: + lower = 0x80; + upper = 0x8F; /* reject values above 0x10FFFF */ + goto need3; + case 0xF1: case 0xF2: case 0xF3: + lower = 0x80; + upper = 0xBF; + need3: + if (max_len < 4) { + // need more bytes + break; + } + if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF + && p[2] >= 0x80 && p[2] <= 0xBF) { + *pp = p + 3; + return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) + + ((p[1] - 0x80) << 6) + (p[2] - 0x80); + } + // otherwise encoding error break; default: - return -1; + // invalid lead byte + break; } - /* check that we have enough characters */ - if (l > (max_len - 1)) - return -1; - c &= utf8_first_code_mask[l - 1]; - for(i = 0; i < l; i++) { - b = *p++; - if (b < 0x80 || b >= 0xc0) - return -1; - c = (c << 6) | (b & 0x3f); - } - if (c < utf8_min_code[l - 1]) - return -1; *pp = p; - return c; + return 0xFFFD; +} + +/* Scan a UTF-8 encoded buffer for content type + `buf` is a valid pointer to a UTF-8 encoded string + `len` is the number of bytes to scan + `plen` points to a `size_t` variable to receive the number of units + Return value is a mask of bits. 
+ - `UTF8_PLAIN_ASCII`: return value for 7-bit ASCII plain text + - `UTF8_NON_ASCII`: bit for non ASCII code points (8-bit or more) + - `UTF8_HAS_16BIT`: bit for 16-bit code points + - `UTF8_HAS_NON_BMP1`: bit for non-BMP1 code points, needs UTF-16 surrogate pairs + - `UTF8_HAS_ERRORS`: bit for encoding errors + */ +int utf8_scan(const char *buf, size_t buf_len, size_t *plen) +{ + const uint8_t *p, *p_end, *p_next; + size_t i, len; + int kind; + uint8_t cbits; + + kind = UTF8_PLAIN_ASCII; + cbits = 0; + len = buf_len; + // TODO: handle more than 1 byte at a time + for (i = 0; i < buf_len; i++) + cbits |= buf[i]; + if (cbits >= 0x80) { + p = (const uint8_t *)buf; + p_end = p + buf_len; + kind = UTF8_NON_ASCII; + len = 0; + while (p < p_end) { + len++; + if (*p++ >= 0x80) { + /* parse UTF-8 sequence, check for encoding error */ + uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next); + if (p_next == p) + kind |= UTF8_HAS_ERRORS; + p = p_next; + if (c > 0xFF) { + kind |= UTF8_HAS_16BIT; + if (c > 0xFFFF) { + len++; + kind |= UTF8_HAS_NON_BMP1; + } + } + } + } + } + *plen = len; + return kind; +} + +/* Decode a string encoded in UTF-8 into an array of bytes + `src` points to the source string. It is assumed to be correctly encoded + and only contains code points below 0x800 + `src_len` is the length of the source string + `dest` points to the destination array, it can be null if `dest_len` is `0` + `dest_len` is the length of the destination array. A null + terminator is stored at the end of the array unless `dest_len` is `0`. 
+ */ +size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len) +{ + const uint8_t *p, *p_end; + size_t i; + + p = (const uint8_t *)src; + p_end = p + src_len; + for (i = 0; p < p_end; i++) { + uint32_t c = *p++; + if (c >= 0xC0) + c = (c << 6) + *p++ - ((0xC0 << 6) + 0x80); + if (i < dest_len) + dest[i] = c; + } + if (i < dest_len) + dest[i] = '\0'; + else if (dest_len > 0) + dest[dest_len - 1] = '\0'; + return i; +} + +/* Decode a string encoded in UTF-8 into an array of 16-bit words + `src` points to the source string. It is assumed to be correctly encoded. + `src_len` is the length of the source string + `dest` points to the destination array, it can be null if `dest_len` is `0` + `dest_len` is the length of the destination array. No null terminator is + stored at the end of the array. + */ +size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len) +{ + const uint8_t *p, *p_end; + size_t i; + + p = (const uint8_t *)src; + p_end = p + src_len; + for (i = 0; p < p_end; i++) { + uint32_t c = *p++; + if (c >= 0x80) { + /* parse utf-8 sequence */ + c = utf8_decode(p - 1, p_end - (p - 1), &p); + /* encoding errors are converted as 0xFFFD and use a single byte */ + if (c > 0xFFFF) { + if (i < dest_len) + dest[i] = get_hi_surrogate(c); + i++; + c = get_lo_surrogate(c); + } + } + if (i < dest_len) + dest[i] = c; + } + return i; +} + +/* Encode a buffer of 8-bit bytes as a UTF-8 encoded string + `src` points to the source buffer. + `src_len` is the length of the source buffer + `dest` points to the destination array, it can be null if `dest_len` is `0` + `dest_len` is the length in bytes of the destination array. A null + terminator is stored at the end of the array unless `dest_len` is `0`. 
+ */ +size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len) +{ + size_t i, j; + uint32_t c; + + for (i = j = 0; i < src_len; i++) { + c = src[i]; + if (c < 0x80) { + if (j + 1 >= dest_len) + goto overflow; + dest[j++] = c; + } else { + if (j + 2 >= dest_len) + goto overflow; + dest[j++] = (c >> 6) | 0xC0; + dest[j++] = (c & 0x3F) | 0x80; + } + } + if (j < dest_len) + dest[j] = '\0'; + return j; + +overflow: + if (j < dest_len) + dest[j] = '\0'; + while (i < src_len) + j += 1 + (src[i++] >= 0x80); + return j; +} + +/* Encode a buffer of 16-bit code points as a UTF-8 encoded string + `src` points to the source buffer. + `src_len` is the length of the source buffer + `dest` points to the destination array, it can be null if `dest_len` is `0` + `dest_len` is the length in bytes of the destination array. A null + terminator is stored at the end of the array unless `dest_len` is `0`. + */ +size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len) +{ + size_t i, j; + uint32_t c; + + for (i = j = 0; i < src_len;) { + c = src[i++]; + if (c < 0x80) { + if (j + 1 >= dest_len) + goto overflow; + dest[j++] = c; + } else { + if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i])) + c = from_surrogate(c, src[i++]); + if (j + utf8_encode_len(c) >= dest_len) + goto overflow; + j += utf8_encode((uint8_t *)dest + j, c); + } + } + if (j < dest_len) + dest[j] = '\0'; + return j; + +overflow: + i -= 1 + (c > 0xFFFF); + if (j < dest_len) + dest[j] = '\0'; + while (i < src_len) { + c = src[i++]; + if (c < 0x80) { + j++; + } else { + if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i])) + c = from_surrogate(c, src[i++]); + j += utf8_encode_len(c); + } + } + return j; } /*--- integer to string conversions --*/ diff --git a/cutils.h b/cutils.h index 79a25de..853277e 100644 --- a/cutils.h +++ b/cutils.h @@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s) s->error = TRUE; } -#define 
UTF8_CHAR_LEN_MAX 6 +/*---- UTF-8 and UTF-16 handling ----*/ -int unicode_to_utf8(uint8_t *buf, unsigned int c); -int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +#define UTF8_CHAR_LEN_MAX 4 + +enum { + UTF8_PLAIN_ASCII = 0, // 7-bit ASCII plain text + UTF8_NON_ASCII = 1, // has non ASCII code points (8-bit or more) + UTF8_HAS_16BIT = 2, // has 16-bit code points + UTF8_HAS_NON_BMP1 = 4, // has non-BMP1 code points, needs UTF-16 surrogate pairs + UTF8_HAS_ERRORS = 8, // has encoding errors +}; +int utf8_scan(const char *buf, size_t len, size_t *plen); +size_t utf8_encode_len(uint32_t c); +size_t utf8_encode(uint8_t *buf, uint32_t c); +uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp); +size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len); +size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len); +size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len); +size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len); static inline BOOL is_surrogate(uint32_t c) { diff --git a/libregexp.c b/libregexp.c index 11b574e..4b6aa48 100644 --- a/libregexp.c +++ b/libregexp.c @@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr, static int get_class_atom(REParseState *s, CharRange *cr, const uint8_t **pp, BOOL inclass) { - const uint8_t *p; + const uint8_t *p, *p_next; uint32_t c; int ret; @@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr, /* fall thru */ default: normal_char: - /* normal char */ - if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if ((unsigned)c > 0xffff && !s->is_unicode) { - /* XXX: should handle non BMP-1 code points */ + p++; + if (c >= 0x80) { + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) + return re_parse_error(s, "invalid UTF-8 sequence"); + p = p_next; + if (c > 
0xFFFF && !s->is_unicode) { + // TODO(chqrlie): should handle non BMP-1 code points in + // the calling function and no require the source string + // to be CESU-8 encoded if not s->is_unicode return re_parse_error(s, "malformed unicode char"); } - } else { - p++; } break; } @@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) /* '*pp' is the first char after '<' */ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) { - const uint8_t *p, *p1; + const uint8_t *p, *p_next; uint32_t c, d; char *q; p = *pp; q = buf; for(;;) { - c = *p; + c = *p++; if (c == '\\') { - p++; if (*p != 'u') return -1; c = lre_parse_escape(&p, 2); // accept surrogate pairs + if ((int)c < 0) + return -1; } else if (c == '>') { break; - } else if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + } else if (c >= 0x80) { + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) + return -1; + p = p_next; if (is_hi_surrogate(c)) { - d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); + d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); if (is_lo_surrogate(d)) { c = from_surrogate(c, d); - p = p1; + p = p_next; } } - } else { - p++; } - if (c > 0x10FFFF) - return -1; if (q == buf) { if (!lre_js_is_ident_first(c)) return -1; @@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) } if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) return -1; - if (c < 128) { + if (c < 0x80) { *q++ = c; } else { - q += unicode_to_utf8((uint8_t*)q, c); + q += utf8_encode((uint8_t*)q, c); } } if (q == buf) return -1; *q = '\0'; - p++; *pp = p; return 0; } diff --git a/quickjs-libc.c b/quickjs-libc.c index c5b1235..d8ed30c 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx, if (i >= argc) goto missing; if (JS_IsString(argv[i])) { + // TODO(chqrlie) need an API to wrap charCodeAt and codePointAt */ string_arg = 
JS_ToCString(ctx, argv[i++]); if (!string_arg) goto fail; - int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p); + int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p); JS_FreeCString(ctx, string_arg); } else { if (JS_ToInt32(ctx, &int32_arg, argv[i++])) goto fail; } - /* handle utf-8 encoding explicitly */ + // XXX: throw an exception? if ((unsigned)int32_arg > 0x10FFFF) int32_arg = 0xFFFD; /* ignore conversion flags, width and precision */ - len = unicode_to_utf8(cbuf, int32_arg); + len = utf8_encode(cbuf, int32_arg); dbuf_put(&dbuf, cbuf, len); break; diff --git a/quickjs.c b/quickjs.c index f076d49..54d1778 100644 --- a/quickjs.c +++ b/quickjs.c @@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size, snprintf(buf, buf_size, "", atom); } else { JSAtomStruct *p = rt->atom_array[atom]; + *buf = '\0'; if (atom_is_free(p)) { assert(!atom_is_free(p)); snprintf(buf, buf_size, "", atom); - } else { - int i, c; - char *q; - JSString *str; - - q = buf; - str = p; - if (str) { - if (!str->is_wide_char) { - /* special case ASCII strings */ - c = 0; - for(i = 0; i < str->len; i++) { - c |= str->u.str8[i]; - } - if (c < 0x80) - return (const char *)str->u.str8; - } + } else if (p != NULL) { + JSString *str = p; + if (str->is_wide_char) { + /* encode surrogates correctly */ + utf8_encode_buf16(buf, buf_size, str->u.str16, str->len); + } else { + /* special case ASCII strings */ + int i, c = 0; for(i = 0; i < str->len; i++) { - c = string_get(str, i); - if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX) - break; - if (c < 128) { - *q++ = c; - } else { - q += unicode_to_utf8((uint8_t *)q, c); - } + c |= str->u.str8[i]; } + if (c < 0x80) + return (const char *)str->u.str8; + utf8_encode_buf8(buf, buf_size, str->u.str8, str->len); } - *q = '\0'; } } return buf; @@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom) /* return a string atom containing name 
concatenated with str1 */ /* `str1` may be pure ASCII or UTF-8 encoded */ +// TODO(chqrlie): use string concatenation instead of UTF-8 conversion static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1) { JSValue str; @@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s) /* create a string from a UTF-8 buffer */ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len) { - const uint8_t *p, *p_end, *p_start, *p_next; - uint32_t c; - StringBuffer b_s, *b = &b_s; - size_t len1; + JSString *str; + size_t len; + int kind; if (buf_len <= 0) { return JS_AtomToString(ctx, JS_ATOM_empty_string); } - p_start = (const uint8_t *)buf; - p_end = p_start + buf_len; - p = p_start; - while (p < p_end && *p < 128) - p++; - len1 = p - p_start; - if (len1 > JS_STRING_LEN_MAX) + /* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */ + kind = utf8_scan(buf, buf_len, &len); + if (len > JS_STRING_LEN_MAX) return JS_ThrowRangeError(ctx, "invalid string length"); - if (p == p_end) { - /* ASCII string */ - return js_new_string8_len(ctx, buf, buf_len); - } else { - if (string_buffer_init(ctx, b, buf_len)) - goto fail; - string_buffer_write8(b, p_start, len1); - while (p < p_end) { - if (*p < 128) { - string_buffer_putc8(b, *p++); - } else { - /* parse utf-8 sequence, return 0xFFFFFFFF for error */ - c = unicode_from_utf8(p, p_end - p, &p_next); - if (c < 0x10000) { - p = p_next; - } else if (c <= 0x10FFFF) { - p = p_next; - /* surrogate pair */ - string_buffer_putc16(b, get_hi_surrogate(c)); - c = get_lo_surrogate(c); - } else { - /* invalid char */ - c = 0xfffd; - /* skip the invalid chars */ - /* XXX: seems incorrect. Why not just use c = *p++; ? 
*/ - while (p < p_end && (*p >= 0x80 && *p < 0xc0)) - p++; - if (p < p_end) { - p++; - while (p < p_end && (*p >= 0x80 && *p < 0xc0)) - p++; - } - } - string_buffer_putc16(b, c); - } - } - } - return string_buffer_end(b); - fail: - string_buffer_free(b); - return JS_EXCEPTION; + switch (kind) { + case UTF8_PLAIN_ASCII: + str = js_alloc_string(ctx, len, 0); + if (!str) + return JS_EXCEPTION; + memcpy(str->u.str8, buf, len); + str->u.str8[len] = '\0'; + break; + case UTF8_NON_ASCII: + /* buf contains non-ASCII code-points, but limited to 8-bit values */ + str = js_alloc_string(ctx, len, 0); + if (!str) + return JS_EXCEPTION; + utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len); + break; + default: + // This causes a potential problem in JS_ThrowError if message is invalid + //if (kind & UTF8_HAS_ERRORS) + // return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence"); + str = js_alloc_string(ctx, len, 1); + if (!str) + return JS_EXCEPTION; + utf8_decode_buf16(str->u.str16, len, buf, buf_len); + break; + } + return JS_MKPTR(JS_TAG_STRING, str); } static JSValue JS_ConcatString3(JSContext *ctx, const char *str1, @@ -4067,7 +4035,7 @@ go: /* c = 0xfffd; */ /* error */ } } - q += unicode_to_utf8(q, c); + q += utf8_encode(q, c); } } } @@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val) return JS_ToBoolFree(ctx, js_dup(val)); } +/* pc points to pure ASCII or UTF-8, null terminated contents */ static int skip_spaces(const char *pc) { const uint8_t *p, *p_next, *p_start; @@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc) p = p_start = (const uint8_t *)pc; for (;;) { - c = *p; - if (c < 128) { + c = *p++; + if (c < 0x80) { if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20))) break; - p++; } else { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next); + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + /* no need to test for invalid UTF-8, 0xFFFD is not a space */ if (!lre_is_space(c)) break; p = p_next; } } - return p - p_start; + return p - 1 - 
p_start; } static inline int to_digit(int c) @@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s) static __exception int js_parse_template_part(JSParseState *s, const uint8_t *p) { + const uint8_t *p_next; uint32_t c; StringBuffer b_s, *b = &b_s; @@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s, s->eol = &p[-1]; s->mark = p; } else if (c >= 0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) { + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) { js_parse_error(s, "invalid UTF-8 sequence"); goto fail; } @@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep, BOOL do_throw, const uint8_t *p, JSToken *token, const uint8_t **pp) { + const uint8_t *p_next; int ret; uint32_t c; StringBuffer b_s, *b = &b_s; @@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep, } goto fail; } else if (c >= 0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) { + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p + 1) { goto invalid_utf8; } p = p_next; @@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep, break; } } else if (c >= 0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) goto invalid_utf8; p = p_next; } @@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) { static __exception int js_parse_regexp(JSParseState *s) { - const uint8_t *p; + const uint8_t *p, *p_next; BOOL in_class; StringBuffer b_s, *b = &b_s; StringBuffer b2_s, *b2 = &b2_s; @@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s) else if (c == '\0' && p >= s->buf_end) goto eof_error; else if (c >= 
0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) { + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) { goto invalid_utf8; } p = p_next; @@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s) goto eol_error; } } else if (c >= 0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) { + c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p) { invalid_utf8: js_parse_error(s, "invalid UTF-8 sequence"); goto fail; @@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s) /* flags */ for(;;) { - const uint8_t *p_next = p; - c = *p_next++; - if (c >= 0x80) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next); - if (c > 0x10FFFF) { - goto invalid_utf8; - } - } + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */ if (!lre_js_is_ident_next(c)) break; if (string_buffer_putc(b2, c)) @@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize, static JSAtom parse_ident(JSParseState *s, const uint8_t **pp, BOOL *pident_has_escape, int c, BOOL is_private) { - const uint8_t *p, *p1; + const uint8_t *p, *p_next; char ident_buf[128], *buf; size_t ident_size, ident_pos; - JSAtom atom; + JSAtom atom = JS_ATOM_NULL; p = *pp; buf = ident_buf; @@ -19032,28 +18992,26 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp, if (is_private) buf[ident_pos++] = '#'; for(;;) { - p1 = p; - - if (c < 128) { + if (c < 0x80) { buf[ident_pos++] = c; } else { - ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c); + ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c); } - c = *p1++; - if (c == '\\' && *p1 == 'u') { - c = lre_parse_escape(&p1, TRUE); + c = *p; + p_next = p + 1; + if (c == '\\' && *p_next == 'u') { + c = lre_parse_escape(&p_next, TRUE); 
*pident_has_escape = TRUE; - } else if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); + } else if (c >= 0x80) { + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */ } if (!lre_js_is_ident_next(c)) break; - p = p1; + p = p_next; if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) { - if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) { - atom = JS_ATOM_NULL; + if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) goto done; - } } } /* buf is pure ASCII or UTF-8 encoded */ @@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp, static __exception int next_token(JSParseState *s) { - const uint8_t *p; + const uint8_t *p, *p_next; int c; BOOL ident_has_escape; JSAtom atom; @@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s) s->got_lf = TRUE; /* considered as LF for ASI */ p++; } else if (*p >= 0x80) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + /* ignore invalid UTF-8 in comments */ if (c == CP_LS || c == CP_PS) { s->got_lf = TRUE; /* considered as LF for ASI */ - } else if (c == -1) { - p++; /* skip invalid UTF-8 */ } } else { p++; @@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s) if (*p == '\r' || *p == '\n') break; if (*p >= 0x80) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + /* ignore invalid UTF-8 in comments */ /* LS or PS are considered as line terminator */ if (c == CP_LS || c == CP_PS) { break; - } else if (c == -1) { - p++; /* skip invalid UTF-8 */ } } else { p++; @@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s) case '#': /* private name */ { - const uint8_t *p1; p++; - p1 = p; - c = *p1++; - if (c == '\\' && *p1 == 'u') { - c = lre_parse_escape(&p1, TRUE); - } else if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); + c = *p; + 
p_next = p + 1; + if (c == '\\' && *p_next == 'u') { + c = lre_parse_escape(&p_next, TRUE); + } else if (c >= 0x80) { + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p + 1) + goto invalid_utf8; } if (!lre_js_is_ident_first(c)) { js_parse_error(s, "invalid first character of private name"); goto fail; } - p = p1; + p = p_next; ident_has_escape = FALSE; /* not used */ atom = parse_ident(s, &p, &ident_has_escape, c, TRUE); if (atom == JS_ATOM_NULL) @@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s) parse_number: { JSValue ret; - const uint8_t *p1; int flags; flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL | ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX; @@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s) goto fail; /* reject `10instanceof Number` */ if (JS_VALUE_IS_NAN(ret) || - lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) { + lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) { JS_FreeValue(s->ctx, ret); js_parse_error(s, "invalid number literal"); goto fail; @@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s) } break; default: - if (c >= 128) { - /* unicode value */ - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + if (c >= 0x80) { /* non-ASCII code-point */ + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next); + if (p_next == p + 1) + goto invalid_utf8; + p = p_next; switch(c) { case CP_PS: case CP_LS: @@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s) // dump_token(s, &s->token); return 0; + invalid_utf8: + js_parse_error(s, "invalid UTF-8 sequence"); fail: s->token.val = TOK_ERROR; return -1; @@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms static int json_parse_string(JSParseState *s, const uint8_t **pp) { - const uint8_t *p = *pp; + const uint8_t *p, *p_next; int i; uint32_t c; StringBuffer b_s, *b = &b_s; @@ -19581,6 +19541,7 @@ static int 
json_parse_string(JSParseState *s, const uint8_t **pp) if (string_buffer_init(s->ctx, b, 32)) goto fail; + p = *pp; for(;;) { if (p >= s->buf_end) { goto end_of_input; @@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp) } } else if (c >= 0x80) { - const uint8_t *p_next; - c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next); - if (c > 0x10FFFF) { + c = utf8_decode(p - 1, s->buf_end - p, &p_next); + if (p_next == p) { json_parse_error(s, p - 1, "Bad UTF-8 sequence"); goto fail; } @@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c) static __exception int json_next_token(JSParseState *s) { - const uint8_t *p; + const uint8_t *p, *p_next; int c; JSAtom atom; @@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s) goto fail; break; default: - if (c >= 128) { - const uint8_t *p_next; - c = unicode_from_utf8(p, s->buf_end - p, &p_next); - if (c == -1) { + if (c >= 0x80) { + c = utf8_decode(p, s->buf_end - p, &p_next); + if (p_next == p + 1) { js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p); } else { if (c > 0xFFFF) { @@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end) if (*p == '\n' || *p == '\r') { break; } else if (*p >= 0x80) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if (c == CP_LS || c == CP_PS) { + c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p); + /* purposely ignore UTF-8 encoding errors in this comment line */ + if (c == CP_LS || c == CP_PS) break; - } else if (c == -1) { - p++; /* skip invalid UTF-8 */ - } } else { p++; }