Improve UTF-8 decoding and encoding functions (#410)

Ensure proper UTF-8 encoding (1 to 4 bytes).
Handle invalid encodings (return 0xFFFD and consume a single byte)
Individually encoded surrogate code points are accepted.

- add `utf8_scan()` to analyze a byte array for UTF-8 contents
  detects invalid encoding, computes number of codepoints and content kind:
  plain ASCII, 8-bit, 16-bit or larger codepoints.
- add `utf8_encode_len(c)` to compute the number of bytes to encode `c`
- rename `unicode_to_utf8` as `utf8_encode`
- rename `unicode_from_utf8` as `utf8_decode`
- add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded
  byte array known to contain only ASCII and 8-bit codepoints.
- add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded
  byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs
  for non-BMP1 codepoints.
- add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit
  codepoints as a UTF-8 encoded null terminated string
- add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit
  codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string
- detect invalid UTF-8 encoding in RegExp parser
- simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions
- simplify UTF-8 decoding and error testing
This commit is contained in:
Charlie Gordon 2024-05-21 14:08:33 +02:00 committed by GitHub
parent f588210641
commit 1baa6763f8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 490 additions and 269 deletions

414
cutils.c
View file

@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s)
memset(s, 0, sizeof(*s)); memset(s, 0, sizeof(*s));
} }
/*--- Unicode / UTF-8 utility functions --*/ /*--- UTF-8 utility functions --*/
/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes /* Note: only encode valid codepoints (0x0000..0x10FFFF).
are output. */ At most UTF8_CHAR_LEN_MAX bytes are output. */
int unicode_to_utf8(uint8_t *buf, unsigned int c)
/* Compute the number of bytes of the UTF-8 encoding for a codepoint
`c` is a code-point.
Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
return value is 3 as the codepoint would be encoded as 0xFFFD.
*/
size_t utf8_encode_len(uint32_t c)
{ {
uint8_t *q = buf; if (c < 0x80)
return 1;
if (c < 0x80) { if (c < 0x800)
*q++ = c; return 2;
} else { if (c < 0x10000)
if (c < 0x800) { return 3;
*q++ = (c >> 6) | 0xc0; if (c < 0x110000)
} else { return 4;
if (c < 0x10000) { return 3;
*q++ = (c >> 12) | 0xe0;
} else {
if (c < 0x00200000) {
*q++ = (c >> 18) | 0xf0;
} else {
if (c < 0x04000000) {
*q++ = (c >> 24) | 0xf8;
} else if (c < 0x80000000) {
*q++ = (c >> 30) | 0xfc;
*q++ = ((c >> 24) & 0x3f) | 0x80;
} else {
return 0;
}
*q++ = ((c >> 18) & 0x3f) | 0x80;
}
*q++ = ((c >> 12) & 0x3f) | 0x80;
}
*q++ = ((c >> 6) & 0x3f) | 0x80;
}
*q++ = (c & 0x3f) | 0x80;
}
return q - buf;
} }
static const unsigned int utf8_min_code[5] = { /* Encode a codepoint in UTF-8
0x80, 0x800, 0x10000, 0x00200000, 0x04000000, `buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes
}; `c` is a code-point.
Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
static const unsigned char utf8_first_code_mask[5] = { return value is 3 and the codepoint is encoded as 0xFFFD.
0x1f, 0xf, 0x7, 0x3, 0x1, No null byte is stored after the encoded bytes.
}; Return value is in range 1..4
*/
/* return -1 if error. *pp is not updated in this case. max_len must size_t utf8_encode(uint8_t *buf, uint32_t c)
be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
{ {
int l, c, b, i; if (c < 0x80) {
buf[0] = c;
return 1;
}
if (c < 0x800) {
buf[0] = (c >> 6) | 0xC0;
buf[1] = (c & 0x3F) | 0x80;
return 2;
}
if (c < 0x10000) {
buf[0] = (c >> 12) | 0xE0;
buf[1] = ((c >> 6) & 0x3F) | 0x80;
buf[2] = (c & 0x3F) | 0x80;
return 3;
}
if (c < 0x110000) {
buf[0] = (c >> 18) | 0xF0;
buf[1] = ((c >> 12) & 0x3F) | 0x80;
buf[2] = ((c >> 6) & 0x3F) | 0x80;
buf[3] = (c & 0x3F) | 0x80;
return 4;
}
buf[0] = (0xFFFD >> 12) | 0xE0;
buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80;
buf[2] = (0xFFFD & 0x3F) | 0x80;
return 3;
}
/* Decode a single code point from a UTF-8 encoded array of bytes
`p` is a valid pointer to an array of bytes
`max_len` is the number of bytes available in the array
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
to the byte following the current sequence.
Return the code point at `p`, in the range `0..0x10FFFF`
Return 0xFFFD on error. Only a single byte is consumed in this case
The maximum length for a UTF-8 byte sequence is 4 bytes.
This implements the algorithm specified in whatwg.org, except it accepts
UTF-8 encoded surrogates as JavaScript allows them in strings.
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
*/
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
{
uint32_t c;
uint8_t lower, upper;
c = *p++; c = *p++;
if (c < 0x80) { if (c < 0x80) {
@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
return c; return c;
} }
switch(c) { switch(c) {
case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xC2: case 0xC3:
case 0xc4: case 0xc5: case 0xc6: case 0xc7: case 0xC4: case 0xC5: case 0xC6: case 0xC7:
case 0xc8: case 0xc9: case 0xca: case 0xcb: case 0xC8: case 0xC9: case 0xCA: case 0xCB:
case 0xcc: case 0xcd: case 0xce: case 0xcf: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
case 0xd0: case 0xd1: case 0xd2: case 0xd3: case 0xD0: case 0xD1: case 0xD2: case 0xD3:
case 0xd4: case 0xd5: case 0xd6: case 0xd7: case 0xD4: case 0xD5: case 0xD6: case 0xD7:
case 0xd8: case 0xd9: case 0xda: case 0xdb: case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xdc: case 0xdd: case 0xde: case 0xdf: case 0xDC: case 0xDD: case 0xDE: case 0xDF:
l = 1; if (max_len < 2) {
// need more bytes
break;
}
if (*p >= 0x80 && *p <= 0xBF) {
*pp = p + 1;
return ((c - 0xC0) << 6) + (*p - 0x80);
}
// otherwise encoding error
break; break;
case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xE0:
case 0xe4: case 0xe5: case 0xe6: case 0xe7: lower = 0xA0; /* reject invalid encoding */
case 0xe8: case 0xe9: case 0xea: case 0xeb: goto need2;
case 0xec: case 0xed: case 0xee: case 0xef: case 0xE1: case 0xE2: case 0xE3:
l = 2; case 0xE4: case 0xE5: case 0xE6: case 0xE7:
case 0xE8: case 0xE9: case 0xEA: case 0xEB:
case 0xEC: case 0xED: case 0xEE: case 0xEF:
lower = 0x80;
need2:
if (max_len < 3) {
// need more bytes
break;
}
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
*pp = p + 2;
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
}
// otherwise encoding error
break; break;
case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xF0:
case 0xf4: case 0xf5: case 0xf6: case 0xf7: lower = 0x90; /* reject invalid encoding */
l = 3; upper = 0xBF;
break; goto need3;
case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xF4:
l = 4; lower = 0x80;
break; upper = 0x8F; /* reject values above 0x10FFFF */
case 0xfc: case 0xfd: goto need3;
l = 5; case 0xF1: case 0xF2: case 0xF3:
lower = 0x80;
upper = 0xBF;
need3:
if (max_len < 4) {
// need more bytes
break;
}
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
&& p[2] >= 0x80 && p[2] <= 0xBF) {
*pp = p + 3;
return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) +
((p[1] - 0x80) << 6) + (p[2] - 0x80);
}
// otherwise encoding error
break; break;
default: default:
return -1; // invalid lead byte
break;
} }
/* check that we have enough characters */
if (l > (max_len - 1))
return -1;
c &= utf8_first_code_mask[l - 1];
for(i = 0; i < l; i++) {
b = *p++;
if (b < 0x80 || b >= 0xc0)
return -1;
c = (c << 6) | (b & 0x3f);
}
if (c < utf8_min_code[l - 1])
return -1;
*pp = p; *pp = p;
return c; return 0xFFFD;
}
/* Classify the contents of a UTF-8 encoded byte array.
   `buf` points to the bytes to examine.
   `buf_len` is the number of bytes to examine.
   `plen` receives the number of 8-bit or 16-bit units needed to hold the
   decoded contents: one per code point, two for each code point above
   0xFFFF.  For plain ASCII input this is simply `buf_len`.
   An invalid byte is reported by `utf8_decode` as 0xFFFD with a single
   byte consumed, so it counts as one 16-bit unit.
   The return value is a combination of:
   - `UTF8_PLAIN_ASCII`: every byte is below 0x80 (no other bit is set)
   - `UTF8_NON_ASCII`: at least one byte is 0x80 or above
   - `UTF8_HAS_16BIT`: at least one code point is above 0xFF
   - `UTF8_HAS_NON_BMP1`: at least one code point is above 0xFFFF
   - `UTF8_HAS_ERRORS`: at least one invalid sequence was found
 */
int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
{
    const uint8_t *cur, *end, *next;
    size_t units = buf_len;
    size_t k;
    uint8_t seen = 0;
    int flags = UTF8_PLAIN_ASCII;

    /* first pass: OR all bytes together to detect any byte >= 0x80 */
    for (k = 0; k < buf_len; k++)
        seen |= buf[k];

    if (seen < 0x80) {
        /* pure ASCII: one unit per byte */
        *plen = units;
        return flags;
    }

    /* second pass: decode every sequence to count units and classify */
    flags = UTF8_NON_ASCII;
    units = 0;
    cur = (const uint8_t *)buf;
    end = cur + buf_len;
    while (cur < end) {
        uint32_t c;

        units++;
        if (*cur < 0x80) {
            cur++;
            continue;
        }
        c = utf8_decode(cur, end - cur, &next);
        /* a valid multi-byte sequence advances by at least 2 bytes */
        if (next == cur + 1)
            flags |= UTF8_HAS_ERRORS;
        cur = next;
        if (c > 0xFFFF) {
            /* needs a surrogate pair: one extra unit */
            units++;
            flags |= UTF8_HAS_16BIT | UTF8_HAS_NON_BMP1;
        } else if (c > 0xFF) {
            flags |= UTF8_HAS_16BIT;
        }
    }
    *plen = units;
    return flags;
}
/* Decode a UTF-8 encoded byte array into an array of 8-bit code points.
   `src` points to the source bytes.  They are assumed to be correctly
   encoded: no validation is performed and every byte >= 0xC0 is taken as
   the lead byte of a 2-byte sequence whose second byte is read
   unconditionally.  Decoded values are stored into 8-bit cells, so the
   caller must ensure that no code point exceeds 0xFF.
   `src_len` is the number of source bytes.
   `dest` points to the destination array, it can be null if `dest_len`
   is `0`.
   `dest_len` is the number of elements in the destination array.  A null
   terminator is always stored unless `dest_len` is `0`; the output is
   truncated to `dest_len - 1` code points if it does not fit.
   Returns the number of code points in the source, regardless of how
   many were actually stored.
 */
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len)
{
    const uint8_t *cur = (const uint8_t *)src;
    const uint8_t *const end = cur + src_len;
    size_t count = 0;

    while (cur < end) {
        uint32_t c = *cur++;

        if (c >= 0xC0) {
            /* 2-byte sequence: 110xxxxx 10yyyyyy */
            uint32_t trail = *cur++;
            c = ((c - 0xC0) << 6) + (trail - 0x80);
        }
        if (count < dest_len)
            dest[count] = c;
        count++;
    }
    /* terminate after the last code point, or in the last cell if truncated */
    if (dest_len > 0)
        dest[count < dest_len ? count : dest_len - 1] = '\0';
    return count;
}
/* Decode a UTF-8 encoded byte array into an array of 16-bit units.
   `src` points to the source bytes.  Bytes below 0x80 are copied as is;
   every other sequence is decoded with `utf8_decode`, so an invalid
   sequence yields 0xFFFD and consumes a single byte.
   `src_len` is the number of source bytes.
   `dest` points to the destination array, it can be null if `dest_len`
   is `0`.
   `dest_len` is the number of elements in the destination array.  No null
   terminator is stored.  Units that do not fit are counted but not stored.
   Code points above 0xFFFF are stored as a high/low surrogate pair.
   Returns the number of 16-bit units the complete output requires.
 */
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len)
{
    const uint8_t *cur = (const uint8_t *)src;
    const uint8_t *const end = cur + src_len;
    size_t n = 0;

    while (cur < end) {
        uint32_t c = *cur;

        if (c < 0x80) {
            cur++;
        } else {
            /* multi-byte sequence: utf8_decode advances `cur` past it */
            c = utf8_decode(cur, end - cur, &cur);
            if (c > 0xFFFF) {
                /* emit the high surrogate now, the low one below */
                if (n < dest_len)
                    dest[n] = get_hi_surrogate(c);
                n++;
                c = get_lo_surrogate(c);
            }
        }
        if (n < dest_len)
            dest[n] = c;
        n++;
    }
    return n;
}
/* Encode an array of 8-bit code points as a UTF-8 string.
   `src` points to the source array; each element is a code point in the
   range 0..0xFF.
   `src_len` is the number of source elements.
   `dest` points to the destination array, it can be null if `dest_len`
   is `0`.
   `dest_len` is the size in bytes of the destination array.  A null
   terminator is always stored unless `dest_len` is `0`.  If the output
   does not fit, it is truncated after the last complete sequence that
   leaves room for the terminator.
   Returns the number of bytes of the complete encoding, not counting the
   null terminator, even if the output was truncated.
 */
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len)
{
    size_t in = 0, out = 0;

    while (in < src_len) {
        uint32_t c = src[in];
        size_t need = (c < 0x80) ? 1 : 2;

        /* keep one byte available for the null terminator */
        if (out + need >= dest_len)
            break;
        if (need == 1) {
            dest[out++] = c;
        } else {
            dest[out++] = (c >> 6) | 0xC0;
            dest[out++] = (c & 0x3F) | 0x80;
        }
        in++;
    }
    if (out < dest_len)
        dest[out] = '\0';
    /* on truncation, keep counting the bytes the rest would need */
    for (; in < src_len; in++)
        out += 1 + (src[in] >= 0x80);
    return out;
}
/* Encode a buffer of 16-bit code units as a UTF-8 encoded string
`src` points to the source buffer.
`src_len` is the length of the source buffer, in 16-bit units
`dest` points to the destination array, it can be null if `dest_len` is `0`
`dest_len` is the length in bytes of the destination array. A null
terminator is stored at the end of the array unless `dest_len` is `0`.
A high surrogate immediately followed by a low surrogate is combined into
a single code point before encoding; any other unit is encoded on its own.
If the output does not fit, it is truncated after the last complete
sequence that leaves room for the null terminator.
Returns the number of bytes of the complete encoding, not counting the
null terminator, even if the output was truncated.
*/
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len)
{
size_t i, j;
uint32_t c;
/* `i` indexes the source units, `j` counts the output bytes */
for (i = j = 0; i < src_len;) {
c = src[i++];
if (c < 0x80) {
/* `>=` keeps one byte available for the null terminator */
if (j + 1 >= dest_len)
goto overflow;
dest[j++] = c;
} else {
/* combine a valid surrogate pair into one code point */
if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
c = from_surrogate(c, src[i++]);
if (j + utf8_encode_len(c) >= dest_len)
goto overflow;
j += utf8_encode((uint8_t *)dest + j, c);
}
}
if (j < dest_len)
dest[j] = '\0';
return j;
overflow:
/* rewind to the start of the code point that did not fit:
   one unit, or two if a surrogate pair was combined */
i -= 1 + (c > 0xFFFF);
if (j < dest_len)
dest[j] = '\0';
/* nothing more is stored: only count the bytes the rest would need */
while (i < src_len) {
c = src[i++];
if (c < 0x80) {
j++;
} else {
if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
c = from_surrogate(c, src[i++]);
j += utf8_encode_len(c);
}
}
return j;
} }
/*--- integer to string conversions --*/ /*--- integer to string conversions --*/

View file

@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s)
s->error = TRUE; s->error = TRUE;
} }
#define UTF8_CHAR_LEN_MAX 6 /*---- UTF-8 and UTF-16 handling ----*/
int unicode_to_utf8(uint8_t *buf, unsigned int c); #define UTF8_CHAR_LEN_MAX 4
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
enum {
UTF8_PLAIN_ASCII = 0, // 7-bit ASCII plain text
UTF8_NON_ASCII = 1, // has non ASCII code points (8-bit or more)
UTF8_HAS_16BIT = 2, // has 16-bit code points
UTF8_HAS_NON_BMP1 = 4, // has non-BMP1 code points, needs UTF-16 surrogate pairs
UTF8_HAS_ERRORS = 8, // has encoding errors
};
int utf8_scan(const char *buf, size_t len, size_t *plen);
size_t utf8_encode_len(uint32_t c);
size_t utf8_encode(uint8_t *buf, uint32_t c);
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len);
static inline BOOL is_surrogate(uint32_t c) static inline BOOL is_surrogate(uint32_t c)
{ {

View file

@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
static int get_class_atom(REParseState *s, CharRange *cr, static int get_class_atom(REParseState *s, CharRange *cr,
const uint8_t **pp, BOOL inclass) const uint8_t **pp, BOOL inclass)
{ {
const uint8_t *p; const uint8_t *p, *p_next;
uint32_t c; uint32_t c;
int ret; int ret;
@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* fall thru */ /* fall thru */
default: default:
normal_char: normal_char:
/* normal char */ p++;
if (c >= 128) { if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if ((unsigned)c > 0xffff && !s->is_unicode) { if (p_next == p)
/* XXX: should handle non BMP-1 code points */ return re_parse_error(s, "invalid UTF-8 sequence");
p = p_next;
if (c > 0xFFFF && !s->is_unicode) {
// TODO(chqrlie): should handle non BMP-1 code points in
// the calling function and not require the source string
// to be CESU-8 encoded if not s->is_unicode
return re_parse_error(s, "malformed unicode char"); return re_parse_error(s, "malformed unicode char");
} }
} else {
p++;
} }
break; break;
} }
@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
/* '*pp' is the first char after '<' */ /* '*pp' is the first char after '<' */
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
{ {
const uint8_t *p, *p1; const uint8_t *p, *p_next;
uint32_t c, d; uint32_t c, d;
char *q; char *q;
p = *pp; p = *pp;
q = buf; q = buf;
for(;;) { for(;;) {
c = *p; c = *p++;
if (c == '\\') { if (c == '\\') {
p++;
if (*p != 'u') if (*p != 'u')
return -1; return -1;
c = lre_parse_escape(&p, 2); // accept surrogate pairs c = lre_parse_escape(&p, 2); // accept surrogate pairs
if ((int)c < 0)
return -1;
} else if (c == '>') { } else if (c == '>') {
break; break;
} else if (c >= 128) { } else if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p)
return -1;
p = p_next;
if (is_hi_surrogate(c)) { if (is_hi_surrogate(c)) {
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (is_lo_surrogate(d)) { if (is_lo_surrogate(d)) {
c = from_surrogate(c, d); c = from_surrogate(c, d);
p = p1; p = p_next;
} }
} }
} else {
p++;
} }
if (c > 0x10FFFF)
return -1;
if (q == buf) { if (q == buf) {
if (!lre_js_is_ident_first(c)) if (!lre_js_is_ident_first(c))
return -1; return -1;
@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
} }
if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
return -1; return -1;
if (c < 128) { if (c < 0x80) {
*q++ = c; *q++ = c;
} else { } else {
q += unicode_to_utf8((uint8_t*)q, c); q += utf8_encode((uint8_t*)q, c);
} }
} }
if (q == buf) if (q == buf)
return -1; return -1;
*q = '\0'; *q = '\0';
p++;
*pp = p; *pp = p;
return 0; return 0;
} }

View file

@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx,
if (i >= argc) if (i >= argc)
goto missing; goto missing;
if (JS_IsString(argv[i])) { if (JS_IsString(argv[i])) {
// TODO(chqrlie) need an API to wrap charCodeAt and codePointAt
string_arg = JS_ToCString(ctx, argv[i++]); string_arg = JS_ToCString(ctx, argv[i++]);
if (!string_arg) if (!string_arg)
goto fail; goto fail;
int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p); int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
JS_FreeCString(ctx, string_arg); JS_FreeCString(ctx, string_arg);
} else { } else {
if (JS_ToInt32(ctx, &int32_arg, argv[i++])) if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
goto fail; goto fail;
} }
/* handle utf-8 encoding explicitly */ // XXX: throw an exception?
if ((unsigned)int32_arg > 0x10FFFF) if ((unsigned)int32_arg > 0x10FFFF)
int32_arg = 0xFFFD; int32_arg = 0xFFFD;
/* ignore conversion flags, width and precision */ /* ignore conversion flags, width and precision */
len = unicode_to_utf8(cbuf, int32_arg); len = utf8_encode(cbuf, int32_arg);
dbuf_put(&dbuf, cbuf, len); dbuf_put(&dbuf, cbuf, len);
break; break;

271
quickjs.c
View file

@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size,
snprintf(buf, buf_size, "<invalid %x>", atom); snprintf(buf, buf_size, "<invalid %x>", atom);
} else { } else {
JSAtomStruct *p = rt->atom_array[atom]; JSAtomStruct *p = rt->atom_array[atom];
*buf = '\0';
if (atom_is_free(p)) { if (atom_is_free(p)) {
assert(!atom_is_free(p)); assert(!atom_is_free(p));
snprintf(buf, buf_size, "<free %x>", atom); snprintf(buf, buf_size, "<free %x>", atom);
} else { } else if (p != NULL) {
int i, c; JSString *str = p;
char *q; if (str->is_wide_char) {
JSString *str; /* encode surrogates correctly */
utf8_encode_buf16(buf, buf_size, str->u.str16, str->len);
q = buf; } else {
str = p; /* special case ASCII strings */
if (str) { int i, c = 0;
if (!str->is_wide_char) {
/* special case ASCII strings */
c = 0;
for(i = 0; i < str->len; i++) {
c |= str->u.str8[i];
}
if (c < 0x80)
return (const char *)str->u.str8;
}
for(i = 0; i < str->len; i++) { for(i = 0; i < str->len; i++) {
c = string_get(str, i); c |= str->u.str8[i];
if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX)
break;
if (c < 128) {
*q++ = c;
} else {
q += unicode_to_utf8((uint8_t *)q, c);
}
} }
if (c < 0x80)
return (const char *)str->u.str8;
utf8_encode_buf8(buf, buf_size, str->u.str8, str->len);
} }
*q = '\0';
} }
} }
return buf; return buf;
@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)
/* return a string atom containing name concatenated with str1 */ /* return a string atom containing name concatenated with str1 */
/* `str1` may be pure ASCII or UTF-8 encoded */ /* `str1` may be pure ASCII or UTF-8 encoded */
// TODO(chqrlie): use string concatenation instead of UTF-8 conversion
static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1) static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
{ {
JSValue str; JSValue str;
@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s)
/* create a string from a UTF-8 buffer */ /* create a string from a UTF-8 buffer */
JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len) JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
{ {
const uint8_t *p, *p_end, *p_start, *p_next; JSString *str;
uint32_t c; size_t len;
StringBuffer b_s, *b = &b_s; int kind;
size_t len1;
if (buf_len <= 0) { if (buf_len <= 0) {
return JS_AtomToString(ctx, JS_ATOM_empty_string); return JS_AtomToString(ctx, JS_ATOM_empty_string);
} }
p_start = (const uint8_t *)buf; /* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */
p_end = p_start + buf_len; kind = utf8_scan(buf, buf_len, &len);
p = p_start; if (len > JS_STRING_LEN_MAX)
while (p < p_end && *p < 128)
p++;
len1 = p - p_start;
if (len1 > JS_STRING_LEN_MAX)
return JS_ThrowRangeError(ctx, "invalid string length"); return JS_ThrowRangeError(ctx, "invalid string length");
if (p == p_end) {
/* ASCII string */
return js_new_string8_len(ctx, buf, buf_len);
} else {
if (string_buffer_init(ctx, b, buf_len))
goto fail;
string_buffer_write8(b, p_start, len1);
while (p < p_end) {
if (*p < 128) {
string_buffer_putc8(b, *p++);
} else {
/* parse utf-8 sequence, return 0xFFFFFFFF for error */
c = unicode_from_utf8(p, p_end - p, &p_next);
if (c < 0x10000) {
p = p_next;
} else if (c <= 0x10FFFF) {
p = p_next;
/* surrogate pair */
string_buffer_putc16(b, get_hi_surrogate(c));
c = get_lo_surrogate(c);
} else {
/* invalid char */
c = 0xfffd;
/* skip the invalid chars */
/* XXX: seems incorrect. Why not just use c = *p++; ? */
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
p++;
if (p < p_end) {
p++;
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
p++;
}
}
string_buffer_putc16(b, c);
}
}
}
return string_buffer_end(b);
fail: switch (kind) {
string_buffer_free(b); case UTF8_PLAIN_ASCII:
return JS_EXCEPTION; str = js_alloc_string(ctx, len, 0);
if (!str)
return JS_EXCEPTION;
memcpy(str->u.str8, buf, len);
str->u.str8[len] = '\0';
break;
case UTF8_NON_ASCII:
/* buf contains non-ASCII code-points, but limited to 8-bit values */
str = js_alloc_string(ctx, len, 0);
if (!str)
return JS_EXCEPTION;
utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len);
break;
default:
// This causes a potential problem in JS_ThrowError if message is invalid
//if (kind & UTF8_HAS_ERRORS)
// return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence");
str = js_alloc_string(ctx, len, 1);
if (!str)
return JS_EXCEPTION;
utf8_decode_buf16(str->u.str16, len, buf, buf_len);
break;
}
return JS_MKPTR(JS_TAG_STRING, str);
} }
static JSValue JS_ConcatString3(JSContext *ctx, const char *str1, static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
@ -4067,7 +4035,7 @@ go:
/* c = 0xfffd; */ /* error */ /* c = 0xfffd; */ /* error */
} }
} }
q += unicode_to_utf8(q, c); q += utf8_encode(q, c);
} }
} }
} }
@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val)
return JS_ToBoolFree(ctx, js_dup(val)); return JS_ToBoolFree(ctx, js_dup(val));
} }
/* pc points to pure ASCII or UTF-8, null terminated contents */
static int skip_spaces(const char *pc) static int skip_spaces(const char *pc)
{ {
const uint8_t *p, *p_next, *p_start; const uint8_t *p, *p_next, *p_start;
@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc)
p = p_start = (const uint8_t *)pc; p = p_start = (const uint8_t *)pc;
for (;;) { for (;;) {
c = *p; c = *p++;
if (c < 128) { if (c < 0x80) {
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20))) if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
break; break;
p++;
} else { } else {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next); c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
if (!lre_is_space(c)) if (!lre_is_space(c))
break; break;
p = p_next; p = p_next;
} }
} }
return p - p_start; return p - 1 - p_start;
} }
static inline int to_digit(int c) static inline int to_digit(int c)
@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s)
static __exception int js_parse_template_part(JSParseState *s, static __exception int js_parse_template_part(JSParseState *s,
const uint8_t *p) const uint8_t *p)
{ {
const uint8_t *p_next;
uint32_t c; uint32_t c;
StringBuffer b_s, *b = &b_s; StringBuffer b_s, *b = &b_s;
@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s,
s->eol = &p[-1]; s->eol = &p[-1];
s->mark = p; s->mark = p;
} else if (c >= 0x80) { } else if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); if (p_next == p) {
if (c > 0x10FFFF) {
js_parse_error(s, "invalid UTF-8 sequence"); js_parse_error(s, "invalid UTF-8 sequence");
goto fail; goto fail;
} }
@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
BOOL do_throw, const uint8_t *p, BOOL do_throw, const uint8_t *p,
JSToken *token, const uint8_t **pp) JSToken *token, const uint8_t **pp)
{ {
const uint8_t *p_next;
int ret; int ret;
uint32_t c; uint32_t c;
StringBuffer b_s, *b = &b_s; StringBuffer b_s, *b = &b_s;
@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
} }
goto fail; goto fail;
} else if (c >= 0x80) { } else if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next); if (p_next == p + 1) {
if (c > 0x10FFFF) {
goto invalid_utf8; goto invalid_utf8;
} }
p = p_next; p = p_next;
@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
break; break;
} }
} else if (c >= 0x80) { } else if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); if (p_next == p)
if (c > 0x10FFFF)
goto invalid_utf8; goto invalid_utf8;
p = p_next; p = p_next;
} }
@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) {
static __exception int js_parse_regexp(JSParseState *s) static __exception int js_parse_regexp(JSParseState *s)
{ {
const uint8_t *p; const uint8_t *p, *p_next;
BOOL in_class; BOOL in_class;
StringBuffer b_s, *b = &b_s; StringBuffer b_s, *b = &b_s;
StringBuffer b2_s, *b2 = &b2_s; StringBuffer b2_s, *b2 = &b2_s;
@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s)
else if (c == '\0' && p >= s->buf_end) else if (c == '\0' && p >= s->buf_end)
goto eof_error; goto eof_error;
else if (c >= 0x80) { else if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); if (p_next == p) {
if (c > 0x10FFFF) {
goto invalid_utf8; goto invalid_utf8;
} }
p = p_next; p = p_next;
@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s)
goto eol_error; goto eol_error;
} }
} else if (c >= 0x80) { } else if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next); if (p_next == p) {
if (c > 0x10FFFF) {
invalid_utf8: invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence"); js_parse_error(s, "invalid UTF-8 sequence");
goto fail; goto fail;
@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s)
/* flags */ /* flags */
for(;;) { for(;;) {
const uint8_t *p_next = p; c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = *p_next++; /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
goto invalid_utf8;
}
}
if (!lre_js_is_ident_next(c)) if (!lre_js_is_ident_next(c))
break; break;
if (string_buffer_putc(b2, c)) if (string_buffer_putc(b2, c))
@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
static JSAtom parse_ident(JSParseState *s, const uint8_t **pp, static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
BOOL *pident_has_escape, int c, BOOL is_private) BOOL *pident_has_escape, int c, BOOL is_private)
{ {
const uint8_t *p, *p1; const uint8_t *p, *p_next;
char ident_buf[128], *buf; char ident_buf[128], *buf;
size_t ident_size, ident_pos; size_t ident_size, ident_pos;
JSAtom atom; JSAtom atom = JS_ATOM_NULL;
p = *pp; p = *pp;
buf = ident_buf; buf = ident_buf;
@ -19032,28 +18992,26 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
if (is_private) if (is_private)
buf[ident_pos++] = '#'; buf[ident_pos++] = '#';
for(;;) { for(;;) {
p1 = p; if (c < 0x80) {
if (c < 128) {
buf[ident_pos++] = c; buf[ident_pos++] = c;
} else { } else {
ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c); ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c);
} }
c = *p1++; c = *p;
if (c == '\\' && *p1 == 'u') { p_next = p + 1;
c = lre_parse_escape(&p1, TRUE); if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p_next, TRUE);
*pident_has_escape = TRUE; *pident_has_escape = TRUE;
} else if (c >= 128) { } else if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
} }
if (!lre_js_is_ident_next(c)) if (!lre_js_is_ident_next(c))
break; break;
p = p1; p = p_next;
if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) { if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) { if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf))
atom = JS_ATOM_NULL;
goto done; goto done;
}
} }
} }
/* buf is pure ASCII or UTF-8 encoded */ /* buf is pure ASCII or UTF-8 encoded */
@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
static __exception int next_token(JSParseState *s) static __exception int next_token(JSParseState *s)
{ {
const uint8_t *p; const uint8_t *p, *p_next;
int c; int c;
BOOL ident_has_escape; BOOL ident_has_escape;
JSAtom atom; JSAtom atom;
@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s)
s->got_lf = TRUE; /* considered as LF for ASI */ s->got_lf = TRUE; /* considered as LF for ASI */
p++; p++;
} else if (*p >= 0x80) { } else if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
/* ignore invalid UTF-8 in comments */
if (c == CP_LS || c == CP_PS) { if (c == CP_LS || c == CP_PS) {
s->got_lf = TRUE; /* considered as LF for ASI */ s->got_lf = TRUE; /* considered as LF for ASI */
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
} }
} else { } else {
p++; p++;
@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s)
if (*p == '\r' || *p == '\n') if (*p == '\r' || *p == '\n')
break; break;
if (*p >= 0x80) { if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
/* ignore invalid UTF-8 in comments */
/* LS or PS are considered as line terminator */ /* LS or PS are considered as line terminator */
if (c == CP_LS || c == CP_PS) { if (c == CP_LS || c == CP_PS) {
break; break;
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
} }
} else { } else {
p++; p++;
@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s)
case '#': case '#':
/* private name */ /* private name */
{ {
const uint8_t *p1;
p++; p++;
p1 = p; c = *p;
c = *p1++; p_next = p + 1;
if (c == '\\' && *p1 == 'u') { if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p1, TRUE); c = lre_parse_escape(&p_next, TRUE);
} else if (c >= 128) { } else if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
} }
if (!lre_js_is_ident_first(c)) { if (!lre_js_is_ident_first(c)) {
js_parse_error(s, "invalid first character of private name"); js_parse_error(s, "invalid first character of private name");
goto fail; goto fail;
} }
p = p1; p = p_next;
ident_has_escape = FALSE; /* not used */ ident_has_escape = FALSE; /* not used */
atom = parse_ident(s, &p, &ident_has_escape, c, TRUE); atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
if (atom == JS_ATOM_NULL) if (atom == JS_ATOM_NULL)
@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s)
parse_number: parse_number:
{ {
JSValue ret; JSValue ret;
const uint8_t *p1;
int flags; int flags;
flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL | flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX; ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s)
goto fail; goto fail;
/* reject `10instanceof Number` */ /* reject `10instanceof Number` */
if (JS_VALUE_IS_NAN(ret) || if (JS_VALUE_IS_NAN(ret) ||
lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) { lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
JS_FreeValue(s->ctx, ret); JS_FreeValue(s->ctx, ret);
js_parse_error(s, "invalid number literal"); js_parse_error(s, "invalid number literal");
goto fail; goto fail;
@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s)
} }
break; break;
default: default:
if (c >= 128) { if (c >= 0x80) { /* non-ASCII code-point */
/* unicode value */ c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); if (p_next == p + 1)
goto invalid_utf8;
p = p_next;
switch(c) { switch(c) {
case CP_PS: case CP_PS:
case CP_LS: case CP_LS:
@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s)
// dump_token(s, &s->token); // dump_token(s, &s->token);
return 0; return 0;
invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence");
fail: fail:
s->token.val = TOK_ERROR; s->token.val = TOK_ERROR;
return -1; return -1;
@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms
static int json_parse_string(JSParseState *s, const uint8_t **pp) static int json_parse_string(JSParseState *s, const uint8_t **pp)
{ {
const uint8_t *p = *pp; const uint8_t *p, *p_next;
int i; int i;
uint32_t c; uint32_t c;
StringBuffer b_s, *b = &b_s; StringBuffer b_s, *b = &b_s;
@ -19581,6 +19541,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
if (string_buffer_init(s->ctx, b, 32)) if (string_buffer_init(s->ctx, b, 32))
goto fail; goto fail;
p = *pp;
for(;;) { for(;;) {
if (p >= s->buf_end) { if (p >= s->buf_end) {
goto end_of_input; goto end_of_input;
@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
} }
} else } else
if (c >= 0x80) { if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p - 1, s->buf_end - p, &p_next);
c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next); if (p_next == p) {
if (c > 0x10FFFF) {
json_parse_error(s, p - 1, "Bad UTF-8 sequence"); json_parse_error(s, p - 1, "Bad UTF-8 sequence");
goto fail; goto fail;
} }
@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)
static __exception int json_next_token(JSParseState *s) static __exception int json_next_token(JSParseState *s)
{ {
const uint8_t *p; const uint8_t *p, *p_next;
int c; int c;
JSAtom atom; JSAtom atom;
@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s)
goto fail; goto fail;
break; break;
default: default:
if (c >= 128) { if (c >= 0x80) {
const uint8_t *p_next; c = utf8_decode(p, s->buf_end - p, &p_next);
c = unicode_from_utf8(p, s->buf_end - p, &p_next); if (p_next == p + 1) {
if (c == -1) {
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p); js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else { } else {
if (c > 0xFFFF) { if (c > 0xFFFF) {
@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
if (*p == '\n' || *p == '\r') { if (*p == '\n' || *p == '\r') {
break; break;
} else if (*p >= 0x80) { } else if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
if (c == CP_LS || c == CP_PS) { /* purposely ignore UTF-8 encoding errors in this comment line */
if (c == CP_LS || c == CP_PS)
break; break;
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
}
} else { } else {
p++; p++;
} }