Improve UTF-8 decoding and encoding functions (#410)

Ensure proper UTF-8 encoding (1 to 4 bytes).
Handle invalid encodings (return 0xFFFD and consume a single byte)
Individually encoded surrogate code points are accepted.

- add `utf8_scan()` to analyze a byte array for UTF-8 contents
  detects invalid encoding, computes number of codepoints and content kind:
  plain ASCII, 8-bit, 16-bit or larger codepoints.
- add `utf8_encode_len(c)` to compute the number of bytes to encode `c`
- rename `unicode_to_utf8` as `utf8_encode`
- rename `unicode_from_utf8` as `utf8_decode`
- add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded
  byte array known to contain only ASCII and 8-bit codepoints.
- add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded
  byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs
  for non-BMP1 codepoints.
- add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit
  codepoints as a UTF-8 encoded null terminated string
- add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit
  codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string
- detect invalid UTF-8 encoding in RegExp parser
- simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions
- simplify UTF-8 decoding and error testing
This commit is contained in:
Charlie Gordon 2024-05-21 14:08:33 +02:00 committed by GitHub
parent f588210641
commit 1baa6763f8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 490 additions and 269 deletions

398
cutils.c
View file

@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s)
memset(s, 0, sizeof(*s));
}
/*--- Unicode / UTF-8 utility functions --*/
/*--- UTF-8 utility functions --*/
/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes
are output. */
int unicode_to_utf8(uint8_t *buf, unsigned int c)
/* Note: only encode valid codepoints (0x0000..0x10FFFF).
At most UTF8_CHAR_LEN_MAX bytes are output. */
/* Compute the number of bytes of the UTF-8 encoding for a codepoint
   `c` is a code-point.
   Returns the number of bytes, in the range 1..4. If a codepoint is
   beyond 0x10FFFF the return value is 3 as the codepoint would be
   encoded as 0xFFFD.
 */
size_t utf8_encode_len(uint32_t c)
{
    if (c < 0x80)
        return 1;       /* ASCII */
    if (c < 0x800)
        return 2;
    if (c < 0x10000)
        return 3;
    if (c < 0x110000)
        return 4;
    return 3;           /* out of range: length of the U+FFFD replacement */
}
/* Encode a codepoint in UTF-8
   `buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes
   `c` is a code-point.
   Returns the number of bytes stored, in the range 1..4.
   If the codepoint is beyond 0x10FFFF, the replacement character
   0xFFFD is encoded instead and the return value is 3.
   No null byte is stored after the encoded bytes.
 */
size_t utf8_encode(uint8_t *buf, uint32_t c)
{
    if (c < 0x80) {
        /* 1 byte: 0xxxxxxx */
        buf[0] = c;
        return 1;
    }
    if (c < 0x800) {
        /* 2 bytes: 110xxxxx 10xxxxxx */
        buf[0] = (c >> 6) | 0xC0;
        buf[1] = (c & 0x3F) | 0x80;
        return 2;
    }
    if (c < 0x10000) {
        /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        buf[0] = (c >> 12) | 0xE0;
        buf[1] = ((c >> 6) & 0x3F) | 0x80;
        buf[2] = (c & 0x3F) | 0x80;
        return 3;
    }
    if (c < 0x110000) {
        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buf[0] = (c >> 18) | 0xF0;
        buf[1] = ((c >> 12) & 0x3F) | 0x80;
        buf[2] = ((c >> 6) & 0x3F) | 0x80;
        buf[3] = (c & 0x3F) | 0x80;
        return 4;
    }
    /* out of range: store the encoding of U+FFFD (EF BF BD) */
    buf[0] = (0xFFFD >> 12) | 0xE0;
    buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80;
    buf[2] = (0xFFFD & 0x3F) | 0x80;
    return 3;
}
static const unsigned int utf8_min_code[5] = {
0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
};
static const unsigned char utf8_first_code_mask[5] = {
0x1f, 0xf, 0x7, 0x3, 0x1,
};
/* return -1 if error. *pp is not updated in this case. max_len must
be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
/* Decode a single code point from a UTF-8 encoded array of bytes
`p` is a valid pointer to an array of bytes
`max_len` is the number of bytes available in the array
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
to the byte following the current sequence.
Return the code point at `p`, in the range `0..0x10FFFF`
Return 0xFFFD on error. Only a single byte is consumed in this case
The maximum length for a UTF-8 byte sequence is 4 bytes.
This implements the algorithm specified in whatwg.org, except it accepts
UTF-8 encoded surrogates as JavaScript allows them in strings.
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
*/
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
{
int l, c, b, i;
uint32_t c;
uint8_t lower, upper;
c = *p++;
if (c < 0x80) {
@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
return c;
}
switch(c) {
case 0xc0: case 0xc1: case 0xc2: case 0xc3:
case 0xc4: case 0xc5: case 0xc6: case 0xc7:
case 0xc8: case 0xc9: case 0xca: case 0xcb:
case 0xcc: case 0xcd: case 0xce: case 0xcf:
case 0xd0: case 0xd1: case 0xd2: case 0xd3:
case 0xd4: case 0xd5: case 0xd6: case 0xd7:
case 0xd8: case 0xd9: case 0xda: case 0xdb:
case 0xdc: case 0xdd: case 0xde: case 0xdf:
l = 1;
case 0xC2: case 0xC3:
case 0xC4: case 0xC5: case 0xC6: case 0xC7:
case 0xC8: case 0xC9: case 0xCA: case 0xCB:
case 0xCC: case 0xCD: case 0xCE: case 0xCF:
case 0xD0: case 0xD1: case 0xD2: case 0xD3:
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
if (max_len < 2) {
// need more bytes
break;
case 0xe0: case 0xe1: case 0xe2: case 0xe3:
case 0xe4: case 0xe5: case 0xe6: case 0xe7:
case 0xe8: case 0xe9: case 0xea: case 0xeb:
case 0xec: case 0xed: case 0xee: case 0xef:
l = 2;
}
if (*p >= 0x80 && *p <= 0xBF) {
*pp = p + 1;
return ((c - 0xC0) << 6) + (*p - 0x80);
}
// otherwise encoding error
break;
case 0xf0: case 0xf1: case 0xf2: case 0xf3:
case 0xf4: case 0xf5: case 0xf6: case 0xf7:
l = 3;
case 0xE0:
lower = 0xA0; /* reject invalid encoding */
goto need2;
case 0xE1: case 0xE2: case 0xE3:
case 0xE4: case 0xE5: case 0xE6: case 0xE7:
case 0xE8: case 0xE9: case 0xEA: case 0xEB:
case 0xEC: case 0xED: case 0xEE: case 0xEF:
lower = 0x80;
need2:
if (max_len < 3) {
// need more bytes
break;
case 0xf8: case 0xf9: case 0xfa: case 0xfb:
l = 4;
}
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
*pp = p + 2;
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
}
// otherwise encoding error
break;
case 0xfc: case 0xfd:
l = 5;
case 0xF0:
lower = 0x90; /* reject invalid encoding */
upper = 0xBF;
goto need3;
case 0xF4:
lower = 0x80;
upper = 0x8F; /* reject values above 0x10FFFF */
goto need3;
case 0xF1: case 0xF2: case 0xF3:
lower = 0x80;
upper = 0xBF;
need3:
if (max_len < 4) {
// need more bytes
break;
}
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
&& p[2] >= 0x80 && p[2] <= 0xBF) {
*pp = p + 3;
return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) +
((p[1] - 0x80) << 6) + (p[2] - 0x80);
}
// otherwise encoding error
break;
default:
return -1;
// invalid lead byte
break;
}
/* check that we have enough characters */
if (l > (max_len - 1))
return -1;
c &= utf8_first_code_mask[l - 1];
for(i = 0; i < l; i++) {
b = *p++;
if (b < 0x80 || b >= 0xc0)
return -1;
c = (c << 6) | (b & 0x3f);
}
if (c < utf8_min_code[l - 1])
return -1;
*pp = p;
return c;
return 0xFFFD;
}
/* Classify the contents of a UTF-8 encoded buffer
   `buf` is a valid pointer to the bytes to examine
   `buf_len` is the number of bytes to scan
   `plen` points to a `size_t` that receives the number of units:
   one per decoded code point, plus one extra for each code point
   above 0xFFFF (which needs a UTF-16 surrogate pair).
   The return value is a combination of these bits:
   - `UTF8_PLAIN_ASCII`: nothing but 7-bit ASCII was found
   - `UTF8_NON_ASCII`: at least one non ASCII code point (8-bit or more)
   - `UTF8_HAS_16BIT`: at least one code point above 0xFF
   - `UTF8_HAS_NON_BMP1`: at least one code point above 0xFFFF
   - `UTF8_HAS_ERRORS`: at least one encoding error
 */
int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
{
    const uint8_t *cur, *end, *after;
    size_t k, units;
    uint8_t seen = 0;
    int flags;

    /* quick pass: OR all bytes together to detect any byte >= 0x80 */
    // TODO: handle more than 1 byte at a time
    for (k = 0; k < buf_len; k++)
        seen |= buf[k];

    if (seen < 0x80) {
        /* pure ASCII: one unit per byte */
        *plen = buf_len;
        return UTF8_PLAIN_ASCII;
    }

    cur = (const uint8_t *)buf;
    end = cur + buf_len;
    flags = UTF8_NON_ASCII;
    units = 0;
    while (cur < end) {
        uint32_t cp;

        units++;
        if (*cur < 0x80) {
            cur++;
            continue;
        }
        /* multi-byte sequence: the decoder consumes exactly one byte
           when the sequence is malformed */
        cp = utf8_decode(cur, end - cur, &after);
        if (after == cur + 1)
            flags |= UTF8_HAS_ERRORS;
        cur = after;
        if (cp > 0xFFFF) {
            /* needs a surrogate pair: counts for two units */
            units++;
            flags |= UTF8_HAS_16BIT | UTF8_HAS_NON_BMP1;
        } else if (cp > 0xFF) {
            flags |= UTF8_HAS_16BIT;
        }
    }
    *plen = units;
    return flags;
}
/* Decode a string encoded in UTF-8 into an array of bytes
   `src` points to the source string. It is assumed to be correctly encoded
   and to only contain code points below 0x100 (each result is stored in
   a single byte).
   `src_len` is the length of the source string
   `dest` points to the destination array, it can be null if `dest_len` is `0`
   `dest_len` is the length of the destination array. A null
   terminator is stored at the end of the array unless `dest_len` is `0`.
   Returns the number of code points decoded from `src`, which can be
   larger than what fits in `dest`: the output is truncated in this case.
 */
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len)
{
    const uint8_t *p, *p_end;
    size_t i;

    p = (const uint8_t *)src;
    p_end = p + src_len;
    for (i = 0; p < p_end; i++) {
        uint32_t c = *p++;
        /* 2-byte sequence: combine lead and continuation bytes.
           The bounds test prevents reading past the end of `src` if the
           last byte is a lead byte without its continuation byte; such
           a byte is stored as is. */
        if (c >= 0xC0 && p < p_end)
            c = (c << 6) + *p++ - ((0xC0 << 6) + 0x80);
        if (i < dest_len)
            dest[i] = c;
    }
    /* always terminate the output when there is room for at least 1 byte */
    if (i < dest_len)
        dest[i] = '\0';
    else if (dest_len > 0)
        dest[dest_len - 1] = '\0';
    return i;
}
/* Convert a UTF-8 encoded string to an array of 16-bit words
   `src` points to the source string, assumed to be correctly encoded.
   `src_len` is the length of the source string
   `dest` points to the destination array, it can be null if `dest_len` is `0`
   `dest_len` is the length of the destination array, in 16-bit words.
   No null terminator is stored at the end of the array.
   Code points above 0xFFFF are stored as a high/low surrogate pair.
   Returns the number of 16-bit words the full conversion produces;
   words that do not fit in `dest` are counted but not stored.
 */
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len)
{
    const uint8_t *cur = (const uint8_t *)src;
    const uint8_t *end = cur + src_len;
    size_t n = 0;

    while (cur < end) {
        uint32_t cp = *cur;

        if (cp < 0x80) {
            /* ASCII byte: copied through */
            cur++;
        } else {
            /* multi-byte sequence; encoding errors come back as 0xFFFD
               and use a single byte */
            cp = utf8_decode(cur, end - cur, &cur);
            if (cp > 0xFFFF) {
                /* emit the high surrogate now, the low one below */
                if (n < dest_len)
                    dest[n] = get_hi_surrogate(cp);
                n++;
                cp = get_lo_surrogate(cp);
            }
        }
        if (n < dest_len)
            dest[n] = cp;
        n++;
    }
    return n;
}
/* Convert an array of 8-bit code points to a UTF-8 encoded string
   `src` points to the source buffer.
   `src_len` is the length of the source buffer
   `dest` points to the destination array, it can be null if `dest_len` is `0`
   `dest_len` is the length in bytes of the destination array. A null
   terminator is stored after the encoded bytes unless `dest_len` is `0`.
   Returns the number of bytes of the complete encoding, excluding the
   null terminator. If this is `dest_len` or more, the output was
   truncated at a code point boundary.
 */
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len)
{
    size_t in = 0, out = 0;

    while (in < src_len) {
        uint32_t ch = src[in];
        size_t need = (ch < 0x80) ? 1 : 2;

        /* stop unless the sequence and the terminator both fit */
        if (out + need >= dest_len)
            break;
        if (need == 1) {
            dest[out++] = ch;
        } else {
            dest[out++] = (ch >> 6) | 0xC0;
            dest[out++] = (ch & 0x3F) | 0x80;
        }
        in++;
    }
    if (out < dest_len)
        dest[out] = '\0';
    /* account for whatever could not be stored */
    for (; in < src_len; in++)
        out += 1 + (src[in] >= 0x80);
    return out;
}
/* Convert an array of 16-bit code units to a UTF-8 encoded string
   `src` points to the source buffer.
   `src_len` is the length of the source buffer, in 16-bit words
   `dest` points to the destination array, it can be null if `dest_len` is `0`
   `dest_len` is the length in bytes of the destination array. A null
   terminator is stored after the encoded bytes unless `dest_len` is `0`.
   A high surrogate immediately followed by a low surrogate is combined
   into a single code point; any other surrogate is encoded individually.
   Returns the number of bytes of the complete encoding, excluding the
   null terminator. If this is `dest_len` or more, the output was
   truncated at a code point boundary.
 */
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len)
{
    size_t in = 0, out = 0;

    while (in < src_len) {
        uint32_t cp = src[in];
        size_t used = 1;    /* source words consumed by this code point */

        if (cp < 0x80) {
            /* keep one byte free for the terminator */
            if (out + 1 >= dest_len)
                break;
            dest[out++] = cp;
        } else {
            if (is_hi_surrogate(cp) && in + 1 < src_len && is_lo_surrogate(src[in + 1])) {
                cp = from_surrogate(cp, src[in + 1]);
                used = 2;
            }
            if (out + utf8_encode_len(cp) >= dest_len)
                break;
            out += utf8_encode((uint8_t *)dest + out, cp);
        }
        in += used;
    }
    if (out < dest_len)
        dest[out] = '\0';
    /* `in` still designates the first code point that was not stored:
       measure the rest of the source without writing anything */
    while (in < src_len) {
        uint32_t cp = src[in++];

        if (cp < 0x80) {
            out++;
        } else {
            if (is_hi_surrogate(cp) && in < src_len && is_lo_surrogate(src[in]))
                cp = from_surrogate(cp, src[in++]);
            out += utf8_encode_len(cp);
        }
    }
    return out;
}
/*--- integer to string conversions --*/

View file

@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s)
s->error = TRUE;
}
#define UTF8_CHAR_LEN_MAX 6
/*---- UTF-8 and UTF-16 handling ----*/
int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
#define UTF8_CHAR_LEN_MAX 4
/* Content-kind flags returned by utf8_scan(). UTF8_PLAIN_ASCII is the
   zero value; the other constants are distinct bits that are OR-ed
   together in the result. */
enum {
UTF8_PLAIN_ASCII = 0, // 7-bit ASCII plain text
UTF8_NON_ASCII = 1, // has non ASCII code points (8-bit or more)
UTF8_HAS_16BIT = 2, // has 16-bit code points
UTF8_HAS_NON_BMP1 = 4, // has non-BMP1 code points, needs UTF-16 surrogate pairs
UTF8_HAS_ERRORS = 8, // has encoding errors
};
int utf8_scan(const char *buf, size_t len, size_t *plen);
size_t utf8_encode_len(uint32_t c);
size_t utf8_encode(uint8_t *buf, uint32_t c);
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len);
static inline BOOL is_surrogate(uint32_t c)
{

View file

@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
static int get_class_atom(REParseState *s, CharRange *cr,
const uint8_t **pp, BOOL inclass)
{
const uint8_t *p;
const uint8_t *p, *p_next;
uint32_t c;
int ret;
@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* fall thru */
default:
normal_char:
/* normal char */
if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if ((unsigned)c > 0xffff && !s->is_unicode) {
/* XXX: should handle non BMP-1 code points */
p++;
if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p)
return re_parse_error(s, "invalid UTF-8 sequence");
p = p_next;
if (c > 0xFFFF && !s->is_unicode) {
// TODO(chqrlie): should handle non BMP-1 code points in
// the calling function and not require the source string
// to be CESU-8 encoded if not s->is_unicode
return re_parse_error(s, "malformed unicode char");
}
} else {
p++;
}
break;
}
@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
/* '*pp' is the first char after '<' */
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
{
const uint8_t *p, *p1;
const uint8_t *p, *p_next;
uint32_t c, d;
char *q;
p = *pp;
q = buf;
for(;;) {
c = *p;
c = *p++;
if (c == '\\') {
p++;
if (*p != 'u')
return -1;
c = lre_parse_escape(&p, 2); // accept surrogate pairs
if ((int)c < 0)
return -1;
} else if (c == '>') {
break;
} else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p)
return -1;
p = p_next;
if (is_hi_surrogate(c)) {
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (is_lo_surrogate(d)) {
c = from_surrogate(c, d);
p = p1;
p = p_next;
}
}
} else {
p++;
}
if (c > 0x10FFFF)
return -1;
if (q == buf) {
if (!lre_js_is_ident_first(c))
return -1;
@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
}
if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
return -1;
if (c < 128) {
if (c < 0x80) {
*q++ = c;
} else {
q += unicode_to_utf8((uint8_t*)q, c);
q += utf8_encode((uint8_t*)q, c);
}
}
if (q == buf)
return -1;
*q = '\0';
p++;
*pp = p;
return 0;
}

View file

@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx,
if (i >= argc)
goto missing;
if (JS_IsString(argv[i])) {
// TODO(chqrlie) need an API to wrap charCodeAt and codePointAt
string_arg = JS_ToCString(ctx, argv[i++]);
if (!string_arg)
goto fail;
int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
JS_FreeCString(ctx, string_arg);
} else {
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
goto fail;
}
/* handle utf-8 encoding explicitly */
// XXX: throw an exception?
if ((unsigned)int32_arg > 0x10FFFF)
int32_arg = 0xFFFD;
/* ignore conversion flags, width and precision */
len = unicode_to_utf8(cbuf, int32_arg);
len = utf8_encode(cbuf, int32_arg);
dbuf_put(&dbuf, cbuf, len);
break;

259
quickjs.c
View file

@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size,
snprintf(buf, buf_size, "<invalid %x>", atom);
} else {
JSAtomStruct *p = rt->atom_array[atom];
*buf = '\0';
if (atom_is_free(p)) {
assert(!atom_is_free(p));
snprintf(buf, buf_size, "<free %x>", atom);
} else if (p != NULL) {
JSString *str = p;
if (str->is_wide_char) {
/* encode surrogates correctly */
utf8_encode_buf16(buf, buf_size, str->u.str16, str->len);
} else {
int i, c;
char *q;
JSString *str;
q = buf;
str = p;
if (str) {
if (!str->is_wide_char) {
/* special case ASCII strings */
c = 0;
int i, c = 0;
for(i = 0; i < str->len; i++) {
c |= str->u.str8[i];
}
if (c < 0x80)
return (const char *)str->u.str8;
utf8_encode_buf8(buf, buf_size, str->u.str8, str->len);
}
for(i = 0; i < str->len; i++) {
c = string_get(str, i);
if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX)
break;
if (c < 128) {
*q++ = c;
} else {
q += unicode_to_utf8((uint8_t *)q, c);
}
}
}
*q = '\0';
}
}
return buf;
@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)
/* return a string atom containing name concatenated with str1 */
/* `str1` may be pure ASCII or UTF-8 encoded */
// TODO(chqrlie): use string concatenation instead of UTF-8 conversion
static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
{
JSValue str;
@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s)
/* create a string from a UTF-8 buffer */
JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
{
const uint8_t *p, *p_end, *p_start, *p_next;
uint32_t c;
StringBuffer b_s, *b = &b_s;
size_t len1;
JSString *str;
size_t len;
int kind;
if (buf_len <= 0) {
return JS_AtomToString(ctx, JS_ATOM_empty_string);
}
p_start = (const uint8_t *)buf;
p_end = p_start + buf_len;
p = p_start;
while (p < p_end && *p < 128)
p++;
len1 = p - p_start;
if (len1 > JS_STRING_LEN_MAX)
/* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */
kind = utf8_scan(buf, buf_len, &len);
if (len > JS_STRING_LEN_MAX)
return JS_ThrowRangeError(ctx, "invalid string length");
if (p == p_end) {
/* ASCII string */
return js_new_string8_len(ctx, buf, buf_len);
} else {
if (string_buffer_init(ctx, b, buf_len))
goto fail;
string_buffer_write8(b, p_start, len1);
while (p < p_end) {
if (*p < 128) {
string_buffer_putc8(b, *p++);
} else {
/* parse utf-8 sequence, return 0xFFFFFFFF for error */
c = unicode_from_utf8(p, p_end - p, &p_next);
if (c < 0x10000) {
p = p_next;
} else if (c <= 0x10FFFF) {
p = p_next;
/* surrogate pair */
string_buffer_putc16(b, get_hi_surrogate(c));
c = get_lo_surrogate(c);
} else {
/* invalid char */
c = 0xfffd;
/* skip the invalid chars */
/* XXX: seems incorrect. Why not just use c = *p++; ? */
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
p++;
if (p < p_end) {
p++;
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
p++;
}
}
string_buffer_putc16(b, c);
}
}
}
return string_buffer_end(b);
fail:
string_buffer_free(b);
switch (kind) {
case UTF8_PLAIN_ASCII:
str = js_alloc_string(ctx, len, 0);
if (!str)
return JS_EXCEPTION;
memcpy(str->u.str8, buf, len);
str->u.str8[len] = '\0';
break;
case UTF8_NON_ASCII:
/* buf contains non-ASCII code-points, but limited to 8-bit values */
str = js_alloc_string(ctx, len, 0);
if (!str)
return JS_EXCEPTION;
utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len);
break;
default:
// This causes a potential problem in JS_ThrowError if message is invalid
//if (kind & UTF8_HAS_ERRORS)
// return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence");
str = js_alloc_string(ctx, len, 1);
if (!str)
return JS_EXCEPTION;
utf8_decode_buf16(str->u.str16, len, buf, buf_len);
break;
}
return JS_MKPTR(JS_TAG_STRING, str);
}
static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
@ -4067,7 +4035,7 @@ go:
/* c = 0xfffd; */ /* error */
}
}
q += unicode_to_utf8(q, c);
q += utf8_encode(q, c);
}
}
}
@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val)
return JS_ToBoolFree(ctx, js_dup(val));
}
/* pc points to pure ASCII or UTF-8, null terminated contents */
static int skip_spaces(const char *pc)
{
const uint8_t *p, *p_next, *p_start;
@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc)
p = p_start = (const uint8_t *)pc;
for (;;) {
c = *p;
if (c < 128) {
c = *p++;
if (c < 0x80) {
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
break;
p++;
} else {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
if (!lre_is_space(c))
break;
p = p_next;
}
}
return p - p_start;
return p - 1 - p_start;
}
static inline int to_digit(int c)
@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s)
static __exception int js_parse_template_part(JSParseState *s,
const uint8_t *p)
{
const uint8_t *p_next;
uint32_t c;
StringBuffer b_s, *b = &b_s;
@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s,
s->eol = &p[-1];
s->mark = p;
} else if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p) {
js_parse_error(s, "invalid UTF-8 sequence");
goto fail;
}
@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
BOOL do_throw, const uint8_t *p,
JSToken *token, const uint8_t **pp)
{
const uint8_t *p_next;
int ret;
uint32_t c;
StringBuffer b_s, *b = &b_s;
@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
}
goto fail;
} else if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p + 1) {
goto invalid_utf8;
}
p = p_next;
@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
break;
}
} else if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF)
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p)
goto invalid_utf8;
p = p_next;
}
@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) {
static __exception int js_parse_regexp(JSParseState *s)
{
const uint8_t *p;
const uint8_t *p, *p_next;
BOOL in_class;
StringBuffer b_s, *b = &b_s;
StringBuffer b2_s, *b2 = &b2_s;
@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s)
else if (c == '\0' && p >= s->buf_end)
goto eof_error;
else if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p) {
goto invalid_utf8;
}
p = p_next;
@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s)
goto eol_error;
}
} else if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p) {
invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence");
goto fail;
@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s)
/* flags */
for(;;) {
const uint8_t *p_next = p;
c = *p_next++;
if (c >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
if (c > 0x10FFFF) {
goto invalid_utf8;
}
}
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
if (!lre_js_is_ident_next(c))
break;
if (string_buffer_putc(b2, c))
@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
BOOL *pident_has_escape, int c, BOOL is_private)
{
const uint8_t *p, *p1;
const uint8_t *p, *p_next;
char ident_buf[128], *buf;
size_t ident_size, ident_pos;
JSAtom atom;
JSAtom atom = JS_ATOM_NULL;
p = *pp;
buf = ident_buf;
@ -19032,30 +18992,28 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
if (is_private)
buf[ident_pos++] = '#';
for(;;) {
p1 = p;
if (c < 128) {
if (c < 0x80) {
buf[ident_pos++] = c;
} else {
ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c);
ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c);
}
c = *p1++;
if (c == '\\' && *p1 == 'u') {
c = lre_parse_escape(&p1, TRUE);
c = *p;
p_next = p + 1;
if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p_next, TRUE);
*pident_has_escape = TRUE;
} else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
}
if (!lre_js_is_ident_next(c))
break;
p = p1;
p = p_next;
if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) {
atom = JS_ATOM_NULL;
if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf))
goto done;
}
}
}
/* buf is pure ASCII or UTF-8 encoded */
atom = JS_NewAtomLen(s->ctx, buf, ident_pos);
done:
@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
static __exception int next_token(JSParseState *s)
{
const uint8_t *p;
const uint8_t *p, *p_next;
int c;
BOOL ident_has_escape;
JSAtom atom;
@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s)
s->got_lf = TRUE; /* considered as LF for ASI */
p++;
} else if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
/* ignore invalid UTF-8 in comments */
if (c == CP_LS || c == CP_PS) {
s->got_lf = TRUE; /* considered as LF for ASI */
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
}
} else {
p++;
@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s)
if (*p == '\r' || *p == '\n')
break;
if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
/* ignore invalid UTF-8 in comments */
/* LS or PS are considered as line terminator */
if (c == CP_LS || c == CP_PS) {
break;
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
}
} else {
p++;
@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s)
case '#':
/* private name */
{
const uint8_t *p1;
p++;
p1 = p;
c = *p1++;
if (c == '\\' && *p1 == 'u') {
c = lre_parse_escape(&p1, TRUE);
} else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
c = *p;
p_next = p + 1;
if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p_next, TRUE);
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
}
if (!lre_js_is_ident_first(c)) {
js_parse_error(s, "invalid first character of private name");
goto fail;
}
p = p1;
p = p_next;
ident_has_escape = FALSE; /* not used */
atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
if (atom == JS_ATOM_NULL)
@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s)
parse_number:
{
JSValue ret;
const uint8_t *p1;
int flags;
flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s)
goto fail;
/* reject `10instanceof Number` */
if (JS_VALUE_IS_NAN(ret) ||
lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) {
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
JS_FreeValue(s->ctx, ret);
js_parse_error(s, "invalid number literal");
goto fail;
@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s)
}
break;
default:
if (c >= 128) {
/* unicode value */
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if (c >= 0x80) { /* non-ASCII code-point */
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
p = p_next;
switch(c) {
case CP_PS:
case CP_LS:
@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s)
// dump_token(s, &s->token);
return 0;
invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence");
fail:
s->token.val = TOK_ERROR;
return -1;
@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms
static int json_parse_string(JSParseState *s, const uint8_t **pp)
{
const uint8_t *p = *pp;
const uint8_t *p, *p_next;
int i;
uint32_t c;
StringBuffer b_s, *b = &b_s;
@ -19581,6 +19541,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
if (string_buffer_init(s->ctx, b, 32))
goto fail;
p = *pp;
for(;;) {
if (p >= s->buf_end) {
goto end_of_input;
@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
}
} else
if (c >= 0x80) {
const uint8_t *p_next;
c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next);
if (c > 0x10FFFF) {
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
if (p_next == p) {
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
goto fail;
}
@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)
static __exception int json_next_token(JSParseState *s)
{
const uint8_t *p;
const uint8_t *p, *p_next;
int c;
JSAtom atom;
@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s)
goto fail;
break;
default:
if (c >= 128) {
const uint8_t *p_next;
c = unicode_from_utf8(p, s->buf_end - p, &p_next);
if (c == -1) {
if (c >= 0x80) {
c = utf8_decode(p, s->buf_end - p, &p_next);
if (p_next == p + 1) {
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else {
if (c > 0xFFFF) {
@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
if (*p == '\n' || *p == '\r') {
break;
} else if (*p >= 0x80) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if (c == CP_LS || c == CP_PS) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
/* purposely ignore UTF-8 encoding errors in this comment line */
if (c == CP_LS || c == CP_PS)
break;
} else if (c == -1) {
p++; /* skip invalid UTF-8 */
}
} else {
p++;
}