Improve UTF-8 decoding and encoding functions (#410)
Ensure proper UTF-8 encoding (1 to 4 bytes). Handle invalid encodings (return 0xFFFD and consume a single byte). Individually encoded surrogate code points are accepted. - add `utf8_scan()` to analyze a byte array for UTF-8 contents: detects invalid encoding, computes number of codepoints and content kind: plain ASCII, 8-bit, 16-bit or larger codepoints. - add `utf8_encode_len(c)` to compute the number of bytes to encode `c` - rename `unicode_to_utf8` as `utf8_encode` - rename `unicode_from_utf8` as `utf8_decode` - add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded byte array known to contain only ASCII and 8-bit codepoints. - add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs for non-BMP1 codepoints. - add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit codepoints as a UTF-8 encoded null terminated string - add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string - detect invalid UTF-8 encoding in RegExp parser - simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions - simplify UTF-8 decoding and error testing
This commit is contained in:
parent
f588210641
commit
1baa6763f8
5 changed files with 490 additions and 269 deletions
414
cutils.c
414
cutils.c
|
@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s)
|
||||||
memset(s, 0, sizeof(*s));
|
memset(s, 0, sizeof(*s));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*--- Unicode / UTF-8 utility functions --*/
|
/*--- UTF-8 utility functions --*/
|
||||||
|
|
||||||
/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes
|
/* Note: only encode valid codepoints (0x0000..0x10FFFF).
|
||||||
are output. */
|
At most UTF8_CHAR_LEN_MAX bytes are output. */
|
||||||
int unicode_to_utf8(uint8_t *buf, unsigned int c)
|
|
||||||
|
/* Compute the number of bytes of the UTF-8 encoding for a codepoint
|
||||||
|
`c` is a code-point.
|
||||||
|
Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
|
||||||
|
return value is 3 as the codepoint would be encoded as 0xFFFD.
|
||||||
|
*/
|
||||||
|
size_t utf8_encode_len(uint32_t c)
|
||||||
{
|
{
|
||||||
uint8_t *q = buf;
|
if (c < 0x80)
|
||||||
|
return 1;
|
||||||
if (c < 0x80) {
|
if (c < 0x800)
|
||||||
*q++ = c;
|
return 2;
|
||||||
} else {
|
if (c < 0x10000)
|
||||||
if (c < 0x800) {
|
return 3;
|
||||||
*q++ = (c >> 6) | 0xc0;
|
if (c < 0x110000)
|
||||||
} else {
|
return 4;
|
||||||
if (c < 0x10000) {
|
return 3;
|
||||||
*q++ = (c >> 12) | 0xe0;
|
|
||||||
} else {
|
|
||||||
if (c < 0x00200000) {
|
|
||||||
*q++ = (c >> 18) | 0xf0;
|
|
||||||
} else {
|
|
||||||
if (c < 0x04000000) {
|
|
||||||
*q++ = (c >> 24) | 0xf8;
|
|
||||||
} else if (c < 0x80000000) {
|
|
||||||
*q++ = (c >> 30) | 0xfc;
|
|
||||||
*q++ = ((c >> 24) & 0x3f) | 0x80;
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
*q++ = ((c >> 18) & 0x3f) | 0x80;
|
|
||||||
}
|
|
||||||
*q++ = ((c >> 12) & 0x3f) | 0x80;
|
|
||||||
}
|
|
||||||
*q++ = ((c >> 6) & 0x3f) | 0x80;
|
|
||||||
}
|
|
||||||
*q++ = (c & 0x3f) | 0x80;
|
|
||||||
}
|
|
||||||
return q - buf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const unsigned int utf8_min_code[5] = {
|
/* Encode a codepoint in UTF-8
|
||||||
0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
|
`buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes
|
||||||
};
|
`c` is a code-point.
|
||||||
|
Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
|
||||||
static const unsigned char utf8_first_code_mask[5] = {
|
return value is 3 and the codepoint is encoded as 0xFFFD.
|
||||||
0x1f, 0xf, 0x7, 0x3, 0x1,
|
No null byte is stored after the encoded bytes.
|
||||||
};
|
Return value is in range 1..4
|
||||||
|
*/
|
||||||
/* return -1 if error. *pp is not updated in this case. max_len must
|
size_t utf8_encode(uint8_t *buf, uint32_t c)
|
||||||
be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
|
|
||||||
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
|
|
||||||
{
|
{
|
||||||
int l, c, b, i;
|
if (c < 0x80) {
|
||||||
|
buf[0] = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (c < 0x800) {
|
||||||
|
buf[0] = (c >> 6) | 0xC0;
|
||||||
|
buf[1] = (c & 0x3F) | 0x80;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
if (c < 0x10000) {
|
||||||
|
buf[0] = (c >> 12) | 0xE0;
|
||||||
|
buf[1] = ((c >> 6) & 0x3F) | 0x80;
|
||||||
|
buf[2] = (c & 0x3F) | 0x80;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
if (c < 0x110000) {
|
||||||
|
buf[0] = (c >> 18) | 0xF0;
|
||||||
|
buf[1] = ((c >> 12) & 0x3F) | 0x80;
|
||||||
|
buf[2] = ((c >> 6) & 0x3F) | 0x80;
|
||||||
|
buf[3] = (c & 0x3F) | 0x80;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
buf[0] = (0xFFFD >> 12) | 0xE0;
|
||||||
|
buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80;
|
||||||
|
buf[2] = (0xFFFD & 0x3F) | 0x80;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Decode a single code point from a UTF-8 encoded array of bytes
|
||||||
|
`p` is a valid pointer to an array of bytes
|
||||||
|
`max_len` is the number of bytes available in the array
|
||||||
|
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
|
||||||
|
to the byte following the current sequence.
|
||||||
|
Return the code point at `p`, in the range `0..0x10FFFF`
|
||||||
|
Return 0xFFFD on error. Only a single byte is consumed in this case
|
||||||
|
The maximum length for a UTF-8 byte sequence is 4 bytes.
|
||||||
|
This implements the algorithm specified in whatwg.org, except it accepts
|
||||||
|
UTF-8 encoded surrogates as JavaScript allows them in strings.
|
||||||
|
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
|
||||||
|
*/
|
||||||
|
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
|
||||||
|
{
|
||||||
|
uint32_t c;
|
||||||
|
uint8_t lower, upper;
|
||||||
|
|
||||||
c = *p++;
|
c = *p++;
|
||||||
if (c < 0x80) {
|
if (c < 0x80) {
|
||||||
|
@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
switch(c) {
|
switch(c) {
|
||||||
case 0xc0: case 0xc1: case 0xc2: case 0xc3:
|
case 0xC2: case 0xC3:
|
||||||
case 0xc4: case 0xc5: case 0xc6: case 0xc7:
|
case 0xC4: case 0xC5: case 0xC6: case 0xC7:
|
||||||
case 0xc8: case 0xc9: case 0xca: case 0xcb:
|
case 0xC8: case 0xC9: case 0xCA: case 0xCB:
|
||||||
case 0xcc: case 0xcd: case 0xce: case 0xcf:
|
case 0xCC: case 0xCD: case 0xCE: case 0xCF:
|
||||||
case 0xd0: case 0xd1: case 0xd2: case 0xd3:
|
case 0xD0: case 0xD1: case 0xD2: case 0xD3:
|
||||||
case 0xd4: case 0xd5: case 0xd6: case 0xd7:
|
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
|
||||||
case 0xd8: case 0xd9: case 0xda: case 0xdb:
|
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
||||||
case 0xdc: case 0xdd: case 0xde: case 0xdf:
|
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
||||||
l = 1;
|
if (max_len < 2) {
|
||||||
|
// need more bytes
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (*p >= 0x80 && *p <= 0xBF) {
|
||||||
|
*pp = p + 1;
|
||||||
|
return ((c - 0xC0) << 6) + (*p - 0x80);
|
||||||
|
}
|
||||||
|
// otherwise encoding error
|
||||||
break;
|
break;
|
||||||
case 0xe0: case 0xe1: case 0xe2: case 0xe3:
|
case 0xE0:
|
||||||
case 0xe4: case 0xe5: case 0xe6: case 0xe7:
|
lower = 0xA0; /* reject invalid encoding */
|
||||||
case 0xe8: case 0xe9: case 0xea: case 0xeb:
|
goto need2;
|
||||||
case 0xec: case 0xed: case 0xee: case 0xef:
|
case 0xE1: case 0xE2: case 0xE3:
|
||||||
l = 2;
|
case 0xE4: case 0xE5: case 0xE6: case 0xE7:
|
||||||
|
case 0xE8: case 0xE9: case 0xEA: case 0xEB:
|
||||||
|
case 0xEC: case 0xED: case 0xEE: case 0xEF:
|
||||||
|
lower = 0x80;
|
||||||
|
need2:
|
||||||
|
if (max_len < 3) {
|
||||||
|
// need more bytes
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
|
||||||
|
*pp = p + 2;
|
||||||
|
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
|
||||||
|
}
|
||||||
|
// otherwise encoding error
|
||||||
break;
|
break;
|
||||||
case 0xf0: case 0xf1: case 0xf2: case 0xf3:
|
case 0xF0:
|
||||||
case 0xf4: case 0xf5: case 0xf6: case 0xf7:
|
lower = 0x90; /* reject invalid encoding */
|
||||||
l = 3;
|
upper = 0xBF;
|
||||||
break;
|
goto need3;
|
||||||
case 0xf8: case 0xf9: case 0xfa: case 0xfb:
|
case 0xF4:
|
||||||
l = 4;
|
lower = 0x80;
|
||||||
break;
|
upper = 0x8F; /* reject values above 0x10FFFF */
|
||||||
case 0xfc: case 0xfd:
|
goto need3;
|
||||||
l = 5;
|
case 0xF1: case 0xF2: case 0xF3:
|
||||||
|
lower = 0x80;
|
||||||
|
upper = 0xBF;
|
||||||
|
need3:
|
||||||
|
if (max_len < 4) {
|
||||||
|
// need more bytes
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
|
||||||
|
&& p[2] >= 0x80 && p[2] <= 0xBF) {
|
||||||
|
*pp = p + 3;
|
||||||
|
return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) +
|
||||||
|
((p[1] - 0x80) << 6) + (p[2] - 0x80);
|
||||||
|
}
|
||||||
|
// otherwise encoding error
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
return -1;
|
// invalid lead byte
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
/* check that we have enough characters */
|
|
||||||
if (l > (max_len - 1))
|
|
||||||
return -1;
|
|
||||||
c &= utf8_first_code_mask[l - 1];
|
|
||||||
for(i = 0; i < l; i++) {
|
|
||||||
b = *p++;
|
|
||||||
if (b < 0x80 || b >= 0xc0)
|
|
||||||
return -1;
|
|
||||||
c = (c << 6) | (b & 0x3f);
|
|
||||||
}
|
|
||||||
if (c < utf8_min_code[l - 1])
|
|
||||||
return -1;
|
|
||||||
*pp = p;
|
*pp = p;
|
||||||
return c;
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scan a UTF-8 encoded buffer for content type
|
||||||
|
`buf` is a valid pointer to a UTF-8 encoded string
|
||||||
|
`len` is the number of bytes to scan
|
||||||
|
`plen` points to a `size_t` variable to receive the number of units
|
||||||
|
Return value is a mask of bits.
|
||||||
|
- `UTF8_PLAIN_ASCII`: return value for 7-bit ASCII plain text
|
||||||
|
- `UTF8_NON_ASCII`: bit for non ASCII code points (8-bit or more)
|
||||||
|
- `UTF8_HAS_16BIT`: bit for 16-bit code points
|
||||||
|
- `UTF8_HAS_NON_BMP1`: bit for non-BMP1 code points, needs UTF-16 surrogate pairs
|
||||||
|
- `UTF8_HAS_ERRORS`: bit for encoding errors
|
||||||
|
*/
|
||||||
|
int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
|
||||||
|
{
|
||||||
|
const uint8_t *p, *p_end, *p_next;
|
||||||
|
size_t i, len;
|
||||||
|
int kind;
|
||||||
|
uint8_t cbits;
|
||||||
|
|
||||||
|
kind = UTF8_PLAIN_ASCII;
|
||||||
|
cbits = 0;
|
||||||
|
len = buf_len;
|
||||||
|
// TODO: handle more than 1 byte at a time
|
||||||
|
for (i = 0; i < buf_len; i++)
|
||||||
|
cbits |= buf[i];
|
||||||
|
if (cbits >= 0x80) {
|
||||||
|
p = (const uint8_t *)buf;
|
||||||
|
p_end = p + buf_len;
|
||||||
|
kind = UTF8_NON_ASCII;
|
||||||
|
len = 0;
|
||||||
|
while (p < p_end) {
|
||||||
|
len++;
|
||||||
|
if (*p++ >= 0x80) {
|
||||||
|
/* parse UTF-8 sequence, check for encoding error */
|
||||||
|
uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
|
||||||
|
if (p_next == p)
|
||||||
|
kind |= UTF8_HAS_ERRORS;
|
||||||
|
p = p_next;
|
||||||
|
if (c > 0xFF) {
|
||||||
|
kind |= UTF8_HAS_16BIT;
|
||||||
|
if (c > 0xFFFF) {
|
||||||
|
len++;
|
||||||
|
kind |= UTF8_HAS_NON_BMP1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*plen = len;
|
||||||
|
return kind;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Decode a string encoded in UTF-8 into an array of bytes
|
||||||
|
`src` points to the source string. It is assumed to be correctly encoded
|
||||||
|
and only contains code points below 0x800 (only values below 0x100 fit in the 8-bit destination without truncation)
|
||||||
|
`src_len` is the length of the source string
|
||||||
|
`dest` points to the destination array, it can be null if `dest_len` is `0`
|
||||||
|
`dest_len` is the length of the destination array. A null
|
||||||
|
terminator is stored at the end of the array unless `dest_len` is `0`.
|
||||||
|
*/
|
||||||
|
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len)
|
||||||
|
{
|
||||||
|
const uint8_t *p, *p_end;
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
p = (const uint8_t *)src;
|
||||||
|
p_end = p + src_len;
|
||||||
|
for (i = 0; p < p_end; i++) {
|
||||||
|
uint32_t c = *p++;
|
||||||
|
if (c >= 0xC0)
|
||||||
|
c = (c << 6) + *p++ - ((0xC0 << 6) + 0x80);
|
||||||
|
if (i < dest_len)
|
||||||
|
dest[i] = c;
|
||||||
|
}
|
||||||
|
if (i < dest_len)
|
||||||
|
dest[i] = '\0';
|
||||||
|
else if (dest_len > 0)
|
||||||
|
dest[dest_len - 1] = '\0';
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Decode a string encoded in UTF-8 into an array of 16-bit words
|
||||||
|
`src` points to the source string. It is assumed to be correctly encoded.
|
||||||
|
`src_len` is the length of the source string
|
||||||
|
`dest` points to the destination array, it can be null if `dest_len` is `0`
|
||||||
|
`dest_len` is the length of the destination array. No null terminator is
|
||||||
|
stored at the end of the array.
|
||||||
|
*/
|
||||||
|
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len)
|
||||||
|
{
|
||||||
|
const uint8_t *p, *p_end;
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
p = (const uint8_t *)src;
|
||||||
|
p_end = p + src_len;
|
||||||
|
for (i = 0; p < p_end; i++) {
|
||||||
|
uint32_t c = *p++;
|
||||||
|
if (c >= 0x80) {
|
||||||
|
/* parse utf-8 sequence */
|
||||||
|
c = utf8_decode(p - 1, p_end - (p - 1), &p);
|
||||||
|
/* encoding errors are converted as 0xFFFD and use a single byte */
|
||||||
|
if (c > 0xFFFF) {
|
||||||
|
if (i < dest_len)
|
||||||
|
dest[i] = get_hi_surrogate(c);
|
||||||
|
i++;
|
||||||
|
c = get_lo_surrogate(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i < dest_len)
|
||||||
|
dest[i] = c;
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Encode a buffer of 8-bit bytes as a UTF-8 encoded string
|
||||||
|
`src` points to the source buffer.
|
||||||
|
`src_len` is the length of the source buffer
|
||||||
|
`dest` points to the destination array, it can be null if `dest_len` is `0`
|
||||||
|
`dest_len` is the length in bytes of the destination array. A null
|
||||||
|
terminator is stored at the end of the array unless `dest_len` is `0`.
|
||||||
|
*/
|
||||||
|
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len)
|
||||||
|
{
|
||||||
|
size_t i, j;
|
||||||
|
uint32_t c;
|
||||||
|
|
||||||
|
for (i = j = 0; i < src_len; i++) {
|
||||||
|
c = src[i];
|
||||||
|
if (c < 0x80) {
|
||||||
|
if (j + 1 >= dest_len)
|
||||||
|
goto overflow;
|
||||||
|
dest[j++] = c;
|
||||||
|
} else {
|
||||||
|
if (j + 2 >= dest_len)
|
||||||
|
goto overflow;
|
||||||
|
dest[j++] = (c >> 6) | 0xC0;
|
||||||
|
dest[j++] = (c & 0x3F) | 0x80;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (j < dest_len)
|
||||||
|
dest[j] = '\0';
|
||||||
|
return j;
|
||||||
|
|
||||||
|
overflow:
|
||||||
|
if (j < dest_len)
|
||||||
|
dest[j] = '\0';
|
||||||
|
while (i < src_len)
|
||||||
|
j += 1 + (src[i++] >= 0x80);
|
||||||
|
return j;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Encode a buffer of 16-bit code points as a UTF-8 encoded string
|
||||||
|
`src` points to the source buffer.
|
||||||
|
`src_len` is the length of the source buffer
|
||||||
|
`dest` points to the destination array, it can be null if `dest_len` is `0`
|
||||||
|
`dest_len` is the length in bytes of the destination array. A null
|
||||||
|
terminator is stored at the end of the array unless `dest_len` is `0`.
|
||||||
|
*/
|
||||||
|
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len)
|
||||||
|
{
|
||||||
|
size_t i, j;
|
||||||
|
uint32_t c;
|
||||||
|
|
||||||
|
for (i = j = 0; i < src_len;) {
|
||||||
|
c = src[i++];
|
||||||
|
if (c < 0x80) {
|
||||||
|
if (j + 1 >= dest_len)
|
||||||
|
goto overflow;
|
||||||
|
dest[j++] = c;
|
||||||
|
} else {
|
||||||
|
if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
|
||||||
|
c = from_surrogate(c, src[i++]);
|
||||||
|
if (j + utf8_encode_len(c) >= dest_len)
|
||||||
|
goto overflow;
|
||||||
|
j += utf8_encode((uint8_t *)dest + j, c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (j < dest_len)
|
||||||
|
dest[j] = '\0';
|
||||||
|
return j;
|
||||||
|
|
||||||
|
overflow:
|
||||||
|
i -= 1 + (c > 0xFFFF);
|
||||||
|
if (j < dest_len)
|
||||||
|
dest[j] = '\0';
|
||||||
|
while (i < src_len) {
|
||||||
|
c = src[i++];
|
||||||
|
if (c < 0x80) {
|
||||||
|
j++;
|
||||||
|
} else {
|
||||||
|
if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
|
||||||
|
c = from_surrogate(c, src[i++]);
|
||||||
|
j += utf8_encode_len(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return j;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*--- integer to string conversions --*/
|
/*--- integer to string conversions --*/
|
||||||
|
|
21
cutils.h
21
cutils.h
|
@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s)
|
||||||
s->error = TRUE;
|
s->error = TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define UTF8_CHAR_LEN_MAX 6
|
/*---- UTF-8 and UTF-16 handling ----*/
|
||||||
|
|
||||||
int unicode_to_utf8(uint8_t *buf, unsigned int c);
|
#define UTF8_CHAR_LEN_MAX 4
|
||||||
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
|
|
||||||
|
enum {
|
||||||
|
UTF8_PLAIN_ASCII = 0, // 7-bit ASCII plain text
|
||||||
|
UTF8_NON_ASCII = 1, // has non ASCII code points (8-bit or more)
|
||||||
|
UTF8_HAS_16BIT = 2, // has 16-bit code points
|
||||||
|
UTF8_HAS_NON_BMP1 = 4, // has non-BMP1 code points, needs UTF-16 surrogate pairs
|
||||||
|
UTF8_HAS_ERRORS = 8, // has encoding errors
|
||||||
|
};
|
||||||
|
int utf8_scan(const char *buf, size_t len, size_t *plen);
|
||||||
|
size_t utf8_encode_len(uint32_t c);
|
||||||
|
size_t utf8_encode(uint8_t *buf, uint32_t c);
|
||||||
|
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
|
||||||
|
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
|
||||||
|
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
|
||||||
|
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
|
||||||
|
size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len);
|
||||||
|
|
||||||
static inline BOOL is_surrogate(uint32_t c)
|
static inline BOOL is_surrogate(uint32_t c)
|
||||||
{
|
{
|
||||||
|
|
46
libregexp.c
46
libregexp.c
|
@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||||
static int get_class_atom(REParseState *s, CharRange *cr,
|
static int get_class_atom(REParseState *s, CharRange *cr,
|
||||||
const uint8_t **pp, BOOL inclass)
|
const uint8_t **pp, BOOL inclass)
|
||||||
{
|
{
|
||||||
const uint8_t *p;
|
const uint8_t *p, *p_next;
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
||||||
/* fall thru */
|
/* fall thru */
|
||||||
default:
|
default:
|
||||||
normal_char:
|
normal_char:
|
||||||
/* normal char */
|
p++;
|
||||||
if (c >= 128) {
|
if (c >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
if ((unsigned)c > 0xffff && !s->is_unicode) {
|
if (p_next == p)
|
||||||
/* XXX: should handle non BMP-1 code points */
|
return re_parse_error(s, "invalid UTF-8 sequence");
|
||||||
|
p = p_next;
|
||||||
|
if (c > 0xFFFF && !s->is_unicode) {
|
||||||
|
// TODO(chqrlie): should handle non BMP-1 code points in
|
||||||
|
// the calling function and not require the source string
|
||||||
|
// to be CESU-8 encoded if not s->is_unicode
|
||||||
return re_parse_error(s, "malformed unicode char");
|
return re_parse_error(s, "malformed unicode char");
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
p++;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
|
||||||
/* '*pp' is the first char after '<' */
|
/* '*pp' is the first char after '<' */
|
||||||
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
|
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
|
||||||
{
|
{
|
||||||
const uint8_t *p, *p1;
|
const uint8_t *p, *p_next;
|
||||||
uint32_t c, d;
|
uint32_t c, d;
|
||||||
char *q;
|
char *q;
|
||||||
|
|
||||||
p = *pp;
|
p = *pp;
|
||||||
q = buf;
|
q = buf;
|
||||||
for(;;) {
|
for(;;) {
|
||||||
c = *p;
|
c = *p++;
|
||||||
if (c == '\\') {
|
if (c == '\\') {
|
||||||
p++;
|
|
||||||
if (*p != 'u')
|
if (*p != 'u')
|
||||||
return -1;
|
return -1;
|
||||||
c = lre_parse_escape(&p, 2); // accept surrogate pairs
|
c = lre_parse_escape(&p, 2); // accept surrogate pairs
|
||||||
|
if ((int)c < 0)
|
||||||
|
return -1;
|
||||||
} else if (c == '>') {
|
} else if (c == '>') {
|
||||||
break;
|
break;
|
||||||
} else if (c >= 128) {
|
} else if (c >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
|
if (p_next == p)
|
||||||
|
return -1;
|
||||||
|
p = p_next;
|
||||||
if (is_hi_surrogate(c)) {
|
if (is_hi_surrogate(c)) {
|
||||||
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
|
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
if (is_lo_surrogate(d)) {
|
if (is_lo_surrogate(d)) {
|
||||||
c = from_surrogate(c, d);
|
c = from_surrogate(c, d);
|
||||||
p = p1;
|
p = p_next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
p++;
|
|
||||||
}
|
}
|
||||||
if (c > 0x10FFFF)
|
|
||||||
return -1;
|
|
||||||
if (q == buf) {
|
if (q == buf) {
|
||||||
if (!lre_js_is_ident_first(c))
|
if (!lre_js_is_ident_first(c))
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
|
||||||
}
|
}
|
||||||
if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
|
if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
|
||||||
return -1;
|
return -1;
|
||||||
if (c < 128) {
|
if (c < 0x80) {
|
||||||
*q++ = c;
|
*q++ = c;
|
||||||
} else {
|
} else {
|
||||||
q += unicode_to_utf8((uint8_t*)q, c);
|
q += utf8_encode((uint8_t*)q, c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (q == buf)
|
if (q == buf)
|
||||||
return -1;
|
return -1;
|
||||||
*q = '\0';
|
*q = '\0';
|
||||||
p++;
|
|
||||||
*pp = p;
|
*pp = p;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx,
|
||||||
if (i >= argc)
|
if (i >= argc)
|
||||||
goto missing;
|
goto missing;
|
||||||
if (JS_IsString(argv[i])) {
|
if (JS_IsString(argv[i])) {
|
||||||
|
// TODO(chqrlie) need an API to wrap charCodeAt and codePointAt
|
||||||
string_arg = JS_ToCString(ctx, argv[i++]);
|
string_arg = JS_ToCString(ctx, argv[i++]);
|
||||||
if (!string_arg)
|
if (!string_arg)
|
||||||
goto fail;
|
goto fail;
|
||||||
int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
|
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
|
||||||
JS_FreeCString(ctx, string_arg);
|
JS_FreeCString(ctx, string_arg);
|
||||||
} else {
|
} else {
|
||||||
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
|
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
/* handle utf-8 encoding explicitly */
|
// XXX: throw an exception?
|
||||||
if ((unsigned)int32_arg > 0x10FFFF)
|
if ((unsigned)int32_arg > 0x10FFFF)
|
||||||
int32_arg = 0xFFFD;
|
int32_arg = 0xFFFD;
|
||||||
/* ignore conversion flags, width and precision */
|
/* ignore conversion flags, width and precision */
|
||||||
len = unicode_to_utf8(cbuf, int32_arg);
|
len = utf8_encode(cbuf, int32_arg);
|
||||||
dbuf_put(&dbuf, cbuf, len);
|
dbuf_put(&dbuf, cbuf, len);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
271
quickjs.c
271
quickjs.c
|
@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size,
|
||||||
snprintf(buf, buf_size, "<invalid %x>", atom);
|
snprintf(buf, buf_size, "<invalid %x>", atom);
|
||||||
} else {
|
} else {
|
||||||
JSAtomStruct *p = rt->atom_array[atom];
|
JSAtomStruct *p = rt->atom_array[atom];
|
||||||
|
*buf = '\0';
|
||||||
if (atom_is_free(p)) {
|
if (atom_is_free(p)) {
|
||||||
assert(!atom_is_free(p));
|
assert(!atom_is_free(p));
|
||||||
snprintf(buf, buf_size, "<free %x>", atom);
|
snprintf(buf, buf_size, "<free %x>", atom);
|
||||||
} else {
|
} else if (p != NULL) {
|
||||||
int i, c;
|
JSString *str = p;
|
||||||
char *q;
|
if (str->is_wide_char) {
|
||||||
JSString *str;
|
/* encode surrogates correctly */
|
||||||
|
utf8_encode_buf16(buf, buf_size, str->u.str16, str->len);
|
||||||
q = buf;
|
} else {
|
||||||
str = p;
|
/* special case ASCII strings */
|
||||||
if (str) {
|
int i, c = 0;
|
||||||
if (!str->is_wide_char) {
|
|
||||||
/* special case ASCII strings */
|
|
||||||
c = 0;
|
|
||||||
for(i = 0; i < str->len; i++) {
|
|
||||||
c |= str->u.str8[i];
|
|
||||||
}
|
|
||||||
if (c < 0x80)
|
|
||||||
return (const char *)str->u.str8;
|
|
||||||
}
|
|
||||||
for(i = 0; i < str->len; i++) {
|
for(i = 0; i < str->len; i++) {
|
||||||
c = string_get(str, i);
|
c |= str->u.str8[i];
|
||||||
if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX)
|
|
||||||
break;
|
|
||||||
if (c < 128) {
|
|
||||||
*q++ = c;
|
|
||||||
} else {
|
|
||||||
q += unicode_to_utf8((uint8_t *)q, c);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if (c < 0x80)
|
||||||
|
return (const char *)str->u.str8;
|
||||||
|
utf8_encode_buf8(buf, buf_size, str->u.str8, str->len);
|
||||||
}
|
}
|
||||||
*q = '\0';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
|
@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)
|
||||||
|
|
||||||
/* return a string atom containing name concatenated with str1 */
|
/* return a string atom containing name concatenated with str1 */
|
||||||
/* `str1` may be pure ASCII or UTF-8 encoded */
|
/* `str1` may be pure ASCII or UTF-8 encoded */
|
||||||
|
// TODO(chqrlie): use string concatenation instead of UTF-8 conversion
|
||||||
static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
|
static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
|
||||||
{
|
{
|
||||||
JSValue str;
|
JSValue str;
|
||||||
|
@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s)
|
||||||
/* create a string from a UTF-8 buffer */
|
/* create a string from a UTF-8 buffer */
|
||||||
JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
|
JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
|
||||||
{
|
{
|
||||||
const uint8_t *p, *p_end, *p_start, *p_next;
|
JSString *str;
|
||||||
uint32_t c;
|
size_t len;
|
||||||
StringBuffer b_s, *b = &b_s;
|
int kind;
|
||||||
size_t len1;
|
|
||||||
|
|
||||||
if (buf_len <= 0) {
|
if (buf_len <= 0) {
|
||||||
return JS_AtomToString(ctx, JS_ATOM_empty_string);
|
return JS_AtomToString(ctx, JS_ATOM_empty_string);
|
||||||
}
|
}
|
||||||
p_start = (const uint8_t *)buf;
|
/* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */
|
||||||
p_end = p_start + buf_len;
|
kind = utf8_scan(buf, buf_len, &len);
|
||||||
p = p_start;
|
if (len > JS_STRING_LEN_MAX)
|
||||||
while (p < p_end && *p < 128)
|
|
||||||
p++;
|
|
||||||
len1 = p - p_start;
|
|
||||||
if (len1 > JS_STRING_LEN_MAX)
|
|
||||||
return JS_ThrowRangeError(ctx, "invalid string length");
|
return JS_ThrowRangeError(ctx, "invalid string length");
|
||||||
if (p == p_end) {
|
|
||||||
/* ASCII string */
|
|
||||||
return js_new_string8_len(ctx, buf, buf_len);
|
|
||||||
} else {
|
|
||||||
if (string_buffer_init(ctx, b, buf_len))
|
|
||||||
goto fail;
|
|
||||||
string_buffer_write8(b, p_start, len1);
|
|
||||||
while (p < p_end) {
|
|
||||||
if (*p < 128) {
|
|
||||||
string_buffer_putc8(b, *p++);
|
|
||||||
} else {
|
|
||||||
/* parse utf-8 sequence, return 0xFFFFFFFF for error */
|
|
||||||
c = unicode_from_utf8(p, p_end - p, &p_next);
|
|
||||||
if (c < 0x10000) {
|
|
||||||
p = p_next;
|
|
||||||
} else if (c <= 0x10FFFF) {
|
|
||||||
p = p_next;
|
|
||||||
/* surrogate pair */
|
|
||||||
string_buffer_putc16(b, get_hi_surrogate(c));
|
|
||||||
c = get_lo_surrogate(c);
|
|
||||||
} else {
|
|
||||||
/* invalid char */
|
|
||||||
c = 0xfffd;
|
|
||||||
/* skip the invalid chars */
|
|
||||||
/* XXX: seems incorrect. Why not just use c = *p++; ? */
|
|
||||||
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
|
|
||||||
p++;
|
|
||||||
if (p < p_end) {
|
|
||||||
p++;
|
|
||||||
while (p < p_end && (*p >= 0x80 && *p < 0xc0))
|
|
||||||
p++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
string_buffer_putc16(b, c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return string_buffer_end(b);
|
|
||||||
|
|
||||||
fail:
|
switch (kind) {
|
||||||
string_buffer_free(b);
|
case UTF8_PLAIN_ASCII:
|
||||||
return JS_EXCEPTION;
|
str = js_alloc_string(ctx, len, 0);
|
||||||
|
if (!str)
|
||||||
|
return JS_EXCEPTION;
|
||||||
|
memcpy(str->u.str8, buf, len);
|
||||||
|
str->u.str8[len] = '\0';
|
||||||
|
break;
|
||||||
|
case UTF8_NON_ASCII:
|
||||||
|
/* buf contains non-ASCII code-points, but limited to 8-bit values */
|
||||||
|
str = js_alloc_string(ctx, len, 0);
|
||||||
|
if (!str)
|
||||||
|
return JS_EXCEPTION;
|
||||||
|
utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// This causes a potential problem in JS_ThrowError if message is invalid
|
||||||
|
//if (kind & UTF8_HAS_ERRORS)
|
||||||
|
// return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence");
|
||||||
|
str = js_alloc_string(ctx, len, 1);
|
||||||
|
if (!str)
|
||||||
|
return JS_EXCEPTION;
|
||||||
|
utf8_decode_buf16(str->u.str16, len, buf, buf_len);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return JS_MKPTR(JS_TAG_STRING, str);
|
||||||
}
|
}
|
||||||
|
|
||||||
static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
|
static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
|
||||||
|
@ -4067,7 +4035,7 @@ go:
|
||||||
/* c = 0xfffd; */ /* error */
|
/* c = 0xfffd; */ /* error */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
q += unicode_to_utf8(q, c);
|
q += utf8_encode(q, c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val)
|
||||||
return JS_ToBoolFree(ctx, js_dup(val));
|
return JS_ToBoolFree(ctx, js_dup(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* pc points to pure ASCII or UTF-8, null terminated contents */
|
||||||
static int skip_spaces(const char *pc)
|
static int skip_spaces(const char *pc)
|
||||||
{
|
{
|
||||||
const uint8_t *p, *p_next, *p_start;
|
const uint8_t *p, *p_next, *p_start;
|
||||||
|
@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc)
|
||||||
|
|
||||||
p = p_start = (const uint8_t *)pc;
|
p = p_start = (const uint8_t *)pc;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
c = *p;
|
c = *p++;
|
||||||
if (c < 128) {
|
if (c < 0x80) {
|
||||||
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
|
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
|
||||||
break;
|
break;
|
||||||
p++;
|
|
||||||
} else {
|
} else {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
|
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
|
||||||
if (!lre_is_space(c))
|
if (!lre_is_space(c))
|
||||||
break;
|
break;
|
||||||
p = p_next;
|
p = p_next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return p - p_start;
|
return p - 1 - p_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int to_digit(int c)
|
static inline int to_digit(int c)
|
||||||
|
@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s)
|
||||||
static __exception int js_parse_template_part(JSParseState *s,
|
static __exception int js_parse_template_part(JSParseState *s,
|
||||||
const uint8_t *p)
|
const uint8_t *p)
|
||||||
{
|
{
|
||||||
|
const uint8_t *p_next;
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
StringBuffer b_s, *b = &b_s;
|
StringBuffer b_s, *b = &b_s;
|
||||||
|
|
||||||
|
@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s,
|
||||||
s->eol = &p[-1];
|
s->eol = &p[-1];
|
||||||
s->mark = p;
|
s->mark = p;
|
||||||
} else if (c >= 0x80) {
|
} else if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
if (p_next == p) {
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
js_parse_error(s, "invalid UTF-8 sequence");
|
js_parse_error(s, "invalid UTF-8 sequence");
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
|
||||||
BOOL do_throw, const uint8_t *p,
|
BOOL do_throw, const uint8_t *p,
|
||||||
JSToken *token, const uint8_t **pp)
|
JSToken *token, const uint8_t **pp)
|
||||||
{
|
{
|
||||||
|
const uint8_t *p_next;
|
||||||
int ret;
|
int ret;
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
StringBuffer b_s, *b = &b_s;
|
StringBuffer b_s, *b = &b_s;
|
||||||
|
@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
|
||||||
}
|
}
|
||||||
goto fail;
|
goto fail;
|
||||||
} else if (c >= 0x80) {
|
} else if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
|
if (p_next == p + 1) {
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
goto invalid_utf8;
|
goto invalid_utf8;
|
||||||
}
|
}
|
||||||
p = p_next;
|
p = p_next;
|
||||||
|
@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (c >= 0x80) {
|
} else if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
if (p_next == p)
|
||||||
if (c > 0x10FFFF)
|
|
||||||
goto invalid_utf8;
|
goto invalid_utf8;
|
||||||
p = p_next;
|
p = p_next;
|
||||||
}
|
}
|
||||||
|
@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) {
|
||||||
|
|
||||||
static __exception int js_parse_regexp(JSParseState *s)
|
static __exception int js_parse_regexp(JSParseState *s)
|
||||||
{
|
{
|
||||||
const uint8_t *p;
|
const uint8_t *p, *p_next;
|
||||||
BOOL in_class;
|
BOOL in_class;
|
||||||
StringBuffer b_s, *b = &b_s;
|
StringBuffer b_s, *b = &b_s;
|
||||||
StringBuffer b2_s, *b2 = &b2_s;
|
StringBuffer b2_s, *b2 = &b2_s;
|
||||||
|
@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s)
|
||||||
else if (c == '\0' && p >= s->buf_end)
|
else if (c == '\0' && p >= s->buf_end)
|
||||||
goto eof_error;
|
goto eof_error;
|
||||||
else if (c >= 0x80) {
|
else if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
if (p_next == p) {
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
goto invalid_utf8;
|
goto invalid_utf8;
|
||||||
}
|
}
|
||||||
p = p_next;
|
p = p_next;
|
||||||
|
@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s)
|
||||||
goto eol_error;
|
goto eol_error;
|
||||||
}
|
}
|
||||||
} else if (c >= 0x80) {
|
} else if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
|
if (p_next == p) {
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
invalid_utf8:
|
invalid_utf8:
|
||||||
js_parse_error(s, "invalid UTF-8 sequence");
|
js_parse_error(s, "invalid UTF-8 sequence");
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s)
|
||||||
|
|
||||||
/* flags */
|
/* flags */
|
||||||
for(;;) {
|
for(;;) {
|
||||||
const uint8_t *p_next = p;
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = *p_next++;
|
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
|
||||||
if (c >= 0x80) {
|
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
|
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
goto invalid_utf8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!lre_js_is_ident_next(c))
|
if (!lre_js_is_ident_next(c))
|
||||||
break;
|
break;
|
||||||
if (string_buffer_putc(b2, c))
|
if (string_buffer_putc(b2, c))
|
||||||
|
@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
|
||||||
static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
|
static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
|
||||||
BOOL *pident_has_escape, int c, BOOL is_private)
|
BOOL *pident_has_escape, int c, BOOL is_private)
|
||||||
{
|
{
|
||||||
const uint8_t *p, *p1;
|
const uint8_t *p, *p_next;
|
||||||
char ident_buf[128], *buf;
|
char ident_buf[128], *buf;
|
||||||
size_t ident_size, ident_pos;
|
size_t ident_size, ident_pos;
|
||||||
JSAtom atom;
|
JSAtom atom = JS_ATOM_NULL;
|
||||||
|
|
||||||
p = *pp;
|
p = *pp;
|
||||||
buf = ident_buf;
|
buf = ident_buf;
|
||||||
|
@ -19032,28 +18992,26 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
|
||||||
if (is_private)
|
if (is_private)
|
||||||
buf[ident_pos++] = '#';
|
buf[ident_pos++] = '#';
|
||||||
for(;;) {
|
for(;;) {
|
||||||
p1 = p;
|
if (c < 0x80) {
|
||||||
|
|
||||||
if (c < 128) {
|
|
||||||
buf[ident_pos++] = c;
|
buf[ident_pos++] = c;
|
||||||
} else {
|
} else {
|
||||||
ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c);
|
ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c);
|
||||||
}
|
}
|
||||||
c = *p1++;
|
c = *p;
|
||||||
if (c == '\\' && *p1 == 'u') {
|
p_next = p + 1;
|
||||||
c = lre_parse_escape(&p1, TRUE);
|
if (c == '\\' && *p_next == 'u') {
|
||||||
|
c = lre_parse_escape(&p_next, TRUE);
|
||||||
*pident_has_escape = TRUE;
|
*pident_has_escape = TRUE;
|
||||||
} else if (c >= 128) {
|
} else if (c >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
|
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
|
||||||
}
|
}
|
||||||
if (!lre_js_is_ident_next(c))
|
if (!lre_js_is_ident_next(c))
|
||||||
break;
|
break;
|
||||||
p = p1;
|
p = p_next;
|
||||||
if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
|
if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
|
||||||
if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) {
|
if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf))
|
||||||
atom = JS_ATOM_NULL;
|
|
||||||
goto done;
|
goto done;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* buf is pure ASCII or UTF-8 encoded */
|
/* buf is pure ASCII or UTF-8 encoded */
|
||||||
|
@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
|
||||||
|
|
||||||
static __exception int next_token(JSParseState *s)
|
static __exception int next_token(JSParseState *s)
|
||||||
{
|
{
|
||||||
const uint8_t *p;
|
const uint8_t *p, *p_next;
|
||||||
int c;
|
int c;
|
||||||
BOOL ident_has_escape;
|
BOOL ident_has_escape;
|
||||||
JSAtom atom;
|
JSAtom atom;
|
||||||
|
@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s)
|
||||||
s->got_lf = TRUE; /* considered as LF for ASI */
|
s->got_lf = TRUE; /* considered as LF for ASI */
|
||||||
p++;
|
p++;
|
||||||
} else if (*p >= 0x80) {
|
} else if (*p >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||||
|
/* ignore invalid UTF-8 in comments */
|
||||||
if (c == CP_LS || c == CP_PS) {
|
if (c == CP_LS || c == CP_PS) {
|
||||||
s->got_lf = TRUE; /* considered as LF for ASI */
|
s->got_lf = TRUE; /* considered as LF for ASI */
|
||||||
} else if (c == -1) {
|
|
||||||
p++; /* skip invalid UTF-8 */
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
p++;
|
p++;
|
||||||
|
@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s)
|
||||||
if (*p == '\r' || *p == '\n')
|
if (*p == '\r' || *p == '\n')
|
||||||
break;
|
break;
|
||||||
if (*p >= 0x80) {
|
if (*p >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||||
|
/* ignore invalid UTF-8 in comments */
|
||||||
/* LS or PS are considered as line terminator */
|
/* LS or PS are considered as line terminator */
|
||||||
if (c == CP_LS || c == CP_PS) {
|
if (c == CP_LS || c == CP_PS) {
|
||||||
break;
|
break;
|
||||||
} else if (c == -1) {
|
|
||||||
p++; /* skip invalid UTF-8 */
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
p++;
|
p++;
|
||||||
|
@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s)
|
||||||
case '#':
|
case '#':
|
||||||
/* private name */
|
/* private name */
|
||||||
{
|
{
|
||||||
const uint8_t *p1;
|
|
||||||
p++;
|
p++;
|
||||||
p1 = p;
|
c = *p;
|
||||||
c = *p1++;
|
p_next = p + 1;
|
||||||
if (c == '\\' && *p1 == 'u') {
|
if (c == '\\' && *p_next == 'u') {
|
||||||
c = lre_parse_escape(&p1, TRUE);
|
c = lre_parse_escape(&p_next, TRUE);
|
||||||
} else if (c >= 128) {
|
} else if (c >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
|
if (p_next == p + 1)
|
||||||
|
goto invalid_utf8;
|
||||||
}
|
}
|
||||||
if (!lre_js_is_ident_first(c)) {
|
if (!lre_js_is_ident_first(c)) {
|
||||||
js_parse_error(s, "invalid first character of private name");
|
js_parse_error(s, "invalid first character of private name");
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
p = p1;
|
p = p_next;
|
||||||
ident_has_escape = FALSE; /* not used */
|
ident_has_escape = FALSE; /* not used */
|
||||||
atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
|
atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
|
||||||
if (atom == JS_ATOM_NULL)
|
if (atom == JS_ATOM_NULL)
|
||||||
|
@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s)
|
||||||
parse_number:
|
parse_number:
|
||||||
{
|
{
|
||||||
JSValue ret;
|
JSValue ret;
|
||||||
const uint8_t *p1;
|
|
||||||
int flags;
|
int flags;
|
||||||
flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
|
flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
|
||||||
ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
|
ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
|
||||||
|
@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s)
|
||||||
goto fail;
|
goto fail;
|
||||||
/* reject `10instanceof Number` */
|
/* reject `10instanceof Number` */
|
||||||
if (JS_VALUE_IS_NAN(ret) ||
|
if (JS_VALUE_IS_NAN(ret) ||
|
||||||
lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) {
|
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
|
||||||
JS_FreeValue(s->ctx, ret);
|
JS_FreeValue(s->ctx, ret);
|
||||||
js_parse_error(s, "invalid number literal");
|
js_parse_error(s, "invalid number literal");
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (c >= 128) {
|
if (c >= 0x80) { /* non-ASCII code-point */
|
||||||
/* unicode value */
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
if (p_next == p + 1)
|
||||||
|
goto invalid_utf8;
|
||||||
|
p = p_next;
|
||||||
switch(c) {
|
switch(c) {
|
||||||
case CP_PS:
|
case CP_PS:
|
||||||
case CP_LS:
|
case CP_LS:
|
||||||
|
@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s)
|
||||||
// dump_token(s, &s->token);
|
// dump_token(s, &s->token);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
invalid_utf8:
|
||||||
|
js_parse_error(s, "invalid UTF-8 sequence");
|
||||||
fail:
|
fail:
|
||||||
s->token.val = TOK_ERROR;
|
s->token.val = TOK_ERROR;
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms
|
||||||
|
|
||||||
static int json_parse_string(JSParseState *s, const uint8_t **pp)
|
static int json_parse_string(JSParseState *s, const uint8_t **pp)
|
||||||
{
|
{
|
||||||
const uint8_t *p = *pp;
|
const uint8_t *p, *p_next;
|
||||||
int i;
|
int i;
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
StringBuffer b_s, *b = &b_s;
|
StringBuffer b_s, *b = &b_s;
|
||||||
|
@ -19581,6 +19541,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
|
||||||
if (string_buffer_init(s->ctx, b, 32))
|
if (string_buffer_init(s->ctx, b, 32))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
p = *pp;
|
||||||
for(;;) {
|
for(;;) {
|
||||||
if (p >= s->buf_end) {
|
if (p >= s->buf_end) {
|
||||||
goto end_of_input;
|
goto end_of_input;
|
||||||
|
@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
|
||||||
}
|
}
|
||||||
} else
|
} else
|
||||||
if (c >= 0x80) {
|
if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
|
||||||
c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next);
|
if (p_next == p) {
|
||||||
if (c > 0x10FFFF) {
|
|
||||||
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
|
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)
|
||||||
|
|
||||||
static __exception int json_next_token(JSParseState *s)
|
static __exception int json_next_token(JSParseState *s)
|
||||||
{
|
{
|
||||||
const uint8_t *p;
|
const uint8_t *p, *p_next;
|
||||||
int c;
|
int c;
|
||||||
JSAtom atom;
|
JSAtom atom;
|
||||||
|
|
||||||
|
@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s)
|
||||||
goto fail;
|
goto fail;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (c >= 128) {
|
if (c >= 0x80) {
|
||||||
const uint8_t *p_next;
|
c = utf8_decode(p, s->buf_end - p, &p_next);
|
||||||
c = unicode_from_utf8(p, s->buf_end - p, &p_next);
|
if (p_next == p + 1) {
|
||||||
if (c == -1) {
|
|
||||||
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
|
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
|
||||||
} else {
|
} else {
|
||||||
if (c > 0xFFFF) {
|
if (c > 0xFFFF) {
|
||||||
|
@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
|
||||||
if (*p == '\n' || *p == '\r') {
|
if (*p == '\n' || *p == '\r') {
|
||||||
break;
|
break;
|
||||||
} else if (*p >= 0x80) {
|
} else if (*p >= 0x80) {
|
||||||
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
|
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
|
||||||
if (c == CP_LS || c == CP_PS) {
|
/* purposely ignore UTF-8 encoding errors in this comment line */
|
||||||
|
if (c == CP_LS || c == CP_PS)
|
||||||
break;
|
break;
|
||||||
} else if (c == -1) {
|
|
||||||
p++; /* skip invalid UTF-8 */
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue