Improve UTF-8 decoding and encoding functions (#410)

Ensure proper UTF-8 encoding (1 to 4 bytes). Handle invalid encodings (return 0xFFFD and consume a single byte). Individually encoded surrogate code points are accepted. - add `utf8_scan()` to analyze a byte array for UTF-8 contents: it detects invalid encoding, computes number of codepoints and content kind: plain ASCII, 8-bit, 16-bit or larger codepoints. - add `utf8_encode_len(c)` to compute the number of bytes to encode `c` - rename `unicode_to_utf8` as `utf8_encode` - rename `unicode_from_utf8` as `utf8_decode` - add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded byte array known to contain only ASCII and 8-bit codepoints. - add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs for non-BMP1 codepoints. - add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit codepoints as a UTF-8 encoded null terminated string - add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string - detect invalid UTF-8 encoding in RegExp parser - simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions - simplify UTF-8 decoding and error testing
2024-05-21 14:08:33 +02:00 · 2024-05-21 14:08:33 +02:00 · 1baa6763f8
commit 1baa6763f8
parent f588210641
5 changed files with 490 additions and 269 deletions
--- a/cutils.c
+++ b/cutils.c
@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s)
    memset(s, 0, sizeof(*s));
 }

-/*--- Unicode / UTF-8 utility functions --*/
+/*--- UTF-8 utility functions --*/

-/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes
-   are output. */
-int unicode_to_utf8(uint8_t *buf, unsigned int c)
+/* Note: only encode valid codepoints (0x0000..0x10FFFF).
+   At most UTF8_CHAR_LEN_MAX bytes are output. */
+
+/* Compute the number of bytes of the UTF-8 encoding for a codepoint
+   `c` is a code-point.
+   Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
+   return value is 3 as the codepoint would be encoded as 0xFFFD.
+ */
+size_t utf8_encode_len(uint32_t c)
 {
-    uint8_t *q = buf;
+    if (c < 0x80)
+        return 1;
+    if (c < 0x800)
+        return 2;
+    if (c < 0x10000)
+        return 3;
+    if (c < 0x110000)
+        return 4;
+    return 3;
+}

+/* Encode a codepoint in UTF-8
+   `buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes
+   `c` is a code-point.
+   Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
+   return value is 3 and the codepoint is encoded as 0xFFFD.
+   No null byte is stored after the encoded bytes.
+   Return value is in range 1..4
+ */
+size_t utf8_encode(uint8_t *buf, uint32_t c)
+{
    if (c < 0x80) {
-        *q++ = c;
-    } else {
+        buf[0] = c;
+        return 1;
+    }
    if (c < 0x800) {
-            *q++ = (c >> 6) | 0xc0;
-        } else {
+        buf[0] = (c >> 6) | 0xC0;
+        buf[1] = (c & 0x3F) | 0x80;
+        return 2;
+    }
    if (c < 0x10000) {
-                *q++ = (c >> 12) | 0xe0;
-            } else {
-                if (c < 0x00200000) {
-                    *q++ = (c >> 18) | 0xf0;
-                } else {
-                    if (c < 0x04000000) {
-                        *q++ = (c >> 24) | 0xf8;
-                    } else if (c < 0x80000000) {
-                        *q++ = (c >> 30) | 0xfc;
-                        *q++ = ((c >> 24) & 0x3f) | 0x80;
-                    } else {
-                        return 0;
+        buf[0] = (c >> 12) | 0xE0;
+        buf[1] = ((c >> 6) & 0x3F) | 0x80;
+        buf[2] = (c & 0x3F) | 0x80;
+        return 3;
    }
-                    *q++ = ((c >> 18) & 0x3f) | 0x80;
+    if (c < 0x110000) {
+        buf[0] = (c >> 18) | 0xF0;
+        buf[1] = ((c >> 12) & 0x3F) | 0x80;
+        buf[2] = ((c >> 6) & 0x3F) | 0x80;
+        buf[3] = (c & 0x3F) | 0x80;
+        return 4;
    }
-                *q++ = ((c >> 12) & 0x3f) | 0x80;
-            }
-            *q++ = ((c >> 6) & 0x3f) | 0x80;
-        }
-        *q++ = (c & 0x3f) | 0x80;
-    }
-    return q - buf;
+    buf[0] = (0xFFFD >> 12) | 0xE0;
+    buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80;
+    buf[2] = (0xFFFD & 0x3F) | 0x80;
+    return 3;
 }

-static const unsigned int utf8_min_code[5] = {
-    0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
-};
-
-static const unsigned char utf8_first_code_mask[5] = {
-    0x1f, 0xf, 0x7, 0x3, 0x1,
-};
-
-/* return -1 if error. *pp is not updated in this case. max_len must
-   be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
-int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
+/* Decode a single code point from a UTF-8 encoded array of bytes
+   `p` is a valid pointer to an array of bytes
+   `max_len` is the number of bytes available in the array
+   `pp` is a valid pointer to a `const uint8_t *` to store a pointer
+   to the byte following the current sequence.
+   Return the code point at `p`, in the range `0..0x10FFFF`
+   Return 0xFFFD on error. Only a single byte is consumed in this case
+   The maximum length for a UTF-8 byte sequence is 4 bytes.
+   This implements the algorithm specified in whatwg.org, except it accepts
+   UTF-8 encoded surrogates as JavaScript allows them in strings.
+   cf: https://encoding.spec.whatwg.org/#utf-8-decoder
+ */
+uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
 {
-    int l, c, b, i;
+    uint32_t c;
+    uint8_t lower, upper;

    c = *p++;
    if (c < 0x80) {
@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
        return c;
    }
    switch(c) {
-    case 0xc0: case 0xc1: case 0xc2: case 0xc3:
-    case 0xc4: case 0xc5: case 0xc6: case 0xc7:
-    case 0xc8: case 0xc9: case 0xca: case 0xcb:
-    case 0xcc: case 0xcd: case 0xce: case 0xcf:
-    case 0xd0: case 0xd1: case 0xd2: case 0xd3:
-    case 0xd4: case 0xd5: case 0xd6: case 0xd7:
-    case 0xd8: case 0xd9: case 0xda: case 0xdb:
-    case 0xdc: case 0xdd: case 0xde: case 0xdf:
-        l = 1;
+    case 0xC2: case 0xC3:
+    case 0xC4: case 0xC5: case 0xC6: case 0xC7:
+    case 0xC8: case 0xC9: case 0xCA: case 0xCB:
+    case 0xCC: case 0xCD: case 0xCE: case 0xCF:
+    case 0xD0: case 0xD1: case 0xD2: case 0xD3:
+    case 0xD4: case 0xD5: case 0xD6: case 0xD7:
+    case 0xD8: case 0xD9: case 0xDA: case 0xDB:
+    case 0xDC: case 0xDD: case 0xDE: case 0xDF:
+        if (max_len < 2) {
+            // need more bytes
            break;
-    case 0xe0: case 0xe1: case 0xe2: case 0xe3:
-    case 0xe4: case 0xe5: case 0xe6: case 0xe7:
-    case 0xe8: case 0xe9: case 0xea: case 0xeb:
-    case 0xec: case 0xed: case 0xee: case 0xef:
-        l = 2;
+        }
+        if (*p >= 0x80 && *p <= 0xBF) {
+            *pp = p + 1;
+            return ((c - 0xC0) << 6) + (*p - 0x80);
+        }
+        // otherwise encoding error
        break;
-    case 0xf0: case 0xf1: case 0xf2: case 0xf3:
-    case 0xf4: case 0xf5: case 0xf6: case 0xf7:
-        l = 3;
+    case 0xE0:
+        lower = 0xA0;   /* reject invalid encoding */
+        goto need2;
+    case 0xE1: case 0xE2: case 0xE3:
+    case 0xE4: case 0xE5: case 0xE6: case 0xE7:
+    case 0xE8: case 0xE9: case 0xEA: case 0xEB:
+    case 0xEC: case 0xED: case 0xEE: case 0xEF:
+        lower = 0x80;
+    need2:
+        if (max_len < 3) {
+            // need more bytes
            break;
-    case 0xf8: case 0xf9: case 0xfa: case 0xfb:
-        l = 4;
+        }
+        if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
+            *pp = p + 2;
+            return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
+        }
+        // otherwise encoding error
        break;
-    case 0xfc: case 0xfd:
-        l = 5;
+    case 0xF0:
+        lower = 0x90;   /* reject invalid encoding */
+        upper = 0xBF;
+        goto need3;
+    case 0xF4:
+        lower = 0x80;
+        upper = 0x8F;   /* reject values above 0x10FFFF */
+        goto need3;
+    case 0xF1: case 0xF2: case 0xF3:
+        lower = 0x80;
+        upper = 0xBF;
+    need3:
+        if (max_len < 4) {
+            // need more bytes
+            break;
+        }
+        if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
+        &&  p[2] >= 0x80 && p[2] <= 0xBF) {
+            *pp = p + 3;
+            return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) +
+                ((p[1] - 0x80) << 6) + (p[2] - 0x80);
+        }
+        // otherwise encoding error
        break;
    default:
-        return -1;
+        // invalid lead byte
+        break;
    }
-    /* check that we have enough characters */
-    if (l > (max_len - 1))
-        return -1;
-    c &= utf8_first_code_mask[l - 1];
-    for(i = 0; i < l; i++) {
-        b = *p++;
-        if (b < 0x80 || b >= 0xc0)
-            return -1;
-        c = (c << 6) | (b & 0x3f);
-    }
-    if (c < utf8_min_code[l - 1])
-        return -1;
    *pp = p;
-    return c;
+    return 0xFFFD;
+}
+
+/* Scan a UTF-8 encoded buffer for content type
+   `buf` is a valid pointer to a UTF-8 encoded string
+   `buf_len` is the number of bytes to scan
+   `plen` points to a `size_t` variable to receive the number of units
+   Return value is a mask of bits.
+   - `UTF8_PLAIN_ASCII`: return value for 7-bit ASCII plain text
+   - `UTF8_NON_ASCII`: bit for non ASCII code points (8-bit or more)
+   - `UTF8_HAS_16BIT`: bit for 16-bit code points
+   - `UTF8_HAS_NON_BMP1`: bit for non-BMP1 code points, needs UTF-16 surrogate pairs
+   - `UTF8_HAS_ERRORS`: bit for encoding errors
+ */
+int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
+{
+    const uint8_t *p, *p_end, *p_next;
+    size_t i, len;
+    int kind;
+    uint8_t cbits;
+
+    kind = UTF8_PLAIN_ASCII;
+    cbits = 0;
+    len = buf_len;
+    // TODO: handle more than 1 byte at a time
+    for (i = 0; i < buf_len; i++)
+        cbits |= buf[i];
+    if (cbits >= 0x80) {
+        p = (const uint8_t *)buf;
+        p_end = p + buf_len;
+        kind = UTF8_NON_ASCII;
+        len = 0;
+        while (p < p_end) {
+            len++;
+            if (*p++ >= 0x80) {
+                /* parse UTF-8 sequence, check for encoding error */
+                uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
+                if (p_next == p)
+                    kind |= UTF8_HAS_ERRORS;
+                p = p_next;
+                if (c > 0xFF) {
+                    kind |= UTF8_HAS_16BIT;
+                    if (c > 0xFFFF) {
+                        len++;
+                        kind |= UTF8_HAS_NON_BMP1;
+                    }
+                }
+            }
+        }
+    }
+    *plen = len;
+    return kind;
+}
+
+/* Decode a string encoded in UTF-8 into an array of bytes
+   `src` points to the source string. It is assumed to be correctly encoded
+   and only contains code points below 0x100 (each must fit in one byte of `dest`)
+   `src_len` is the length of the source string
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len)
+{
+    const uint8_t *p, *p_end;
+    size_t i;
+
+    p = (const uint8_t *)src;
+    p_end = p + src_len;
+    for (i = 0; p < p_end; i++) {
+        uint32_t c = *p++;
+        if (c >= 0xC0)
+            c = (c << 6) + *p++ - ((0xC0 << 6) + 0x80);
+        if (i < dest_len)
+            dest[i] = c;
+    }
+    if (i < dest_len)
+        dest[i] = '\0';
+    else if (dest_len > 0)
+        dest[dest_len - 1] = '\0';
+    return i;
+}
+
+/* Decode a string encoded in UTF-8 into an array of 16-bit words
+   `src` points to the source string. It is assumed to be correctly encoded.
+   `src_len` is the length of the source string
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length of the destination array. No null terminator is
+   stored at the end of the array.
+ */
+size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len)
+{
+    const uint8_t *p, *p_end;
+    size_t i;
+
+    p = (const uint8_t *)src;
+    p_end = p + src_len;
+    for (i = 0; p < p_end; i++) {
+        uint32_t c = *p++;
+        if (c >= 0x80) {
+            /* parse utf-8 sequence */
+            c = utf8_decode(p - 1, p_end - (p - 1), &p);
+            /* encoding errors are converted as 0xFFFD and use a single byte */
+            if (c > 0xFFFF) {
+                if (i < dest_len)
+                    dest[i] = get_hi_surrogate(c);
+                i++;
+                c = get_lo_surrogate(c);
+            }
+        }
+        if (i < dest_len)
+            dest[i] = c;
+    }
+    return i;
+}
+
+/* Encode a buffer of 8-bit bytes as a UTF-8 encoded string
+   `src` points to the source buffer.
+   `src_len` is the length of the source buffer
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length in bytes of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len)
+{
+    size_t i, j;
+    uint32_t c;
+
+    for (i = j = 0; i < src_len; i++) {
+        c = src[i];
+        if (c < 0x80) {
+            if (j + 1 >= dest_len)
+                goto overflow;
+            dest[j++] = c;
+        } else {
+            if (j + 2 >= dest_len)
+                goto overflow;
+            dest[j++] = (c >> 6) | 0xC0;
+            dest[j++] = (c & 0x3F) | 0x80;
+        }
+    }
+    if (j < dest_len)
+        dest[j] = '\0';
+    return j;
+
+overflow:
+    if (j < dest_len)
+        dest[j] = '\0';
+    while (i < src_len)
+        j += 1 + (src[i++] >= 0x80);
+    return j;
+}
+
+/* Encode a buffer of 16-bit code points as a UTF-8 encoded string
+   `src` points to the source buffer.
+   `src_len` is the length of the source buffer
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length in bytes of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len)
+{
+    size_t i, j;
+    uint32_t c;
+
+    for (i = j = 0; i < src_len;) {
+        c = src[i++];
+        if (c < 0x80) {
+            if (j + 1 >= dest_len)
+                goto overflow;
+            dest[j++] = c;
+        } else {
+            if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
+                c = from_surrogate(c, src[i++]);
+            if (j + utf8_encode_len(c) >= dest_len)
+                goto overflow;
+            j += utf8_encode((uint8_t *)dest + j, c);
+        }
+    }
+    if (j < dest_len)
+        dest[j] = '\0';
+    return j;
+
+overflow:
+    i -= 1 + (c > 0xFFFF);
+    if (j < dest_len)
+        dest[j] = '\0';
+    while (i < src_len) {
+        c = src[i++];
+        if (c < 0x80) {
+            j++;
+        } else {
+            if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
+                c = from_surrogate(c, src[i++]);
+            j += utf8_encode_len(c);
+        }
+    }
+    return j;
 }

 /*--- integer to string conversions --*/
--- a/cutils.h
+++ b/cutils.h
@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s)
    s->error = TRUE;
 }

-#define UTF8_CHAR_LEN_MAX 6
+/*---- UTF-8 and UTF-16 handling ----*/

-int unicode_to_utf8(uint8_t *buf, unsigned int c);
-int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
+#define UTF8_CHAR_LEN_MAX 4
+
+enum {
+    UTF8_PLAIN_ASCII  = 0,  // 7-bit ASCII plain text
+    UTF8_NON_ASCII    = 1,  // has non ASCII code points (8-bit or more)
+    UTF8_HAS_16BIT    = 2,  // has 16-bit code points
+    UTF8_HAS_NON_BMP1 = 4,  // has non-BMP1 code points, needs UTF-16 surrogate pairs
+    UTF8_HAS_ERRORS   = 8,  // has encoding errors
+};
+int utf8_scan(const char *buf, size_t len, size_t *plen);
+size_t utf8_encode_len(uint32_t c);
+size_t utf8_encode(uint8_t *buf, uint32_t c);
+uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
+size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
+size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
+size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
+size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len);

 static inline BOOL is_surrogate(uint32_t c)
 {
--- a/libregexp.c
+++ b/libregexp.c
@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
 static int get_class_atom(REParseState *s, CharRange *cr,
                          const uint8_t **pp, BOOL inclass)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
    uint32_t c;
    int ret;

@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr,
        /* fall thru */
    default:
    normal_char:
-        /* normal char */
-        if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-            if ((unsigned)c > 0xffff && !s->is_unicode) {
-                /* XXX: should handle non BMP-1 code points */
+        p++;
+        if (c >= 0x80) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
+                return re_parse_error(s, "invalid UTF-8 sequence");
+            p = p_next;
+            if (c > 0xFFFF && !s->is_unicode) {
+                // TODO(chqrlie): should handle non BMP-1 code points in
+                //   the calling function and not require the source string
+                //   to be CESU-8 encoded if not s->is_unicode
                return re_parse_error(s, "malformed unicode char");
            }
-        } else {
-            p++;
        }
        break;
    }
@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
 /* '*pp' is the first char after '<' */
 static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
 {
-    const uint8_t *p, *p1;
+    const uint8_t *p, *p_next;
    uint32_t c, d;
    char *q;

    p = *pp;
    q = buf;
    for(;;) {
-        c = *p;
+        c = *p++;
        if (c == '\\') {
-            p++;
            if (*p != 'u')
                return -1;
            c = lre_parse_escape(&p, 2); // accept surrogate pairs
+            if ((int)c < 0)
+                return -1;
        } else if (c == '>') {
            break;
-        } else if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+        } else if (c >= 0x80) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
+                return -1;
+            p = p_next;
            if (is_hi_surrogate(c)) {
-                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+                d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
                if (is_lo_surrogate(d)) {
                    c = from_surrogate(c, d);
-                    p = p1;
+                    p = p_next;
                }
            }
-        } else {
-            p++;
        }
-        if (c > 0x10FFFF)
-            return -1;
        if (q == buf) {
            if (!lre_js_is_ident_first(c))
                return -1;
@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
        }
        if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
            return -1;
-        if (c < 128) {
+        if (c < 0x80) {
            *q++ = c;
        } else {
-            q += unicode_to_utf8((uint8_t*)q, c);
+            q += utf8_encode((uint8_t*)q, c);
        }
    }
    if (q == buf)
        return -1;
    *q = '\0';
-    p++;
    *pp = p;
    return 0;
 }
--- a/quickjs-libc.c
+++ b/quickjs-libc.c
@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx,
                if (i >= argc)
                    goto missing;
                if (JS_IsString(argv[i])) {
+                    // TODO(chqrlie) need an API to wrap charCodeAt and codePointAt
                    string_arg = JS_ToCString(ctx, argv[i++]);
                    if (!string_arg)
                        goto fail;
-                    int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
+                    int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
                    JS_FreeCString(ctx, string_arg);
                } else {
                    if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
                        goto fail;
                }
-                /* handle utf-8 encoding explicitly */
+                // XXX: throw an exception?
                if ((unsigned)int32_arg > 0x10FFFF)
                    int32_arg = 0xFFFD;
                /* ignore conversion flags, width and precision */
-                len = unicode_to_utf8(cbuf, int32_arg);
+                len = utf8_encode(cbuf, int32_arg);
                dbuf_put(&dbuf, cbuf, len);
                break;

--- a/quickjs.c
+++ b/quickjs.c
@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size,
        snprintf(buf, buf_size, "<invalid %x>", atom);
    } else {
        JSAtomStruct *p = rt->atom_array[atom];
+        *buf = '\0';
        if (atom_is_free(p)) {
            assert(!atom_is_free(p));
            snprintf(buf, buf_size, "<free %x>", atom);
+        } else if (p != NULL) {
+            JSString *str = p;
+            if (str->is_wide_char) {
+                /* encode surrogates correctly */
+                utf8_encode_buf16(buf, buf_size, str->u.str16, str->len);
            } else {
-            int i, c;
-            char *q;
-            JSString *str;
-
-            q = buf;
-            str = p;
-            if (str) {
-                if (!str->is_wide_char) {
                /* special case ASCII strings */
-                    c = 0;
+                int i, c = 0;
                for(i = 0; i < str->len; i++) {
                    c |= str->u.str8[i];
                }
                if (c < 0x80)
                    return (const char *)str->u.str8;
+                utf8_encode_buf8(buf, buf_size, str->u.str8, str->len);
            }
-                for(i = 0; i < str->len; i++) {
-                    c = string_get(str, i);
-                    if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX)
-                        break;
-                    if (c < 128) {
-                        *q++ = c;
-                    } else {
-                        q += unicode_to_utf8((uint8_t *)q, c);
-                    }
-                }
-            }
-            *q = '\0';
        }
    }
    return buf;
@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)

 /* return a string atom containing name concatenated with str1 */
 /* `str1` may be pure ASCII or UTF-8 encoded */
+// TODO(chqrlie): use string concatenation instead of UTF-8 conversion
 static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
 {
    JSValue str;
@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s)
 /* create a string from a UTF-8 buffer */
 JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
 {
-    const uint8_t *p, *p_end, *p_start, *p_next;
-    uint32_t c;
-    StringBuffer b_s, *b = &b_s;
-    size_t len1;
+    JSString *str;
+    size_t len;
+    int kind;

    if (buf_len <= 0) {
        return JS_AtomToString(ctx, JS_ATOM_empty_string);
    }
-    p_start = (const uint8_t *)buf;
-    p_end = p_start + buf_len;
-    p = p_start;
-    while (p < p_end && *p < 128)
-        p++;
-    len1 = p - p_start;
-    if (len1 > JS_STRING_LEN_MAX)
+    /* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */
+    kind = utf8_scan(buf, buf_len, &len);
+    if (len > JS_STRING_LEN_MAX)
        return JS_ThrowRangeError(ctx, "invalid string length");
-    if (p == p_end) {
-        /* ASCII string */
-        return js_new_string8_len(ctx, buf, buf_len);
-    } else {
-        if (string_buffer_init(ctx, b, buf_len))
-            goto fail;
-        string_buffer_write8(b, p_start, len1);
-        while (p < p_end) {
-            if (*p < 128) {
-                string_buffer_putc8(b, *p++);
-            } else {
-                /* parse utf-8 sequence, return 0xFFFFFFFF for error */
-                c = unicode_from_utf8(p, p_end - p, &p_next);
-                if (c < 0x10000) {
-                    p = p_next;
-                } else if (c <= 0x10FFFF) {
-                    p = p_next;
-                    /* surrogate pair */
-                    string_buffer_putc16(b, get_hi_surrogate(c));
-                    c = get_lo_surrogate(c);
-                } else {
-                    /* invalid char */
-                    c = 0xfffd;
-                    /* skip the invalid chars */
-                    /* XXX: seems incorrect. Why not just use c = *p++; ? */
-                    while (p < p_end && (*p >= 0x80 && *p < 0xc0))
-                        p++;
-                    if (p < p_end) {
-                        p++;
-                        while (p < p_end && (*p >= 0x80 && *p < 0xc0))
-                            p++;
-                    }
-                }
-                string_buffer_putc16(b, c);
-            }
-        }
-    }
-    return string_buffer_end(b);

- fail:
-    string_buffer_free(b);
+    switch (kind) {
+    case UTF8_PLAIN_ASCII:
+        str = js_alloc_string(ctx, len, 0);
+        if (!str)
            return JS_EXCEPTION;
+        memcpy(str->u.str8, buf, len);
+        str->u.str8[len] = '\0';
+        break;
+    case UTF8_NON_ASCII:
+        /* buf contains non-ASCII code-points, but limited to 8-bit values */
+        str = js_alloc_string(ctx, len, 0);
+        if (!str)
+            return JS_EXCEPTION;
+        utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len);
+        break;
+    default:
+        // This causes a potential problem in JS_ThrowError if message is invalid
+        //if (kind & UTF8_HAS_ERRORS)
+        //    return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence");
+        str = js_alloc_string(ctx, len, 1);
+        if (!str)
+            return JS_EXCEPTION;
+        utf8_decode_buf16(str->u.str16, len, buf, buf_len);
+        break;
+    }
+    return JS_MKPTR(JS_TAG_STRING, str);
 }

 static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
@ -4067,7 +4035,7 @@ go:
                        /* c = 0xfffd; */ /* error */
                    }
                }
-                q += unicode_to_utf8(q, c);
+                q += utf8_encode(q, c);
            }
        }
    }
@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val)
    return JS_ToBoolFree(ctx, js_dup(val));
 }

+/* pc points to pure ASCII or UTF-8, null terminated contents */
 static int skip_spaces(const char *pc)
 {
    const uint8_t *p, *p_next, *p_start;
@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc)

    p = p_start = (const uint8_t *)pc;
    for (;;) {
-        c = *p;
-        if (c < 128) {
+        c = *p++;
+        if (c < 0x80) {
            if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
                break;
-            p++;
        } else {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            /* no need to test for invalid UTF-8, 0xFFFD is not a space */
            if (!lre_is_space(c))
                break;
            p = p_next;
        }
    }
-    return p - p_start;
+    return p - 1 - p_start;
 }

 static inline int to_digit(int c)
@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s)
 static __exception int js_parse_template_part(JSParseState *s,
                                              const uint8_t *p)
 {
+    const uint8_t *p_next;
    uint32_t c;
    StringBuffer b_s, *b = &b_s;

@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s,
            s->eol = &p[-1];
            s->mark = p;
        } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p) {
                js_parse_error(s, "invalid UTF-8 sequence");
                goto fail;
            }
@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                                       BOOL do_throw, const uint8_t *p,
                                       JSToken *token, const uint8_t **pp)
 {
+    const uint8_t *p_next;
    int ret;
    uint32_t c;
    StringBuffer b_s, *b = &b_s;
@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                    }
                    goto fail;
                } else if (c >= 0x80) {
-                    const uint8_t *p_next;
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
-                    if (c > 0x10FFFF) {
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                    if (p_next == p + 1) {
                        goto invalid_utf8;
                    }
                    p = p_next;
@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                break;
            }
        } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF)
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
                goto invalid_utf8;
            p = p_next;
        }
@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) {

 static __exception int js_parse_regexp(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
    BOOL in_class;
    StringBuffer b_s, *b = &b_s;
    StringBuffer b2_s, *b2 = &b2_s;
@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s)
            else if (c == '\0' && p >= s->buf_end)
                goto eof_error;
            else if (c >= 0x80) {
-                const uint8_t *p_next;
-                c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-                if (c > 0x10FFFF) {
+                c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+                if (p_next == p) {
                    goto invalid_utf8;
                }
                p = p_next;
@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s)
                    goto eol_error;
            }
        } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p) {
            invalid_utf8:
                js_parse_error(s, "invalid UTF-8 sequence");
                goto fail;
@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s)

    /* flags */
    for(;;) {
-        const uint8_t *p_next = p;
-        c = *p_next++;
-        if (c >= 0x80) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
-                goto invalid_utf8;
-            }
-        }
+        c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+        /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
        if (!lre_js_is_ident_next(c))
            break;
        if (string_buffer_putc(b2, c))
@@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
 static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
                          BOOL *pident_has_escape, int c, BOOL is_private)
 {
-    const uint8_t *p, *p1;
+    const uint8_t *p, *p_next;
    char ident_buf[128], *buf;
    size_t ident_size, ident_pos;
-    JSAtom atom;
+    JSAtom atom = JS_ATOM_NULL;

    p = *pp;
    buf = ident_buf;
@@ -19032,30 +18992,28 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
    if (is_private)
        buf[ident_pos++] = '#';
    for(;;) {
-        p1 = p;
-
-        if (c < 128) {
+        if (c < 0x80) {
            buf[ident_pos++] = c;
        } else {
-            ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c);
+            ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c);
        }
-        c = *p1++;
-        if (c == '\\' && *p1 == 'u') {
-            c = lre_parse_escape(&p1, TRUE);
+        c = *p;
+        p_next = p + 1;
+        if (c == '\\' && *p_next == 'u') {
+            c = lre_parse_escape(&p_next, TRUE);
            *pident_has_escape = TRUE;
-        } else if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+        } else if (c >= 0x80) {
+            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
        }
        if (!lre_js_is_ident_next(c))
            break;
-        p = p1;
+        p = p_next;
        if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
-            if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) {
-                atom = JS_ATOM_NULL;
+            if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf))
                goto done;
        }
    }
-    }
    /* buf is pure ASCII or UTF-8 encoded */
    atom = JS_NewAtomLen(s->ctx, buf, ident_pos);
 done:
@@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,

 static __exception int next_token(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
    int c;
    BOOL ident_has_escape;
    JSAtom atom;
@@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s)
                    s->got_lf = TRUE; /* considered as LF for ASI */
                    p++;
                } else if (*p >= 0x80) {
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    /* ignore invalid UTF-8 in comments */
                    if (c == CP_LS || c == CP_PS) {
                        s->got_lf = TRUE; /* considered as LF for ASI */
-                    } else if (c == -1) {
-                        p++; /* skip invalid UTF-8 */
                    }
                } else {
                    p++;
@@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s)
                if (*p == '\r' || *p == '\n')
                    break;
                if (*p >= 0x80) {
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    /* ignore invalid UTF-8 in comments */
                    /* LS or PS are considered as line terminator */
                    if (c == CP_LS || c == CP_PS) {
                        break;
-                    } else if (c == -1) {
-                        p++; /* skip invalid UTF-8 */
                    }
                } else {
                    p++;
@@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s)
    case '#':
        /* private name */
        {
-            const uint8_t *p1;
            p++;
-            p1 = p;
-            c = *p1++;
-            if (c == '\\' && *p1 == 'u') {
-                c = lre_parse_escape(&p1, TRUE);
-            } else if (c >= 128) {
-                c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+            c = *p;
+            p_next = p + 1;
+            if (c == '\\' && *p_next == 'u') {
+                c = lre_parse_escape(&p_next, TRUE);
+            } else if (c >= 0x80) {
+                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                if (p_next == p + 1)
+                    goto invalid_utf8;
            }
            if (!lre_js_is_ident_first(c)) {
                js_parse_error(s, "invalid first character of private name");
                goto fail;
            }
-            p = p1;
+            p = p_next;
            ident_has_escape = FALSE; /* not used */
            atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
            if (atom == JS_ATOM_NULL)
@@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s)
    parse_number:
        {
            JSValue ret;
-            const uint8_t *p1;
            int flags;
            flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
                ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
@@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s)
                goto fail;
            /* reject `10instanceof Number` */
            if (JS_VALUE_IS_NAN(ret) ||
-                lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) {
+                lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
                JS_FreeValue(s->ctx, ret);
                js_parse_error(s, "invalid number literal");
                goto fail;
@@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s)
        }
        break;
    default:
-        if (c >= 128) {
-            /* unicode value */
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+        if (c >= 0x80) {  /* non-ASCII code-point */
+            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p + 1)
+                goto invalid_utf8;
+            p = p_next;
            switch(c) {
            case CP_PS:
            case CP_LS:
@@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s)
    //    dump_token(s, &s->token);
    return 0;

+ invalid_utf8:
+    js_parse_error(s, "invalid UTF-8 sequence");
 fail:
    s->token.val = TOK_ERROR;
    return -1;
@@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms

 static int json_parse_string(JSParseState *s, const uint8_t **pp)
 {
-    const uint8_t *p = *pp;
+    const uint8_t *p, *p_next;
    int i;
    uint32_t c;
    StringBuffer b_s, *b = &b_s;
@@ -19581,6 +19541,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
    if (string_buffer_init(s->ctx, b, 32))
        goto fail;

+    p = *pp;
    for(;;) {
        if (p >= s->buf_end) {
            goto end_of_input;
@@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
            }
        } else
        if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, s->buf_end - p, &p_next);
+            if (p_next == p) {
                json_parse_error(s, p - 1, "Bad UTF-8 sequence");
                goto fail;
            }
@@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)

 static __exception int json_next_token(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
    int c;
    JSAtom atom;

@@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s)
            goto fail;
        break;
    default:
-        if (c >= 128) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p, s->buf_end - p, &p_next);
-            if (c == -1) {
+        if (c >= 0x80) {
+            c = utf8_decode(p, s->buf_end - p, &p_next);
+            if (p_next == p + 1) {
                js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
            } else {
                if (c > 0xFFFF) {
@@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
            if (*p == '\n' || *p == '\r') {
                break;
            } else if (*p >= 0x80) {
-                c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-                if (c == CP_LS || c == CP_PS) {
+                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                /* purposely ignore UTF-8 encoding errors in this comment line */
+                if (c == CP_LS || c == CP_PS)
                    break;
-                } else if (c == -1) {
-                    p++; /* skip invalid UTF-8 */
-                }
            } else {
                p++;
            }