From 1baa6763f894a59c03d9f5c9e969ea83cf519d81 Mon Sep 17 00:00:00 2001
From: Charlie Gordon <github@chqrlie.org>
Date: Tue, 21 May 2024 14:08:33 +0200
Subject: [PATCH] Improve UTF-8 decoding and encoding functions (#410)

Ensure proper UTF-8 encoding (1 to 4 bytes).
Handle invalid encodings (return 0xFFFD and consume a single byte).
Individually encoded surrogate code points are accepted.

- add `utf8_scan()` to analyze a byte array for UTF-8 contents
  detects invalid encoding, computes number of codepoints and content kind:
  plain ASCII, 8-bit, 16-bit or larger codepoints.
- add `utf8_encode_len(c)` to compute the number of bytes to encode `c`
- rename `unicode_to_utf8` as `utf8_encode`
- rename `unicode_from_utf8` as `utf8_decode`
- add `utf8_decode_buf8(dest, size, src, len)` to decode a UTF-8 encoded
  byte array known to contain only ASCII and 8-bit codepoints.
- add `utf8_decode_buf16(dest, size, src, len)` to decode a UTF-8 encoded
  byte array into an array of 16-bit codepoints using UTF-16 surrogate pairs
  for non-BMP1 codepoints.
- add `utf8_encode_buf8(dest, size, src, len)` to encode an array of 8-bit
  codepoints as a UTF-8 encoded null terminated string
- add `utf8_encode_buf16(dest, size, src, len)` to encode an array of 16-bit
  codepoints (including surrogate pairs) as a UTF-8 encoded null terminated string
- detect invalid UTF-8 encoding in RegExp parser
- simplify `JS_AtomGetStrRT`, `JS_NewStringLen` using the above functions
- simplify UTF-8 decoding and error testing
---
 cutils.c       | 414 +++++++++++++++++++++++++++++++++++++++----------
 cutils.h       |  21 ++-
 libregexp.c    |  46 +++---
 quickjs-libc.c |   7 +-
 quickjs.c      | 271 ++++++++++++++------------------
 5 files changed, 490 insertions(+), 269 deletions(-)

diff --git a/cutils.c b/cutils.c
index 4973455..88cd72d 100644
--- a/cutils.c
+++ b/cutils.c
@@ -213,58 +213,83 @@ void dbuf_free(DynBuf *s)
     memset(s, 0, sizeof(*s));
 }
 
-/*--- Unicode / UTF-8 utility functions --*/
+/*--- UTF-8 utility functions --*/
 
-/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes
-   are output. */
-int unicode_to_utf8(uint8_t *buf, unsigned int c)
+/* Note: only encode valid codepoints (0x0000..0x10FFFF).
+   At most UTF8_CHAR_LEN_MAX bytes are output. */
+
+/* Compute the number of bytes of the UTF-8 encoding for a codepoint
+   `c` is a code-point.
+   Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
+   return value is 3 as the codepoint would be encoded as 0xFFFD.
+ */
+size_t utf8_encode_len(uint32_t c)
 {
-    uint8_t *q = buf;
-
-    if (c < 0x80) {
-        *q++ = c;
-    } else {
-        if (c < 0x800) {
-            *q++ = (c >> 6) | 0xc0;
-        } else {
-            if (c < 0x10000) {
-                *q++ = (c >> 12) | 0xe0;
-            } else {
-                if (c < 0x00200000) {
-                    *q++ = (c >> 18) | 0xf0;
-                } else {
-                    if (c < 0x04000000) {
-                        *q++ = (c >> 24) | 0xf8;
-                    } else if (c < 0x80000000) {
-                        *q++ = (c >> 30) | 0xfc;
-                        *q++ = ((c >> 24) & 0x3f) | 0x80;
-                    } else {
-                        return 0;
-                    }
-                    *q++ = ((c >> 18) & 0x3f) | 0x80;
-                }
-                *q++ = ((c >> 12) & 0x3f) | 0x80;
-            }
-            *q++ = ((c >> 6) & 0x3f) | 0x80;
-        }
-        *q++ = (c & 0x3f) | 0x80;
-    }
-    return q - buf;
+    if (c < 0x80)
+        return 1;
+    if (c < 0x800)
+        return 2;
+    if (c < 0x10000)
+        return 3;
+    if (c < 0x110000)
+        return 4;
+    return 3;
 }
 
-static const unsigned int utf8_min_code[5] = {
-    0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
-};
-
-static const unsigned char utf8_first_code_mask[5] = {
-    0x1f, 0xf, 0x7, 0x3, 0x1,
-};
-
-/* return -1 if error. *pp is not updated in this case. max_len must
-   be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */
-int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
+/* Encode a codepoint in UTF-8
+   `buf` points to an array of at least `UTF8_CHAR_LEN_MAX` bytes
+   `c` is a code-point.
+   Returns the number of bytes. If a codepoint is beyond 0x10FFFF the
+   return value is 3 and the codepoint is encoded as 0xFFFD.
+   No null byte is stored after the encoded bytes.
+   Return value is in range 1..4
+ */
+size_t utf8_encode(uint8_t *buf, uint32_t c)
 {
-    int l, c, b, i;
+    if (c < 0x80) {
+        buf[0] = c;
+        return 1;
+    }
+    if (c < 0x800) {
+        buf[0] = (c >> 6) | 0xC0;
+        buf[1] = (c & 0x3F) | 0x80;
+        return 2;
+    }
+    if (c < 0x10000) {
+        buf[0] = (c >> 12) | 0xE0;
+        buf[1] = ((c >> 6) & 0x3F) | 0x80;
+        buf[2] = (c & 0x3F) | 0x80;
+        return 3;
+    }
+    if (c < 0x110000) {
+        buf[0] = (c >> 18) | 0xF0;
+        buf[1] = ((c >> 12) & 0x3F) | 0x80;
+        buf[2] = ((c >> 6) & 0x3F) | 0x80;
+        buf[3] = (c & 0x3F) | 0x80;
+        return 4;
+    }
+    buf[0] = (0xFFFD >> 12) | 0xE0;
+    buf[1] = ((0xFFFD >> 6) & 0x3F) | 0x80;
+    buf[2] = (0xFFFD & 0x3F) | 0x80;
+    return 3;
+}
+
+/* Decode a single code point from a UTF-8 encoded array of bytes
+   `p` is a valid pointer to an array of bytes
+   `max_len` is the number of bytes available in the array
+   `pp` is a valid pointer to a `const uint8_t *` to store a pointer
+   to the byte following the current sequence.
+   Return the code point at `p`, in the range `0..0x10FFFF`
+   Return 0xFFFD on error. Only a single byte is consumed in this case
+   The maximum length for a UTF-8 byte sequence is 4 bytes.
+   This implements the algorithm specified in whatwg.org, except it accepts
+   UTF-8 encoded surrogates as JavaScript allows them in strings.
+   cf: https://encoding.spec.whatwg.org/#utf-8-decoder
+ */
+uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
+{
+    uint32_t c;
+    uint8_t lower, upper;
 
     c = *p++;
     if (c < 0x80) {
@@ -272,49 +297,270 @@ int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp)
         return c;
     }
     switch(c) {
-    case 0xc0: case 0xc1: case 0xc2: case 0xc3:
-    case 0xc4: case 0xc5: case 0xc6: case 0xc7:
-    case 0xc8: case 0xc9: case 0xca: case 0xcb:
-    case 0xcc: case 0xcd: case 0xce: case 0xcf:
-    case 0xd0: case 0xd1: case 0xd2: case 0xd3:
-    case 0xd4: case 0xd5: case 0xd6: case 0xd7:
-    case 0xd8: case 0xd9: case 0xda: case 0xdb:
-    case 0xdc: case 0xdd: case 0xde: case 0xdf:
-        l = 1;
+    case 0xC2: case 0xC3:
+    case 0xC4: case 0xC5: case 0xC6: case 0xC7:
+    case 0xC8: case 0xC9: case 0xCA: case 0xCB:
+    case 0xCC: case 0xCD: case 0xCE: case 0xCF:
+    case 0xD0: case 0xD1: case 0xD2: case 0xD3:
+    case 0xD4: case 0xD5: case 0xD6: case 0xD7:
+    case 0xD8: case 0xD9: case 0xDA: case 0xDB:
+    case 0xDC: case 0xDD: case 0xDE: case 0xDF:
+        if (max_len < 2) {
+            // need more bytes
+            break;
+        }
+        if (*p >= 0x80 && *p <= 0xBF) {
+            *pp = p + 1;
+            return ((c - 0xC0) << 6) + (*p - 0x80);
+        }
+        // otherwise encoding error
         break;
-    case 0xe0: case 0xe1: case 0xe2: case 0xe3:
-    case 0xe4: case 0xe5: case 0xe6: case 0xe7:
-    case 0xe8: case 0xe9: case 0xea: case 0xeb:
-    case 0xec: case 0xed: case 0xee: case 0xef:
-        l = 2;
+    case 0xE0:
+        lower = 0xA0;   /* reject invalid encoding */
+        goto need2;
+    case 0xE1: case 0xE2: case 0xE3:
+    case 0xE4: case 0xE5: case 0xE6: case 0xE7:
+    case 0xE8: case 0xE9: case 0xEA: case 0xEB:
+    case 0xEC: case 0xED: case 0xEE: case 0xEF:
+        lower = 0x80;
+    need2:
+        if (max_len < 3) {
+            // need more bytes
+            break;
+        }
+        if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
+            *pp = p + 2;
+            return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
+        }
+        // otherwise encoding error
         break;
-    case 0xf0: case 0xf1: case 0xf2: case 0xf3:
-    case 0xf4: case 0xf5: case 0xf6: case 0xf7:
-        l = 3;
-        break;
-    case 0xf8: case 0xf9: case 0xfa: case 0xfb:
-        l = 4;
-        break;
-    case 0xfc: case 0xfd:
-        l = 5;
+    case 0xF0:
+        lower = 0x90;   /* reject invalid encoding */
+        upper = 0xBF;
+        goto need3;
+    case 0xF4:
+        lower = 0x80;
+        upper = 0x8F;   /* reject values above 0x10FFFF */
+        goto need3;
+    case 0xF1: case 0xF2: case 0xF3:
+        lower = 0x80;
+        upper = 0xBF;
+    need3:
+        if (max_len < 4) {
+            // need more bytes
+            break;
+        }
+        if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
+        &&  p[2] >= 0x80 && p[2] <= 0xBF) {
+            *pp = p + 3;
+            return ((c - 0xF0) << 18) + ((*p - 0x80) << 12) +
+                ((p[1] - 0x80) << 6) + (p[2] - 0x80);
+        }
+        // otherwise encoding error
         break;
     default:
-        return -1;
+        // invalid lead byte
+        break;
     }
-    /* check that we have enough characters */
-    if (l > (max_len - 1))
-        return -1;
-    c &= utf8_first_code_mask[l - 1];
-    for(i = 0; i < l; i++) {
-        b = *p++;
-        if (b < 0x80 || b >= 0xc0)
-            return -1;
-        c = (c << 6) | (b & 0x3f);
-    }
-    if (c < utf8_min_code[l - 1])
-        return -1;
     *pp = p;
-    return c;
+    return 0xFFFD;
+}
+
+/* Scan a UTF-8 encoded buffer for content type
+   `buf` is a valid pointer to a UTF-8 encoded string
+   `buf_len` is the number of bytes to scan
+   `plen` points to a `size_t` variable to receive the number of UTF-16 code units
+   Return value is a mask of bits.
+   - `UTF8_PLAIN_ASCII`: return value for 7-bit ASCII plain text
+   - `UTF8_NON_ASCII`: bit for non ASCII code points (8-bit or more)
+   - `UTF8_HAS_16BIT`: bit for 16-bit code points
+   - `UTF8_HAS_NON_BMP1`: bit for non-BMP1 code points, needs UTF-16 surrogate pairs
+   - `UTF8_HAS_ERRORS`: bit for encoding errors
+ */
+int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
+{
+    const uint8_t *p, *p_end, *p_next;
+    size_t i, len;
+    int kind;
+    uint8_t cbits;
+
+    kind = UTF8_PLAIN_ASCII;
+    cbits = 0;
+    len = buf_len;
+    // TODO: handle more than 1 byte at a time
+    for (i = 0; i < buf_len; i++)
+        cbits |= buf[i];
+    if (cbits >= 0x80) {
+        p = (const uint8_t *)buf;
+        p_end = p + buf_len;
+        kind = UTF8_NON_ASCII;
+        len = 0;
+        while (p < p_end) {
+            len++;
+            if (*p++ >= 0x80) {
+                /* parse UTF-8 sequence, check for encoding error */
+                uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
+                if (p_next == p)
+                    kind |= UTF8_HAS_ERRORS;
+                p = p_next;
+                if (c > 0xFF) {
+                    kind |= UTF8_HAS_16BIT;
+                    if (c > 0xFFFF) {
+                        len++;
+                        kind |= UTF8_HAS_NON_BMP1;
+                    }
+                }
+            }
+        }
+    }
+    *plen = len;
+    return kind;
+}
+
+/* Decode a string encoded in UTF-8 into an array of bytes
+   `src` points to the source string. It is assumed to be correctly encoded
+   and only contains code points below 0x100 (each result is stored in one byte)
+   `src_len` is the length of the source string
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len)
+{
+    const uint8_t *p, *p_end;
+    size_t i;
+
+    p = (const uint8_t *)src;
+    p_end = p + src_len;
+    for (i = 0; p < p_end; i++) {
+        uint32_t c = *p++;
+        if (c >= 0xC0)
+            c = (c << 6) + *p++ - ((0xC0 << 6) + 0x80);
+        if (i < dest_len)
+            dest[i] = c;
+    }
+    if (i < dest_len)
+        dest[i] = '\0';
+    else if (dest_len > 0)
+        dest[dest_len - 1] = '\0';
+    return i;
+}
+
+/* Decode a string encoded in UTF-8 into an array of 16-bit words
+   `src` points to the source string. It is assumed to be correctly encoded.
+   `src_len` is the length of the source string
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length of the destination array. No null terminator is
+   stored at the end of the array.
+ */
+size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len)
+{
+    const uint8_t *p, *p_end;
+    size_t i;
+
+    p = (const uint8_t *)src;
+    p_end = p + src_len;
+    for (i = 0; p < p_end; i++) {
+        uint32_t c = *p++;
+        if (c >= 0x80) {
+            /* parse utf-8 sequence */
+            c = utf8_decode(p - 1, p_end - (p - 1), &p);
+            /* encoding errors are converted as 0xFFFD and use a single byte */
+            if (c > 0xFFFF) {
+                if (i < dest_len)
+                    dest[i] = get_hi_surrogate(c);
+                i++;
+                c = get_lo_surrogate(c);
+            }
+        }
+        if (i < dest_len)
+            dest[i] = c;
+    }
+    return i;
+}
+
+/* Encode a buffer of 8-bit bytes as a UTF-8 encoded string
+   `src` points to the source buffer.
+   `src_len` is the length of the source buffer
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length in bytes of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len)
+{
+    size_t i, j;
+    uint32_t c;
+
+    for (i = j = 0; i < src_len; i++) {
+        c = src[i];
+        if (c < 0x80) {
+            if (j + 1 >= dest_len)
+                goto overflow;
+            dest[j++] = c;
+        } else {
+            if (j + 2 >= dest_len)
+                goto overflow;
+            dest[j++] = (c >> 6) | 0xC0;
+            dest[j++] = (c & 0x3F) | 0x80;
+        }
+    }
+    if (j < dest_len)
+        dest[j] = '\0';
+    return j;
+
+overflow:
+    if (j < dest_len)
+        dest[j] = '\0';
+    while (i < src_len)
+        j += 1 + (src[i++] >= 0x80);
+    return j;
+}
+
+/* Encode a buffer of 16-bit code points as a UTF-8 encoded string
+   `src` points to the source buffer.
+   `src_len` is the length of the source buffer
+   `dest` points to the destination array, it can be null if `dest_len` is `0`
+   `dest_len` is the length in bytes of the destination array. A null
+   terminator is stored at the end of the array unless `dest_len` is `0`.
+ */
+size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len)
+{
+    size_t i, j;
+    uint32_t c;
+
+    for (i = j = 0; i < src_len;) {
+        c = src[i++];
+        if (c < 0x80) {
+            if (j + 1 >= dest_len)
+                goto overflow;
+            dest[j++] = c;
+        } else {
+            if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
+                c = from_surrogate(c, src[i++]);
+            if (j + utf8_encode_len(c) >= dest_len)
+                goto overflow;
+            j += utf8_encode((uint8_t *)dest + j, c);
+        }
+    }
+    if (j < dest_len)
+        dest[j] = '\0';
+    return j;
+
+overflow:
+    i -= 1 + (c > 0xFFFF);
+    if (j < dest_len)
+        dest[j] = '\0';
+    while (i < src_len) {
+        c = src[i++];
+        if (c < 0x80) {
+            j++;
+        } else {
+            if (is_hi_surrogate(c) && i < src_len && is_lo_surrogate(src[i]))
+                c = from_surrogate(c, src[i++]);
+            j += utf8_encode_len(c);
+        }
+    }
+    return j;
 }
 
 /*--- integer to string conversions --*/
diff --git a/cutils.h b/cutils.h
index 79a25de..853277e 100644
--- a/cutils.h
+++ b/cutils.h
@@ -387,10 +387,25 @@ static inline void dbuf_set_error(DynBuf *s)
     s->error = TRUE;
 }
 
-#define UTF8_CHAR_LEN_MAX 6
+/*---- UTF-8 and UTF-16 handling ----*/
 
-int unicode_to_utf8(uint8_t *buf, unsigned int c);
-int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
+#define UTF8_CHAR_LEN_MAX 4
+
+enum {
+    UTF8_PLAIN_ASCII  = 0,  // 7-bit ASCII plain text
+    UTF8_NON_ASCII    = 1,  // has non ASCII code points (8-bit or more)
+    UTF8_HAS_16BIT    = 2,  // has 16-bit code points
+    UTF8_HAS_NON_BMP1 = 4,  // has non-BMP1 code points, needs UTF-16 surrogate pairs
+    UTF8_HAS_ERRORS   = 8,  // has encoding errors
+};
+int utf8_scan(const char *buf, size_t len, size_t *plen);
+size_t utf8_encode_len(uint32_t c);
+size_t utf8_encode(uint8_t *buf, uint32_t c);
+uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
+size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
+size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
+size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
+size_t utf8_encode_buf16(char *dest, size_t dest_len, const uint16_t *src, size_t src_len);
 
 static inline BOOL is_surrogate(uint32_t c)
 {
diff --git a/libregexp.c b/libregexp.c
index 11b574e..4b6aa48 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -712,7 +712,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
 static int get_class_atom(REParseState *s, CharRange *cr,
                           const uint8_t **pp, BOOL inclass)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
     uint32_t c;
     int ret;
 
@@ -804,15 +804,18 @@ static int get_class_atom(REParseState *s, CharRange *cr,
         /* fall thru */
     default:
     normal_char:
-        /* normal char */
-        if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-            if ((unsigned)c > 0xffff && !s->is_unicode) {
-                /* XXX: should handle non BMP-1 code points */
+        p++;
+        if (c >= 0x80) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
+                return re_parse_error(s, "invalid UTF-8 sequence");
+            p = p_next;
+            if (c > 0xFFFF && !s->is_unicode) {
+                // TODO(chqrlie): should handle non BMP-1 code points in
+                //   the calling function and not require the source string
+                //   to be CESU-8 encoded if not s->is_unicode
                 return re_parse_error(s, "malformed unicode char");
             }
-        } else {
-            p++;
         }
         break;
     }
@@ -1105,35 +1108,35 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
 /* '*pp' is the first char after '<' */
 static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
 {
-    const uint8_t *p, *p1;
+    const uint8_t *p, *p_next;
     uint32_t c, d;
     char *q;
 
     p = *pp;
     q = buf;
     for(;;) {
-        c = *p;
+        c = *p++;
         if (c == '\\') {
-            p++;
             if (*p != 'u')
                 return -1;
             c = lre_parse_escape(&p, 2); // accept surrogate pairs
+            if ((int)c < 0)
+                return -1;
         } else if (c == '>') {
             break;
-        } else if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+        } else if (c >= 0x80) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
+                return -1;
+            p = p_next;
             if (is_hi_surrogate(c)) {
-                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+                d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
                 if (is_lo_surrogate(d)) {
                     c = from_surrogate(c, d);
-                    p = p1;
+                    p = p_next;
                 }
             }
-        } else {
-            p++;
         }
-        if (c > 0x10FFFF)
-            return -1;
         if (q == buf) {
             if (!lre_js_is_ident_first(c))
                 return -1;
@@ -1143,16 +1146,15 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
         }
         if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
             return -1;
-        if (c < 128) {
+        if (c < 0x80) {
             *q++ = c;
         } else {
-            q += unicode_to_utf8((uint8_t*)q, c);
+            q += utf8_encode((uint8_t*)q, c);
         }
     }
     if (q == buf)
         return -1;
     *q = '\0';
-    p++;
     *pp = p;
     return 0;
 }
diff --git a/quickjs-libc.c b/quickjs-libc.c
index c5b1235..d8ed30c 100644
--- a/quickjs-libc.c
+++ b/quickjs-libc.c
@@ -272,20 +272,21 @@ static JSValue js_printf_internal(JSContext *ctx,
                 if (i >= argc)
                     goto missing;
                 if (JS_IsString(argv[i])) {
+                    // TODO(chqrlie) need an API to wrap charCodeAt and codePointAt
                     string_arg = JS_ToCString(ctx, argv[i++]);
                     if (!string_arg)
                         goto fail;
-                    int32_arg = unicode_from_utf8((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
+                    int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
                     JS_FreeCString(ctx, string_arg);
                 } else {
                     if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
                         goto fail;
                 }
-                /* handle utf-8 encoding explicitly */
+                // XXX: throw an exception?
                 if ((unsigned)int32_arg > 0x10FFFF)
                     int32_arg = 0xFFFD;
                 /* ignore conversion flags, width and precision */
-                len = unicode_to_utf8(cbuf, int32_arg);
+                len = utf8_encode(cbuf, int32_arg);
                 dbuf_put(&dbuf, cbuf, len);
                 break;
 
diff --git a/quickjs.c b/quickjs.c
index f076d49..54d1778 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -3030,38 +3030,25 @@ static const char *JS_AtomGetStrRT(JSRuntime *rt, char *buf, int buf_size,
         snprintf(buf, buf_size, "<invalid %x>", atom);
     } else {
         JSAtomStruct *p = rt->atom_array[atom];
+        *buf = '\0';
         if (atom_is_free(p)) {
             assert(!atom_is_free(p));
             snprintf(buf, buf_size, "<free %x>", atom);
-        } else {
-            int i, c;
-            char *q;
-            JSString *str;
-
-            q = buf;
-            str = p;
-            if (str) {
-                if (!str->is_wide_char) {
-                    /* special case ASCII strings */
-                    c = 0;
-                    for(i = 0; i < str->len; i++) {
-                        c |= str->u.str8[i];
-                    }
-                    if (c < 0x80)
-                        return (const char *)str->u.str8;
-                }
+        } else if (p != NULL) {
+            JSString *str = p;
+            if (str->is_wide_char) {
+                /* encode surrogates correctly */
+                utf8_encode_buf16(buf, buf_size, str->u.str16, str->len);
+            } else {
+                /* special case ASCII strings */
+                int i, c = 0;
                 for(i = 0; i < str->len; i++) {
-                    c = string_get(str, i);
-                    if ((q - buf) >= buf_size - UTF8_CHAR_LEN_MAX)
-                        break;
-                    if (c < 128) {
-                        *q++ = c;
-                    } else {
-                        q += unicode_to_utf8((uint8_t *)q, c);
-                    }
+                    c |= str->u.str8[i];
                 }
+                if (c < 0x80)
+                    return (const char *)str->u.str8;
+                utf8_encode_buf8(buf, buf_size, str->u.str8, str->len);
             }
-            *q = '\0';
         }
     }
     return buf;
@@ -3311,6 +3298,7 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)
 
 /* return a string atom containing name concatenated with str1 */
 /* `str1` may be pure ASCII or UTF-8 encoded */
+// TODO(chqrlie): use string concatenation instead of UTF-8 conversion
 static JSAtom js_atom_concat_str(JSContext *ctx, JSAtom name, const char *str1)
 {
     JSValue str;
@@ -3863,64 +3851,44 @@ static JSValue string_buffer_end(StringBuffer *s)
 /* create a string from a UTF-8 buffer */
 JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
 {
-    const uint8_t *p, *p_end, *p_start, *p_next;
-    uint32_t c;
-    StringBuffer b_s, *b = &b_s;
-    size_t len1;
+    JSString *str;
+    size_t len;
+    int kind;
 
     if (buf_len <= 0) {
         return JS_AtomToString(ctx, JS_ATOM_empty_string);
     }
-    p_start = (const uint8_t *)buf;
-    p_end = p_start + buf_len;
-    p = p_start;
-    while (p < p_end && *p < 128)
-        p++;
-    len1 = p - p_start;
-    if (len1 > JS_STRING_LEN_MAX)
+    /* Compute string kind and length: 7-bit, 8-bit, 16-bit, 16-bit UTF-16 */
+    kind = utf8_scan(buf, buf_len, &len);
+    if (len > JS_STRING_LEN_MAX)
         return JS_ThrowRangeError(ctx, "invalid string length");
-    if (p == p_end) {
-        /* ASCII string */
-        return js_new_string8_len(ctx, buf, buf_len);
-    } else {
-        if (string_buffer_init(ctx, b, buf_len))
-            goto fail;
-        string_buffer_write8(b, p_start, len1);
-        while (p < p_end) {
-            if (*p < 128) {
-                string_buffer_putc8(b, *p++);
-            } else {
-                /* parse utf-8 sequence, return 0xFFFFFFFF for error */
-                c = unicode_from_utf8(p, p_end - p, &p_next);
-                if (c < 0x10000) {
-                    p = p_next;
-                } else if (c <= 0x10FFFF) {
-                    p = p_next;
-                    /* surrogate pair */
-                    string_buffer_putc16(b, get_hi_surrogate(c));
-                    c = get_lo_surrogate(c);
-                } else {
-                    /* invalid char */
-                    c = 0xfffd;
-                    /* skip the invalid chars */
-                    /* XXX: seems incorrect. Why not just use c = *p++; ? */
-                    while (p < p_end && (*p >= 0x80 && *p < 0xc0))
-                        p++;
-                    if (p < p_end) {
-                        p++;
-                        while (p < p_end && (*p >= 0x80 && *p < 0xc0))
-                            p++;
-                    }
-                }
-                string_buffer_putc16(b, c);
-            }
-        }
-    }
-    return string_buffer_end(b);
 
- fail:
-    string_buffer_free(b);
-    return JS_EXCEPTION;
+    switch (kind) {
+    case UTF8_PLAIN_ASCII:
+        str = js_alloc_string(ctx, len, 0);
+        if (!str)
+            return JS_EXCEPTION;
+        memcpy(str->u.str8, buf, len);
+        str->u.str8[len] = '\0';
+        break;
+    case UTF8_NON_ASCII:
+        /* buf contains non-ASCII code-points, but limited to 8-bit values */
+        str = js_alloc_string(ctx, len, 0);
+        if (!str)
+            return JS_EXCEPTION;
+        utf8_decode_buf8(str->u.str8, len + 1, buf, buf_len);
+        break;
+    default:
+        // This causes a potential problem in JS_ThrowError if message is invalid
+        //if (kind & UTF8_HAS_ERRORS)
+        //    return JS_ThrowRangeError(ctx, "invalid UTF-8 sequence");
+        str = js_alloc_string(ctx, len, 1);
+        if (!str)
+            return JS_EXCEPTION;
+        utf8_decode_buf16(str->u.str16, len, buf, buf_len);
+        break;
+    }
+    return JS_MKPTR(JS_TAG_STRING, str);
 }
 
 static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
@@ -4067,7 +4035,7 @@ go:
                         /* c = 0xfffd; */ /* error */
                     }
                 }
-                q += unicode_to_utf8(q, c);
+                q += utf8_encode(q, c);
             }
         }
     }
@@ -10073,6 +10041,7 @@ int JS_ToBool(JSContext *ctx, JSValue val)
     return JS_ToBoolFree(ctx, js_dup(val));
 }
 
+/* pc points to pure ASCII or UTF-8, null terminated contents */
 static int skip_spaces(const char *pc)
 {
     const uint8_t *p, *p_next, *p_start;
@@ -10080,19 +10049,19 @@ static int skip_spaces(const char *pc)
 
     p = p_start = (const uint8_t *)pc;
     for (;;) {
-        c = *p;
-        if (c < 128) {
+        c = *p++;
+        if (c < 0x80) {
             if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
                 break;
-            p++;
         } else {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            /* no need to test for invalid UTF-8, 0xFFFD is not a space */
             if (!lre_is_space(c))
                 break;
             p = p_next;
         }
     }
-    return p - p_start;
+    return p - 1 - p_start;
 }
 
 static inline int to_digit(int c)
@@ -18689,6 +18658,7 @@ static int js_parse_error_reserved_identifier(JSParseState *s)
 static __exception int js_parse_template_part(JSParseState *s,
                                               const uint8_t *p)
 {
+    const uint8_t *p_next;
     uint32_t c;
     StringBuffer b_s, *b = &b_s;
 
@@ -18726,9 +18696,8 @@ static __exception int js_parse_template_part(JSParseState *s,
             s->eol = &p[-1];
             s->mark = p;
         } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p) {
                 js_parse_error(s, "invalid UTF-8 sequence");
                 goto fail;
             }
@@ -18754,6 +18723,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                                        BOOL do_throw, const uint8_t *p,
                                        JSToken *token, const uint8_t **pp)
 {
+    const uint8_t *p_next;
     int ret;
     uint32_t c;
     StringBuffer b_s, *b = &b_s;
@@ -18832,9 +18802,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                     }
                     goto fail;
                 } else if (c >= 0x80) {
-                    const uint8_t *p_next;
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
-                    if (c > 0x10FFFF) {
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                    if (p_next == p + 1) {
                         goto invalid_utf8;
                     }
                     p = p_next;
@@ -18859,9 +18828,8 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                 break;
             }
         } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF)
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p)
                 goto invalid_utf8;
             p = p_next;
         }
@@ -18893,7 +18861,7 @@ static inline BOOL token_is_pseudo_keyword(JSParseState *s, JSAtom atom) {
 
 static __exception int js_parse_regexp(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
     BOOL in_class;
     StringBuffer b_s, *b = &b_s;
     StringBuffer b2_s, *b2 = &b2_s;
@@ -18932,9 +18900,8 @@ static __exception int js_parse_regexp(JSParseState *s)
             else if (c == '\0' && p >= s->buf_end)
                 goto eof_error;
             else if (c >= 0x80) {
-                const uint8_t *p_next;
-                c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-                if (c > 0x10FFFF) {
+                c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+                if (p_next == p) {
                     goto invalid_utf8;
                 }
                 p = p_next;
@@ -18942,9 +18909,8 @@ static __exception int js_parse_regexp(JSParseState *s)
                     goto eol_error;
             }
         } else if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p) {
             invalid_utf8:
                 js_parse_error(s, "invalid UTF-8 sequence");
                 goto fail;
@@ -18963,14 +18929,8 @@ static __exception int js_parse_regexp(JSParseState *s)
 
     /* flags */
     for(;;) {
-        const uint8_t *p_next = p;
-        c = *p_next++;
-        if (c >= 0x80) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p_next);
-            if (c > 0x10FFFF) {
-                goto invalid_utf8;
-            }
-        }
+        c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+        /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
         if (!lre_js_is_ident_next(c))
             break;
         if (string_buffer_putc(b2, c))
@@ -19020,10 +18980,10 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
 static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
                           BOOL *pident_has_escape, int c, BOOL is_private)
 {
-    const uint8_t *p, *p1;
+    const uint8_t *p, *p_next;
     char ident_buf[128], *buf;
     size_t ident_size, ident_pos;
-    JSAtom atom;
+    JSAtom atom = JS_ATOM_NULL;
 
     p = *pp;
     buf = ident_buf;
@@ -19032,28 +18992,26 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
     if (is_private)
         buf[ident_pos++] = '#';
     for(;;) {
-        p1 = p;
-
-        if (c < 128) {
+        if (c < 0x80) {
             buf[ident_pos++] = c;
         } else {
-            ident_pos += unicode_to_utf8((uint8_t*)buf + ident_pos, c);
+            ident_pos += utf8_encode((uint8_t*)buf + ident_pos, c);
         }
-        c = *p1++;
-        if (c == '\\' && *p1 == 'u') {
-            c = lre_parse_escape(&p1, TRUE);
+        c = *p;
+        p_next = p + 1;
+        if (c == '\\' && *p_next == 'u') {
+            c = lre_parse_escape(&p_next, TRUE);
             *pident_has_escape = TRUE;
-        } else if (c >= 128) {
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+        } else if (c >= 0x80) {
+            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
         }
         if (!lre_js_is_ident_next(c))
             break;
-        p = p1;
+        p = p_next;
         if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
-            if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf)) {
-                atom = JS_ATOM_NULL;
+            if (ident_realloc(s->ctx, &buf, &ident_size, ident_buf))
                 goto done;
-            }
         }
     }
     /* buf is pure ASCII or UTF-8 encoded */
@@ -19068,7 +19026,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
 
 static __exception int next_token(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
     int c;
     BOOL ident_has_escape;
     JSAtom atom;
@@ -19148,11 +19106,10 @@ static __exception int next_token(JSParseState *s)
                     s->got_lf = TRUE; /* considered as LF for ASI */
                     p++;
                 } else if (*p >= 0x80) {
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    /* ignore invalid UTF-8 in comments */
                     if (c == CP_LS || c == CP_PS) {
                         s->got_lf = TRUE; /* considered as LF for ASI */
-                    } else if (c == -1) {
-                        p++; /* skip invalid UTF-8 */
                     }
                 } else {
                     p++;
@@ -19170,12 +19127,11 @@ static __exception int next_token(JSParseState *s)
                 if (*p == '\r' || *p == '\n')
                     break;
                 if (*p >= 0x80) {
-                    c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    /* ignore invalid UTF-8 in comments */
                     /* LS or PS are considered as line terminator */
                     if (c == CP_LS || c == CP_PS) {
                         break;
-                    } else if (c == -1) {
-                        p++; /* skip invalid UTF-8 */
                     }
                 } else {
                     p++;
@@ -19265,20 +19221,21 @@ static __exception int next_token(JSParseState *s)
     case '#':
         /* private name */
         {
-            const uint8_t *p1;
             p++;
-            p1 = p;
-            c = *p1++;
-            if (c == '\\' && *p1 == 'u') {
-                c = lre_parse_escape(&p1, TRUE);
-            } else if (c >= 128) {
-                c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+            c = *p;
+            p_next = p + 1;
+            if (c == '\\' && *p_next == 'u') {
+                c = lre_parse_escape(&p_next, TRUE);
+            } else if (c >= 0x80) {
+                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                if (p_next == p + 1)
+                    goto invalid_utf8;
             }
             if (!lre_js_is_ident_first(c)) {
                 js_parse_error(s, "invalid first character of private name");
                 goto fail;
             }
-            p = p1;
+            p = p_next;
             ident_has_escape = FALSE; /* not used */
             atom = parse_ident(s, &p, &ident_has_escape, c, TRUE);
             if (atom == JS_ATOM_NULL)
@@ -19313,7 +19270,6 @@ static __exception int next_token(JSParseState *s)
     parse_number:
         {
             JSValue ret;
-            const uint8_t *p1;
             int flags;
             flags = ATOD_ACCEPT_BIN_OCT | ATOD_ACCEPT_LEGACY_OCTAL |
                 ATOD_ACCEPT_UNDERSCORES | ATOD_ACCEPT_SUFFIX;
@@ -19324,7 +19280,7 @@ static __exception int next_token(JSParseState *s)
                 goto fail;
             /* reject `10instanceof Number` */
             if (JS_VALUE_IS_NAN(ret) ||
-                lre_js_is_ident_next(unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1))) {
+                lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
                 JS_FreeValue(s->ctx, ret);
                 js_parse_error(s, "invalid number literal");
                 goto fail;
@@ -19516,9 +19472,11 @@ static __exception int next_token(JSParseState *s)
         }
         break;
     default:
-        if (c >= 128) {
-            /* unicode value */
-            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+        if (c >= 0x80) {  /* non-ASCII code-point */
+            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            if (p_next == p + 1)
+                goto invalid_utf8;
+            p = p_next;
             switch(c) {
             case CP_PS:
             case CP_LS:
@@ -19549,6 +19507,8 @@ static __exception int next_token(JSParseState *s)
     //    dump_token(s, &s->token);
     return 0;
 
+ invalid_utf8:
+    js_parse_error(s, "invalid UTF-8 sequence");
  fail:
     s->token.val = TOK_ERROR;
     return -1;
@@ -19573,7 +19533,7 @@ static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *ms
 
 static int json_parse_string(JSParseState *s, const uint8_t **pp)
 {
-    const uint8_t *p = *pp;
+    const uint8_t *p, *p_next;
     int i;
     uint32_t c;
     StringBuffer b_s, *b = &b_s;
@@ -19581,6 +19541,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
     if (string_buffer_init(s->ctx, b, 32))
         goto fail;
 
+    p = *pp;
     for(;;) {
         if (p >= s->buf_end) {
             goto end_of_input;
@@ -19622,9 +19583,8 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
             }
         } else
         if (c >= 0x80) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p - 1, s->buf_end - p, &p_next);
-            if (c > 0x10FFFF) {
+            c = utf8_decode(p - 1, s->buf_end - p, &p_next);
+            if (p_next == p) {
                 json_parse_error(s, p - 1, "Bad UTF-8 sequence");
                 goto fail;
             }
@@ -19722,7 +19682,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)
 
 static __exception int json_next_token(JSParseState *s)
 {
-    const uint8_t *p;
+    const uint8_t *p, *p_next;
     int c;
     JSAtom atom;
 
@@ -19826,10 +19786,9 @@ static __exception int json_next_token(JSParseState *s)
             goto fail;
         break;
     default:
-        if (c >= 128) {
-            const uint8_t *p_next;
-            c = unicode_from_utf8(p, s->buf_end - p, &p_next);
-            if (c == -1) {
+        if (c >= 0x80) {
+            c = utf8_decode(p, s->buf_end - p, &p_next);
+            if (p_next == p + 1) {
                 js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
             } else {
                 if (c > 0xFFFF) {
@@ -19951,12 +19910,10 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
             if (*p == '\n' || *p == '\r') {
                 break;
             } else if (*p >= 0x80) {
-                c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-                if (c == CP_LS || c == CP_PS) {
+                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                /* purposely ignore UTF-8 encoding errors in this comment line */
+                if (c == CP_LS || c == CP_PS)
                     break;
-                } else if (c == -1) {
-                    p++; /* skip invalid UTF-8 */
-                }
             } else {
                 p++;
             }