From 3f06c95558f64b205fc4803b4acce3a9cfced329 Mon Sep 17 00:00:00 2001
From: Charlie Gordon
Date: Sun, 7 Apr 2024 18:19:55 +0200
Subject: [PATCH] Use more functions for explicit surrogate handling (#353)

- add `is_surrogate`, `get_hi_surrogate` and `get_lo_surrogate`
- use surrogate functions instead of hard coded computations
---
Reviewer note: line breaks and context-line indentation were restored from a
whitespace-collapsed copy of this patch (hunk line counts were re-verified).
If a hunk fails to apply, retry with `git apply --ignore-whitespace`.

 cutils.h  | 19 +++++++++++++++++--
 quickjs.c | 30 +++++++++++++-----------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/cutils.h b/cutils.h
index f731eb9..f96e753 100644
--- a/cutils.h
+++ b/cutils.h
@@ -384,14 +384,29 @@ static inline void dbuf_set_error(DynBuf *s)
 int unicode_to_utf8(uint8_t *buf, unsigned int c);
 int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
 
+static inline BOOL is_surrogate(uint32_t c)
+{
+    return (c >> 11) == (0xD800 >> 11); // 0xD800-0xDFFF
+}
+
 static inline BOOL is_hi_surrogate(uint32_t c)
 {
-    return 54 == (c >> 10); // 0xD800-0xDBFF
+    return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF
 }
 
 static inline BOOL is_lo_surrogate(uint32_t c)
 {
-    return 55 == (c >> 10); // 0xDC00-0xDFFF
+    return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF
+}
+
+static inline uint32_t get_hi_surrogate(uint32_t c)
+{
+    return (c >> 10) - (0x10000 >> 10) + 0xD800;
+}
+
+static inline uint32_t get_lo_surrogate(uint32_t c)
+{
+    return (c & 0x3FF) | 0xDC00;
 }
 
 static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)
diff --git a/quickjs.c b/quickjs.c
index ad22f32..cc9ac0c 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -3664,10 +3664,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c)
 {
     if (unlikely(c >= 0x10000)) {
         /* surrogate pair */
-        c -= 0x10000;
-        if (string_buffer_putc16(s, (c >> 10) + 0xd800))
+        if (string_buffer_putc16(s, get_hi_surrogate(c)))
             return -1;
-        c = (c & 0x3ff) + 0xdc00;
+        c = get_lo_surrogate(c);
     }
     return string_buffer_putc16(s, c);
 }
@@ -3883,9 +3882,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
                 } else if (c <= 0x10FFFF) {
                     p = p_next;
                     /* surrogate pair */
-                    c -= 0x10000;
-                    string_buffer_putc16(b, (c >> 10) + 0xd800);
-                    c = (c & 0x3ff) + 0xdc00;
+                    string_buffer_putc16(b, get_hi_surrogate(c));
+                    c = get_lo_surrogate(c);
                 } else {
                     /* invalid char */
                     c = 0xfffd;
@@ -11508,7 +11506,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValue val1)
                 goto fail;
             break;
         default:
-            if (c < 32 || is_hi_surrogate(c) || is_lo_surrogate(c)) {
+            if (c < 32 || is_surrogate(c)) {
                 snprintf(buf, sizeof(buf), "\\u%04x", c);
                 if (string_buffer_puts8(b, buf))
                     goto fail;
@@ -19796,8 +19794,7 @@ static __exception int json_next_token(JSParseState *s)
                 js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
             } else {
                 if (c > 0xFFFF) {
-                    /* get high surrogate */
-                    c = (c >> 10) - (0x10000 >> 10) + 0xD800;
+                    c = get_hi_surrogate(c);
                 }
                 js_parse_error(s, "Unexpected token '\\u%04x' in JSON", c);
             }
@@ -39555,12 +39552,12 @@ static JSValue js_string_isWellFormed(JSContext *ctx, JSValue this_val,
 
     for (i = 0, n = p->len; i < n; i++) {
         c = p->u.str16[i];
-        if (c < 0xD800 || c > 0xDFFF)
+        if (!is_surrogate(c))
             continue;
-        if (c > 0xDBFF || i+1 == n)
+        if (is_lo_surrogate(c) || i + 1 == n)
             break;
         c = p->u.str16[++i];
-        if (c < 0xDC00 || c > 0xDFFF)
+        if (!is_lo_surrogate(c))
             break;
     }
 
@@ -39597,14 +39594,14 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValue this_val,
     p = JS_VALUE_GET_STRING(ret);
     for (i = 0, n = p->len; i < n; i++) {
         c = p->u.str16[i];
-        if (c < 0xD800 || c > 0xDFFF)
+        if (!is_surrogate(c))
             continue;
-        if (c > 0xDBFF || i+1 == n) {
+        if (is_lo_surrogate(c) || i + 1 == n) {
             p->u.str16[i] = 0xFFFD;
             continue;
         }
         c = p->u.str16[++i];
-        if (c < 0xDC00 || c > 0xDFFF)
+        if (!is_lo_surrogate(c))
             p->u.str16[--i] = 0xFFFD;
     }
 
@@ -46865,8 +46862,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValue this_val,
                     }
                     c = (c << 6) | (c1 & 0x3f);
                 }
-                if (c < c_min || c > 0x10FFFF ||
-                    is_hi_surrogate(c) || is_lo_surrogate(c)) {
+                if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
                     js_throw_URIError(ctx, "malformed UTF-8");
                     goto fail;
                 }