Use more functions for explicit surrogate handling (#353)

- add `is_surrogate`, `get_hi_surrogate` and `get_lo_surrogate`
- use surrogate functions instead of hard-coded computations
This commit is contained in:
Charlie Gordon 2024-04-07 18:19:55 +02:00 committed by GitHub
parent 1db884b140
commit 3f06c95558
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 30 additions and 19 deletions

View file

@ -384,14 +384,29 @@ static inline void dbuf_set_error(DynBuf *s)
int unicode_to_utf8(uint8_t *buf, unsigned int c); int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
/* Return non-zero when `c` is any UTF-16 surrogate code unit, i.e. when it
   lies in 0xD800-0xDFFF (either half of a surrogate pair). */
static inline BOOL is_surrogate(uint32_t c)
{
    /* Clearing the low 11 bits leaves exactly 0xD800 for every value in
       the 2048-entry surrogate range and for no other value. */
    const uint32_t range_mask = 0xFFFFF800u;

    return (c & range_mask) == 0xD800;
}
static inline BOOL is_hi_surrogate(uint32_t c) static inline BOOL is_hi_surrogate(uint32_t c)
{ {
return 54 == (c >> 10); // 0xD800-0xDBFF return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF
} }
static inline BOOL is_lo_surrogate(uint32_t c) static inline BOOL is_lo_surrogate(uint32_t c)
{ {
return 55 == (c >> 10); // 0xDC00-0xDFFF return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF
}
/* Compute the high (leading) UTF-16 surrogate for code point `c`.
   For `c` in 0x10000-0x10FFFF the result lies in 0xD800-0xDBFF.
   No range check is performed on `c`. */
static inline uint32_t get_hi_surrogate(uint32_t c)
{
    /* Subtracting 0x10000 from the code point before shifting is the same
       as removing (0x10000 >> 10) after shifting, so that offset is
       combined with the 0xD800 base into a single bias (0xD7C0). */
    const uint32_t bias = 0xD800 - (0x10000 >> 10);

    return (c >> 10) + bias;
}
/* Compute the low (trailing) UTF-16 surrogate for code point `c`:
   the low 10 bits of `c` OR-ed with 0xDC00, giving a value in
   0xDC00-0xDFFF. No range check is performed on `c`. */
static inline uint32_t get_lo_surrogate(uint32_t c)
{
return (c & 0x3FF) | 0xDC00;
} }
static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo) static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)

View file

@ -3664,10 +3664,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c)
{ {
if (unlikely(c >= 0x10000)) { if (unlikely(c >= 0x10000)) {
/* surrogate pair */ /* surrogate pair */
c -= 0x10000; if (string_buffer_putc16(s, get_hi_surrogate(c)))
if (string_buffer_putc16(s, (c >> 10) + 0xd800))
return -1; return -1;
c = (c & 0x3ff) + 0xdc00; c = get_lo_surrogate(c);
} }
return string_buffer_putc16(s, c); return string_buffer_putc16(s, c);
} }
@ -3883,9 +3882,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
} else if (c <= 0x10FFFF) { } else if (c <= 0x10FFFF) {
p = p_next; p = p_next;
/* surrogate pair */ /* surrogate pair */
c -= 0x10000; string_buffer_putc16(b, get_hi_surrogate(c));
string_buffer_putc16(b, (c >> 10) + 0xd800); c = get_lo_surrogate(c);
c = (c & 0x3ff) + 0xdc00;
} else { } else {
/* invalid char */ /* invalid char */
c = 0xfffd; c = 0xfffd;
@ -11508,7 +11506,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValue val1)
goto fail; goto fail;
break; break;
default: default:
if (c < 32 || is_hi_surrogate(c) || is_lo_surrogate(c)) { if (c < 32 || is_surrogate(c)) {
snprintf(buf, sizeof(buf), "\\u%04x", c); snprintf(buf, sizeof(buf), "\\u%04x", c);
if (string_buffer_puts8(b, buf)) if (string_buffer_puts8(b, buf))
goto fail; goto fail;
@ -19796,8 +19794,7 @@ static __exception int json_next_token(JSParseState *s)
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p); js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else { } else {
if (c > 0xFFFF) { if (c > 0xFFFF) {
/* get high surrogate */ c = get_hi_surrogate(c);
c = (c >> 10) - (0x10000 >> 10) + 0xD800;
} }
js_parse_error(s, "Unexpected token '\\u%04x' in JSON", c); js_parse_error(s, "Unexpected token '\\u%04x' in JSON", c);
} }
@ -39555,12 +39552,12 @@ static JSValue js_string_isWellFormed(JSContext *ctx, JSValue this_val,
for (i = 0, n = p->len; i < n; i++) { for (i = 0, n = p->len; i < n; i++) {
c = p->u.str16[i]; c = p->u.str16[i];
if (c < 0xD800 || c > 0xDFFF) if (!is_surrogate(c))
continue; continue;
if (c > 0xDBFF || i+1 == n) if (is_lo_surrogate(c) || i + 1 == n)
break; break;
c = p->u.str16[++i]; c = p->u.str16[++i];
if (c < 0xDC00 || c > 0xDFFF) if (!is_lo_surrogate(c))
break; break;
} }
@ -39597,14 +39594,14 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValue this_val,
p = JS_VALUE_GET_STRING(ret); p = JS_VALUE_GET_STRING(ret);
for (i = 0, n = p->len; i < n; i++) { for (i = 0, n = p->len; i < n; i++) {
c = p->u.str16[i]; c = p->u.str16[i];
if (c < 0xD800 || c > 0xDFFF) if (!is_surrogate(c))
continue; continue;
if (c > 0xDBFF || i+1 == n) { if (is_lo_surrogate(c) || i + 1 == n) {
p->u.str16[i] = 0xFFFD; p->u.str16[i] = 0xFFFD;
continue; continue;
} }
c = p->u.str16[++i]; c = p->u.str16[++i];
if (c < 0xDC00 || c > 0xDFFF) if (!is_lo_surrogate(c))
p->u.str16[--i] = 0xFFFD; p->u.str16[--i] = 0xFFFD;
} }
@ -46865,8 +46862,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValue this_val,
} }
c = (c << 6) | (c1 & 0x3f); c = (c << 6) | (c1 & 0x3f);
} }
if (c < c_min || c > 0x10FFFF || if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
is_hi_surrogate(c) || is_lo_surrogate(c)) {
js_throw_URIError(ctx, "malformed UTF-8"); js_throw_URIError(ctx, "malformed UTF-8");
goto fail; goto fail;
} }