Use more functions for explicit surrogate handling (#353)

- add `is_surrogate`, `get_hi_surrogate` and `get_lo_surrogate`
- use surrogate functions instead of hard coded computations
This commit is contained in:
Charlie Gordon 2024-04-07 18:19:55 +02:00 committed by GitHub
parent 1db884b140
commit 3f06c95558
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 30 additions and 19 deletions

View file

@ -384,14 +384,29 @@ static inline void dbuf_set_error(DynBuf *s)
int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
/* Tell whether `c` lies in the UTF-16 surrogate range 0xD800-0xDFFF
   (either half of a surrogate pair). Returns non-zero if so, 0 otherwise. */
static inline BOOL is_surrogate(uint32_t c)
{
    return c >= 0xD800 && c <= 0xDFFF;
}
/* Tell whether `c` is a high (leading) surrogate, i.e. in 0xD800-0xDBFF.
   All values of that 1024-wide range share the same bits above bit 9, so
   comparing `c >> 10` against `0xD800 >> 10` tests the whole range at once.
   Returns non-zero if so, 0 otherwise. */
static inline BOOL is_hi_surrogate(uint32_t c)
{
    return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF
}
/* Tell whether `c` is a low (trailing) surrogate, i.e. in 0xDC00-0xDFFF.
   All values of that 1024-wide range share the same bits above bit 9, so
   comparing `c >> 10` against `0xDC00 >> 10` tests the whole range at once.
   Returns non-zero if so, 0 otherwise. */
static inline BOOL is_lo_surrogate(uint32_t c)
{
    return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF
}
/* Compute the high (leading) surrogate for code point `c`.
   For `c` in 0x10000-0x10FFFF the result is in 0xD800-0xDBFF.
   No range check is done on `c`; the arithmetic is plain unsigned math. */
static inline uint32_t get_hi_surrogate(uint32_t c)
{
    /* 0xD800 minus the 0x10000 offset expressed in 1024-wide units */
    const uint32_t lead_bias = 0xD800 - (0x10000 >> 10);

    return lead_bias + (c >> 10);
}
/* Compute the low (trailing) surrogate for code point `c`: the low 10 bits
   of `c` merged into 0xDC00, so the result is always in 0xDC00-0xDFFF. */
static inline uint32_t get_lo_surrogate(uint32_t c)
{
    const uint32_t low10_mask = 0x3FF;
    uint32_t trail = 0xDC00;

    trail |= c & low10_mask;
    return trail;
}
static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)

View file

@ -3664,10 +3664,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c)
{
    /* Append `c` to `s` through string_buffer_putc16().
       Values >= 0x10000 are written as two 16-bit units: the high surrogate
       first, then the low surrogate. Returns -1 as soon as the high
       surrogate write reports failure; otherwise returns whatever the final
       string_buffer_putc16() call returns. */
    if (unlikely(c >= 0x10000)) {
        /* surrogate pair: the helpers take the original code point, so `c`
           must not be offset by 0x10000 before calling them */
        if (string_buffer_putc16(s, get_hi_surrogate(c)))
            return -1;
        c = get_lo_surrogate(c);
    }
    return string_buffer_putc16(s, c);
}
@ -3883,9 +3882,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
} else if (c <= 0x10FFFF) {
p = p_next;
/* surrogate pair */
c -= 0x10000;
string_buffer_putc16(b, (c >> 10) + 0xd800);
c = (c & 0x3ff) + 0xdc00;
string_buffer_putc16(b, get_hi_surrogate(c));
c = get_lo_surrogate(c);
} else {
/* invalid char */
c = 0xfffd;
@ -11508,7 +11506,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValue val1)
goto fail;
break;
default:
if (c < 32 || is_hi_surrogate(c) || is_lo_surrogate(c)) {
if (c < 32 || is_surrogate(c)) {
snprintf(buf, sizeof(buf), "\\u%04x", c);
if (string_buffer_puts8(b, buf))
goto fail;
@ -19796,8 +19794,7 @@ static __exception int json_next_token(JSParseState *s)
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else {
if (c > 0xFFFF) {
/* get high surrogate */
c = (c >> 10) - (0x10000 >> 10) + 0xD800;
c = get_hi_surrogate(c);
}
js_parse_error(s, "Unexpected token '\\u%04x' in JSON", c);
}
@ -39555,12 +39552,12 @@ static JSValue js_string_isWellFormed(JSContext *ctx, JSValue this_val,
for (i = 0, n = p->len; i < n; i++) {
c = p->u.str16[i];
if (c < 0xD800 || c > 0xDFFF)
if (!is_surrogate(c))
continue;
if (c > 0xDBFF || i+1 == n)
if (is_lo_surrogate(c) || i + 1 == n)
break;
c = p->u.str16[++i];
if (c < 0xDC00 || c > 0xDFFF)
if (!is_lo_surrogate(c))
break;
}
@ -39597,14 +39594,14 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValue this_val,
p = JS_VALUE_GET_STRING(ret);
for (i = 0, n = p->len; i < n; i++) {
c = p->u.str16[i];
if (c < 0xD800 || c > 0xDFFF)
if (!is_surrogate(c))
continue;
if (c > 0xDBFF || i+1 == n) {
if (is_lo_surrogate(c) || i + 1 == n) {
p->u.str16[i] = 0xFFFD;
continue;
}
c = p->u.str16[++i];
if (c < 0xDC00 || c > 0xDFFF)
if (!is_lo_surrogate(c))
p->u.str16[--i] = 0xFFFD;
}
@ -46865,8 +46862,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValue this_val,
}
c = (c << 6) | (c1 & 0x3f);
}
if (c < c_min || c > 0x10FFFF ||
is_hi_surrogate(c) || is_lo_surrogate(c)) {
if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
js_throw_URIError(ctx, "malformed UTF-8");
goto fail;
}