diff --git a/cutils.h b/cutils.h index e7d500f..4435e38 100644 --- a/cutils.h +++ b/cutils.h @@ -278,6 +278,21 @@ static inline void dbuf_set_error(DynBuf *s) int unicode_to_utf8(uint8_t *buf, unsigned int c); int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +static inline BOOL is_hi_surrogate(uint32_t c) +{ + return 54 == (c >> 10); // 0xD800-0xDBFF +} + +static inline BOOL is_lo_surrogate(uint32_t c) +{ + return 55 == (c >> 10); // 0xDC00-0xDFFF +} + +static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo) +{ + return 65536 + 1024 * (hi & 1023) + (lo & 1023); +} + static inline int from_hex(int c) { if (c >= '0' && c <= '9') diff --git a/libregexp.c b/libregexp.c index 3e6fcaa..d2fb427 100644 --- a/libregexp.c +++ b/libregexp.c @@ -550,7 +550,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16) } c = (c << 4) | h; } - if (c >= 0xd800 && c < 0xdc00 && + if (is_hi_surrogate(c) && allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { /* convert an escaped surrogate pair into a unicode char */ @@ -561,9 +561,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16) break; c1 = (c1 << 4) | h; } - if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { + if (i == 4 && is_lo_surrogate(c1)) { p += 6; - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } } } @@ -1092,10 +1092,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) break; } else if (c >= 128) { c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if (c >= 0xD800 && c <= 0xDBFF) { + if (is_hi_surrogate(c)) { d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); - if (d >= 0xDC00 && d <= 0xDFFF) { - c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00); + if (is_lo_surrogate(d)) { + c = from_surrogate(c, d); p = p1; } } @@ -1935,88 +1935,81 @@ static BOOL is_word_char(uint32_t c) if (cbuf_type == 0) { \ c = *cptr++; \ } else { \ - uint32_t __c1; \ - c = *(uint16_t *)cptr; \ - cptr += 2; \ - if (c >= 0xd800 && c < 0xdc00 && \ - cbuf_type == 2 && cptr < cbuf_end) { \ - __c1 = *(uint16_t *)cptr; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ - cptr += 2; \ - } \ - } \ + const uint16_t *_p = (uint16_t *)cptr; \ + const uint16_t *_end = (uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c)) \ + if (cbuf_type == 2) \ + if (_p < _end) \ + if (is_lo_surrogate(*_p)) \ + c = from_surrogate(c, *_p++); \ + cptr = (void *) _p; \ } \ } while (0) -#define PEEK_CHAR(c, cptr, cbuf_end) \ - do { \ - if (cbuf_type == 0) { \ - c = cptr[0]; \ - } else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xd800 && c < 0xdc00 && \ - cbuf_type == 2 && (cptr + 2) < cbuf_end) { \ - __c1 = ((uint16_t *)cptr)[1]; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ - } \ - } \ - } \ - } while (0) - -#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - c = cptr[-1]; \ - } else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xdc00 && c < 0xe000 && \ - cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-2]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ - } \ - } \ +#define PEEK_CHAR(c, cptr, cbuf_end) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[0]; \ + } else { \ + const uint16_t *_p = (uint16_t *)cptr; \ + const uint16_t *_end = (uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c)) \ + if (cbuf_type == 2) \ + if (_p < _end) \ + if (is_lo_surrogate(*_p)) \ + c = from_surrogate(c, *_p++); \ } \ } while (0) -#define GET_PREV_CHAR(c, cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - cptr--; \ - c = cptr[0]; \ - } else { \ - uint32_t __c1; \ - cptr -= 2; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && \ - cbuf_type == 2 && cptr > cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-1]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ - cptr -= 2; \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ - } \ - } \ +#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[-1]; \ + } else { \ + const uint16_t *_p = (uint16_t *)cptr - 1; \ + const uint16_t *_start = (uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c)) \ + if (cbuf_type == 2) \ + if (_p > _start) \ + if (is_hi_surrogate(*--_p)) \ + c = from_surrogate(*_p, c); \ } \ } while (0) -#define PREV_CHAR(cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - cptr--; \ - } else { \ - cptr -= 2; \ - if (cbuf_type == 2) { \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xd800 && c < 0xdc00) \ - cptr -= 2; \ - } \ - } \ +#define GET_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + c = cptr[0]; \ + } else { \ + const uint16_t *_p = (uint16_t *)cptr - 1; \ + const uint16_t *_start = (uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c)) \ + if (cbuf_type == 2) \ + if (_p > _start) \ + if (is_hi_surrogate(*--_p)) \ + c = from_surrogate(*_p, c); \ + cptr = (void *) _p; \ + } \ + } while (0) + +#define PREV_CHAR(cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + } else { \ + const uint16_t *_p = (uint16_t *)cptr - 1; \ + const uint16_t *_start = (uint16_t *)cbuf_start; \ + if (is_lo_surrogate(*_p)) \ + if (cbuf_type == 2) \ + if (_p > _start) \ + if (is_hi_surrogate(_p[-1])) \ + _p--; \ + cptr = (void *) _p; \ } \ } while (0) diff --git a/quickjs.c b/quickjs.c index 5cdee41..a1af736 100644 --- a/quickjs.c +++ b/quickjs.c @@ -3501,10 +3501,10 @@ static int string_getc(const JSString *p, int *pidx) idx = *pidx; if (p->is_wide_char) { c = p->u.str16[idx++]; - if (c >= 0xd800 && c < 0xdc00 && idx < p->len) { + if (is_hi_surrogate(c) && idx < p->len) { c1 = p->u.str16[idx]; - if (c1 >= 0xdc00 && c1 < 0xe000) { - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + if (is_lo_surrogate(c1)) { + c = from_surrogate(c, c1); idx++; } } @@ -3842,13 +3842,12 @@ const char *JS_ToCStringLen2(JSContext *ctx, size_t *plen, JSValueConst val1, BO if (c < 0x80) { *q++ = c; } else { - if (c >= 0xd800 && c < 0xdc00) { + if (is_hi_surrogate(c)) { if (pos < len && !cesu8) { c1 = src[pos]; - if (c1 >= 0xdc00 && c1 < 0xe000) { + if (is_lo_surrogate(c1)) { pos++; - /* surrogate pair */ - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } else { /* Keep unmatched surrogate code points */ /* c = 0xfffd; */ /* error */ @@ -11087,7 +11086,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValueConst val1) goto fail; break; default: - if (c < 32 || (c >= 0xd800 && c < 0xe000)) { + if (c < 32 || is_hi_surrogate(c) || is_lo_surrogate(c)) { snprintf(buf, sizeof(buf), "\\u%04x", c); if (string_buffer_puts8(b, buf)) goto fail; @@ -39098,10 +39097,10 @@ static int string_prevc(JSString *p, int *pidx) idx--; if (p->is_wide_char) { c = p->u.str16[idx]; - if (c >= 0xdc00 && c < 0xe000 && idx > 0) { + if (is_lo_surrogate(c) && idx > 0) { c1 = p->u.str16[idx - 1]; - if (c1 >= 0xd800 && c1 <= 0xdc00) { - c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; + if (is_hi_surrogate(c1)) { + c = from_surrogate(c1, c); idx--; } } @@ -45453,7 +45452,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValueConst this_val, c = (c << 6) | (c1 & 0x3f); } if (c < c_min || c > 0x10FFFF || - (c >= 0xd800 && c < 0xe000)) { + is_hi_surrogate(c) || is_lo_surrogate(c)) { js_throw_URIError(ctx, "malformed UTF-8"); goto fail; } @@ -45528,21 +45527,21 @@ static JSValue js_global_encodeURI(JSContext *ctx, JSValueConst this_val, if (isURIUnescaped(c, isComponent)) { string_buffer_putc16(b, c); } else { - if (c >= 0xdc00 && c <= 0xdfff) { + if (is_lo_surrogate(c)) { js_throw_URIError(ctx, "invalid character"); goto fail; - } else if (c >= 0xd800 && c <= 0xdbff) { + } else if (is_hi_surrogate(c)) { if (k >= p->len) { js_throw_URIError(ctx, "expecting surrogate pair"); goto fail; } c1 = string_get(p, k); k++; - if (c1 < 0xdc00 || c1 > 0xdfff) { + if (!is_lo_surrogate(c1)) { js_throw_URIError(ctx, "expecting surrogate pair"); goto fail; } - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } if (c < 0x80) { encodeURI_hex(b, c);