From b56a82d19ff5cc0df13fffa9591e2c588aa901b8 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 21 Nov 2023 00:00:54 +0100 Subject: [PATCH] Normalize inputs to String.prototype.localeCompare (#97) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NFC-normalize the inputs, otherwise strings like "Å" and "A\u030A" (Latin A followed by a combining diacritical mark) don't compare equal. Both operands are converted to UTF-32, normalized with unicode_normalize(UNICODE_NFC) and then compared code point by code point; when one normalized string is a prefix of the other, the shorter one sorts first. Supporting changes: add a compare_u32() helper that returns -1, 0 or 1 and use it for the length tie-break in js_string_compare(); factor the UTF-32 conversion out of JS_ToUTF32String() into to_utf32_buf() so that localeCompare can reuse it. Remove the two 15.5.4.9_CE.js entries (ö written as \u006f\u0308 versus \u00f6) from test262_errors.txt. --- quickjs.c | 108 +++++++++++++++++++++++++++++++-------------- test262_errors.txt | 2 - 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/quickjs.c b/quickjs.c index 52d4375..910708e 100644 --- a/quickjs.c +++ b/quickjs.c @@ -1151,6 +1151,11 @@ static const JSClassExoticMethods js_proxy_exotic_methods; static const JSClassExoticMethods js_module_ns_exotic_methods; static JSClassID js_class_id_alloc = JS_CLASS_INIT_COUNT; +static int compare_u32(uint32_t a, uint32_t b) +{ + return -(a < b) + (b < a); // -1, 0 or 1 +} + static JSValue js_int32(int32_t v) { return JS_MKVAL(JS_TAG_INT, v); @@ -3930,14 +3935,8 @@ static int js_string_compare(JSContext *ctx, int res, len; len = min_int(p1->len, p2->len); res = js_string_memcmp(p1, p2, len); - if (res == 0) { - if (p1->len == p2->len) - res = 0; - else if (p1->len < p2->len) - res = -1; - else - res = 1; - } + if (res == 0) + res = compare_u32(p1->len, p2->len); return res; } @@ -39138,24 +39137,80 @@ static BOOL test_final_sigma(JSString *p, int sigma_pos) return !lre_is_cased(c1); } +static int to_utf32_buf(JSContext *ctx, JSString *p, uint32_t **pbuf) +{ + uint32_t *b; + int i, j, n; + + j = -1; + n = p->len; + b = js_malloc(ctx, max_int(1, n) * sizeof(*b)); + if (b) + for (i = j = 0; i < n;) + b[j++] = string_getc(p, &i); + *pbuf = b; + return j; +} + static JSValue js_string_localeCompare(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv) { - JSValue a, b; - int cmp; + int i, n, an, bn, cmp; + uint32_t *as, *bs, *ts; + JSValue a, b, ret; + + ret = JS_EXCEPTION; + as = 
NULL; + bs = NULL; a = JS_ToStringCheckObject(ctx, this_val); if (JS_IsException(a)) return JS_EXCEPTION; + b = JS_ToString(ctx, argv[0]); - if (JS_IsException(b)) { - JS_FreeValue(ctx, a); - return JS_EXCEPTION; - } - cmp = js_string_compare(ctx, JS_VALUE_GET_STRING(a), JS_VALUE_GET_STRING(b)); + if (JS_IsException(b)) + goto exception; + + an = to_utf32_buf(ctx, JS_VALUE_GET_STRING(a), &as); + if (an == -1) + goto exception; + + bn = to_utf32_buf(ctx, JS_VALUE_GET_STRING(b), &bs); + if (bn == -1) + goto exception; + + // TODO(bnoordhuis) skip normalization when input is latin1 + an = unicode_normalize(&ts, as, an, UNICODE_NFC, ctx, + (DynBufReallocFunc *)js_realloc); + if (an == -1) + goto exception; + js_free(ctx, as); + as = ts; + + // TODO(bnoordhuis) skip normalization when input is latin1 + bn = unicode_normalize(&ts, bs, bn, UNICODE_NFC, ctx, + (DynBufReallocFunc *)js_realloc); + if (bn == -1) + goto exception; + js_free(ctx, bs); + bs = ts; + + n = min_int(an, bn); + for (i = 0; i < n; i++) + if (as[i] != bs[i]) + break; + if (i < n) + cmp = compare_u32(as[i], bs[i]); + else + cmp = compare_u32(an, bn); + ret = js_int32(cmp); + +exception: JS_FreeValue(ctx, a); JS_FreeValue(ctx, b); - return JS_NewInt32(ctx, cmp); + js_free(ctx, as); + js_free(ctx, bs); + return ret; } static JSValue js_string_toLowerCase(JSContext *ctx, JSValueConst this_val, @@ -39200,29 +39255,14 @@ static JSValue js_string_toLowerCase(JSContext *ctx, JSValueConst this_val, static int JS_ToUTF32String(JSContext *ctx, uint32_t **pbuf, JSValueConst val1) { JSValue val; - JSString *p; - uint32_t *buf; - int i, j, len; + int len; val = JS_ToString(ctx, val1); if (JS_IsException(val)) return -1; - p = JS_VALUE_GET_STRING(val); - len = p->len; - /* UTF32 buffer length is len minus the number of correct surrogates pairs */ - buf = js_malloc(ctx, sizeof(buf[0]) * max_int(len, 1)); - if (!buf) { - JS_FreeValue(ctx, val); - goto fail; - } - for(i = j = 0; i < len;) - buf[j++] = string_getc(p, 
&i); + len = to_utf32_buf(ctx, JS_VALUE_GET_STRING(val), pbuf); JS_FreeValue(ctx, val); - *pbuf = buf; - return j; - fail: - *pbuf = NULL; - return -1; + return len; } static JSValue JS_NewUTF32String(JSContext *ctx, const uint32_t *buf, int len) diff --git a/test262_errors.txt b/test262_errors.txt index f10cf1a..4559160 100644 --- a/test262_errors.txt +++ b/test262_errors.txt @@ -37,8 +37,6 @@ test262/test/built-ins/Function/internals/Construct/derived-this-uninitialized-r test262/test/built-ins/Function/internals/Construct/derived-this-uninitialized-realm.js:20: strict mode: Test262Error: Expected a ReferenceError but got a different error constructor with the same name test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: strict mode: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier -test262/test/built-ins/String/prototype/localeCompare/15.5.4.9_CE.js:62: Test262Error: String.prototype.localeCompare considers ö (\u006f\u0308) ≠ ö (\u00f6). -test262/test/built-ins/String/prototype/localeCompare/15.5.4.9_CE.js:62: strict mode: Test262Error: String.prototype.localeCompare considers ö (\u006f\u0308) ≠ ö (\u00f6). test262/test/built-ins/TypedArray/prototype/set/array-arg-targetbuffer-detached-on-get-src-value-no-throw.js:30: TypeError: out-of-bound numeric index (Testing with Float64Array.) test262/test/built-ins/TypedArray/prototype/set/array-arg-targetbuffer-detached-on-get-src-value-no-throw.js:30: strict mode: TypeError: out-of-bound numeric index (Testing with Float64Array.) test262/test/built-ins/TypedArray/prototype/sort/sort-tonumber.js:30: TypeError: ArrayBuffer is detached (Testing with Float64Array.)