Optimize RegExp ASCII literal matching (#94)

Add REOP_char8 that matches single bytes. Compresses bytecode for the ASCII common case by 33% and reduces regexp_ascii benchmark running time by 4%. The regexp_utf16 benchmark is unaffected.
2023-11-19 17:26:45 +01:00 · 2023-11-19 17:26:45 +01:00 · e2bc6441f8
commit e2bc6441f8
parent e49da8e96f
3 changed files with 47 additions and 12 deletions
--- a/libregexp-opcode.h
+++ b/libregexp-opcode.h
@ -25,7 +25,8 @@
 #ifdef DEF

 DEF(invalid, 1) /* never used */
-DEF(char, 3)
+DEF(char8, 2) /* 7 bits in fact */
+DEF(char16, 3)
 DEF(char32, 5)
 DEF(dot, 1)
 DEF(any, 1) /* same as dot but match any character including line terminator */
--- a/libregexp.c
+++ b/libregexp.c
@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
        }
        printf("%s", reopcode_info[opcode].name);
        switch(opcode) {
-        case REOP_char:
+        case REOP_char8:
+            val = get_u8(buf + pos + 1);
+            goto printchar;
+        case REOP_char16:
            val = get_u16(buf + pos + 1);
-            if (val >= ' ' && val <= 126)
-                printf(" '%c'", val);
-            else
-                printf(" 0x%04x", val);
-            break;
+            goto printchar;
        case REOP_char32:
            val = get_u32(buf + pos + 1);
+        printchar:
            if (val >= ' ' && val <= 126)
                printf(" '%c'", val);
            else
@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
            val = get_u16(bc_buf + pos + 1);
            len += val * 8;
            goto simple_char;
-        case REOP_char:
        case REOP_char32:
+        case REOP_char16:
+        case REOP_char8:
        case REOP_dot:
        case REOP_any:
        simple_char:
@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
            val = get_u16(bc_buf + pos + 1);
            len += val * 8;
            goto simple_char;
-        case REOP_char:
        case REOP_char32:
+        case REOP_char16:
+        case REOP_char8:
        case REOP_dot:
        case REOP_any:
        simple_char:
@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
        } else {
            if (s->ignore_case)
                c = lre_canonicalize(c, s->is_utf16);
-            if (c <= 0xffff)
-                re_emit_op_u16(s, REOP_char, c);
+            if (c <= 0x7f)
+                re_emit_op_u8(s, REOP_char8, c);
+            else if (c <= 0xffff)
+                re_emit_op_u16(s, REOP_char16, c);
            else
                re_emit_op_u32(s, REOP_char32, c);
        }
@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
            val = get_u32(pc);
            pc += 4;
            goto test_char;
-        case REOP_char:
+        case REOP_char16:
            val = get_u16(pc);
            pc += 2;
+            goto test_char;
+        case REOP_char8:
+            val = get_u8(pc);
+            pc += 1;
        test_char:
            if (cptr >= cbuf_end)
                goto no_match;
--- a/tests/microbench.js
+++ b/tests/microbench.js
@ -654,6 +654,30 @@ function math_min(n)
    return n * 1000;
 }

+function regexp_ascii(n) /* benchmark: exec() an ASCII-only literal regexp against an ASCII-only string */
+{
+    var i, j, r, s;
+    s = "the quick brown fox jumped over the lazy dog"
+    for(j = 0; j < n; j++) {
+        for(i = 0; i < 10000; i++) /* 10000 matches per outer iteration */
+            r = /the quick brown fox/.exec(s)
+        global_res = r; /* global_res is not declared in this function; presumably a file-level sink for results -- confirm */
+    }
+    return n * 10000; /* total number of exec() calls performed (n outer x 10000 inner) */
+}
+
+function regexp_utf16(n) /* benchmark: exec() a literal regexp containing non-ASCII characters against a matching string */
+{
+    var i, j, r, s;
+    s = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ"
+    for(j = 0; j < n; j++) {
+        for(i = 0; i < 10000; i++) /* 10000 matches per outer iteration */
+            r = /the quick brown ᶠᵒˣ/.exec(s)
+        global_res = r; /* global_res is not declared in this function; presumably a file-level sink for results -- confirm */
+    }
+    return n * 10000; /* total number of exec() calls performed (n outer x 10000 inner) */
+}
+
 /* incremental string construction as local var */
 function string_build1(n)
 {
@ -951,6 +975,8 @@ function main(argc, argv, g)
        array_for_in,
        array_for_of,
        math_min,
+        regexp_ascii,
+        regexp_utf16,
        string_build1,
        string_build2,
        //string_build3,