From e2bc6441f88efcd35ab1f797a3d923e7dc7d4d63 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Sun, 19 Nov 2023 17:26:45 +0100 Subject: [PATCH] Optimize RegExp ASCII literal matching (#94) Add REOP_char8 that matches single bytes. Compresses bytecode for the ASCII common case by 33% and reduces regexp_ascii benchmark running time by 4%. The regexp_utf16 benchmark is unaffected. --- libregexp-opcode.h | 3 ++- libregexp.c | 30 +++++++++++++++++++----------- tests/microbench.js | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/libregexp-opcode.h b/libregexp-opcode.h index cd3683f..7fdc887 100644 --- a/libregexp-opcode.h +++ b/libregexp-opcode.h @@ -25,7 +25,8 @@ #ifdef DEF DEF(invalid, 1) /* never used */ -DEF(char, 3) +DEF(char8, 2) /* 7 bits in fact */ +DEF(char16, 3) DEF(char32, 5) DEF(dot, 1) DEF(any, 1) /* same as dot but match any character including line terminator */ diff --git a/libregexp.c b/libregexp.c index 4406922..c04765f 100644 --- a/libregexp.c +++ b/libregexp.c @@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, } printf("%s", reopcode_info[opcode].name); switch(opcode) { - case REOP_char: + case REOP_char8: + val = get_u8(buf + pos + 1); + goto printchar; + case REOP_char16: val = get_u16(buf + pos + 1); - if (val >= ' ' && val <= 126) - printf(" '%c'", val); - else - printf(" 0x%04x", val); - break; + goto printchar; case REOP_char32: val = get_u32(buf + pos + 1); + printchar: if (val >= ' ' && val <= 126) printf(" '%c'", val); else @@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) val = get_u16(bc_buf + pos + 1); len += val * 8; goto simple_char; - case REOP_char: case REOP_char32: + case REOP_char16: + case REOP_char8: case REOP_dot: case REOP_any: simple_char: @@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) val = get_u16(bc_buf + pos + 1); len += val * 8; goto simple_char; - case REOP_char: case REOP_char32: + case REOP_char16: + case REOP_char8: case REOP_dot: case REOP_any: simple_char: @@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) } else { if (s->ignore_case) c = lre_canonicalize(c, s->is_utf16); - if (c <= 0xffff) - re_emit_op_u16(s, REOP_char, c); + if (c <= 0x7f) + re_emit_op_u8(s, REOP_char8, c); + else if (c <= 0xffff) + re_emit_op_u16(s, REOP_char16, c); else re_emit_op_u32(s, REOP_char32, c); } @@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, val = get_u32(pc); pc += 4; goto test_char; - case REOP_char: + case REOP_char16: val = get_u16(pc); pc += 2; + goto test_char; + case REOP_char8: + val = get_u8(pc); + pc += 1; test_char: if (cptr >= cbuf_end) goto no_match; diff --git a/tests/microbench.js b/tests/microbench.js index 19af59c..52108da 100644 --- a/tests/microbench.js +++ b/tests/microbench.js @@ -654,6 +654,30 @@ function math_min(n) return n * 1000; } +function regexp_ascii(n) +{ + var i, j, r, s; + s = "the quick brown fox jumped over the lazy dog" + for(j = 0; j < n; j++) { + for(i = 0; i < 10000; i++) + r = /the quick brown fox/.exec(s) + global_res = r; + } + return n * 10000; +} + +function regexp_utf16(n) +{ + var i, j, r, s; + s = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ" + for(j = 0; j < n; j++) { + for(i = 0; i < 10000; i++) + r = /the quick brown ᶠᵒˣ/.exec(s) + global_res = r; + } + return n * 10000; +} + /* incremental string contruction as local var */ function string_build1(n) { @@ -951,6 +975,8 @@ function main(argc, argv, g) array_for_in, array_for_of, math_min, + regexp_ascii, + regexp_utf16, string_build1, string_build2, //string_build3,