Optimize RegExp ASCII literal matching (#94)

Add REOP_char8 that matches single bytes. Compresses bytecode for the
ASCII common case by 33% and reduces regexp_ascii benchmark running time
by 4%. The regexp_utf16 benchmark is unaffected.
This commit is contained in:
Ben Noordhuis 2023-11-19 17:26:45 +01:00 committed by GitHub
parent e49da8e96f
commit e2bc6441f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 12 deletions

View file

@ -25,7 +25,8 @@
#ifdef DEF #ifdef DEF
DEF(invalid, 1) /* never used */ DEF(invalid, 1) /* never used */
DEF(char, 3) DEF(char8, 2) /* 7 bits in fact */
DEF(char16, 3)
DEF(char32, 5) DEF(char32, 5)
DEF(dot, 1) DEF(dot, 1)
DEF(any, 1) /* same as dot but match any character including line terminator */ DEF(any, 1) /* same as dot but match any character including line terminator */

View file

@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
} }
printf("%s", reopcode_info[opcode].name); printf("%s", reopcode_info[opcode].name);
switch(opcode) { switch(opcode) {
case REOP_char: case REOP_char8:
val = get_u8(buf + pos + 1);
goto printchar;
case REOP_char16:
val = get_u16(buf + pos + 1); val = get_u16(buf + pos + 1);
if (val >= ' ' && val <= 126) goto printchar;
printf(" '%c'", val);
else
printf(" 0x%04x", val);
break;
case REOP_char32: case REOP_char32:
val = get_u32(buf + pos + 1); val = get_u32(buf + pos + 1);
printchar:
if (val >= ' ' && val <= 126) if (val >= ' ' && val <= 126)
printf(" '%c'", val); printf(" '%c'", val);
else else
@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
val = get_u16(bc_buf + pos + 1); val = get_u16(bc_buf + pos + 1);
len += val * 8; len += val * 8;
goto simple_char; goto simple_char;
case REOP_char:
case REOP_char32: case REOP_char32:
case REOP_char16:
case REOP_char8:
case REOP_dot: case REOP_dot:
case REOP_any: case REOP_any:
simple_char: simple_char:
@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
val = get_u16(bc_buf + pos + 1); val = get_u16(bc_buf + pos + 1);
len += val * 8; len += val * 8;
goto simple_char; goto simple_char;
case REOP_char:
case REOP_char32: case REOP_char32:
case REOP_char16:
case REOP_char8:
case REOP_dot: case REOP_dot:
case REOP_any: case REOP_any:
simple_char: simple_char:
@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
} else { } else {
if (s->ignore_case) if (s->ignore_case)
c = lre_canonicalize(c, s->is_utf16); c = lre_canonicalize(c, s->is_utf16);
if (c <= 0xffff) if (c <= 0x7f)
re_emit_op_u16(s, REOP_char, c); re_emit_op_u8(s, REOP_char8, c);
else if (c <= 0xffff)
re_emit_op_u16(s, REOP_char16, c);
else else
re_emit_op_u32(s, REOP_char32, c); re_emit_op_u32(s, REOP_char32, c);
} }
@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
val = get_u32(pc); val = get_u32(pc);
pc += 4; pc += 4;
goto test_char; goto test_char;
case REOP_char: case REOP_char16:
val = get_u16(pc); val = get_u16(pc);
pc += 2; pc += 2;
goto test_char;
case REOP_char8:
val = get_u8(pc);
pc += 1;
test_char: test_char:
if (cptr >= cbuf_end) if (cptr >= cbuf_end)
goto no_match; goto no_match;

View file

@ -654,6 +654,30 @@ function math_min(n)
return n * 1000; return n * 1000;
} }
/* Benchmark: repeatedly run a regexp made only of ASCII literal
   characters against an ASCII subject string.
   Each of the n outer rounds performs 10000 exec() calls and then
   stores the last match result in global_res.
   Returns n * 10000. */
function regexp_ascii(n)
{
    var subject = "the quick brown fox jumped over the lazy dog";
    var round = 0;
    var iter, match;

    while (round < n) {
        iter = 0;
        while (iter < 10000) {
            /* the literal is deliberately evaluated on every iteration */
            match = /the quick brown fox/.exec(subject);
            iter++;
        }
        global_res = match;
        round++;
    }
    return n * 10000;
}
/* Benchmark: repeatedly run a regexp containing non-ASCII literal
   characters against a subject string that contains them too.
   Each of the n outer rounds performs 10000 exec() calls and then
   stores the last match result in global_res.
   Returns n * 10000. */
function regexp_utf16(n)
{
    var subject = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ";
    var round = 0;
    var iter, match;

    while (round < n) {
        iter = 0;
        while (iter < 10000) {
            /* the literal is deliberately evaluated on every iteration */
            match = /the quick brown ᶠᵒˣ/.exec(subject);
            iter++;
        }
        global_res = match;
        round++;
    }
    return n * 10000;
}
/* incremental string construction as local var */ /* incremental string construction as local var */
function string_build1(n) function string_build1(n)
{ {
@ -951,6 +975,8 @@ function main(argc, argv, g)
array_for_in, array_for_in,
array_for_of, array_for_of,
math_min, math_min,
regexp_ascii,
regexp_utf16,
string_build1, string_build1,
string_build2, string_build2,
//string_build3, //string_build3,