Optimize RegExp ASCII literal matching (#94)
Add REOP_char8 that matches single bytes. Compresses bytecode for the ASCII common case by 33% and reduces regexp_ascii benchmark running time by 4%. The regexp_utf16 benchmark is unaffected.
This commit is contained in:
parent
e49da8e96f
commit
e2bc6441f8
3 changed files with 47 additions and 12 deletions
|
@ -25,7 +25,8 @@
|
|||
#ifdef DEF
|
||||
|
||||
DEF(invalid, 1) /* never used */
|
||||
DEF(char, 3)
|
||||
DEF(char8, 2) /* 7 bits in fact */
|
||||
DEF(char16, 3)
|
||||
DEF(char32, 5)
|
||||
DEF(dot, 1)
|
||||
DEF(any, 1) /* same as dot but match any character including line terminator */
|
||||
|
|
30
libregexp.c
30
libregexp.c
|
@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|||
}
|
||||
printf("%s", reopcode_info[opcode].name);
|
||||
switch(opcode) {
|
||||
case REOP_char:
|
||||
case REOP_char8:
|
||||
val = get_u8(buf + pos + 1);
|
||||
goto printchar;
|
||||
case REOP_char16:
|
||||
val = get_u16(buf + pos + 1);
|
||||
if (val >= ' ' && val <= 126)
|
||||
printf(" '%c'", val);
|
||||
else
|
||||
printf(" 0x%04x", val);
|
||||
break;
|
||||
goto printchar;
|
||||
case REOP_char32:
|
||||
val = get_u32(buf + pos + 1);
|
||||
printchar:
|
||||
if (val >= ' ' && val <= 126)
|
||||
printf(" '%c'", val);
|
||||
else
|
||||
|
@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
|
|||
val = get_u16(bc_buf + pos + 1);
|
||||
len += val * 8;
|
||||
goto simple_char;
|
||||
case REOP_char:
|
||||
case REOP_char32:
|
||||
case REOP_char16:
|
||||
case REOP_char8:
|
||||
case REOP_dot:
|
||||
case REOP_any:
|
||||
simple_char:
|
||||
|
@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
|
|||
val = get_u16(bc_buf + pos + 1);
|
||||
len += val * 8;
|
||||
goto simple_char;
|
||||
case REOP_char:
|
||||
case REOP_char32:
|
||||
case REOP_char16:
|
||||
case REOP_char8:
|
||||
case REOP_dot:
|
||||
case REOP_any:
|
||||
simple_char:
|
||||
|
@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|||
} else {
|
||||
if (s->ignore_case)
|
||||
c = lre_canonicalize(c, s->is_utf16);
|
||||
if (c <= 0xffff)
|
||||
re_emit_op_u16(s, REOP_char, c);
|
||||
if (c <= 0x7f)
|
||||
re_emit_op_u8(s, REOP_char8, c);
|
||||
else if (c <= 0xffff)
|
||||
re_emit_op_u16(s, REOP_char16, c);
|
||||
else
|
||||
re_emit_op_u32(s, REOP_char32, c);
|
||||
}
|
||||
|
@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|||
val = get_u32(pc);
|
||||
pc += 4;
|
||||
goto test_char;
|
||||
case REOP_char:
|
||||
case REOP_char16:
|
||||
val = get_u16(pc);
|
||||
pc += 2;
|
||||
goto test_char;
|
||||
case REOP_char8:
|
||||
val = get_u8(pc);
|
||||
pc += 1;
|
||||
test_char:
|
||||
if (cptr >= cbuf_end)
|
||||
goto no_match;
|
||||
|
|
|
@ -654,6 +654,30 @@ function math_min(n)
|
|||
return n * 1000;
|
||||
}
|
||||
|
||||
function regexp_ascii(n)
|
||||
{
|
||||
var i, j, r, s;
|
||||
s = "the quick brown fox jumped over the lazy dog"
|
||||
for(j = 0; j < n; j++) {
|
||||
for(i = 0; i < 10000; i++)
|
||||
r = /the quick brown fox/.exec(s)
|
||||
global_res = r;
|
||||
}
|
||||
return n * 10000;
|
||||
}
|
||||
|
||||
function regexp_utf16(n)
|
||||
{
|
||||
var i, j, r, s;
|
||||
s = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ"
|
||||
for(j = 0; j < n; j++) {
|
||||
for(i = 0; i < 10000; i++)
|
||||
r = /the quick brown ᶠᵒˣ/.exec(s)
|
||||
global_res = r;
|
||||
}
|
||||
return n * 10000;
|
||||
}
|
||||
|
||||
/* incremental string construction as local var */
|
||||
function string_build1(n)
|
||||
{
|
||||
|
@ -951,6 +975,8 @@ function main(argc, argv, g)
|
|||
array_for_in,
|
||||
array_for_of,
|
||||
math_min,
|
||||
regexp_ascii,
|
||||
regexp_utf16,
|
||||
string_build1,
|
||||
string_build2,
|
||||
//string_build3,
|
||||
|
|
Loading…
Reference in a new issue