Optimize RegExp ASCII literal matching (#94)
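Add a REOP_char8 opcode that matches a single ASCII character (code point <= 0x7f) stored in one byte, and rename the existing 16-bit REOP_char opcode to REOP_char16. This shrinks the bytecode for the common ASCII case by 33% (2 bytes per literal character instead of 3) and reduces the regexp_ascii benchmark's running time by 4%. The regexp_utf16 benchmark is unaffected.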
This commit is contained in:
parent
e49da8e96f
commit
e2bc6441f8
3 changed files with 47 additions and 12 deletions
|
@ -25,7 +25,8 @@
|
||||||
#ifdef DEF
|
#ifdef DEF
|
||||||
|
|
||||||
DEF(invalid, 1) /* never used */
|
DEF(invalid, 1) /* never used */
|
||||||
DEF(char, 3)
|
DEF(char8, 2) /* 7 bits in fact */
|
||||||
|
DEF(char16, 3)
|
||||||
DEF(char32, 5)
|
DEF(char32, 5)
|
||||||
DEF(dot, 1)
|
DEF(dot, 1)
|
||||||
DEF(any, 1) /* same as dot but match any character including line terminator */
|
DEF(any, 1) /* same as dot but match any character including line terminator */
|
||||||
|
|
30
libregexp.c
30
libregexp.c
|
@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
||||||
}
|
}
|
||||||
printf("%s", reopcode_info[opcode].name);
|
printf("%s", reopcode_info[opcode].name);
|
||||||
switch(opcode) {
|
switch(opcode) {
|
||||||
case REOP_char:
|
case REOP_char8:
|
||||||
|
val = get_u8(buf + pos + 1);
|
||||||
|
goto printchar;
|
||||||
|
case REOP_char16:
|
||||||
val = get_u16(buf + pos + 1);
|
val = get_u16(buf + pos + 1);
|
||||||
if (val >= ' ' && val <= 126)
|
goto printchar;
|
||||||
printf(" '%c'", val);
|
|
||||||
else
|
|
||||||
printf(" 0x%04x", val);
|
|
||||||
break;
|
|
||||||
case REOP_char32:
|
case REOP_char32:
|
||||||
val = get_u32(buf + pos + 1);
|
val = get_u32(buf + pos + 1);
|
||||||
|
printchar:
|
||||||
if (val >= ' ' && val <= 126)
|
if (val >= ' ' && val <= 126)
|
||||||
printf(" '%c'", val);
|
printf(" '%c'", val);
|
||||||
else
|
else
|
||||||
|
@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
|
||||||
val = get_u16(bc_buf + pos + 1);
|
val = get_u16(bc_buf + pos + 1);
|
||||||
len += val * 8;
|
len += val * 8;
|
||||||
goto simple_char;
|
goto simple_char;
|
||||||
case REOP_char:
|
|
||||||
case REOP_char32:
|
case REOP_char32:
|
||||||
|
case REOP_char16:
|
||||||
|
case REOP_char8:
|
||||||
case REOP_dot:
|
case REOP_dot:
|
||||||
case REOP_any:
|
case REOP_any:
|
||||||
simple_char:
|
simple_char:
|
||||||
|
@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
|
||||||
val = get_u16(bc_buf + pos + 1);
|
val = get_u16(bc_buf + pos + 1);
|
||||||
len += val * 8;
|
len += val * 8;
|
||||||
goto simple_char;
|
goto simple_char;
|
||||||
case REOP_char:
|
|
||||||
case REOP_char32:
|
case REOP_char32:
|
||||||
|
case REOP_char16:
|
||||||
|
case REOP_char8:
|
||||||
case REOP_dot:
|
case REOP_dot:
|
||||||
case REOP_any:
|
case REOP_any:
|
||||||
simple_char:
|
simple_char:
|
||||||
|
@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||||
} else {
|
} else {
|
||||||
if (s->ignore_case)
|
if (s->ignore_case)
|
||||||
c = lre_canonicalize(c, s->is_utf16);
|
c = lre_canonicalize(c, s->is_utf16);
|
||||||
if (c <= 0xffff)
|
if (c <= 0x7f)
|
||||||
re_emit_op_u16(s, REOP_char, c);
|
re_emit_op_u8(s, REOP_char8, c);
|
||||||
|
else if (c <= 0xffff)
|
||||||
|
re_emit_op_u16(s, REOP_char16, c);
|
||||||
else
|
else
|
||||||
re_emit_op_u32(s, REOP_char32, c);
|
re_emit_op_u32(s, REOP_char32, c);
|
||||||
}
|
}
|
||||||
|
@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
||||||
val = get_u32(pc);
|
val = get_u32(pc);
|
||||||
pc += 4;
|
pc += 4;
|
||||||
goto test_char;
|
goto test_char;
|
||||||
case REOP_char:
|
case REOP_char16:
|
||||||
val = get_u16(pc);
|
val = get_u16(pc);
|
||||||
pc += 2;
|
pc += 2;
|
||||||
|
goto test_char;
|
||||||
|
case REOP_char8:
|
||||||
|
val = get_u8(pc);
|
||||||
|
pc += 1;
|
||||||
test_char:
|
test_char:
|
||||||
if (cptr >= cbuf_end)
|
if (cptr >= cbuf_end)
|
||||||
goto no_match;
|
goto no_match;
|
||||||
|
|
|
@ -654,6 +654,30 @@ function math_min(n)
|
||||||
return n * 1000;
|
return n * 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function regexp_ascii(n)
|
||||||
|
{
|
||||||
|
var i, j, r, s;
|
||||||
|
s = "the quick brown fox jumped over the lazy dog"
|
||||||
|
for(j = 0; j < n; j++) {
|
||||||
|
for(i = 0; i < 10000; i++)
|
||||||
|
r = /the quick brown fox/.exec(s)
|
||||||
|
global_res = r;
|
||||||
|
}
|
||||||
|
return n * 10000;
|
||||||
|
}
|
||||||
|
|
||||||
|
function regexp_utf16(n)
|
||||||
|
{
|
||||||
|
var i, j, r, s;
|
||||||
|
s = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ"
|
||||||
|
for(j = 0; j < n; j++) {
|
||||||
|
for(i = 0; i < 10000; i++)
|
||||||
|
r = /the quick brown ᶠᵒˣ/.exec(s)
|
||||||
|
global_res = r;
|
||||||
|
}
|
||||||
|
return n * 10000;
|
||||||
|
}
|
||||||
|
|
||||||
/* incremental string construction as local var */
|
/* incremental string construction as local var */
|
||||||
function string_build1(n)
|
function string_build1(n)
|
||||||
{
|
{
|
||||||
|
@ -951,6 +975,8 @@ function main(argc, argv, g)
|
||||||
array_for_in,
|
array_for_in,
|
||||||
array_for_of,
|
array_for_of,
|
||||||
math_min,
|
math_min,
|
||||||
|
regexp_ascii,
|
||||||
|
regexp_utf16,
|
||||||
string_build1,
|
string_build1,
|
||||||
string_build2,
|
string_build2,
|
||||||
//string_build3,
|
//string_build3,
|
||||||
|
|
Loading…
Reference in a new issue