ICU-2422 Regexp speed optimizations, work in progress

X-SVN-Rev: 11401
This commit is contained in:
Andy Heninger 2003-03-26 06:25:55 +00:00
parent ec8e5274ba
commit 4b469843ee
3 changed files with 76 additions and 55 deletions

View File

@ -525,6 +525,7 @@ void RegexCompile::compile(
// Optimization passes
//
matchStartType();
OptDotStar();
stripNOPs();
OptEndingLoop();
@ -3195,18 +3196,30 @@ void RegexCompile::OptEndingLoop() {
//----------------------------------------------------------------------------------------
//
// OptDotStar Optimize patterns that end with a '.*' to
// just advance the input to the end without further todo.
// OptDotStar Optimize patterns that end with a '.*' or '.+' to
// just advance the input to the end.
//
// Transform this compiled sequence
// [DOT_ANY | DOT_ANY_ALL]
// JMP_SAV to previous instruction
// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
// END
//
// To
// NOP
// [DOT_ANY_PL | DOT_ANY_ALL_PL]
// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]*
// END
//
//----------------------------------------------------------------------------------------
void RegexCompile::OptDotStar() {
// Scan backwards in the pattern, looking for a JMP_SAV near the end.
int32_t jmp_loc;
int32_t jmpLoc;
int32_t op;
int32_t opType;
for (jmp_loc=fRXPat->fCompiledPat->size(); jmp_loc--;) {
U_ASSERT(jmp_loc>0);
op = fRXPat->fCompiledPat->elementAti(jmp_loc);
for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) {
U_ASSERT(jmpLoc>0);
op = fRXPat->fCompiledPat->elementAti(jmpLoc);
opType = URX_TYPE(op);
switch(opType) {
@ -3214,6 +3227,9 @@ void RegexCompile::OptDotStar() {
case URX_END:
case URX_NOP:
case URX_END_CAPTURE:
case URX_DOLLAR_M:
case URX_DOLLAR:
case URX_BACKSLASH_Z:
// These ops may follow the JMP_SAV without preventing us from
// doing this optimization.
continue;
@ -3230,47 +3246,31 @@ void RegexCompile::OptDotStar() {
}
// We found in URX_JMP_SAV near the end that is a candidate for optimizing.
// Scan the body of the loop for anything that prevents the optimization,
// which is anything that does a state save, or anything that
// alters the current stack frame (like a capture start/end)
// Is the target address the previous instruction?
// Is the previous instruction a flavor of URX_DOTANY
int32_t loopTopLoc = URX_VAL(op);
U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc);
int32_t loc;
for (loc=loopTopLoc; loc<jmp_loc; loc++) {
op = fRXPat->fCompiledPat->elementAti(loc);
opType = URX_TYPE(op);
switch(opType) {
case URX_STATE_SAVE:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_LD_SP:
case URX_END_CAPTURE:
case URX_START_CAPTURE:
// These ops do a state save.
// Can not do the optimization.
if (loopTopLoc != jmpLoc-1) {
return;
default:
// Other ops within the loop are OK.
;// keep looking.
}
int32_t newOp;
int32_t oldOp = fRXPat->fCompiledPat->elementAti(loopTopLoc);
int32_t oldOpType = opType = URX_TYPE(oldOp);
if (oldOpType == URX_DOTANY) {
newOp = URX_BUILD(URX_DOTANY_PL, 0);
}
else if (oldOpType == URX_DOTANY_ALL) {
newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0);
} else {
return; // Sequence we were looking for isn't there.
}
// Everything checks out. We can do the optimization.
insertOp(jmp_loc); // Make space for the extra operand word 0f URX_JMP_SAV_X
op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc);
fRXPat->fCompiledPat->setElementAt(op, jmp_loc);
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 1;
fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1);
// Substitute the new instructions into the pattern.
// The NOP will be removed in a later optimization step.
fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc);
fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc);
}
//----------------------------------------------------------------------------------------
//
// Error Report a rule parse error.

View File

@ -162,7 +162,7 @@ enum {
// Used for debug printing only.
#define URX_OPCODE_NAMES \
" ", \
"URX_BACKTRACK", \
"BACKTRACK", \
"END", \
"ONECHAR", \
"STRING", \
@ -176,22 +176,22 @@ enum {
"DOTANY", \
"JMP", \
"FAIL", \
"URX_JMP_SAV", \
"URX_BACKSLASH_B", \
"URX_BACKSLASH_G", \
"URX_JMP_SAV_X", \
"URX_BACKSLASH_X", \
"URX_BACKSLASH_Z", \
"URX_DOTANY_ALL", \
"URX_BACKSLASH_D", \
"URX_CARET", \
"URX_DOLLAR", \
"JMP_SAV", \
"BACKSLASH_B", \
"BACKSLASH_G", \
"JMP_SAV_X", \
"BACKSLASH_X", \
"BACKSLASH_Z", \
"DOTANY_ALL", \
"BACKSLASH_D", \
"CARET", \
"DOLLAR", \
"CTR_INIT", \
"CTR_INIT_NG", \
"CTR_UNUSED_2", \
"DOTANY_PL", \
"CTR_LOOP", \
"CTR_LOOP_NG", \
"CTR_UNUSED_3", \
"DOTANY_ALL_PL", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP", \

View File

@ -280,6 +280,27 @@
"$" "abc<0></0>"
#
# Optimizations of .* at end of patterns
#
"abc.*" "<0>abcdef</0>"
"abc.*$" "<0>abcdef</0>"
"abc(.*)" "<0>abc<1>def</1></0>"
"abc(.*)" "<0>abc<1></1></0>"
"abc.*" "<0>abc</0>\ndef"
"abc.*" s "<0>abc\ndef</0>"
"abc.*$" s "<0>abc\ndef</0>"
"abc.*$" "abc\ndef"
"abc.*$" m "<0>abc</0>\ndef"
"abc.*\Z" m "abc\ndef"
"abc.*\Z" sm "<0>abc\ndef</0>"
"abc*" "<0>abccc</0>d"
"abc*$" "<0>abccc</0>"
"ab(?:ab[xyz]\s)*" "<0>ababy abx </0>abc"
"(?:abc|a)(?:bc)+" "<0>abc</0>"
#
# Random debugging, Temporary
#