ICU-2422 Regexp, more speed optimizations, work in progress

X-SVN-Rev: 11400
This commit is contained in:
Andy Heninger 2003-03-26 01:17:16 +00:00
parent bb08191a89
commit ec8e5274ba
5 changed files with 265 additions and 20 deletions

View File

@ -522,13 +522,11 @@ void RegexCompile::compile(
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
//
// Optimization pass: Categorize how a match can start, for use by find()
// Optimization passes
//
matchStartType();
// Optimization: strip out uneeded NOPs from the compiled pattern.
matchStartType();
stripNOPs();
OptEndingLoop();
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
@ -2276,6 +2274,8 @@ void RegexCompile::matchStartType() {
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
case URX_DOTANY_ALL_PL:
case URX_DOTANY_PL:
if (currentLen == 0) {
// These constructs are all bad news when they appear at the start
// of a match. Any character can begin the match.
@ -2310,6 +2310,7 @@ void RegexCompile::matchStartType() {
break;
case URX_JMP_SAV:
case URX_JMP_SAV_X:
// Combo of state save to the next loc, + jmp backwards.
// Net effect on min. length computation is nothing.
atStart = FALSE;
@ -2601,6 +2602,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_LD_SP:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
break;
@ -2614,6 +2616,8 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
case URX_DOTANY_PL:
case URX_DOTANY_ALL_PL:
currentLen++;
break;
@ -2840,6 +2844,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_PL:
case URX_DOTANY_ALL_PL:
currentLen = INT32_MAX;
break;
@ -2869,6 +2875,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_JMP:
case URX_JMPX:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
{
int32_t jmpDest = URX_VAL(op);
if (jmpDest < loc) {
@ -3034,6 +3041,7 @@ void RegexCompile::stripNOPs() {
case URX_RELOC_OPRND:
case URX_JMPX:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
// These are instructions with operands that refer to code locations.
{
int32_t operandAddress = URX_VAL(op);
@ -3060,10 +3068,11 @@ void RegexCompile::stripNOPs() {
case URX_FAIL:
case URX_BACKSLASH_B:
case URX_BACKSLASH_G:
case URX_UNUSED_1:
case URX_BACKSLASH_X:
case URX_BACKSLASH_Z:
case URX_DOTANY_ALL:
case URX_DOTANY_ALL_PL:
case URX_DOTANY_PL:
case URX_BACKSLASH_D:
case URX_CARET:
case URX_DOLLAR:
@ -3102,6 +3111,166 @@ void RegexCompile::stripNOPs() {
//----------------------------------------------------------------------------------------
//
// OptEndingLoop Optimize patterns that end with a '*' or a '+' to not
// save state on each iteration, when possible.
// These patterns end with a JMP_SAV op. Replace it with
// a JMP_SAV_X if the body of the loop is simple
// (does not itself do any state saves.)
//
//----------------------------------------------------------------------------------------
void RegexCompile::OptEndingLoop() {
    // Purpose: if the compiled pattern ends with a '*' or '+' loop whose body
    // is simple, replace the loop's closing JMP_SAV with a JMP_SAV_X so the
    // matcher does not have to save state on every iteration.

    // Step 1.  Scan backwards from the last op, looking for a JMP_SAV near
    //          the end.  END, NOP and END_CAPTURE may follow the JMP_SAV
    //          without preventing the optimization; any other op means the
    //          pattern does not end with a candidate loop.
    //
    //          The scan gives up before examining index 0.  Previously only a
    //          debug-build assertion guarded this; in a release build a scan
    //          that ran off the front would have continued with jmp_loc == -1
    //          and a stale (or uninitialized) op.
    int32_t jmp_loc = fRXPat->fCompiledPat->size() - 1;
    int32_t op      = 0;
    int32_t opType;
    for (;;) {
        if (jmp_loc <= 0) {
            // Ran out of pattern without finding a JMP_SAV.  Nothing to optimize.
            return;
        }
        op     = fRXPat->fCompiledPat->elementAti(jmp_loc);
        opType = URX_TYPE(op);
        if (opType == URX_JMP_SAV) {
            // Got a trailing JMP_SAV that's a candidate for optimization.
            break;
        }
        if (opType != URX_END && opType != URX_NOP && opType != URX_END_CAPTURE) {
            // Some other op follows the last loop.  This optimization is not possible.
            return;
        }
        --jmp_loc;
    }

    // Step 2.  We found a URX_JMP_SAV near the end that is a candidate for
    //          optimizing.  Its operand is the location of the top of the loop.
    //          Scan the body of the loop for anything that prevents the
    //          optimization, which is anything that does a state save, or
    //          anything that alters the current stack frame (like a capture
    //          start/end).
    int32_t loopTopLoc = URX_VAL(op);
    U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc);
    int32_t loc;
    for (loc = loopTopLoc; loc < jmp_loc; loc++) {
        opType = URX_TYPE(fRXPat->fCompiledPat->elementAti(loc));
        switch (opType) {
        case URX_STATE_SAVE:
        case URX_JMP_SAV:
        case URX_JMP_SAV_X:
        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
        case URX_LD_SP:
        case URX_END_CAPTURE:
        case URX_START_CAPTURE:
            // The loop body saves state or touches the stack frame.
            // Can not do the optimization.
            return;

        default:
            // Other ops within the loop are OK.  Keep looking.
            break;
        }
    }

    // Step 3.  Everything checks out.  We can do the optimization.
    //          JMP_SAV_X takes two words: the op itself (operand = loop top)
    //          followed by the index of a newly reserved slot in the pattern's
    //          static data, which the matcher uses as its "already saved" flag.
    insertOp(jmp_loc);     // Make space for the extra operand word of URX_JMP_SAV_X
    op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc);
    fRXPat->fCompiledPat->setElementAt(op, jmp_loc);

    int32_t dataLoc = fRXPat->fDataSize;
    fRXPat->fDataSize += 1;
    fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1);
}
//----------------------------------------------------------------------------------------
//
// OptDotStar Optimize patterns that end with a '.*' to
// just advance the input to the end without further ado.
//
//----------------------------------------------------------------------------------------
void RegexCompile::OptDotStar() {
// Scan backwards in the pattern, looking for a JMP_SAV near the end.
int32_t jmp_loc;
int32_t op;
int32_t opType;
for (jmp_loc=fRXPat->fCompiledPat->size(); jmp_loc--;) {
U_ASSERT(jmp_loc>0);
op = fRXPat->fCompiledPat->elementAti(jmp_loc);
opType = URX_TYPE(op);
switch(opType) {
case URX_END:
case URX_NOP:
case URX_END_CAPTURE:
// These ops may follow the JMP_SAV without preventing us from
// doing this optimization.
continue;
case URX_JMP_SAV:
// Got a trailing JMP_SAV that's a candidate for optimization.
break;
default:
// This optimization not possible.
return;
}
break; // from the for loop.
}
// We found in URX_JMP_SAV near the end that is a candidate for optimizing.
// Scan the body of the loop for anything that prevents the optimization,
// which is anything that does a state save, or anything that
// alters the current stack frame (like a capture start/end)
int32_t loopTopLoc = URX_VAL(op);
U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc);
int32_t loc;
for (loc=loopTopLoc; loc<jmp_loc; loc++) {
op = fRXPat->fCompiledPat->elementAti(loc);
opType = URX_TYPE(op);
switch(opType) {
case URX_STATE_SAVE:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_LD_SP:
case URX_END_CAPTURE:
case URX_START_CAPTURE:
// These ops do a state save.
// Can not do the optimization.
return;
default:
// Other ops within the loop are OK.
;// keep looking.
}
}
// Everything checks out. We can do the optimization.
insertOp(jmp_loc); // Make space for the extra operand word 0f URX_JMP_SAV_X
op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc);
fRXPat->fCompiledPat->setElementAt(op, jmp_loc);
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 1;
fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1);
}
//----------------------------------------------------------------------------------------
//
// Error Report a rule parse error.

View File

@ -111,6 +111,8 @@ private:
int32_t end);
void matchStartType();
void stripNOPs();
void OptEndingLoop();
void OptDotStar();
UErrorCode *fStatus;

View File

@ -74,7 +74,9 @@ enum {
URX_JMP_SAV = 15, // Operand: JMP destination location
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
URX_BACKSLASH_G = 17,
URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W
URX_JMP_SAV_X = 18, // JMP + Conditional Save
// First Operand: Jmp location
// Second Operand: Data loc. Save if data != 0.
URX_BACKSLASH_X = 19,
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
@ -85,15 +87,20 @@ enum {
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
URX_UNUSED_2 = 27, // These are 4 word opcodes. See description.
// These are 4 word opcodes. See description.
// First Operand: Data loc of counter variable
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
// at the end of the loop.
// 3rd Operand: Minimum count.
// 4th Operand: Max count, -1 for unbounded.
URX_DOTANY_PL = 27, // .+, match rest of the line. Fail already at end.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in three flavors.
URX_UNUSED_3 = 30, // Operand is loc of corresponding CTR_INIT.
// Operand is loc of corresponding CTR_INIT.
URX_DOTANY_ALL_PL = 30, // .+, match rest of the Input. Fail if already at end
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
@ -172,7 +179,7 @@ enum {
"URX_JMP_SAV", \
"URX_BACKSLASH_B", \
"URX_BACKSLASH_G", \
"URX_UNUSED_1", \
"URX_JMP_SAV_X", \
"URX_BACKSLASH_X", \
"URX_BACKSLASH_Z", \
"URX_DOTANY_ALL", \

View File

@ -925,6 +925,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
fp->fPatIdx = 0;
fp->fInputIdx = startIdx;
// Zero out the pattern's static data
int32_t i;
for (i = 0; i<fPattern->fDataSize; i++) {
fData[i] = 0;
}
//
// Main loop for interpreting the compiled pattern.
@ -1010,16 +1015,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
break;
#if 0
if (stringEndIndex <= inputLen &&
u_strncmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
// Success. Advance the current input position.
fp->fInputIdx = stringEndIndex;
} else {
// No match. Back up matching to a saved state
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
#endif
}
break;
@ -1348,6 +1343,51 @@ GC_Done:
}
break;
case URX_DOTANY_PL:
// Match everything up to an end-of-line or the end of input.
{
// Fail if input already exhausted.
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There is input left. Fail if we are at the end of a line.
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There was input left. Consume it until we hit the end of a line,
// or until it's exhausted.
while (fp->fInputIdx < inputLen) {
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
U16_BACK_1(inputBuf, 0, fp->fInputIdx)
// Scan has reached a line-end. We are done.
break;
}
}
}
break;
case URX_DOTANY_ALL_PL:
{
// Match up to end of input. Fail if already at end of input.
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
} else {
fp->fInputIdx = inputLen;
}
}
break;
case URX_JMP:
fp->fPatIdx = opValue;
break;
@ -1357,10 +1397,33 @@ GC_Done:
goto breakFromLoop;
case URX_JMP_SAV:
U_ASSERT(opValue < fPattern->fCompiledPat->size());
fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
break;
case URX_JMP_SAV_X:
// This opcode is used when a pattern ends with (something)*
// There is no need to save state with each loop for the '*', because
// there is no following pattern that could use that saved state.
// Use a flag to only save state the first time through.
{
U_ASSERT(opValue < fPattern->fCompiledPat->size());
int32_t dataLoc = URX_VAL(pat[fp->fPatIdx]);
U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
int32_t flag = fData[dataLoc];
U_ASSERT(flag==0 || flag==1);
if (flag == 0) {
fp = StateSave(fp, (fp->fPatIdx)+1, frameSize, status);
fData[dataLoc] = 1;
} else {
REStackFrame *prevFrame = (REStackFrame *)((int32_t *)fp-frameSize);
prevFrame->fInputIdx = fp->fInputIdx;
}
fp->fPatIdx = opValue;
}
break;
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-2);

View File

@ -402,6 +402,9 @@ void RegexPattern::dumpOp(int32_t index) const {
switch (type) {
case URX_NOP:
case URX_DOTANY:
case URX_DOTANY_ALL:
case URX_DOTANY_PL:
case URX_DOTANY_ALL_PL:
case URX_FAIL:
case URX_CARET:
case URX_DOLLAR:
@ -419,6 +422,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_STATE_SAVE:
case URX_JMP:
case URX_JMP_SAV:
case URX_JMP_SAV_X:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_Z: