ICU-2422 Regexp, more speed optimizations, work in progress
X-SVN-Rev: 11400
This commit is contained in:
parent
bb08191a89
commit
ec8e5274ba
@ -522,13 +522,11 @@ void RegexCompile::compile(
|
||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
|
||||
//
|
||||
// Optimization pass: Categorize how a match can start, for use by find()
|
||||
// Optimization passes
|
||||
//
|
||||
matchStartType();
|
||||
|
||||
// Optimization: strip out unneeded NOPs from the compiled pattern.
|
||||
matchStartType();
|
||||
stripNOPs();
|
||||
|
||||
OptEndingLoop();
|
||||
|
||||
//
|
||||
// A stupid bit of non-sense to prevent code coverage testing from complaining
|
||||
@ -2276,6 +2274,8 @@ void RegexCompile::matchStartType() {
|
||||
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
case URX_DOTANY_PL:
|
||||
if (currentLen == 0) {
|
||||
// These constructs are all bad news when they appear at the start
|
||||
// of a match. Any character can begin the match.
|
||||
@ -2310,6 +2310,7 @@ void RegexCompile::matchStartType() {
|
||||
break;
|
||||
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
// Combo of state save to the next loc, + jmp backwards.
|
||||
// Net effect on min. length computation is nothing.
|
||||
atStart = FALSE;
|
||||
@ -2601,6 +2602,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_LD_SP:
|
||||
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
break;
|
||||
|
||||
|
||||
@ -2614,6 +2616,8 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
case URX_DOTANY_PL:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
currentLen++;
|
||||
break;
|
||||
|
||||
@ -2840,6 +2844,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
|
||||
case URX_BACKREF_I:
|
||||
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_PL:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
currentLen = INT32_MAX;
|
||||
break;
|
||||
|
||||
@ -2869,6 +2875,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
case URX_JMP:
|
||||
case URX_JMPX:
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
{
|
||||
int32_t jmpDest = URX_VAL(op);
|
||||
if (jmpDest < loc) {
|
||||
@ -3034,6 +3041,7 @@ void RegexCompile::stripNOPs() {
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_JMPX:
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
// These are instructions with operands that refer to code locations.
|
||||
{
|
||||
int32_t operandAddress = URX_VAL(op);
|
||||
@ -3060,10 +3068,11 @@ void RegexCompile::stripNOPs() {
|
||||
case URX_FAIL:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_UNUSED_1:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
case URX_DOTANY_PL:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
@ -3102,6 +3111,166 @@ void RegexCompile::stripNOPs() {
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// OptEndingLoop Optimize patterns that end with a '*' or a '+' to not
|
||||
// save state on each iteration, when possible.
|
||||
// These patterns end with a JMP_SAV op. Replace it with
|
||||
// a JMP_SAV_X if the body of the loop is simple
|
||||
// (does not itself do any state saves.)
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
void RegexCompile::OptEndingLoop() {
|
||||
// Scan backwards in the pattern, looking for a JMP_SAV near the end.
|
||||
int32_t jmp_loc;
|
||||
int32_t op;
|
||||
int32_t opType;
|
||||
for (jmp_loc=fRXPat->fCompiledPat->size(); jmp_loc--;) {
|
||||
U_ASSERT(jmp_loc>0);
|
||||
op = fRXPat->fCompiledPat->elementAti(jmp_loc);
|
||||
opType = URX_TYPE(op);
|
||||
switch(opType) {
|
||||
|
||||
|
||||
case URX_END:
|
||||
case URX_NOP:
|
||||
case URX_END_CAPTURE:
|
||||
// These ops may follow the JMP_SAV without preventing us from
|
||||
// doing this optimization.
|
||||
continue;
|
||||
|
||||
case URX_JMP_SAV:
|
||||
// Got a trailing JMP_SAV that's a candidate for optimization.
|
||||
break;
|
||||
|
||||
default:
|
||||
// This optimization not possible.
|
||||
return;
|
||||
}
|
||||
break; // from the for loop.
|
||||
}
|
||||
|
||||
// We found in URX_JMP_SAV near the end that is a candidate for optimizing.
|
||||
// Scan the body of the loop for anything that prevents the optimization,
|
||||
// which is anything that does a state save, or anything that
|
||||
// alters the current stack frame (like a capture start/end)
|
||||
int32_t loopTopLoc = URX_VAL(op);
|
||||
U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc);
|
||||
int32_t loc;
|
||||
for (loc=loopTopLoc; loc<jmp_loc; loc++) {
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
opType = URX_TYPE(op);
|
||||
switch(opType) {
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_LD_SP:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_START_CAPTURE:
|
||||
// These ops do a state save.
|
||||
// Can not do the optimization.
|
||||
return;
|
||||
|
||||
default:
|
||||
// Other ops within the loop are OK.
|
||||
;// keep looking.
|
||||
}
|
||||
}
|
||||
|
||||
// Everything checks out. We can do the optimization.
|
||||
insertOp(jmp_loc); // Make space for the extra operand word 0f URX_JMP_SAV_X
|
||||
op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, jmp_loc);
|
||||
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 1;
|
||||
fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1);
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// OptDotStar Optimize patterns that end with a '.*' to
|
||||
// just advance the input to the end without further todo.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
void RegexCompile::OptDotStar() {
|
||||
// Scan backwards in the pattern, looking for a JMP_SAV near the end.
|
||||
int32_t jmp_loc;
|
||||
int32_t op;
|
||||
int32_t opType;
|
||||
for (jmp_loc=fRXPat->fCompiledPat->size(); jmp_loc--;) {
|
||||
U_ASSERT(jmp_loc>0);
|
||||
op = fRXPat->fCompiledPat->elementAti(jmp_loc);
|
||||
opType = URX_TYPE(op);
|
||||
switch(opType) {
|
||||
|
||||
|
||||
case URX_END:
|
||||
case URX_NOP:
|
||||
case URX_END_CAPTURE:
|
||||
// These ops may follow the JMP_SAV without preventing us from
|
||||
// doing this optimization.
|
||||
continue;
|
||||
|
||||
case URX_JMP_SAV:
|
||||
// Got a trailing JMP_SAV that's a candidate for optimization.
|
||||
break;
|
||||
|
||||
default:
|
||||
// This optimization not possible.
|
||||
return;
|
||||
}
|
||||
break; // from the for loop.
|
||||
}
|
||||
|
||||
// We found in URX_JMP_SAV near the end that is a candidate for optimizing.
|
||||
// Scan the body of the loop for anything that prevents the optimization,
|
||||
// which is anything that does a state save, or anything that
|
||||
// alters the current stack frame (like a capture start/end)
|
||||
int32_t loopTopLoc = URX_VAL(op);
|
||||
U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc);
|
||||
int32_t loc;
|
||||
for (loc=loopTopLoc; loc<jmp_loc; loc++) {
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
opType = URX_TYPE(op);
|
||||
switch(opType) {
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_LD_SP:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_START_CAPTURE:
|
||||
// These ops do a state save.
|
||||
// Can not do the optimization.
|
||||
return;
|
||||
|
||||
default:
|
||||
// Other ops within the loop are OK.
|
||||
;// keep looking.
|
||||
}
|
||||
}
|
||||
|
||||
// Everything checks out. We can do the optimization.
|
||||
insertOp(jmp_loc); // Make space for the extra operand word 0f URX_JMP_SAV_X
|
||||
op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, jmp_loc);
|
||||
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 1;
|
||||
fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Error Report a rule parse error.
|
||||
|
@ -111,6 +111,8 @@ private:
|
||||
int32_t end);
|
||||
void matchStartType();
|
||||
void stripNOPs();
|
||||
void OptEndingLoop();
|
||||
void OptDotStar();
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
@ -74,7 +74,9 @@ enum {
|
||||
URX_JMP_SAV = 15, // Operand: JMP destination location
|
||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W
|
||||
URX_JMP_SAV_X = 18, // JMP + Conditional Save
|
||||
// First Operand: Jmp location
|
||||
// Second Operand: Data loc. Save if data != 0.
|
||||
URX_BACKSLASH_X = 19,
|
||||
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
|
||||
|
||||
@ -85,15 +87,20 @@ enum {
|
||||
|
||||
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
|
||||
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
|
||||
URX_UNUSED_2 = 27, // These are 4 word opcodes. See description.
|
||||
// These are 4 word opcodes. See description.
|
||||
// First Operand: Data loc of counter variable
|
||||
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
|
||||
// at the end of the loop.
|
||||
// 3rd Operand: Minimum count.
|
||||
// 4th Operand: Max count, -1 for unbounded.
|
||||
|
||||
URX_DOTANY_PL = 27, // .+, match rest of the line. Fail already at end.
|
||||
|
||||
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
|
||||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
URX_UNUSED_3 = 30, // Operand is loc of corresponding CTR_INIT.
|
||||
// Operand is loc of corresponding CTR_INIT.
|
||||
|
||||
URX_DOTANY_ALL_PL = 30, // .+, match rest of the Input. Fail if already at end
|
||||
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
@ -172,7 +179,7 @@ enum {
|
||||
"URX_JMP_SAV", \
|
||||
"URX_BACKSLASH_B", \
|
||||
"URX_BACKSLASH_G", \
|
||||
"URX_UNUSED_1", \
|
||||
"URX_JMP_SAV_X", \
|
||||
"URX_BACKSLASH_X", \
|
||||
"URX_BACKSLASH_Z", \
|
||||
"URX_DOTANY_ALL", \
|
||||
|
@ -925,6 +925,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
fp->fPatIdx = 0;
|
||||
fp->fInputIdx = startIdx;
|
||||
|
||||
// Zero out the pattern's static data
|
||||
int32_t i;
|
||||
for (i = 0; i<fPattern->fDataSize; i++) {
|
||||
fData[i] = 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Main loop for interpreting the compiled pattern.
|
||||
@ -1010,16 +1015,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
}
|
||||
break;
|
||||
|
||||
#if 0
|
||||
if (stringEndIndex <= inputLen &&
|
||||
u_strncmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
|
||||
// Success. Advance the current input position.
|
||||
fp->fInputIdx = stringEndIndex;
|
||||
} else {
|
||||
// No match. Back up matching to a saved state
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
|
||||
@ -1348,6 +1343,51 @@ GC_Done:
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_DOTANY_PL:
|
||||
// Match all up to an end-of-line or end-of-input.
|
||||
{
|
||||
// Fail if input already exhausted.
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
// There is input left. Fail if we are at the end of a line.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
// End of line in normal mode. . does not match.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
// There was input left. Consume it until we hit the end of a line,
|
||||
// or until it's exhausted.
|
||||
while (fp->fInputIdx < inputLen) {
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
U16_BACK_1(inputBuf, 0, fp->fInputIdx)
|
||||
// Scan has reached a line-end. We are done.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_DOTANY_ALL_PL:
|
||||
{
|
||||
// Match up to end of input. Fail if already at end of input.
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
} else {
|
||||
fp->fInputIdx = inputLen;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_JMP:
|
||||
fp->fPatIdx = opValue;
|
||||
break;
|
||||
@ -1357,10 +1397,33 @@ GC_Done:
|
||||
goto breakFromLoop;
|
||||
|
||||
case URX_JMP_SAV:
|
||||
U_ASSERT(opValue < fPattern->fCompiledPat->size());
|
||||
fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
|
||||
fp->fPatIdx = opValue; // Then JMP.
|
||||
break;
|
||||
|
||||
case URX_JMP_SAV_X:
|
||||
// This opcode is used when a pattern ends with (something)*
|
||||
// There is no need to save state with each loop for the '*', because
|
||||
// there is no following pattern that could use that saved state.
|
||||
// Use a flag to only save state the first time through.
|
||||
{
|
||||
U_ASSERT(opValue < fPattern->fCompiledPat->size());
|
||||
int32_t dataLoc = URX_VAL(pat[fp->fPatIdx]);
|
||||
U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
|
||||
int32_t flag = fData[dataLoc];
|
||||
U_ASSERT(flag==0 || flag==1);
|
||||
if (flag == 0) {
|
||||
fp = StateSave(fp, (fp->fPatIdx)+1, frameSize, status);
|
||||
fData[dataLoc] = 1;
|
||||
} else {
|
||||
REStackFrame *prevFrame = (REStackFrame *)((int32_t *)fp-frameSize);
|
||||
prevFrame->fInputIdx = fp->fInputIdx;
|
||||
}
|
||||
fp->fPatIdx = opValue;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_INIT:
|
||||
{
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
|
||||
|
@ -402,6 +402,9 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
switch (type) {
|
||||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_DOTANY_PL:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
case URX_FAIL:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
@ -419,6 +422,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_STATE_SAVE:
|
||||
case URX_JMP:
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_Z:
|
||||
|
Loading…
Reference in New Issue
Block a user