ICU-2422 Regexp, optimizing find() operations, work in progress.

X-SVN-Rev: 11349
This commit is contained in:
Andy Heninger 2003-03-18 01:51:36 +00:00
parent 613179cd77
commit bf35d2da2e
6 changed files with 154 additions and 15 deletions

View File

@ -525,6 +525,9 @@ void RegexCompile::compile(
//
matchStartType();
// Optimization: strip out unneeded NOPs from the compiled pattern.
stripNOPs();
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
@ -934,19 +937,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 2. state-save 4
// 3. jmp 1
// 4. ...
// Normal '+' compiles to
// 1. stuff to be repeated (already built)
// 2. jmp-sav 1
// 3. ...
{
int32_t topLoc = blockTopLoc(FALSE); // location of item #1
// Locate the position in the compiled pattern where the match will continue
// after completing the + (4 in the comment above)
int32_t continueLoc = fRXPat->fCompiledPat->size()+2;
//int32_t continueLoc = fRXPat->fCompiledPat->size()+2;
// Emit the STATE_SAVE
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
//int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
//fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
// Emit the JMP
int32_t jmpOp = URX_BUILD(URX_JMP, topLoc);
int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
}
break;
@ -1692,6 +1699,7 @@ void RegexCompile::insertOp(int32_t where) {
opType == URX_CTR_LOOP ||
opType == URX_CTR_LOOP_NG ||
opType == URX_CTR_LOOP_P ||
opType == URX_JMP_SAV ||
opType == URX_RELOC_OPRND) && opValue > where) {
// Target location for this opcode is after the insertion point and
// needs to be incremented to adjust for the insertion.
@ -1933,12 +1941,13 @@ void RegexCompile::handleCloseParen() {
// Insert the min and max match len bounds into the URX_LB_CONT op that
// appears at the top of the look-behind block, at location fMatchOpenParen+1
fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
// Insert the pattern location to continue at after a successful match
// as the last operand of the URX_LBN_CONT
fRXPat->fCompiledPat->setElementAt(fRXPat->fCompiledPat->size(), fMatchOpenParen-1);
op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
}
break;
@ -2254,6 +2263,7 @@ void RegexCompile::matchStartType() {
// of a match. Any character can begin the match.
fRXPat->fInitialChars->clear();
fRXPat->fInitialChars->complement();
numInitialStrings += 2;
}
currentLen++;
atStart = FALSE;
@ -2281,6 +2291,12 @@ void RegexCompile::matchStartType() {
atStart = FALSE;
break;
case URX_JMP_SAV:
// Combo of state save to the next loc, + jmp backwards.
// Net effect on min. length computation is nothing.
atStart = FALSE;
break;
case URX_FAIL:
// Fails are kind of like a branch, except that the min length was
// propagated already, by the state save.
@ -2463,7 +2479,8 @@ void RegexCompile::matchStartType() {
// Match beginning only with a literal string.
UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
U_ASSERT(fRXPat->fInitialChars->contains(c));
fRXPat->fStartType = START_STRING;
fRXPat->fStartType = START_STRING;
fRXPat->fInitialChar = c;
} else if (fRXPat->fStartType == START_LINE) {
// Match at start of line in Multi-Line mode.
// Nothing to do here; everything is already set.
@ -2566,6 +2583,8 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
case URX_LD_SP:
case URX_JMP_SAV:
break;
@ -2835,6 +2854,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
//
case URX_JMP:
case URX_JMPX:
case URX_JMP_SAV:
{
int32_t jmpDest = URX_VAL(op);
if (jmpDest < loc) {
@ -2953,6 +2973,79 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
}
//----------------------------------------------------------------------------------------
//
// stripNOPs Remove any NOP operations from the compiled pattern code.
// Extra NOPs are inserted for some constructs during the initial
// code generation to provide locations that may be patched later.
// Many end up unneeded, and are removed by this function.
//
//----------------------------------------------------------------------------------------
void RegexCompile::stripNOPs() {
    // Compacts the compiled pattern in place: every URX_NOP is dropped, the
    // remaining ops are moved up to fill the gaps, operands that hold pattern
    // locations are re-targeted, and the pattern vector is shortened to the
    // number of ops that remain.
    //
    // Does nothing if *fStatus already indicates a failure on entry.
    if (U_FAILURE(*fStatus)) {
        return;
    }

    int32_t    end = fRXPat->fCompiledPat->size();
    UVector32  deltas(end, *fStatus);

    // Make a first pass over the code, computing the amount that things
    //   will be offset at each location in the original code.
    //   deltas[i] ends up holding the number of NOPs that precede location i.
    int32_t   loc;
    int32_t   d = 0;
    for (loc=0; loc<end; loc++) {
        deltas.addElement(d, *fStatus);
        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
        if (URX_TYPE(op) == URX_NOP) {
            d++;
        }
    }

    // If building the offset table reported an error, the table cannot be
    //   trusted for relocation.  Leave the compiled pattern untouched; the
    //   failure is reported to the caller through *fStatus.
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Make a second pass over the code, removing the NOPs by moving following
    //  code up, and patching operands that refer to code locations that
    //  are being moved.  The array of offsets from the first step is used
    //  to compute the new operand values.
    int32_t src;
    int32_t dst = 0;
    for (src=0; src<end; src++) {
        int32_t op     = fRXPat->fCompiledPat->elementAti(src);
        int32_t opType = URX_TYPE(op);
        switch (opType) {
        case URX_NOP:
            // Dropped: dst is not advanced, so the next surviving op
            //   overwrites this slot.
            break;

        case URX_STATE_SAVE:
        case URX_JMP:
        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
        case URX_CTR_LOOP_P:
        case URX_RELOC_OPRND:
        case URX_JMPX:
        case URX_JMP_SAV:
            // These are instructions with operands that refer to code locations.
            {
                int32_t  operandAddress = URX_VAL(op);
                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
                // New target = old target minus the number of NOPs removed
                //   ahead of that target.
                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
                op = URX_BUILD(opType, fixedOperandAddress);
                fRXPat->fCompiledPat->setElementAt(op, dst);
                dst++;
                break;
            }

        default:
            // The remaining instructions are unaltered by the relocation.
            fRXPat->fCompiledPat->setElementAt(op, dst);
            dst++;
            break;
        }
    }

    // Everything at index dst and beyond is a stale copy of ops that have
    //   already been moved up.  Truncate the pattern so that its size reflects
    //   only the compacted code.
    fRXPat->fCompiledPat->setSize(dst);
}
//----------------------------------------------------------------------------------------
//
// Error Report a rule parse error.

View File

@ -112,6 +112,7 @@ private:
int32_t maxMatchLength(int32_t start,
int32_t end);
void matchStartType();
void stripNOPs();
UErrorCode *fStatus;

View File

@ -17,7 +17,7 @@ U_NAMESPACE_BEGIN
//
// debugging support. Enable one or more of the #defines immediately following
//
#ifdef DEBUG
#ifdef _DEBUG
//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
#define REGEX_RUN_DEBUG
@ -70,7 +70,7 @@ enum {
// the pattern.
URX_FAIL = 14, // Stop match operation, No match.
URX_UNUSED = 15,
URX_JMP_SAV = 15, // Operand: JMP destination location
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
URX_BACKSLASH_G = 17,
URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W
@ -168,7 +168,7 @@ enum {
"DOTANY", \
"JMP", \
"FAIL", \
"URX_UNUSED", \
"URX_JMP_SAV", \
"URX_BACKSLASH_B", \
"URX_BACKSLASH_G", \
"URX_UNUSED_1", \

View File

@ -287,8 +287,6 @@ UBool RegexMatcher::find() {
U_ASSERT(startPos >= 0);
switch (fPattern->fStartType) {
case START_LINE:
case START_STRING:
case START_NO_INFO:
// No optimization was found.
// Try a match at each input position.
@ -346,6 +344,7 @@ UBool RegexMatcher::find() {
}
U_ASSERT(FALSE);
case START_STRING:
case START_CHAR:
{
// Match starts on exactly one char.
@ -369,10 +368,48 @@ UBool RegexMatcher::find() {
}
}
U_ASSERT(FALSE);
case START_LINE:
{
UChar32 c;
if (startPos == 0) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
}
for (;;) {
UChar32 c = inputBuf[startPos-1];
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 ||
c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
if (startPos >= testLen) {
return FALSE;
}
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
}
}
default:
U_ASSERT(FALSE);
}
U_ASSERT(FALSE);
return FALSE;
}
@ -1168,6 +1205,11 @@ GC_Done:
isMatch = FALSE;
goto breakFromLoop;
case URX_JMP_SAV:
fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
break;
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
@ -1542,6 +1584,7 @@ GC_Done:
int32_t minML = pat[fp->fPatIdx++];
int32_t maxML = pat[fp->fPatIdx++];
int32_t continueLoc = pat[fp->fPatIdx++];
continueLoc = URX_VAL(continueLoc);
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
U_ASSERT(continueLoc > fp->fPatIdx);

View File

@ -486,6 +486,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_END_CAPTURE:
case URX_STATE_SAVE:
case URX_JMP:
case URX_JMP_SAV:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_Z:

View File

@ -284,6 +284,7 @@
"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"
"astring|another[bcd]|alpha|a|[a]" "x"
#
# Regexps from http://www.regexlib.com
#