From bf35d2da2e917b09e41354251abc63d4b4816f39 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 18 Mar 2003 01:51:36 +0000 Subject: [PATCH] ICU-2422 Regexp, optimizing find() operations, work in progress. X-SVN-Rev: 11349 --- icu4c/source/i18n/regexcmp.cpp | 109 ++++++++++++++++++++++-- icu4c/source/i18n/regexcmp.h | 1 + icu4c/source/i18n/regeximp.h | 6 +- icu4c/source/i18n/rematch.cpp | 51 ++++++++++- icu4c/source/i18n/repattrn.cpp | 1 + icu4c/source/test/testdata/regextst.txt | 1 + 6 files changed, 154 insertions(+), 15 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 1bd10c36af..941fd3a541 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -525,6 +525,9 @@ void RegexCompile::compile( // matchStartType(); + // Optimization: strip out uneeded NOPs from the compiled pattern. + stripNOPs(); + // // A stupid bit of non-sense to prevent code coverage testing from complaining @@ -934,19 +937,23 @@ UBool RegexCompile::doParseActions(EParseAction action) // 2. state-save 4 // 3. jmp 1 // 4. ... + // Normal '+' compiles to + // 1. stuff to be repeated (already built) + // 2. jmp-sav 1 + // 3. ... 
{ int32_t topLoc = blockTopLoc(FALSE); // location of item #1 // Locate the position in the compiled pattern where the match will continue // after completing the + (4 in the comment above) - int32_t continueLoc = fRXPat->fCompiledPat->size()+2; + //int32_t continueLoc = fRXPat->fCompiledPat->size()+2; // Emit the STATE_SAVE - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); + //int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); + //fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); // Emit the JMP - int32_t jmpOp = URX_BUILD(URX_JMP, topLoc); + int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); } break; @@ -1692,6 +1699,7 @@ void RegexCompile::insertOp(int32_t where) { opType == URX_CTR_LOOP || opType == URX_CTR_LOOP_NG || opType == URX_CTR_LOOP_P || + opType == URX_JMP_SAV || opType == URX_RELOC_OPRND) && opValue > where) { // Target location for this opcode is after the insertion point and // needs to be incremented to adjust for the insertion. @@ -1933,12 +1941,13 @@ void RegexCompile::handleCloseParen() { // Insert the min and max match len bounds into the URX_LB_CONT op that // appears at the top of the look-behind block, at location fMatchOpenParen+1 - fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); - fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); + fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); + fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); // Insert the pattern location to continue at after a successful match // as the last operand of the URX_LBN_CONT - fRXPat->fCompiledPat->setElementAt(fRXPat->fCompiledPat->size(), fMatchOpenParen-1); + op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); + fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); } break; @@ -2254,6 +2263,7 @@ void RegexCompile::matchStartType() { // of a match. 
Any character can begin the match. fRXPat->fInitialChars->clear(); fRXPat->fInitialChars->complement(); + numInitialStrings += 2; } currentLen++; atStart = FALSE; @@ -2281,6 +2291,12 @@ void RegexCompile::matchStartType() { atStart = FALSE; break; + case URX_JMP_SAV: + // Combo of state save to the next loc, + jmp backwards. + // Net effect on min. length computation is nothing. + atStart = FALSE; + break; + case URX_FAIL: // Fails are kind of like a branch, except that the min length was // propagated already, by the state save. @@ -2463,7 +2479,8 @@ void RegexCompile::matchStartType() { // Match beginning only with a literal string. UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); U_ASSERT(fRXPat->fInitialChars->contains(c)); - fRXPat->fStartType = START_STRING; + fRXPat->fStartType = START_STRING; + fRXPat->fInitialChar = c; } else if (fRXPat->fStartType == START_LINE) { // Match at start of line in Mulit-Line mode. // Nothing to do here; everything is already set. @@ -2566,6 +2583,8 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: + + case URX_JMP_SAV: break; @@ -2835,6 +2854,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // case URX_JMP: case URX_JMPX: + case URX_JMP_SAV: { int32_t jmpDest = URX_VAL(op); if (jmpDest < loc) { @@ -2953,6 +2973,79 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { } +//---------------------------------------------------------------------------------------- +// +// stripNOPs Remove any NOP operations from the compiled pattern code. +// Extra NOPs are inserted for some constructs during the initial +// code generation to provide locations that may be patched later. +// Many end up unneeded, and are removed by this function. 
+//
+//----------------------------------------------------------------------------------------
+void RegexCompile::stripNOPs() {
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    int32_t    end = fRXPat->fCompiledPat->size();
+    UVector32  deltas(end, *fStatus);
+
+    // Make a first pass over the code, computing the amount that things
+    //   will be offset at each location in the original code.
+    //   After this loop, deltas[loc] is the number of NOPs that precede loc.
+    int32_t   loc;
+    int32_t   d = 0;
+    for (loc=0; loc<end; loc++) {
+        deltas.addElement(d, *fStatus);
+        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
+        if (URX_TYPE(op) == URX_NOP) {
+            d++;
+        }
+    }
+
+    // If the offset table could not be built, leave the compiled pattern
+    //   untouched rather than relocating operands with incomplete data.
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Make a second pass over the code, removing the NOPs by moving following
+    //  code up, and patching operands that refer to code locations that
+    //  are being moved.  The array of offsets from the first step is used
+    //  to compute the new operand values.
+    int32_t src;
+    int32_t dst = 0;
+    for (src=0; src<end; src++) {
+        int32_t op = fRXPat->fCompiledPat->elementAti(src);
+        int32_t opType = URX_TYPE(op);
+        switch (opType) {
+        case URX_NOP:
+            // Dropped: dst is not advanced, so the next op overwrites this slot.
+            break;
+
+        case URX_STATE_SAVE:
+        case URX_JMP:
+        case URX_CTR_LOOP:
+        case URX_CTR_LOOP_NG:
+        case URX_CTR_LOOP_P:
+        case URX_RELOC_OPRND:
+        case URX_JMPX:
+        case URX_JMP_SAV:
+            // These are instructions with operands that refer to code locations.
+            //   The new target is the old target less the number of NOPs that
+            //   preceded it.  A target that is itself a NOP therefore maps to
+            //   the first surviving op after it.
+            {
+                int32_t  operandAddress = URX_VAL(op);
+                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
+                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
+                op = URX_BUILD(opType, fixedOperandAddress);
+                fRXPat->fCompiledPat->setElementAt(op, dst);
+                dst++;
+                break;
+            }
+
+        default:
+            // The remaining instructions are unaltered by the relocation.
+            fRXPat->fCompiledPat->setElementAt(op, dst);
+            dst++;
+            break;
+        }
+    }
+
+    // The ops were compacted in place; everything from dst onward is a stale
+    //   copy of code that has already been moved up.  Trim it off so that the
+    //   size of the compiled pattern reflects the stripped code.
+    fRXPat->fCompiledPat->setSize(dst);
+}
+
+
+
 //----------------------------------------------------------------------------------------
 //
 //     Error         Report a rule parse error.
diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 53dfe3c02f..729319ffe8 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -112,6 +112,7 @@ private: int32_t maxMatchLength(int32_t start, int32_t end); void matchStartType(); + void stripNOPs(); UErrorCode *fStatus; diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index c7798a2688..e5ce388acb 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -17,7 +17,7 @@ U_NAMESPACE_BEGIN // // debugging support. Enable one or more of the #defines immediately following // -#ifdef DEBUG +#ifdef _DEBUG //#define REGEX_SCAN_DEBUG #define REGEX_DUMP_DEBUG #define REGEX_RUN_DEBUG @@ -70,7 +70,7 @@ enum { // the pattern. URX_FAIL = 14, // Stop match operation, No match. - URX_UNUSED = 15, + URX_JMP_SAV = 15, // Operand: JMP destination location URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B URX_BACKSLASH_G = 17, URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W @@ -168,7 +168,7 @@ enum { "DOTANY", \ "JMP", \ "FAIL", \ - "URX_UNUSED", \ + "URX_JMP_SAV", \ "URX_BACKSLASH_B", \ "URX_BACKSLASH_G", \ "URX_UNUSED_1", \ diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 87b170a1ac..51e1190b08 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -287,8 +287,6 @@ UBool RegexMatcher::find() { U_ASSERT(startPos >= 0); switch (fPattern->fStartType) { - case START_LINE: - case START_STRING: case START_NO_INFO: // No optimization was found. // Try a match at each input position. @@ -346,6 +344,7 @@ UBool RegexMatcher::find() { } U_ASSERT(FALSE); + case START_STRING: case START_CHAR: { // Match starts on exactly one char. 
@@ -369,10 +368,48 @@ UBool RegexMatcher::find() { } } U_ASSERT(FALSE); + + case START_LINE: + { + UChar32 c; + if (startPos == 0) { + MatchAt(startPos, status); + if (U_FAILURE(status)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; + } + + for (;;) { + UChar32 c = inputBuf[startPos-1]; + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + (c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 || + c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) { + MatchAt(startPos, status); + if (U_FAILURE(status)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + return FALSE; + } + U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++]; + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + } + } + + default: + U_ASSERT(FALSE); } - - U_ASSERT(FALSE); return FALSE; } @@ -1168,6 +1205,11 @@ GC_Done: isMatch = FALSE; goto breakFromLoop; + case URX_JMP_SAV: + fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. 
+ break; + case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < frameSize-2); @@ -1542,6 +1584,7 @@ GC_Done: int32_t minML = pat[fp->fPatIdx++]; int32_t maxML = pat[fp->fPatIdx++]; int32_t continueLoc = pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); U_ASSERT(continueLoc > fp->fPatIdx); diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index f128584ed1..25eddaac05 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -486,6 +486,7 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_END_CAPTURE: case URX_STATE_SAVE: case URX_JMP: + case URX_JMP_SAV: case URX_BACKSLASH_B: case URX_BACKSLASH_D: case URX_BACKSLASH_Z: diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 62eefd9064..a06d23570f 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -284,6 +284,7 @@ "((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x" "astring|another[bcd]|alpha|a|[a]" "x" + # # Regexps from http://www.regexlib.com #