ICU-2422 Regexp, optimizing find() operations, work in progress.

X-SVN-Rev: 11290
This commit is contained in:
Andy Heninger 2003-03-13 01:56:01 +00:00
parent 708317c997
commit fef34e930e
6 changed files with 197 additions and 44 deletions

View File

@ -520,6 +520,12 @@ void RegexCompile::compile(
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
//
// Optimization pass: Categorize how a match can start, for use by find()
//
matchStartType();
//
// A stupid bit of nonsense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
@ -2073,24 +2079,27 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
// matchStartType Determine how a match can start.
// Used to optimize find() operations.
//
// Operation is very similar to minMatchLength(). Walk the compiled
// pattern, keeping an on-going minimum-match-length. For any
// op where the min match coming in is zero, add that op's possible
// starting matches to the possible starts for the overall pattern.
//
//----------------------------------------------------------------------------------------
int32_t RegexCompile::matchStartType() {
void RegexCompile::matchStartType() {
if (U_FAILURE(*fStatus)) {
return 0;
return;
}
int32_t loc;
int32_t op;
int32_t opType;
int32_t currentLen = 0;
int32_t loc; // Location in the pattern of the current op being processed.
int32_t op; // The op being processed
int32_t opType; // The opcode type of the op
int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
UnicodeSet startingChars;
int32_t startStringIndex;
int32_t startStringLen;
UBool atStart = TRUE; // True if no part of the pattern yet encountered
// could have advanced the position in a match.
UBool atStart = TRUE; // True if no part of the pattern yet encountered
// could have advanced the position in a match.
// (Maximum match length so far == 0)
// forwardedLength is a vector holding minimum-match-length values that
// are propagated forward in the pattern by JMP or STATE_SAVE operations.
@ -2155,19 +2164,19 @@ int32_t RegexCompile::matchStartType() {
if (currentLen == 0) {
// This character could appear at the start of a match.
// Add it to the set of possible starting characters.
startingChars.add(URX_VAL(op));
fRXPat->fInitialChars->add(URX_VAL(op));
}
currentLen++;
atStart = FALSE;
break;
case URX_SETREF: // TODO: Sense of op, invert the set
case URX_SETREF:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
startingChars.addAll(*s);
fRXPat->fInitialChars->addAll(*s);
}
currentLen++;
atStart = FALSE;
@ -2183,9 +2192,9 @@ int32_t RegexCompile::matchStartType() {
if (negated) {
UnicodeSet sc(*s);
sc.complement();
startingChars.addAll(sc);
fRXPat->fInitialChars->addAll(sc);
} else {
startingChars.addAll(*s);
fRXPat->fInitialChars->addAll(*s);
}
}
currentLen++;
@ -2197,9 +2206,12 @@ int32_t RegexCompile::matchStartType() {
case URX_BACKSLASH_D:
// Digit Char
if (currentLen == 0) {
UnicodeSet s; // TODO: sense of op, invert the set.
UnicodeSet s;
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
startingChars.addAll(s);
if (URX_VAL(op) != 0) {
s.complement();
}
fRXPat->fInitialChars->addAll(s);
}
currentLen++;
atStart = FALSE;
@ -2215,21 +2227,27 @@ int32_t RegexCompile::matchStartType() {
// to the set of possible starting match chars.
UnicodeSet s(c, c);
s.closeOver(USET_CASE);
startingChars.addAll(s);
fRXPat->fInitialChars->addAll(s);
} else {
// Char has no case variants. Just add it as-is to the
// set of possible starting chars.
startingChars.add(c);
fRXPat->fInitialChars->add(c);
}
}
currentLen++;
atStart = FALSE;
break;
case URX_BACKSLASH_W:
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
if (currentLen == 0) {
// These constructs are all bad news when they appear at the start
// of a match. Any character can begin the match.
fRXPat->fInitialChars->clear();
fRXPat->fInitialChars->complement();
}
currentLen++;
atStart = FALSE;
break;
@ -2252,12 +2270,14 @@ int32_t RegexCompile::matchStartType() {
}
}
}
atStart = FALSE;
break;
case URX_FAIL:
// Fails are kind of like a branch, except that the min length was
// propagated already, by the state save.
currentLen = forwardedLength.elementAti(loc+1);
atStart = FALSE;
break;
@ -2272,20 +2292,61 @@ int32_t RegexCompile::matchStartType() {
}
}
}
atStart = FALSE;
break;
case URX_STRING:
case URX_STRING_I:
{
loc++;
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
currentLen += URX_VAL(stringLenOp);
int32_t stringLen = URX_VAL(stringLenOp);
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
U_ASSERT(stringLenOp >= 2);
if (currentLen == 0) {
// Add the starting character of this string to the set of possible starting
// characters for this pattern.
int32_t stringStartIdx = URX_VAL(op);
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
fRXPat->fInitialChars->add(c);
// Remember this string. After the entire pattern has been checked,
// if nothing else is identified that can start a match, we'll use it.
numInitialStrings++;
fRXPat->fInitialStringIdx = stringStartIdx;
fRXPat->fInitialStringLen = stringLen;
}
currentLen += stringLen;
atStart = FALSE;
}
break;
case URX_STRING_I:
{
// Case-insensitive string. Unlike exact-match strings, we won't
// attempt a string search for possible match positions. But we
// do update the set of possible starting characters.
loc++;
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
int32_t stringLen = URX_VAL(stringLenOp);
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
U_ASSERT(stringLenOp >= 2);
if (currentLen == 0) {
// Add the starting character of this string to the set of possible starting
// characters for this pattern.
int32_t stringStartIdx = URX_VAL(op);
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
UnicodeSet s(c, c);
s.closeOver(USET_CASE);
fRXPat->fInitialChars->addAll(s);
}
currentLen += stringLen;
atStart = FALSE;
}
break;
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
@ -2295,6 +2356,7 @@ int32_t RegexCompile::matchStartType() {
// so location must be updated accordingly.
loc+=3;
}
atStart = FALSE;
break;
@ -2303,6 +2365,7 @@ int32_t RegexCompile::matchStartType() {
case URX_CTR_LOOP_P:
// Loop ops.
// The jump is conditional, backwards only.
atStart = FALSE;
break;
@ -2312,8 +2375,6 @@ int32_t RegexCompile::matchStartType() {
{
// Look-around. Scan forward until the matching look-ahead end,
// without processing the look-around block. This is overly pessimistic.
// TODO: Positive lookahead could recursively do the block, then continue
// with the longer of the block or the value coming in.
int32_t depth = 0;
for (;;) {
loc++;
@ -2337,8 +2398,9 @@ int32_t RegexCompile::matchStartType() {
case URX_LB_END:
case URX_LBN_CONT:
case URX_LBN_END:
// Only come here if the matching URX_LA_START or URX_LB_START was not in the
// range being sized, which happens when measuring size of look-behind blocks.
U_ASSERT(FALSE); // Shouldn't get here. These ops should be
// consumed by the scan in URX_LA_START and LB_START
break;
default:
@ -2353,8 +2415,41 @@ int32_t RegexCompile::matchStartType() {
if (forwardedLength.elementAti(end+1) < currentLen) {
currentLen = forwardedLength.elementAti(end+1);
}
return currentLen;
// Sort out what we should check for when looking for candidate match start positions.
// In order of preference,
// 1. Start of input text buffer.
// 2. A literal string.
// 3. Start of line in multi-line mode.
// 4. A single literal character.
// 5. A character from a set of characters.
//
if (fRXPat->fStartType == START_START) {
// Match only at the start of an input text string.
// start type is already set. We're done.
} else if (numInitialStrings == 1 && fRXPat->fInitialChars->size() == 1) {
// Match beginning only with a literal string.
UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
U_ASSERT(fRXPat->fInitialChars->contains(c));
fRXPat->fStartType = START_STRING;
} else if (fRXPat->fStartType == START_LINE) {
// Match at start of line in Multi-Line mode.
// Nothing to do here; everything is already set.
} else if (fRXPat->fInitialChars->size() == 1) {
// All matches begin with the same char.
fRXPat->fStartType = START_CHAR;
fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
} else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE) {
// Matches start with a set of characters smaller than the set of all chars.
fRXPat->fStartType = START_SET;
} else {
// Matches can start with anything
fRXPat->fStartType = START_NO_INFO;
}
return;
}
@ -2444,7 +2539,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
case URX_BACKSLASH_W:
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
@ -2662,7 +2756,6 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
case URX_BACKSLASH_W:
case URX_DOTANY_ALL:
case URX_DOTANY:
currentLen+=2;

View File

@ -111,7 +111,7 @@ private:
int32_t end);
int32_t maxMatchLength(int32_t start,
int32_t end);
int32_t matchStartType();
void matchStartType();
UErrorCode *fStatus;

View File

@ -71,7 +71,7 @@ enum {
URX_UNUSED = 15,
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
URX_BACKSLASH_G = 17,
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
URX_UNUSED_1 = 18, // Unused. Was URX_BACKSLASH_W (Value field: 0: \w 1: \W)
URX_BACKSLASH_X = 19,
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
@ -169,7 +169,7 @@ enum {
"URX_UNUSED", \
"URX_BACKSLASH_B", \
"URX_BACKSLASH_G", \
"URX_BACKSLASH_W", \
"URX_UNUSED_1", \
"URX_BACKSLASH_X", \
"URX_BACKSLASH_Z", \
"URX_DOTANY_ALL", \
@ -258,6 +258,14 @@ enum StartOfMatch {
START_LINE, // Match starts with ^ in multi-line mode.
START_STRING // Match starts with a literal string.
};
// START_OF_MATCH_STR  Map a StartOfMatch enum value to its name as a string
//                     literal, for use in debug output.
//                     Evaluates to "ILLEGAL" for any value that is not one of
//                     the StartOfMatch constants tested below.
//                     Note: the argument (v) is evaluated more than once.
#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
(v)==START_CHAR? "START_CHAR" : \
(v)==START_SET? "START_SET" : \
(v)==START_START? "START_START" : \
(v)==START_LINE? "START_LINE" : \
(v)==START_STRING? "START_STRING" : \
"ILLEGAL")
U_NAMESPACE_END
#endif

View File

@ -69,9 +69,13 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fMinMatchLen = other.fMinMatchLen;
fMaxMatchLen = other.fMaxMatchLen;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
fStaticSets = other.fStaticSets;
fStartType = other.fStartType;
fStartInfo = other.fStartInfo;
fInitialStringIdx = other.fInitialStringIdx;
fInitialStringLen = other.fInitialStringLen;
fInitialChars = new UnicodeSet(*other.fInitialChars);
fInitialChar = other.fInitialChar;
if (fBadState) {
return *this;
}
@ -123,14 +127,18 @@ void RegexPattern::init() {
fFrameSize = 0;
fDataSize = 0;
fStartType = START_NO_INFO;
fStartInfo = 0;
fInitialStringIdx = 0;
fInitialStringLen = 0;
fInitialChars = NULL;
fInitialChar = 0;
UErrorCode status=U_ZERO_ERROR;
// Init of a completely new RegexPattern.
fCompiledPat = new UVector32(status);
fGroupMap = new UVector32(status);
fSets = new UVector(status);
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
fCompiledPat = new UVector32(status);
fGroupMap = new UVector32(status);
fSets = new UVector(status);
fInitialChars = new UnicodeSet;
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL || fInitialChars == NULL) {
fBadState = TRUE;
return;
}
@ -162,6 +170,8 @@ void RegexPattern::zap() {
fSets = NULL;
delete fGroupMap;
fGroupMap = NULL;
delete fInitialChars;
fInitialChars = NULL;
}
@ -478,7 +488,6 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
case URX_STRING_LEN:
case URX_CTR_INIT:
@ -576,6 +585,41 @@ void RegexPattern::dump() const {
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
if (fStartType == START_STRING) {
REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
}
} else if (fStartType == START_SET) {
int32_t numSetChars = fInitialChars->size();
if (numSetChars > 20) {
numSetChars = 20;
}
REGEX_DUMP_DEBUG_PRINTF(" Match First Chars : ");
for (i=0; i<numSetChars; i++) {
UChar32 c = fInitialChars->charAt(i);
if (0x20<c && c <0x7e) {
REGEX_DUMP_DEBUG_PRINTF("%c ", c);
} else {
REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
}
}
if (numSetChars < fInitialChars->size()) {
REGEX_DUMP_DEBUG_PRINTF(" ...");
}
REGEX_DUMP_DEBUG_PRINTF("\n");
} else if (fStartType == START_CHAR) {
REGEX_DUMP_DEBUG_PRINTF(" First char of Match : ");
if (0x20 < fInitialChar && fInitialChar<0x7e) {
REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
} else {
REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
}
}
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
"-------------------------------------------\n");
for (index = 0; index<fCompiledPat->size(); index++) {
@ -584,6 +628,8 @@ void RegexPattern::dump() const {
REGEX_DUMP_DEBUG_PRINTF("\n\n");
};
const char RegexPattern::fgClassID = 0;
//----------------------------------------------------------------------------------

View File

@ -351,7 +351,10 @@ private:
// regex character classes, e.g. Word.
int32_t fStartType; // Info on how a match must start.
int32_t fStartInfo; // Data, interpretation depends on start type.
int32_t fInitialStringIdx; // Index in fLiteralText of the literal string that can start a match.
int32_t fInitialStringLen;
UnicodeSet *fInitialChars;
UChar32 fInitialChar;
/**
* The address of this static class variable serves as this class's ID

View File

@ -279,6 +279,9 @@
#"^(?:a?b?)*$" d "a--"
"^(?:a?b?)*$" "a--"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"
"astring|another[bcd]|alpha|a|[a]" d "x"
#
# Regexps from http://www.regexlib.com