ICU-2422 Regexp, optimizing find() operations, work in progress.
X-SVN-Rev: 11290
This commit is contained in:
parent
708317c997
commit
fef34e930e
@ -520,6 +520,12 @@ void RegexCompile::compile(
|
||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
|
||||
//
|
||||
// Optimization pass: Categorize how a match can start, for use by find()
|
||||
//
|
||||
matchStartType();
|
||||
|
||||
|
||||
//
|
||||
// A stupid bit of nonsense to prevent code coverage testing from complaining
|
||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||
@ -2073,24 +2079,27 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
||||
// matchStartType Determine how a match can start.
|
||||
// Used to optimize find() operations.
|
||||
//
|
||||
// Operation is very similar to minMatchLength(). Walk the compiled
|
||||
// pattern, keeping an on-going minimum-match-length. For any
|
||||
// op where the min match coming in is zero, add that op's possible
|
||||
// starting matches to the possible starts for the overall pattern.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::matchStartType() {
|
||||
void RegexCompile::matchStartType() {
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return 0;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int32_t loc;
|
||||
int32_t op;
|
||||
int32_t opType;
|
||||
int32_t currentLen = 0;
|
||||
int32_t loc; // Location in the pattern of the current op being processed.
|
||||
int32_t op; // The op being processed
|
||||
int32_t opType; // The opcode type of the op
|
||||
int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
|
||||
int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
|
||||
|
||||
UnicodeSet startingChars;
|
||||
int32_t startStringIndex;
|
||||
int32_t startStringLen;
|
||||
|
||||
UBool atStart = TRUE; // True if no part of the pattern yet encountered
|
||||
// could have advanced the position in a match.
|
||||
UBool atStart = TRUE; // True if no part of the pattern yet encountered
|
||||
// could have advanced the position in a match.
|
||||
// (Maximum match length so far == 0)
|
||||
|
||||
// forwardedLength is a vector holding minimum-match-length values that
|
||||
// are propagated forward in the pattern by JMP or STATE_SAVE operations.
|
||||
@ -2155,19 +2164,19 @@ int32_t RegexCompile::matchStartType() {
|
||||
if (currentLen == 0) {
|
||||
// This character could appear at the start of a match.
|
||||
// Add it to the set of possible starting characters.
|
||||
startingChars.add(URX_VAL(op));
|
||||
fRXPat->fInitialChars->add(URX_VAL(op));
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_SETREF: // TODO: Sense of op, invert the set
|
||||
case URX_SETREF:
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
|
||||
const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
|
||||
startingChars.addAll(*s);
|
||||
fRXPat->fInitialChars->addAll(*s);
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
@ -2183,9 +2192,9 @@ int32_t RegexCompile::matchStartType() {
|
||||
if (negated) {
|
||||
UnicodeSet sc(*s);
|
||||
sc.complement();
|
||||
startingChars.addAll(sc);
|
||||
fRXPat->fInitialChars->addAll(sc);
|
||||
} else {
|
||||
startingChars.addAll(*s);
|
||||
fRXPat->fInitialChars->addAll(*s);
|
||||
}
|
||||
}
|
||||
currentLen++;
|
||||
@ -2197,9 +2206,12 @@ int32_t RegexCompile::matchStartType() {
|
||||
case URX_BACKSLASH_D:
|
||||
// Digit Char
|
||||
if (currentLen == 0) {
|
||||
UnicodeSet s; // TODO: sense of op, invert the set.
|
||||
UnicodeSet s;
|
||||
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
|
||||
startingChars.addAll(s);
|
||||
if (URX_VAL(op) != 0) {
|
||||
s.complement();
|
||||
}
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
@ -2215,21 +2227,27 @@ int32_t RegexCompile::matchStartType() {
|
||||
// to the set of possible starting match chars.
|
||||
UnicodeSet s(c, c);
|
||||
s.closeOver(USET_CASE);
|
||||
startingChars.addAll(s);
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
} else {
|
||||
// Char has no case variants. Just add it as-is to the
|
||||
// set of possible starting chars.
|
||||
startingChars.add(c);
|
||||
fRXPat->fInitialChars->add(c);
|
||||
}
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_W:
|
||||
|
||||
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
if (currentLen == 0) {
|
||||
// These constructs are all bad news when they appear at the start
|
||||
// of a match. Any character can begin the match.
|
||||
fRXPat->fInitialChars->clear();
|
||||
fRXPat->fInitialChars->complement();
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
@ -2252,12 +2270,14 @@ int32_t RegexCompile::matchStartType() {
|
||||
}
|
||||
}
|
||||
}
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
case URX_FAIL:
|
||||
// Fails are kind of like a branch, except that the min length was
|
||||
// propagated already, by the state save.
|
||||
currentLen = forwardedLength.elementAti(loc+1);
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
@ -2272,20 +2292,61 @@ int32_t RegexCompile::matchStartType() {
|
||||
}
|
||||
}
|
||||
}
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
case URX_STRING:
|
||||
case URX_STRING_I:
|
||||
{
|
||||
loc++;
|
||||
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||
currentLen += URX_VAL(stringLenOp);
|
||||
int32_t stringLen = URX_VAL(stringLenOp);
|
||||
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
|
||||
U_ASSERT(stringLenOp >= 2);
|
||||
if (currentLen == 0) {
|
||||
// Add the starting character of this string to the set of possible starting
|
||||
// characters for this pattern.
|
||||
int32_t stringStartIdx = URX_VAL(op);
|
||||
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
|
||||
fRXPat->fInitialChars->add(c);
|
||||
|
||||
// Remember this string. After the entire pattern has been checked,
|
||||
// if nothing else is identified that can start a match, we'll use it.
|
||||
numInitialStrings++;
|
||||
fRXPat->fInitialStringIdx = stringStartIdx;
|
||||
fRXPat->fInitialStringLen = stringLen;
|
||||
}
|
||||
|
||||
currentLen += stringLen;
|
||||
atStart = FALSE;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_STRING_I:
|
||||
{
|
||||
// Case-insensitive string. Unlike exact-match strings, we won't
|
||||
// attempt a string search for possible match positions. But we
|
||||
// do update the set of possible starting characters.
|
||||
loc++;
|
||||
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||
int32_t stringLen = URX_VAL(stringLenOp);
|
||||
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
|
||||
U_ASSERT(stringLenOp >= 2);
|
||||
if (currentLen == 0) {
|
||||
// Add the starting character of this string to the set of possible starting
|
||||
// characters for this pattern.
|
||||
int32_t stringStartIdx = URX_VAL(op);
|
||||
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
|
||||
UnicodeSet s(c, c);
|
||||
s.closeOver(USET_CASE);
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
}
|
||||
currentLen += stringLen;
|
||||
atStart = FALSE;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
@ -2295,6 +2356,7 @@ int32_t RegexCompile::matchStartType() {
|
||||
// so location must be updated accordingly.
|
||||
loc+=3;
|
||||
}
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
@ -2303,6 +2365,7 @@ int32_t RegexCompile::matchStartType() {
|
||||
case URX_CTR_LOOP_P:
|
||||
// Loop ops.
|
||||
// The jump is conditional, backwards only.
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
@ -2312,8 +2375,6 @@ int32_t RegexCompile::matchStartType() {
|
||||
{
|
||||
// Look-around. Scan forward until the matching look-ahead end,
|
||||
// without processing the look-around block. This is overly pessimistic.
|
||||
// TODO: Positive lookahead could recursively do the block, then continue
|
||||
// with the longer of the block or the value coming in.
|
||||
int32_t depth = 0;
|
||||
for (;;) {
|
||||
loc++;
|
||||
@ -2337,8 +2398,9 @@ int32_t RegexCompile::matchStartType() {
|
||||
case URX_LB_END:
|
||||
case URX_LBN_CONT:
|
||||
case URX_LBN_END:
|
||||
// Only come here if the matching URX_LA_START or URX_LB_START was not in the
|
||||
// range being sized, which happens when measuring size of look-behind blocks.
|
||||
U_ASSERT(FALSE); // Shouldn't get here. These ops should be
|
||||
// consumed by the scan in URX_LA_START and LB_START
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -2353,8 +2415,41 @@ int32_t RegexCompile::matchStartType() {
|
||||
if (forwardedLength.elementAti(end+1) < currentLen) {
|
||||
currentLen = forwardedLength.elementAti(end+1);
|
||||
}
|
||||
|
||||
return currentLen;
|
||||
|
||||
|
||||
// Sort out what we should check for when looking for candidate match start positions.
|
||||
// In order of preference,
|
||||
// 1. Start of input text buffer.
|
||||
// 2. A literal string.
|
||||
// 3. Start of line in multi-line mode.
|
||||
// 4. A single literal character.
|
||||
// 5. A character from a set of characters.
|
||||
//
|
||||
if (fRXPat->fStartType == START_START) {
|
||||
// Match only at the start of an input text string.
|
||||
// start type is already set. We're done.
|
||||
} else if (numInitialStrings == 1 && fRXPat->fInitialChars->size() == 1) {
|
||||
// Match beginning only with a literal string.
|
||||
UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
|
||||
U_ASSERT(fRXPat->fInitialChars->contains(c));
|
||||
fRXPat->fStartType = START_STRING;
|
||||
} else if (fRXPat->fStartType == START_LINE) {
|
||||
// Match at start of line in Multi-Line mode.
|
||||
// Nothing to do here; everything is already set.
|
||||
} else if (fRXPat->fInitialChars->size() == 1) {
|
||||
// All matches begin with the same char.
|
||||
fRXPat->fStartType = START_CHAR;
|
||||
fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
|
||||
U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
|
||||
} else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE) {
|
||||
// Matches start with a set of characters smaller than the set of all chars.
|
||||
fRXPat->fStartType = START_SET;
|
||||
} else {
|
||||
// Matches can start with anything
|
||||
fRXPat->fStartType = START_NO_INFO;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -2444,7 +2539,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
@ -2662,7 +2756,6 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_DOTANY:
|
||||
currentLen+=2;
|
||||
|
@ -111,7 +111,7 @@ private:
|
||||
int32_t end);
|
||||
int32_t maxMatchLength(int32_t start,
|
||||
int32_t end);
|
||||
int32_t matchStartType();
|
||||
void matchStartType();
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
@ -71,7 +71,7 @@ enum {
|
||||
URX_UNUSED = 15,
|
||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
|
||||
URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W
|
||||
URX_BACKSLASH_X = 19,
|
||||
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
|
||||
|
||||
@ -169,7 +169,7 @@ enum {
|
||||
"URX_UNUSED", \
|
||||
"URX_BACKSLASH_B", \
|
||||
"URX_BACKSLASH_G", \
|
||||
"URX_BACKSLASH_W", \
|
||||
"URX_UNUSED_1", \
|
||||
"URX_BACKSLASH_X", \
|
||||
"URX_BACKSLASH_Z", \
|
||||
"URX_DOTANY_ALL", \
|
||||
@ -258,6 +258,14 @@ enum StartOfMatch {
|
||||
START_LINE, // Match starts with ^ in multi-line mode.
|
||||
START_STRING // Match starts with a literal string.
|
||||
};
|
||||
|
||||
#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
|
||||
(v)==START_CHAR? "START_CHAR" : \
|
||||
(v)==START_SET? "START_SET" : \
|
||||
(v)==START_START? "START_START" : \
|
||||
(v)==START_LINE? "START_LINE" : \
|
||||
(v)==START_STRING? "START_STRING" : \
|
||||
"ILLEGAL")
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
@ -69,9 +69,13 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
fMinMatchLen = other.fMinMatchLen;
|
||||
fMaxMatchLen = other.fMaxMatchLen;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStaticSets = other.fStaticSets;
|
||||
|
||||
fStartType = other.fStartType;
|
||||
fStartInfo = other.fStartInfo;
|
||||
fInitialStringIdx = other.fInitialStringIdx;
|
||||
fInitialStringLen = other.fInitialStringLen;
|
||||
fInitialChars = new UnicodeSet(*other.fInitialChars);
|
||||
fInitialChar = other.fInitialChar;
|
||||
if (fBadState) {
|
||||
return *this;
|
||||
}
|
||||
@ -123,14 +127,18 @@ void RegexPattern::init() {
|
||||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fStartType = START_NO_INFO;
|
||||
fStartInfo = 0;
|
||||
fInitialStringIdx = 0;
|
||||
fInitialStringLen = 0;
|
||||
fInitialChars = NULL;
|
||||
fInitialChar = 0;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
// Init of a completely new RegexPattern.
|
||||
fCompiledPat = new UVector32(status);
|
||||
fGroupMap = new UVector32(status);
|
||||
fSets = new UVector(status);
|
||||
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
|
||||
fCompiledPat = new UVector32(status);
|
||||
fGroupMap = new UVector32(status);
|
||||
fSets = new UVector(status);
|
||||
fInitialChars = new UnicodeSet;
|
||||
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL || fInitialChars == NULL) {
|
||||
fBadState = TRUE;
|
||||
return;
|
||||
}
|
||||
@ -162,6 +170,8 @@ void RegexPattern::zap() {
|
||||
fSets = NULL;
|
||||
delete fGroupMap;
|
||||
fGroupMap = NULL;
|
||||
delete fInitialChars;
|
||||
fInitialChars = NULL;
|
||||
}
|
||||
|
||||
|
||||
@ -478,7 +488,6 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_JMP:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_STRING_LEN:
|
||||
case URX_CTR_INIT:
|
||||
@ -576,6 +585,41 @@ void RegexPattern::dump() const {
|
||||
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
|
||||
if (fStartType == START_STRING) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");
|
||||
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
|
||||
}
|
||||
|
||||
} else if (fStartType == START_SET) {
|
||||
int32_t numSetChars = fInitialChars->size();
|
||||
if (numSetChars > 20) {
|
||||
numSetChars = 20;
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Match First Chars : ");
|
||||
for (i=0; i<numSetChars; i++) {
|
||||
UChar32 c = fInitialChars->charAt(i);
|
||||
if (0x20<c && c <0x7e) {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c ", c);
|
||||
} else {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
|
||||
}
|
||||
}
|
||||
if (numSetChars < fInitialChars->size()) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(" ...");
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||
|
||||
} else if (fStartType == START_CHAR) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(" First char of Match : ");
|
||||
if (0x20 < fInitialChar && fInitialChar<0x7e) {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
|
||||
} else {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
|
||||
}
|
||||
}
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||
@ -584,6 +628,8 @@ void RegexPattern::dump() const {
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n\n");
|
||||
};
|
||||
|
||||
|
||||
|
||||
const char RegexPattern::fgClassID = 0;
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
|
@ -351,7 +351,10 @@ private:
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
int32_t fStartType; // Info on how a match must start.
|
||||
int32_t fStartInfo; // Data, interpretation depends on start type.
|
||||
int32_t fInitialStringIdx; //
|
||||
int32_t fInitialStringLen;
|
||||
UnicodeSet *fInitialChars;
|
||||
UChar32 fInitialChar;
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
|
3
icu4c/source/test/testdata/regextst.txt
vendored
3
icu4c/source/test/testdata/regextst.txt
vendored
@ -279,6 +279,9 @@
|
||||
#"^(?:a?b?)*$" d "a--"
|
||||
"^(?:a?b?)*$" "a--"
|
||||
|
||||
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
|
||||
"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"
|
||||
"astring|another[bcd]|alpha|a|[a]" d "x"
|
||||
|
||||
#
|
||||
# Regexps from http://www.regexlib.com
|
||||
|
Loading…
Reference in New Issue
Block a user