ICU-2422 Regexp, optimizing find() operations, work in progress.
X-SVN-Rev: 11290
This commit is contained in:
parent
708317c997
commit
fef34e930e
@ -520,6 +520,12 @@ void RegexCompile::compile(
|
|||||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||||
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Optimization pass: Categorize how a match can start, for use by find()
|
||||||
|
//
|
||||||
|
matchStartType();
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// A stupid bit of non-sense to prevent code coverage testing from complaining
|
// A stupid bit of non-sense to prevent code coverage testing from complaining
|
||||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||||
@ -2073,24 +2079,27 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
|||||||
// matchStartType Determine how a match can start.
|
// matchStartType Determine how a match can start.
|
||||||
// Used to optimize find() operations.
|
// Used to optimize find() operations.
|
||||||
//
|
//
|
||||||
|
// Operation is very similar to minMatchLength(). Walk the compiled
|
||||||
|
// pattern, keeping an on-going minimum-match-length. For any
|
||||||
|
// op where the min match coming in is zero, add that op's possible
|
||||||
|
// starting matches to the possible starts for the overall pattern.
|
||||||
|
//
|
||||||
//----------------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------------
|
||||||
int32_t RegexCompile::matchStartType() {
|
void RegexCompile::matchStartType() {
|
||||||
if (U_FAILURE(*fStatus)) {
|
if (U_FAILURE(*fStatus)) {
|
||||||
return 0;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int32_t loc;
|
int32_t loc; // Location in the pattern of the current op being processed.
|
||||||
int32_t op;
|
int32_t op; // The op being processed
|
||||||
int32_t opType;
|
int32_t opType; // The opcode type of the op
|
||||||
int32_t currentLen = 0;
|
int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
|
||||||
|
int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
|
||||||
|
|
||||||
UnicodeSet startingChars;
|
UBool atStart = TRUE; // True if no part of the pattern yet encountered
|
||||||
int32_t startStringIndex;
|
// could have advanced the position in a match.
|
||||||
int32_t startStringLen;
|
// (Maximum match length so far == 0)
|
||||||
|
|
||||||
UBool atStart = TRUE; // True if no part of the pattern yet encountered
|
|
||||||
// could have advanced the position in a match.
|
|
||||||
|
|
||||||
// forwardedLength is a vector holding minimum-match-length values that
|
// forwardedLength is a vector holding minimum-match-length values that
|
||||||
// are propagated forward in the pattern by JMP or STATE_SAVE operations.
|
// are propagated forward in the pattern by JMP or STATE_SAVE operations.
|
||||||
@ -2155,19 +2164,19 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
if (currentLen == 0) {
|
if (currentLen == 0) {
|
||||||
// This character could appear at the start of a match.
|
// This character could appear at the start of a match.
|
||||||
// Add it to the set of possible starting characters.
|
// Add it to the set of possible starting characters.
|
||||||
startingChars.add(URX_VAL(op));
|
fRXPat->fInitialChars->add(URX_VAL(op));
|
||||||
}
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
atStart = FALSE;
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
case URX_SETREF: // TODO: Sense of op, invert the set
|
case URX_SETREF:
|
||||||
if (currentLen == 0) {
|
if (currentLen == 0) {
|
||||||
int32_t sn = URX_VAL(op);
|
int32_t sn = URX_VAL(op);
|
||||||
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
|
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
|
||||||
const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
|
const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
|
||||||
startingChars.addAll(*s);
|
fRXPat->fInitialChars->addAll(*s);
|
||||||
}
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
atStart = FALSE;
|
atStart = FALSE;
|
||||||
@ -2183,9 +2192,9 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
if (negated) {
|
if (negated) {
|
||||||
UnicodeSet sc(*s);
|
UnicodeSet sc(*s);
|
||||||
sc.complement();
|
sc.complement();
|
||||||
startingChars.addAll(sc);
|
fRXPat->fInitialChars->addAll(sc);
|
||||||
} else {
|
} else {
|
||||||
startingChars.addAll(*s);
|
fRXPat->fInitialChars->addAll(*s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
@ -2197,9 +2206,12 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
case URX_BACKSLASH_D:
|
case URX_BACKSLASH_D:
|
||||||
// Digit Char
|
// Digit Char
|
||||||
if (currentLen == 0) {
|
if (currentLen == 0) {
|
||||||
UnicodeSet s; // TODO: sense of op, invert the set.
|
UnicodeSet s;
|
||||||
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
|
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
|
||||||
startingChars.addAll(s);
|
if (URX_VAL(op) != 0) {
|
||||||
|
s.complement();
|
||||||
|
}
|
||||||
|
fRXPat->fInitialChars->addAll(s);
|
||||||
}
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
atStart = FALSE;
|
atStart = FALSE;
|
||||||
@ -2215,21 +2227,27 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
// to the set of possible starting match chars.
|
// to the set of possible starting match chars.
|
||||||
UnicodeSet s(c, c);
|
UnicodeSet s(c, c);
|
||||||
s.closeOver(USET_CASE);
|
s.closeOver(USET_CASE);
|
||||||
startingChars.addAll(s);
|
fRXPat->fInitialChars->addAll(s);
|
||||||
} else {
|
} else {
|
||||||
// Char has no case variants. Just add it as-is to the
|
// Char has no case variants. Just add it as-is to the
|
||||||
// set of possible starting chars.
|
// set of possible starting chars.
|
||||||
startingChars.add(c);
|
fRXPat->fInitialChars->add(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
atStart = FALSE;
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case URX_BACKSLASH_W:
|
|
||||||
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
||||||
case URX_DOTANY_ALL: // . matches one or two.
|
case URX_DOTANY_ALL: // . matches one or two.
|
||||||
case URX_DOTANY:
|
case URX_DOTANY:
|
||||||
|
if (currentLen == 0) {
|
||||||
|
// These constructs are all bad news when they appear at the start
|
||||||
|
// of a match. Any character can begin the match.
|
||||||
|
fRXPat->fInitialChars->clear();
|
||||||
|
fRXPat->fInitialChars->complement();
|
||||||
|
}
|
||||||
currentLen++;
|
currentLen++;
|
||||||
atStart = FALSE;
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
@ -2252,12 +2270,14 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case URX_FAIL:
|
case URX_FAIL:
|
||||||
// Fails are kind of like a branch, except that the min length was
|
// Fails are kind of like a branch, except that the min length was
|
||||||
// propagated already, by the state save.
|
// propagated already, by the state save.
|
||||||
currentLen = forwardedLength.elementAti(loc+1);
|
currentLen = forwardedLength.elementAti(loc+1);
|
||||||
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
@ -2272,20 +2292,61 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
case URX_STRING:
|
case URX_STRING:
|
||||||
case URX_STRING_I:
|
|
||||||
{
|
{
|
||||||
loc++;
|
loc++;
|
||||||
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||||
currentLen += URX_VAL(stringLenOp);
|
int32_t stringLen = URX_VAL(stringLenOp);
|
||||||
|
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
|
||||||
|
U_ASSERT(stringLenOp >= 2);
|
||||||
|
if (currentLen == 0) {
|
||||||
|
// Add the starting character of this string to the set of possible starting
|
||||||
|
// characters for this pattern.
|
||||||
|
int32_t stringStartIdx = URX_VAL(op);
|
||||||
|
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
|
||||||
|
fRXPat->fInitialChars->add(c);
|
||||||
|
|
||||||
|
// Remember this string. After the entire pattern has been checked,
|
||||||
|
// if nothing else is identified that can start a match, we'll use it.
|
||||||
|
numInitialStrings++;
|
||||||
|
fRXPat->fInitialStringIdx = stringStartIdx;
|
||||||
|
fRXPat->fInitialStringLen = stringLen;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentLen += stringLen;
|
||||||
|
atStart = FALSE;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case URX_STRING_I:
|
||||||
|
{
|
||||||
|
// Case-insensitive string. Unlike exact-match strings, we won't
|
||||||
|
// attempt a string search for possible match positions. But we
|
||||||
|
// do update the set of possible starting characters.
|
||||||
|
loc++;
|
||||||
|
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||||
|
int32_t stringLen = URX_VAL(stringLenOp);
|
||||||
|
U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
|
||||||
|
U_ASSERT(stringLenOp >= 2);
|
||||||
|
if (currentLen == 0) {
|
||||||
|
// Add the starting character of this string to the set of possible starting
|
||||||
|
// characters for this pattern.
|
||||||
|
int32_t stringStartIdx = URX_VAL(op);
|
||||||
|
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
|
||||||
|
UnicodeSet s(c, c);
|
||||||
|
s.closeOver(USET_CASE);
|
||||||
|
fRXPat->fInitialChars->addAll(s);
|
||||||
|
}
|
||||||
|
currentLen += stringLen;
|
||||||
|
atStart = FALSE;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case URX_CTR_INIT:
|
case URX_CTR_INIT:
|
||||||
case URX_CTR_INIT_NG:
|
case URX_CTR_INIT_NG:
|
||||||
@ -2295,6 +2356,7 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
// so location must be updated accordingly.
|
// so location must be updated accordingly.
|
||||||
loc+=3;
|
loc+=3;
|
||||||
}
|
}
|
||||||
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
@ -2303,6 +2365,7 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
case URX_CTR_LOOP_P:
|
case URX_CTR_LOOP_P:
|
||||||
// Loop ops.
|
// Loop ops.
|
||||||
// The jump is conditional, backwards only.
|
// The jump is conditional, backwards only.
|
||||||
|
atStart = FALSE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
@ -2312,8 +2375,6 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
{
|
{
|
||||||
// Look-around. Scan forward until the matching look-ahead end,
|
// Look-around. Scan forward until the matching look-ahead end,
|
||||||
// without processing the look-around block. This is overly pessimistic.
|
// without processing the look-around block. This is overly pessimistic.
|
||||||
// TODO: Positive lookahead could recursively do the block, then continue
|
|
||||||
// with the longer of the block or the value coming in.
|
|
||||||
int32_t depth = 0;
|
int32_t depth = 0;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
loc++;
|
loc++;
|
||||||
@ -2337,8 +2398,9 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
case URX_LB_END:
|
case URX_LB_END:
|
||||||
case URX_LBN_CONT:
|
case URX_LBN_CONT:
|
||||||
case URX_LBN_END:
|
case URX_LBN_END:
|
||||||
// Only come here if the matching URX_LA_START or URX_LB_START was not in the
|
U_ASSERT(FALSE); // Shouldn't get here. These ops should be
|
||||||
// range being sized, which happens when measuring size of look-behind blocks.
|
// consumed by the scan in URX_LA_START and LB_START
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
@ -2353,8 +2415,41 @@ int32_t RegexCompile::matchStartType() {
|
|||||||
if (forwardedLength.elementAti(end+1) < currentLen) {
|
if (forwardedLength.elementAti(end+1) < currentLen) {
|
||||||
currentLen = forwardedLength.elementAti(end+1);
|
currentLen = forwardedLength.elementAti(end+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
return currentLen;
|
|
||||||
|
// Sort out what we should check for when looking for candidate match start positions.
|
||||||
|
// In order of preference,
|
||||||
|
// 1. Start of input text buffer.
|
||||||
|
// 2. A literal string.
|
||||||
|
// 3. Start of line in multi-line mode.
|
||||||
|
// 4. A single literal character.
|
||||||
|
// 5. A character from a set of characters.
|
||||||
|
//
|
||||||
|
if (fRXPat->fStartType == START_START) {
|
||||||
|
// Match only at the start of an input text string.
|
||||||
|
// start type is already set. We're done.
|
||||||
|
} else if (numInitialStrings == 1 && fRXPat->fInitialChars->size() == 1) {
|
||||||
|
// Match beginning only with a literal string.
|
||||||
|
UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
|
||||||
|
U_ASSERT(fRXPat->fInitialChars->contains(c));
|
||||||
|
fRXPat->fStartType = START_STRING;
|
||||||
|
} else if (fRXPat->fStartType == START_LINE) {
|
||||||
|
// Match at start of line in Multi-Line mode.
|
||||||
|
// Nothing to do here; everything is already set.
|
||||||
|
} else if (fRXPat->fInitialChars->size() == 1) {
|
||||||
|
// All matches begin with the same char.
|
||||||
|
fRXPat->fStartType = START_CHAR;
|
||||||
|
fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
|
||||||
|
U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
|
||||||
|
} else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE) {
|
||||||
|
// Matches start with a set of characters smaller than the set of all chars.
|
||||||
|
fRXPat->fStartType = START_SET;
|
||||||
|
} else {
|
||||||
|
// Matches can start with anything
|
||||||
|
fRXPat->fStartType = START_NO_INFO;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -2444,7 +2539,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||||||
case URX_SETREF:
|
case URX_SETREF:
|
||||||
case URX_BACKSLASH_D:
|
case URX_BACKSLASH_D:
|
||||||
case URX_ONECHAR_I:
|
case URX_ONECHAR_I:
|
||||||
case URX_BACKSLASH_W:
|
|
||||||
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
||||||
case URX_DOTANY_ALL: // . matches one or two.
|
case URX_DOTANY_ALL: // . matches one or two.
|
||||||
case URX_DOTANY:
|
case URX_DOTANY:
|
||||||
@ -2662,7 +2756,6 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||||||
case URX_SETREF:
|
case URX_SETREF:
|
||||||
case URX_BACKSLASH_D:
|
case URX_BACKSLASH_D:
|
||||||
case URX_ONECHAR_I:
|
case URX_ONECHAR_I:
|
||||||
case URX_BACKSLASH_W:
|
|
||||||
case URX_DOTANY_ALL:
|
case URX_DOTANY_ALL:
|
||||||
case URX_DOTANY:
|
case URX_DOTANY:
|
||||||
currentLen+=2;
|
currentLen+=2;
|
||||||
|
@ -111,7 +111,7 @@ private:
|
|||||||
int32_t end);
|
int32_t end);
|
||||||
int32_t maxMatchLength(int32_t start,
|
int32_t maxMatchLength(int32_t start,
|
||||||
int32_t end);
|
int32_t end);
|
||||||
int32_t matchStartType();
|
void matchStartType();
|
||||||
|
|
||||||
|
|
||||||
UErrorCode *fStatus;
|
UErrorCode *fStatus;
|
||||||
|
@ -71,7 +71,7 @@ enum {
|
|||||||
URX_UNUSED = 15,
|
URX_UNUSED = 15,
|
||||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||||
URX_BACKSLASH_G = 17,
|
URX_BACKSLASH_G = 17,
|
||||||
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
|
URX_UNUSED_1 = 18, // Value field: 0: \w 1: \W
|
||||||
URX_BACKSLASH_X = 19,
|
URX_BACKSLASH_X = 19,
|
||||||
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
|
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
|
||||||
|
|
||||||
@ -169,7 +169,7 @@ enum {
|
|||||||
"URX_UNUSED", \
|
"URX_UNUSED", \
|
||||||
"URX_BACKSLASH_B", \
|
"URX_BACKSLASH_B", \
|
||||||
"URX_BACKSLASH_G", \
|
"URX_BACKSLASH_G", \
|
||||||
"URX_BACKSLASH_W", \
|
"URX_UNUSED_1", \
|
||||||
"URX_BACKSLASH_X", \
|
"URX_BACKSLASH_X", \
|
||||||
"URX_BACKSLASH_Z", \
|
"URX_BACKSLASH_Z", \
|
||||||
"URX_DOTANY_ALL", \
|
"URX_DOTANY_ALL", \
|
||||||
@ -258,6 +258,14 @@ enum StartOfMatch {
|
|||||||
START_LINE, // Match starts with ^ in multi-line mode.
|
START_LINE, // Match starts with ^ in multi-line mode.
|
||||||
START_STRING // Match starts with a literal string.
|
START_STRING // Match starts with a literal string.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
|
||||||
|
(v)==START_CHAR? "START_CHAR" : \
|
||||||
|
(v)==START_SET? "START_SET" : \
|
||||||
|
(v)==START_START? "START_START" : \
|
||||||
|
(v)==START_LINE? "START_LINE" : \
|
||||||
|
(v)==START_STRING? "START_STRING" : \
|
||||||
|
"ILLEGAL")
|
||||||
|
|
||||||
U_NAMESPACE_END
|
U_NAMESPACE_END
|
||||||
#endif
|
#endif
|
||||||
|
@ -69,9 +69,13 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||||||
fMinMatchLen = other.fMinMatchLen;
|
fMinMatchLen = other.fMinMatchLen;
|
||||||
fMaxMatchLen = other.fMaxMatchLen;
|
fMaxMatchLen = other.fMaxMatchLen;
|
||||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||||
fStaticSets = other.fStaticSets;
|
fStaticSets = other.fStaticSets;
|
||||||
|
|
||||||
fStartType = other.fStartType;
|
fStartType = other.fStartType;
|
||||||
fStartInfo = other.fStartInfo;
|
fInitialStringIdx = other.fInitialStringIdx;
|
||||||
|
fInitialStringLen = other.fInitialStringLen;
|
||||||
|
fInitialChars = new UnicodeSet(*other.fInitialChars);
|
||||||
|
fInitialChar = other.fInitialChar;
|
||||||
if (fBadState) {
|
if (fBadState) {
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
@ -123,14 +127,18 @@ void RegexPattern::init() {
|
|||||||
fFrameSize = 0;
|
fFrameSize = 0;
|
||||||
fDataSize = 0;
|
fDataSize = 0;
|
||||||
fStartType = START_NO_INFO;
|
fStartType = START_NO_INFO;
|
||||||
fStartInfo = 0;
|
fInitialStringIdx = 0;
|
||||||
|
fInitialStringLen = 0;
|
||||||
|
fInitialChars = NULL;
|
||||||
|
fInitialChar = 0;
|
||||||
|
|
||||||
UErrorCode status=U_ZERO_ERROR;
|
UErrorCode status=U_ZERO_ERROR;
|
||||||
// Init of a completely new RegexPattern.
|
// Init of a completely new RegexPattern.
|
||||||
fCompiledPat = new UVector32(status);
|
fCompiledPat = new UVector32(status);
|
||||||
fGroupMap = new UVector32(status);
|
fGroupMap = new UVector32(status);
|
||||||
fSets = new UVector(status);
|
fSets = new UVector(status);
|
||||||
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
|
fInitialChars = new UnicodeSet;
|
||||||
|
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL || fInitialChars == NULL) {
|
||||||
fBadState = TRUE;
|
fBadState = TRUE;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -162,6 +170,8 @@ void RegexPattern::zap() {
|
|||||||
fSets = NULL;
|
fSets = NULL;
|
||||||
delete fGroupMap;
|
delete fGroupMap;
|
||||||
fGroupMap = NULL;
|
fGroupMap = NULL;
|
||||||
|
delete fInitialChars;
|
||||||
|
fInitialChars = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -478,7 +488,6 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||||||
case URX_JMP:
|
case URX_JMP:
|
||||||
case URX_BACKSLASH_B:
|
case URX_BACKSLASH_B:
|
||||||
case URX_BACKSLASH_D:
|
case URX_BACKSLASH_D:
|
||||||
case URX_BACKSLASH_W:
|
|
||||||
case URX_BACKSLASH_Z:
|
case URX_BACKSLASH_Z:
|
||||||
case URX_STRING_LEN:
|
case URX_STRING_LEN:
|
||||||
case URX_CTR_INIT:
|
case URX_CTR_INIT:
|
||||||
@ -576,6 +585,41 @@ void RegexPattern::dump() const {
|
|||||||
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
|
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
|
||||||
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
|
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
|
||||||
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
|
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
|
||||||
|
if (fStartType == START_STRING) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");
|
||||||
|
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if (fStartType == START_SET) {
|
||||||
|
int32_t numSetChars = fInitialChars->size();
|
||||||
|
if (numSetChars > 20) {
|
||||||
|
numSetChars = 20;
|
||||||
|
}
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF(" Match First Chars : ");
|
||||||
|
for (i=0; i<numSetChars; i++) {
|
||||||
|
UChar32 c = fInitialChars->charAt(i);
|
||||||
|
if (0x20<c && c <0x7e) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("%c ", c);
|
||||||
|
} else {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (numSetChars < fInitialChars->size()) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF(" ...");
|
||||||
|
}
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||||
|
|
||||||
|
} else if (fStartType == START_CHAR) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF(" First char of Match : ");
|
||||||
|
if (0x20 < fInitialChar && fInitialChar<0x7e) {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
|
||||||
|
} else {
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
||||||
"-------------------------------------------\n");
|
"-------------------------------------------\n");
|
||||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||||
@ -584,6 +628,8 @@ void RegexPattern::dump() const {
|
|||||||
REGEX_DUMP_DEBUG_PRINTF("\n\n");
|
REGEX_DUMP_DEBUG_PRINTF("\n\n");
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
const char RegexPattern::fgClassID = 0;
|
const char RegexPattern::fgClassID = 0;
|
||||||
|
|
||||||
//----------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------
|
||||||
|
@ -351,7 +351,10 @@ private:
|
|||||||
// regex character classes, e.g. Word.
|
// regex character classes, e.g. Word.
|
||||||
|
|
||||||
int32_t fStartType; // Info on how a match must start.
|
int32_t fStartType; // Info on how a match must start.
|
||||||
int32_t fStartInfo; // Data, interpretation depends on start type.
|
int32_t fInitialStringIdx; //
|
||||||
|
int32_t fInitialStringLen;
|
||||||
|
UnicodeSet *fInitialChars;
|
||||||
|
UChar32 fInitialChar;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The address of this static class variable serves as this class's ID
|
* The address of this static class variable serves as this class's ID
|
||||||
|
3
icu4c/source/test/testdata/regextst.txt
vendored
3
icu4c/source/test/testdata/regextst.txt
vendored
@ -279,6 +279,9 @@
|
|||||||
#"^(?:a?b?)*$" d "a--"
|
#"^(?:a?b?)*$" d "a--"
|
||||||
"^(?:a?b?)*$" "a--"
|
"^(?:a?b?)*$" "a--"
|
||||||
|
|
||||||
|
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
|
||||||
|
"((?:a|b|c)whoop-dee-do) | [jkl]|zed" "x"
|
||||||
|
"astring|another[bcd]|alpha|a|[a]" d "x"
|
||||||
|
|
||||||
#
|
#
|
||||||
# Regexps from http://www.regexlib.com
|
# Regexps from http://www.regexlib.com
|
||||||
|
Loading…
Reference in New Issue
Block a user