ICU-2422 Regexp, optimizing find() operations, work in progress.
X-SVN-Rev: 11250
This commit is contained in:
parent
03c212daf4
commit
f5fa67bfe3
@ -1166,7 +1166,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
break;
|
||||
|
||||
case doBackslashA:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashB:
|
||||
@ -2068,6 +2068,268 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// matchStartType Determine how a match can start.
|
||||
// Used to optimize find() operations.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::matchStartType() {
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int32_t loc;
|
||||
int32_t op;
|
||||
int32_t opType;
|
||||
int32_t currentLen = 0;
|
||||
|
||||
UnicodeSet startingChars;
|
||||
int32_t startStringIndex;
|
||||
int32_t startStringChars;
|
||||
|
||||
UBool atStart = TRUE; // True if no part of the pattern yet encountered
|
||||
// could have advanced the position in a match.
|
||||
|
||||
// forwardedLength is a vector holding minimum-match-length values that
|
||||
// are propagated forward in the pattern by JMP or STATE_SAVE operations.
|
||||
// It must be one longer than the pattern being checked because some ops
|
||||
// will jmp to a end-of-block+1 location from within a block, and we must
|
||||
// count those when checking the block.
|
||||
int32_t end = fRXPat->fCompiledPat->size();
|
||||
UVector32 forwardedLength(end+1, *fStatus);
|
||||
forwardedLength.setSize(end+1);
|
||||
for (loc=3; loc<end; loc++) {
|
||||
forwardedLength.setElementAt(INT32_MAX, loc);
|
||||
}
|
||||
|
||||
for (loc = 3; loc<end; loc++) {
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
opType = URX_TYPE(op);
|
||||
|
||||
// The loop is advancing linearly through the pattern.
|
||||
// If the op we are now at was the destination of a branch in the pattern,
|
||||
// and that path has a shorter minimum length than the current accumulated value,
|
||||
// replace the current accumulated value.
|
||||
if (forwardedLength.elementAti(loc) < currentLen) {
|
||||
currentLen = forwardedLength.elementAti(loc);
|
||||
}
|
||||
|
||||
switch (opType) {
|
||||
// Ops that don't change the total length matched
|
||||
case URX_RESERVED_OP:
|
||||
case URX_END:
|
||||
case URX_STRING_LEN:
|
||||
case URX_NOP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_DOLLAR:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_DOLLAR_M:
|
||||
case URX_BACKTRACK:
|
||||
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
|
||||
case URX_BACKREF_I:
|
||||
|
||||
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
|
||||
case URX_LD_SP:
|
||||
break;
|
||||
|
||||
case URX_CARET:
|
||||
if (atStart) {
|
||||
fRXPat->fStartType = START_START;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CARET_M:
|
||||
if (atStart) {
|
||||
fRXPat->fStartType = START_LINE;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_ONECHAR:
|
||||
if (currentLen == 0) {
|
||||
// This character could appear at the start of a match.
|
||||
// Add it to the set of possible starting characters.
|
||||
startingChars.add(URX_VAL(op));
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_SETREF: // TODO: Sense of op, invert the set
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
|
||||
const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
|
||||
startingChars.addAll(*s);
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
case URX_STATIC_SETREF: // TODO: sense of op, invert the set.
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
const UnicodeSet *s = fRXPat->fStaticSets[sn];
|
||||
startingChars.addAll(*s);
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_BACKSLASH_D:
|
||||
if (currentLen == 0) {
|
||||
UnicodeSet s; // TODO: sense of op, invert the set.
|
||||
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
|
||||
startingChars.addAll(s);
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_ONECHAR_I:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_JMP:
|
||||
case URX_JMPX:
|
||||
{
|
||||
int32_t jmpDest = URX_VAL(op);
|
||||
if (jmpDest < loc) {
|
||||
// Loop of some kind. Can safely ignore, the worst that will happen
|
||||
// is that we understate the true minimum length
|
||||
currentLen = forwardedLength.elementAti(loc+1);
|
||||
|
||||
} else {
|
||||
// Forward jump. Propagate the current min length to the target loc of the jump.
|
||||
U_ASSERT(jmpDest <= end+1);
|
||||
if (forwardedLength.elementAti(jmpDest) > currentLen) {
|
||||
forwardedLength.setElementAt(currentLen, jmpDest);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_FAIL:
|
||||
// Fails are kind of like a branch, except that the min length was
|
||||
// propagated already, by the state save.
|
||||
currentLen = forwardedLength.elementAti(loc+1);
|
||||
break;
|
||||
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
{
|
||||
// State Save, for forward jumps, propagate the current minimum.
|
||||
// of the state save.
|
||||
int32_t jmpDest = URX_VAL(op);
|
||||
if (jmpDest > loc) {
|
||||
if (currentLen < forwardedLength.elementAti(jmpDest)) {
|
||||
forwardedLength.setElementAt(currentLen, jmpDest);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
case URX_STRING:
|
||||
case URX_STRING_I:
|
||||
{
|
||||
loc++;
|
||||
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||
currentLen += URX_VAL(stringLenOp);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
{
|
||||
// Loop Init Ops. These don't change the min length, but they are 4 word ops
|
||||
// so location must be updated accordingly.
|
||||
loc+=3;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
// Loop ops.
|
||||
// The jump is conditional, backwards only.
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_LA_START:
|
||||
case URX_LB_START:
|
||||
{
|
||||
// Look-around. Scan forward until the matching look-ahead end,
|
||||
// without processing the look-around block. This is overly pessimistic.
|
||||
// TODO: Positive lookahead could recursively do the block, then continue
|
||||
// with the longer of the block or the value coming in.
|
||||
int32_t depth = 0;
|
||||
for (;;) {
|
||||
loc++;
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
|
||||
depth++;
|
||||
}
|
||||
if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
|
||||
if (depth == 0) {
|
||||
break;
|
||||
}
|
||||
depth--;
|
||||
}
|
||||
U_ASSERT(loc <= end);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_LA_END:
|
||||
case URX_LB_CONT:
|
||||
case URX_LB_END:
|
||||
case URX_LBN_CONT:
|
||||
case URX_LBN_END:
|
||||
// Only come here if the matching URX_LA_START or URX_LB_START was not in the
|
||||
// range being sized, which happens when measuring size of look-behind blocks.
|
||||
break;
|
||||
|
||||
default:
|
||||
U_ASSERT(FALSE);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// We have finished walking through the ops. Check whether some forward jump
|
||||
// propagated a shorter length to location end+1.
|
||||
if (forwardedLength.elementAti(end+1) < currentLen) {
|
||||
currentLen = forwardedLength.elementAti(end+1);
|
||||
}
|
||||
|
||||
return currentLen;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// minMatchLength Calculate the length of the shortest string that could
|
||||
@ -2128,7 +2390,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_NOP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
@ -2335,7 +2596,6 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
case URX_NOP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
|
@ -111,6 +111,7 @@ private:
|
||||
int32_t end);
|
||||
int32_t maxMatchLength(int32_t start,
|
||||
int32_t end);
|
||||
int32_t matchStartType();
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
@ -68,7 +68,7 @@ enum {
|
||||
// the pattern.
|
||||
URX_FAIL = 14, // Stop match operation, No match.
|
||||
|
||||
URX_BACKSLASH_A = 15,
|
||||
URX_UNUSED = 15,
|
||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
|
||||
@ -166,7 +166,7 @@ enum {
|
||||
"DOTANY", \
|
||||
"JMP", \
|
||||
"FAIL", \
|
||||
"URX_BACKSLASH_A", \
|
||||
"URX_UNUSED", \
|
||||
"URX_BACKSLASH_B", \
|
||||
"URX_BACKSLASH_G", \
|
||||
"URX_BACKSLASH_W", \
|
||||
@ -246,6 +246,19 @@ struct REStackFrame {
|
||||
// Locations assigned at pattern compile time.
|
||||
};
|
||||
|
||||
//
|
||||
// Start-Of-Match type. Used by find() to quickly scan to positions where a
|
||||
// match might start before firing up the full match engine.
|
||||
//
|
||||
enum StartOfMatch {
|
||||
START_NO_INFO, // No hint available. RegexPattern::init() sets fStartType to this value.
|
||||
START_CHAR, // Match starts with a literal code point.
|
||||
START_SET, // Match starts with something matching a set.
|
||||
START_START, // Match starts at start of buffer only (^ or \A). Set by matchStartType() on URX_CARET.
|
||||
START_LINE, // Match starts with ^ in multi-line mode. Set by matchStartType() on URX_CARET_M.
|
||||
START_STRING // Match starts with a literal string.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
@ -858,12 +858,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_A: // Test for start of input
|
||||
if (fp->fInputIdx != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_B: // Test for word boundaries
|
||||
{
|
||||
UBool success = isWordBoundary(fp->fInputIdx);
|
||||
|
@ -70,6 +70,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
fMaxMatchLen = other.fMaxMatchLen;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStartType = other.fStartType;
|
||||
fStartInfo = other.fStartInfo;
|
||||
if (fBadState) {
|
||||
return *this;
|
||||
}
|
||||
@ -120,6 +122,8 @@ void RegexPattern::init() {
|
||||
fMatcher = NULL;
|
||||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fStartType = START_NO_INFO;
|
||||
fStartInfo = 0;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
// Init of a completely new RegexPattern.
|
||||
@ -459,7 +463,6 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_FAIL:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_END:
|
||||
|
@ -350,6 +350,9 @@ private:
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
int32_t fStartType; // Info on how a match must start.
|
||||
int32_t fStartInfo; // Data, interpretation depends on start type.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
|
Loading…
Reference in New Issue
Block a user