ICU-2422 look-behind op, plus some optimizations, work-in-progress.
X-SVN-Rev: 11168
This commit is contained in:
parent
2b7ba6ffc3
commit
2d3301b7f9
@ -512,6 +512,10 @@ void RegexCompile::compile(
|
||||
//
|
||||
fRXPat->fFrameSize+=2;
|
||||
|
||||
//
|
||||
//
|
||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
|
||||
//
|
||||
// A stupid bit of nonsense to prevent code coverage testing from complaining
|
||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||
@ -1907,6 +1911,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
int32_t patSegLen = end - start + 1;
|
||||
int32_t loc;
|
||||
int32_t op;
|
||||
int32_t opType;
|
||||
int32_t currentLen = 0;
|
||||
UVector32 lengthSoFar(fRXPat->fCompiledPat->size(), *fStatus);
|
||||
lengthSoFar.setSize(fRXPat->fCompiledPat->size());
|
||||
@ -1918,11 +1923,17 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
loc = start-1;
|
||||
for (loc = start; loc<=end; loc++) {
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
opType = URX_TYPE(op);
|
||||
|
||||
// The loop is advancing linearly through the pattern.
|
||||
// If the op we are now at was the destination of a branch in the pattern,
|
||||
// and that path has a shorter minimum length than the current accumulated value,
|
||||
// replace the current accumulated value.
|
||||
if (lengthSoFar.elementAti(loc) < currentLen) {
|
||||
currentLen = lengthSoFar.elementAti(loc);
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
switch (opType) {
|
||||
// Ops that don't change the total length matched
|
||||
case URX_RESERVED_OP:
|
||||
case URX_END:
|
||||
@ -1936,9 +1947,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_DOLLAR_M:
|
||||
@ -2018,18 +2026,25 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
{
|
||||
// Loop ops. These are four word instructions.
|
||||
// The jump is conditional, backwards only.
|
||||
// Loop Init Ops. These don't change the min length, but they are 4 word ops
|
||||
// so location must be updated accordingly.
|
||||
loc+=3;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
// Loop ops.
|
||||
// The jump is conditional, backwards only.
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_LA_START:
|
||||
{
|
||||
@ -2041,10 +2056,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
for (;;) {
|
||||
loc++;
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
if (URX_VAL(op) == URX_LA_START) {
|
||||
if (URX_TYPE(op) == URX_LA_START) {
|
||||
depth++;
|
||||
}
|
||||
if (URX_VAL(op) == URX_LA_END) {
|
||||
if (URX_TYPE(op) == URX_LA_END) {
|
||||
if (depth == 0) {
|
||||
break;
|
||||
}
|
||||
|
@ -83,9 +83,14 @@ enum {
|
||||
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
|
||||
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive.
|
||||
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
|
||||
// First Operand: Data loc of counter variable
|
||||
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
|
||||
// at the end of the loop.
|
||||
// 3rd Operand: Minimum count.
|
||||
// 4th Operand: Max count, -1 for unbounded.
|
||||
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
|
||||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
URX_CTR_LOOP_P = 30,
|
||||
URX_CTR_LOOP_P = 30, // Operand is loc of corresponding CTR_INIT.
|
||||
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
|
@ -1198,6 +1198,8 @@ GC_Done:
|
||||
}
|
||||
break;
|
||||
|
||||
// TODO: Possessive flavor of loop ops, or take them out if no longer needed.
|
||||
|
||||
case URX_STO_SP:
|
||||
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
|
||||
fData[opValue] = fStack->size();
|
||||
|
@ -66,6 +66,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
fFlags = other.fFlags;
|
||||
fLiteralText = other.fLiteralText;
|
||||
fBadState = other.fBadState;
|
||||
fMinMatchLen = other.fMinMatchLen;
|
||||
fMaxMatchLen = other.fMaxMatchLen;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
if (fBadState) {
|
||||
@ -111,7 +113,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
void RegexPattern::init() {
|
||||
fFlags = 0;
|
||||
fBadState = FALSE;
|
||||
fMaxCaptureDigits = 1; // TODO: calculate for real.
|
||||
fMinMatchLen = 0;
|
||||
fMaxMatchLen = -1;
|
||||
fMaxCaptureDigits = 1;
|
||||
fStaticSets = NULL;
|
||||
fMatcher = NULL;
|
||||
fFrameSize = 0;
|
||||
@ -353,6 +357,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
||||
//
|
||||
// If we don't already have a cached matcher object from a previous call
|
||||
// to split(), create one now.
|
||||
// TODO: NOT THREAD SAFE. FIX.
|
||||
//
|
||||
if (fMatcher == NULL) {
|
||||
RegexMatcher *m = matcher(input, status);
|
||||
@ -560,7 +565,9 @@ void RegexPattern::dump() const {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
|
||||
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||
|
@ -310,16 +310,30 @@ private:
|
||||
UnicodeString fPattern; // The original pattern string.
|
||||
uint32_t fFlags; // The flags used when compiling the pattern.
|
||||
//
|
||||
UVector32 *fCompiledPat; // The compiled pattern.
|
||||
UVector32 *fCompiledPat; // The compiled pattern p-code.
|
||||
UnicodeString fLiteralText; // Any literal string data from the pattern,
|
||||
// after un-escaping, for use during the match.
|
||||
|
||||
UVector *fSets; // Any UnicodeSets referenced from the pattern.
|
||||
|
||||
UBool fBadState; // True if some prior error has left this
|
||||
// RegexPattern in an unusable state.
|
||||
|
||||
int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
|
||||
// >= this value. For some patterns, this calculated
|
||||
// value may be less than the true shortest
|
||||
// possible match.
|
||||
|
||||
int32_t fMaxMatchLen; // Maximum Match Length. All matches will have length
|
||||
// <= this value. For some patterns, this calculated
|
||||
// value may be greater than the true longest
|
||||
// possible match. For patterns with unbounded
|
||||
// match length, value = -1.
|
||||
|
||||
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
|
||||
// split(), to avoid having to
|
||||
// make new ones on each call.
|
||||
// TODO: fix thread safety problems.
|
||||
|
||||
int32_t fFrameSize; // Size of a state stack frame in the
|
||||
// execution engine.
|
||||
@ -328,13 +342,13 @@ private:
|
||||
// does not go on the state stack, but has just
|
||||
// a single copy per matcher.
|
||||
|
||||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
// the group's variables in the matcher stack frame.
|
||||
|
||||
int32_t fMaxCaptureDigits;
|
||||
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
|
Loading…
Reference in New Issue
Block a user