ICU-2422 look-behind op, plus some optimizations, work-in-progress.

X-SVN-Rev: 11168
This commit is contained in:
Andy Heninger 2003-02-26 05:16:49 +00:00
parent 2b7ba6ffc3
commit 2d3301b7f9
5 changed files with 62 additions and 19 deletions

View File

@ -512,6 +512,10 @@ void RegexCompile::compile(
//
fRXPat->fFrameSize+=2;
//
//
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
//
// A stupid bit of nonsense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
@ -1907,6 +1911,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
int32_t patSegLen = end - start + 1;
int32_t loc;
int32_t op;
int32_t opType;
int32_t currentLen = 0;
UVector32 lengthSoFar(fRXPat->fCompiledPat->size(), *fStatus);
lengthSoFar.setSize(fRXPat->fCompiledPat->size());
@ -1918,11 +1923,17 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
loc = start-1;
for (loc = start; loc<=end; loc++) {
op = fRXPat->fCompiledPat->elementAti(loc);
opType = URX_TYPE(op);
// The loop is advancing linearly through the pattern.
// If the op we are now at was the destination of a branch in the pattern,
// and that path has a shorter minimum length than the current accumulated value,
// replace the current accumulated value.
if (lengthSoFar.elementAti(loc) < currentLen) {
currentLen = lengthSoFar.elementAti(loc);
}
switch (op) {
switch (opType) {
// Ops that don't change the total length matched
case URX_RESERVED_OP:
case URX_END:
@ -1936,9 +1947,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
case URX_RELOC_OPRND:
case URX_STO_INP_LOC:
case URX_DOLLAR_M:
@ -2018,18 +2026,25 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
break;
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
{
// Loop ops. These are four word instructions.
// The jump is conditional, backwards only.
// Loop Init Ops. These don't change the min length, but they are 4 word ops
// so location must be updated accordingly.
loc+=3;
}
break;
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
// Loop ops.
// The jump is conditional, backwards only.
break;
case URX_LA_START:
{
@ -2041,10 +2056,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
for (;;) {
loc++;
op = fRXPat->fCompiledPat->elementAti(loc);
if (URX_VAL(op) == URX_LA_START) {
if (URX_TYPE(op) == URX_LA_START) {
depth++;
}
if (URX_VAL(op) == URX_LA_END) {
if (URX_TYPE(op) == URX_LA_END) {
if (depth == 0) {
break;
}

View File

@ -83,9 +83,14 @@ enum {
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive.
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
// First Operand: Data loc of counter variable
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
// at the end of the loop.
// 3rd Operand: Minimum count.
// 4th Operand: Max count, -1 for unbounded.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in three flavors.
URX_CTR_LOOP_P = 30,
URX_CTR_LOOP_P = 30, // Operand is loc of corresponding CTR_INIT.
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must

View File

@ -1198,6 +1198,8 @@ GC_Done:
}
break;
// TODO: Possessive flavor of loop ops, or take them out if no longer needed.
case URX_STO_SP:
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
fData[opValue] = fStack->size();

View File

@ -66,6 +66,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fFlags = other.fFlags;
fLiteralText = other.fLiteralText;
fBadState = other.fBadState;
fMinMatchLen = other.fMinMatchLen;
fMaxMatchLen = other.fMaxMatchLen;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
if (fBadState) {
@ -111,7 +113,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
void RegexPattern::init() {
fFlags = 0;
fBadState = FALSE;
fMaxCaptureDigits = 1; // TODO: calculate for real.
fMinMatchLen = 0;
fMaxMatchLen = -1;
fMaxCaptureDigits = 1;
fStaticSets = NULL;
fMatcher = NULL;
fFrameSize = 0;
@ -353,6 +357,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
//
// If we don't already have a cached matcher object from a previous call
// to split(), create one now.
// TODO: NOT THREAD SAFE. FIX.
//
if (fMatcher == NULL) {
RegexMatcher *m = matcher(input, status);
@ -560,7 +565,9 @@ void RegexPattern::dump() const {
REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
}
REGEX_DUMP_DEBUG_PRINTF("\n");
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n" , fBadState? "no" : "yes");
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
"-------------------------------------------\n");
for (index = 0; index<fCompiledPat->size(); index++) {

View File

@ -310,16 +310,30 @@ private:
UnicodeString fPattern; // The original pattern string.
uint32_t fFlags; // The flags used when compiling the pattern.
//
UVector32 *fCompiledPat; // The compiled pattern.
UVector32 *fCompiledPat; // The compiled pattern p-code.
UnicodeString fLiteralText; // Any literal string data from the pattern,
// after un-escaping, for use during the match.
UVector *fSets; // Any UnicodeSets referenced from the pattern.
UBool fBadState; // True if some prior error has left this
// RegexPattern in an unusable state.
int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
// >= this value. For some patterns, this calculated
// value may be less than the true shortest
// possible match.
int32_t fMaxMatchLen; // Maximum Match Length. All matches will have length
// <= this value. For some patterns, this calculated
// value may be greater than the true longest
// possible match. For patterns with unbounded
// match length, value = -1.
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
// split(), to avoid having to
// make new ones on each call.
// TODO: fix thread safety problems.
int32_t fFrameSize; // Size of a state stack frame in the
// execution engine.
@ -328,13 +342,13 @@ private:
// does not go on the state stack, but has just
// a single copy per matcher.
UVector32 *fGroupMap; // Map from capture group number to position of
UVector32 *fGroupMap; // Map from capture group number to position of
// the group's variables in the matcher stack frame.
int32_t fMaxCaptureDigits;
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
/**
* The address of this static class variable serves as this class's ID