ICU-2422 add possessive quantifiers

X-SVN-Rev: 10869
This commit is contained in:
Andy Heninger 2003-01-20 06:25:23 +00:00
parent 7ec4d2f3e9
commit 8501288a1e
4 changed files with 156 additions and 37 deletions

View File

@ -504,13 +504,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
// - START_CAPTURE
// - START_CAPTURE n where n is stack frame offset to the capture group variables.
// - NOP, which may later be replaced by a save-state if there
// is an '|' alternation within the parens.
//
// Each capture group gets three slots in the save stack frame:
// 0: Capture Group start position (in input string being matched.)
// 1: Capture Group end position.
// 2: Start of Match-in-progress.
// The first two locations are for a completed capture group, and are
// referred to by back references and the like.
// The third location stores the capture start position when a START_CAPTURE is
// encountered. This will be promoted to a completed capture when (and if) the corresponding
// END_CAPTURE is encountered.
{
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
int32_t varsLoc = fRXPat->fFrameSize; // Reserve two slots in match stack frame.
fRXPat->fFrameSize += 2;
int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
fRXPat->fFrameSize += 3;
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
fRXPat->fCompiledPat->addElement(cop, *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
@ -701,7 +711,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// Compiles to
// 1. STATE_SAVE 3
// 2. body of stuff being iterated over
// 3. JMP 0
// 3. JMP 1
// 4. ...
//
{
@ -918,13 +928,109 @@ UBool RegexCompile::doParseActions(EParseAction action)
error(U_REGEX_UNIMPLEMENTED);
break;
case doPossesiveStar:
case doPossesivePlus:
case doPossesiveOpt:
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
// Possessive ++ quantifier.
// Compiles to
// 1. STO_SP
// 2. body of stuff being iterated over
// 3. STATE_SAVE 5
// 4. JMP 2
// 5. LD_SP
// 6. ...
//
// Note: TODO: This is pretty inefficient. A mass of saved state is built up
// then unconditionally discarded. Perhaps introduce a new opcode
//
{
// Emit the STO_SP
int32_t topLoc = blockTopLoc(TRUE);
int32_t stoLoc = fRXPat->fDataSize;
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc);
// Emit the STATE_SAVE
op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Emit the JMP
op = URX_BUILD(URX_JMP, topLoc+1);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Emit the LD_SP
op = URX_BUILD(URX_LD_SP, stoLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case doPossesiveStar:
// Possessive *+ quantifier.
// Compiles to
// 1. STO_SP loc
// 2. STATE_SAVE 5
// 3. body of stuff being iterated over
// 4. JMP 2
// 5. LD_SP loc
// 6 ...
//
{
// Reserve two slots at the top of the block.
int32_t topLoc = blockTopLoc(TRUE);
insertOp(topLoc);
// emit STO_SP loc
int32_t stoLoc = fRXPat->fDataSize;
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc);
// Emit the STATE_SAVE 5
int32_t L7 = fRXPat->fCompiledPat->size()+1;
op = URX_BUILD(URX_STATE_SAVE, L7);
fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
// Append the JMP operation.
op = URX_BUILD(URX_JMP, topLoc+1);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Emit the LD_SP loc
op = URX_BUILD(URX_LD_SP, stoLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case doPossesiveOpt:
// Possessive ?+ quantifier.
// Compiles to
// 1. STO_SP loc
// 2. STATE_SAVE 5
// 3. body of optional block
// 4. LD_SP loc
// 5. ...
//
{
// Reserve two slots at the top of the block.
int32_t topLoc = blockTopLoc(TRUE);
insertOp(topLoc);
// Emit the STO_SP
int32_t stoLoc = fRXPat->fDataSize;
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc);
// Emit the STATE_SAVE
int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
op = URX_BUILD(URX_STATE_SAVE, continueLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
// Emit the LD_SP
op = URX_BUILD(URX_LD_SP, stoLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case doMatchMode: // (?i) and similar
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
@ -1236,8 +1342,8 @@ void RegexCompile::handleCloseParen() {
{
int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
int32_t framVarLocation = URX_VAL(captureOp);
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, framVarLocation+1);
int32_t frameVarLocation = URX_VAL(captureOp);
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
}
break;

View File

@ -133,7 +133,7 @@ enum {
"CTR_LOOP_P", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP"
"LD_SP"
//
// Convenience macros for assembling and disassembling a compiled operation.

View File

@ -25,6 +25,7 @@
#include "regeximp.h"
#include "stdio.h"
#include "malloc.h"
U_NAMESPACE_BEGIN
@ -41,7 +42,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
UErrorCode status = U_ZERO_ERROR;
fStack = new UVector32(status); // TODO: do something with status.
fData = fSmallData;
if (pat->fDataSize > sizeof(fSmallData)/sizeof(fSmallData[0])) {
if (pat->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
}
@ -206,15 +207,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
// Note: When the match engine backs out of a capture group, it sets the
// group's start position to -1. The end position is left with junk.
// So, before returning an end position, we must first check that
// the start position indicates that the group matched something.
int32_t s = fFrame->fExtra[groupOffset];
if (s != -1) {
e = fFrame->fExtra[groupOffset + 1];
}
e = fFrame->fExtra[groupOffset + 1];
}
return e;
}
@ -584,7 +577,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
printf("\n");
printf("\n");
printf(" PatLoc inputIdx char\n");
}
#endif
@ -613,11 +605,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// One iteration of the loop per pattern operation performed.
//
for (;;) {
#if 0
if (_heapchk() != _HEAPOK) {
fprintf(stderr, "Heap Trouble\n");
}
#endif
op = pat[fp->fPatIdx];
opType = URX_TYPE(op);
opValue = URX_VAL(op);
#ifdef REGEX_RUN_DEBUG
printf("inputIdx=%d inputChar=%c sp=%d ", fp->fInputIdx,
printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
fPattern->dumpOp(fp->fPatIdx);
#endif
@ -690,16 +687,23 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
isMatch = TRUE;
goto breakFromLoop;
// Start and End Capture stack frame variables are laid out like this:
// fp->fExtra[opValue] - The start of a completed capture group
// opValue+1 - The end of a completed capture group
// opValue+2 - the start of a capture group whose end
// has not yet been reached (and might not ever be).
case URX_START_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
fp->fExtra[opValue] = fp->fInputIdx;
fp->fExtra[opValue+2] = fp->fInputIdx;
break;
case URX_END_CAPTURE:
U_ASSERT(opValue > 0 && opValue < frameSize-2);
U_ASSERT(fp->fExtra[opValue-1] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fInputIdx;
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
break;
@ -1054,12 +1058,15 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
int32_t newStackSize = fData[opValue];
U_ASSERT(newStackSize <= fStack->size());
REStackFrame *newFP = (REStackFrame *)(fStack->getBuffer() + newStackSize - frameSize);
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
if (newFP == (int32_t *)fp) {
break;
}
int32_t i;
for (i=0; i<frameSize; i++) {
newFP[i] = fp[i];
newFP[i] = ((int32_t *)fp)[i];
}
fp = newFP;
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
break;

View File

@ -368,7 +368,8 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
REGEX_TESTLM("(abc)*+a", "abcabcabc", FALSE, FALSE);
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
exit(1);
@ -1234,7 +1235,17 @@ void RegexTest::Extended() {
// Atomic Grouping
REGEX_FIND("(?>.*)abc", "abcabcabc"); // no match. .* consumed entire string.
//REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0><1>abcc</1><2>ccc</2></0>ddd");
REGEX_FIND("(\\.\\d\\d(?>[1-9]?))\\d+", "1.625");
REGEX_FIND("(\\.\\d\\d(?>[1-9]?))\\d+", "1<0><1>.625</1>0</0>");
// Possessive *+
REGEX_FIND("(abc)*+a", "abcabcabc");
REGEX_FIND("(abc)*+a", "<0>abc<1>abc</1>a</0>b");
REGEX_FIND("(a*b)*+a", "<0><1>aaaab</1>a</0>aaa");
// Possessive ?+
REGEX_FIND("c?+ddd", "<0>cddd</0>");
}
@ -1272,11 +1283,6 @@ void RegexTest::Errors() {
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
// Possessive Quantifiers
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("abc?+d", 1, 5, U_REGEX_UNIMPLEMENTED);
// Attempt to use non-default flags
{
UParseError pe;