ICU-2422 add possessive quantifiers
X-SVN-Rev: 10869
This commit is contained in:
parent
7ec4d2f3e9
commit
8501288a1e
@ -504,13 +504,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
// - START_CAPTURE
|
||||
// - START_CAPTURE n where n is stack frame offset to the capture group variables.
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
// is an '|' alternation within the parens.
|
||||
//
|
||||
// Each capture group gets three slots in the save stack frame:
|
||||
// 0: Capture Group start position (in input string being matched.)
|
||||
// 1: Capture Group end position.
|
||||
// 2: Start of Match-in-progress.
|
||||
// The first two locations are for a completed capture group, and are
|
||||
// referred to by back references and the like.
|
||||
// The third location stores the capture start position when a START_CAPTURE is
|
||||
// encountered. This will be promoted to a completed capture when (and if) the corresponding
|
||||
// END_CAPTURE is encountered.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
int32_t varsLoc = fRXPat->fFrameSize; // Reserve two slots in match stack frame.
|
||||
fRXPat->fFrameSize += 2;
|
||||
int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
|
||||
fRXPat->fFrameSize += 3;
|
||||
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
|
||||
fRXPat->fCompiledPat->addElement(cop, *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
@ -701,7 +711,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// Compiles to
|
||||
// 1. STATE_SAVE 3
|
||||
// 2. body of stuff being iterated over
|
||||
// 3. JMP 0
|
||||
// 3. JMP 1
|
||||
// 4. ...
|
||||
//
|
||||
{
|
||||
@ -918,13 +928,109 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doPossesiveStar:
|
||||
case doPossesivePlus:
|
||||
case doPossesiveOpt:
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
// Possessive ++ quantifier.
|
||||
// Compiles to
|
||||
// 1. STO_SP
|
||||
// 2. body of stuff being iterated over
|
||||
// 3. STATE_SAVE 5
|
||||
// 4. JMP 2
|
||||
// 5. LD_SP
|
||||
// 6. ...
|
||||
//
|
||||
// Note: TODO: This is pretty inefficient. A mass of saved state is built up
|
||||
// then unconditionally discarded. Perhaps introduce a new opcode
|
||||
//
|
||||
{
|
||||
// Emit the STO_SP
|
||||
int32_t topLoc = blockTopLoc(TRUE);
|
||||
int32_t stoLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
|
||||
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc);
|
||||
|
||||
// Emit the STATE_SAVE
|
||||
op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
// Emit the JMP
|
||||
op = URX_BUILD(URX_JMP, topLoc+1);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
// Emit the LD_SP
|
||||
op = URX_BUILD(URX_LD_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doPossesiveStar:
|
||||
// Possessive *+ quantifier.
|
||||
// Compiles to
|
||||
// 1. STO_SP loc
|
||||
// 2. STATE_SAVE 5
|
||||
// 3. body of stuff being iterated over
|
||||
// 4. JMP 2
|
||||
// 5. LD_SP loc
|
||||
// 6 ...
|
||||
//
|
||||
{
|
||||
// Reserve two slots at the top of the block.
|
||||
int32_t topLoc = blockTopLoc(TRUE);
|
||||
insertOp(topLoc);
|
||||
|
||||
// emit STO_SP loc
|
||||
int32_t stoLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
|
||||
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc);
|
||||
|
||||
// Emit the SAVE_STATE 5
|
||||
int32_t L7 = fRXPat->fCompiledPat->size()+1;
|
||||
op = URX_BUILD(URX_STATE_SAVE, L7);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
|
||||
|
||||
// Append the JMP operation.
|
||||
op = URX_BUILD(URX_JMP, topLoc+1);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
// Emit the LD_SP loc
|
||||
op = URX_BUILD(URX_LD_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doPossesiveOpt:
|
||||
// Possessive ?+ quantifier.
|
||||
// Compiles to
|
||||
// 1. STO_SP loc
|
||||
// 2. SAVE_STATE 5
|
||||
// 3. body of optional block
|
||||
// 4. LD_SP loc
|
||||
// 5. ...
|
||||
//
|
||||
{
|
||||
// Reserve two slots at the top of the block.
|
||||
int32_t topLoc = blockTopLoc(TRUE);
|
||||
insertOp(topLoc);
|
||||
|
||||
// Emit the STO_SP
|
||||
int32_t stoLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
|
||||
int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc);
|
||||
|
||||
// Emit the SAVE_STATE
|
||||
int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
|
||||
op = URX_BUILD(URX_STATE_SAVE, continueLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
|
||||
|
||||
// Emit the LD_SP
|
||||
op = URX_BUILD(URX_LD_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doMatchMode: // (?i) and similar
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
@ -1236,8 +1342,8 @@ void RegexCompile::handleCloseParen() {
|
||||
{
|
||||
int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
|
||||
U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
|
||||
int32_t framVarLocation = URX_VAL(captureOp);
|
||||
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, framVarLocation+1);
|
||||
int32_t frameVarLocation = URX_VAL(captureOp);
|
||||
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
|
||||
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
@ -133,7 +133,7 @@ enum {
|
||||
"CTR_LOOP_P", \
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP"
|
||||
"LD_SP"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "regeximp.h"
|
||||
|
||||
#include "stdio.h"
|
||||
#include "malloc.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -41,7 +42,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fStack = new UVector32(status); // TODO: do something with status.
|
||||
fData = fSmallData;
|
||||
if (pat->fDataSize > sizeof(fSmallData)/sizeof(fSmallData[0])) {
|
||||
if (pat->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
|
||||
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
|
||||
}
|
||||
|
||||
@ -206,15 +207,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
||||
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
|
||||
U_ASSERT(groupOffset < fPattern->fFrameSize);
|
||||
U_ASSERT(groupOffset >= 0);
|
||||
|
||||
// Note: When the match engine backs out of a capture group, it sets the
|
||||
// group's start position to -1. The end position is left with junk.
|
||||
// So, before returning an end position, we must first check that
|
||||
// the start position indicates that the group matched something.
|
||||
int32_t s = fFrame->fExtra[groupOffset];
|
||||
if (s != -1) {
|
||||
e = fFrame->fExtra[groupOffset + 1];
|
||||
}
|
||||
e = fFrame->fExtra[groupOffset + 1];
|
||||
}
|
||||
return e;
|
||||
}
|
||||
@ -584,7 +577,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
}
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf(" PatLoc inputIdx char\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -613,11 +605,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
// One iteration of the loop per pattern operation performed.
|
||||
//
|
||||
for (;;) {
|
||||
#if 0
|
||||
if (_heapchk() != _HEAPOK) {
|
||||
fprintf(stderr, "Heap Trouble\n");
|
||||
}
|
||||
#endif
|
||||
op = pat[fp->fPatIdx];
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
printf("inputIdx=%d inputChar=%c sp=%d ", fp->fInputIdx,
|
||||
printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
|
||||
fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
#endif
|
||||
@ -690,16 +687,23 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
isMatch = TRUE;
|
||||
goto breakFromLoop;
|
||||
|
||||
// Start and End Capture stack frame variables are laid out like this:
|
||||
// fp->fExtra[opValue] - The start of a completed capture group
|
||||
// opValue+1 - The end of a completed capture group
|
||||
// opValue+2 - the start of a capture group whose end
|
||||
// has not yet been reached (and might not ever be).
|
||||
case URX_START_CAPTURE:
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
|
||||
fp->fExtra[opValue] = fp->fInputIdx;
|
||||
fp->fExtra[opValue+2] = fp->fInputIdx;
|
||||
break;
|
||||
|
||||
|
||||
case URX_END_CAPTURE:
|
||||
U_ASSERT(opValue > 0 && opValue < frameSize-2);
|
||||
U_ASSERT(fp->fExtra[opValue-1] >= 0); // Start pos for this group must be set.
|
||||
fp->fExtra[opValue] = fp->fInputIdx;
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
|
||||
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
|
||||
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
|
||||
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
|
||||
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
|
||||
break;
|
||||
|
||||
|
||||
@ -1054,12 +1058,15 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
|
||||
int32_t newStackSize = fData[opValue];
|
||||
U_ASSERT(newStackSize <= fStack->size());
|
||||
REStackFrame *newFP = (REStackFrame *)(fStack->getBuffer() + newStackSize - frameSize);
|
||||
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
|
||||
if (newFP == (int32_t *)fp) {
|
||||
break;
|
||||
}
|
||||
int32_t i;
|
||||
for (i=0; i<frameSize; i++) {
|
||||
newFP[i] = fp[i];
|
||||
newFP[i] = ((int32_t *)fp)[i];
|
||||
}
|
||||
fp = newFP;
|
||||
fp = (REStackFrame *)newFP;
|
||||
fStack->setSize(newStackSize);
|
||||
}
|
||||
break;
|
||||
|
@ -368,7 +368,8 @@ void RegexTest::Basic() {
|
||||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
REGEX_TESTLM("(abc)*+a", "abcabcabc", FALSE, FALSE);
|
||||
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
exit(1);
|
||||
@ -1234,7 +1235,17 @@ void RegexTest::Extended() {
|
||||
|
||||
// Atomic Grouping
|
||||
REGEX_FIND("(?>.*)abc", "abcabcabc"); // no match. .* consumed entire string.
|
||||
//REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0><1>abcc</1><2>ccc</2></0>ddd");
|
||||
REGEX_FIND("(\\.\\d\\d(?>[1-9]?))\\d+", "1.625");
|
||||
REGEX_FIND("(\\.\\d\\d(?>[1-9]?))\\d+", "1<0><1>.625</1>0</0>");
|
||||
|
||||
// Possessive *+
|
||||
REGEX_FIND("(abc)*+a", "abcabcabc");
|
||||
REGEX_FIND("(abc)*+a", "<0>abc<1>abc</1>a</0>b");
|
||||
REGEX_FIND("(a*b)*+a", "<0><1>aaaab</1>a</0>aaa");
|
||||
|
||||
// Possessive ?+
|
||||
REGEX_FIND("c?+ddd", "<0>cddd</0>");
|
||||
|
||||
}
|
||||
|
||||
@ -1272,11 +1283,6 @@ void RegexTest::Errors() {
|
||||
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
|
||||
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
|
||||
|
||||
// Possessive Quantifiers
|
||||
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("abc?+d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Attempt to use non-default flags
|
||||
{
|
||||
UParseError pe;
|
||||
|
Loading…
Reference in New Issue
Block a user