ICU-2422 regexp, fix a number of bugs uncovered by perl regexp tests. Some still remain.

X-SVN-Rev: 10905
This commit is contained in:
Andy Heninger 2003-01-25 18:57:42 +00:00
parent 7df773644e
commit 41e90b5773
9 changed files with 271 additions and 157 deletions

View File

@ -153,8 +153,6 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
fCharNum = 0;
fQuoteMode = FALSE;
fFreeForm = FALSE;
fMatcherDataEnd = 0;
fBackRefMax = 0;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
@ -738,27 +736,55 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doStar:
// Normal (greedy) * quantifier.
// Compiles to
// 1. STATE_SAVE 3
// 1. STATE_SAVE 4
// 2. body of stuff being iterated over
// 3. JMP 1
// 4. ...
//
// Or, if the body can match a zero-length string, to inhibit infinite loops,
// 1. STATE_SAVE 6
// 2. STO_INP_LOC data-loc
// 3. body of stuff
// 4. JMPX 1
// 5 data-loc (extra operand of JMPX)
// 6. ...
{
// location of item #1, the STATE_SAVE
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t dataLoc = -1;
if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) {
insertOp(saveStateLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
}
// Locate the position in the compiled pattern where the match will continue
// after completing the *. (4 in the comment above)
int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
if (dataLoc != -1) {
continueLoc++;
}
// Put together the save state op and store it into the compiled code.
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
// Append the URX_JMP operation to the compiled pattern. Its target
// Append the URX_JMP or URX_JMPX operation to the compiled pattern. Its target
// is the location of the state-save, above.
int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
if (dataLoc == -1) {
int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
} else {
int32_t op = URX_BUILD(URX_JMPX, saveStateLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
op = URX_BUILD(URX_RESERVED_OP, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
}
break;
@ -963,11 +989,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
for (;;) {
// Loop once per digit, for max allowed number of digits in a back reference.
groupNum = groupNum * 10 + u_charDigitValue(c);
int32_t digit = u_charDigitValue(c);
groupNum = groupNum * 10 + digit;
if (groupNum >= numCaptureGroups) {
break;
}
UChar32 c = peekCharLL();
c = peekCharLL();
if (gRuleDigits->contains(c) == FALSE) {
break;
}
@ -1284,8 +1311,11 @@ void RegexCompile::insertOp(int32_t where) {
int32_t opType = URX_TYPE(op);
int32_t opValue = URX_VAL(op);
if ((opType == URX_JMP ||
opType == URX_JMPX ||
opType == URX_STATE_SAVE ||
opType == URX_CTR_LOOP ||
opType == URX_CTR_LOOP_NG ||
opType == URX_CTR_LOOP_P ||
opType == URX_RELOC_OPRND) && opValue > where) {
// Target location for this opcode is after the insertion point and
// needs to be incremented to adjust for the insertion.
@ -1541,6 +1571,20 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
}
//----------------------------------------------------------------------------------------
//
// possibleNullMatch Test a range of compiled pattern for the possibility that it
// might match an empty string. Used to control the generation
// of extra checking code to prevent infinite loops in the match
// engine on repeated empty matches, such as might happen with
// (x?)*
// when the input string is not at an x.
//
//----------------------------------------------------------------------------------------
UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
// Placeholder implementation: the start/end range of compiled pattern is not
// examined yet, and every range is reported as possibly matching an empty string.
// As a consequence, the doStar action always takes its possibleNullMatch() branch
// and emits the loop-breaking STO_INP_LOC / JMPX form of the loop.
// for now, just return true. TODO: make a real implementation
return TRUE;
}
//----------------------------------------------------------------------------------------

View File

@ -87,6 +87,8 @@ private:
void fixLiterals(UBool split=FALSE); // Fix literal strings.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
int32_t end); // possibly matching an empty string.
UErrorCode *fStatus;
@ -152,14 +154,6 @@ private:
// -1 for the upper interval value means none
// was specified (unlimited occurrences.)
int32_t fMatcherDataEnd; // Location Counter for allocation of data
// to be used by the matcher at match time.
int32_t fBackRefMax; // Number of the largest capture group with a
// back reference. Capture groups can be forward-
// referenced, so we can't flag an error on
// a too-big back ref number until the end of the
// pattern is reached.
};
U_NAMESPACE_END

View File

@ -103,99 +103,106 @@ struct RegexTableEl {
static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 12,0, TRUE} // 2 term
, {doLiteralChar, 130, 12,0, TRUE} // 3
, {doScanUnicodeSet, 91 /* [ */, 12,0, TRUE} // 4
, {doNOP, 40 /* ( */, 20,0, TRUE} // 5
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
, {doLiteralChar, 130, 14,0, TRUE} // 3
, {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 72,0, TRUE} // 9
, {doPatFinish, 253, 2,0, FALSE} // 10
, {doRuleError, 255, 94,0, FALSE} // 11
, {doNOP, 42 /* * */, 50,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 53,0, TRUE} // 13
, {doNOP, 63 /* ? */, 56,0, TRUE} // 14
, {doIntervalInit, 123 /* { */, 59,0, TRUE} // 15
, {doNOP, 255, 17,0, FALSE} // 16
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
, {doNOP, 255, 2,0, FALSE} // 19
, {doNOP, 63 /* ? */, 22,0, TRUE} // 20 open-paren
, {doOpenCaptureParen, 255, 2, 12, FALSE} // 21
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 22 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
, {doNOP, 60 /* < */, 36,0, TRUE} // 26
, {doNOP, 35 /* # */, 39,0, TRUE} // 27
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 28
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 29
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 32
, {doConditionalExpr, 40 /* ( */, 94,0, TRUE} // 33
, {doPerlInline, 123 /* { */, 94,0, TRUE} // 34
, {doBadOpenParenType, 255, 94,0, FALSE} // 35
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 36 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 37
, {doBadOpenParenType, 255, 94,0, FALSE} // 38
, {doNOP, 41 /* ) */, 2,0, TRUE} // 39 paren-comment
, {doMismatchedParenErr, 253, 94,0, FALSE} // 40
, {doNOP, 255, 39,0, TRUE} // 41
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 42 paren-flag
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 43
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 44
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 45
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 46
, {doNOP, 41 /* ) */, 2,0, TRUE} // 47
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 48
, {doNOP, 255, 94,0, FALSE} // 49
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 50 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 51
, {doStar, 255, 17,0, FALSE} // 52
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 53 quant-plus
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 54
, {doPlus, 255, 17,0, FALSE} // 55
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 56 quant-opt
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 57
, {doOpt, 255, 17,0, FALSE} // 58
, {doNOP, 129, 59,0, TRUE} // 59 interval-open
, {doNOP, 128, 62,0, FALSE} // 60
, {doIntervalError, 255, 94,0, FALSE} // 61
, {doIntevalLowerDigit, 128, 62,0, TRUE} // 62 interval-lower
, {doNOP, 44 /* , */, 66,0, TRUE} // 63
, {doIntervalSame, 125 /* } */, 69,0, TRUE} // 64
, {doIntervalError, 255, 94,0, FALSE} // 65
, {doIntervalUpperDigit, 128, 66,0, TRUE} // 66 interval-upper
, {doNOP, 125 /* } */, 69,0, TRUE} // 67
, {doIntervalError, 255, 94,0, FALSE} // 68
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 69 interval-type
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 70
, {doInterval, 255, 17,0, FALSE} // 71
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 72 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 73
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 74
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 75
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 76
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 77
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 78
, {doProperty, 112 /* p */, 12,0, FALSE} // 79
, {doProperty, 80 /* P */, 12,0, FALSE} // 80
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 81
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 82
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 83
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 84
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 85
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 86
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 87
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 88
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 89
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 90
, {doBackRef, 128, 12,0, TRUE} // 91
, {doEscapeError, 253, 94,0, FALSE} // 92
, {doLiteralChar, 255, 12,0, TRUE} // 93
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
, {doNOP, 92 /* \ */, 79,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 101,0, FALSE} // 13
, {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 60,0, TRUE} // 15
, {doNOP, 63 /* ? */, 63,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 66,0, TRUE} // 17
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
, {doNOP, 255, 20,0, FALSE} // 19
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21
, {doNOP, 255, 2,0, FALSE} // 22
, {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
, {doNOP, 60 /* < */, 43,0, TRUE} // 33
, {doNOP, 35 /* # */, 46, 2, TRUE} // 34
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 35
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 36
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 37
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 38
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 39
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 40
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 41
, {doBadOpenParenType, 255, 101,0, FALSE} // 42
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 44
, {doBadOpenParenType, 255, 101,0, FALSE} // 45
, {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment
, {doMismatchedParenErr, 253, 101,0, FALSE} // 47
, {doNOP, 255, 46,0, TRUE} // 48
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 50
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 51
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 52
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
, {doNOP, 41 /* ) */, 2,0, TRUE} // 54
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 55
, {doNOP, 255, 101,0, FALSE} // 56
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
, {doPossesiveStar, 43 /* + */, 20,0, TRUE} // 58
, {doStar, 255, 20,0, FALSE} // 59
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 60 quant-plus
, {doPossesivePlus, 43 /* + */, 20,0, TRUE} // 61
, {doPlus, 255, 20,0, FALSE} // 62
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 63 quant-opt
, {doPossesiveOpt, 43 /* + */, 20,0, TRUE} // 64
, {doOpt, 255, 20,0, FALSE} // 65
, {doNOP, 129, 66,0, TRUE} // 66 interval-open
, {doNOP, 128, 69,0, FALSE} // 67
, {doIntervalError, 255, 101,0, FALSE} // 68
, {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower
, {doNOP, 44 /* , */, 73,0, TRUE} // 70
, {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71
, {doIntervalError, 255, 101,0, FALSE} // 72
, {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper
, {doNOP, 125 /* } */, 76,0, TRUE} // 74
, {doIntervalError, 255, 101,0, FALSE} // 75
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 76 interval-type
, {doPossesiveInterval, 43 /* + */, 20,0, TRUE} // 77
, {doInterval, 255, 20,0, FALSE} // 78
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 79 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 80
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 81
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 82
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 83
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 84
, {doNamedChar, 78 /* N */, 14,0, TRUE} // 85
, {doProperty, 112 /* p */, 14,0, FALSE} // 86
, {doProperty, 80 /* P */, 14,0, FALSE} // 87
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 88
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 89
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 90
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 91
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 92
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 93
, {doBackslashx, 120 /* x */, 14,0, TRUE} // 94
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 96
, {doOctal, 48 /* 0 */, 14,0, TRUE} // 97
, {doBackRef, 128, 14,0, TRUE} // 98
, {doEscapeError, 253, 101,0, FALSE} // 99
, {doLiteralChar, 255, 14,0, TRUE} // 100
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -208,14 +215,21 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"expr-quant",
0,
0,
0,
0,
0,
"expr-cont",
0,
0,
"open-paren-quant",
0,
"open-paren-quant2",
0,
"open-paren",
0,

View File

@ -64,14 +64,16 @@ start:
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n expr-quant doScanUnicodeSet
'(' n open-paren
'.' n expr-quant doDotAny
'^' n term doCaret
'$' n term doDollar
'\' n backslash
'|' n term doOrOperator
')' n pop doCloseParen
eof term doPatFinish
default errorDeath doRuleError
@ -86,6 +88,7 @@ expr-quant:
'+' n quant-plus
'?' n quant-opt
'{' n interval-open doIntervalInit
'(' n open-paren-quant
default expr-cont
@ -99,6 +102,21 @@ expr-cont:
default term
#
# open-paren-quant Special case handling for comments appearing before a quantifier,
# e.g. x(?#comment )*
# Open parens from expr-quant come here; anything but a (?# comment
# branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
'?' n open-paren-quant2
default open-paren
open-paren-quant2:
'#' n paren-comment ^expr-quant
default open-paren-extended
#
# open-paren We've got an open paren. We need to scan further to
# determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
@ -113,7 +131,7 @@ open-paren-extended:
'=' n term ^expr-cont doOpenLookAhead # (?=
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
'<' n open-paren-lookbehind
'#' n paren-comment
'#' n paren-comment ^term
'i' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
's' n paren-flag doMatchMode
@ -134,8 +152,8 @@ open-paren-lookbehind:
# TODO: should parens nest here? Check what perl does.
#
paren-comment:
')' n term
eof errorDeath doMismatchedParenErr
')' n pop
eof errorDeath doMismatchedParenErr
default n paren-comment
#

View File

@ -95,8 +95,16 @@ enum {
// matcher data (not stack data) to store it.
URX_LD_SP = 33, // Load the stack pointer. Operand is location
// to load from.
URX_BACKREF = 34 // Back Reference. Parameter is the index of the
URX_BACKREF = 34, // Back Reference. Parameter is the index of the
// capture group variables in the state stack frame.
URX_STO_INP_LOC = 35, // Store the input location. Operand is location
// within the matcher data (not stack).
URX_JMPX = 36 // Conditional JMP.
// First Operand: JMP target location.
// Second Operand: Data location containing an
// input position. If current input position ==
// saved input position, FAIL rather than taking
// the JMP.
};
// Keep this list of opcode names in sync with the above enum
@ -136,7 +144,9 @@ enum {
"RELOC_OPRND", \
"STO_SP", \
"LD_SP", \
"BACKREF"
"BACKREF", \
"STO_INP_LOC", \
"JMPX"
//
// Convenience macros for assembling and disassembling a compiled operation.

View File

@ -228,7 +228,9 @@ UBool RegexMatcher::find() {
// TODO: needs to go up to the very end, so a pattern that can match a zero length
// string can match at the end of a string. Can't do until loop-breaking
// is added to the engine, though, otherwise it triggers too many bugs.
for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) {
startPos = fMatchEnd;
U_ASSERT(startPos >= 0 && startPos <= fInputLength);
for (;;) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
return FALSE;
@ -236,6 +238,10 @@ UBool RegexMatcher::find() {
if (fMatch) {
return TRUE;
}
if (startPos >= fInputLength) {
break;
}
startPos = fInput->moveIndex32(startPos, 1);
}
return FALSE;
}
@ -858,17 +864,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
if (fp->fInputIdx >= fInputLength) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
opValue &= ~URX_NEG_SET;
if (fp->fInputIdx < fInputLength) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
}
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
}
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
@ -1102,6 +1110,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
break;
case URX_STO_INP_LOC:
{
U_ASSERT(opValue >= 0 && opValue < frameSize);
fp->fExtra[opValue] = fp->fInputIdx;
}
break;
case URX_JMPX:
{
int32_t instrOperandLoc = fp->fPatIdx;
fp->fPatIdx += 1;
int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
int32_t savedInputIdx = fp->fExtra[dataLoc];
U_ASSERT(savedInputIdx <= fp->fInputIdx);
if (savedInputIdx < fp->fInputIdx) {
fp->fPatIdx = opValue; // JMP
} else {
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop.
}
}
break;
default:
// Trouble. The compiled pattern contains an entry with an

View File

@ -471,6 +471,9 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_RELOC_OPRND:
case URX_STO_SP:
case URX_LD_SP:
case URX_BACKREF:
case URX_STO_INP_LOC:
case URX_JMPX:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);

View File

@ -61,7 +61,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
if (exec) Errors();
break;
case 6: name = "PerlTests";
// if (exec) PerlTests();
if (exec) PerlTests();
break;
@ -124,7 +124,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
return FALSE;
}
// REPattern->dump();
if (line==376) { REPattern->dump();}
UnicodeString inputString(inputText);
UnicodeString unEscapedInput = inputString.unescape();
@ -373,7 +373,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE);
REGEX_TESTLM("\\W", "a", FALSE, FALSE);
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
@ -1284,7 +1284,7 @@ void RegexTest::Errors() {
// Extra close paren
REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX);
REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
// Flag settings not yet implemented

View File

@ -1,4 +1,4 @@
abc abc y $& abc
abc abc y $& abc
abc abc y $-[0] 0
abc abc y $+[0] 3
abc xbc n - -
@ -557,14 +557,14 @@ $(?<=^(a)) a y $1 a
(?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) yaaxxaaaacd y $b 4
(>a+)ab aaab n - -
(?>a+)b aaab y - -
([[:]+) a:[b]: y $1 :[
([[=]+) a=[b]= y $1 =[
([[.]+) a.[b]. y $1 .[
([[:]+) a:[b]: iy $1 :[
([[=]+) a=[b]= iy $1 =[
([[.]+) a.[b]. iy $1 .[
[a[:xyz: - c - Unmatched [
[a[:xyz:] - c - POSIX class [:xyz:] unknown
[a[:]b[:c] abc y $& abc
[a[:]b[:c] abc iy $& abc
([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown
[a[:]b[:c] abc y $& abc
[a[:]b[:c] abc iy $& abc
([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul}
@ -803,31 +803,31 @@ round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz
'((?-x:.) )'x x y $1- x-
foo.bart foo.bart y - -
'^d[x][x][x]'m abcd\ndxxx y - -
.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
tt+$ xxxtt y - -
([a-\d]+) za-9z y $1 a-9
([\d-z]+) a0-za y $1 0-z
([\d-\s]+) a0- z y $1 0-
([a-[:digit:]]+) za-9z y $1 a-9
([[:digit:]-z]+) =0-z= y $1 0-z
([[:digit:]-[:alpha:]]+) =0-z= y $1 0-z
([a-[:digit:]]+) za-9z iy $1 a-9
([[:digit:]-z]+) =0-z= iy $1 0-z
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z
\GX.*X aaaXbX n - -
(\d+\.\d+) 3.1415926 y $1 3.1415926
(\ba.{0,10}br) have a web browser y $1 a web br
@ -876,7 +876,7 @@ tt+$ xxxtt y - -
(abc)?(abc)+ abc y $1:$2 :abc -
'b\s^'m a\nb\n n - -
\ba a y - -
^(a(??{"(?!)"})|(a)(?{1}))b ab y $2 a # [ID 20010811.006]
^(a(??{"(?!)"})|(a)(?{1}))b ab yi $2 a # [ID 20010811.006]
ab(?i)cd AbCd n - - # [ID 20010809.023]
ab(?i)cd abCd y - -
(A|B)*(?(1)(CD)|(CD)) CD y $2-$3 -CD
@ -921,4 +921,4 @@ ab(?i)cd abCd y - -
(.*?)(?<=[bc]) abcd y $1 ab
(.*?)(?<=[bc])c abcd y $1 ab
2(]*)?$\1 2 y $& 2
(??{}) x y - -
(??{}) x yi - -