ICU-2422 regexp, fix a number of bugs uncovered by perl regexp tests. Some still remain.
X-SVN-Rev: 10905
This commit is contained in:
parent
7df773644e
commit
41e90b5773
@ -153,8 +153,6 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fMatcherDataEnd = 0;
|
||||
fBackRefMax = 0;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
@ -738,27 +736,55 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
case doStar:
|
||||
// Normal (greedy) * quantifier.
|
||||
// Compiles to
|
||||
// 1. STATE_SAVE 3
|
||||
// 1. STATE_SAVE 4
|
||||
// 2. body of stuff being iterated over
|
||||
// 3. JMP 1
|
||||
// 4. ...
|
||||
//
|
||||
// Or, if the body can match a zero-length string, to inhibit infinite loops,
|
||||
// 1. STATE_SAVE 6
|
||||
// 2. POS_SAVE data-loc
|
||||
// 3. body of stuff
|
||||
// 4. JMPX 1
|
||||
// 5 data-loc (extra operand of JMPX)
|
||||
// 6. ...
|
||||
{
|
||||
// location of item #1, the STATE_SAVE
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t dataLoc = -1;
|
||||
|
||||
if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) {
|
||||
insertOp(saveStateLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
|
||||
int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
|
||||
}
|
||||
|
||||
// Locate the position in the compiled pattern where the match will continue
|
||||
// after completing the *. (4 in the comment above)
|
||||
int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
|
||||
if (dataLoc != -1) {
|
||||
continueLoc++;
|
||||
}
|
||||
|
||||
// Put together the save state op and store it into the compiled code.
|
||||
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
|
||||
|
||||
// Append the URX_JMP operation to the compiled pattern. Its target
|
||||
// Append the URX_JMP or URX_JMPX operation to the compiled pattern. Its target
|
||||
// is the location of the state-save, above.
|
||||
int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
|
||||
fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
|
||||
if (dataLoc == -1) {
|
||||
int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
|
||||
fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
|
||||
} else {
|
||||
int32_t op = URX_BUILD(URX_JMPX, saveStateLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
op = URX_BUILD(URX_RESERVED_OP, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
|
||||
@ -963,11 +989,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
|
||||
for (;;) {
|
||||
// Loop once per digit, for max allowed number of digits in a back reference.
|
||||
groupNum = groupNum * 10 + u_charDigitValue(c);
|
||||
int32_t digit = u_charDigitValue(c);
|
||||
groupNum = groupNum * 10 + digit;
|
||||
if (groupNum >= numCaptureGroups) {
|
||||
break;
|
||||
}
|
||||
UChar32 c = peekCharLL();
|
||||
c = peekCharLL();
|
||||
if (gRuleDigits->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
@ -1284,8 +1311,11 @@ void RegexCompile::insertOp(int32_t where) {
|
||||
int32_t opType = URX_TYPE(op);
|
||||
int32_t opValue = URX_VAL(op);
|
||||
if ((opType == URX_JMP ||
|
||||
opType == URX_JMPX ||
|
||||
opType == URX_STATE_SAVE ||
|
||||
opType == URX_CTR_LOOP ||
|
||||
opType == URX_CTR_LOOP_NG ||
|
||||
opType == URX_CTR_LOOP_P ||
|
||||
opType == URX_RELOC_OPRND) && opValue > where) {
|
||||
// Target location for this opcode is after the insertion point and
|
||||
// needs to be incremented to adjust for the insertion.
|
||||
@ -1541,6 +1571,20 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// possibleNullMatch Test a range of compiled pattern for the possibility that it
|
||||
// might match an empty string. Used to control the generation
|
||||
// of extra checking code to prevent infinite loops in the match
|
||||
// engine on repeated empty matches, such as might happen with
|
||||
// (x?)*
|
||||
// when the input string is not at an x.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
||||
// Placeholder: neither start nor end is examined, so every range of compiled
// pattern is reported as possibly matching an empty string.
// For now, just return TRUE.  TODO: make a real implementation.
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
|
@ -87,6 +87,8 @@ private:
|
||||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
void insertOp(int32_t where); // Open up a slot for a new op in the
|
||||
// generated code at the specified location.
|
||||
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
|
||||
int32_t end); // for possibly matching an empty string.
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
@ -152,14 +154,6 @@ private:
|
||||
// -1 for the upper interval value means none
|
||||
// was specified (unlimited occurrences.)
|
||||
|
||||
int32_t fMatcherDataEnd; // Location Counter for allocation of data
|
||||
// to be used by the matcher at match time.
|
||||
|
||||
int32_t fBackRefMax; // Number of the largest capture group with a
|
||||
// back reference. Capture groups can be forward-
|
||||
// referenced, so we can't flag an error on
|
||||
// a too-big back ref number until the end of the
|
||||
// pattern is reached.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -103,99 +103,106 @@ struct RegexTableEl {
|
||||
static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 2,0, FALSE} // 1 start
|
||||
, {doLiteralChar, 254, 12,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 130, 12,0, TRUE} // 3
|
||||
, {doScanUnicodeSet, 91 /* [ */, 12,0, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 20,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
|
||||
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 130, 14,0, TRUE} // 3
|
||||
, {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 72,0, TRUE} // 9
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 10
|
||||
, {doRuleError, 255, 94,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 50,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 53,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 56,0, TRUE} // 14
|
||||
, {doIntervalInit, 123 /* { */, 59,0, TRUE} // 15
|
||||
, {doNOP, 255, 17,0, FALSE} // 16
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
|
||||
, {doNOP, 255, 2,0, FALSE} // 19
|
||||
, {doNOP, 63 /* ? */, 22,0, TRUE} // 20 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 12, FALSE} // 21
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 22 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
|
||||
, {doNOP, 60 /* < */, 36,0, TRUE} // 26
|
||||
, {doNOP, 35 /* # */, 39,0, TRUE} // 27
|
||||
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 28
|
||||
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 29
|
||||
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 32
|
||||
, {doConditionalExpr, 40 /* ( */, 94,0, TRUE} // 33
|
||||
, {doPerlInline, 123 /* { */, 94,0, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 94,0, FALSE} // 35
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 36 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 37
|
||||
, {doBadOpenParenType, 255, 94,0, FALSE} // 38
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 39 paren-comment
|
||||
, {doMismatchedParenErr, 253, 94,0, FALSE} // 40
|
||||
, {doNOP, 255, 39,0, TRUE} // 41
|
||||
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 42 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 43
|
||||
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 44
|
||||
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 45
|
||||
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 46
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 47
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 48
|
||||
, {doNOP, 255, 94,0, FALSE} // 49
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 50 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 51
|
||||
, {doStar, 255, 17,0, FALSE} // 52
|
||||
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 53 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 54
|
||||
, {doPlus, 255, 17,0, FALSE} // 55
|
||||
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 56 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 57
|
||||
, {doOpt, 255, 17,0, FALSE} // 58
|
||||
, {doNOP, 129, 59,0, TRUE} // 59 interval-open
|
||||
, {doNOP, 128, 62,0, FALSE} // 60
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 61
|
||||
, {doIntevalLowerDigit, 128, 62,0, TRUE} // 62 interval-lower
|
||||
, {doNOP, 44 /* , */, 66,0, TRUE} // 63
|
||||
, {doIntervalSame, 125 /* } */, 69,0, TRUE} // 64
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 65
|
||||
, {doIntervalUpperDigit, 128, 66,0, TRUE} // 66 interval-upper
|
||||
, {doNOP, 125 /* } */, 69,0, TRUE} // 67
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 68
|
||||
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 69 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 70
|
||||
, {doInterval, 255, 17,0, FALSE} // 71
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 72 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 73
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 74
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 75
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 76
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 77
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 78
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 79
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 80
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 81
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 82
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 83
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 84
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 85
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 86
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 87
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 88
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 89
|
||||
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 90
|
||||
, {doBackRef, 128, 12,0, TRUE} // 91
|
||||
, {doEscapeError, 253, 94,0, FALSE} // 92
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 93
|
||||
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
|
||||
, {doNOP, 92 /* \ */, 79,0, TRUE} // 9
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 12
|
||||
, {doRuleError, 255, 101,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 60,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 63,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 66,0, TRUE} // 17
|
||||
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
|
||||
, {doNOP, 255, 20,0, FALSE} // 19
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21
|
||||
, {doNOP, 255, 2,0, FALSE} // 22
|
||||
, {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doNOP, 255, 27,0, FALSE} // 24
|
||||
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 255, 29,0, FALSE} // 26
|
||||
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 43,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 46, 2, TRUE} // 34
|
||||
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 35
|
||||
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 36
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 37
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 38
|
||||
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 39
|
||||
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 40
|
||||
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 41
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 42
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 45
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment
|
||||
, {doMismatchedParenErr, 253, 101,0, FALSE} // 47
|
||||
, {doNOP, 255, 46,0, TRUE} // 48
|
||||
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 50
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 52
|
||||
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 54
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 55
|
||||
, {doNOP, 255, 101,0, FALSE} // 56
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 20,0, TRUE} // 58
|
||||
, {doStar, 255, 20,0, FALSE} // 59
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 60 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 20,0, TRUE} // 61
|
||||
, {doPlus, 255, 20,0, FALSE} // 62
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 63 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 20,0, TRUE} // 64
|
||||
, {doOpt, 255, 20,0, FALSE} // 65
|
||||
, {doNOP, 129, 66,0, TRUE} // 66 interval-open
|
||||
, {doNOP, 128, 69,0, FALSE} // 67
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 68
|
||||
, {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower
|
||||
, {doNOP, 44 /* , */, 73,0, TRUE} // 70
|
||||
, {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 72
|
||||
, {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper
|
||||
, {doNOP, 125 /* } */, 76,0, TRUE} // 74
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 75
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 76 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 20,0, TRUE} // 77
|
||||
, {doInterval, 255, 20,0, FALSE} // 78
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 79 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 80
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 81
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 82
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 83
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 84
|
||||
, {doNamedChar, 78 /* N */, 14,0, TRUE} // 85
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 86
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 87
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 88
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 89
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 90
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 91
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 92
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 93
|
||||
, {doBackslashx, 120 /* x */, 14,0, TRUE} // 94
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 96
|
||||
, {doOctal, 48 /* 0 */, 14,0, TRUE} // 97
|
||||
, {doBackRef, 128, 14,0, TRUE} // 98
|
||||
, {doEscapeError, 253, 101,0, FALSE} // 99
|
||||
, {doLiteralChar, 255, 14,0, TRUE} // 100
|
||||
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
@ -208,14 +215,21 @@ static const char * const RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"expr-quant",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"expr-cont",
|
||||
0,
|
||||
0,
|
||||
"open-paren-quant",
|
||||
0,
|
||||
"open-paren-quant2",
|
||||
0,
|
||||
"open-paren",
|
||||
0,
|
||||
|
@ -64,14 +64,16 @@ start:
|
||||
# term. At a position where we can accept the start of most items in a pattern.
|
||||
#
|
||||
term:
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren
|
||||
'.' n expr-quant doDotAny
|
||||
'^' n term doCaret
|
||||
'$' n term doDollar
|
||||
'\' n backslash
|
||||
'|' n term doOrOperator
|
||||
')' n pop doCloseParen
|
||||
eof term doPatFinish
|
||||
default errorDeath doRuleError
|
||||
|
||||
@ -86,6 +88,7 @@ expr-quant:
|
||||
'+' n quant-plus
|
||||
'?' n quant-opt
|
||||
'{' n interval-open doIntervalInit
|
||||
'(' n open-paren-quant
|
||||
default expr-cont
|
||||
|
||||
|
||||
@ -99,6 +102,21 @@ expr-cont:
|
||||
default term
|
||||
|
||||
|
||||
#
|
||||
# open-paren-quant Special case handling for comments appearing before a quantifier,
|
||||
# e.g. x(?#comment )*
|
||||
# Open parens from expr-quant come here; anything but a (?# comment
|
||||
# branches into the normal parenthesis sequence as quickly as possible.
|
||||
#
|
||||
open-paren-quant:
|
||||
'?' n open-paren-quant2
|
||||
default open-paren
|
||||
|
||||
open-paren-quant2:
|
||||
'#' n paren-comment ^expr-quant
|
||||
default open-paren-extended
|
||||
|
||||
|
||||
#
|
||||
# open-paren We've got an open paren. We need to scan further to
|
||||
# determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
|
||||
@ -113,7 +131,7 @@ open-paren-extended:
|
||||
'=' n term ^expr-cont doOpenLookAhead # (?=
|
||||
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
|
||||
'<' n open-paren-lookbehind
|
||||
'#' n paren-comment
|
||||
'#' n paren-comment ^term
|
||||
'i' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
@ -134,8 +152,8 @@ open-paren-lookbehind:
|
||||
# TODO: should parens nest here? Check what perl does.
|
||||
#
|
||||
paren-comment:
|
||||
')' n term
|
||||
eof errorDeath doMismatchedParenErr
|
||||
')' n pop
|
||||
eof errorDeath doMismatchedParenErr
|
||||
default n paren-comment
|
||||
|
||||
#
|
||||
|
@ -95,8 +95,16 @@ enum {
|
||||
// matcher data (not stack data) to store it.
|
||||
URX_LD_SP = 33, // Load the stack pointer. Operand is location
|
||||
// to load from.
|
||||
URX_BACKREF = 34 // Back Reference. Parameter is the index of the
|
||||
URX_BACKREF = 34, // Back Reference. Parameter is the index of the
|
||||
// capture group variables in the state stack frame.
|
||||
URX_STO_INP_LOC = 35, // Store the input location. Operand is location
|
||||
// within the state stack frame (fExtra), not the matcher data.
|
||||
URX_JMPX = 36 // Conditional JMP.
|
||||
// First Operand: JMP target location.
|
||||
// Second Operand: Data location containing an
|
||||
// input position. If current input position ==
|
||||
// saved input position, FAIL rather than taking
|
||||
// the JMP.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
@ -136,7 +144,9 @@ enum {
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP", \
|
||||
"BACKREF"
|
||||
"BACKREF", \
|
||||
"STO_INP_LOC", \
|
||||
"JMPX"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -228,7 +228,9 @@ UBool RegexMatcher::find() {
|
||||
// TODO: needs to go up to the very end, so a pattern that can match a zero length
|
||||
// string can match at the end of a string. Can't do until loop-breaking
|
||||
// is added to the engine, though, otherwise it triggers too many bugs.
|
||||
for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) {
|
||||
startPos = fMatchEnd;
|
||||
U_ASSERT(startPos >= 0 && startPos <= fInputLength);
|
||||
for (;;) {
|
||||
MatchAt(startPos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
@ -236,6 +238,10 @@ UBool RegexMatcher::find() {
|
||||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
if (startPos >= fInputLength) {
|
||||
break;
|
||||
}
|
||||
startPos = fInput->moveIndex32(startPos, 1);
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
@ -858,17 +864,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
// The high bit of the op value is a flag for the match polarity.
|
||||
// 0: success if input char is in set.
|
||||
// 1: success if input char is not in set.
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
|
||||
opValue &= ~URX_NEG_SET;
|
||||
if (fp->fInputIdx < fInputLength) {
|
||||
// There is input left. Pick up one char and test it for set membership.
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
@ -1102,6 +1110,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_STO_INP_LOC:
|
||||
{
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize);
|
||||
fp->fExtra[opValue] = fp->fInputIdx;
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_JMPX:
|
||||
{
|
||||
int32_t instrOperandLoc = fp->fPatIdx;
|
||||
fp->fPatIdx += 1;
|
||||
int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
|
||||
U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
|
||||
int32_t savedInputIdx = fp->fExtra[dataLoc];
|
||||
U_ASSERT(savedInputIdx <= fp->fInputIdx);
|
||||
if (savedInputIdx < fp->fInputIdx) {
|
||||
fp->fPatIdx = opValue; // JMP
|
||||
} else {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop.
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
|
@ -471,6 +471,9 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_SP:
|
||||
case URX_LD_SP:
|
||||
case URX_BACKREF:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_JMPX:
|
||||
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
|
@ -61,7 +61,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
||||
if (exec) Errors();
|
||||
break;
|
||||
case 6: name = "PerlTests";
|
||||
// if (exec) PerlTests();
|
||||
if (exec) PerlTests();
|
||||
break;
|
||||
|
||||
|
||||
@ -124,7 +124,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
||||
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
|
||||
return FALSE;
|
||||
}
|
||||
// REPattern->dump();
|
||||
if (line==376) { REPattern->dump();}
|
||||
|
||||
UnicodeString inputString(inputText);
|
||||
UnicodeString unEscapedInput = inputString.unescape();
|
||||
@ -373,7 +373,7 @@ void RegexTest::Basic() {
|
||||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE);
|
||||
REGEX_TESTLM("\\W", "a", FALSE, FALSE);
|
||||
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
@ -1284,7 +1284,7 @@ void RegexTest::Errors() {
|
||||
|
||||
// Extra close paren
|
||||
REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
|
||||
|
||||
// Flag settings not yet implemented
|
||||
|
58
icu4c/source/test/testdata/re_tests.txt
vendored
58
icu4c/source/test/testdata/re_tests.txt
vendored
@ -1,4 +1,4 @@
|
||||
abc abc y $& abc
|
||||
abc abc y $& abc
|
||||
abc abc y $-[0] 0
|
||||
abc abc y $+[0] 3
|
||||
abc xbc n - -
|
||||
@ -557,14 +557,14 @@ $(?<=^(a)) a y $1 a
|
||||
(?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) yaaxxaaaacd y $b 4
|
||||
(>a+)ab aaab n - -
|
||||
(?>a+)b aaab y - -
|
||||
([[:]+) a:[b]: y $1 :[
|
||||
([[=]+) a=[b]= y $1 =[
|
||||
([[.]+) a.[b]. y $1 .[
|
||||
([[:]+) a:[b]: iy $1 :[
|
||||
([[=]+) a=[b]= iy $1 =[
|
||||
([[.]+) a.[b]. iy $1 .[
|
||||
[a[:xyz: - c - Unmatched [
|
||||
[a[:xyz:] - c - POSIX class [:xyz:] unknown
|
||||
[a[:]b[:c] abc y $& abc
|
||||
[a[:]b[:c] abc iy $& abc
|
||||
([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown
|
||||
[a[:]b[:c] abc y $& abc
|
||||
[a[:]b[:c] abc iy $& abc
|
||||
([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
|
||||
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
|
||||
([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul}
|
||||
@ -803,31 +803,31 @@ round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz
|
||||
'((?-x:.) )'x x y $1- x-
|
||||
foo.bart foo.bart y - -
|
||||
'^d[x][x][x]'m abcd\ndxxx y - -
|
||||
.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - -
|
||||
.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
|
||||
.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - -
|
||||
.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
tt+$ xxxtt y - -
|
||||
([a-\d]+) za-9z y $1 a-9
|
||||
([\d-z]+) a0-za y $1 0-z
|
||||
([\d-\s]+) a0- z y $1 0-
|
||||
([a-[:digit:]]+) za-9z y $1 a-9
|
||||
([[:digit:]-z]+) =0-z= y $1 0-z
|
||||
([[:digit:]-[:alpha:]]+) =0-z= y $1 0-z
|
||||
([a-[:digit:]]+) za-9z iy $1 a-9
|
||||
([[:digit:]-z]+) =0-z= iy $1 0-z
|
||||
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z
|
||||
\GX.*X aaaXbX n - -
|
||||
(\d+\.\d+) 3.1415926 y $1 3.1415926
|
||||
(\ba.{0,10}br) have a web browser y $1 a web br
|
||||
@ -876,7 +876,7 @@ tt+$ xxxtt y - -
|
||||
(abc)?(abc)+ abc y $1:$2 :abc -
|
||||
'b\s^'m a\nb\n n - -
|
||||
\ba a y - -
|
||||
^(a(??{"(?!)"})|(a)(?{1}))b ab y $2 a # [ID 20010811.006]
|
||||
^(a(??{"(?!)"})|(a)(?{1}))b ab yi $2 a # [ID 20010811.006]
|
||||
ab(?i)cd AbCd n - - # [ID 20010809.023]
|
||||
ab(?i)cd abCd y - -
|
||||
(A|B)*(?(1)(CD)|(CD)) CD y $2-$3 -CD
|
||||
@ -921,4 +921,4 @@ ab(?i)cd abCd y - -
|
||||
(.*?)(?<=[bc]) abcd y $1 ab
|
||||
(.*?)(?<=[bc])c abcd y $1 ab
|
||||
2(]*)?$\1 2 y $& 2
|
||||
(??{}) x y - -
|
||||
(??{}) x yi - -
|
||||
|
Loading…
Reference in New Issue
Block a user