From 41e90b577300f8c2ef1c4ecfe3957f4fec826cb3 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Sat, 25 Jan 2003 18:57:42 +0000 Subject: [PATCH] ICU-2422 regexp, fix a number of bugs uncovered by perl regexp tests. Some still remain. X-SVN-Rev: 10905 --- icu4c/source/i18n/regexcmp.cpp | 60 +++++++- icu4c/source/i18n/regexcmp.h | 10 +- icu4c/source/i18n/regexcst.h | 196 +++++++++++++----------- icu4c/source/i18n/regexcst.txt | 28 +++- icu4c/source/i18n/regeximp.h | 14 +- icu4c/source/i18n/rematch.cpp | 51 ++++-- icu4c/source/i18n/repattrn.cpp | 3 + icu4c/source/test/intltest/regextst.cpp | 8 +- icu4c/source/test/testdata/re_tests.txt | 58 +++---- 9 files changed, 271 insertions(+), 157 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index f51c6cc3d3..ddbc73daeb 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -153,8 +153,6 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) fCharNum = 0; fQuoteMode = FALSE; fFreeForm = FALSE; - fMatcherDataEnd = 0; - fBackRefMax = 0; fMatchOpenParen = -1; fMatchCloseParen = -1; @@ -738,27 +736,55 @@ UBool RegexCompile::doParseActions(EParseAction action) case doStar: // Normal (greedy) * quantifier. // Compiles to - // 1. STATE_SAVE 3 + // 1. STATE_SAVE 4 // 2. body of stuff being iterated over // 3. JMP 1 // 4. ... // + // Or, if the body can match a zero-length string, to inhibit infinite loops, + // 1. STATE_SAVE 6 + // 2. POS_SAVE data-loc + // 3. body of stuff + // 4. JMPX 1 + // 5 data-loc (extra operand of JMPX) + // 6. ... 
{ // location of item #1, the STATE_SAVE int32_t saveStateLoc = blockTopLoc(TRUE); + int32_t dataLoc = -1; + if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) { + insertOp(saveStateLoc); + dataLoc = fRXPat->fFrameSize; + fRXPat->fFrameSize++; + + int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); + fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); + } + // Locate the position in the compiled pattern where the match will continue // after completing the *. (4 in the comment above) int32_t continueLoc = fRXPat->fCompiledPat->size()+1; + if (dataLoc != -1) { + continueLoc++; + } // Put together the save state op store it into the compiled code. int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); - // Append the URX_JMP operation to the compiled pattern. Its target + // Append the URX_JMP or URX_JMPX operation to the compiled pattern. Its target // is the locaton of the state-save, above. - int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc); - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); + if (dataLoc == -1) { + int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc); + fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); + } else { + int32_t op = URX_BUILD(URX_JMPX, saveStateLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + op = URX_BUILD(URX_RESERVED_OP, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + } break; @@ -963,11 +989,12 @@ UBool RegexCompile::doParseActions(EParseAction action) for (;;) { // Loop once per digit, for max allowed number of digits in a back reference. 
- groupNum = groupNum * 10 + u_charDigitValue(c); + int32_t digit = u_charDigitValue(c); + groupNum = groupNum * 10 + digit; if (groupNum >= numCaptureGroups) { break; } - UChar32 c = peekCharLL(); + c = peekCharLL(); if (gRuleDigits->contains(c) == FALSE) { break; } @@ -1284,8 +1311,11 @@ void RegexCompile::insertOp(int32_t where) { int32_t opType = URX_TYPE(op); int32_t opValue = URX_VAL(op); if ((opType == URX_JMP || + opType == URX_JMPX || opType == URX_STATE_SAVE || opType == URX_CTR_LOOP || + opType == URX_CTR_LOOP_NG || + opType == URX_CTR_LOOP_P || opType == URX_RELOC_OPRND) && opValue > where) { // Target location for this opcode is after the insertion point and // needs to be incremented to adjust for the insertion. @@ -1541,6 +1571,20 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) } +//---------------------------------------------------------------------------------------- +// +// possibleNullMatch Test a range of compiled pattern for the possibility that it +// might match an empty string. Used to control the generation +// of extra checking code to prevent infinite loops in the match +// engine on repeated empty matches, such as might happen with +// (x?)* +// when the input string is not at an x. +// +//---------------------------------------------------------------------------------------- +UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) { + // for now, just return true. TODO: make a real implementation + return TRUE; +} //---------------------------------------------------------------------------------------- diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 2a2d1fd310..053e825be3 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -87,6 +87,8 @@ private: void fixLiterals(UBool split=FALSE); // Fix literal strings. void insertOp(int32_t where); // Open up a slot for a new op in the // generated code at the specified location. 
+ UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for + int32_t end); // possibly matching an empty string. UErrorCode *fStatus; @@ -152,14 +154,6 @@ private: // -1 for the upper interval value means none // was specified (unlimited occurences.) - int32_t fMatcherDataEnd; // Location Counter for allocation of data - // to be used by the matcher at match time. - - int32_t fBackRefMax; // Number of the largest capture group with a - // back reference. Capture groups can be forward- - // referenced, so we can't flag an error on - // a too-big back ref number until the end of the - // pattern is reached. }; U_NAMESPACE_END diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index f0082e7932..8a08035d5d 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -103,99 +103,106 @@ struct RegexTableEl { static const struct RegexTableEl gRuleParseStateTable[] = { {doNOP, 0, 0, 0, TRUE} , {doPatStart, 255, 2,0, FALSE} // 1 start - , {doLiteralChar, 254, 12,0, TRUE} // 2 term - , {doLiteralChar, 130, 12,0, TRUE} // 3 - , {doScanUnicodeSet, 91 /* [ */, 12,0, TRUE} // 4 - , {doNOP, 40 /* ( */, 20,0, TRUE} // 5 - , {doDotAny, 46 /* . */, 12,0, TRUE} // 6 , {doCaret, 94 /* ^ */, 2,0, TRUE} // 7 , {doDollar, 36 /* $ */, 2,0, TRUE} // 8 - , {doNOP, 92 /* \ */, 72,0, TRUE} // 9 - , {doPatFinish, 253, 2,0, FALSE} // 10 - , {doRuleError, 255, 94,0, FALSE} // 11 - , {doNOP, 42 /* * */, 50,0, TRUE} // 12 expr-quant - , {doNOP, 43 /* + */, 53,0, TRUE} // 13 - , {doNOP, 63 /* ? 
*/, 56,0, TRUE} // 14 - , {doIntervalInit, 123 /* { */, 59,0, TRUE} // 15 - , {doNOP, 255, 17,0, FALSE} // 16 - , {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont - , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18 - , {doNOP, 255, 2,0, FALSE} // 19 - , {doNOP, 63 /* ? */, 22,0, TRUE} // 20 open-paren - , {doOpenCaptureParen, 255, 2, 12, FALSE} // 21 - , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 22 open-paren-extended - , {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23 - , {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24 - , {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25 - , {doNOP, 60 /* < */, 36,0, TRUE} // 26 - , {doNOP, 35 /* # */, 39,0, TRUE} // 27 - , {doMatchMode, 105 /* i */, 42,0, TRUE} // 28 - , {doMatchMode, 120 /* x */, 42,0, TRUE} // 29 - , {doMatchMode, 115 /* s */, 42,0, TRUE} // 30 - , {doMatchMode, 109 /* m */, 42,0, TRUE} // 31 - , {doMatchMode, 45 /* - */, 42,0, TRUE} // 32 - , {doConditionalExpr, 40 /* ( */, 94,0, TRUE} // 33 - , {doPerlInline, 123 /* { */, 94,0, TRUE} // 34 - , {doBadOpenParenType, 255, 94,0, FALSE} // 35 - , {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 36 open-paren-lookbehind - , {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 37 - , {doBadOpenParenType, 255, 94,0, FALSE} // 38 - , {doNOP, 41 /* ) */, 2,0, TRUE} // 39 paren-comment - , {doMismatchedParenErr, 253, 94,0, FALSE} // 40 - , {doNOP, 255, 39,0, TRUE} // 41 - , {doMatchMode, 105 /* i */, 42,0, TRUE} // 42 paren-flag - , {doMatchMode, 115 /* s */, 42,0, TRUE} // 43 - , {doMatchMode, 109 /* m */, 42,0, TRUE} // 44 - , {doMatchMode, 120 /* x */, 42,0, TRUE} // 45 - , {doMatchMode, 45 /* - */, 42,0, TRUE} // 46 - , {doNOP, 41 /* ) */, 2,0, TRUE} // 47 - , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 48 - , {doNOP, 255, 94,0, FALSE} // 49 - , {doNGStar, 63 /* ? */, 17,0, TRUE} // 50 quant-star - , {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 51 - , {doStar, 255, 17,0, FALSE} // 52 - , {doNGPlus, 63 /* ? 
*/, 17,0, TRUE} // 53 quant-plus - , {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 54 - , {doPlus, 255, 17,0, FALSE} // 55 - , {doNGOpt, 63 /* ? */, 17,0, TRUE} // 56 quant-opt - , {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 57 - , {doOpt, 255, 17,0, FALSE} // 58 - , {doNOP, 129, 59,0, TRUE} // 59 interval-open - , {doNOP, 128, 62,0, FALSE} // 60 - , {doIntervalError, 255, 94,0, FALSE} // 61 - , {doIntevalLowerDigit, 128, 62,0, TRUE} // 62 interval-lower - , {doNOP, 44 /* , */, 66,0, TRUE} // 63 - , {doIntervalSame, 125 /* } */, 69,0, TRUE} // 64 - , {doIntervalError, 255, 94,0, FALSE} // 65 - , {doIntervalUpperDigit, 128, 66,0, TRUE} // 66 interval-upper - , {doNOP, 125 /* } */, 69,0, TRUE} // 67 - , {doIntervalError, 255, 94,0, FALSE} // 68 - , {doNGInterval, 63 /* ? */, 17,0, TRUE} // 69 interval-type - , {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 70 - , {doInterval, 255, 17,0, FALSE} // 71 - , {doBackslashA, 65 /* A */, 2,0, TRUE} // 72 backslash - , {doBackslashB, 66 /* B */, 2,0, TRUE} // 73 - , {doBackslashb, 98 /* b */, 2,0, TRUE} // 74 - , {doBackslashd, 100 /* d */, 12,0, TRUE} // 75 - , {doBackslashD, 68 /* D */, 12,0, TRUE} // 76 - , {doBackslashG, 71 /* G */, 2,0, TRUE} // 77 - , {doNamedChar, 78 /* N */, 12,0, TRUE} // 78 - , {doProperty, 112 /* p */, 12,0, FALSE} // 79 - , {doProperty, 80 /* P */, 12,0, FALSE} // 80 - , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 81 - , {doBackslashS, 83 /* S */, 12,0, TRUE} // 82 - , {doBackslashs, 115 /* s */, 12,0, TRUE} // 83 - , {doBackslashW, 87 /* W */, 12,0, TRUE} // 84 - , {doBackslashw, 119 /* w */, 12,0, TRUE} // 85 - , {doBackslashX, 88 /* X */, 12,0, TRUE} // 86 - , {doBackslashx, 120 /* x */, 12,0, TRUE} // 87 - , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 88 - , {doBackslashz, 122 /* z */, 2,0, TRUE} // 89 - , {doOctal, 48 /* 0 */, 12,0, TRUE} // 90 - , {doBackRef, 128, 12,0, TRUE} // 91 - , {doEscapeError, 253, 94,0, FALSE} // 92 - , {doLiteralChar, 255, 12,0, TRUE} // 93 - , {doExit, 255, 94,0, 
TRUE} // 94 errorDeath + , {doNOP, 92 /* \ */, 79,0, TRUE} // 9 + , {doOrOperator, 124 /* | */, 2,0, TRUE} // 10 + , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11 + , {doPatFinish, 253, 2,0, FALSE} // 12 + , {doRuleError, 255, 101,0, FALSE} // 13 + , {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant + , {doNOP, 43 /* + */, 60,0, TRUE} // 15 + , {doNOP, 63 /* ? */, 63,0, TRUE} // 16 + , {doIntervalInit, 123 /* { */, 66,0, TRUE} // 17 + , {doNOP, 40 /* ( */, 23,0, TRUE} // 18 + , {doNOP, 255, 20,0, FALSE} // 19 + , {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont + , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21 + , {doNOP, 255, 2,0, FALSE} // 22 + , {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant + , {doNOP, 255, 27,0, FALSE} // 24 + , {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2 + , {doNOP, 255, 29,0, FALSE} // 26 + , {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren + , {doOpenCaptureParen, 255, 2, 14, FALSE} // 28 + , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended + , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30 + , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31 + , {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32 + , {doNOP, 60 /* < */, 43,0, TRUE} // 33 + , {doNOP, 35 /* # */, 46, 2, TRUE} // 34 + , {doMatchMode, 105 /* i */, 49,0, TRUE} // 35 + , {doMatchMode, 120 /* x */, 49,0, TRUE} // 36 + , {doMatchMode, 115 /* s */, 49,0, TRUE} // 37 + , {doMatchMode, 109 /* m */, 49,0, TRUE} // 38 + , {doMatchMode, 45 /* - */, 49,0, TRUE} // 39 + , {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 40 + , {doPerlInline, 123 /* { */, 101,0, TRUE} // 41 + , {doBadOpenParenType, 255, 101,0, FALSE} // 42 + , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind + , {doOpenLookBehindNeg, 33 /* ! 
*/, 2, 20, TRUE} // 44 + , {doBadOpenParenType, 255, 101,0, FALSE} // 45 + , {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment + , {doMismatchedParenErr, 253, 101,0, FALSE} // 47 + , {doNOP, 255, 46,0, TRUE} // 48 + , {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag + , {doMatchMode, 115 /* s */, 49,0, TRUE} // 50 + , {doMatchMode, 109 /* m */, 49,0, TRUE} // 51 + , {doMatchMode, 120 /* x */, 49,0, TRUE} // 52 + , {doMatchMode, 45 /* - */, 49,0, TRUE} // 53 + , {doNOP, 41 /* ) */, 2,0, TRUE} // 54 + , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 55 + , {doNOP, 255, 101,0, FALSE} // 56 + , {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star + , {doPossesiveStar, 43 /* + */, 20,0, TRUE} // 58 + , {doStar, 255, 20,0, FALSE} // 59 + , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 60 quant-plus + , {doPossesivePlus, 43 /* + */, 20,0, TRUE} // 61 + , {doPlus, 255, 20,0, FALSE} // 62 + , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 63 quant-opt + , {doPossesiveOpt, 43 /* + */, 20,0, TRUE} // 64 + , {doOpt, 255, 20,0, FALSE} // 65 + , {doNOP, 129, 66,0, TRUE} // 66 interval-open + , {doNOP, 128, 69,0, FALSE} // 67 + , {doIntervalError, 255, 101,0, FALSE} // 68 + , {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower + , {doNOP, 44 /* , */, 73,0, TRUE} // 70 + , {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71 + , {doIntervalError, 255, 101,0, FALSE} // 72 + , {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper + , {doNOP, 125 /* } */, 76,0, TRUE} // 74 + , {doIntervalError, 255, 101,0, FALSE} // 75 + , {doNGInterval, 63 /* ? 
*/, 20,0, TRUE} // 76 interval-type + , {doPossesiveInterval, 43 /* + */, 20,0, TRUE} // 77 + , {doInterval, 255, 20,0, FALSE} // 78 + , {doBackslashA, 65 /* A */, 2,0, TRUE} // 79 backslash + , {doBackslashB, 66 /* B */, 2,0, TRUE} // 80 + , {doBackslashb, 98 /* b */, 2,0, TRUE} // 81 + , {doBackslashd, 100 /* d */, 14,0, TRUE} // 82 + , {doBackslashD, 68 /* D */, 14,0, TRUE} // 83 + , {doBackslashG, 71 /* G */, 2,0, TRUE} // 84 + , {doNamedChar, 78 /* N */, 14,0, TRUE} // 85 + , {doProperty, 112 /* p */, 14,0, FALSE} // 86 + , {doProperty, 80 /* P */, 14,0, FALSE} // 87 + , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 88 + , {doBackslashS, 83 /* S */, 14,0, TRUE} // 89 + , {doBackslashs, 115 /* s */, 14,0, TRUE} // 90 + , {doBackslashW, 87 /* W */, 14,0, TRUE} // 91 + , {doBackslashw, 119 /* w */, 14,0, TRUE} // 92 + , {doBackslashX, 88 /* X */, 14,0, TRUE} // 93 + , {doBackslashx, 120 /* x */, 14,0, TRUE} // 94 + , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95 + , {doBackslashz, 122 /* z */, 2,0, TRUE} // 96 + , {doOctal, 48 /* 0 */, 14,0, TRUE} // 97 + , {doBackRef, 128, 14,0, TRUE} // 98 + , {doEscapeError, 253, 101,0, FALSE} // 99 + , {doLiteralChar, 255, 14,0, TRUE} // 100 + , {doExit, 255, 101,0, TRUE} // 101 errorDeath }; static const char * const RegexStateNames[] = { 0, "start", @@ -208,14 +215,21 @@ static const char * const RegexStateNames[] = { 0, 0, 0, 0, + 0, + 0, 0, "expr-quant", 0, 0, 0, + 0, 0, "expr-cont", 0, + 0, + "open-paren-quant", + 0, + "open-paren-quant2", 0, "open-paren", 0, diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index 691de4814f..4950e870fa 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -64,14 +64,16 @@ start: # term. At a position where we can accept the start most items in a pattern. 
# term: - quoted n expr-quant doLiteralChar - rule_char n expr-quant doLiteralChar + quoted n expr-quant doLiteralChar + rule_char n expr-quant doLiteralChar '[' n expr-quant doScanUnicodeSet '(' n open-paren '.' n expr-quant doDotAny '^' n term doCaret '$' n term doDollar '\' n backslash + '|' n term doOrOperator + ')' n pop doCloseParen eof term doPatFinish default errorDeath doRuleError @@ -86,6 +88,7 @@ expr-quant: '+' n quant-plus '?' n quant-opt '{' n interval-open doIntervalInit + '(' n open-paren-quant default expr-cont @@ -99,6 +102,21 @@ expr-cont: default term +# +# open-paren-quant Special case handling for comments appearing before a quantifier, +# e.g. x(?#comment )* +# Open parens from expr-quant come here; anything but a (?# comment +# branches into the normal parenthesis sequence as quickly as possible. +# +open-paren-quant: + '?' n open-paren-quant2 + default open-paren + +open-paren-quant2: + '#' n paren-comment ^expr-quant + default open-paren-extended + + # # open-paren We've got an open paren. We need to scan further to # determine what kind of quantifier it is - plain (, (?:, (?>, or whatever. @@ -113,7 +131,7 @@ open-paren-extended: '=' n term ^expr-cont doOpenLookAhead # (?= '!' n term ^expr-cont doOpenLookAheadNeg # (?! '<' n open-paren-lookbehind - '#' n paren-comment + '#' n paren-comment ^term 'i' n paren-flag doMatchMode 'x' n paren-flag doMatchMode 's' n paren-flag doMatchMode @@ -134,8 +152,8 @@ open-paren-lookbehind: # TODO: should parens nest here? Check what perl does. # paren-comment: - ')' n term - eof errorDeath doMismatchedParenErr + ')' n pop + eof errorDeath doMismatchedParenErr default n paren-comment # diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index b3b5e16a9f..05f0696b85 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -95,8 +95,16 @@ enum { // matcher data (not stack data) to store it. URX_LD_SP = 33, // Load the stack pointer. 
Operand is location // to load from. - URX_BACKREF = 34 // Back Reference. Parameter is the index of the + URX_BACKREF = 34, // Back Reference. Parameter is the index of the // capture group variables in the state stack frame. + URX_STO_INP_LOC = 35, // Store the input location. Operand is location + // within the matcher data (not stack). + URX_JMPX = 36 // Conditional JMP. + // First Operand: JMP target location. + // Second Operand: Data location containing an + // input position. If current input position == + // saved input position, FAIL rather than taking + // the JMP. }; // Keep this list of opcode names in sync with the above enum @@ -136,7 +144,9 @@ enum { "RELOC_OPRND", \ "STO_SP", \ "LD_SP", \ - "BACKREF" + "BACKREF", \ + "STO_INP_LOC", \ + "JMPX" // // Convenience macros for assembling and disassembling a compiled operation. diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 79144d88bf..ff0e59d7be 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -228,7 +228,9 @@ UBool RegexMatcher::find() { // TODO: needs to go up to the very end, so a pattern that can match a zero lenght // string can match at the end of a string. Can't do until loop-breaking // is added to the engine, though, otherwise it triggers too many bugs. - for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) { + startPos = fMatchEnd; + U_ASSERT(startPos >= 0 && startPos <= fInputLength); + for (;;) { MatchAt(startPos, status); if (U_FAILURE(status)) { return FALSE; @@ -236,6 +238,10 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } + if (startPos >= fInputLength) { + break; + } + startPos = fInput->moveIndex32(startPos, 1); } return FALSE; } @@ -858,17 +864,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { // The high bit of the op value is a flag for the match polarity. // 0: success if input char is in set. // 1: success if input char is not in set. 
+ if (fp->fInputIdx >= fInputLength) { + fp = (REStackFrame *)fStack->popFrame(frameSize); + break; + } + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; - if (fp->fInputIdx < fInputLength) { - // There is input left. Pick up one char and test it for set membership. - UChar32 c; - U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c); - U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); - const UnicodeSet *s = fPattern->fStaticSets[opValue]; - if (s->contains(c)) { - success = !success; - } + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + UChar32 c; + U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c); + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; } if (!success) { fp = (REStackFrame *)fStack->popFrame(frameSize); @@ -1102,6 +1110,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { } break; + case URX_STO_INP_LOC: + { + U_ASSERT(opValue >= 0 && opValue < frameSize); + fp->fExtra[opValue] = fp->fInputIdx; + } + break; + + case URX_JMPX: + { + int32_t instrOperandLoc = fp->fPatIdx; + fp->fPatIdx += 1; + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); + U_ASSERT(dataLoc >= 0 && dataLoc < frameSize); + int32_t savedInputIdx = fp->fExtra[dataLoc]; + U_ASSERT(savedInputIdx <= fp->fInputIdx); + if (savedInputIdx < fp->fInputIdx) { + fp->fPatIdx = opValue; // JMP + } else { + fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop. + } + } + break; + default: // Trouble. The compiled pattern contains an entry with an diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index dc9f8bd742..93c8111b5a 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -471,6 +471,9 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_RELOC_OPRND: case URX_STO_SP: case URX_LD_SP: + case URX_BACKREF: + case URX_STO_INP_LOC: + case URX_JMPX: // types with an integer operand field. 
REGEX_DUMP_DEBUG_PRINTF("%d", val); diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index aefb49d15b..9a73f05ad0 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -61,7 +61,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch if (exec) Errors(); break; case 6: name = "PerlTests"; - // if (exec) PerlTests(); + if (exec) PerlTests(); break; @@ -124,7 +124,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status); return FALSE; } - // REPattern->dump(); + if (line==376) { REPattern->dump();} UnicodeString inputString(inputText); UnicodeString unEscapedInput = inputString.unescape(); @@ -373,7 +373,7 @@ void RegexTest::Basic() { // #if 0 { - REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE); + REGEX_TESTLM("\\W", "a", FALSE, FALSE); // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc<2>cccddd"); // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); } @@ -1284,7 +1284,7 @@ void RegexTest::Errors() { // Extra close paren REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); - REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX); + REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); // Flag settings not yet implemented diff --git a/icu4c/source/test/testdata/re_tests.txt b/icu4c/source/test/testdata/re_tests.txt index b5bf80048c..6873e3aa70 100644 --- a/icu4c/source/test/testdata/re_tests.txt +++ b/icu4c/source/test/testdata/re_tests.txt @@ -1,4 +1,4 @@ -abc abc y $& abc +abc abc y $& abc abc abc y $-[0] 0 abc abc y $+[0] 3 abc xbc n - - @@ -557,14 +557,14 @@ $(?<=^(a)) a y $1 a (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) yaaxxaaaacd y $b 4 (>a+)ab aaab n - - (?>a+)b aaab y - - -([[:]+) a:[b]: y $1 :[ -([[=]+) a=[b]= y $1 =[ -([[.]+) 
a.[b]. y $1 .[ +([[:]+) a:[b]: iy $1 :[ +([[=]+) a=[b]= iy $1 =[ +([[.]+) a.[b]. iy $1 .[ [a[:xyz: - c - Unmatched [ [a[:xyz:] - c - POSIX class [:xyz:] unknown -[a[:]b[:c] abc y $& abc +[a[:]b[:c] abc iy $& abc ([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown -[a[:]b[:c] abc y $& abc +[a[:]b[:c] abc iy $& abc ([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd ([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy ([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul} @@ -803,31 +803,31 @@ round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz '((?-x:.) )'x x y $1- x- foo.bart foo.bart y - - '^d[x][x][x]'m abcd\ndxxx y - - -.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - -.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - -.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+XX 
bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - tt+$ xxxtt y - - ([a-\d]+) za-9z y $1 a-9 ([\d-z]+) a0-za y $1 0-z ([\d-\s]+) a0- z y $1 0- -([a-[:digit:]]+) za-9z y $1 a-9 -([[:digit:]-z]+) =0-z= y $1 0-z -([[:digit:]-[:alpha:]]+) =0-z= y $1 0-z +([a-[:digit:]]+) za-9z iy $1 a-9 +([[:digit:]-z]+) =0-z= iy $1 0-z +([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z \GX.*X aaaXbX n - - (\d+\.\d+) 3.1415926 y $1 3.1415926 (\ba.{0,10}br) have a web browser y $1 a web br @@ -876,7 +876,7 @@ tt+$ xxxtt y - - (abc)?(abc)+ abc y $1:$2 :abc - 'b\s^'m a\nb\n n - - \ba a y - - -^(a(??{"(?!)"})|(a)(?{1}))b ab y $2 a # [ID 20010811.006] +^(a(??{"(?!)"})|(a)(?{1}))b ab yi $2 a # [ID 20010811.006] ab(?i)cd AbCd n - - # [ID 20010809.023] ab(?i)cd abCd y - - (A|B)*(?(1)(CD)|(CD)) CD y $2-$3 -CD @@ -921,4 +921,4 @@ ab(?i)cd abCd y - - (.*?)(?<=[bc]) abcd y $1 ab (.*?)(?<=[bc])c abcd y $1 ab 2(]*)?$\1 2 y $& 2 -(??{}) x y - - +(??{}) x yi - -