From a868ba8a94d346532f125efd62e0d57dbe226f24 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 21 Jan 2003 04:56:14 +0000 Subject: [PATCH] ICU-2422 add back references X-SVN-Rev: 10878 --- icu4c/source/common/putil.c | 3 +- icu4c/source/common/unicode/utypes.h | 19 ++++++----- icu4c/source/i18n/regexcmp.cpp | 44 +++++++++++++++++++++++-- icu4c/source/i18n/regexcst.h | 25 +++++++------- icu4c/source/i18n/regexcst.txt | 3 +- icu4c/source/i18n/regeximp.h | 7 ++-- icu4c/source/i18n/rematch.cpp | 23 +++++++++++++ icu4c/source/test/intltest/regextst.cpp | 10 +++++- 8 files changed, 107 insertions(+), 27 deletions(-) diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index 0617e3c2d1..8ffc3feda8 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1842,7 +1842,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = { "U_REGEX_MISMATCHED_PAREN", "U_REGEX_NUMBER_TOO_BIG", "U_REGEX_BAD_INTERVAL", - "U_REGEX_MAX_LT_MIN" + "U_REGEX_MAX_LT_MIN", + "U_REGEX_INVALID_BACK_REF" }; U_CAPI const char * U_EXPORT2 diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index d77e6caa7c..89eae0f8fb 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -617,17 +617,18 @@ typedef enum UErrorCode { /* * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errrs */ - U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */ - U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */ - U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */ + U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */ + U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */ + U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. 
*/ U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */ - U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */ - U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */ + U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */ + U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */ U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */ - U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */ - U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */ - U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */ - U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */ + U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */ + U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */ + U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */ + U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */ + U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */ U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index b60e143f94..72781a92e6 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -64,6 +64,7 @@ static const UChar gRuleSet_rule_char_pattern[] = { static const UChar gRuleSet_digit_char_pattern[] = { // [ 0 - 9 ] 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; +static const UnicodeSet *gRuleDigits = NULL; static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects. 
@@ -175,6 +176,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status); ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status); ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128], gRuleSet_digit_char_pattern, status); + gRuleDigits = gRuleSets[kRuleSet_digit_char-128]; ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status); ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status); ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status); @@ -919,10 +921,48 @@ UBool RegexCompile::doParseActions(EParseAction action) break; case doBackRef: - // TODO: implement back references. + // BackReference. Somewhat unusual in that the front-end can not completely parse + // the regular expression, because the number of digits to be consumed + // depends on the number of capture groups that have been defined. So + // we have to do it here instead. + { + int32_t numCaptureGroups = fRXPat->fGroupMap->size(); + int32_t groupNum = 0; + UChar32 c = fC.fChar; + int32_t t; + + for (t=numCaptureGroups; t>0; t=t/10) { + // Loop once per digit, for max allowed number of digits in a back reference. + groupNum = groupNum * 10 + u_charDigitValue(c); + if (groupNum >= numCaptureGroups) { + break; + } + c = peekCharLL(); // Assign to the outer c (do not redeclare it): the next iteration must fold in this digit, not the first digit again. + if (gRuleDigits->contains(c) == FALSE) { + break; + } + nextCharLL(); // Consume the digit just peeked. NOTE(review): on the last permitted iteration this digit is consumed but never added to groupNum; confirm intent. + } + if (groupNum > numCaptureGroups) { + error(U_REGEX_INVALID_BACK_REF); + break; + } + + // Scan of the back reference in the source regexp is complete. Now generate + // the compiled code for it. 
+ U_ASSERT(groupNum > 0); + int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1); + int32_t op = URX_BUILD(URX_BACKREF, varsLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + break; + + + case doOctal: error(U_REGEX_UNIMPLEMENTED); break; + case doNamedChar: // \N{NAMED_CHAR} // TODO: implement error(U_REGEX_UNIMPLEMENTED); @@ -972,7 +1012,7 @@ UBool RegexCompile::doParseActions(EParseAction action) // 4. JMP 2 // 5. LD_SP loc // 6 ... - // + // TODO: do something to cut back the state stack each time through the loop. { // Reserve two slots at the top of the block. int32_t topLoc = blockTopLoc(TRUE); diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index e9cb61faec..0d92d5986d 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -30,6 +30,7 @@ enum Regex_PatternParseAction { doRuleError, doIntevalLowerDigit, doBackslashs, + doOctal, doNGOpt, doNamedChar, doBackslashw, @@ -108,7 +109,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doDollar, 36 /* $ */, 2,0, TRUE} // 8 , {doNOP, 92 /* \ */, 70,0, TRUE} // 9 , {doPatFinish, 253, 2,0, FALSE} // 10 - , {doRuleError, 255, 90,0, FALSE} // 11 + , {doRuleError, 255, 91,0, FALSE} // 11 , {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant , {doNOP, 43 /* + */, 51,0, TRUE} // 13 , {doNOP, 63 /* ? */, 54,0, TRUE} // 14 @@ -130,12 +131,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doMatchMode, 115 /* s */, 40,0, TRUE} // 30 , {doMatchMode, 109 /* m */, 40,0, TRUE} // 31 , {doMatchMode, 45 /* - */, 40,0, TRUE} // 32 - , {doBadOpenParenType, 255, 90,0, FALSE} // 33 + , {doBadOpenParenType, 255, 91,0, FALSE} // 33 , {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind , {doOpenLookBehindNeg, 33 /* ! 
*/, 2, 17, TRUE} // 35 - , {doBadOpenParenType, 255, 90,0, FALSE} // 36 + , {doBadOpenParenType, 255, 91,0, FALSE} // 36 , {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment - , {doMismatchedParenErr, 253, 90,0, FALSE} // 38 + , {doMismatchedParenErr, 253, 91,0, FALSE} // 38 , {doNOP, 255, 37,0, TRUE} // 39 , {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag , {doMatchMode, 115 /* s */, 40,0, TRUE} // 41 @@ -144,7 +145,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doMatchMode, 45 /* - */, 40,0, TRUE} // 44 , {doNOP, 41 /* ) */, 2,0, TRUE} // 45 , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46 - , {doNOP, 255, 90,0, FALSE} // 47 + , {doNOP, 255, 91,0, FALSE} // 47 , {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star , {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49 , {doStar, 255, 17,0, FALSE} // 50 @@ -156,14 +157,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doOpt, 255, 17,0, FALSE} // 56 , {doNOP, 129, 57,0, TRUE} // 57 interval-open , {doNOP, 128, 60,0, FALSE} // 58 - , {doIntervalError, 255, 90,0, FALSE} // 59 + , {doIntervalError, 255, 91,0, FALSE} // 59 , {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower , {doNOP, 44 /* , */, 64,0, TRUE} // 61 , {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62 - , {doIntervalError, 255, 90,0, FALSE} // 63 + , {doIntervalError, 255, 91,0, FALSE} // 63 , {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper , {doNOP, 125 /* } */, 67,0, TRUE} // 65 - , {doIntervalError, 255, 90,0, FALSE} // 66 + , {doIntervalError, 255, 91,0, FALSE} // 66 , {doNGInterval, 63 /* ? 
*/, 17,0, TRUE} // 67 interval-type , {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68 , {doInterval, 255, 17,0, FALSE} // 69 @@ -185,9 +186,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBackslashx, 120 /* x */, 12,0, TRUE} // 85 , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86 , {doBackslashz, 122 /* z */, 2,0, TRUE} // 87 - , {doBackRef, 128, 12,0, TRUE} // 88 - , {doLiteralChar, 255, 12,0, TRUE} // 89 - , {doExit, 255, 90,0, TRUE} // 90 errorDeath + , {doOctal, 48 /* 0 */, 12,0, TRUE} // 88 + , {doBackRef, 128, 12,0, TRUE} // 89 + , {doLiteralChar, 255, 12,0, TRUE} // 90 + , {doExit, 255, 91,0, TRUE} // 91 errorDeath }; static const char * const RegexStateNames[] = { 0, "start", @@ -278,6 +280,7 @@ static const char * const RegexStateNames[] = { 0, 0, 0, 0, + 0, 0, "errorDeath", 0}; diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index b2333d7b37..0d70174520 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -229,7 +229,8 @@ backslash: 'x' n expr-quant doBackslashx 'Z' n term doBackslashZ 'z' n term doBackslashz - digit_char n expr-quant doBackRef + '0' n expr-quant doOctal + digit_char n expr-quant doBackRef # Will scan multiple digits default n expr-quant doLiteralChar # Escaped literal char. diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 24109eb32a..b3b5e16a9f 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -93,8 +93,10 @@ enum { URX_STO_SP = 32, // Store the stack ptr. Operand is location within // matcher data (not stack data) to store it. - URX_LD_SP = 33 // Load the stack pointer. Operand is location + URX_LD_SP = 33, // Load the stack pointer. Operand is location // to load from. + URX_BACKREF = 34 // Back Reference. Parameter is the index of the + // capture group variables in the state stack frame. 
}; // Keep this list of opcode names in sync with the above enum @@ -133,7 +135,8 @@ enum { "CTR_LOOP_P", \ "RELOC_OPRND", \ "STO_SP", \ - "LD_SP" + "LD_SP", \ + "BACKREF" // // Convenience macros for assembling and disassembling a compiled operation. diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 9a2fe4e768..f80421e94f 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1071,6 +1071,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { } break; + case URX_BACKREF: + { + U_ASSERT(opValue < frameSize); + int32_t groupStartIdx = fp->fExtra[opValue]; + int32_t groupEndIdx = fp->fExtra[opValue+1]; + U_ASSERT(groupStartIdx <= groupEndIdx); + int32_t len = groupEndIdx-groupStartIdx; + if (groupStartIdx < 0 || len == 0) { + // This capture group has not participated in the match thus far, + // or the match was of an empty string. + // Verified by testing: Perl matches succeed in these cases, so + // we do too. + break; + } + if ((fp->fInputIdx + len > inputLen) || + u_memcmp(fInputUC+groupStartIdx, fInputUC+fp->fInputIdx, len) != 0) { // u_memcmp compares all len code units; u_strncmp would stop early at an embedded U+0000. + fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. + } else { + fp->fInputIdx += len; // Match. Advance current input position. + } + } + break; + default: // Trouble. 
The compiled pattern contains an entry with an diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index a6d828592f..98c6681145 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1246,6 +1246,15 @@ void RegexTest::Extended() { // Possessive ?+ REGEX_FIND("c?+ddd", "<0>cddd"); + REGEX_FIND("c?+cddd", "cddd"); + REGEX_FIND("c?cddd", "<0>cddd"); + + // Back Reference + REGEX_FIND("(?:ab(..)cd\\1)*", "<0>ab23cd23ab<1>wwcdwwabxxcdyy"); + REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>abc<2>"); + REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1>d<2>d"); + REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1><2>e"); + REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1><2>"); } @@ -1258,7 +1267,6 @@ void RegexTest::Extended() { //--------------------------------------------------------------------------- void RegexTest::Errors() { // \escape sequences that aren't implemented yet. - REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19, U_REGEX_UNIMPLEMENTED); REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED); REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);