ICU-2422 add back references
X-SVN-Rev: 10878
This commit is contained in:
parent
dbb4a89263
commit
a868ba8a94
@ -1842,7 +1842,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
||||
"U_REGEX_MISMATCHED_PAREN",
|
||||
"U_REGEX_NUMBER_TOO_BIG",
|
||||
"U_REGEX_BAD_INTERVAL",
|
||||
"U_REGEX_MAX_LT_MIN"
|
||||
"U_REGEX_MAX_LT_MIN",
|
||||
"U_REGEX_INVALID_BACK_REF"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
@ -617,17 +617,18 @@ typedef enum UErrorCode {
|
||||
/*
|
||||
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
|
||||
*/
|
||||
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
|
||||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
|
||||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
|
||||
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
|
||||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
|
||||
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
|
||||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
|
||||
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
@ -64,6 +64,7 @@ static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
static const UChar gRuleSet_digit_char_pattern[] = {
|
||||
// [ 0 - 9 ]
|
||||
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
|
||||
static const UnicodeSet *gRuleDigits = NULL;
|
||||
|
||||
|
||||
static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects.
|
||||
@ -175,6 +176,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128], gRuleSet_digit_char_pattern, status);
|
||||
gRuleDigits = gRuleSets[kRuleSet_digit_char-128];
|
||||
ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
|
||||
@ -919,10 +921,48 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
break;
|
||||
|
||||
case doBackRef:
|
||||
// TODO: implement back references.
|
||||
// BackReference. Somewhat unusual in that the front-end can not completely parse
|
||||
// the regular expression, because the number of digits to be consumed
|
||||
// depends on the number of capture groups that have been defined. So
|
||||
// we have to do it here instead.
|
||||
{
|
||||
int32_t numCaptureGroups = fRXPat->fGroupMap->size();
|
||||
int32_t groupNum = 0;
|
||||
UChar32 c = fC.fChar;
|
||||
int32_t t;
|
||||
|
||||
for (t=numCaptureGroups; t>0; t=t/10) {
|
||||
// Loop once per digit, for max allowed number of digits in a back reference.
|
||||
groupNum = groupNum * 10 + u_charDigitValue(c);
|
||||
if (groupNum >= numCaptureGroups) {
|
||||
break;
|
||||
}
|
||||
UChar32 c = peekCharLL();
|
||||
if (gRuleDigits->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
nextCharLL();
|
||||
}
|
||||
if (groupNum > numCaptureGroups) {
|
||||
error(U_REGEX_INVALID_BACK_REF);
|
||||
break;
|
||||
}
|
||||
|
||||
// Scan of the back reference in the source regexp is complete. Now generate
|
||||
// the compiled code for it.
|
||||
U_ASSERT(groupNum > 0);
|
||||
int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
|
||||
int32_t op = URX_BUILD(URX_BACKREF, varsLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doOctal:
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
case doNamedChar: // \N{NAMED_CHAR}
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
@ -972,7 +1012,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// 4. JMP 2
|
||||
// 5. LD_SP loc
|
||||
// 6 ...
|
||||
//
|
||||
// TODO: do something to cut back the state stack each time through the loop.
|
||||
{
|
||||
// Reserve two slots at the top of the block.
|
||||
int32_t topLoc = blockTopLoc(TRUE);
|
||||
|
@ -30,6 +30,7 @@ enum Regex_PatternParseAction {
|
||||
doRuleError,
|
||||
doIntevalLowerDigit,
|
||||
doBackslashs,
|
||||
doOctal,
|
||||
doNGOpt,
|
||||
doNamedChar,
|
||||
doBackslashw,
|
||||
@ -108,7 +109,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 10
|
||||
, {doRuleError, 255, 90,0, FALSE} // 11
|
||||
, {doRuleError, 255, 91,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
|
||||
@ -130,12 +131,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
|
||||
, {doBadOpenParenType, 255, 90,0, FALSE} // 33
|
||||
, {doBadOpenParenType, 255, 91,0, FALSE} // 33
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
|
||||
, {doBadOpenParenType, 255, 90,0, FALSE} // 36
|
||||
, {doBadOpenParenType, 255, 91,0, FALSE} // 36
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
|
||||
, {doMismatchedParenErr, 253, 90,0, FALSE} // 38
|
||||
, {doMismatchedParenErr, 253, 91,0, FALSE} // 38
|
||||
, {doNOP, 255, 37,0, TRUE} // 39
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
|
||||
@ -144,7 +145,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
|
||||
, {doNOP, 255, 90,0, FALSE} // 47
|
||||
, {doNOP, 255, 91,0, FALSE} // 47
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
|
||||
, {doStar, 255, 17,0, FALSE} // 50
|
||||
@ -156,14 +157,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doOpt, 255, 17,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
|
||||
, {doNOP, 128, 60,0, FALSE} // 58
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 59
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 59
|
||||
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
|
||||
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
|
||||
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 63
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 63
|
||||
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
|
||||
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 66
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 66
|
||||
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
|
||||
, {doInterval, 255, 17,0, FALSE} // 69
|
||||
@ -185,9 +186,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
|
||||
, {doBackRef, 128, 12,0, TRUE} // 88
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 89
|
||||
, {doExit, 255, 90,0, TRUE} // 90 errorDeath
|
||||
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 88
|
||||
, {doBackRef, 128, 12,0, TRUE} // 89
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 90
|
||||
, {doExit, 255, 91,0, TRUE} // 91 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
@ -278,6 +280,7 @@ static const char * const RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
@ -229,7 +229,8 @@ backslash:
|
||||
'x' n expr-quant doBackslashx
|
||||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
digit_char n expr-quant doBackRef
|
||||
'0' n expr-quant doOctal
|
||||
digit_char expr-quant doBackRef # Will scan multiple digits
|
||||
default n expr-quant doLiteralChar # Escaped literal char.
|
||||
|
||||
|
||||
|
@ -93,8 +93,10 @@ enum {
|
||||
|
||||
URX_STO_SP = 32, // Store the stack ptr. Operand is location within
|
||||
// matcher data (not stack data) to store it.
|
||||
URX_LD_SP = 33 // Load the stack pointer. Operand is location
|
||||
URX_LD_SP = 33, // Load the stack pointer. Operand is location
|
||||
// to load from.
|
||||
URX_BACKREF = 34 // Back Reference. Parameter is the index of the
|
||||
// capture group variables in the state stack frame.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
@ -133,7 +135,8 @@ enum {
|
||||
"CTR_LOOP_P", \
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP"
|
||||
"LD_SP", \
|
||||
"BACKREF"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -1071,6 +1071,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKREF:
|
||||
{
|
||||
U_ASSERT(opValue < frameSize);
|
||||
int32_t groupStartIdx = fp->fExtra[opValue];
|
||||
int32_t groupEndIdx = fp->fExtra[opValue+1];
|
||||
U_ASSERT(groupStartIdx <= groupEndIdx);
|
||||
int32_t len = groupEndIdx-groupStartIdx;
|
||||
if (groupStartIdx < 0 || len == 0) {
|
||||
// This capture group has not participated in the match thus far,
|
||||
// or the match was of an empty string.
|
||||
// Verified by testing: Perl matches succeed in these cases, so
|
||||
// we do too.
|
||||
break;
|
||||
}
|
||||
if ((fp->fInputIdx + len > inputLen) ||
|
||||
u_strncmp(fInputUC+groupStartIdx, fInputUC+fp->fInputIdx, len) != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
|
||||
} else {
|
||||
fp->fInputIdx += len; // Match. Advance current input position.
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
|
@ -1246,6 +1246,15 @@ void RegexTest::Extended() {
|
||||
|
||||
// Possessive ?+
|
||||
REGEX_FIND("c?+ddd", "<0>cddd</0>");
|
||||
REGEX_FIND("c?+cddd", "cddd");
|
||||
REGEX_FIND("c?cddd", "<0>cddd</0>");
|
||||
|
||||
// Back Reference
|
||||
REGEX_FIND("(?:ab(..)cd\\1)*", "<0>ab23cd23ab<1>ww</1>cdww</0>abxxcdyy");
|
||||
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>abc<2></2></0>");
|
||||
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1>d</1><2>d</2></0>");
|
||||
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1></1><2></2></0>e");
|
||||
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1></1><2></2></0>");
|
||||
|
||||
}
|
||||
|
||||
@ -1258,7 +1267,6 @@ void RegexTest::Extended() {
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::Errors() {
|
||||
// \escape sequences that aren't implemented yet.
|
||||
REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user