ICU-2422 add back references

X-SVN-Rev: 10878
This commit is contained in:
Andy Heninger 2003-01-21 04:56:14 +00:00
parent dbb4a89263
commit a868ba8a94
8 changed files with 107 additions and 27 deletions

View File

@ -1842,7 +1842,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_MISMATCHED_PAREN",
"U_REGEX_NUMBER_TOO_BIG",
"U_REGEX_BAD_INTERVAL",
"U_REGEX_MAX_LT_MIN"
"U_REGEX_MAX_LT_MIN",
"U_REGEX_INVALID_BACK_REF"
};
U_CAPI const char * U_EXPORT2

View File

@ -617,17 +617,18 @@ typedef enum UErrorCode {
/*
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
*/
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View File

@ -64,6 +64,7 @@ static const UChar gRuleSet_rule_char_pattern[] = {
// UnicodeSet pattern for the ASCII decimal digits, spelled out as NUL-terminated
// UChar code units: '[' '0' '-' '9' ']'.
static const UChar gRuleSet_digit_char_pattern[] = {
// [ 0 - 9 ]
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
// Convenience pointer to the digit set. It starts out NULL and is assigned from
// gRuleSets[kRuleSet_digit_char-128] in the RegexCompile constructor; it must not
// be dereferenced before that assignment has run.
static const UnicodeSet *gRuleDigits = NULL;
static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects.
@ -175,6 +176,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128], gRuleSet_digit_char_pattern, status);
gRuleDigits = gRuleSets[kRuleSet_digit_char-128];
ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
@ -919,10 +921,48 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doBackRef:
// TODO: implement back references.
// BackReference. Somewhat unusual in that the front-end can not completely parse
// the regular expression, because the number of digits to be consumed
// depends on the number of capture groups that have been defined. So
// we have to do it here instead.
{
    // Only capture groups that have already been opened may be referenced, and
    // their count also bounds how many digits the back reference may have.
    int32_t numCaptureGroups = fRXPat->fGroupMap->size();

    // The current character is the first digit of the back reference.
    int32_t groupNum = u_charDigitValue(fC.fChar);

    // Accumulate any further digits.  The loop runs at most once per additional
    // digit that a valid group number could have (digits in numCaptureGroups,
    // minus the one already taken), and stops as soon as the number can no longer
    // grow and stay valid.  A digit is consumed from the pattern only when it is
    // actually folded into groupNum, so trailing digits that are not part of the
    // back reference are left in place for the parser.
    int32_t t;
    for (t=numCaptureGroups/10; t>0 && groupNum<numCaptureGroups; t=t/10) {
        UChar32 c = peekCharLL();
        if (gRuleDigits->contains(c) == FALSE) {
            break;
        }
        nextCharLL();
        groupNum = groupNum * 10 + u_charDigitValue(c);
    }

    // Reject references to groups that do not exist.  This also covers a pattern
    // with no capture groups at all, and guards the groupNum-1 index used below.
    if (groupNum < 1 || groupNum > numCaptureGroups) {
        error(U_REGEX_INVALID_BACK_REF);
        break;
    }

    // Scan of the back reference in the source regexp is complete.  Now generate
    // the compiled code for it.  The operand is the value recorded in fGroupMap
    // for this group (group numbers are 1-based, the map is 0-based).
    U_ASSERT(groupNum > 0);
    int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
    int32_t op = URX_BUILD(URX_BACKREF, varsLoc);
    fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case doOctal:
error(U_REGEX_UNIMPLEMENTED);
break;
case doNamedChar: // \N{NAMED_CHAR}
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
@ -972,7 +1012,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 4. JMP 2
// 5. LD_SP loc
// 6 ...
//
// TODO: do something to cut back the state stack each time through the loop.
{
// Reserve two slots at the top of the block.
int32_t topLoc = blockTopLoc(TRUE);

View File

@ -30,6 +30,7 @@ enum Regex_PatternParseAction {
doRuleError,
doIntevalLowerDigit,
doBackslashs,
doOctal,
doNGOpt,
doNamedChar,
doBackslashw,
@ -108,7 +109,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
, {doPatFinish, 253, 2,0, FALSE} // 10
, {doRuleError, 255, 90,0, FALSE} // 11
, {doRuleError, 255, 91,0, FALSE} // 11
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
@ -130,12 +131,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
, {doBadOpenParenType, 255, 90,0, FALSE} // 33
, {doBadOpenParenType, 255, 91,0, FALSE} // 33
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
, {doBadOpenParenType, 255, 90,0, FALSE} // 36
, {doBadOpenParenType, 255, 91,0, FALSE} // 36
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
, {doMismatchedParenErr, 253, 90,0, FALSE} // 38
, {doMismatchedParenErr, 253, 91,0, FALSE} // 38
, {doNOP, 255, 37,0, TRUE} // 39
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
@ -144,7 +145,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
, {doNOP, 255, 90,0, FALSE} // 47
, {doNOP, 255, 91,0, FALSE} // 47
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
, {doStar, 255, 17,0, FALSE} // 50
@ -156,14 +157,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpt, 255, 17,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
, {doNOP, 128, 60,0, FALSE} // 58
, {doIntervalError, 255, 90,0, FALSE} // 59
, {doIntervalError, 255, 91,0, FALSE} // 59
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
, {doIntervalError, 255, 90,0, FALSE} // 63
, {doIntervalError, 255, 91,0, FALSE} // 63
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
, {doIntervalError, 255, 90,0, FALSE} // 66
, {doIntervalError, 255, 91,0, FALSE} // 66
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
, {doInterval, 255, 17,0, FALSE} // 69
@ -185,9 +186,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
, {doBackRef, 128, 12,0, TRUE} // 88
, {doLiteralChar, 255, 12,0, TRUE} // 89
, {doExit, 255, 90,0, TRUE} // 90 errorDeath
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 88
, {doBackRef, 128, 12,0, TRUE} // 89
, {doLiteralChar, 255, 12,0, TRUE} // 90
, {doExit, 255, 91,0, TRUE} // 91 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -278,6 +280,7 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View File

@ -229,7 +229,8 @@ backslash:
'x' n expr-quant doBackslashx
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef
'0' n expr-quant doOctal
digit_char expr-quant doBackRef # Will scan multiple digits
default n expr-quant doLiteralChar # Escaped literal char.

View File

@ -93,8 +93,10 @@ enum {
URX_STO_SP = 32, // Store the stack ptr. Operand is location within
// matcher data (not stack data) to store it.
URX_LD_SP = 33 // Load the stack pointer. Operand is location
URX_LD_SP = 33, // Load the stack pointer. Operand is location
// to load from.
URX_BACKREF = 34 // Back Reference. Parameter is the index of the
// capture group variables in the state stack frame.
};
// Keep this list of opcode names in sync with the above enum
@ -133,7 +135,8 @@ enum {
"CTR_LOOP_P", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP"
"LD_SP", \
"BACKREF"
//
// Convenience macros for assembling and disassembling a compiled operation.

View File

@ -1071,6 +1071,29 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
break;
case URX_BACKREF:
    // Back reference: the input at the current position must repeat the text
    // previously matched by a capture group.  The operand indexes the group's
    // start/end pair in the current stack frame's fExtra array.
    {
        U_ASSERT(opValue < frameSize);
        int32_t groupStartIdx = fp->fExtra[opValue];
        int32_t groupEndIdx = fp->fExtra[opValue+1];
        U_ASSERT(groupStartIdx <= groupEndIdx);
        int32_t len = groupEndIdx-groupStartIdx;
        if (groupStartIdx < 0 || len == 0) {
            // This capture group has not participated in the match thus far,
            // or the match was of an empty string.
            // Verified by testing: Perl matches succeed in these cases, so
            // we do too.
            break;
        }

        // Compare exactly len code units.  A length-bounded comparison is used
        // rather than u_strncmp because the latter stops at a NUL code unit,
        // which would make captured text containing U+0000 compare equal
        // regardless of what follows the NUL.
        int32_t matchedLen = 0;
        if (fp->fInputIdx + len <= inputLen) {
            while (matchedLen < len &&
                   fInputUC[groupStartIdx+matchedLen] == fInputUC[fp->fInputIdx+matchedLen]) {
                matchedLen++;
            }
        }

        if (matchedLen == len) {
            fp->fInputIdx += len; // Match. Advance current input position.
        } else {
            fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
        }
    }
    break;
default:
// Trouble. The compiled pattern contains an entry with an

View File

@ -1246,6 +1246,15 @@ void RegexTest::Extended() {
// Possessive ?+
REGEX_FIND("c?+ddd", "<0>cddd</0>");
REGEX_FIND("c?+cddd", "cddd");
REGEX_FIND("c?cddd", "<0>cddd</0>");
// Back Reference
REGEX_FIND("(?:ab(..)cd\\1)*", "<0>ab23cd23ab<1>ww</1>cdww</0>abxxcdyy");
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>abc<2></2></0>");
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1>d</1><2>d</2></0>");
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1></1><2></2></0>e");
REGEX_FIND("ab(?:c|(d?))(\\1)", "<0>ab<1></1><2></2></0>");
}
@ -1258,7 +1267,6 @@ void RegexTest::Extended() {
//---------------------------------------------------------------------------
void RegexTest::Errors() {
// \escape sequences that aren't implemented yet.
REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);