ICU-2422 regexp, back out nested capture group hack. The failing test is
either a bug or a really odd specification in Perl.
Clean up test memory leaks.

X-SVN-Rev: 10923
This commit is contained in:
Andy Heninger 2003-01-29 01:40:59 +00:00
parent e2715acda5
commit 6696782926
5 changed files with 84 additions and 71 deletions

View File

@ -1436,30 +1436,15 @@ void RegexCompile::handleCloseParen() {
case -2:
// Capturing Parentheses.
// Insert an End Capture op into the pattern.
// If this capture group contains other nested capture groups, e.g.
// (a|(b))+
// emit the variant END_CAPTURE_N, with an extra operand containing
// the number of the last nested group.
// The group number of this cg is obtained from the start capture op
// and put into the end-capture op.
// The frame offset of the variables for this cg is obtained from the
// start capture op and put into the end-capture op.
{
int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
int32_t frameVarLocation = URX_VAL(captureOp);
int32_t lastCG = fRXPat->fGroupMap->size();
int32_t lastCGframeVarLoc = fRXPat->fGroupMap->elementAti(lastCG-1);
if (frameVarLocation == lastCGframeVarLoc) {
// There are no nested capture groups. The current one is the
// last one that was encountered. Emit the plain END_CAPTURE.
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
} else {
// There are nested Captures Groups.
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE_N, frameVarLocation);
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
fRXPat->fCompiledPat->addElement(lastCG, *fStatus);
}
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
}
break;
case -3:

View File

@ -99,15 +99,12 @@ enum {
// capture group variables in the state stack frame.
URX_STO_INP_LOC = 35, // Store the input location. Operand is location
// within the matcher data (not stack).
URX_JMPX = 36, // Conditional JMP.
URX_JMPX = 36 // Conditional JMP.
// First Operand: JMP target location.
// Second Operand: Data location containing an
// input position. If current input position ==
// saved input position, FAIL rather than taking
// the JMP.
URX_END_CAPTURE_N = 37 // End Capture when cg contains nested groups.
// first operand: Capture group being closed.
// second operand: Last nested capture group.
};
// Keep this list of opcode names in sync with the above enum
@ -149,8 +146,7 @@ enum {
"LD_SP", \
"BACKREF", \
"STO_INP_LOC", \
"JMPX", \
"END_CAPTURE_N"
"JMPX"
//
// Convenience macros for assembling and disassembling a compiled operation.

View File

@ -733,31 +733,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
break;
case URX_END_CAPTURE_N:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
int32_t lastNestedGroup = pat[fp->fPatIdx];
fp->fPatIdx++;
U_ASSERT(lastNestedGroup>1 && lastNestedGroup<=fPattern->fGroupMap->size());
int32_t nestedGroupNum;
for (nestedGroupNum=lastNestedGroup; ;nestedGroupNum--) {
int32_t nestedGroupVarsIdx = fPattern->fGroupMap->elementAti(nestedGroupNum-1);
if (nestedGroupVarsIdx == opValue) {
break;
}
U_ASSERT(nestedGroupNum>=2);
if (fp->fExtra[nestedGroupVarsIdx] < fp->fExtra[opValue]) {
fp->fExtra[nestedGroupVarsIdx] = -1;
fp->fExtra[nestedGroupVarsIdx+1] = -1;
}
}
}
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input

View File

@ -1465,6 +1465,45 @@ cleanUpAndReturn:
//-------------------------------------------------------------------------------
//
// PerlTests - Run Perl's regular expression tests
// The input file for this test is re_tests, the standard regular
// expression test data distributed with the Perl source code.
//
// Here is Perl's description of the test data file:
//
// # The tests are in a separate file 't/op/re_tests'.
// # Each line in that file is a separate test.
// # There are five columns, separated by tabs.
// #
// # Column 1 contains the pattern, optionally enclosed in C<''>.
// # Modifiers can be put after the closing C<'>.
// #
// # Column 2 contains the string to be matched.
// #
// # Column 3 contains the expected result:
// # y expect a match
// # n expect no match
// # c expect an error
// # B test exposes a known bug in Perl, should be skipped
// # b test exposes a known bug in Perl, should be skipped if noamp
// #
// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
// #
// # Column 4 contains a string, usually C<$&>.
// #
// # Column 5 contains the expected result of double-quote
// # interpolating that string after the match, or start of error message.
// #
// # Column 6, if present, contains a reason why the test is skipped.
// # This is printed with "skipped", for harness to pick up.
// #
// # \n in the tests are interpolated, as are variables of the form ${\w+}.
// #
// # If you want to add a regular expression test that can't be expressed
// # in this format, don't add it here: put it in op/pat.t instead.
//
// For ICU, if field 3 contains an 'i', the test will be skipped.
// The test exposes some known incompatibility between ICU and Perl regexps.
// (The i is in addition to whatever was there before.)
//
//-------------------------------------------------------------------------------
void RegexTest::PerlTests() {
@ -1502,28 +1541,21 @@ void RegexTest::PerlTests() {
//
// Regex to identify test patterns with flag settings, and to separate them.
// Test patterns with flags look like 'pattern'i
// Test patterns without flags are not quoted: paterrn
// Test patterns without flags are not quoted: pattern
// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
//
RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
RegexMatcher* flagMat = flagPat->matcher("", status);
//
// Regex to find ${bang}. Perl doesn't put literal '!'s into patterns.
//
RegexPattern *bangPat = RegexPattern::compile("\\$\\{bang\\}", 0, pe, status);
RegexMatcher *bangMat = bangPat->matcher("", status);
//
// ${nulnul} and its replacement string.
// The Perl tests reference several perl-isms, which are evaluated/substituted
// in the test data. Not being perl, this must be done explicitly. Here
// are string constants and REs for these constructs.
//
UnicodeString nulnulSrc("${nulnul}");
UnicodeString nulnul("\\u0000\\u0000");
nulnul = nulnul.unescape();
//
// Regex to find ${ffff}. Perl doesn't put \uffff into patterns.
//
UnicodeString ffffSrc("${ffff}");
UnicodeString ffff("\\uffff");
ffff = ffff.unescape();
@ -1537,10 +1569,19 @@ void RegexTest::PerlTests() {
RegexMatcher *cgMat = cgPat->matcher("", status);
//
// Main Loop for the Perl Tests, runs once per line from the
// test data file.
//
int32_t lineNum = 0;
int32_t skippedUnimplementedCount = 0;
while (lineMat->find()) {
lineNum++;
//
// Get a line, break it into its fields, do the Perl
// variable substitutions.
//
UnicodeString line = lineMat->group(1, status);
UnicodeString fields[7];
fieldPat->split(line, fields, 7, status);
@ -1548,10 +1589,14 @@ void RegexTest::PerlTests() {
flagMat->reset(fields[0]);
flagMat->matches(status);
UnicodeString pattern = flagMat->group(2, status);
bangMat->reset(pattern);
pattern = bangMat->replaceAll("!", status);
pattern.findAndReplace("${bang}", "!");
pattern.findAndReplace(nulnulSrc, "\\u0000\\u0000");
pattern.findAndReplace(ffffSrc, ffff);
//
// Identify patterns that include match flag settings,
// split off the flags, remove the extra quotes.
//
UnicodeString flagStr = flagMat->group(3, status);
// printf("pattern = %s\n", cstar(pattern));
// printf(" flags = %s\n", cstar(flags));
@ -1559,10 +1604,9 @@ void RegexTest::PerlTests() {
errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
return;
}
int32_t flags = 0;
const UChar UChar_c = 0x63; // Damn the lack of Unicode support in C
const UChar UChar_i = 0x69;
const UChar UChar_c = 0x63; // Char constants for the flag letters.
const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
const UChar UChar_m = 0x6d;
const UChar UChar_x = 0x78;
const UChar UChar_y = 0x79;
@ -1582,6 +1626,9 @@ void RegexTest::PerlTests() {
status = U_ZERO_ERROR;
RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
if (status == U_REGEX_UNIMPLEMENTED) {
//
// Test of a feature that is planned for ICU, but not yet implemented.
// skip the test.
skippedUnimplementedCount++;
delete testPat;
status = U_ZERO_ERROR;
@ -1630,7 +1677,7 @@ void RegexTest::PerlTests() {
//
// Run the test
// Run the test, check for expected match/don't match result.
//
RegexMatcher *testMat = testPat->matcher(matchString, status);
UBool found = testMat->find();
@ -1644,7 +1691,11 @@ void RegexTest::PerlTests() {
}
//
// Interpret the Perl expression from the fourth field of the data file.
// Interpret the Perl expression from the fourth field of the data file,
// building up an ICU string from the results of the ICU match.
// The Perl expression will contain references to the results of
// a regex match, including the matched string, capture group strings,
// group starting and ending indices, etc.
//
UnicodeString resultString;
UnicodeString perlExpr = fields[3];
@ -1746,8 +1797,11 @@ void RegexTest::PerlTests() {
delete testMat;
delete testPat;
}
//
// All done. Clean up allocated stuff.
//
delete cgMat;
delete cgPat;
@ -1756,8 +1810,12 @@ void RegexTest::PerlTests() {
delete flagMat;
delete flagPat;
delete lineMat;
delete linePat;
delete fieldPat;
delete [] testData;
logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);

View File

@ -857,8 +857,7 @@ tt+$ xxxtt y - -
^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c,
(?i) y - -
'(?!\A)x'm a\nxb\n y - -
^(a(b)?)+$ aba y -$1-$2- -a--
^(aa(bb)?)+$ aabbaa y -$1-$2- -aa--
^(a(b)?)+$ aba iy -$1-$2- -a--
'^.{9}abc.*\n'm 123\nabcabcabcabc\n y - -
^(a)?a$ a y -$1- --
^(a)?(?(1)a|b)+$ a n - -