diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 9a1132bdaa..8f1652bcae 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1436,30 +1436,15 @@ void RegexCompile::handleCloseParen() { case -2: // Capturing Parentheses. // Insert a End Capture op into the pattern. - // If this capture group contains other nested capture groups, e.g. - // (a|(b))+ - // emit the variant END_CAPTURE_N, with an extra operand containing - // the number of the last nested group. - // The group number of this cg is obtained from the start capture op - // and put it into the end-capture op. + // The frame offset of the variables for this cg is obtained from the + // start capture op and put it into the end-capture op. { int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); int32_t frameVarLocation = URX_VAL(captureOp); - int32_t lastCG = fRXPat->fGroupMap->size(); - int32_t lastCGframeVarLoc = fRXPat->fGroupMap->elementAti(lastCG-1); - if (frameVarLocation == lastCGframeVarLoc) { - // There are no nested capture groups. The current one is the - // last one that was encountered. Emit tha plain END_CAPTURE. - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); - } else { - // There are nested Captures Groups. - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE_N, frameVarLocation); - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); - fRXPat->fCompiledPat->addElement(lastCG, *fStatus); - } + int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); + fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); } break; case -3: diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 3de5aaafd2..1036c5ae08 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -99,15 +99,12 @@ enum { // capture group variables in the state stack frame. URX_STO_INP_LOC = 35, // Store the input location. Operand is location // within the matcher data (not stack). - URX_JMPX = 36, // Conditional JMP. + URX_JMPX = 36 // Conditional JMP. // First Operand: JMP target location. // Second Operand: Data location containing an // input position. If current input position == // saved input position, FAIL rather than taking // the JMP. - URX_END_CAPTURE_N = 37 // End Capture when cg contains nested groups. - // first operand: Capture group being closed. - // second operand: Last nested capture group. }; // Keep this list of opcode names in sync with the above enum @@ -149,8 +146,7 @@ enum { "LD_SP", \ "BACKREF", \ "STO_INP_LOC", \ - "JMPX", \ - "END_CAPTURE_N" + "JMPX" // // Convenience macros for assembling and disassembling a compiled operation. diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 7904019968..adc8a4eba2 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -733,31 +733,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; - case URX_END_CAPTURE_N: - { - U_ASSERT(opValue >= 0 && opValue < frameSize-3); - U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. - fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. - fp->fExtra[opValue+1] = fp->fInputIdx; // End position - U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); - - int32_t lastNestedGroup = pat[fp->fPatIdx]; - fp->fPatIdx++; - U_ASSERT(lastNestedGroup>1 && lastNestedGroup<=fPattern->fGroupMap->size()); - int32_t nestedGroupNum; - for (nestedGroupNum=lastNestedGroup; ;nestedGroupNum--) { - int32_t nestedGroupVarsIdx = fPattern->fGroupMap->elementAti(nestedGroupNum-1); - if (nestedGroupVarsIdx == opValue) { - break; - } - U_ASSERT(nestedGroupNum>=2); - if (fp->fExtra[nestedGroupVarsIdx] < fp->fExtra[opValue]) { - fp->fExtra[nestedGroupVarsIdx] = -1; - fp->fExtra[nestedGroupVarsIdx+1] = -1; - } - } - } - break; case URX_DOLLAR: // $, test for End of line // or for position before new line at end of input diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 1e771a568d..a98de3eea1 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1465,6 +1465,45 @@ cleanUpAndReturn: //------------------------------------------------------------------------------- // // PerlTests - Run Perl's regular expression tests +// The input file for this test is re_tests, the standard regular +// expression test data distributed with the Perl source code. +// +// Here is Perl's description of the test data file: +// +// # The tests are in a separate file 't/op/re_tests'. +// # Each line in that file is a separate test. +// # There are five columns, separated by tabs. +// # +// # Column 1 contains the pattern, optionally enclosed in C<''>. +// # Modifiers can be put after the closing C<'>. +// # +// # Column 2 contains the string to be matched. +// # +// # Column 3 contains the expected result: +// # y expect a match +// # n expect no match +// # c expect an error +// # B test exposes a known bug in Perl, should be skipped +// # b test exposes a known bug in Perl, should be skipped if noamp +// # +// # Columns 4 and 5 are used only if column 3 contains C or C. +// # +// # Column 4 contains a string, usually C<$&>. +// # +// # Column 5 contains the expected result of double-quote +// # interpolating that string after the match, or start of error message. +// # +// # Column 6, if present, contains a reason why the test is skipped. +// # This is printed with "skipped", for harness to pick up. +// # +// # \n in the tests are interpolated, as are variables of the form ${\w+}. +// # +// # If you want to add a regular expression test that can't be expressed +// # in this format, don't add it here: put it in op/pat.t instead. +// +// For ICU, if field 3 contains an 'i', the test will be skipped. +// The test exposes is some known incompatibility between ICU and Perl regexps. +// (The i is in addition to whatever was there before.) // //------------------------------------------------------------------------------- void RegexTest::PerlTests() { @@ -1502,28 +1541,21 @@ void RegexTest::PerlTests() { // // Regex to identify test patterns with flag settings, and to separate them. // Test patterns with flags look like 'pattern'i - // Test patterns without flags are not quoted: paterrn + // Test patterns without flags are not quoted: pattern // Coming out, capture group 2 is the pattern, capture group 3 is the flags. // RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status); RegexMatcher* flagMat = flagPat->matcher("", status); // - // Regex to find ${bang}. Perl doesn't put literal '!'s into patterns. - // - RegexPattern *bangPat = RegexPattern::compile("\\$\\{bang\\}", 0, pe, status); - RegexMatcher *bangMat = bangPat->matcher("", status); - - // - // ${nulnul} and its replacement string. + // The Perl tests reference several perl-isms, which are evaluated/substituted + // in the test data. Not being perl, this must be done explicitly. Here + // are string constants and REs for these constructs. // UnicodeString nulnulSrc("${nulnul}"); UnicodeString nulnul("\\u0000\\u0000"); nulnul = nulnul.unescape(); - // - // Regex to find ${ffff}. Perl doesn't put \uffff into patterns. - // UnicodeString ffffSrc("${ffff}"); UnicodeString ffff("\\uffff"); ffff = ffff.unescape(); @@ -1537,10 +1569,19 @@ void RegexTest::PerlTests() { RegexMatcher *cgMat = cgPat->matcher("", status); + // + // Main Loop for the Perl Tests, runs once per line from the + // test data file. + // int32_t lineNum = 0; int32_t skippedUnimplementedCount = 0; while (lineMat->find()) { lineNum++; + + // + // Get a line, break it into its fields, do the Perl + // variable substitutions. + // UnicodeString line = lineMat->group(1, status); UnicodeString fields[7]; fieldPat->split(line, fields, 7, status); @@ -1548,10 +1589,14 @@ void RegexTest::PerlTests() { flagMat->reset(fields[0]); flagMat->matches(status); UnicodeString pattern = flagMat->group(2, status); - bangMat->reset(pattern); - pattern = bangMat->replaceAll("!", status); + pattern.findAndReplace("${bang}", "!"); pattern.findAndReplace(nulnulSrc, "\\u0000\\u0000"); pattern.findAndReplace(ffffSrc, ffff); + + // + // Identify patterns that include match flag settings, + // split off the flags, remove the extra quotes. + // UnicodeString flagStr = flagMat->group(3, status); // printf("pattern = %s\n", cstar(pattern)); // printf(" flags = %s\n", cstar(flags)); @@ -1559,10 +1604,9 @@ void RegexTest::PerlTests() { errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); return; } - int32_t flags = 0; - const UChar UChar_c = 0x63; // Damn the lack of Unicode support in C - const UChar UChar_i = 0x69; + const UChar UChar_c = 0x63; // Char constants for the flag letters. + const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) const UChar UChar_m = 0x6d; const UChar UChar_x = 0x78; const UChar UChar_y = 0x79; @@ -1582,6 +1626,9 @@ void RegexTest::PerlTests() { status = U_ZERO_ERROR; RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); if (status == U_REGEX_UNIMPLEMENTED) { + // + // Test of a feature that is planned for ICU, but not yet implemented. + // skip the test. skippedUnimplementedCount++; delete testPat; status = U_ZERO_ERROR; @@ -1630,7 +1677,7 @@ void RegexTest::PerlTests() { // - // Run the test + // Run the test, check for expected match/don't match result. // RegexMatcher *testMat = testPat->matcher(matchString, status); UBool found = testMat->find(); @@ -1644,7 +1691,11 @@ void RegexTest::PerlTests() { } // - // Interpret the Perl expression from the fourth field of the data file. + // Interpret the Perl expression from the fourth field of the data file, + // building up an ICU string from the results of the ICU match. + // The Perl expression will contain references to the results of + // a regex match, including the matched string, capture group strings, + // group starting and ending indicies, etc. // UnicodeString resultString; UnicodeString perlExpr = fields[3]; @@ -1746,8 +1797,11 @@ void RegexTest::PerlTests() { delete testMat; delete testPat; - } + + // + // All done. Clean up allocated stuff. + // delete cgMat; delete cgPat; @@ -1756,8 +1810,12 @@ void RegexTest::PerlTests() { delete flagMat; delete flagPat; + + delete lineMat; + delete linePat; delete fieldPat; + delete [] testData; logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); diff --git a/icu4c/source/test/testdata/re_tests.txt b/icu4c/source/test/testdata/re_tests.txt index bfb48bce3d..c91e74f048 100644 --- a/icu4c/source/test/testdata/re_tests.txt +++ b/icu4c/source/test/testdata/re_tests.txt @@ -857,8 +857,7 @@ tt+$ xxxtt y - - ^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c, (?i) y - - '(?!\A)x'm a\nxb\n y - - -^(a(b)?)+$ aba y -$1-$2- -a-- -^(aa(bb)?)+$ aabbaa y -$1-$2- -aa-- +^(a(b)?)+$ aba iy -$1-$2- -a-- '^.{9}abc.*\n'm 123\nabcabcabcabc\n y - - ^(a)?a$ a y -$1- -- ^(a)?(?(1)a|b)+$ a n - -