ICU-4024 fix new line handling bugs in regexp

X-SVN-Rev: 16104
This commit is contained in:
Andy Heninger 2004-08-04 04:47:18 +00:00
parent ecbbbe2756
commit 1b339e97db
3 changed files with 103 additions and 9 deletions

View File

@ -317,6 +317,7 @@ UBool RegexMatcher::find() {
// Previous match had zero length. Move start position up one position
// to avoid sending find() into a loop on zero-length matches.
if (startPos == fInput->length()) {
fMatch = FALSE;
return FALSE;
}
startPos = fInput->moveIndex32(startPos, 1);
@ -324,6 +325,7 @@ UBool RegexMatcher::find() {
int32_t inputLen = fInput->length();
int32_t testLen = inputLen - fPattern->fMinMatchLen;
if (startPos > testLen) {
fMatch = FALSE;
return FALSE;
}
@ -357,6 +359,7 @@ UBool RegexMatcher::find() {
// Matches are only possible at the start of the input string
// (pattern begins with ^ or \A)
if (startPos > 0) {
fMatch = FALSE;
return FALSE;
}
MatchAt(startPos, fDeferredStatus);
@ -384,6 +387,7 @@ UBool RegexMatcher::find() {
}
}
if (pos >= testLen) {
fMatch = FALSE;
return FALSE;
}
}
@ -409,6 +413,7 @@ UBool RegexMatcher::find() {
}
}
if (pos >= testLen) {
fMatch = FALSE;
return FALSE;
}
}
@ -432,8 +437,10 @@ UBool RegexMatcher::find() {
for (;;) {
UChar32 c = inputBuf[startPos-1];
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 ||
c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
(c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 )) {
if (c == 0x0d && startPos < inputLen && inputBuf[startPos] == 0x0a) {
startPos++;
}
MatchAt(startPos, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
@ -443,6 +450,7 @@ UBool RegexMatcher::find() {
}
}
if (startPos >= testLen) {
fMatch = FALSE;
return FALSE;
}
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
@ -1204,7 +1212,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
if (fp->fInputIdx == inputLen-1) {
UChar32 c = fInput->char32At(fp->fInputIdx);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
break; // At new-line at end of input. Success
// If not in the middle of a CR/LF sequence
if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
break;
// At new-line at end of input. Success
}
}
}
@ -1225,12 +1237,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// We really are at the end of input. Success.
break;
}
// If we are positioned just before a new-line , succeed.
// If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UChar32 c = inputBuf[fp->fInputIdx];
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
break; // At new-line at end of input. Success
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
break; // At new-line at end of input. Success
}
}
// not at a new line. Fail.
fp = (REStackFrame *)fStack->popFrame(frameSize);
}

View File

@ -223,6 +223,8 @@ void RegexTest::regex_find(const UnicodeString &pattern,
UVector groupEnds(status);
UBool isMatch;
UBool failed = FALSE;
int numFinds;
int i;
//
// Compile the caller's pattern
@ -259,6 +261,20 @@ void RegexTest::regex_find(const UnicodeString &pattern,
RegexPatternDump(callerPattern);
}
//
// Number of times find() should be called on the test string; defaults to 1.
//
numFinds = 1;
for (i=2; i<=9; i++) {
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
if (numFinds != 1) {
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
goto cleanupAndReturn;
}
numFinds = i;
}
}
//
// Find the tags in the input data, remove them, and record the group boundary
// positions.
@ -293,7 +309,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
matcher->setTrace(TRUE);
}
isMatch = matcher->find();
for (i=0; i<numFinds; i++) {
isMatch = matcher->find();
}
matcher->setTrace(FALSE);
//
@ -319,7 +337,6 @@ void RegexTest::regex_find(const UnicodeString &pattern,
goto cleanupAndReturn;
}
int i;
for (i=0; i<=matcher->groupCount(); i++) {
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
if (matcher->start(i, status) != expectedStart) {
@ -1367,9 +1384,9 @@ void RegexTest::Extended() {
//
UnicodeString testString(FALSE, testData, len);
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.+?)\\1", 0, status);
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
RegexMatcher flagsMat ("\\s*([ixsmdtGv]*)([:letter:]*)", 0, status);
RegexMatcher flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
UnicodeString testPattern; // The pattern for test from the test file.

View File

@ -22,6 +22,9 @@
# regex test pattern should not compile.
# d dump the compiled pattern
# t trace operation of match engine.
# 2-9 a single digit between 2 and 9 specifies the number of
# times to execute find() (once if no digit is given; at most
# one digit flag is allowed). The expected results are
# for the last find() in the sequence.
# White space must be present between the flags and the match string.
#
@ -400,6 +403,64 @@
"\D" "<0>\u0100</0>DEF"
"\D" "123<0>\u0100</0>DEF"
#
#bug 4024, new line sequence handling
#
"(?m)^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)^" 2 "AA\u000d\u000a<0></0>BB\u000d\u000aCC\u000d\u000a"
"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0></0>CC\u000d\u000a"
"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)$" "AA<0></0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"(?m)$" 2 "AA\u000d\u000aBB<0></0>\u000d\u000aCC\u000d\u000a"
"(?m)$" 3 "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
"(?m)$" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
"(?m)$" 5 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"$" "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
"$" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
"$" 3 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"$" "\u000a\u0000a<0></0>\u000a"
"$" 2 "\u000a\u0000a\u000a<0></0>"
"$" 3 "\u000a\u0000a\u000a"
"$" "<0></0>"
"$" 2 ""
"$" "<0></0>\u000a"
"$" 2 "\u000a<0></0>"
"$" 3 "\u000a"
"^" "<0></0>"
"^" 2 ""
# No matching ^ at interior new-lines if not in multi-line mode.
"^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
"^" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
#
# Dot-matches-any mode, and stopping at new-lines if off.
#
"." "<0>1</0>23\u000aXYZ"
"." 2 "1<0>2</0>3\u000aXYZ"
"." 3 "12<0>3</0>\u000aXYZ"
"." 4 "123\u000a<0>X</0>YZ" # . doesn't match newlines
"." 4 "123\u000c<0>X</0>YZ"
"." 4 "123\u000d<0>X</0>YZ"
"." 4 "123\u000d\u000a<0>X</0>YZ"
"." 4 "123\u0085<0>X</0>YZ"
"." 4 "123\u2028<0>X</0>YZ"
"." 4 "123\u2029<0>X</0>YZ"
"." 4s "123<0>\u000a</0>XYZ" # . matches any
"." 4s "123<0>\u000c</0>XYZ"
"." 4s "123<0>\u000d</0>XYZ"
"." 4s "123<0>\u000d\u000a</0>XYZ"
"." 4s "123<0>\u0085</0>XYZ"
"." 4s "123<0>\u2028</0>XYZ"
"." 4s "123<0>\u2029</0>XYZ"
".{6}" "123\u000a\u000dXYZ"
".{6}" s "<0>123\u000a\u000dX</0>Y"
#
# Random debugging, Temporary
#