ICU-4024 fix new line handling bugs in regexp
X-SVN-Rev: 16104
This commit is contained in:
parent
ecbbbe2756
commit
1b339e97db
@ -317,6 +317,7 @@ UBool RegexMatcher::find() {
|
||||
// Previous match had zero length. Move start position up one position
|
||||
// to avoid sending find() into a loop on zero-length matches.
|
||||
if (startPos == fInput->length()) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
startPos = fInput->moveIndex32(startPos, 1);
|
||||
@ -324,6 +325,7 @@ UBool RegexMatcher::find() {
|
||||
int32_t inputLen = fInput->length();
|
||||
int32_t testLen = inputLen - fPattern->fMinMatchLen;
|
||||
if (startPos > testLen) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -357,6 +359,7 @@ UBool RegexMatcher::find() {
|
||||
// Matches are only possible at the start of the input string
|
||||
// (pattern begins with ^ or \A)
|
||||
if (startPos > 0) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
MatchAt(startPos, fDeferredStatus);
|
||||
@ -384,6 +387,7 @@ UBool RegexMatcher::find() {
|
||||
}
|
||||
}
|
||||
if (pos >= testLen) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
@ -409,6 +413,7 @@ UBool RegexMatcher::find() {
|
||||
}
|
||||
}
|
||||
if (pos >= testLen) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
@ -432,8 +437,10 @@ UBool RegexMatcher::find() {
|
||||
for (;;) {
|
||||
UChar32 c = inputBuf[startPos-1];
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
(c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 ||
|
||||
c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
|
||||
(c == 0x0a || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 )) {
|
||||
if (c == 0x0d && startPos < inputLen && inputBuf[startPos] == 0x0a) {
|
||||
startPos++;
|
||||
}
|
||||
MatchAt(startPos, fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return FALSE;
|
||||
@ -443,6 +450,7 @@ UBool RegexMatcher::find() {
|
||||
}
|
||||
}
|
||||
if (startPos >= testLen) {
|
||||
fMatch = FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
|
||||
@ -1204,7 +1212,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
if (fp->fInputIdx == inputLen-1) {
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
break; // At new-line at end of input. Success
|
||||
// If not in the middle of a CR/LF sequence
|
||||
if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
|
||||
break;
|
||||
// At new-line at end of input. Success
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1225,12 +1237,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
// We really are at the end of input. Success.
|
||||
break;
|
||||
}
|
||||
// If we are positioned just before a new-line , succeed.
|
||||
// If we are positioned just before a new-line, succeed.
|
||||
// It makes no difference where the new-line is within the input.
|
||||
UChar32 c = inputBuf[fp->fInputIdx];
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
break; // At new-line at end of input. Success
|
||||
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
|
||||
if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
|
||||
break; // At new-line at end of input. Success
|
||||
}
|
||||
}
|
||||
|
||||
// not at a new line. Fail.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
|
@ -223,6 +223,8 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
UVector groupEnds(status);
|
||||
UBool isMatch;
|
||||
UBool failed = FALSE;
|
||||
int numFinds;
|
||||
int i;
|
||||
|
||||
//
|
||||
// Compile the caller's pattern
|
||||
@ -259,6 +261,20 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
RegexPatternDump(callerPattern);
|
||||
}
|
||||
|
||||
//
|
||||
// Number of times find() should be called on the test string; defaults to 1.
|
||||
//
|
||||
numFinds = 1;
|
||||
for (i=2; i<=9; i++) {
|
||||
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
|
||||
if (numFinds != 1) {
|
||||
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
numFinds = i;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Find the tags in the input data, remove them, and record the group boundary
|
||||
// positions.
|
||||
@ -293,7 +309,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
matcher->setTrace(TRUE);
|
||||
}
|
||||
|
||||
isMatch = matcher->find();
|
||||
for (i=0; i<numFinds; i++) {
|
||||
isMatch = matcher->find();
|
||||
}
|
||||
matcher->setTrace(FALSE);
|
||||
|
||||
//
|
||||
@ -319,7 +337,6 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
int i;
|
||||
for (i=0; i<=matcher->groupCount(); i++) {
|
||||
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
|
||||
if (matcher->start(i, status) != expectedStart) {
|
||||
@ -1367,9 +1384,9 @@ void RegexTest::Extended() {
|
||||
//
|
||||
UnicodeString testString(FALSE, testData, len);
|
||||
|
||||
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.+?)\\1", 0, status);
|
||||
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
|
||||
RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
|
||||
RegexMatcher flagsMat ("\\s*([ixsmdtGv]*)([:letter:]*)", 0, status);
|
||||
RegexMatcher flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
|
||||
|
||||
RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
|
||||
UnicodeString testPattern; // The pattern for test from the test file.
|
||||
|
61
icu4c/source/test/testdata/regextst.txt
vendored
61
icu4c/source/test/testdata/regextst.txt
vendored
@ -22,6 +22,9 @@
|
||||
# regex test pattern should not compile.
|
||||
# d dump the compiled pattern
|
||||
# t trace operation of match engine.
|
||||
# 2-9 a digit between 2 and 9 that specifies the number of
|
||||
# times to execute find(). The expected results are
|
||||
# for the last find() in the sequence.
|
||||
# White space must be present between the flags and the match string.
|
||||
#
|
||||
|
||||
@ -400,6 +403,64 @@
|
||||
"\D" "<0>\u0100</0>DEF"
|
||||
"\D" "123<0>\u0100</0>DEF"
|
||||
|
||||
#
|
||||
# Bug 4024: new-line sequence handling
|
||||
#
|
||||
"(?m)^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
"(?m)^" 2 "AA\u000d\u000a<0></0>BB\u000d\u000aCC\u000d\u000a"
|
||||
"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0></0>CC\u000d\u000a"
|
||||
"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
|
||||
"(?m)$" "AA<0></0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
"(?m)$" 2 "AA\u000d\u000aBB<0></0>\u000d\u000aCC\u000d\u000a"
|
||||
"(?m)$" 3 "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
|
||||
"(?m)$" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
|
||||
"(?m)$" 5 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
|
||||
"$" "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
|
||||
"$" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
|
||||
"$" 3 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
|
||||
"$" "\u000a\u0000a<0></0>\u000a"
|
||||
"$" 2 "\u000a\u0000a\u000a<0></0>"
|
||||
"$" 3 "\u000a\u0000a\u000a"
|
||||
|
||||
"$" "<0></0>"
|
||||
"$" 2 ""
|
||||
|
||||
"$" "<0></0>\u000a"
|
||||
"$" 2 "\u000a<0></0>"
|
||||
"$" 3 "\u000a"
|
||||
|
||||
"^" "<0></0>"
|
||||
"^" 2 ""
|
||||
|
||||
# No matching ^ at interior new-lines if not in multi-line mode.
|
||||
"^" "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
"^" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
|
||||
|
||||
#
|
||||
# Dot-matches-any mode, and stopping at new-lines if off.
|
||||
#
|
||||
"." "<0>1</0>23\u000aXYZ"
|
||||
"." 2 "1<0>2</0>3\u000aXYZ"
|
||||
"." 3 "12<0>3</0>\u000aXYZ"
|
||||
"." 4 "123\u000a<0>X</0>YZ" # . doesn't match newlines
|
||||
"." 4 "123\u000c<0>X</0>YZ"
|
||||
"." 4 "123\u000d<0>X</0>YZ"
|
||||
"." 4 "123\u000d\u000a<0>X</0>YZ"
|
||||
"." 4 "123\u0085<0>X</0>YZ"
|
||||
"." 4 "123\u2028<0>X</0>YZ"
|
||||
"." 4 "123\u2029<0>X</0>YZ"
|
||||
"." 4s "123<0>\u000a</0>XYZ" # . matches any
|
||||
"." 4s "123<0>\u000c</0>XYZ"
|
||||
"." 4s "123<0>\u000d</0>XYZ"
|
||||
"." 4s "123<0>\u000d\u000a</0>XYZ"
|
||||
"." 4s "123<0>\u0085</0>XYZ"
|
||||
"." 4s "123<0>\u2028</0>XYZ"
|
||||
"." 4s "123<0>\u2029</0>XYZ"
|
||||
".{6}" "123\u000a\u000dXYZ"
|
||||
".{6}" s "<0>123\u000a\u000dX</0>Y"
|
||||
#
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
|
Loading…
Reference in New Issue
Block a user