ICU-4024 fix new line handling bugs in regexp

X-SVN-Rev: 16104
2004-08-04 04:47:18 +00:00 · 2004-08-04 04:47:18 +00:00 · 1b339e97db
commit 1b339e97db
parent ecbbbe2756
3 changed files with 103 additions and 9 deletions
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -317,6 +317,7 @@ UBool RegexMatcher::find() {
        // Previous match had zero length.  Move start position up one position
        //  to avoid sending find() into a loop on zero-length matches.
        if (startPos == fInput->length()) {
+            fMatch = FALSE;
            return FALSE;
        }
        startPos = fInput->moveIndex32(startPos, 1);
@ -324,6 +325,7 @@ UBool RegexMatcher::find() {
    int32_t inputLen = fInput->length();
    int32_t testLen  = inputLen - fPattern->fMinMatchLen;
    if (startPos > testLen) {
+        fMatch = FALSE;
        return FALSE;
    }

@ -357,6 +359,7 @@ UBool RegexMatcher::find() {
        // Matches are only possible at the start of the input string
        //   (pattern begins with ^ or \A)
        if (startPos > 0) {
+            fMatch = FALSE;
            return FALSE;
        }
        MatchAt(startPos, fDeferredStatus);
@ -384,6 +387,7 @@ UBool RegexMatcher::find() {
                    }
                }
                if (pos >= testLen) {
+                    fMatch = FALSE;
                    return FALSE;
                }
            }
@ -409,6 +413,7 @@ UBool RegexMatcher::find() {
                    }
                }
                if (pos >= testLen) {
+                    fMatch = FALSE;
                    return FALSE;
                }
            }
@ -432,8 +437,10 @@ UBool RegexMatcher::find() {
            for (;;) {
                UChar32 c = inputBuf[startPos-1];
                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
-                    (c == 0x0a ||  c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 || 
-                    c == 0x0d && startPos+1 < inputLen && inputBuf[startPos+1] != 0x0a)) {
+                    (c == 0x0a ||  c==0x0c || c==0x85 ||c==0x2028 || c==0x2029 )) {
+                    if (c == 0x0d && startPos < inputLen && inputBuf[startPos] == 0x0a) {
+                        startPos++;
+                    }
                    MatchAt(startPos, fDeferredStatus);
                    if (U_FAILURE(fDeferredStatus)) {
                        return FALSE;
@ -443,6 +450,7 @@ UBool RegexMatcher::find() {
                    }
                }
                if (startPos >= testLen) {
+                    fMatch = FALSE;
                    return FALSE;
                }
                U16_NEXT(inputBuf, startPos, inputLen, c);  // like c = inputBuf[startPos++];
@ -1204,7 +1212,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            if (fp->fInputIdx == inputLen-1) {
                UChar32 c = fInput->char32At(fp->fInputIdx);
                if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
-                    break;                         // At new-line at end of input. Success
+                    // If not in the middle of a CR/LF sequence
+                    if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+                        break;
+                        // At new-line at end of input. Success
+                    }
                }
            }

@ -1225,12 +1237,16 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
                     // We really are at the end of input.  Success.
                     break;
                 }
-                 // If we are positioned just before a new-line , succeed.
+                 // If we are positioned just before a new-line, succeed.
                 // It makes no difference where the new-line is within the input.
                 UChar32 c = inputBuf[fp->fInputIdx];
                 if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
-                     break;                         // At new-line at end of input. Success
+                     // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
+                     if ( !(c==0x0a && fp->fInputIdx>0 && inputBuf[fp->fInputIdx-1]==0x0d)) {
+                        break;                         // At new-line at end of input. Success
+                     }
                 }
+                 
                 // not at a new line.  Fail.
                 fp = (REStackFrame *)fStack->popFrame(frameSize);
             }
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -223,6 +223,8 @@ void RegexTest::regex_find(const UnicodeString &pattern,
    UVector             groupEnds(status);
    UBool               isMatch;
    UBool               failed         = FALSE;
+    int                 numFinds;
+    int                 i;

    //
    //  Compile the caller's pattern
@ -259,6 +261,20 @@ void RegexTest::regex_find(const UnicodeString &pattern,
        RegexPatternDump(callerPattern);
    }

+    //
+    // Number of times find() should be called on the test string, default to 1
+    //
+    numFinds = 1;
+    for (i=2; i<=9; i++) {
+        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
+            if (numFinds != 1) {
+                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
+                goto cleanupAndReturn;
+            }
+            numFinds = i;
+        }
+    }
+
    //
    //  Find the tags in the input data, remove them, and record the group boundary
    //    positions.
@ -293,7 +309,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
        matcher->setTrace(TRUE);
    }

-    isMatch = matcher->find();
+    for (i=0; i<numFinds; i++) {
+        isMatch = matcher->find();
+    }
    matcher->setTrace(FALSE);

    //
@ -319,7 +337,6 @@ void RegexTest::regex_find(const UnicodeString &pattern,
        goto cleanupAndReturn;   
    }
    
-    int i;
    for (i=0; i<=matcher->groupCount(); i++) {
        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
        if (matcher->start(i, status) != expectedStart) {
@ -1367,9 +1384,9 @@ void RegexTest::Extended() {
    //
    UnicodeString testString(FALSE, testData, len);

-    RegexMatcher    quotedStuffMat("\\s*([\\'\\\"/])(.+?)\\1", 0, status);
+    RegexMatcher    quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
    RegexMatcher    commentMat    ("\\s*(#.*)?$", 0, status); 
-    RegexMatcher    flagsMat      ("\\s*([ixsmdtGv]*)([:letter:]*)", 0, status);
+    RegexMatcher    flagsMat      ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);

    RegexMatcher    lineMat("(.*?)\\r?\\n", testString, 0, status);
    UnicodeString   testPattern;   // The pattern for test from the test file.
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -22,6 +22,9 @@
 #                                          regex test pattern should not compile.
 #                                   d      dump the compiled pattern
 #                                   t      trace operation of match engine.
+#                                   2-9    a single digit from 2 to 9: the number of times to
+#                                          execute find() (default 1).  The expected results
+#                                          are for the last find() in the sequence.
 #                                 White space must be present between the flags and the match string.
 #

@ -400,6 +403,64 @@
 "\D"                              "<0>\u0100</0>DEF"
 "\D"                              "123<0>\u0100</0>DEF"

+#
+# Bug 4024, new-line sequence handling
+#
+"(?m)^"                           "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+"(?m)^"                       2   "AA\u000d\u000a<0></0>BB\u000d\u000aCC\u000d\u000a"
+"(?m)^"                       3   "AA\u000d\u000aBB\u000d\u000a<0></0>CC\u000d\u000a"
+"(?m)^"                       4   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+
+"(?m)$"                           "AA<0></0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+"(?m)$"                       2   "AA\u000d\u000aBB<0></0>\u000d\u000aCC\u000d\u000a"
+"(?m)$"                       3   "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
+"(?m)$"                       4   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
+"(?m)$"                       5   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+
+"$"                               "AA\u000d\u000aBB\u000d\u000aCC<0></0>\u000d\u000a"
+"$"                           2   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0></0>"
+"$"                           3   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+
+"$"                               "\u000a\u0000a<0></0>\u000a"
+"$"                           2   "\u000a\u0000a\u000a<0></0>"
+"$"                           3   "\u000a\u0000a\u000a"
+
+"$"                               "<0></0>"
+"$"                           2   ""
+
+"$"                               "<0></0>\u000a"
+"$"                           2   "\u000a<0></0>"
+"$"                           3   "\u000a"
+
+"^"                               "<0></0>"
+"^"                           2   ""
+
+# No matching ^ at interior new-lines if not in multi-line mode.
+"^"                               "<0></0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+"^"                           2   "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a"
+
+#
+# Dot-matches-any mode, and stopping at new-lines if off.
+#
+"."                               "<0>1</0>23\u000aXYZ"
+"."                           2   "1<0>2</0>3\u000aXYZ"
+"."                           3   "12<0>3</0>\u000aXYZ"
+"."                           4   "123\u000a<0>X</0>YZ"    # . doesn't match newlines
+"."                           4   "123\u000c<0>X</0>YZ"
+"."                           4   "123\u000d<0>X</0>YZ"
+"."                           4   "123\u000d\u000a<0>X</0>YZ"
+"."                           4   "123\u0085<0>X</0>YZ"
+"."                           4   "123\u2028<0>X</0>YZ"
+"."                           4   "123\u2029<0>X</0>YZ"
+"."                           4s  "123<0>\u000a</0>XYZ"    # . matches any
+"."                           4s  "123<0>\u000c</0>XYZ"
+"."                           4s  "123<0>\u000d</0>XYZ"
+"."                           4s  "123<0>\u000d\u000a</0>XYZ"
+"."                           4s  "123<0>\u0085</0>XYZ"
+"."                           4s  "123<0>\u2028</0>XYZ"
+"."                           4s  "123<0>\u2029</0>XYZ"
+".{6}"                            "123\u000a\u000dXYZ"
+".{6}"                         s  "<0>123\u000a\u000dX</0>Y"
 #
 #  Random debugging, Temporary
 #