ICU-11049 fix regex find() memory overrun.

X-SVN-Rev: 36124
2014-08-06 21:49:08 +00:00 · 2014-08-06 21:49:08 +00:00 · e03585d7cf
commit e03585d7cf
parent 1c22b8cf53
4 changed files with 62 additions and 3 deletions
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -1,6 +1,6 @@
 /*
 **************************************************************************
-*   Copyright (C) 2002-2013 International Business Machines Corporation  *
+*   Copyright (C) 2002-2014 International Business Machines Corporation  *
 *   and others. All rights reserved.                                     *
 **************************************************************************
 */
@ -983,7 +983,7 @@ UBool RegexMatcher::findUsingChunk() {
                    return TRUE;
                }
            }
-            if (pos >= testLen) {
+            if (startPos > testLen) {
                fMatch = FALSE;
                fHitEnd = TRUE;
                return FALSE;
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -23,6 +23,7 @@
 #include "intltest.h"
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

+#include "unicode/localpointer.h"
 #include "unicode/regex.h"
 #include "unicode/uchar.h"
 #include "unicode/ucnv.h"
@ -140,7 +141,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
        case 23: name = "TestCaseInsensitiveStarters";
            if (exec) TestCaseInsensitiveStarters();
            break;
-
+        case 24: name = "TestBug11049";
+            if (exec) TestBug11049();
+            break;
        default: name = "";
            break; //needed to end loop
    }
@ -5303,5 +5306,51 @@ void RegexTest::TestCaseInsensitiveStarters() {
 }


+void RegexTest::TestBug11049() {
+    // Original bug report: pattern with match start consisting of one of several individual characters,
+    //  and the text being matched ending with a supplementary character. find() would read past the
+    //  end of the input text when searching for potential match starting points.
+
+    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
+    // detect the bad read.
+
+    // The buffer and the UText are held by RAII owners declared ahead of the matcher, so they
+    // outlive it and are released on every exit path, including an early exit from a failed
+    // REGEX_CHECK_STATUS (assumed to return from the test on failure).
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString patternString("A|B|C");
+    UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
+    LocalArray<UChar> exactBuffer(new UChar[txtString.length()]);
+    txtString.extract(exactBuffer.getAlias(), txtString.length(), status);
+    LocalUTextPointer ut(utext_openUChars(NULL, exactBuffer.getAlias(), txtString.length(), &status));
+
+    LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
+    REGEX_CHECK_STATUS;
+    LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
+    REGEX_CHECK_STATUS;     // Check before the matcher is dereferenced.
+    matcher->reset(ut.getAlias());
+    UBool result = matcher->find();
+    REGEX_ASSERT(result == FALSE);
+
+    // Verify that match starting on the last char in input will be found.
+    txtString = UnicodeString("string matches at end C");
+    matcher->reset(txtString);
+    result = matcher->find();
+    REGEX_ASSERT(result == TRUE);
+
+    // Put an unpaired surrogate at the end of the input text,
+    // let valgrind verify that find() doesn't look off the end.
+    // adoptInstead() frees the first buffer; the matcher was reset to txtString above.
+    txtString = UnicodeString("a string \\ud800").unescape();
+    exactBuffer.adoptInstead(new UChar[txtString.length()]);
+    txtString.extract(exactBuffer.getAlias(), txtString.length(), status);
+    utext_openUChars(ut.getAlias(), exactBuffer.getAlias(), txtString.length(), &status);
+    matcher->reset(ut.getAlias());
+    result = matcher->find();
+    REGEX_ASSERT(result == FALSE);
+    REGEX_CHECK_STATUS;
+}
+
+
 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */

--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@ -49,6 +49,7 @@ public:
    virtual void CheckInvBufSize();
    virtual void Bug10459();
    virtual void TestCaseInsensitiveStarters();
+    virtual void TestBug11049();
    
    // The following functions are internal to the regexp tests.
    virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -1192,6 +1192,15 @@
 "^(\w+\d\w+:\w+)$"              "<0><1>DiesIst1Beispiel:text</1></0>"
 "^(\w+\d\w+:\w+)$"       i      "<0><1>DiesIst1Beispiel:text</1></0>"

+# Bug 11049
+#   Edge cases in find() when pattern match begins with set of code points
+#   and the match begins at the end of the string.
+
+"A|B|C"                         "hello <0>A</0>"
+"A|B|C"                         "hello \U00011234"
+"A|B|\U00012345"                "hello <0>\U00012345</0>"
+"A|B|\U00010000"                "hello \ud800"
+
 #  Random debugging, Temporary
 #