ICU-11049 fix regex find() memory overrun.

X-SVN-Rev: 36124
This commit is contained in:
Andy Heninger 2014-08-06 21:49:08 +00:00
parent 1c22b8cf53
commit e03585d7cf
4 changed files with 62 additions and 3 deletions

View File

@ -1,6 +1,6 @@
/*
**************************************************************************
* Copyright (C) 2002-2013 International Business Machines Corporation *
* Copyright (C) 2002-2014 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
@ -983,7 +983,7 @@ UBool RegexMatcher::findUsingChunk() {
return TRUE;
}
}
if (pos >= testLen) {
if (startPos > testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;

View File

@ -23,6 +23,7 @@
#include "intltest.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/localpointer.h"
#include "unicode/regex.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
@ -140,7 +141,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 23: name = "TestCaseInsensitiveStarters";
if (exec) TestCaseInsensitiveStarters();
break;
case 24: name = "TestBug11049";
if (exec) TestBug11049();
break;
default: name = "";
break; //needed to end loop
}
@ -5303,5 +5306,51 @@ void RegexTest::TestCaseInsensitiveStarters() {
}
void RegexTest::TestBug11049() {
// Original bug report: pattern with match start consisting of one of several individual characters,
// and the text being matched ending with a supplementary character. find() would read past the
// end of the input text when searching for potential match starting points.
// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
// detect the bad read.
UnicodeString patternString("A|B|C");
UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
UChar *exactBuffer = new UChar[txtString.length()];
UErrorCode status = U_ZERO_ERROR;
txtString.extract(exactBuffer, txtString.length(), status);
UText *ut = utext_openUChars(NULL, exactBuffer, txtString.length(), &status);
LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
REGEX_CHECK_STATUS;
LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
matcher->reset(ut);
REGEX_CHECK_STATUS;
UBool result = matcher->find();
REGEX_ASSERT(result == FALSE);
// Verify that match starting on the last char in input will be found.
txtString = UnicodeString("string matches at end C");
matcher->reset(txtString);
result = matcher->find();
REGEX_ASSERT(result == TRUE);
// Put an unpaired surrogate at the end of the input text,
// let valgrind verify that find() doesn't look off the end.
txtString = UnicodeString("a string \\ud800").unescape();
delete [] exactBuffer;
exactBuffer = new UChar[txtString.length()];
txtString.extract(exactBuffer, txtString.length(), status);
utext_openUChars(ut, exactBuffer, txtString.length(), &status);
matcher->reset(ut);
result = matcher->find();
REGEX_ASSERT(result == FALSE);
REGEX_CHECK_STATUS;
utext_close(ut);
delete [] exactBuffer;
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View File

@ -49,6 +49,7 @@ public:
virtual void CheckInvBufSize();
virtual void Bug10459();
virtual void TestCaseInsensitiveStarters();
virtual void TestBug11049();
// The following functions are internal to the regexp tests.
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);

View File

@ -1192,6 +1192,15 @@
"^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
"^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
# Bug 11049
# Edge cases in find() when the pattern match begins with a set of code points
# and the match begins at the end of the string.
"A|B|C" "hello <0>A</0>"
"A|B|C" "hello \U00011234"
"A|B|\U00012345" "hello <0>\U00012345</0>"
"A|B|\U00010000" "hello \ud800"
# Random debugging, Temporary
#