ICU-8482 Changed the behavior of the already-broken match boundary condition so that the resulting match length is not expanded when 1) the end of the CE match is part of an expansion and 2) the limit of the CE match is already on a break boundary. Also flipped the behavior of isBreakBoundary, which previously behaved as is*Not*BreakBoundary.

X-SVN-Rev: 29848
This commit is contained in:
Yoshito Umaoka 2011-04-22 05:37:46 +00:00
parent 2861a47a86
commit 39f9a8686f
4 changed files with 124 additions and 22 deletions

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines *
* Copyright (C) 1996-2011, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
@ -775,10 +775,20 @@ UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
mLimit = maxLimit;
if (minLimit < maxLimit) {
int32_t nbb = target->nextBreakBoundary(minLimit);
// When the last CE's low index is the same as its high index, the CE is likely
// part of an expansion. In this case, the index is located just after the
// character corresponding to the CEs compared above. If the index is right
// at a break boundary, moving the position to the next boundary would result
// in an incorrect match length when ignorable characters exist between the
// position and the next character that produces CE(s). See ticket#8482.
if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
mLimit = minLimit;
} else {
int32_t nbb = target->nextBreakBoundary(minLimit);
if (nbb >= lastCEI.highOffset) {
mLimit = nbb;
if (nbb >= lastCEI.highOffset) {
mLimit = nbb;
}
}
}

View File

@ -3659,7 +3659,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
U_ASSERT(index<=textLen);
if (index>=textLen || index<=0) {
return FALSE;
return TRUE;
}
// If the character at the current index is not a GRAPHEME_EXTEND
@ -3668,7 +3668,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
U16_GET(text, 0, index, textLen, c);
int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
return FALSE;
return TRUE;
}
// We are at a combining mark. If the preceding character is anything
@ -3676,7 +3676,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
U16_PREV(text, 0, index, c);
gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
return combining;
return !combining;
#elif !UCONFIG_NO_BREAK_ITERATION
UBreakIterator *breakiterator = strsrch->search->breakIter;
@ -3684,10 +3684,10 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
breakiterator = strsrch->search->internalBreakIter;
}
return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
#else
// **** or use the original code? ****
return FALSE;
return TRUE;
#endif
}
@ -3906,14 +3906,14 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
// 1. The match extended to the last CE from the target text, which is OK, or
// 2. The last CE that was part of the match is in an expansion that extends
// to the first CE after the match. In this case, we reject the match.
const CEI *nextCEI = 0;
if (strsrch->search->elementComparisonType == 0) {
const CEI *nextCEI = ceb.get(targetIx + targetIxOffset);
nextCEI = ceb.get(targetIx + targetIxOffset);
maxLimit = nextCEI->lowIndex;
if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
found = FALSE;
}
} else {
const CEI *nextCEI;
for ( ; ; ++targetIxOffset ) {
nextCEI = ceb.get(targetIx + targetIxOffset);
maxLimit = nextCEI->lowIndex;
@ -3949,7 +3949,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
// to something else.
// This type of match should be rejected for not completely consuming a
// combining sequence.
if (isBreakBoundary(strsrch, mStart)) {
if (!isBreakBoundary(strsrch, mStart)) {
found = FALSE;
}
@ -3967,10 +3967,19 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
// This advances the index over any combining characters.
mLimit = maxLimit;
if (minLimit < maxLimit) {
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
if (nba >= lastCEI->highIndex) {
mLimit = nba;
// When the last CE's low index is the same as its high index, the CE is likely
// part of an expansion. In this case, the index is located just after the
// character corresponding to the CEs compared above. If the index is right
// at a break boundary, moving the position to the next boundary would result
// in an incorrect match length when ignorable characters exist between the
// position and the next character that produces CE(s). See ticket#8482.
if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
mLimit = minLimit;
} else {
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
if (nba >= lastCEI->highIndex) {
mLimit = nba;
}
}
}
@ -3986,7 +3995,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
found = FALSE;
}
if (isBreakBoundary(strsrch, mLimit)) {
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
}
@ -4165,7 +4174,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
// to something else.
// This type of match should be rejected for not completely consuming a
// combining sequence.
if (isBreakBoundary(strsrch, mStart)) {
if (!isBreakBoundary(strsrch, mStart)) {
found = FALSE;
}
@ -4213,7 +4222,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
}
// Make sure the end of the match is on a break boundary
if (isBreakBoundary(strsrch, mLimit)) {
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
}

View File

@ -2914,6 +2914,78 @@ static void TestPCEBuffer_2surr() {
TestPCEBuffer_with(search,searchLen,source,sourceLen);
}
static void TestMatchFollowedByIgnorables(void) {
/* test case for ticket#8482 */
UChar search[] = { 0x00c9 };
UChar source[] = { 0x00c9, 0x0000, 0x0041 };
int32_t searchLen;
int32_t sourceLen;
UErrorCode icuStatus = U_ZERO_ERROR;
UCollator *coll;
const char *locale;
UBreakIterator *ubrk;
UStringSearch *usearch;
int32_t match = 0;
int32_t matchLength = 0;
const int32_t expectedMatchLength = 1;
searchLen = sizeof(search)/sizeof(UChar);
sourceLen = sizeof(source)/sizeof(UChar);
coll = ucol_openFromShortString("LHR_AN_CX_EX_FX_HX_NX_S3",
FALSE,
NULL,
&icuStatus);
if (U_FAILURE(icuStatus)) {
log_err("ucol_openFromShortString error\n");
}
locale = ucol_getLocaleByType(coll,
ULOC_VALID_LOCALE,
&icuStatus);
if (U_FAILURE(icuStatus)) {
log_err("ucol_getLocaleByType error\n");
}
ubrk = ubrk_open(UBRK_CHARACTER,
locale,
source,
sourceLen,
&icuStatus);
if (U_FAILURE(icuStatus)) {
log_err("ubrk_open error\n");
}
usearch = usearch_openFromCollator(search,
searchLen,
source,
sourceLen,
coll,
ubrk,
&icuStatus);
if (U_FAILURE(icuStatus)) {
log_err("usearch_openFromCollator error\n");
}
match = usearch_first(usearch,
&icuStatus);
if (U_FAILURE(icuStatus)) {
log_err("usearch_first error\n");
}
log_verbose("match=%d\n", match);
matchLength = usearch_getMatchedLength(usearch);
if (U_FAILURE(icuStatus)) {
log_err("usearch_getMatchedLength error\n");
}
if (matchLength != expectedMatchLength) {
log_err("Error: matchLength=%d, expected=%d\n", matchLength, expectedMatchLength);
}
}
/**
* addSearchTest
*/
@ -2975,6 +3047,7 @@ void addSearchTest(TestNode** root)
addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCollator");
addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -2010,10 +2010,20 @@ static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
// that's after the last CE in the match, use that index
// as the end of the match.
if (minLimit < maxLimit) {
int32_t nba = ubrk_following(charBreakIterator, minLimit);
// When the last CE's low index is the same as its high index, the CE is likely
// part of an expansion. In this case, the index is located just after the
// character corresponding to the CEs compared above. If the index is right
// at a break boundary, moving the position to the next boundary would result
// in an incorrect match length when ignorable characters exist between the
// position and the next character that produces CE(s). See ticket#8482.
if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
mend = minLimit;
} else {
int32_t nba = ubrk_following(charBreakIterator, minLimit);
if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
mend = nba;
if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
mend = nba;
}
}
}