ICU-8482 Changed the behavior of the already broken match boundary condition so that the resulting match length is not expanded when 1) the end of the CE match is part of an expansion and 2) the limit of the CE match is already on a break boundary. Also flipped the behavior of isBreakBoundary, which was actually is*Not*BreakBoundary before.
X-SVN-Rev: 29848
This commit is contained in:
parent
2861a47a86
commit
39f9a8686f
@ -1,6 +1,6 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2010, International Business Machines *
|
||||
* Copyright (C) 1996-2011, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
@ -775,10 +775,20 @@ UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
|
||||
|
||||
mLimit = maxLimit;
|
||||
if (minLimit < maxLimit) {
|
||||
int32_t nbb = target->nextBreakBoundary(minLimit);
|
||||
// When the last CE's low index is the same as its high index, the CE is likely
|
||||
// part of an expansion. In this case, the index is located just after the
|
||||
// character corresponding to the CEs compared above. If the index is right
|
||||
// at a break boundary, moving the position to the next boundary will result
|
||||
// in an incorrect match length when ignorable characters exist between
|
||||
// the position and the next character that produces CE(s). See ticket#8482.
|
||||
if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
|
||||
mLimit = minLimit;
|
||||
} else {
|
||||
int32_t nbb = target->nextBreakBoundary(minLimit);
|
||||
|
||||
if (nbb >= lastCEI.highOffset) {
|
||||
mLimit = nbb;
|
||||
if (nbb >= lastCEI.highOffset) {
|
||||
mLimit = nbb;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3659,7 +3659,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
|
||||
U_ASSERT(index<=textLen);
|
||||
|
||||
if (index>=textLen || index<=0) {
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// If the character at the current index is not a GRAPHEME_EXTEND
|
||||
@ -3668,7 +3668,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
|
||||
U16_GET(text, 0, index, textLen, c);
|
||||
int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
|
||||
if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// We are at a combining mark. If the preceding character is anything
|
||||
@ -3676,7 +3676,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
|
||||
U16_PREV(text, 0, index, c);
|
||||
gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
|
||||
UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
|
||||
return combining;
|
||||
return !combining;
|
||||
#elif !UCONFIG_NO_BREAK_ITERATION
|
||||
UBreakIterator *breakiterator = strsrch->search->breakIter;
|
||||
|
||||
@ -3684,10 +3684,10 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
|
||||
breakiterator = strsrch->search->internalBreakIter;
|
||||
}
|
||||
|
||||
return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
|
||||
return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
|
||||
#else
|
||||
// **** or use the original code? ****
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -3906,14 +3906,14 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
// 1. The match extended to the last CE from the target text, which is OK, or
|
||||
// 2. The last CE that was part of the match is in an expansion that extends
|
||||
// to the first CE after the match. In this case, we reject the match.
|
||||
const CEI *nextCEI = 0;
|
||||
if (strsrch->search->elementComparisonType == 0) {
|
||||
const CEI *nextCEI = ceb.get(targetIx + targetIxOffset);
|
||||
nextCEI = ceb.get(targetIx + targetIxOffset);
|
||||
maxLimit = nextCEI->lowIndex;
|
||||
if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
|
||||
found = FALSE;
|
||||
}
|
||||
} else {
|
||||
const CEI *nextCEI;
|
||||
for ( ; ; ++targetIxOffset ) {
|
||||
nextCEI = ceb.get(targetIx + targetIxOffset);
|
||||
maxLimit = nextCEI->lowIndex;
|
||||
@ -3949,7 +3949,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
// to something else.
|
||||
// This type of match should be rejected for not completely consuming a
|
||||
// combining sequence.
|
||||
if (isBreakBoundary(strsrch, mStart)) {
|
||||
if (!isBreakBoundary(strsrch, mStart)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
@ -3967,10 +3967,19 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
// This advances the index over any combining characters.
|
||||
mLimit = maxLimit;
|
||||
if (minLimit < maxLimit) {
|
||||
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
||||
|
||||
if (nba >= lastCEI->highIndex) {
|
||||
mLimit = nba;
|
||||
// When the last CE's low index is the same as its high index, the CE is likely
|
||||
// part of an expansion. In this case, the index is located just after the
|
||||
// character corresponding to the CEs compared above. If the index is right
|
||||
// at a break boundary, moving the position to the next boundary will result
|
||||
// in an incorrect match length when ignorable characters exist between
|
||||
// the position and the next character that produces CE(s). See ticket#8482.
|
||||
if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
|
||||
mLimit = minLimit;
|
||||
} else {
|
||||
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
||||
if (nba >= lastCEI->highIndex) {
|
||||
mLimit = nba;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -3986,7 +3995,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
if (isBreakBoundary(strsrch, mLimit)) {
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
@ -4165,7 +4174,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
// to something else.
|
||||
// This type of match should be rejected for not completely consuming a
|
||||
// combining sequence.
|
||||
if (isBreakBoundary(strsrch, mStart)) {
|
||||
if (!isBreakBoundary(strsrch, mStart)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
@ -4213,7 +4222,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
}
|
||||
|
||||
// Make sure the end of the match is on a break boundary
|
||||
if (isBreakBoundary(strsrch, mLimit)) {
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
|
@ -2914,6 +2914,78 @@ static void TestPCEBuffer_2surr() {
|
||||
TestPCEBuffer_with(search,searchLen,source,sourceLen);
|
||||
}
|
||||
|
||||
static void TestMatchFollowedByIgnorables(void) {
|
||||
/* test case for ticket#8482 */
|
||||
UChar search[] = { 0x00c9 };
|
||||
UChar source[] = { 0x00c9, 0x0000, 0x0041 };
|
||||
int32_t searchLen;
|
||||
int32_t sourceLen;
|
||||
UErrorCode icuStatus = U_ZERO_ERROR;
|
||||
UCollator *coll;
|
||||
const char *locale;
|
||||
UBreakIterator *ubrk;
|
||||
UStringSearch *usearch;
|
||||
int32_t match = 0;
|
||||
int32_t matchLength = 0;
|
||||
const int32_t expectedMatchLength = 1;
|
||||
|
||||
searchLen = sizeof(search)/sizeof(UChar);
|
||||
sourceLen = sizeof(source)/sizeof(UChar);
|
||||
|
||||
coll = ucol_openFromShortString("LHR_AN_CX_EX_FX_HX_NX_S3",
|
||||
FALSE,
|
||||
NULL,
|
||||
&icuStatus);
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("ucol_openFromShortString error\n");
|
||||
}
|
||||
|
||||
locale = ucol_getLocaleByType(coll,
|
||||
ULOC_VALID_LOCALE,
|
||||
&icuStatus);
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("ucol_getLocaleByType error\n");
|
||||
}
|
||||
|
||||
ubrk = ubrk_open(UBRK_CHARACTER,
|
||||
locale,
|
||||
source,
|
||||
sourceLen,
|
||||
&icuStatus);
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("ubrk_open error\n");
|
||||
}
|
||||
|
||||
usearch = usearch_openFromCollator(search,
|
||||
searchLen,
|
||||
source,
|
||||
sourceLen,
|
||||
coll,
|
||||
ubrk,
|
||||
&icuStatus);
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("usearch_openFromCollator error\n");
|
||||
}
|
||||
|
||||
match = usearch_first(usearch,
|
||||
&icuStatus);
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("usearch_first error\n");
|
||||
}
|
||||
|
||||
log_verbose("match=%d\n", match);
|
||||
|
||||
matchLength = usearch_getMatchedLength(usearch);
|
||||
|
||||
if (U_FAILURE(icuStatus)) {
|
||||
log_err("usearch_getMatchedLength error\n");
|
||||
}
|
||||
|
||||
if (matchLength != expectedMatchLength) {
|
||||
log_err("Error: matchLength=%d, expected=%d\n", matchLength, expectedMatchLength);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* addSearchTest
|
||||
*/
|
||||
@ -2975,6 +3047,7 @@ void addSearchTest(TestNode** root)
|
||||
addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCollator");
|
||||
addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
|
||||
addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
|
||||
addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
@ -2010,10 +2010,20 @@ static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
|
||||
// that's after the last CE in the match, use that index
|
||||
// as the end of the match.
|
||||
if (minLimit < maxLimit) {
|
||||
int32_t nba = ubrk_following(charBreakIterator, minLimit);
|
||||
// When the last CE's low index is the same as its high index, the CE is likely
|
||||
// part of an expansion. In this case, the index is located just after the
|
||||
// character corresponding to the CEs compared above. If the index is right
|
||||
// at a break boundary, moving the position to the next boundary will result
|
||||
// in an incorrect match length when ignorable characters exist between
|
||||
// the position and the next character that produces CE(s). See ticket#8482.
|
||||
if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
|
||||
mend = minLimit;
|
||||
} else {
|
||||
int32_t nba = ubrk_following(charBreakIterator, minLimit);
|
||||
|
||||
if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
|
||||
mend = nba;
|
||||
if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
|
||||
mend = nba;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user