ICU-11750 For Indic search: Allow match end at normalization boundary in middle of grapheme cluster

X-SVN-Rev: 37949
This commit is contained in:
Peter Edberg 2015-09-13 07:43:51 +00:00
parent 9bb043757a
commit ae7f45d5c1
3 changed files with 95 additions and 21 deletions

View File

@ -4002,6 +4002,25 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
found = FALSE;
}
// Allow matches to end in the middle of a grapheme cluster if the following
// conditions are met; this is needed to make prefix search work properly in
// Indic, see #11750
// * the default breakIter is being used
// * the next collation element belonging to this combining sequence
// - has non-zero primary weight
// - corresponds to a separate character following the one at end of the current match
// * the match end is a normalization boundary
UChar32 nextChar = 0;
U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
strsrch->nfd->hasBoundaryBefore(nextChar));
// If those conditions are met, then:
// * do NOT advance the match position to a break boundary
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
// * do NOT require that match end position be on a breakIter boundary
// Advance the match end position to the first acceptable match boundary.
// This advances the index over any combining characters.
mLimit = maxLimit;
@ -4016,7 +4035,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
mLimit = minLimit;
} else {
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
if (nba >= lastCEI->highIndex) {
// Note that we can have nba < maxLimit, in which case we want
// to set mLimit to nba regardless of allowMidclusterMatch
if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
mLimit = nba;
}
}
@ -4028,14 +4049,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
}
#endif
// If advancing to the end of a combining sequence in character indexing space
// advanced us beyond the end of the match in CE space, reject this match.
if (mLimit > maxLimit) {
found = FALSE;
}
if (!allowMidclusterMatch) {
// If advancing to the end of a combining sequence in character indexing space
// advanced us beyond the end of the match in CE space, reject this match.
if (mLimit > maxLimit) {
found = FALSE;
}
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
}
}
if (! checkIdentical(strsrch, mStart, mLimit)) {
@ -4252,25 +4275,47 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
mLimit = maxLimit = nextCEI->lowIndex;
// Allow matches to end in the middle of a grapheme cluster if the following
// conditions are met; this is needed to make prefix search work properly in
// Indic, see #11750
// * the default breakIter is being used
// * the next collation element belonging to this combining sequence
// - has non-zero primary weight
// - corresponds to a separate character following the one at end of the current match
// * the match end is a normalization boundary
UChar32 nextChar = 0;
U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
strsrch->nfd->hasBoundaryBefore(nextChar));
// If those conditions are met, then:
// * do NOT advance the match position to a break boundary
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
// * do NOT require that match end position be on a breakIter boundary
// Advance the match end position to the first acceptable match boundary.
// This advances the index over any combining charcters.
// This advances the index over any combining characters.
if (minLimit < maxLimit) {
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
if (nba >= lastCEI->highIndex) {
// Note that we can have nba < maxLimit, in which case we want
// to set mLimit to nba regardless of allowMidclusterMatch
if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
mLimit = nba;
}
}
// If advancing to the end of a combining sequence in character indexing space
// advanced us beyond the end of the match in CE space, reject this match.
if (mLimit > maxLimit) {
found = FALSE;
}
if (!allowMidclusterMatch) {
// If advancing to the end of a combining sequence in character indexing space
// advanced us beyond the end of the match in CE space, reject this match.
if (mLimit > maxLimit) {
found = FALSE;
}
// Make sure the end of the match is on a break boundary
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
// Make sure the end of the match is on a break boundary
if (!isBreakBoundary(strsrch, mLimit)) {
found = FALSE;
}
}
} else {

View File

@ -1,5 +1,5 @@
/********************************************************************
* Copyright (c) 2001-2011 International Business Machines
* Copyright (c) 2001-2011,2015 International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
* File USRCHDAT.H
@ -754,6 +754,16 @@ static const SearchData DIACRITICMATCH[] = {
{NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}}
};
/*
 * Search cases for prefix matching in Devanagari text, where a match may have
 * to end before a following dependent sign that belongs to the same cluster.
 * All cases use primary strength and standard element comparison.
 * NOTE(review): the two trailing brace lists are presumably the expected match
 * offsets (terminated by -1) and the expected match lengths, in UTF-16 code
 * units -- confirm against the SearchData declaration.
 */
static const SearchData INDICPREFIXMATCH[] = { // <rdar://problem/18063262>
// Pattern: U+0915 alone. Text: U+0915 by itself, then U+0915 followed in turn by
// U+0901, U+0902, U+0903, U+0940, U+093F, U+0943 and U+093C, then U+0958; the
// groups are separated by U+0020. The listed lengths are 2 where the following
// character is U+0901/U+0902/U+0903/U+093C and 1 where it is
// U+0940/U+093F/U+0943, i.e. in the latter groups the match stops right after U+0915.
{"\\u0915\\u0020\\u0915\\u0901\\u0020\\u0915\\u0902\\u0020\\u0915\\u0903\\u0020\\u0915\\u0940\\u0020\\u0915\\u093F\\u0020\\u0915\\u0943\\u0020\\u0915\\u093C\\u0020\\u0958",
"\\u0915", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 2, 5, 8, 11, 14, 17, 20, 23,-1}, {1, 2, 2, 2, 1, 1, 1, 2, 1}},
// Pattern: U+0915 U+0924. It is listed as found at the start of the first four
// words (offsets 0, 3, 7, 11), three of which continue with a further sign
// (U+0940, U+093F, U+0947). No offsets are listed for the last two words, where
// U+0943 sits between U+0915 and U+0924.
{"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947",
"\\u0915\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 3, 7, 11, -1}, {2, 2, 2, 2}},
// Same text, pattern U+0915 U+0943 U+0924: only the last two words are listed
// (offsets 15 and 19), each with length 3, the second one being followed by U+0947.
{"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947",
"\\u0915\\u0943\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {15, 19, -1}, {3, 3}},
// Terminator entry: the NULL text field is what the test loop checks to stop iterating.
{NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}}
};
#endif /* #if !UCONFIG_NO_COLLATION */
#endif

View File

@ -1,5 +1,5 @@
/********************************************************************
* Copyright (c) 2001-2011 International Business Machines
* Copyright (c) 2001-2011,2015 International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
* File usrchtst.c
@ -2987,6 +2987,24 @@ static void TestMatchFollowedByIgnorables(void) {
ucol_close(coll);
}
static void TestIndicPrefixMatch(void) // <rdar://problem/18063262>
{
int count = 0;
UErrorCode status = U_ZERO_ERROR;
open(&status);
if (U_FAILURE(status)) {
log_err_status(status, "Unable to open static collators %s\n", u_errorName(status));
return;
}
while (INDICPREFIXMATCH[count].text != NULL) {
if (!assertEqual(INDICPREFIXMATCH[count])) {
log_err("Error at test number %d\n", count);
}
count ++;
}
close();
}
/**
* addSearchTest
*/
@ -3049,6 +3067,7 @@ void addSearchTest(TestNode** root)
addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
addTest(root, &TestIndicPrefixMatch, "tscoll/usrchtst/TestIndicPrefixMatch");
}
#endif /* #if !UCONFIG_NO_COLLATION */