ICU-11750 For Indic search: Allow match end at normalization boundary in middle of grapheme cluster
X-SVN-Rev: 37949
This commit is contained in:
parent
9bb043757a
commit
ae7f45d5c1
@ -4002,6 +4002,25 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
// Allow matches to end in the middle of a grapheme cluster if the following
|
||||
// conditions are met; this is needed to make prefix search work properly in
|
||||
// Indic, see #11750
|
||||
// * the default breakIter is being used
|
||||
// * the next collation element belonging to this combining sequence
|
||||
// - has non-zero primary weight
|
||||
// - corresponds to a separate character following the one at end of the current match
|
||||
// * the match end is a normalization boundary
|
||||
UChar32 nextChar = 0;
|
||||
U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
|
||||
UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
|
||||
nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
|
||||
maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
|
||||
strsrch->nfd->hasBoundaryBefore(nextChar));
|
||||
// If those conditions are met, then:
|
||||
// * do NOT advance the match position to a break boundary
|
||||
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
||||
// * do NOT require that match end position be on a breakIter boundary
|
||||
|
||||
// Advance the match end position to the first acceptable match boundary.
|
||||
// This advances the index over any combining characters.
|
||||
mLimit = maxLimit;
|
||||
@ -4016,7 +4035,9 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
mLimit = minLimit;
|
||||
} else {
|
||||
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
||||
if (nba >= lastCEI->highIndex) {
|
||||
// Note that we can have nba < maxLimit, in which case we want
|
||||
// to set mLimit to nba regardless of allowMidclusterMatch
|
||||
if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
|
||||
mLimit = nba;
|
||||
}
|
||||
}
|
||||
@ -4028,14 +4049,16 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
}
|
||||
#endif
|
||||
|
||||
// If advancing to the end of a combining sequence in character indexing space
|
||||
// advanced us beyond the end of the match in CE space, reject this match.
|
||||
if (mLimit > maxLimit) {
|
||||
found = FALSE;
|
||||
}
|
||||
if (!allowMidclusterMatch) {
|
||||
// If advancing to the end of a combining sequence in character indexing space
|
||||
// advanced us beyond the end of the match in CE space, reject this match.
|
||||
if (mLimit > maxLimit) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
if (! checkIdentical(strsrch, mStart, mLimit)) {
|
||||
@ -4252,25 +4275,47 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
|
||||
mLimit = maxLimit = nextCEI->lowIndex;
|
||||
|
||||
// Allow matches to end in the middle of a grapheme cluster if the following
|
||||
// conditions are met; this is needed to make prefix search work properly in
|
||||
// Indic, see #11750
|
||||
// * the default breakIter is being used
|
||||
// * the next collation element belonging to this combining sequence
|
||||
// - has non-zero primary weight
|
||||
// - corresponds to a separate character following the one at end of the current match
|
||||
// * the match end is a normalization boundary
|
||||
UChar32 nextChar = 0;
|
||||
U16_GET(strsrch->search->text, 0, maxLimit, strsrch->search->textLength, nextChar);
|
||||
UBool allowMidclusterMatch = (strsrch->search->breakIter == NULL &&
|
||||
nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
|
||||
maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
|
||||
strsrch->nfd->hasBoundaryBefore(nextChar));
|
||||
// If those conditions are met, then:
|
||||
// * do NOT advance the match position to a break boundary
|
||||
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
||||
// * do NOT require that match end position be on a breakIter boundary
|
||||
|
||||
// Advance the match end position to the first acceptable match boundary.
|
||||
// This advances the index over any combining charcters.
|
||||
// This advances the index over any combining characters.
|
||||
if (minLimit < maxLimit) {
|
||||
int32_t nba = nextBoundaryAfter(strsrch, minLimit);
|
||||
|
||||
if (nba >= lastCEI->highIndex) {
|
||||
// Note that we can have nba < maxLimit, in which case we want
|
||||
// to set mLimit to nba regardless of allowMidclusterMatch
|
||||
if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
|
||||
mLimit = nba;
|
||||
}
|
||||
}
|
||||
|
||||
// If advancing to the end of a combining sequence in character indexing space
|
||||
// advanced us beyond the end of the match in CE space, reject this match.
|
||||
if (mLimit > maxLimit) {
|
||||
found = FALSE;
|
||||
}
|
||||
if (!allowMidclusterMatch) {
|
||||
// If advancing to the end of a combining sequence in character indexing space
|
||||
// advanced us beyond the end of the match in CE space, reject this match.
|
||||
if (mLimit > maxLimit) {
|
||||
found = FALSE;
|
||||
}
|
||||
|
||||
// Make sure the end of the match is on a break boundary
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
// Make sure the end of the match is on a break boundary
|
||||
if (!isBreakBoundary(strsrch, mLimit)) {
|
||||
found = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/********************************************************************
|
||||
* Copyright (c) 2001-2011 International Business Machines
|
||||
* Copyright (c) 2001-2011,2015 International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************
|
||||
* File USRCHDAT.H
|
||||
@ -754,6 +754,16 @@ static const SearchData DIACRITICMATCH[] = {
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
/*
 * Search test cases consumed by TestIndicPrefixMatch(): each entry is passed to
 * assertEqual(), and the table is terminated by an entry whose first field
 * (text) is NULL.
 *
 * The strings use escaped Devanagari code points (U+0915 etc.) separated by
 * U+0020, and every case is run at UCOL_PRIMARY strength.
 *
 * NOTE(review): the field layout is declared elsewhere (SearchData). From the
 * initializers, the first string looks like the text to search and the second
 * the pattern, followed by what appear to be the expected match offsets
 * (terminated by -1) and the corresponding match lengths — confirm against
 * the SearchData declaration.
 */
static const SearchData INDICPREFIXMATCH[] = { // <rdar://problem/18063262>
|
||||
{"\\u0915\\u0020\\u0915\\u0901\\u0020\\u0915\\u0902\\u0020\\u0915\\u0903\\u0020\\u0915\\u0940\\u0020\\u0915\\u093F\\u0020\\u0915\\u0943\\u0020\\u0915\\u093C\\u0020\\u0958",
|
||||
"\\u0915", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 2, 5, 8, 11, 14, 17, 20, 23,-1}, {1, 2, 2, 2, 1, 1, 1, 2, 1}},
|
||||
{"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947",
|
||||
"\\u0915\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {0, 3, 7, 11, -1}, {2, 2, 2, 2}},
|
||||
{"\\u0915\\u0924\\u0020\\u0915\\u0924\\u0940\\u0020\\u0915\\u0924\\u093F\\u0020\\u0915\\u0924\\u0947\\u0020\\u0915\\u0943\\u0924\\u0020\\u0915\\u0943\\u0924\\u0947",
|
||||
"\\u0915\\u0943\\u0924", NULL, UCOL_PRIMARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {15, 19, -1}, {3, 3}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, USEARCH_STANDARD_ELEMENT_COMPARISON, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
||||
#endif
|
||||
|
@ -1,5 +1,5 @@
|
||||
/********************************************************************
|
||||
* Copyright (c) 2001-2011 International Business Machines
|
||||
* Copyright (c) 2001-2011,2015 International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************
|
||||
* File usrchtst.c
|
||||
@ -2987,6 +2987,24 @@ static void TestMatchFollowedByIgnorables(void) {
|
||||
ucol_close(coll);
|
||||
}
|
||||
|
||||
/**
 * Runs every entry of the INDICPREFIXMATCH table through assertEqual().
 *
 * The table is walked until the sentinel entry whose text field is NULL.
 * A failing entry is reported through log_err together with its index, and
 * the loop then continues with the remaining entries instead of stopping at
 * the first failure.
 *
 * open()/close() bracket the run. If open() reports a failure in status, the
 * test logs it and returns without running any case and without calling
 * close(). Judging by the failure message, open() sets up the shared
 * ("static") collators — confirm against its definition elsewhere in the file.
 */
static void TestIndicPrefixMatch(void) // <rdar://problem/18063262>
|
||||
{
|
||||
int count = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
open(&status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "Unable to open static collators %s\n", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
while (INDICPREFIXMATCH[count].text != NULL) {
|
||||
if (!assertEqual(INDICPREFIXMATCH[count])) {
|
||||
log_err("Error at test number %d\n", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
close();
|
||||
}
|
||||
|
||||
/**
|
||||
* addSearchTest
|
||||
*/
|
||||
@ -3049,6 +3067,7 @@ void addSearchTest(TestNode** root)
|
||||
addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
|
||||
addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
|
||||
addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
|
||||
addTest(root, &TestIndicPrefixMatch, "tscoll/usrchtst/TestIndicPrefixMatch");
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
Loading…
Reference in New Issue
Block a user