ICU-1745
strictly restrict matches to fit the breakiterator range exactly. X-SVN-Rev: 8125
This commit is contained in:
parent
c003ac3669
commit
ce3295e827
@ -31,7 +31,8 @@ StringSearch::StringSearch(const UnicodeString &pattern,
|
||||
|
||||
m_strsrch_ = usearch_open(m_pattern_.getBuffer(), m_pattern_.length(),
|
||||
m_text_.getBuffer(), m_text_.length(),
|
||||
locale.getName(), NULL, &status);
|
||||
locale.getName(), (UBreakIterator *)breakiter,
|
||||
&status);
|
||||
uprv_free(m_search_);
|
||||
m_search_ = NULL;
|
||||
|
||||
@ -65,9 +66,11 @@ StringSearch::StringSearch(const UnicodeString &pattern,
|
||||
return;
|
||||
}
|
||||
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
|
||||
m_pattern_.length(), m_text_.getBuffer(),
|
||||
m_pattern_.length(),
|
||||
m_text_.getBuffer(),
|
||||
m_text_.length(), coll->ucollator,
|
||||
NULL, &status);
|
||||
(UBreakIterator *)breakiter,
|
||||
&status);
|
||||
uprv_free(m_search_);
|
||||
m_search_ = NULL;
|
||||
|
||||
@ -97,7 +100,8 @@ StringSearch::StringSearch(const UnicodeString &pattern,
|
||||
}
|
||||
m_strsrch_ = usearch_open(m_pattern_.getBuffer(), m_pattern_.length(),
|
||||
m_text_.getBuffer(), m_text_.length(),
|
||||
locale.getName(), NULL, &status);
|
||||
locale.getName(), (UBreakIterator *)breakiter,
|
||||
&status);
|
||||
uprv_free(m_search_);
|
||||
m_search_ = NULL;
|
||||
|
||||
@ -131,9 +135,11 @@ StringSearch::StringSearch(const UnicodeString &pattern,
|
||||
return;
|
||||
}
|
||||
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
|
||||
m_pattern_.length(), m_text_.getBuffer(),
|
||||
m_pattern_.length(),
|
||||
m_text_.getBuffer(),
|
||||
m_text_.length(), coll->ucollator,
|
||||
NULL, &status);
|
||||
(UBreakIterator *)breakiter,
|
||||
&status);
|
||||
uprv_free(m_search_);
|
||||
m_search_ = NULL;
|
||||
|
||||
@ -161,9 +167,11 @@ StringSearch::StringSearch(const StringSearch &that) :
|
||||
else {
|
||||
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
|
||||
m_pattern_.length(),
|
||||
m_text_.getBuffer(), m_text_.length(),
|
||||
m_text_.getBuffer(),
|
||||
m_text_.length(),
|
||||
that.m_strsrch_->collator,
|
||||
NULL, &status);
|
||||
(UBreakIterator *)that.m_breakiterator_,
|
||||
&status);
|
||||
}
|
||||
uprv_free(m_search_);
|
||||
m_search_ = NULL;
|
||||
|
@ -78,6 +78,9 @@ U_NAMESPACE_BEGIN
|
||||
* </ul>
|
||||
* <p>
|
||||
* A breakiterator can be used if only matches at logical breaks are desired.
|
||||
* Using a breakiterator will only give you results that exactly match the
|
||||
* boundaries given by the breakiterator. For instance the pattern "e" will
|
||||
* not be found in the string "\u00e9" if a character break iterator is used.
|
||||
* <p>
|
||||
* Options are provided to handle overlapping matches.
|
||||
* E.g. In English, overlapping matches produces the result 0 and 2
|
||||
|
@ -75,6 +75,9 @@
|
||||
* </ul>
|
||||
* <p>
|
||||
* A breakiterator can be used if only matches at logical breaks are desired.
|
||||
* Using a breakiterator will only give you results that exactly match the
|
||||
* boundaries given by the breakiterator. For instance the pattern "e" will
|
||||
* not be found in the string "\u00e9" if a character break iterator is used.
|
||||
* <p>
|
||||
* Options are provided to handle overlapping matches.
|
||||
* E.g. In English, overlapping matches produces the result 0 and 2
|
||||
|
@ -413,10 +413,30 @@ inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
|
||||
// otherwise, we can use following() on the position before the
|
||||
// specified one and return true if the position we get back is the
|
||||
// one the user specified
|
||||
return (start == startindex ||
|
||||
UBool result = (start == startindex ||
|
||||
ubrk_following(breakiterator, start - 1) == start) &&
|
||||
(end == endindex ||
|
||||
ubrk_following(breakiterator, end - 1) == end);
|
||||
if (result) {
|
||||
// iterates the individual ces
|
||||
UCollationElements *coleiter = strsrch->utilIter;
|
||||
const UChar *text = strsrch->search->text +
|
||||
start;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ucol_setText(coleiter, text, end - start, &status);
|
||||
for (int32_t count = 0; count < strsrch->pattern.CELength;
|
||||
count ++) {
|
||||
uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
|
||||
if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
if (ucol_next(coleiter, &status) != UCOL_NULLORDER) {
|
||||
// extra collation elements at the end of the match
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
@ -905,7 +925,8 @@ inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
|
||||
* Checks match for contraction.
|
||||
* If the match ends with a partial contraction we fail.
|
||||
* If the match starts too far off (because of backwards iteration) we try to
|
||||
* chip off the extra characters.
|
||||
* chip off the extra characters depending on whether a breakiterator has
|
||||
* been used.
|
||||
* Internal method, error assumed to be success, caller has to check status
|
||||
* before calling this method.
|
||||
* @param strsrch string search data
|
||||
@ -982,7 +1003,7 @@ UBool checkNextExactContractionMatch(UStringSearch *strsrch,
|
||||
* <li> the potential match does not repeat the previous match
|
||||
* <li> boundaries are correct
|
||||
* <li> exact matches have no extra accents
|
||||
* <li> identical matches
|
||||
* <li> identical matches
|
||||
* <li> potential match does not end in the middle of a contraction
|
||||
* </ul>
|
||||
* Otherwise the offset will be shifted to the next character.
|
||||
|
@ -50,7 +50,7 @@ static const SearchData BASIC[] = {
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
static const SearchData BREAKITERATOR[] = {
|
||||
static const SearchData BREAKITERATOREXACT[] = {
|
||||
{"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1},
|
||||
{3, 3}},
|
||||
{"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}},
|
||||
@ -60,6 +60,11 @@ static const SearchData BREAKITERATOR[] = {
|
||||
{10, -1}, {3}},
|
||||
{"Channel, another channel, more channels, and one last Channel",
|
||||
"Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
|
||||
// jitterbug 1745
|
||||
{"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
|
||||
"characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
|
||||
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
|
||||
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
@ -284,6 +289,11 @@ static const SearchData BREAKITERATORCANONICAL[] = {
|
||||
{10, -1}, {3}},
|
||||
{"Channel, another channel, more channels, and one last Channel",
|
||||
"Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
|
||||
// jitterbug 1745
|
||||
{"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
|
||||
"characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
|
||||
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
|
||||
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -547,8 +547,8 @@ static void TestBreakIterator() {
|
||||
if (usearch_getBreakIterator(NULL) != NULL) {
|
||||
log_err("Expected NULL breakiterator from NULL string search\n");
|
||||
}
|
||||
u_unescape(BREAKITERATOR[0].text, text, 128);
|
||||
u_unescape(BREAKITERATOR[0].pattern, pattern, 32);
|
||||
u_unescape(BREAKITERATOREXACT[0].text, text, 128);
|
||||
u_unescape(BREAKITERATOREXACT[0].pattern, pattern, 32);
|
||||
strsrch = usearch_openFromCollator(pattern, -1, text, -1, EN_US_, NULL,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
@ -580,7 +580,8 @@ static void TestBreakIterator() {
|
||||
|
||||
count = 0;
|
||||
while (count < 4) {
|
||||
const SearchData *search = &(BREAKITERATOR[count]);
|
||||
// tests 0-3 are fixed
|
||||
const SearchData *search = &(BREAKITERATOREXACT[count]);
|
||||
UCollator *collator = getCollator(search->collator);
|
||||
UBreakIterator *breaker = getBreakIterator(search->breaker);
|
||||
|
||||
@ -602,7 +603,7 @@ static void TestBreakIterator() {
|
||||
usearch_close(strsrch);
|
||||
goto ENDTESTBREAKITERATOR;
|
||||
}
|
||||
search = &(BREAKITERATOR[count + 1]);
|
||||
search = &(BREAKITERATOREXACT[count + 1]);
|
||||
breaker = getBreakIterator(search->breaker);
|
||||
usearch_setBreakIterator(strsrch, breaker, &status);
|
||||
if (U_FAILURE(status) ||
|
||||
@ -620,8 +621,8 @@ static void TestBreakIterator() {
|
||||
count += 2;
|
||||
}
|
||||
count = 0;
|
||||
while (BREAKITERATOR[count].text != NULL) {
|
||||
if (!assertEqual(BREAKITERATOR[count])) {
|
||||
while (BREAKITERATOREXACT[count].text != NULL) {
|
||||
if (!assertEqual(BREAKITERATOREXACT[count])) {
|
||||
log_err("Error at test number %d\n", count);
|
||||
goto ENDTESTBREAKITERATOR;
|
||||
}
|
||||
@ -1500,6 +1501,7 @@ static void TestBreakIteratorCanonical() {
|
||||
|
||||
open();
|
||||
while (count < 4) {
|
||||
// tests 0-3 are fixed
|
||||
UChar pattern[32];
|
||||
UChar text[128];
|
||||
const SearchData *search = &(BREAKITERATORCANONICAL[count]);
|
||||
@ -1527,7 +1529,7 @@ static void TestBreakIteratorCanonical() {
|
||||
usearch_close(strsrch);
|
||||
goto ENDTESTBREAKITERATOR;
|
||||
}
|
||||
search = &(BREAKITERATOR[count + 1]);
|
||||
search = &(BREAKITERATOREXACT[count + 1]);
|
||||
breaker = getBreakIterator(search->breaker);
|
||||
usearch_setBreakIterator(strsrch, breaker, &status);
|
||||
if (U_FAILURE(status) ||
|
||||
|
@ -663,10 +663,10 @@ void StringSearchTest::TestStrength()
|
||||
void StringSearchTest::TestBreakIterator()
|
||||
{
|
||||
UChar temp[128];
|
||||
u_unescape(BREAKITERATOR[0].text, temp, 128);
|
||||
u_unescape(BREAKITERATOREXACT[0].text, temp, 128);
|
||||
UnicodeString text;
|
||||
text.setTo(temp, u_strlen(temp));
|
||||
u_unescape(BREAKITERATOR[0].pattern, temp, 128);
|
||||
u_unescape(BREAKITERATOREXACT[0].pattern, temp, 128);
|
||||
UnicodeString pattern;
|
||||
pattern.setTo(temp, u_strlen(temp));
|
||||
|
||||
@ -698,7 +698,8 @@ void StringSearchTest::TestBreakIterator()
|
||||
|
||||
int count = 0;
|
||||
while (count < 4) {
|
||||
const SearchData *search = &(BREAKITERATOR[count]);
|
||||
// special purposes for tests numbers 0-3
|
||||
const SearchData *search = &(BREAKITERATOREXACT[count]);
|
||||
RuleBasedCollator *collator = getCollator(search->collator);
|
||||
BreakIterator *breaker = getBreakIterator(search->breaker);
|
||||
StringSearch *strsrch;
|
||||
@ -724,7 +725,7 @@ void StringSearchTest::TestBreakIterator()
|
||||
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
|
||||
delete strsrch;
|
||||
}
|
||||
search = &(BREAKITERATOR[count + 1]);
|
||||
search = &(BREAKITERATOREXACT[count + 1]);
|
||||
breaker = getBreakIterator(search->breaker);
|
||||
if (breaker != NULL) {
|
||||
breaker->setText(text);
|
||||
@ -743,8 +744,8 @@ void StringSearchTest::TestBreakIterator()
|
||||
count += 2;
|
||||
}
|
||||
count = 0;
|
||||
while (BREAKITERATOR[count].text != NULL) {
|
||||
if (!assertEqual(&BREAKITERATOR[count])) {
|
||||
while (BREAKITERATOREXACT[count].text != NULL) {
|
||||
if (!assertEqual(&BREAKITERATOREXACT[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
@ -1507,6 +1508,7 @@ void StringSearchTest::TestBreakIteratorCanonical()
|
||||
int count = 0;
|
||||
|
||||
while (count < 4) {
|
||||
// special purposes for tests numbers 0-3
|
||||
UChar temp[128];
|
||||
const SearchData *search = &(BREAKITERATORCANONICAL[count]);
|
||||
|
||||
@ -1538,7 +1540,7 @@ void StringSearchTest::TestBreakIteratorCanonical()
|
||||
delete strsrch;
|
||||
return;
|
||||
}
|
||||
search = &(BREAKITERATOR[count + 1]);
|
||||
search = &(BREAKITERATOREXACT[count + 1]);
|
||||
breaker = getBreakIterator(search->breaker);
|
||||
breaker->setText(strsrch->getText());
|
||||
strsrch->setBreakIterator(breaker, status);
|
||||
|
Loading…
Reference in New Issue
Block a user