strictly restrict matches to fit the breakiterator range exactly.

X-SVN-Rev: 8125
This commit is contained in:
Syn Wee Quek 2002-03-19 21:50:15 +00:00
parent c003ac3669
commit ce3295e827
7 changed files with 75 additions and 26 deletions

View File

@ -31,7 +31,8 @@ StringSearch::StringSearch(const UnicodeString &pattern,
m_strsrch_ = usearch_open(m_pattern_.getBuffer(), m_pattern_.length(),
m_text_.getBuffer(), m_text_.length(),
locale.getName(), NULL, &status);
locale.getName(), (UBreakIterator *)breakiter,
&status);
uprv_free(m_search_);
m_search_ = NULL;
@ -65,9 +66,11 @@ StringSearch::StringSearch(const UnicodeString &pattern,
return;
}
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
m_pattern_.length(), m_text_.getBuffer(),
m_pattern_.length(),
m_text_.getBuffer(),
m_text_.length(), coll->ucollator,
NULL, &status);
(UBreakIterator *)breakiter,
&status);
uprv_free(m_search_);
m_search_ = NULL;
@ -97,7 +100,8 @@ StringSearch::StringSearch(const UnicodeString &pattern,
}
m_strsrch_ = usearch_open(m_pattern_.getBuffer(), m_pattern_.length(),
m_text_.getBuffer(), m_text_.length(),
locale.getName(), NULL, &status);
locale.getName(), (UBreakIterator *)breakiter,
&status);
uprv_free(m_search_);
m_search_ = NULL;
@ -131,9 +135,11 @@ StringSearch::StringSearch(const UnicodeString &pattern,
return;
}
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
m_pattern_.length(), m_text_.getBuffer(),
m_pattern_.length(),
m_text_.getBuffer(),
m_text_.length(), coll->ucollator,
NULL, &status);
(UBreakIterator *)breakiter,
&status);
uprv_free(m_search_);
m_search_ = NULL;
@ -161,9 +167,11 @@ StringSearch::StringSearch(const StringSearch &that) :
else {
m_strsrch_ = usearch_openFromCollator(m_pattern_.getBuffer(),
m_pattern_.length(),
m_text_.getBuffer(), m_text_.length(),
m_text_.getBuffer(),
m_text_.length(),
that.m_strsrch_->collator,
NULL, &status);
(UBreakIterator *)that.m_breakiterator_,
&status);
}
uprv_free(m_search_);
m_search_ = NULL;

View File

@ -78,6 +78,9 @@ U_NAMESPACE_BEGIN
* </ul>
* <p>
* A breakiterator can be used if only matches at logical breaks are desired.
* Using a breakiterator will only give you results that exactly match the
* boundaries given by the breakiterator. For instance, the pattern "e" will
* not be found in the string "\u00e9" if a character break iterator is used.
* <p>
* Options are provided to handle overlapping matches.
* E.g. In English, overlapping matches produce the result 0 and 2

View File

@ -75,6 +75,9 @@
* </ul>
* <p>
* A breakiterator can be used if only matches at logical breaks are desired.
* Using a breakiterator will only give you results that exactly match the
* boundaries given by the breakiterator. For instance, the pattern "e" will
* not be found in the string "\u00e9" if a character break iterator is used.
* <p>
* Options are provided to handle overlapping matches.
* E.g. In English, overlapping matches produce the result 0 and 2

View File

@ -413,10 +413,30 @@ inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
// otherwise, we can use following() on the position before the
// specified one and return true if the position we get back is the
// one the user specified
return (start == startindex ||
UBool result = (start == startindex ||
ubrk_following(breakiterator, start - 1) == start) &&
(end == endindex ||
ubrk_following(breakiterator, end - 1) == end);
if (result) {
// iterates the individual ces
UCollationElements *coleiter = strsrch->utilIter;
const UChar *text = strsrch->search->text +
start;
UErrorCode status = U_ZERO_ERROR;
ucol_setText(coleiter, text, end - start, &status);
for (int32_t count = 0; count < strsrch->pattern.CELength;
count ++) {
uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
return FALSE;
}
}
if (ucol_next(coleiter, &status) != UCOL_NULLORDER) {
// extra collation elements at the end of the match
return FALSE;
}
}
return result;
}
return TRUE;
}
@ -905,7 +925,8 @@ inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
* Checks match for contraction.
* If the match ends with a partial contraction we fail.
* If the match starts too far off (because of backwards iteration) we try to
* chip off the extra characters.
* chip off the extra characters depending on whether a breakiterator has
* been used.
* Internal method, error assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
@ -982,7 +1003,7 @@ UBool checkNextExactContractionMatch(UStringSearch *strsrch,
* <li> the potential match does not repeat the previous match
* <li> boundaries are correct
* <li> exact matches have no extra accents
* <li> identical matches
* <li> identical matches
* <li> potential match does not end in the middle of a contraction
* </ul>
* Otherwise the offset will be shifted to the next character.

View File

@ -50,7 +50,7 @@ static const SearchData BASIC[] = {
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
static const SearchData BREAKITERATOR[] = {
static const SearchData BREAKITERATOREXACT[] = {
{"foxy fox", "fox", NULL, UCOL_TERTIARY, "characterbreaker", {0, 5, -1},
{3, 3}},
{"foxy fox", "fox", NULL, UCOL_TERTIARY, "wordbreaker", {5, -1}, {3}},
@ -60,6 +60,11 @@ static const SearchData BREAKITERATOR[] = {
{10, -1}, {3}},
{"Channel, another channel, more channels, and one last Channel",
"Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
// jitterbug 1745
{"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
"characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -284,6 +289,11 @@ static const SearchData BREAKITERATORCANONICAL[] = {
{10, -1}, {3}},
{"Channel, another channel, more channels, and one last Channel",
"Channel", "es", UCOL_TERTIARY, "wordbreaker", {0, 54, -1}, {7, 7}},
// jitterbug 1745
{"testing that \\u00e9 does not match e", "e", NULL, UCOL_TERTIARY,
"characterbreaker", {1, 17, 30, -1}, {1, 1, 1}},
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};

View File

@ -547,8 +547,8 @@ static void TestBreakIterator() {
if (usearch_getBreakIterator(NULL) != NULL) {
log_err("Expected NULL breakiterator from NULL string search\n");
}
u_unescape(BREAKITERATOR[0].text, text, 128);
u_unescape(BREAKITERATOR[0].pattern, pattern, 32);
u_unescape(BREAKITERATOREXACT[0].text, text, 128);
u_unescape(BREAKITERATOREXACT[0].pattern, pattern, 32);
strsrch = usearch_openFromCollator(pattern, -1, text, -1, EN_US_, NULL,
&status);
if (U_FAILURE(status)) {
@ -580,7 +580,8 @@ static void TestBreakIterator() {
count = 0;
while (count < 4) {
const SearchData *search = &(BREAKITERATOR[count]);
// tests 0-3 are fixed
const SearchData *search = &(BREAKITERATOREXACT[count]);
UCollator *collator = getCollator(search->collator);
UBreakIterator *breaker = getBreakIterator(search->breaker);
@ -602,7 +603,7 @@ static void TestBreakIterator() {
usearch_close(strsrch);
goto ENDTESTBREAKITERATOR;
}
search = &(BREAKITERATOR[count + 1]);
search = &(BREAKITERATOREXACT[count + 1]);
breaker = getBreakIterator(search->breaker);
usearch_setBreakIterator(strsrch, breaker, &status);
if (U_FAILURE(status) ||
@ -620,8 +621,8 @@ static void TestBreakIterator() {
count += 2;
}
count = 0;
while (BREAKITERATOR[count].text != NULL) {
if (!assertEqual(BREAKITERATOR[count])) {
while (BREAKITERATOREXACT[count].text != NULL) {
if (!assertEqual(BREAKITERATOREXACT[count])) {
log_err("Error at test number %d\n", count);
goto ENDTESTBREAKITERATOR;
}
@ -1500,6 +1501,7 @@ static void TestBreakIteratorCanonical() {
open();
while (count < 4) {
// tests 0-3 are fixed
UChar pattern[32];
UChar text[128];
const SearchData *search = &(BREAKITERATORCANONICAL[count]);
@ -1527,7 +1529,7 @@ static void TestBreakIteratorCanonical() {
usearch_close(strsrch);
goto ENDTESTBREAKITERATOR;
}
search = &(BREAKITERATOR[count + 1]);
search = &(BREAKITERATOREXACT[count + 1]);
breaker = getBreakIterator(search->breaker);
usearch_setBreakIterator(strsrch, breaker, &status);
if (U_FAILURE(status) ||

View File

@ -663,10 +663,10 @@ void StringSearchTest::TestStrength()
void StringSearchTest::TestBreakIterator()
{
UChar temp[128];
u_unescape(BREAKITERATOR[0].text, temp, 128);
u_unescape(BREAKITERATOREXACT[0].text, temp, 128);
UnicodeString text;
text.setTo(temp, u_strlen(temp));
u_unescape(BREAKITERATOR[0].pattern, temp, 128);
u_unescape(BREAKITERATOREXACT[0].pattern, temp, 128);
UnicodeString pattern;
pattern.setTo(temp, u_strlen(temp));
@ -698,7 +698,8 @@ void StringSearchTest::TestBreakIterator()
int count = 0;
while (count < 4) {
const SearchData *search = &(BREAKITERATOR[count]);
// test numbers 0-3 serve special purposes
const SearchData *search = &(BREAKITERATOREXACT[count]);
RuleBasedCollator *collator = getCollator(search->collator);
BreakIterator *breaker = getBreakIterator(search->breaker);
StringSearch *strsrch;
@ -724,7 +725,7 @@ void StringSearchTest::TestBreakIterator()
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
delete strsrch;
}
search = &(BREAKITERATOR[count + 1]);
search = &(BREAKITERATOREXACT[count + 1]);
breaker = getBreakIterator(search->breaker);
if (breaker != NULL) {
breaker->setText(text);
@ -743,8 +744,8 @@ void StringSearchTest::TestBreakIterator()
count += 2;
}
count = 0;
while (BREAKITERATOR[count].text != NULL) {
if (!assertEqual(&BREAKITERATOR[count])) {
while (BREAKITERATOREXACT[count].text != NULL) {
if (!assertEqual(&BREAKITERATOREXACT[count])) {
errln("Error at test number %d", count);
}
count ++;
@ -1507,6 +1508,7 @@ void StringSearchTest::TestBreakIteratorCanonical()
int count = 0;
while (count < 4) {
// test numbers 0-3 serve special purposes
UChar temp[128];
const SearchData *search = &(BREAKITERATORCANONICAL[count]);
@ -1538,7 +1540,7 @@ void StringSearchTest::TestBreakIteratorCanonical()
delete strsrch;
return;
}
search = &(BREAKITERATOR[count + 1]);
search = &(BREAKITERATOREXACT[count + 1]);
breaker = getBreakIterator(search->breaker);
breaker->setText(strsrch->getText());
strsrch->setBreakIterator(breaker, status);