ICU-8482 Changed the behavior of the already broken match boundary condition so that the resulting match length is not expanded when 1) the end of the CE match is part of an expansion and 2) the limit of the CE match is already on a break boundary. Also flipped the return value of isBreakBoundary, which actually behaved as is*Not*BreakBoundary before.

X-SVN-Rev: 29848
2011-04-22 05:37:46 +00:00 · 2011-04-22 05:37:46 +00:00 · 39f9a8686f
commit 39f9a8686f
parent 2861a47a86
4 changed files with 124 additions and 22 deletions
--- a/icu4c/source/i18n/bmsearch.cpp
+++ b/icu4c/source/i18n/bmsearch.cpp
@ -1,6 +1,6 @@
 /*
 ******************************************************************************
- *   Copyright (C) 1996-2010, International Business Machines                 *
+ *   Copyright (C) 1996-2011, International Business Machines                 *
 *   Corporation and others.  All Rights Reserved.                            *
 ******************************************************************************
 */
@ -775,10 +775,20 @@ UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)

            mLimit = maxLimit;
            if (minLimit < maxLimit) {
-                int32_t nbb = target->nextBreakBoundary(minLimit);
+                // When the last CE's low index is the same as its high index, the CE is
+                // likely part of an expansion. In this case, the index is located just after
+                // the character corresponding to the CEs compared above. If the index is
+                // already on a break boundary, moving the position to the next boundary would
+                // yield an incorrect match length when ignorable characters exist between the
+                // position and the next character that produces CE(s). See ticket#8482.
+                if (minLimit == lastCEI.highOffset && target->isBreakBoundary(minLimit)) {
+                    mLimit = minLimit;
+                } else {
+                    int32_t nbb = target->nextBreakBoundary(minLimit);

-                if (nbb >= lastCEI.highOffset) {
-                    mLimit = nbb;
+                    if (nbb >= lastCEI.highOffset) {
+                        mLimit = nbb;
+                    }
                }
            }

--- a/icu4c/source/i18n/usearch.cpp
+++ b/icu4c/source/i18n/usearch.cpp
@ -3659,7 +3659,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
    U_ASSERT(index<=textLen);

    if (index>=textLen || index<=0) {
-        return FALSE;
+        return TRUE;
    }

    // If the character at the current index is not a GRAPHEME_EXTEND
@ -3668,7 +3668,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
    U16_GET(text, 0, index, textLen, c);
    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
    if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
-        return FALSE;
+        return TRUE;
    }

    // We are at a combining mark.  If the preceding character is anything
@ -3676,7 +3676,7 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
    U16_PREV(text, 0, index, c);
    gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
    UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
-    return combining;
+    return !combining;
 #elif !UCONFIG_NO_BREAK_ITERATION
    UBreakIterator *breakiterator = strsrch->search->breakIter;

@ -3684,10 +3684,10 @@ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
        breakiterator = strsrch->search->internalBreakIter;
    }

-    return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
+    return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
 #else
    // **** or use the original code? ****
-    return FALSE;
+    return TRUE;
 #endif
 }

@ -3906,14 +3906,14 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
        //    1. The match extended to the last CE from the target text, which is OK, or
        //    2. The last CE that was part of the match is in an expansion that extends
        //       to the first CE after the match. In this case, we reject the match.
+        const CEI *nextCEI = 0;
        if (strsrch->search->elementComparisonType == 0) {
-            const CEI *nextCEI  = ceb.get(targetIx + targetIxOffset);
+            nextCEI  = ceb.get(targetIx + targetIxOffset);
            maxLimit = nextCEI->lowIndex;
            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
                found = FALSE;
            }
        } else {
-            const CEI *nextCEI;
            for ( ; ; ++targetIxOffset ) {
                nextCEI = ceb.get(targetIx + targetIxOffset);
                maxLimit = nextCEI->lowIndex;
@ -3949,7 +3949,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
        //    to something else.
        //   This type of match should be rejected for not completely consuming a
        //   combining sequence.
-        if (isBreakBoundary(strsrch, mStart)) {
+        if (!isBreakBoundary(strsrch, mStart)) {
            found = FALSE;
        }

@ -3967,10 +3967,19 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
        //    This advances the index over any combining charcters.
        mLimit = maxLimit;
        if (minLimit < maxLimit) {
-            int32_t nba = nextBoundaryAfter(strsrch, minLimit);
-
-            if (nba >= lastCEI->highIndex) {
-                mLimit = nba;
+            // When the last CE's low index is the same as its high index, the CE is
+            // likely part of an expansion. In this case, the index is located just after
+            // the character corresponding to the CEs compared above. If the index is
+            // already on a break boundary, moving the position to the next boundary would
+            // yield an incorrect match length when ignorable characters exist between the
+            // position and the next character that produces CE(s). See ticket#8482.
+            if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
+                mLimit = minLimit;
+            } else {
+                int32_t nba = nextBoundaryAfter(strsrch, minLimit);
+                if (nba >= lastCEI->highIndex) {
+                    mLimit = nba;
+                }
            }
        }

@ -3986,7 +3995,7 @@ U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
            found = FALSE;
        }

-        if (isBreakBoundary(strsrch, mLimit)) {
+        if (!isBreakBoundary(strsrch, mLimit)) {
            found = FALSE;
        }

@ -4165,7 +4174,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
        //    to something else.
        //   This type of match should be rejected for not completely consuming a
        //   combining sequence.
-        if (isBreakBoundary(strsrch, mStart)) {
+        if (!isBreakBoundary(strsrch, mStart)) {
            found = FALSE;
        }

@ -4213,7 +4222,7 @@ U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
            }

            // Make sure the end of the match is on a break boundary
-            if (isBreakBoundary(strsrch, mLimit)) {
+            if (!isBreakBoundary(strsrch, mLimit)) {
                found = FALSE;
            }

--- a/icu4c/source/test/cintltst/usrchtst.c
+++ b/icu4c/source/test/cintltst/usrchtst.c
@ -2914,6 +2914,78 @@ static void TestPCEBuffer_2surr() {
  TestPCEBuffer_with(search,searchLen,source,sourceLen);
 }

+/**
+ * Regression test for ticket#8482.
+ *
+ * Searches for the single code unit U+00C9 in the text
+ * { U+00C9, U+0000, U+0041 } using a character break iterator and verifies
+ * that the reported match length is exactly 1, i.e. that the match is not
+ * stretched over the U+0000 that follows the matched character.
+ *
+ * The test stops at the first ICU failure instead of passing invalid handles
+ * to later calls, and always releases the objects it managed to open.
+ */
+static void TestMatchFollowedByIgnorables(void) {
+    UChar search[] = { 0x00c9 };
+    UChar source[] = { 0x00c9, 0x0000, 0x0041 };
+    int32_t searchLen;
+    int32_t sourceLen;
+    UErrorCode icuStatus = U_ZERO_ERROR;
+    /* Handles start out NULL so the cleanup section can tell what was opened. */
+    UCollator *coll = NULL;
+    const char *locale;
+    UBreakIterator *ubrk = NULL;
+    UStringSearch *usearch = NULL;
+    int32_t match = 0;
+    int32_t matchLength = 0;
+    const int32_t expectedMatchLength = 1;
+
+    searchLen = sizeof(search)/sizeof(UChar);
+    sourceLen = sizeof(source)/sizeof(UChar);
+
+    coll = ucol_openFromShortString("LHR_AN_CX_EX_FX_HX_NX_S3",
+                                    FALSE,
+                                    NULL,
+                                    &icuStatus);
+    if (U_FAILURE(icuStatus)) {
+        log_err("ucol_openFromShortString error\n");
+        goto cleanup;
+    }
+
+    /* Use the collator's valid locale for the break iterator as well. */
+    locale = ucol_getLocaleByType(coll,
+                                    ULOC_VALID_LOCALE,
+                                    &icuStatus);
+    if (U_FAILURE(icuStatus)) {
+        log_err("ucol_getLocaleByType error\n");
+        goto cleanup;
+    }
+
+    ubrk = ubrk_open(UBRK_CHARACTER,
+                        locale,
+                        source,
+                        sourceLen,
+                        &icuStatus);
+    if (U_FAILURE(icuStatus)) {
+        log_err("ubrk_open error\n");
+        goto cleanup;
+    }
+
+    usearch = usearch_openFromCollator(search,
+                                        searchLen,
+                                        source,
+                                        sourceLen,
+                                        coll,
+                                        ubrk,
+                                        &icuStatus);
+    if (U_FAILURE(icuStatus)) {
+        log_err("usearch_openFromCollator error\n");
+        goto cleanup;
+    }
+
+    match = usearch_first(usearch,
+                            &icuStatus);
+    if (U_FAILURE(icuStatus)) {
+        log_err("usearch_first error\n");
+        goto cleanup;
+    }
+
+    log_verbose("match=%d\n", match);
+
+    /* usearch_getMatchedLength() takes no status argument, so there is
+     * nothing further to check before comparing the length. */
+    matchLength = usearch_getMatchedLength(usearch);
+
+    if (matchLength != expectedMatchLength) {
+        log_err("Error: matchLength=%d, expected=%d\n", matchLength, expectedMatchLength);
+    }
+
+cleanup:
+    /* Close the search object first, then the break iterator and collator
+     * it was opened with. */
+    if (usearch != NULL) {
+        usearch_close(usearch);
+    }
+    if (ubrk != NULL) {
+        ubrk_close(ubrk);
+    }
+    if (coll != NULL) {
+        ucol_close(coll);
+    }
+}
+
+
 /**
 * addSearchTest
 */
@ -2975,6 +3047,7 @@ void addSearchTest(TestNode** root)
    addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCollator");
    addTest(root, &TestPCEBuffer_100df, "tscoll/usrchtst/TestPCEBuffer/1_00df");
    addTest(root, &TestPCEBuffer_2surr, "tscoll/usrchtst/TestPCEBuffer/2_dfff");
+    addTest(root, &TestMatchFollowedByIgnorables, "tscoll/usrchtst/TestMatchFollowedByIgnorables");
 }

 #endif /* #if !UCONFIG_NO_COLLATION */
--- a/icu4c/source/test/intltest/ssearch.cpp
+++ b/icu4c/source/test/intltest/ssearch.cpp
@ -2010,10 +2010,20 @@ static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
            // that's after the last CE in the match, use that index
            // as the end of the match.
            if (minLimit < maxLimit) {
-                int32_t nba = ubrk_following(charBreakIterator, minLimit);
+                // When the last CE's low index is the same as its high index, the CE is
+                // likely part of an expansion. In this case, the index is located just after
+                // the character corresponding to the CEs compared above. If the index is
+                // already on a break boundary, moving the position to the next boundary would
+                // yield an incorrect match length when ignorable characters exist between the
+                // position and the next character that produces CE(s). See ticket#8482.
+                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
+                    mend = minLimit;
+                } else {
+                    int32_t nba = ubrk_following(charBreakIterator, minLimit);

-                if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
-                    mend = nba;
+                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
+                        mend = nba;
+                    }
                }
            }