ICU-5595 Fix scrambled dictionary break cache when built in reverse with multiple spans

X-SVN-Rev: 21421
2007-04-17 23:01:42 +00:00 · 2007-04-17 23:01:42 +00:00 · 8cf4403816
commit 8cf4403816
parent 17717d9c7f
2 changed files with 32 additions and 21 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -527,6 +527,11 @@ int32_t RuleBasedBreakIterator::previous(void) {
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache > 0) {
            --fPositionInCache;
+            // If we're at the beginning of the cache, need to reevaluate the
+            // rule status
+            if (fPositionInCache <= 0) {
+                fLastStatusIndexValid = FALSE;
+            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
@@ -731,6 +736,11 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
+            // If we're at the beginning of the cache, need to reevaluate the
+            // rule status
+            if (fPositionInCache <= 0) {
+                fLastStatusIndexValid = FALSE;
+            }
            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
            return fCachedBreakPositions[fPositionInCache];
        }
@@ -1595,25 +1605,19 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
+    // Note: we always do this in the forward direction, so that the break
+    // cache is built in the right order.
+    if (reverse) {
+        utext_setNativeIndex(fText, rangeStart);
+    }
    while(U_SUCCESS(status)) {
-        if (reverse) {
-            while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) > rangeStart && (category & 0x4000) == 0) {
-                c = UTEXT_PREVIOUS32(fText);
-                UTRIE_GET16(&fData->fTrie, c, category);
-            }
-            if (current <= rangeStart) {
-                break;
-            }
+        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
+            utext_next32(fText);           // TODO:  tweak for post-increment operation
+            c = utext_current32(fText);
+            UTRIE_GET16(&fData->fTrie, c, category);
        }
-        else {
-            while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
-                utext_next32(fText);           // TODO:  tweak for post-increment operation
-                c = utext_current32(fText);
-                UTRIE_GET16(&fData->fTrie, c, category);
-            }
-            if (current >= rangeEnd) {
-                break;
-            }
+        if (current >= rangeEnd) {
+            break;
        }
        
        // We now have a dictionary character. Get the appropriate language object
@@ -1623,7 +1627,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
-            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, reverse, fBreakType, breaks);
+            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }
        
        // Reload the loop variables for the next go-round
@@ -1667,9 +1671,8 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

-    // If we get here, there were no language-based breaks. As a result, the
-    // text pointer should be back to where it started, but set it just to
-    // make sure.
+    // If we get here, there were no language-based breaks. Set the text pointer
+    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
 }
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -529,4 +529,12 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 #
 <data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>

+#
+#  Trac ticket 5595 Test Case
+<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
+ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
+ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
+สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
+ไมล์<200></data>
+