ICU-2292 line break rules updated, 15 mins testmonkey passes

X-SVN-Rev: 13663
2003-11-11 21:24:09 +00:00 · 2003-11-11 21:24:09 +00:00 · 8feb899d7d
commit 8feb899d7d
parent 7eb4264ca5
3 changed files with 90 additions and 45 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -730,8 +730,10 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
                lookaheadStatus = 0;
            } else if (result == initialPosition) {
                // Ran off end, no match found.
-                // Treat as a break at the end of the input string.
-                result = fText->endIndex();
+                // Reposition the text iterator one code point past initialPosition.
+                fText->setIndex(initialPosition);
+                fText->next32();
+                fText->getIndex();
            }
            break;
        }
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -44,6 +44,7 @@ $SA = [:LineBreak =  Complex_Context:];
 $SG = [:LineBreak =  Surrogate:];
 $SP = [:LineBreak =  Space:];
 $SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
 $XX = [:LineBreak =  Unknown:];
 $ZW = [:LineBreak =  ZWSpace:];

@ -60,7 +61,6 @@ $LVT = [:Hangul_Syllable_Type = LVT:];

 $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;

-
 #
 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
 #                               SA  (South East Asian: Thai, Lao, Khmer)
@ -91,6 +91,7 @@ $PRcm = $PR $CM*;
 $QUcm = $QU $CM*;
 $SPcm = $SP $CM*;
 $SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;

 #
 #  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
@ -114,6 +115,7 @@ $PR $CM+;
 $QU $CM+;
 $SP $CM+;
 $SY $CM+;
+$WJ $CM+;

 ## -------------------------------------------------

@ -131,12 +133,19 @@ $CR $LF {100};

 # LB 4         x SP
 #              x ZW
-$LB3NonBreaks      [$SP $ZW];
+$ZW [$SP $ZW];
 $LB5NonBreaks $CM* [$SP $ZW];

 # LB 5         Break after zero width space
 $LB5Breaks = [$LB3Breaks $ZW];

+# LB 6
+#
+# Korean Syllable Definitions
+#
+
+($HangulSyllable) $CM*;
+
 # LB 7     Combining marks.  TODO:  get it right!
 #                                   $SP $CM needs to behave like $ID.
 #                                   X   $CM needs to behave like X, where X is not $SP.
@ -163,10 +172,8 @@ $CLcm $SP* $NScm;
 ($B2cm)+;

 # LB 11b
-$LB5NonBreaks $CM* $GLcm .?;
-$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*;
-$GLcm $LB3NonBreaks?;
-$GLcm $LB5NonBreaks $CM*;
+$LB5NonBreaks $CM* ($GLcm | $WJcm);
+($GLcm | $WJcm) .?;

 # LB 12
 $LB12NonBreaks = [$LB5NonBreaks - $SP];
@ -184,14 +191,12 @@ $QUcm $LB5NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
 $LB14NonBreaks = [$LB12NonBreaks - $CB];
 $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;

-
 # LB 15
 $LB14CanBreakAfter ($BAcm | $HYcm | $NScm);   
 $BBcm [^$CB];
 $BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;  

 # LB 16
-#($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
 $ALcm    $INcm;
 $CM+     $INcm;     #  by rule 7c, any otherwise unattached CM behaves as AL
 $IDcm    $INcm;
@ -206,11 +211,8 @@ $ALcm+ $NUcm;       # includes $LB19
 $CM+   $NUcm;       # Rule 7c
 $NUcm $ALcm+;

-
-
 # LB 18
 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
-#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;

 # LB 19
 $CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL
@ -226,7 +228,6 @@ $CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL

 !!reverse;

-
 $CM+ $ALPlus;
 $CM+ $BA;
 $CM+ $BB;
@ -246,6 +247,7 @@ $CM+ $PR;
 $CM+ $QU;
 $CM+ $SP;
 $CM+ $SY;
+$CM+ $WJ;

 # LB 3

@ -262,6 +264,9 @@ $LF $CR;

 # LB 6 Jamo is treated like an alphabet

+$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
+$CM* $BackHangulSyllable;
+
 # LB 7 Combining marks. 
 #    $SP $CM needs to behave like $ID.
 #    X   $CM needs to behave like X, where X is not $SP.
@ -288,9 +293,9 @@ $CM* $NS $SP* $CM* $CL;
 ($CM* $B2)+;

 # LB 11b
-$CM* $GL $CM* $LB5NonBreaks;
-$CM* $LB5NonBreaks $CM* $GL;
-$LB3NonBreaks $CM* $GL;
+$CM* ($GL | $WJ) $CM* $LB5NonBreaks;
+$CM* $LB5NonBreaks $CM* ($GL | $WJ);
+. $CM* ($GL | $WJ);

 # LB 12

@ -340,6 +345,9 @@ $CM* $ALPlus $CM+ / $LB5Breaks;

 !!safe_reverse;

+# LB 6
+$V+ $L;
+
 # LB 7
 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
 $CM+ $SP / .;
@ -354,13 +362,16 @@ $SP+ $CM* $QU;
 $SP+ $CM* $CL;

 # LB 18
-$IS+ $CM* $NU;
+($CM* $IS)+ $CM* $NU;
 $CL $CM* ($NU | $IS);

 ## -------------------------------------------------

 !!safe_forward;

+# LB 6
+$V+ $T;
+
 # LB 7
 [^$BK $CR $LF $NL $ZW $SP] $CM+;
 $SP $CM+ / [^$CM];
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -595,11 +595,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             if(exec) TestExtended();                          break;
        case 17: name = "TestMonkey";
             if(exec) {
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+ #if !UCONFIG_NO_REGULAR_EXPRESSIONS
               TestMonkey(params);
-#else
+ #else
               logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
-#endif
+ #endif
             }
             break;
        default: name = ""; break; //needed to end loop
@ -2295,7 +2295,6 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
            break;
        }

- 
        // Rule (5).   ALetter x ALetter
        if (fALetterSet->contains(c1) &&
            fALetterSet->contains(c2))  {
@ -2494,6 +2493,7 @@ RBBILineMonkey::RBBILineMonkey()
    fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
    fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
    fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
+    fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
    fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);

    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
@ -2530,6 +2530,7 @@ RBBILineMonkey::RBBILineMonkey()
    fSets->addElement(fAI, status);
    fSets->addElement(fAL, status);
    fSets->addElement(fID, status);
+    fSets->addElement(fWJ, status);
    fSets->addElement(fSA, status);
    // fSets->addElement(fXX, status);

@ -2600,7 +2601,8 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
    //  advance over any CM class chars.  (Line Break CM class is different from
    //    grapheme cluster CM, so we need to do this even for HangulSyllables.
    //    Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
-    if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) {
+    if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a 
+        || *posChar==0x0d || *posChar==0x85)) {
        for (;;) {
            *nextChar = fText->char32At(nPos);
            if (!fCM->contains(*nextChar)) {
@ -2791,11 +2793,21 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        // LB 9  Don't break after OP SP*
        /// UBool cmFlag = FALSE;
        for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
+            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
+                tPos=fText->moveIndex32(tPos, -1);
+            }
            if (fOP->contains(fText->char32At(tPos))) {
                break;
            }
-            if (fSP->contains(prevChar) == FALSE
-                || fSP->contains(fText->char32At(tPos)) == FALSE 
+            if (fSP->contains(fText->char32At(tPos)) == TRUE) {
+                int32_t temp = fText->moveIndex32(tPos, 1);
+                if (fCM->contains(fText->char32At(temp))) {
+                    // if we have $SP$CM+ which is an $ID
+                    goto fall_through_9;
+                }
+            }
+            // fSP->contains(prevChar) == FALSE || 
+            if (fSP->contains(fText->char32At(tPos)) == FALSE 
                || tPos == 0) {
                /// || cmFlag == TRUE) {
                // if we have $SP$CM+ which is an $ID
@ -2834,6 +2846,9 @@ fall_through_9:
        if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
            continue;
        }
+        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
+            continue;
+        }

        // LB 12    break after space
        if (fSP->contains(prevChar)) {
@ -2896,6 +2911,10 @@ fall_through_9:
                    nextPos = numEndIdx;
                    pos = fCharBI->preceding(numEndIdx); 
                    thisChar = fText->char32At(pos);
+                    while (fCM->contains(thisChar)) {
+                        pos = fCharBI->preceding(pos);
+                        thisChar = fText->char32At(pos);
+                    }
                }
                continue;
            }
@ -3010,7 +3029,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
 {
    int count = 0;
    int i = 0;
-    int forward[20];
+    int forward[50];
    bi->setText(ustr);
    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
        forward[count] = i;
@ -3078,9 +3097,14 @@ void RBBITest::TestWordBreaks(void)
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-    UChar         str[25]; 
+    UChar         str[300]; 
    char          *strlist[] = 
    {
+    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
+    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
+    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
+    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
+    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
@ -3124,7 +3148,7 @@ void RBBITest::TestWordBreaks(void)
        // RBBICharMonkey monkey;
        RBBIWordMonkey monkey;

-        int expected[20];
+        int expected[50];
        int expectedcount = 0;

        monkey.setText(ustr);
@ -3144,7 +3168,7 @@ void RBBITest::TestWordBoundary(void)
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-    UChar         str[20]; 
+    UChar         str[50]; 
    char          *strlist[] = 
    {
    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
@ -3182,7 +3206,7 @@ void RBBITest::TestWordBoundary(void)
        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 20);
        UnicodeString ustr(str);
-        int forward[20];
+        int forward[50];
        int count = 0;
        
        bi->setText(ustr);
@ -3217,9 +3241,21 @@ void RBBITest::TestLineBreaks(void)
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
-    UChar         str[20]; 
+    UChar         str[50]; 
    char          *strlist[] = 
    {
+     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
+     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
+     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
+     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
+     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
+     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
+     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
+     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
+     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
+     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
+     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
+     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
@ -3235,7 +3271,6 @@ void RBBITest::TestLineBreaks(void)
     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
-     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
@ -3253,7 +3288,7 @@ void RBBITest::TestLineBreaks(void)
        UnicodeString ustr(str);
        RBBILineMonkey monkey;

-        int expected[20];
+        int expected[50];
        int expectedcount = 0;

        monkey.setText(ustr);
@ -3386,6 +3421,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
    UnicodeString    testText;
    int32_t          numCharClasses;
    UVector          *chClasses;
+    int              expected[TESTSTRINGLEN*2 + 1];
+    int              expectedCount = 0;
    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
    char             reverseBreaks[TESTSTRINGLEN*2+1];
@ -3443,6 +3480,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
        memset(expectedBreaks, 0, sizeof(expectedBreaks));
        expectedBreaks[0] = 1;
        int32_t breakPos = 0;
+        expectedCount = 0;
        for (;;) {
            breakPos = mk.next(breakPos);
            if (breakPos == -1) {
@ -3452,6 +3490,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
                errln("breakPos > testText.length()");
            }
            expectedBreaks[breakPos] = 1;
+            expected[expectedCount ++] = breakPos;
        }

        // Find the break positions using forward iteration
@ -3528,20 +3567,13 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint

                // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
                UnicodeString errorText = "<data>";
-                /*** if (strcmp(errorType, "next()") == 0) {
+                /***if (strcmp(errorType, "next()") == 0) {
                    startContext = 0;
-                    int j = i;
-                    while (true) {
-                        if (forwardBreaks[j ++] != 0) {
-                            printf("%d\n", j);
-                            break;
-                        }
-                        if (j % 100 == 0) {
-                            printf("continue %d\n", j);
-                        }
-                    }
-                    endContext = j + 1;
+                    endContext = testText.length();
+                   
+                    printStringBreaks(testText, expected, expectedCount);
                }***/
+
                for (ci=startContext; ci<endContext;) {
                    UnicodeString hexChars("0123456789abcdef");
                    UChar32  c;