ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
2003-07-28 06:40:25 +00:00 · 2003-07-28 06:40:25 +00:00 · 6bbbeb7637
commit 6bbbeb7637
parent 468a10f112
3 changed files with 88 additions and 48 deletions
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -7,6 +7,11 @@
 #         Implement default line breaking as defined by Unicode TR 14.
 #

+#   Known Deviations from TR14:
+#      LB  7a: The sequence SP CM+ is not treated as an ID.
+#              The SP in SP CM+ is not distinguished from any other SP.
+#      LB 14a: Break before and after CB is not implemented.
+

 #
 #  Character Classes defined by TR 14.
@ -44,12 +49,6 @@ $XX = [:LineBreak =  Unknown:];
 $ZW = [:LineBreak =  ZWSpace:];


-#
-#  Character classes from TR 29.  Needed for finding characters.
-#
-#
-$Extend  = [:Grapheme_Extend = TRUE:];
-

 #
 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
@ -63,23 +62,24 @@ $ALPlus = $AL | $AI | $SA | $XX;
 #  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
 #
 $ALcm = $ALPlus $CM*;
-$BAcm = $BA $Extend*;
-$BBcm = $BB $Extend*;
-$B2cm = $B2 $Extend*;
-$CLcm = $CL $Extend*;
-$EXcm = $EX $Extend*;
-$GLcm = $GL $Extend*;
-$HYcm = $HY $Extend*;
-$IDcm = ($ID $CM* | $SP $CM+);
-$INcm = $IN $Extend*;
-$IScm = $IS $Extend*;
-$NScm = $NS $Extend*;
-$NUcm = $NU $Extend*;
-$OPcm = $OP $Extend*;
-$POcm = $PO $Extend*;
-$PRcm = $PR $Extend*;
-$QUcm = $QU $Extend*;
-$SYcm = $SY $Extend*;
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HYcm = $HY $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$SPcm = $SP $CM*;
+$SYcm = $SY $CM*;


 #  New Lines.  Always break after, never break before.
@ -90,22 +90,22 @@ $SYcm = $SY $Extend*;
 #              appears at the end of line break rule.
 #
 $NLF = $BK | $CR | $LF | $NL | $CR $LF;
-$EndingsSoft = $SP* $ZW*;
-$EndingsHard = $SP* $ZW* $NLF;
+$EndingsSoft = ($ZW* $SP)* $ZW*;
+$EndingsHard = ($ZW* $SP)* $ZW* $NLF;


 #
 #  Openings  Sequences that can precede Words, and that should not be separated from them.
 #            Rules LB 9, 10
 #
-$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
+$Openings = (($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*)*;

 #
 #  Closings  Seqences that follow words, and that should not be separated from them,
 #            Rule LB 8, 11, 15
-$Closings =  ($SP*( ($CL ($SP* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm)*;
+$Closings =  (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)?  |  $EX  | $IS  | $SY) $CM*) | $BAcm | $HYcm  | $NScm)*;

-$WordClosings = ($CLcm | $EXcm | $IScm | $SYcm | $BAcm | $HYcm | $NScm)*;
+$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;

 #
 #  Words.  Includes mixed Alpha-numerics.
@ -115,12 +115,14 @@ $Number         =  $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbe
                                                                       # Regex form, rather than rule 18
                                                                       
 # Alpha-numeric.   16, 17 
-$Word   = ($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?) |
-           $INcm  |
-           $CB;                                             # Deviation from TR for $CB
+$Word   =  ($ALcm | $NUcm)+  $INcm*  |
+           $IDcm ($POcm? | $INcm*)   |
+           $CM+  ($POcm? | $INcm*)   |                      # CM with no base is like ID  (LB 7a)
+           $INcm+                    |
+           $CB;                                             # Deviation from Unicode spec for $CB
                                                            #   We treat as a single char word
                                                            
-$Dashes = (($B2cm $SP*)*);                                             # Dashes           11a   
+$Dashes = (($B2cm ($ZW* $SP)*)*);                                             # Dashes           11a   
        
        

@ -132,11 +134,11 @@ $HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?;       # For Rle LB15, Don'
        
 $Word15 = $Openings? (
             ($BBcm* $Openings ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) |   # Rule 15. Stuff sticks around words.
-             $BBcm* [^[:Cc:] $BK $CR $LF $ZW $SP $GL ] $Extend*  |                 # Allow characters that don't meet the
-             $BBcm* [^$BK $CR $LF $ZW $SP $GL ]                                 #  more elaborate definitions for WORD
+             $BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW $SP $GL ] $CM*  |                 # Allow characters that don't meet the
+             $BBcm* [^$BK $CR $LF $NL $ZW $SP $GL ]                                 #  more elaborate definitions for WORD
             )  $WordClosings?;                                                          #  to be glued.
       
-$GluedWord  = $Openings? ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
+$GluedWord  = $Openings? (($ZW* $SP)* $GLcm | $QUcm)? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                    # Rules 13, 14

 #
@ -158,7 +160,8 @@ $Openings $GluedWord  $Closings $EndingsHard{100};
 #     containing a space that may inhibit a break from occuring.
 #

-$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
+$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($CM* $SP)) | (($CM* $SP)+ $OP);
 $ClumpingChars = [^$SP $BK $CR $LF];

-!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
+#!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
+!.*;
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -2532,7 +2532,6 @@ RBBILineMonkey::RBBILineMonkey()

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

-
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
@ -2559,19 +2558,38 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
        return -1;
    }

-  
+    // Find where the next character of interest starts.
+    //   This depends on the character at prevPos, and on whether it absorbs
+    //   any following characters of line break class CM.
+    UChar32   c = fText->char32At(prevPos);
+    if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
+        // char doesn't automatically combine with CM.
+        nextPos = fText->moveIndex32(prevPos, 1);
+    } else {
+        nextPos = fCharBI->following(prevPos);
+        for (;;) {
+            UChar32 c = fText->char32At(nextPos);
+            if (!fCM->contains(c)) {
+                break;
+            }
+            nextPos = fText->moveIndex32(nextPos, 1);
+        }
+    }
+    pos = prevPos;
+
+
    // Loop runs once per position in the test text, until a break position
    //  is found.
-    nextPos = fCharBI->following(prevPos);
-    pos     = prevPos;
    for (;;) {
        prevPos   = pos;
        pos       = nextPos;
-        nextCPPos = fText->moveIndex32(pos, 1);
-        nextPos   = fCharBI->following(pos);     // Advance by grapheme cluster
+
        UChar32 prevChar = fText->char32At(prevPos);
        UChar32 thisChar = fText->char32At(pos);

+        nextCPPos = fText->moveIndex32(pos, 1);
+        nextPos   = nextCPPos;
+
        // Break at end of text.
        if (pos >= fText->length()) {
            break;
@ -2598,7 +2616,7 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
                continue;
        }

-        // LB 4  DOn't break before spaces or zero-width space.
+        // LB 4  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }
@ -2606,6 +2624,23 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
            continue;
        }

+        if (!fSP->contains(thisChar)) {
+            // nextPos advances over Hangul Syllables plus any chars
+            //    of line break class CM.
+            // Advancing by a grapheme cluster with a character break iterator
+            //  almost does this, except that line break class CM includes some
+            //  characters that grapheme clusters do not treat as combining.
+            nextPos   = fCharBI->following(pos);     // Advance by grapheme cluster
+            // now advance over any CM class chars that were missed
+            for (;;) {
+                UChar32 c = fText->char32At(nextPos);
+                if (!fCM->contains(c)) {
+                    break;
+                }
+                nextPos = fText->moveIndex32(nextPos, 1);
+            }
+        }
+

        // LB 5  Break after zero width space
        if (fZW->contains(prevChar)) {
@ -2739,8 +2774,11 @@ fall_through_11:
        }


-        // LB 17
+        // LB 17    ID x PO    (Note:  Leading CM behaves like ID)
+        //          AL x NU
+        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
+            fCM->contains(prevChar) && fPO->contains(thisChar) || 
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
            continue; 
@ -2773,7 +2811,6 @@ fall_through_11:
            
    }
    
-    // We should never get here.
    return pos;
 }

@ -2916,7 +2953,7 @@ void RBBITest::TestMonkey(char *params) {
    }

    if (breakType == "line" || breakType == "all") {
-#if 0
+#if 1
        // TODO:  Enable test
        RBBILineMonkey  m;
        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -462,7 +462,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"

 #      Surrogate line break tests.
 #
-<data>•\u4e01•\ud840\udc01•\u4e02•abc•\ue000•\udb80\udc01•</data>
+<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>

 #      Regression for bug 836
 <data>•AAA•(AAA •</data>