ICU-2093 Update word break rules to latest Unicode TR, work in progress

X-SVN-Rev: 11472
2003-04-08 05:35:13 +00:00 · 2003-04-08 05:35:13 +00:00 · 806b6d974f
commit 806b6d974f
parent f70487d239
3 changed files with 63 additions and 58 deletions
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -1,43 +1,49 @@
 #
-#   Copyright (C) 2002, International Business Machines Corporation and others.
+#   Copyright (C) 2002, 2003, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
 #   file:  word.txt   
 #
 #   ICU Word Break Rules
-#      See Unicode Technical Report #29.
+#      See Unicode Standard Annex #29.
-#      These rules are based on the proposed draft dated 2002-08-06
+#      These rules are based on the proposed draft dated 2003-03-31
 #
 ####################################################################################
 #
-#  Definitions imported from Line Break Rules.
+#  Character class definitions from TR 29
 #
 ####################################################################################
-$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
+$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
-        \u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
+                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-        \u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
+                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
-        \u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
+                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
 $ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
                           - [:Ideographic:]
                           - [:Katakana:]
                           - [:Script = Thai:]
                           - [:Script = Lao:]
                           - [:Script = Hiragana:]];
 $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
 $MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
 $MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
 $Numeric   = [:LineBreak = Numeric:];
 ####################################################################################
 #
 #  Definitions imported from Character Break Rules.
 #
 ####################################################################################
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
 $Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
-
+$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 # Note on $Extend:  Earlier versions of TR29 included Mc characters.
 #                   To avoid test breakage, Mc is still included for the time being.
 # $Extend     = [[:Mn:] [:Me:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend
 $Extend     = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend
@ -48,67 +54,63 @@ $Extend     = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_G
 #
 ####################################################################################
 $Katakana  = [[:Kana:]  \u30fc \uff70 \uff9e-\uff9f];
 $Hiragana  = [[:Hira:]];
 $Letter    = [[[:Alphabetic:]  \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
             [[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana ]];
 $Format    = [[:Cf:]];
 $MidLetter = [\u0027 \u00ad \u05f4 \u2019];
 $MidNumLet = [\u002e \u003a];
-# From Line Break, IS - Numeric Separator (Infix)
+# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#  $IS = [\u002c \u002e \u003a \u003b \u0589];
+#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
-$MidNum    = [\u002c \u003b \u0589];
+#          because we don't need to find the boundaries between adjacent syllables -
-
+#          they won't be word boundaries.
 #
 #  "Extended"  definitions.  Classes of characters including trailing combining chars and,
 #                            for types of chars that can appear in the interior of a word only,
 #                            trailing format characters.
 #
 $LetterEx     = $Letter    $Extend*; 
 $NumericEx    = $Numeric   $Extend*;
 $MidNumExF    = $MidNum    $Extend* $Format*;
 $MidNumLetExF = $MidNumLet $Extend* $Format*;
 $MidLetterExF = $MidLetter $Extend* $Format*;
 #
-#  Numbers.  Rules 6, 9, 10 form the TR.
+#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
 #
-$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*;
+$ALetterEx    = $ALetter   $Extend* $Format*; 
 $NumericEx    = $Numeric   $Extend* $Format*;
 $MidNumEx     = $MidNum    $Extend* $Format*;
 $MidNumLetEx  = $MidNumLet $Extend* $Format*;
 $MidLetterEx  = $MidLetter $Extend* $Format*;
 $KatakanaEx   = $Katakana  $Extend* $Format*;
 #
 #  Numbers.  Rules 8, 11, 12 form the TR.
 #
 $NumberSequence = $NumericEx (($MidNumEx | $MidNumLetEx)? $NumericEx)*;
 $NumberSequence {100};
 #
-#  Words.  Alpha-numerics.  Rule 3 - 10
+#  Words.  Alpha-numerics.  Rules 5, 6, 7, 9, 10
 #     - must include at least one letter. 
 #     - may include both letters and numbers.
 #     - may include  MideLetter, MidNumber punctuation.
 #
-$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*;
+$LetterSequence = $ALetterEx (($MidLetterEx | $MidNumLetEx)? $ALetterEx)*;     # rules #6, #7
 $NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};
 #
-#  Hiragana and Katakana
+#  Do not break between Katakana.   Rule #13.
 #
-$Hiragana $Extend* {300};
+$KatakanaEx+ {300};
-$Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
+[:Hiragana:] $Extend* {300};
 #
 #  Ideographic Characters.  Stand by themselves as words.
 #                           Separated from the "Everything Else" rule, below, only so that they
 #                           can be tagged with a return value.   TODO:  is this what we want?
 #
 [:IDEOGRAPHIC:] $Extend* {400};
 #
 #  Everything Else, with no tag.
 #                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are returned by themselves.
+#                   Controls do not.
 #
-[^$Control] $Extend*;
+[^$Control [:Ideographic:]] $Extend*;
-\r\n;
+[\u000d][\u000a];
 .;
 #
 #  Reverse Rules.   Back up over any of the chars that can group together.
@ -121,6 +123,7 @@ $Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
 #    reaches something that can only be the start (and probably only) char in a "word".
 #    A space or punctuation meets the test.
 #
-$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];
+$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
-! $NonStarters* .;
+!.*;
 #! ($NonStarters* | \n \r) .;
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -352,7 +352,7 @@ void RBBIAPITest::TestFirstNextFollowing()
        doTest(testString, p, q, 17, " here."); 
        // hindi starts here
        p=q;
-        q=charIter1->next(4);
+        q=charIter1->next(5);
        doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
        p=q;
        q=charIter1->next(2);
@ -515,7 +515,10 @@ void RBBIAPITest::TestLastPreviousPreceding()
        doTest(testString, p, q, 31, "\\u0964");
        p=q;
        q=charIter1->previous();
-        doTest(testString, p, q, 29, "\\u0939\\u094c");
+        doTest(testString, p, q, 30, "\\u094c");
        p=q;
        q=charIter1->previous();
        doTest(testString, p, q, 29, "\\u0939");
        q=charIter1->preceding(26);
        doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
        q=charIter1->preceding(16);
@ -609,7 +612,7 @@ void RBBIAPITest::TestIsBoundary(){
        errln("FAIL: in construction");
    else{
         charIter1->setText(testString1);
-         int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
+         int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26};
         doBoundaryTest(*charIter1, testString1, bounds1);
    }
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -2178,8 +2178,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
                    }
                }
                if (!seen2) {
-                    errln("No break between U+" + UCharToUnicodeString(c1)
+                    errln("No Break between \\U%04x and \\U%04x", c1, c2);
                                + " and U+" + UCharToUnicodeString(c2));
                    errCount++;
                    if (errCount >= 75)
                        return;
@ -2206,8 +2205,8 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
            tb.setText(work);
            for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
                if (k == 2) {
-                    errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) + 
+                    errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
-                        ", U+d U+a U+" + UCharToUnicodeString(work[3]));
+                        work[0], work[1], work[2], work[3]);
                    errCount++;
                    if (errCount >= 75)
                        return;