ICU-2093 Update word break rules to latest Unicode TR, work in progress

X-SVN-Rev: 11472
2003-04-08 05:35:13 +00:00 · 2003-04-08 05:35:13 +00:00 · 806b6d974f
commit 806b6d974f
parent f70487d239
3 changed files with 63 additions and 58 deletions
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -1,43 +1,49 @@
 #
-#   Copyright (C) 2002, International Business Machines Corporation and others.
+#   Copyright (C) 2002, 2003, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
 #   file:  word.txt   
 #
 #   ICU Word Break Rules
-#      See Unicode Technical Report #29.
-#      These rules are based on the proposed draft dated 2002-08-06
+#      See Unicode Standard Annex #29.
+#      These rules are based on the proposed draft dated 2003-03-31
 #



 ####################################################################################
 #
-#  Definitions imported from Line Break Rules.
+#  Character class definitions from TR 29
 #
 ####################################################################################
-$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
-        \u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
-        \u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
-        \u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
+$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];


+$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
+                           - [:Ideographic:]
+                           - [:Katakana:]
+                           - [:Script = Thai:]
+                           - [:Script = Lao:]
+                           - [:Script = Hiragana:]];
+                           
+$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
+              
+$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
+
+$MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
+$Numeric   = [:LineBreak = Numeric:];
+

-####################################################################################
-#
-#  Definitions imported from Character Break Rules.
-#
-####################################################################################
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
 $Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
-
-# Note on $Extend:  Earlier versions of TR29 included Mc characters.
-#                   To avoid test breakage, Mc is still included for the time being.
-# $Extend     = [[:Mn:] [:Me:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend
-$Extend     = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend
+$Extend     = [[:Grapheme_Extend = TRUE:]]; 



@ -48,67 +54,63 @@ $Extend     = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_G
 #
 ####################################################################################

-$Katakana  = [[:Kana:]  \u30fc \uff70 \uff9e-\uff9f];
-$Hiragana  = [[:Hira:]];
-$Letter    = [[[:Alphabetic:]  \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
-             [[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana ]];
 $Format    = [[:Cf:]];

-$MidLetter = [\u0027 \u00ad \u05f4 \u2019];
-
-$MidNumLet = [\u002e \u003a];


-# From Line Break, IS - Numeric Separator (Infix)
-#  $IS = [\u002c \u002e \u003a \u003b \u0589];
-$MidNum    = [\u002c \u003b \u0589];
-
+# Rule 3:  Treat a grapheme cluster as if it were a single character.
+#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+#          because we don't need to find the boundaries between adjacent syllables -
+#          they won't be word boundaries.
 #
-#  "Extended"  definitions.  Classes of characters including trailing combining chars and,
-#                            for types of chars that can appear in the interior of a word only,
-#                            trailing format characters.
-#
-$LetterEx     = $Letter    $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumExF    = $MidNum    $Extend* $Format*;
-$MidNumLetExF = $MidNumLet $Extend* $Format*;
-$MidLetterExF = $MidLetter $Extend* $Format*;


 #
-#  Numbers.  Rules 6, 9, 10 form the TR.
+#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
 #
-$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*;
+$ALetterEx    = $ALetter   $Extend* $Format*; 
+$NumericEx    = $Numeric   $Extend* $Format*;
+$MidNumEx     = $MidNum    $Extend* $Format*;
+$MidNumLetEx  = $MidNumLet $Extend* $Format*;
+$MidLetterEx  = $MidLetter $Extend* $Format*;
+$KatakanaEx   = $Katakana  $Extend* $Format*;
+
+
+#
+#  Numbers.  Rules 8, 11, 12 from the TR.
+#
+$NumberSequence = $NumericEx (($MidNumEx | $MidNumLetEx)? $NumericEx)*;
 $NumberSequence {100};

 #
-#  Words.  Alpha-numerics.  Rule 3 - 10
+#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
 #     - must include at least one letter. 
 #     - may include both letters and numbers.
 #     - may include  MideLetter, MidNumber punctuation.
 #
-$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*;
+$LetterSequence = $ALetterEx (($MidLetterEx | $MidNumLetEx)? $ALetterEx)*;     # rules #6, #7
 $NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};

 #
-#  Hiragana and Katakana
+#  Do not break between Katakana.   Rule #13.
 #
-$Hiragana $Extend* {300};
-$Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
+$KatakanaEx+ {300};
+[:Hiragana:] $Extend* {300};

 #
 #  Ideographic Characters.  Stand by themselves as words.
+#                           Separated from the "Everything Else" rule, below, only so that they
+#                           can be tagged with a return value.   TODO:  is this what we want?
 #
 [:IDEOGRAPHIC:] $Extend* {400};

 #
 #  Everything Else, with no tag.
 #                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are returned by themselves.
+#                   Controls do not.
 #
-[^$Control] $Extend*;
-\r\n;
-.;
+[^$Control [:Ideographic:]] $Extend*;
+[\u000d][\u000a];

 #
 #  Reverse Rules.   Back up over any of the chars that can group together.
@ -121,6 +123,7 @@ $Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
 #    reaches something that can only be the start (and probably only) char in a "word".
 #    A space or punctuation meets the test.
 #
-$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];
+$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];

-! $NonStarters* .;
+!.*;
+#! ($NonStarters* | \n \r) .;
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -352,7 +352,7 @@ void RBBIAPITest::TestFirstNextFollowing()
        doTest(testString, p, q, 17, " here."); 
        // hindi starts here
        p=q;
-        q=charIter1->next(4);
+        q=charIter1->next(5);
        doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
        p=q;
        q=charIter1->next(2);
@ -515,7 +515,10 @@ void RBBIAPITest::TestLastPreviousPreceding()
        doTest(testString, p, q, 31, "\\u0964");
        p=q;
        q=charIter1->previous();
-        doTest(testString, p, q, 29, "\\u0939\\u094c");
+        doTest(testString, p, q, 30, "\\u094c");
+        p=q;
+        q=charIter1->previous();
+        doTest(testString, p, q, 29, "\\u0939");
        q=charIter1->preceding(26);
        doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
        q=charIter1->preceding(16);
@ -609,7 +612,7 @@ void RBBIAPITest::TestIsBoundary(){
        errln("FAIL: in construction");
    else{
         charIter1->setText(testString1);
-         int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
+         int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26};
         doBoundaryTest(*charIter1, testString1, bounds1);
    }

--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -2178,8 +2178,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
                    }
                }
                if (!seen2) {
-                    errln("No break between U+" + UCharToUnicodeString(c1)
-                                + " and U+" + UCharToUnicodeString(c2));
+                    errln("No Break between \\U%04x and \\U%04x", c1, c2);
                    errCount++;
                    if (errCount >= 75)
                        return;
@ -2206,8 +2205,8 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
            tb.setText(work);
            for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
                if (k == 2) {
-                    errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) + 
-                        ", U+d U+a U+" + UCharToUnicodeString(work[3]));
+                    errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
+                        work[0], work[1], work[2], work[3]);
                    errCount++;
                    if (errCount >= 75)
                        return;