ICU-9600 sync up word.txt and word_POSIX.txt

X-SVN-Rev: 32470
2012-09-28 21:31:14 +00:00 · 2012-09-28 21:31:14 +00:00 · 038449fa93
commit 038449fa93
parent b5bed9e922
2 changed files with 36 additions and 17 deletions
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -9,7 +9,7 @@
 #      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
 #
 # Note:  Updates to word.txt will usually need to be merged into
-#        word_POSIX.txt and word_ja.txt also.
+#        word_POSIX.txt also.

 ##############################################################################
 #
--- a/icu4c/source/data/brkitr/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/word_POSIX.txt
@ -9,7 +9,7 @@
 #      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
 #
 # Note:  Updates to word.txt will usually need to be merged into
-#        word_POSIX.txt and word_ja.txt also.
+#        word_POSIX.txt also.

 ##############################################################################
 #
@ -29,14 +29,16 @@ $LF           = [\p{Word_Break = LF}];
 $Newline      = [\p{Word_Break = Newline}];
 $Extend       = [\p{Word_Break = Extend}];
 $Format       = [\p{Word_Break = Format}];
+$Hiragana     = [:Hiragana:];
 $Katakana     = [\p{Word_Break = Katakana}];
+$Han          = [:Han:];
 $ALetter      = [\p{Word_Break = ALetter}];
 $MidNumLet    = [\p{Word_Break = MidNumLet} - [.]];
 $MidLetter    = [\p{Word_Break = MidLetter} - [\:]];
 $MidNum       = [\p{Word_Break = MidNum} [.]];
 $Numeric      = [\p{Word_Break = Numeric}];
-$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];


 #   Dictionary character set, for triggering language-based break engines. Currently
@ -44,15 +46,22 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.

-$dictionary   = [:LineBreak = Complex_Context:];
 $Control        = [\p{Grapheme_Cluster_Break = Control}]; 
-$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
-                                                             #  include the dictionary characters.
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+

 #
 #  Rules 4    Ignore Format and Extend characters, 
 #             except when they appear at the beginning of a region of text.
 #
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 $KatakanaEx     = $Katakana     ($Extend |  $Format)*;
 $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
@ -62,7 +71,6 @@ $NumericEx      = $Numeric      ($Extend |  $Format)*;
 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
 $Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;

-$Hiragana       = [\p{script=Hiragana}];
 $Ideographic    = [\p{Ideographic}];
 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
@ -80,13 +88,14 @@ $CR $LF;
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
-#          format char(s).
-[^$CR $LF $Newline]? ($Extend |  $Format)+;
+#          format char(s).  The single char must not be a CJK dictionary character.
+[^$CR $LF $Newline $dictionaryCJK]? ($Extend |  $Format)+;

 $NumericEx {100};
 $ALetterEx {200};
-$KatakanaEx {300};       # note:  these status values override those from rule 5
-$HiraganaEx {300};       #        by virtual of being numerically larger.
+$HangulSyllable {200};
+$KatakanaEx {400};       # note:  these status values override those from rule 5
+$HiraganaEx {400};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #

 #
@ -115,24 +124,29 @@ $NumericEx $ALetterEx {200};
 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

 # rule 13
-
-$KatakanaEx  $KatakanaEx {300};
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$KatakanaEx  $KatakanaEx {400};

 # rule 13a/b

 $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
 $NumericEx      $ExtendNumLetEx {100};    #  (13a)
-$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
 $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)

 $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
 $ExtendNumLetEx $NumericEx  {100};    #  (13b)
-$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
- 
+$ExtendNumLetEx $KatakanaEx {400};    #  (13b)
+
 # rule 13c

 $Regional_IndicatorEx $Regional_IndicatorEx;

+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 


 ## -------------------------------------------------
@ -145,6 +159,7 @@ $BackNumericEx            = ($Format | $Extend)* $Numeric;
 $BackMidNumEx             = ($Format | $Extend)* $MidNum;
 $BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
 $BackKatakanaEx           = ($Format | $Extend)* $Katakana;
+$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
 $BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
 $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;

@ -152,7 +167,7 @@ $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
 $LF $CR;

 # rule 4
-($Format | $Extend)*  [^$CR $LF $Newline]?;
+($Format | $Extend)*  [^$CR $LF $Newline $dictionaryCJK]?;

 # rule 5

@ -192,6 +207,10 @@ $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackEx

 $BackRegional_IndicatorEx $BackRegional_IndicatorEx;

+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable;
+$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
+
 ## -------------------------------------------------

 !!safe_reverse;