From 244f4e3ac2e6f20ec8ff3b5e0b8de62f2a271b8a Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Mon, 25 Oct 2004 23:50:11 +0000 Subject: [PATCH] ICU-4157 Word Break, undo failed attempt at a branch for TR29 proposed updates X-SVN-Rev: 16625 --- icu4c/source/data/brkitr/word.txt | 139 ++++++++++++++++-------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/icu4c/source/data/brkitr/word.txt b/icu4c/source/data/brkitr/word.txt index af5af09fba..952b2ec547 100644 --- a/icu4c/source/data/brkitr/word.txt +++ b/icu4c/source/data/brkitr/word.txt @@ -17,6 +17,7 @@ ############################################################################## !!chain; +!!LBCMNoChain; $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] @@ -28,23 +29,20 @@ $Katakana = [[:Script = KATAKANA:] $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] - [:Ideographic:] - $Katakana - - [:Script = Hiragana:] - [:Script = Thai:] - [:Script = Lao:] - - [:Grapheme_Extend = TRUE:]]; - + - [:Script = Hiragana:]]; + $ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]]; +$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]]; $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; -#$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]]; -$MidNumLet = [[:name = COLON:]]; -$ExtendNumLet = [[:Pc:] - [:name = KATAKANA MIDDLE DOT:] - - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]]; - +$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]]; + $MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; $Numeric = [:LineBreak = Numeric:]; @@ -53,86 +51,89 @@ $Numeric = [:LineBreak = Numeric:]; # The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Extend = [[:Grapheme_Extend = TRUE:]]; -$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; -$Format = [[:Cf:] - $Extend]; -$Hiragana = [:Hiragana:]; -$Ideographic = [:IDEOGRAPHIC:]; - -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidNumLetEx = $MidNumLet $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$ExtendNumLetEx = $ExtendNumLet $Extend*; +$CR = \u000d; +$LF = \u000a; +$Extend = [[:Grapheme_Extend = TRUE:]]; +$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; +$Format = [[:Cf:] - $Extend]; +$Hiragana = [:Hiragana:]; +$Ideographic = [:IDEOGRAPHIC:]; ## ------------------------------------------------- !!forward; +$CR $LF; + +# rule 3 and 4 + +$ALetterEx = $ALetter $Extend*; +$ABaseLetterEx = $ABaseLetter $Extend*; +$ACMLetterEx = $ACMLetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidNumLetEx = $MidNumLet $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; -# Rule 3 - don't break grapheme clusters. # see character breaks -$CR $LF; [^$Control] $Extend*; -$NumericEx $Extend* {100}; -$ALetterEx $Extend* {200}; # rule 5 -$ALetterEx $Format* $ALetterEx {200}; +$ALetterEx ($Format* $ALetterEx)* {200}; # rule 6 and 7 -$ALetterEx $Format* ($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx {200}; + +$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx); + +$ALetterSeq = +$ALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; + +$MidALetterSeq = +$MidALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; # rule 8 -$NumericEx $Format* $NumericEx {100}; +$NumericEx ($Format* $NumericEx)* {100}; # rule 9 -$ALetterEx $Format* $NumericEx {200}; +$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200}; # rule 10 -$NumericEx $Format* $ALetterEx {200}; +$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200}; # rule 11 and 12 -$NumericEx $Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx {100}; +$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100}; # rule 13 -$KatakanaEx $Format* $KatakanaEx {300}; -$Hiragana $Extend* {300}; +$KatakanaEx ($Format* $KatakanaEx)* {300}; +$Hiragana $Extend* {300}; $Ideographic $Extend* {400}; -# New Rules X Y -$ALetterEx $Format* $ExtendNumLetEx {200}; # (X) -$NumericEx $Format* $ExtendNumLetEx {100}; # (X) -$KatakanaEx $Format* $ExtendNumLetEx {300}; # (X) - -$ExtendNumLetEx $Format* $ALetterEx {200}; # (Y) -$ExtendNumLetEx $Format* $NumericEx {100}; # (Y) -$ExtendNumLetEx $Format* $KatakanaEx {300}; # (Y) - - - ## ------------------------------------------------- !!reverse; $BackALetterEx = $Extend* $ALetter; +$BackABaseLetterEx = $Extend* $ABaseLetter; +$BackACMLetterEx = $Extend* $ACMLetter; $BackNumericEx = $Extend* $Numeric; $BackMidNumEx = $Extend* $MidNum; $BackMidNumLetEx = $Extend* $MidNumLet; $BackMidLetterEx = $Extend* $MidLetter; $BackKatakanaEx = $Extend* $Katakana; -$BackExtendNumLetEx= $Extend* $ExtendNumLet; $LF $CR; @@ -142,24 +143,43 @@ $Extend* [^$Control]; # rule 5 -$BackALetterEx $Format* $BackALetterEx; +($BackALetterEx $Format*)* $BackABaseLetterEx; +($BackALetterEx $Format*)* $BackACMLetterEx / $Control; # rule 6 and 7 -$BackALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* $BackALetterEx; +$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format); +$BackALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackABaseLetterEx; + +$BackMidALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackMidALetterEx; # rule 8 $BackNumericEx $Format* $BackNumericEx; -# rule 9 +# rule 10 -$BackNumericEx $Format* $BackALetterEx; +(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq; + +# to handle letter sequences ending with a combining mark +(($BackNumericEx | $BackMidALetterSeq) $Format*)* +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackACMLetterEx / $Control; # rule 10 -$BackALetterEx $Format* $BackNumericEx; +($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx; # rule 11 and 12 @@ -169,11 +189,6 @@ $BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumeric $BackKatakanaEx $Format* $BackKatakanaEx; -# New Rules X and Y -# -($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $Format* $BackExtendNumLetEx; -$BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); - ## ------------------------------------------------- !!safe_reverse; @@ -183,17 +198,18 @@ $Extend+ [^$Extend]; $Extend+; # comes into play when buffer _begins_ with an $Extend+. # rule 4 -$Format+ $BackALetterEx; +$Format+ $BackABaseLetterEx; +$Format+ $BackACMLetterEx / $Control; $Format+ $BackNumericEx; $Format+ $BackMidLetterEx; $Format+ $BackMidNumLetEx; $Format+ $BackMidNumEx; $Format+ $BackKatakanaEx; -$Format+ $BackExtendNumLetEx; # rule 6 -($MidLetter | $MidNumLet) $Format* $BackALetterEx; +($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx; +($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control; # rule 11 ($MidNum | $MidNumLet) $Format* $BackNumericEx; @@ -212,7 +228,6 @@ $Extend* $Format+ $MidLetterEx; $Extend* $Format+ $MidNumLetEx; $Extend* $Format+ $MidNumEx; $Extend* $Format+ $KatakanaEx; -$Extend* $Format+ $ExtendNumLetEx; $Extend+ $Format* $ALetterEx; $Extend+ $Format* $NumericEx; @@ -220,11 +235,9 @@ $Extend+ $Format* $MidLetterEx; $Extend+ $Format* $MidNumLetEx; $Extend+ $Format* $MidNumEx; $Extend+ $Format* $KatakanaEx; -$Extend+ $Format* $ExtendNumLetEx; # rule 6 ($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx; # rule 11 ($MidNumEx | $MidNumLetEx) $Format* $NumericEx; -