#
#   Copyright (C) 2002-2003,
#       International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on Version 4.0.0, dated 2003-04-17
#
#   Layout of this file:
#      1. rule-builder options and character class definitions
#      2. !!forward       - rules for finding the next boundary
#      3. !!reverse       - mirror-image rules for finding the previous boundary
#      4. !!safe_reverse  - rules for backing up to a safe starting point
#      5. !!safe_forward  - rules for moving forward to a safe starting point
#
#   NOTE: '#' starts a comment that runs to the end of the line, so every
#   statement must stay on its own line(s), separate from any comment.
#

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Rule-builder options.  !!chain turns on rule chaining; !!LBCMNoChain
# restricts chaining around Line Break "Combining Mark" characters
# (see the ICU break-rule documentation for the exact semantics).
!!chain;
!!LBCMNoChain;

# Katakana script plus the prolonged / voiced sound marks used with it.
$Katakana     = [[:Script = KATAKANA:]
                 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
                 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];

# Alphabetic characters (plus Hebrew geresh), minus the scripts that get
# their own handling: ideographs, Katakana, Thai, Lao and Hiragana.
$ALetter      = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
                 - [:Ideographic:]
                 - $Katakana
                 - [:Script = Thai:]
                 - [:Script = Lao:]
                 - [:Script = Hiragana:]];

# $ALetter split by Grapheme_Extend: base letters vs. letters that are
# themselves combining marks.
$ABaseLetter  = [$ALetter - [:Grapheme_Extend = TRUE:]];
$ACMLetter    = [$ALetter & [:Grapheme_Extend = TRUE:]];

# Punctuation that may appear inside a word (between letters).
$MidLetter    = [[:name = APOSTROPHE:]
                 [:name = MIDDLE DOT:]
                 [:name = HEBREW PUNCTUATION GERSHAYIM:]
                 [:name = RIGHT SINGLE QUOTATION MARK:]
                 [:name = HYPHENATION POINT:]];

# Punctuation that may appear between letters or between digits.
$MidNumLet    = [[:name = FULL STOP:] [:name = COLON:]];

# Punctuation that may appear between digits only.
$MidNum       = [[:LineBreak = Infix_Numeric:] - $MidNumLet];

$Numeric      = [:LineBreak = Numeric:];

#
#  Character Class Definitions.
#     The names are those from TR29.
#
$CR           = \u000d;
$LF           = \u000a;
$Control      = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend       = [[:Grapheme_Extend = TRUE:]];
$Format       = [[:Cf:]];
$Hiragana     = [:Hiragana:];
$Ideographic  = [:IDEOGRAPHIC:];


## -------------------------------------------------
#
#  Forward rules.
#
#  A trailing {nnn} is the rule status tag reported for a boundary produced
#  by that rule.  The values used here fall in ICU's UWordBreak ranges:
#  100 = number, 200 = letter, 300 = kana, 400 = ideographic.
#
!!forward;

$CR $LF;

# rule 3 and 4
#   "Ex" variants: a character of the class followed by any number of
#   Grapheme_Extend characters.
$ALetterEx     = $ALetter     $Extend*;
$ABaseLetterEx = $ABaseLetter $Extend*;
$ACMLetterEx   = $ACMLetter   $Extend*;
$NumericEx     = $Numeric     $Extend*;
$MidNumEx      = $MidNum      $Extend*;
$MidNumLetEx   = $MidNumLet   $Extend*;
$MidLetterEx   = $MidLetter   $Extend*;
$KatakanaEx    = $Katakana    $Extend*;

# see character breaks
[^$Control] $Extend*;

# rule 5
$ALetterEx ($Format* $ALetterEx)* {200};

# rule 6 and 7
#   A letter following mid-word punctuation is either a base letter, or a
#   combining-mark letter separated from the punctuation by a Format char.
$MidALetterEx  = ($ABaseLetterEx | $Format $ACMLetterEx);
$ALetterSeq    = $ALetterEx    ( $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx )*;
$MidALetterSeq = $MidALetterEx ( $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx )*;

# rule 8
$NumericEx ($Format* $NumericEx)* {100};

# rule 9
$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};

# rule 10
$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};

# rule 11 and 12
$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};

# rule 13
$KatakanaEx ($Format* $KatakanaEx)* {300};
$Hiragana $Extend* {300};
$Ideographic $Extend* {400};


## -------------------------------------------------
#
#  Reverse rules.
#
#  These mirror the forward rules with the operands written in the opposite
#  order: each "Back...Ex" class puts the $Extend* run before its base
#  character.
#
!!reverse;

$BackALetterEx     = $Extend* $ALetter;
$BackABaseLetterEx = $Extend* $ABaseLetter;
$BackACMLetterEx   = $Extend* $ACMLetter;
$BackNumericEx     = $Extend* $Numeric;
$BackMidNumEx      = $Extend* $MidNum;
$BackMidNumLetEx   = $Extend* $MidNumLet;
$BackMidLetterEx   = $Extend* $MidLetter;
$BackKatakanaEx    = $Extend* $Katakana;

$LF $CR;

# see character breaks
$Extend* [^$Control];

# rule 5
#   The second form ends on a combining-mark letter and uses "/ $Control":
#   the '/' marks the boundary position, $Control after it is context only.
($BackALetterEx $Format*)* $BackABaseLetterEx;
($BackALetterEx $Format*)* $BackACMLetterEx / $Control;

# rule 6 and 7
$BackMidALetterEx  = ($BackABaseLetterEx | $BackACMLetterEx $Format);
$BackALetterSeq    = ( $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* )* $BackABaseLetterEx;
$BackMidALetterSeq = ( $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* )* $BackMidALetterEx;

# rule 8
$BackNumericEx $Format* $BackNumericEx;

# rule 9
#   (Mirror of forward rule 9; this comment previously read "rule 10",
#   duplicating the label of the rule that follows.)
(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
# to handle letter sequences ending with a combining mark
(($BackNumericEx | $BackMidALetterSeq) $Format*)* ( $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* )* $BackACMLetterEx / $Control;

# rule 10
($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;

# rule 11 and 12
$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;

# rule 13
$BackKatakanaEx $Format* $BackKatakanaEx;


## -------------------------------------------------
#
#  Safe reverse rules: back up over sequences inside which the main rules
#  could not reliably start.
#
!!safe_reverse;

# rule 3
$Extend+ [^$Extend];

# rule 4
$Format+ $BackABaseLetterEx;
$Format+ $BackACMLetterEx / $Control;
$Format+ $BackNumericEx;
$Format+ $BackMidLetterEx;
$Format+ $BackMidNumLetEx;
$Format+ $BackMidNumEx;
$Format+ $BackKatakanaEx;

# rule 6
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;

# rule 11
($MidNum | $MidNumLet) $Format* $BackNumericEx;


## -------------------------------------------------
#
#  Safe forward rules: the forward-direction counterpart of the
#  safe_reverse section.
#
!!safe_forward;

# rule 3
$Extend+;

# rule 4
$Format+ $ALetterEx;
$Format+ $NumericEx;
$Format+ $MidLetterEx;
$Format+ $MidNumLetEx;
$Format+ $MidNumEx;
$Format+ $KatakanaEx;

# rule 6
($MidLetter | $MidNumLet) $Format* $ALetterEx;

# rule 11
($MidNum | $MidNumLet) $Format* $NumericEx;