#
#   word.txt    Word Breaking Rules for ICU Rules Based Break Iterator.
#
#   Statements end with ';'.  A '#' starts a comment that runs to the end
#   of the physical line, so every definition below sits on its own line(s).
#

#
#   Kana letter classes: general category Letter ([:L:]) intersected with
#   the [:Hira:] / [:Kana:] sets.
#
$Hiragana    = [[:L:] & [:Hira:]];
$Katakana    = [[:L:] & [:Kana:]];

#
#   Definition of $Ideographic is from TR14, Line Breaking.
#   The set is wrapped in rows purely for readability; members and their
#   order are unchanged.
#
$Ideographic = [
    \u1100-\u1159 \u115F
    \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
    \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029 \u3030-\u303A \u303D-\u303F
    \u3042 \u3044 \u3046 \u3048 \u304A-\u3062 \u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
    \u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4 \u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
    \u3105-\u312C \u3131-\u318E \u3190-\u31B7
    \u3200-\u321C \u3220-\u3243 \u3251-\u327B \u327F-\u32CB \u32D0-\u32FE
    \u3300-\u3376 \u337B-\u33DD \u33E0-\u33FE
    \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6 \uAC00-\uD7A3
    \uF900-\uFA2D \uFA30-\uFA6A
    \uFE30-\uFE34 \uFE45-\uFE46 \uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B
    \uFF02-\uFF03 \uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E \uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E
    \uFFE2-\uFFE4
    \U00020000-\U0002A6D6 \U0002F800-\U0002FA1D
];

#
#   These definitions are from the character break rules.
#
$CGJ         = [\u034f];                    # Combining Grapheme Joiner
$Link        = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD
                \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
$NotControl  = [^[:Zl:] [:Zp:] [:Cc:]];     # Everything except:
                                            #   Zl  Line Separator,
                                            #   Zp  Paragraph Separator,
                                            #   Cc  General Category == Control

#   Combining characters that extend the preceding base character.
#   Members and their order are unchanged; they are only wrapped in rows
#   (roughly one row per code-point block) for readability.
$Extend      =   # From UNIDATA/DerivedCoreProperties.txt
    [
     \u0300-\u034E \u0360-\u036F
     \u0483-\u0486 \u0488-\u0489
     \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4
     \u064B-\u0655 \u0670 \u06D6-\u06DC \u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED
     \u0711 \u0730-\u074A
     \u07A6-\u07B0
     \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948 \u0949-\u094C \u0951-\u0954 \u0962-\u0963
     \u0981 \u0982-\u0983 \u09BC \u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
     \u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C \u0A70-\u0A71
     \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5 \u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC
     \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E \u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
     \u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
     \u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C \u0C55-\u0C56
     \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6 \u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6
     \u0D02-\u0D03 \u0D3E-\u0D40 \u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57
     \u0D82-\u0D83 \u0DCF-\u0DD1 \u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3
     \u0E31 \u0E34-\u0E39 \u0E47-\u0E4E
     \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD
     \u0F18-\u0F19 \u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84 \u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6
     \u102C \u102D-\u1030 \u1031 \u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059
     \u1712-\u1714 \u1732-\u1734 \u1752-\u1753 \u1772-\u1773
     \u17B4-\u17B6 \u17B7-\u17BD \u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3
     \u180B-\u180D \u18A9
     \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
     \u302A-\u302F \u3099-\u309A
     \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
     \U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 \U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD
    ];

#
#   Korean, also taken from character break rules.
#
#
#   Korean Syllable Sequences
#
#   $L, $V and $T are conjoining jamo ranges.  $LV lists every 28th code
#   point of the precomposed block starting at U+AC00; $LVT is the rest of
#   that block.
#
$L   = [\u1100-\u115f];
$V   = [\u1160-\u11a2];
$T   = [\u11a8-\u11f9];

$LV  = [
    \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4
    \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
    \uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84
    \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
    \uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044
    \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
    \ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204
    \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
    \ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4
    \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
    \ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584
    \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
    \ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744
    \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
    \ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904
    \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
    \uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4
    \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
    \ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84
    \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
    \ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44
    \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
    \ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004
    \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
    \uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4
    \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
    \uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384
    \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
    \uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544
    \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
    \uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704
    \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
    \uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4
    \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
    \uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84
    \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
    \ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44
    \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
    \ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04
    \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
    \ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4
    \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
    \ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184
    \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
    \ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344
    \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
    \ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504
    \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
    \ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4
    \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788
];

$LVT = [[\uac00-\ud7a3] - $LV];

#   A Hangul sequence is either jamo/LV based (optional leading $L, an
#   optional or required $LV, then any $V and $T) or built around an $LVT
#   syllable (leading $L, the syllable, trailing $T).
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);

#
#   Thai Dictionary Related Rules
#
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];   # this rule breaks the iterator with mixed Thai and English
$paiyannoi  = [\u0e2f];
$maiyamok   = [\u0e46];
$thai_etc   = $paiyannoi \u0e25 $paiyannoi;

#   Run of dictionary characters, optionally followed by (paiyannoi) maiyamok.
$dictionary+ ($paiyannoi? $maiyamok)?;
#   Run of dictionary characters plus a paiyannoi, matched only when the
#   text after the '/' follows (i.e. the paiyannoi is not the start of a
#   $thai_etc or maiyamok sequence).
#   NOTE(review): '/' is presumably the RBBI look-ahead break marker --
#   confirm against the rule-syntax documentation.
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
$thai_etc;

#
#   Definitions for building up Letters, so that breaks will not occur
#   within a single letter (Grapheme Cluster).  See the character break rules.
#
#   $LineBreak collects the characters that are NOT glued into words by the
#   letter rules; $Letter is every Letter/Modifier-Symbol character outside
#   that set and outside the Thai $dictionary set.
$LineBreak      = [$Ideographic $Hiragana $Katakana];
$Letter         = [[[:L:] [:Sk:]] & [^$LineBreak $dictionary]];

#   Characters allowed between two letters without ending the word:
#   U+0027 apostrophe, U+2019 right single quote, U+003A colon,
#   U+0029, U+00AD soft hyphen, U+05F3 geresh, U+05F4 gershayim.
#   The commented-out line is the previous definition (without U+003A).
#   NOTE(review): U+0029 is RIGHT PARENTHESIS, which looks out of place in
#   this list -- confirm it is intended before changing it.
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
$MidLetter      = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];

#   Anything that can act as a base character after a CGJ: not a control,
#   format, surrogate, private-use, unassigned, line/paragraph separator,
#   and not itself an $Extend, $Link or $CGJ character.
$Base           = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
$LetterBase     = [:L:];
$CGJSequence    = $CGJ+ ($Base | $Hangul_Sequence);

#   U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
#   (The range must be 200C-200D; 200D-200E would drop ZWNJ and pick up
#   U+200E LEFT-TO-RIGHT MARK instead.)
$Join_Control   = [\u200c-\u200d];

#   A link (virama-like) character, optional combining marks, an optional
#   join control, then the letter that is linked on.
$LinkSequence   = $Link+ $Extend* $Join_Control? $LetterBase;

#   One full "letter": a base letter or Hangul sequence, its combining
#   marks, and any number of linked / CGJ-joined continuations.
$LetterEx       = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CGJSequence) $Extend*)*);

#
#   Numeric Definitions
#       TODO:  More complete handling of $Extend combining chars.
#
$Numeric         = [:Nd:];    #TODO remove FULL WIDTH
$NumericEx       = $Numeric $Extend*;
#   U+002C comma, U+002E full stop, U+003A colon, U+003B semicolon,
#   U+0589 Armenian full stop.
$InfixNumeric    = [\u002c \u002e \u003a \u003b \u0589];
$PostfixNumeric  = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7 \u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
#   Currency symbols and sign-like characters, minus anything already
#   classified as a postfix.
$PrefixNumeric   = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
$NumericPrefix   = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;

#
#   The Big Rule.  Gloms everything together:
#   optional numeric prefix, any mix of letter runs (letters joined by
#   $MidLetter) and digit runs, optional trailing number with a postfix.
#
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;

#
#   Lesser rules
#
($Hiragana $Extend*)*;
($Katakana $Extend*)*;
$NotControl $Extend*;
\r\n;
.;

#
#   Reverse Rules.  Back up over any of the chars that can group together.
#       (Reverse rules do not need to be exact; they can back up a bit too far,
#       but must back up at least enough.)
#   Each reverse rule is introduced by '!'.
#
! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control | $CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend | $T | $V | $L | $LV | $LVT)*;
! ($Hiragana | $Extend)*;
! ($Katakana | $Extend)*;
! $Extend* .;
! \n\r;
#!.*;
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;