32c09250b7
X-SVN-Rev: 8939
178 lines
9.3 KiB
Plaintext
178 lines
9.3 KiB
Plaintext
#
|
|
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
|
|
#
|
|
|
|
|
|
# Hiragana letters: characters that are both General Category Letter ([:L:])
# and in the Hiragana script ([:Hira:]).
$Hiragana = [[:L:] & [:Hira:]];
|
|
# Katakana letters: characters that are both General Category Letter ([:L:])
# and in the Katakana script ([:Kana:]).
$Katakana = [[:L:] & [:Kana:]];
|
|
|
|
#
|
|
# Definition of $Ideographic is from TR14, Line Breaking.
|
|
#
|
|
$Ideographic =
|
|
[ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
|
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
|
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
|
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
|
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
|
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
|
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
|
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
|
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
|
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
|
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
|
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
|
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
|
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
|
|
|
#
|
|
# These definitions are from the character break rules.
|
|
#
|
|
$CGJ = [\u034f]; # U+034F Combining Grapheme Joiner
|
|
# Characters that tie a following letter into the current cluster; consumed by
# $LinkSequence further down.
# NOTE(review): these look like the virama-type signs of several Indic and
# Southeast Asian scripts -- confirm against the character break rules.
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
|
# Every character whose General Category is none of the three listed below
# (the leading ^ negates the union of the three property sets).
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; # Zl: Line Separator,
|
|
# Zp: Paragraph Separator,
|
|
# Cc: General Category == Control
|
|
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
|
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
|
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
|
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
|
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
|
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
|
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
|
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
|
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
|
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
|
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
|
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
|
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
|
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
|
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
|
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
|
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
|
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
|
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
|
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
|
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
|
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
|
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
|
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
|
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
|
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
|
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
|
|
|
#
|
|
# Korean, also taken from character break rules.
|
|
#
|
|
#
|
|
# Korean Syllable Sequences
|
|
#
|
|
# Conjoining Hangul jamo, split by position within a syllable.
$L = [\u1100-\u115f]; # leading consonants
|
|
$V = [\u1160-\u11a2]; # vowels
|
|
$T = [\u11a8-\u11f9]; # trailing consonants
|
|
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
|
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
|
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
|
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
|
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
|
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
|
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
|
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
|
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
|
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
|
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
|
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
|
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
|
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
|
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
|
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
|
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
|
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
|
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
|
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
|
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
|
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
|
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
|
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
|
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
|
# $LVT is every precomposed syllable in U+AC00..U+D7A3 that is NOT listed in $LV.
# ($LV above enumerates every 28th code point starting at U+AC00.)
$LVT = [[\uac00-\ud7a3] - $LV];
|
|
# One Hangul syllable sequence that must not be broken. Two shapes are accepted:
#   1. leading jamo and/or an $LV syllable, then any vowels, then any trailing jamo;
#   2. optional leading jamo, an $LVT syllable, then any trailing jamo
#      (no vowels are accepted after an $LVT syllable).
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
|
|
|
|
|
|
#
|
|
# Thai Dictionary Related Rules
|
|
#
|
|
# Thai characters that are grouped into runs by the Thai rules that follow.
# The ranges skip U+0E2F ($paiyannoi) and U+0E46 ($maiyamok), which get their own sets.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
|
$paiyannoi = [\u0e2f]; # U+0E2F THAI CHARACTER PAIYANNOI
|
|
$maiyamok = [\u0e46]; # U+0E46 THAI CHARACTER MAIYAMOK
|
|
# The three-character sequence paiyannoi, U+0E25 (LO LING), paiyannoi.
# NOTE(review): going by the name, this is the Thai "etc." abbreviation -- confirm.
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
|
|
|
|
|
# A run of Thai dictionary characters, optionally followed by a maiyamok
# (which may itself be preceded by a paiyannoi).
$dictionary+ ($paiyannoi? $maiyamok)?;
|
|
# A run of Thai dictionary characters followed by paiyannoi. The text after the
# '/' is look-ahead context only: the boundary is placed right after paiyannoi.
# The context requires that what follows is either
#   - a character other than U+0E25, maiyamok or an $Extend mark, or
#   - U+0E25 followed by something other than paiyannoi or an $Extend mark,
# so a paiyannoi that opens the $thai_etc sequence is not matched here.
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
|
|
# The $thai_etc sequence on its own is kept together.
$thai_etc;
|
|
|
|
|
|
#
|
|
# Definitions for building up Letters, so that breaks will not occur
|
|
# within a single letter (Grapheme Cluster). See the character break rules.
|
|
#
|
|
# Characters handled by other rules: ideographs plus the two kana scripts.
$LineBreak = [$Ideographic $Hiragana $Katakana];
|
|
# Word letters: General Category Letter or Modifier Symbol ([:Sk:]), minus the
# $LineBreak characters and the Thai $dictionary characters.
$Letter = [[[:L:] [:Sk:]] & [^$LineBreak $dictionary]];
|
|
# Earlier $MidLetter definition (without U+003A COLON), kept for reference.
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
|
|
# Characters allowed between two letters without ending the word:
# U+0027 APOSTROPHE, U+2019 RIGHT SINGLE QUOTATION MARK, U+003A COLON,
# U+0029 RIGHT PARENTHESIS, U+00AD SOFT HYPHEN, U+05F3 HEBREW GERESH,
# U+05F4 HEBREW GERSHAYIM.
# NOTE(review): U+0029 is ')' -- unusual for a mid-letter character; confirm it is intended.
$MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];
|
|
|
|
# Base characters: everything except control, format, surrogate, private-use and
# unassigned code points, line/paragraph separators, and the $Extend, $Link and
# $CGJ sets defined earlier.
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
|
# Any General Category Letter; the character a $LinkSequence must end on.
$LetterBase = [:L:];
|
|
# One or more Combining Grapheme Joiners followed by a base character or a
# complete Hangul syllable sequence.
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
|
# Join controls: U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
# $LinkSequence allows at most one of these between a $Link run and the letter
# it attaches to.
# The range used to read \u200d-\u200e, which selected U+200D and U+200E
# (LEFT-TO-RIGHT MARK) and left out U+200C, contradicting the stated intent.
$Join_Control = [\u200c-\u200d]; # Zero Width Non-Joiner, Zero Width Joiner
|
|
# One or more $Link characters, any $Extend marks, an optional join control,
# and then the letter being linked.
$LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase;
|
|
# An extended letter: a $Letter or Hangul syllable sequence with its $Extend
# marks, followed by any number of link or CGJ sequences (each with its own
# trailing $Extend marks). No word break may fall inside one of these.
$LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CGJSequence) $Extend*)*);
|
|
|
|
|
|
|
|
#
|
|
# Numeric Definitions
|
|
# TODO: More complete handling of $Extend combining chars.
|
|
#
|
|
# Decimal digits (General Category Nd).
$Numeric = [:Nd:]; #TODO remove FULL WIDTH
|
|
# A digit together with any $Extend marks that follow it.
$NumericEx = $Numeric $Extend*;
|
|
# Separators allowed between digits: U+002C COMMA, U+002E FULL STOP,
# U+003A COLON, U+003B SEMICOLON, U+0589 ARMENIAN FULL STOP.
$InfixNumeric = [\u002c \u002e \u003a \u003b \u0589];
|
|
# Symbols that may directly follow a number (percent, cent, degree, per-mille,
# primes, and similar unit signs).
$PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
|
|
\u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
|
|
# Symbols that may directly precede a number: currency symbols ([:Sc:]),
# U+002B PLUS SIGN, U+005C REVERSE SOLIDUS, U+00B1 PLUS-MINUS SIGN,
# U+2116 NUMERO SIGN, U+2212 MINUS SIGN, U+2213 MINUS-OR-PLUS SIGN and
# HYPHEN-MINUS. Anything also in $PostfixNumeric is removed, so the two
# sets never overlap.
$PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
|
|
|
|
# A prefix symbol, one digit, and optionally one infix separator plus one more digit.
$NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
|
|
# A run of digits in which consecutive digits may be separated by at most one
# infix separator; the run always starts and ends on a digit.
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;
|
|
|
|
|
|
#
|
|
# The Big Rule. Gloms everything together.
|
|
#
|
|
# Shape of one word, left to right:
#   1. an optional prefixed number ($NumericPrefix);
#   2. any number of repetitions of: an optional letter group (extended letters
#      joined by single $MidLetter characters) followed by an optional digit run;
#   3. an optional closing digit run that ends in a postfix symbol.
# Every part is optional, so letters and digits may alternate freely within one word.
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
|
|
|
|
#
|
|
# Lesser rules
|
|
#
|
|
# A run of Hiragana letters, each with its trailing $Extend marks, stays together.
($Hiragana $Extend*)*;
|
|
# Likewise for a run of Katakana letters.
($Katakana $Extend*)*;
|
|
# Any single non-control character keeps its trailing $Extend marks.
$NotControl $Extend*;
|
|
# A carriage return followed by a line feed is one unit.
\r\n;
|
|
# Fallback: any single character.
.;
|
|
|
|
#
|
|
# Reverse Rules. Back up over any of the chars that can group together.
|
|
# (Reverse rules do not need to be exact; they can back up a bit too far,
|
|
# but must back up at least enough.)
|
|
#
|
|
# Each rule starting with '!' is a reverse rule (see the section header above).
#
# Back up over any run of characters that the main word rule can combine:
# letters, mid-letter characters, digits, numeric prefix/infix/postfix symbols,
# join controls, CGJ, link characters, $Extend marks and every Hangul class.
! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control |
|
|
$CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend |
|
|
$T | $V | $L | $LV | $LVT)*;
|
|
# Back up over a run of Hiragana and $Extend marks.
! ($Hiragana | $Extend)*;
|
|
# Back up over a run of Katakana and $Extend marks.
! ($Katakana | $Extend)*;
|
|
# Back up over any $Extend marks and then one more character of any kind.
! $Extend* .;
|
|
# The \r\n pair of the forward rules, written in reverse order.
! \n\r;
|
|
# Disabled catch-all reverse rule.
#!.*;
|
|
|
|
# Back up over a run of Thai characters: dictionary characters, paiyannoi,
# maiyamok and U+0E25.
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;
|