32c09250b7
X-SVN-Rev: 8939
131 lines
6.9 KiB
Plaintext
131 lines
6.9 KiB
Plaintext
#
|
|
# Character Break Rules, also known as Grapheme Cluster Boundaries
|
|
# See Unicode Technical Report #29.
|
|
# These rules are based on the proposed draft dated 2001-03-11
|
|
#
|
|
#
|
|
|
|
|
|
#
|
|
# Character Class Definitions.
|
|
# The names are those from TR29.
|
|
#
|
|
$CR = \r;
|
|
$LF = \n;
|
|
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
|
|
#Paragraph Separtor,
|
|
# General Category == Control
|
|
|
|
$CGJ = [\u034f]; #Combining Grapheme Joiner
|
|
$Join_Control = [\u200d-\u200e]; # Zero Width Joiner, Zero Width Non-Joiner
|
|
|
|
#
|
|
# Grapheme_Link, Grapheme_Extend, Grapheme_Base as determined by the UCD.
|
|
# See http://www.unicode.org/Public/UNIDATA/PropList.txt
|
|
#
|
|
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
|
|
|
|
|
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
|
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
|
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
|
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
|
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
|
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
|
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
|
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
|
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
|
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
|
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
|
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
|
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
|
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
|
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
|
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
|
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
|
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
|
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
|
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
|
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
|
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
|
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
|
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
|
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
|
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
|
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
|
|
|
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
|
|
|
$LetterBase = [:L:];
|
|
|
|
#
|
|
# Korean Syllable Sequences
|
|
#
|
|
$L = [\u1100-\u115f];
|
|
$V = [\u1160-\u11a2];
|
|
$T = [\u11a8-\u11f9];
|
|
|
|
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
|
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
|
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
|
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
|
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
|
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
|
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
|
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
|
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
|
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
|
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
|
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
|
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
|
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
|
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
|
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
|
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
|
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
|
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
|
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
|
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
|
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
|
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
|
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
|
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
|
$LVT = [[\uac00-\ud7a3] - $LV];
|
|
|
|
$Hangul_Sequence = ($L* $LV? $V* $T* ) | ($L* $LVT $T*);
|
|
|
|
#
|
|
# Do not break between linking characters and letters, or before linking characters.
|
|
# THis provides for Indic graphemes, where virama (halant) will link character
|
|
# clusters together.
|
|
#
|
|
$LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase;
|
|
|
|
#
|
|
# Do not break around a Combining Grapheme Joiner
|
|
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
|
|
|
# Do not break between a CR and LF.
|
|
$CR $LF;
|
|
|
|
#
|
|
# Here are the main rules. $NotControl is what matches most ordinary characters.
|
|
#
|
|
($NotControl | $Hangul_Sequence) $Extend* (($LinkSequence | $CGJSequence) $Extend*)*;
|
|
(($LinkSequence | $CGJSequence) $Extend*)*;
|
|
|
|
|
|
# Otherwise break after every character.
|
|
# This matches control chars, which do not match the main rules.
|
|
#
|
|
.;
|
|
|
|
|
|
#
|
|
# Reverse Rules, find a safe point to back up to.
|
|
#
|
|
! [^$LetterBase]* $LetterBase ([^$LetterBase]* $Link+ [^$LetterBase]* $LetterBase)*;
|
|
! $Extend* ($LVT | ($T* $V* $LV?) $L*);
|
|
! $Extend* .;
|
|
|