# Copyright (c) 2002-2005, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#  Line Breaking Rules for ICU rules based break iteration.
#  Implement default line breaking as defined by Unicode TR 14.
#
#  TODO:  Rework the rules not pertaining to Thai to be based on the
#         default line break rules.  Not done yet because of interactions
#         between exact reverse rules and the Dictionary code.
#
#         These rules, in their current form, do not conform to TR-14 for
#         non-Thai breaks.
#

#
#  Character classes.  One variable per Unicode LineBreak property value.
#
$LF = [\p{LineBreak = LF}];
$IN = [\p{LineBreak = IN}];
$SY = [\p{LineBreak = SY}];
$EX = [\p{LineBreak = EX}];
$BA = [\p{LineBreak = BA}];
$IS = [\p{LineBreak = IS}];
$BB = [\p{LineBreak = BB}];
$SA = [\p{LineBreak = SA}];
$CB = [\p{LineBreak = CB}];
$XX = [\p{LineBreak = XX}];
$HY = [\p{LineBreak = HY}];
$AI = [\p{LineBreak = AI}];
$ZW = [\p{LineBreak = ZW}];
$SG = [\p{LineBreak = SG}];
$AL = [\p{LineBreak = AL}];
$OP = [\p{LineBreak = OP}];
$BK = [\p{LineBreak = BK}];
$PO = [\p{LineBreak = PO}];
$NS = [\p{LineBreak = NS}];
$CL = [\p{LineBreak = CL}];
$NU = [\p{LineBreak = NU}];
$CM = [\p{LineBreak = CM}];
$PR = [\p{LineBreak = PR}];
$B2 = [\p{LineBreak = B2}];
$ID = [\p{LineBreak = ID}];
$SP = [\p{LineBreak = SP}];
$QU = [\p{LineBreak = QU}];
$CR = [\p{LineBreak = CR}];
$GL = [\p{LineBreak = GL}];
$JL = [\p{LineBreak = JL}];
$JV = [\p{LineBreak = JV}];
$JT = [\p{LineBreak = JT}];
$H2 = [\p{LineBreak = H2}];
$H3 = [\p{LineBreak = H3}];

# Characters with Grapheme_Cluster_Break = Extend.  Appended (as $Extend*) to
# most of the classes below so that trailing extending characters stay
# attached to the character they follow.
$Extend = [\p{Grapheme_Cluster_Break = Extend}];

#
#  Thai Dictionary related definitions and rules
#
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];

# this rule breaks the iterator with mixed Thai and English
$paiyannoi = [\u0e2f];
$maiyamok  = [\u0e46];
$thai_etc  = $paiyannoi \u0e25 $paiyannoi;

#
#  Rule LB1.  By default, treat AI (characters with ambiguous east Asian width) and
#             SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
#
$ALPlus = $AL | $AI | [$SA - $dictionary];

#
#  Combining Marks.  X $CM* behaves as if it were X.  Rule LB6.
#  TODO:  This is going to produce some odd results, because of the non-combining
#         chars that are included in $CM.  Use $Extend instead, where possible.
#
$ALcm = $ALPlus $CM*;
$IDcm = $ID $CM*;
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$SPcm = $SP $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;

#  New Lines.  Always break after, never break before.
#              Rule LB 3
#
#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
#              Because we never break before these things, $Endings
#              appears at the end of line break rule.
#
$NLF              = $BK | $CR | $LF | $CR $LF;
$Endings          = $SPcm* $ZW* $NLF?;
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;

#
#  Openings    Sequences that can precede Words, and that should not be separated from them.
#              Rules LB 9, 10
#
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;

#
#  Closings    Sequences that follow words, and that should not be separated from them,
#              Rule LB 8, 11, 15
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;

#
#  Words.      Includes mixed Alpha-numerics.
#              Rules 11a, 16, 17, 19, more or less.
#
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
$Number         = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number 18
$Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));      # Alpha-numeric. 16, 17
$Dashes         = (($B2cm $SPcm*)*);                                   # Dashes 11a
$ThaiRange      = $dictionary+ | $thai_etc;
$WordLikeThing  = $Number | $Word | $Dashes | $ThaiRange;
$Word15         = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) |   # Rule 15. Stuff sticks around words.
                  [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* |            # Allow characters that don't meet the
                  [^$BK $CR $LF $ZW $SP $GL ];                            # more elaborate definitions for WORD to be glued.

$GluedWord      = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;    # "Glue" will stick anything below it together.
                                                                          # Rules 13, 14

#
#  The actual rules, a combination of everything defined above.
#
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
$Openings $GluedWord $Closings $Endings;
$Openings $GluedWord $Closings $paiyannoi / ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);

#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
#     + "\u0e25[^$paiyannoi$_ignore_]);"

#
#  LB 18b.  Do not break a Korean syllable
#
$JL+ $JV* $JT* $Extend*;
$JV+ $JT* $Extend*;
$JT+ $Extend*;
$H2 $JV* $JT* $Extend*;
$H3 $JT* $Extend*;

#
#  Reverse Rules.
#
#  Back up to a hard break or a space that will cause a boundary.
#  Not all spaces cause line breaks.  $SpaceGlue represents a sequence
#  containing a space that may inhibit a break from occurring.
#
$SpaceGlue     = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
$ClumpingChars = [^$SP $BK $CR $LF];

!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);