scuffed-code/icu4c/source/data/brkitr/line_th.txt

# Copyright (c) 2002-2006, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line_th.txt
#
#         Line Breaking Rules for ICU rules based break iteration.
#         Implement default line breaking as defined by Unicode TR 14.
#
#         TODO:  Rework the rules not pertaining to Thai to be based on the
#                default line break rules.  Not done yet because of interactions
#                between exact reverse rules and the Dictionary code.
#
#                These rules, in their current form, do not conform to TR-14 for
#                non-Thai breaks.
#

#
#  Character class variables.
#
#  One variable per Line_Break property value, listed alphabetically by
#  class name.  Every definition is independent of the others.
#  $CB, $SG and $XX are not referenced by name in the rules shown in this file.
#
$AI = [\p{LineBreak = AI}];
$AL = [\p{LineBreak = AL}];
$B2 = [\p{LineBreak = B2}];
$BA = [\p{LineBreak = BA}];
$BB = [\p{LineBreak = BB}];
$BK = [\p{LineBreak = BK}];
$CB = [\p{LineBreak = CB}];
$CL = [\p{LineBreak = CL}];
$CM = [\p{LineBreak = CM}];
$CR = [\p{LineBreak = CR}];
$EX = [\p{LineBreak = EX}];
$GL = [\p{LineBreak = GL}];
$HY = [\p{LineBreak = HY}];
$ID = [\p{LineBreak = ID}];
$IN = [\p{LineBreak = IN}];
$IS = [\p{LineBreak = IS}];
$LF = [\p{LineBreak = LF}];
$NS = [\p{LineBreak = NS}];
$NU = [\p{LineBreak = NU}];
$OP = [\p{LineBreak = OP}];
$PO = [\p{LineBreak = PO}];
$PR = [\p{LineBreak = PR}];
$QU = [\p{LineBreak = QU}];
$SA = [\p{LineBreak = SA}];
$SG = [\p{LineBreak = SG}];
$SP = [\p{LineBreak = SP}];
$SY = [\p{LineBreak = SY}];
$XX = [\p{LineBreak = XX}];
$ZW = [\p{LineBreak = ZW}];

#
#  Line_Break classes used only by the Korean syllable rules (LB 18b).
#
$H2 = [\p{LineBreak = H2}];
$H3 = [\p{LineBreak = H3}];
$JL = [\p{LineBreak = JL}];
$JT = [\p{LineBreak = JT}];
$JV = [\p{LineBreak = JV}];

#
#  Characters with Grapheme_Cluster_Break = Extend.  Most of the "...cm"
#  variables absorb a trailing run of these rather than of $CM.
#
$Extend = [\p{Grapheme_Cluster_Break = Extend}];


#
#  Thai Dictionary related definitions and rules
#

#  $dictionary covers U+0E01-0E2E, U+0E30-0E3A, U+0E40-0E44 and U+0E47-0E4E.
#  Within the U+0E01-0E4E span it leaves out U+0E2F and U+0E46 (defined
#  separately just below) and also U+0E3B-0E3F and U+0E45.
#  NOTE(review): the variable name is presumably significant to the ICU rule
#  compiler (marking these characters for dictionary-based breaking) - confirm
#  against the ICU break-rule documentation before renaming it.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];  # this rule breaks the iterator with mixed Thai and English
#  U+0E2F.  May trail a word (see the final rules) and brackets U+0E25 in $thai_etc.
$paiyannoi  = [\u0e2f];
#  U+0E46.  Accepted as one of the alternatives in $Closings.
$maiyamok   = [\u0e46];
#  The three-character sequence U+0E2F U+0E25 U+0E2F, matched as one unit by $ThaiRange.
$thai_etc   = $paiyannoi \u0e25 $paiyannoi;


#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width) and
#                               SA  (South East Asian: Thai, Lao, Khmer) as $AL  (Alphabetic)
#
#  The characters in $dictionary are subtracted from $SA here, so they are NOT
#  folded into the alphabetic class; they are matched through $ThaiRange instead.
#
$ALPlus = $AL | $AI | [$SA - $dictionary];

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
#                     TODO:  This is going to produce some odd results, because of the non-combining
#                            chars that are included in $CM.  Use $Extend instead, where possible.
#
#  Each "...cm" variable is a base class followed by any number of trailing marks.
#  Only $ALcm and $IDcm absorb $CM*; all the others absorb $Extend*.
#  $CL, $EX, $IS, $SY and $PR have no "...cm" form here; the rules use them bare.
#
$ALcm = $ALPlus $CM*;
$IDcm = $ID $CM*;
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$SPcm = $SP $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;


#  New Lines.  Always break after, never break before.
#              Rule LB 3
#
#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
#              Because we never break before these things, $Endings
#              appears at the end of line break rule.
#
#  $NLF is a single $BK, $CR or $LF, or the two-character pair $CR $LF.
$NLF = $BK | $CR | $LF | $CR $LF;
#  Every part of $Endings is optional, so it can match the empty string.
$Endings = $SPcm* $ZW* $NLF?;
#  $EndingsMandatory always consumes something: optional spaces followed either
#  by a new line, or by exactly one $ZW and then an optional new line.
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;


#
#  Openings  Sequences that can precede Words, and that should not be separated from them.
#            Rules LB 9, 10
#
#  Zero or more repetitions of: an optional quote (with trailing spaces), then
#  an opening punctuation character, then any trailing spaces.
#
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;

#
#  Closings  Sequences that follow words, and that should not be separated from them,
#            Rule LB 8, 11, 15
#
#  Zero or more repetitions of any one of:
#     - optional spaces, then $CL (optionally followed by spaces and an $NScm),
#       $EX, $IS or $SY, then any trailing $Extend characters
#     - $BAcm
#     - $HYcm
#     - $NScm
#     - $maiyamok
#  Only the first alternative accepts leading spaces.
#
$Closings =  ($SPcm*( ($CL ($SPcm* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm | $maiyamok)*;

#
#  Words.  Includes mixed Alpha-numerics.
#          Rules 11a, 16, 17, 19, more or less.
#
#  $NumberInterior : a single $IDcm, or one or more of $NUcm, $ALcm or "$IS $NUcm".
#  $Number         : optional $PR, optional opening punctuation or hyphen, the
#                    interior, then an optional $CL and an optional $POcm.
#  $Word           : a single $IDcm or a run of letters/digits, optionally followed
#                    by one $POcm or one $INcm.
#  $Dashes         : zero or more $B2cm, each with trailing spaces; it can match
#                    the empty string.
#  $ThaiRange      : a run of $dictionary characters, or the $thai_etc sequence.
#  $WordLikeThing  : any one of the four definitions above.
#
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number     18
$Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));       # Alpha-numeric.   16, 17
$Dashes         = (($B2cm $SPcm*)*);                                    # Dashes           11a
$ThaiRange      = $dictionary+ | $thai_etc;
$WordLikeThing  = $Number | $Word | $Dashes | $ThaiRange;


#
#  $Word15 has three alternatives:
#     1. any number of $BBcm, an optional $WordLikeThing, then any number of
#        $BAcm / $HYcm / $NScm.  Every part is optional, so this alternative
#        can match the empty string.
#     2. one character that is not a control character ([:Cc:]) and not
#        $BK $CR $LF $ZW $SP $GL, followed by any $Extend characters.
#     3. one character that is not $BK $CR $LF $ZW $SP $GL, with no [:Cc:]
#        exclusion and no trailing $Extend.
#
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) |     # Rule 15. Stuff sticks around words.
          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |             # Allow characters that don't meet the
          [^$BK $CR $LF $ZW $SP $GL ];                              #  more elaborate definitions for WORD to be glued.


#  $GluedWord is an optional leading glue or quote character, one $Word15, then
#  any number of further $Word15 items, each introduced by a glue or quote.
$GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                    # Rules 13, 14

#
#  The actual rules, a combination of everything defined above.
#
#  All three rules share the prefix  $Openings $GluedWord $Closings  and differ
#  only in what follows it:
#     1. an optional $paiyannoi and then a mandatory ending (new line and/or $ZW).
#     2. the optional $Endings; no $paiyannoi is allowed here.
#     3. a required $paiyannoi followed by "/" and the context after it: either one
#        character that is neither U+0E25 nor $Extend, or U+0E25 followed by a
#        character that is neither $paiyannoi nor $Extend.  In other words, a
#        $paiyannoi that does not begin the $thai_etc sequence.
#        NOTE(review): "/" is taken to be the ICU look-ahead marker (boundary placed
#        at the "/", following text is context only) - confirm in the ICU rule docs.
#
$Openings $GluedWord  $Closings $paiyannoi? $EndingsMandatory;
$Openings $GluedWord  $Closings  $Endings;

$Openings $GluedWord  $Closings $paiyannoi   /
               ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);


 #  The two commented-out lines below appear to be an older-syntax form of the
 #  $paiyannoi rule above, kept for reference.
 #"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
 #                       + "\u0e25[^$paiyannoi$_ignore_]);"

#
# LB 18b.  Do not break a Korean syllable
#
#  One rule per possible starting class.  Each rule consumes the starting
#  class, then the classes allowed to follow it, then any trailing $Extend:
#     $JL+  followed by  $JV*  $JT*
#     $JV+  followed by  $JT*
#     $JT+
#     $H2   followed by  $JV*  $JT*
#     $H3   followed by  $JT*
#  $JL, $JV and $JT may repeat at the start; $H2 and $H3 match exactly one character.
#
$JL+ $JV* $JT* $Extend*;
$JV+ $JT* $Extend*;
$JT+ $Extend*;
$H2 $JV* $JT* $Extend*;
$H3 $JT* $Extend*;

#
#  Reverse Rules.
#
#     Back up to a hard break or a space that will cause a boundary.
#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
#     containing a space that may inhibit a break from occurring.
#
#     NOTE(review): the sequences below look like they are written in reverse
#     text order, e.g. "$LF $CR" for a CR LF pair; the leading "!" is taken to
#     be the ICU marker for a reverse rule - confirm in the ICU rule docs.
#
#  $SpaceGlue, as written: one of $ZW $CL $IS $NS $OP followed by a space
#  (with any $Extend before it), or one or more such spaces followed by $OP.
$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
#  $ClumpingChars is any character other than a space or a hard line break ($BK $CR $LF).
$ClumpingChars = [^$SP $BK $CR $LF];

#  Two arbitrary characters, a run of $ClumpingChars, any number of
#  "$SpaceGlue then $ClumpingChars" groups, and finally an optional single
#  character or an $LF $CR pair.
!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR)?;