98b4e4f7a2
X-SVN-Rev: 19298
174 lines
5.4 KiB
Plaintext
174 lines
5.4 KiB
Plaintext
# Copyright (c) 2002-2006, International Business Machines Corporation and
|
|
# others. All Rights Reserved.
|
|
#
|
|
# file: line.txt
|
|
#
|
|
# Line Breaking Rules for ICU rules based break iteration.
|
|
# Implement default line breaking as defined by Unicode TR 14.
|
|
#
|
|
# TODO: Rework the rules not pertaining to Thai to be based on the
|
|
# default line break rules. Not done yet because of interactions
|
|
# between exact reverse rules and the Dictionary code.
|
|
#
|
|
# These rules, in their current form, do not conform to TR-14 for
|
|
# non-Thai breaks.
|
|
#
|
|
|
|
# Character classes, one per Unicode Line_Break property value.
# Each variable is the set of all characters that carry the named value; the
# variable name is the two-letter property value abbreviation used by TR 14.
$LF = [\p{LineBreak = LF}];  # Line Feed
|
|
$IN = [\p{LineBreak = IN}];  # Inseparable
|
|
$SY = [\p{LineBreak = SY}];  # Symbols Allowing Break After
|
|
$EX = [\p{LineBreak = EX}];  # Exclamation / Interrogation
|
|
$BA = [\p{LineBreak = BA}];  # Break After
|
|
$IS = [\p{LineBreak = IS}];  # Infix Numeric Separator
|
|
$BB = [\p{LineBreak = BB}];  # Break Before
|
|
$SA = [\p{LineBreak = SA}];  # Complex Context (South East Asian)
|
|
$CB = [\p{LineBreak = CB}];  # Contingent Break Opportunity
|
|
$XX = [\p{LineBreak = XX}];  # Unknown
|
|
$HY = [\p{LineBreak = HY}];  # Hyphen
|
|
$AI = [\p{LineBreak = AI}];  # Ambiguous (Alphabetic or Ideographic)
|
|
$ZW = [\p{LineBreak = ZW}];  # Zero Width Space
|
|
$SG = [\p{LineBreak = SG}];  # Surrogate
|
|
$AL = [\p{LineBreak = AL}];  # Alphabetic
|
|
$OP = [\p{LineBreak = OP}];  # Open Punctuation
|
|
$BK = [\p{LineBreak = BK}];  # Mandatory Break
|
|
$PO = [\p{LineBreak = PO}];  # Postfix Numeric
|
|
$NS = [\p{LineBreak = NS}];  # Nonstarter
|
|
$CL = [\p{LineBreak = CL}];  # Close Punctuation
|
|
$NU = [\p{LineBreak = NU}];  # Numeric
|
|
$CM = [\p{LineBreak = CM}];  # Combining Mark
|
|
$PR = [\p{LineBreak = PR}];  # Prefix Numeric
|
|
$B2 = [\p{LineBreak = B2}];  # Break Opportunity Before and After
|
|
$ID = [\p{LineBreak = ID}];  # Ideographic
|
|
$SP = [\p{LineBreak = SP}];  # Space
|
|
$QU = [\p{LineBreak = QU}];  # Quotation
|
|
$CR = [\p{LineBreak = CR}];  # Carriage Return
|
|
$GL = [\p{LineBreak = GL}];  # Non-breaking ("Glue")
|
|
|
|
# Line_Break classes for Korean (Hangul). These are consumed only by the
# "Do not break a Korean syllable" rules further down.
$JL = [\p{LineBreak = JL}];  # Hangul L Jamo (leading consonant)
|
|
$JV = [\p{LineBreak = JV}];  # Hangul V Jamo (vowel)
|
|
$JT = [\p{LineBreak = JT}];  # Hangul T Jamo (trailing consonant)
|
|
$H2 = [\p{LineBreak = H2}];  # Hangul LV syllable
|
|
$H3 = [\p{LineBreak = H3}];  # Hangul LVT syllable
|
|
|
|
|
|
# Characters whose Grapheme_Cluster_Break property is Extend. Most of the
# "...cm" definitions below absorb a trailing run of these ($Extend*) so that
# a base character and its extenders are treated as one unit.
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
|
|
|
|
|
#
|
|
# Thai Dictionary related definitions and rules
|
|
#
|
|
|
|
# $dictionary covers U+0E01-0E2E, U+0E30-0E3A, U+0E40-0E44 and U+0E47-0E4E.
# The two code points skipped inside that span, U+0E2F and U+0E46, are
# defined separately below as $paiyannoi and $maiyamok.
# NOTE(review): the header TODO mentions interactions with "the Dictionary
# code"; the name $dictionary may be significant to the rule compiler --
# confirm before renaming it.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
|
# U+0E2F. Optionally accepted after $Closings in the forward rules, and the
# anchor of the look-ahead rule there.
$paiyannoi = [\u0e2f];
|
|
# U+0E46. Accepted as one of the alternatives in $Closings.
$maiyamok = [\u0e46];
|
|
# The three-character sequence U+0E2F U+0E25 U+0E2F, matched as a single unit
# by $ThaiRange. (Presumably the Thai abbreviation for "et cetera", going by
# the variable name.)
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
|
|
|
|
|
|
|
#
|
|
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
|
|
# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
|
|
#
|
|
# $SA characters that are also in $dictionary are deliberately left out of
# $ALPlus; those are matched through $ThaiRange ($dictionary+) instead.
$ALPlus = $AL | $AI | [$SA - $dictionary];
|
|
|
|
#
|
|
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
|
# TODO: This is going to produce some odd results, because of the non-combining
|
|
# chars that are included in $CM. Use $Extend instead, where possible.
|
|
#
|
|
# "Xcm" = one character of class X followed by any number of attached marks.
# NOTE(review): $ALcm and $IDcm absorb $CM*, while every other definition in
# this group absorbs $Extend*. The TODO above asks for $Extend "where
# possible"; confirm whether these two are intentionally left on $CM.
$ALcm = $ALPlus $CM*;
|
|
$IDcm = $ID $CM*;
|
|
$NUcm = $NU $Extend*;
|
|
$HYcm = $HY $Extend*;
|
|
$SPcm = $SP $Extend*;
|
|
$QUcm = $QU $Extend*;
|
|
$POcm = $PO $Extend*;
|
|
$OPcm = $OP $Extend*;
|
|
$BAcm = $BA $Extend*;
|
|
$BBcm = $BB $Extend*;
|
|
$NScm = $NS $Extend*;
|
|
$GLcm = $GL $Extend*;
|
|
$B2cm = $B2 $Extend*;
|
|
$INcm = $IN $Extend*;
|
|
|
|
|
|
# New Lines. Always break after, never break before.
|
|
# Rule LB 3
|
|
#
|
|
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
|
# Because we never break before these things, $Endings
|
|
# appears at the end of line break rule.
|
|
#
|
|
# One new-line sequence: a single $BK, $CR or $LF, or the pair $CR $LF.
$NLF = $BK | $CR | $LF | $CR $LF;
|
|
# Optional line ending: any spaces, then any $ZW, then at most one $NLF.
# Every part is optional, so this can match the empty string.
$Endings = $SPcm* $ZW* $NLF?;
|
|
# Non-empty line ending: any spaces followed by either a required $NLF, or a
# single $ZW with an optional $NLF after it.
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
|
|
|
|
|
|
#
|
|
# Openings. Sequences that can precede Words, and that should not be separated from them.
|
|
# Rules LB 9, 10
|
|
#
|
|
# Zero or more repetitions of: an optional quotation mark (with trailing
# spaces), then an opening punctuation character, then any spaces.
# Can match the empty string.
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
|
|
|
|
#
|
|
# Closings. Sequences that follow words, and that should not be separated from them,
|
|
# Rule LB 8, 11, 15
|
|
# Zero or more repetitions of any one of:
#   - optional spaces, then one of { $CL optionally followed by spaces and a
#     non-starter, $EX, $IS, $SY }, then any $Extend characters;
#   - $BAcm;
#   - $HYcm;
#   - $NScm;
#   - $maiyamok.
# Can match the empty string.
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
|
|
|
|
#
|
|
# Words. Includes mixed Alpha-numerics.
|
|
# Rules 11a, 16, 17, 19, more or less.
|
|
#
|
|
# Body of a number: either a single ideograph, or a run of one or more items
# where each item is a digit, a letter, or an infix separator followed by a
# digit.
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
|
|
# Optional numeric prefix, optional opening punctuation or hyphen, the number
# body, optional closing punctuation, optional numeric postfix.
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
|
|
# A single ideograph or a run of letters/digits, optionally followed by one
# postfix-numeric or one inseparable character.
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
|
|
# Zero or more $B2 characters, each optionally followed by spaces.
# Can match the empty string.
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
|
|
# A run of dictionary (Thai) characters, or the fixed $thai_etc sequence.
$ThaiRange = $dictionary+ | $thai_etc;
|
|
# Any one of the four unit kinds defined above.
$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
|
|
|
|
|
|
|
|
|
|
# $Word15 is one of three alternatives:
#   1. any number of break-before characters, an optional $WordLikeThing, then
#      any number of $BAcm / $HYcm / $NScm;
#   2. a single character outside [:Cc:], $BK, $CR, $LF, $ZW, $SP and $GL,
#      followed by any $Extend characters;
#   3. a single character outside $BK, $CR, $LF, $ZW, $SP and $GL (control
#      characters allowed here, no trailing $Extend).
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
|
|
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
|
|
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD to be glued.
|
|
|
|
|
|
# An optional leading glue or quotation character, one $Word15, then zero or
# more further $Word15 units each introduced by a glue or quotation character.
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
|
# Rules 13, 14
|
|
|
|
#
|
|
# The actual rules, a combination of everything defined above.
|
|
#
|
|
# Forward rule 1: a unit that ends in a non-empty line ending
# ($EndingsMandatory); a $paiyannoi is optionally allowed just before it.
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
|
|
# Forward rule 2: the same unit with the optional (possibly empty) $Endings.
$Openings $GluedWord $Closings $Endings;
|
|
|
|
# Forward rule 3: a unit ending in $paiyannoi. The "/" marks the boundary
# position; the text after it is look-ahead context only. The look-ahead
# accepts either a character that is neither U+0E25 nor $Extend, or U+0E25
# followed by a character that is neither $paiyannoi nor $Extend -- so the
# $thai_etc sequence (U+0E2F U+0E25 U+0E2F) is not split after its first
# character.
$Openings $GluedWord $Closings $paiyannoi /
|
|
([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
|
|
|
|
|
|
#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
|
|
# + "\u0e25[^$paiyannoi$_ignore_]);"
|
|
|
|
#
|
|
# LB 18b. Do not break a Korean syllable
|
|
#
|
|
# Each rule keeps one Korean syllable (plus trailing $Extend) together.
# Leading consonants, then optional vowels, then optional trailing consonants.
$JL+ $JV* $JT* $Extend*;
|
|
# Vowels followed by optional trailing consonants.
$JV+ $JT* $Extend*;
|
|
# Trailing consonants only.
$JT+ $Extend*;
|
|
# One $H2 syllable, optionally extended by vowels and trailing consonants.
$H2 $JV* $JT* $Extend*;
|
|
# One $H3 syllable, optionally extended by trailing consonants.
$H3 $JT* $Extend*;
|
|
|
|
#
|
|
# Reverse Rules.
|
|
#
|
|
# Back up to a hard break or a space that will cause a boundary.
|
|
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
|
|
# containing a space that may inhibit a break from occurring.
|
|
#
|
|
# A space that does not produce a boundary. Two shapes are accepted:
#   - one of $ZW / $CL / $IS / $NS / $OP, then any $Extend, then a space;
#   - one or more (any $Extend, then a space), then $OP.
# NOTE(review): this is used only by the reverse rule, which presumably scans
# the text backwards, so the elements appear in the opposite of reading order.
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
|
|
# Any character that is not a space or a new-line character ($BK, $CR, $LF).
$ClumpingChars = [^$SP $BK $CR $LF];
|
|
|
|
# The reverse rule (marked by the leading "!"): two arbitrary characters, a
# run of clumping characters, any number of (glued space + clumping run), and
# finally an optional single character or the pair $LF $CR.
# NOTE(review): "$LF $CR" looks like a CR LF pair as seen when scanning
# backwards -- confirm against the rule compiler's reverse-rule semantics.
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR)?;
|
|
|