scuffed-code/icu4c/source/data/brkitr/line.txt
2003-07-21 05:37:08 +00:00

150 lines
4.8 KiB
Plaintext

# Copyright (c) 2002-2003 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by Unicode TR 14.
#
#
# Character Classes defined by TR 14.
#
# Each variable below is the set of characters that carry one particular
# value of the LineBreak property. All later definitions in this file are
# built out of these sets.
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$HY = [:LineBreak = Hyphen:];
$ID = [:LineBreak = Ideographic:];
# NOTE(review): "Inseperable" (sic) is the spelling used for this property
# value; presumably it is an alias the rule compiler accepts. Confirm against
# the Unicode property value aliases before "correcting" the spelling.
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$LF = [:LineBreak = Line_Feed:];
$NL = [:LineBreak = Next_Line:];
$NS = [:LineBreak = Nonstarter:];
$NU = [:LineBreak = Numeric:];
$OP = [:LineBreak = Open_Punctuation:];
$PO = [:LineBreak = Postfix_Numeric:];
$PR = [:LineBreak = Prefix_Numeric:];
$QU = [:LineBreak = Quotation:];
$SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
#
# Character classes from TR 29. Needed for finding characters.
#
#
# $Extend: every character whose Grapheme_Extend property is TRUE. It is used
# below as the optional trailing-mark suffix ($Extend*) of most "...cm"
# variables and inside $Closings, $Word15 and $SpaceGlue.
$Extend = [:Grapheme_Extend = TRUE:];
#
# Rule LB1. By default, treat AI (characters with ambiguous East Asian width) and
# SA (South East Asian scripts: Thai, Lao, Khmer) as $AL (Alphabetic).
#
# $ALPlus matches a single character from any of $AL, $AI or $SA.
# Within this file it is referenced only by $ALcm.
$ALPlus = $AL | $AI | $SA;
#
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
#
# "...cm" variants: a base character of the named class followed by any number
# of trailing marks, so the later rules can treat the whole sequence as one
# unit of that class.
#
# $ALcm and $IDcm absorb $CM (LineBreak = Combining_Mark); every other variant
# absorbs $Extend (Grapheme_Extend = TRUE).
# NOTE(review): whether that difference is intentional cannot be told from
# this file -- confirm before unifying the two.
$ALcm = $ALPlus $CM*;
# The second alternative makes a space followed by at least one $CM count as
# an $IDcm unit (note $CM+, not $CM*: a bare space is not matched here).
$IDcm = ($ID $CM* | $SP $CM+);
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;
# New Lines. Always break after, never break before.
# Rule LB 3
#
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
# Because we never break before these things, $EndingsSoft / $EndingsHard
# appear at the end of each line break rule.
#
# $NLF: one new-line sequence. The two-character alternative "$CR $LF" lets a
# CR LF pair be consumed as a single ending instead of two.
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
# $EndingsSoft: any run of spaces followed by any run of $ZW. Every part is
# starred, so this can also match nothing at all.
$EndingsSoft = $SP* $ZW*;
# $EndingsHard: the same optional spaces/$ZW, but a new-line sequence ($NLF)
# is required at the end.
$EndingsHard = $SP* $ZW* $NLF;
#
# Openings. Sequences that can precede words, and that should not be separated from them.
# Rules LB 9, 10
#
# $Openings: zero or more repetitions of
#     [optional: $QUcm followed by spaces]  $OPcm  followed by spaces.
# Each repetition needs an $OPcm; a quotation mark on its own is not matched
# here. The whole expression may match nothing.
$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
#
# Closings. Sequences that follow words, and that should not be separated from them.
# Rules LB 8, 11, 15
# $Closings: zero or more repetitions of any one of
#   - optional spaces, then one of: $CL (optionally followed by spaces and an
#     $NScm), $EX, $IS or $SY -- followed by any number of $Extend;
#   - $BAcm;
#   - $HYcm;
#   - $NScm.
# Only the first alternative may be preceded by spaces. The whole expression
# may match nothing.
$Closings = ($SP*( ($CL ($SP* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
#
# Words. Includes mixed Alpha-numerics.
# Rules 11a, 16, 17, 19, more or less.
#
# $Number, in order: optional $PR; optional $OPcm or $HYcm; one $NU; any run
# of $NU / $IS; optional $CL; optional $POcm.
# Note the bare $NU (not $NUcm) here, unlike $HYMinus, which uses $NUcm.
$Number = $PR? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers
# Regex form, rather than rule 18
# $Word: either a single $IDcm, or one or more $ALcm / $NUcm units mixed
# freely, followed by at most one $POcm or at most one $INcm (both optional).
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)) ; # Alpha-numeric. 16, 17
# $Dashes: zero or more $B2cm units, each optionally followed by spaces.
# May match nothing.
$Dashes = (($B2cm $SP*)*); # Dashes 11a
# $HYMinus: an $HYcm, optionally followed by a complete number
# ($NUcm, any run of $NUcm / $IS, optional $CL, optional $POcm).
$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rule LB15, don't break before Hyphen-minus;
# we also need to match a whole number, if that
# is what follows the '-'
# $Word15 is one of three alternatives, each allowing leading $BBcm units:
#   1. an optional $Word, $Number or $Dashes followed by any run of
#      $BAcm / $HYMinus / $NScm (every part optional, so this can be empty);
#   2. a single character that is not a [:Cc:] character and not in
#      $BK $CR $LF $ZW $SP $GL, followed by any number of $Extend;
#   3. a single character that is not in $BK $CR $LF $ZW $SP $GL
#      (this one does admit [:Cc:] characters, but takes no trailing $Extend).
$Word15 = ($BBcm* ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
$BBcm* [^[:Cc:] $BK $CR $LF $ZW $SP $GL ] $Extend* | # Allow characters that don't meet the
$BBcm* [^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD
# to be glued.
# $GluedWord: an optional leading $GLcm or $QUcm, one $Word15, then any number
# of further ($GLcm or $QUcm) + $Word15 pairs. Glue and quotation marks thus
# join neighbouring $Word15 units into one unbreakable match.
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
# Rules 13, 14
#
# The actual rule, a combination of everything defined above.
#
# The two forward rules are identical except for how they end: the first ends
# with $EndingsSoft (optional spaces / $ZW), the second with $EndingsHard
# (which additionally requires a new-line sequence).
# NOTE(review): the trailing {0} and {100} are presumably rule status tags
# that let callers tell a soft break from a hard break -- confirm against the
# ICU break-rule documentation.
$Openings $GluedWord $Closings $EndingsSoft{0};
$Openings $GluedWord $Closings $EndingsHard{100};
# $GluedWord;
#
# Reverse Rules.
#
# Back up to a hard break or a space that will cause a boundary.
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
# containing a space that may inhibit a break from occurring.
#
# $SpaceGlue is either
#   - one character from $ZW $CL $IS $NS $OP, then any $Extend, then a space; or
#   - one or more (any $Extend, then a space) groups, then an $OP.
# NOTE(review): presumably these are written in the order the text is seen
# when scanning backwards, so they appear mirrored relative to the forward
# rules -- confirm against the ICU break-rule documentation.
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
# $ClumpingChars: any character that is not a space, $BK, $CR or $LF.
$ClumpingChars = [^$SP $BK $CR $LF];
# The reverse rule (the leading "!" presumably marks it as such -- confirm):
# two arbitrary characters, a run of $ClumpingChars, any number of
# ($SpaceGlue + $ClumpingChars run) groups, and finally either one arbitrary
# character or the pair "$LF $CR" (presumably a CR LF pair seen backwards).
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);