scuffed-code/icu4c/source/data/brkitr/line.txt

# Copyright (c) 2002-2003  International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#         Line Breaking Rules
#         Implement default line breaking as defined by Unicode TR 14.
#

#   Known Deviations from TR14:
#      LB  7a The Sequence SP CM+  is not treated as an ID.
#             The SP  in SP CM is not distinguished from any other SP.
#      LB 14a, break before and after CB, is not implemented.


#
#  Character Classes defined by TR 14.
#

# Each variable below is the set of all characters carrying one Line_Break
# property value.  The two-letter variable names are the class abbreviations
# used throughout the rest of this file.
$AI = [:LineBreak =  Ambiguous:];
$AL = [:LineBreak =  Alphabetic:];
$BA = [:LineBreak =  Break_After:];
$BB = [:LineBreak =  Break_Before:];
$BK = [:LineBreak =  Mandatory_Break:];
$B2 = [:LineBreak =  Break_Both:];
$CB = [:LineBreak =  Contingent_Break:];
$CL = [:LineBreak =  Close_Punctuation:];
$CM = [:LineBreak =  Combining_Mark:];
$CR = [:LineBreak =  Carriage_Return:];
$EX = [:LineBreak =  Exclamation:];
$GL = [:LineBreak =  Glue:];
$HY = [:LineBreak =  Hyphen:];
$ID = [:LineBreak =  Ideographic:];
# NOTE(review): "Inseperable" is the spelling used by this file.  Confirm which
# spellings the property data of the target ICU/Unicode version accepts before
# changing it.
$IN = [:LineBreak =  Inseperable:];
$IS = [:LineBreak =  Infix_Numeric:];
$LF = [:LineBreak =  Line_Feed:];
$NL = [:LineBreak =  Next_Line:];
$NS = [:LineBreak =  Nonstarter:];
$NU = [:LineBreak =  Numeric:];
$OP = [:LineBreak =  Open_Punctuation:];
$PO = [:LineBreak =  Postfix_Numeric:];
$PR = [:LineBreak =  Prefix_Numeric:];
$QU = [:LineBreak =  Quotation:];
$SA = [:LineBreak =  Complex_Context:];
$SG = [:LineBreak =  Surrogate:];
$SP = [:LineBreak =  Space:];
$SY = [:LineBreak =  Break_Symbols:];
$XX = [:LineBreak =  Unknown:];
$ZW = [:LineBreak =  ZWSpace:];


#
#  Rule LB1.  By default, treat AI  (characters with ambiguous East Asian width),
#                               SA  (South East Asian: Thai, Lao, Khmer)
#                               XX  (Unknown, unassigned)
#                         as $AL  (Alphabetic)
#
#  $ALPlus is the resulting widened alphabetic class.  $ALcm is built from
#  $ALPlus, not from $AL.
#
$ALPlus = $AL | $AI | $SA | $XX;

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
#
#  Every $XXcm variable is the class $XX followed by zero or more combining
#  marks.  The one exception in naming is $ALcm, whose base is $ALPlus.
#  $SPcm is not referenced by any other definition or rule visible in this file.
#
$ALcm = $ALPlus $CM*;
$BAcm = $BA $CM*;
$BBcm = $BB $CM*;
$B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HYcm = $HY $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$NScm = $NS $CM*;
$NUcm = $NU $CM*;
$OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SPcm = $SP $CM*;
$SYcm = $SY $CM*;


#
#  Line endings.
#
#  $NLF          One hard line terminator, with CR LF counted as a single unit.
#                Always break after, never break before.  Rule LB 3.
#  $EndingsSoft  A trailing run of spaces and zero width spaces.  Rules 4, 5.
#  $EndingsHard  The same trailing run, closed off by a hard line terminator.
#
#  Because we never break before any of these, the endings appear at the very
#  end of the line break rules.
#
$NLF         = $CR $LF | $CR | $LF | $NL | $BK;
$EndingsSoft = ($ZW* $SP)* $ZW*;
$EndingsHard = $EndingsSoft $NLF;


#
#  Openings  Sequences that can precede Words, and that should not be separated from them.
#            Rules LB 9, 10
#
#            One or more repetitions of either
#              - an optional quotation mark with its trailing spaces, then an
#                opening punctuation with its trailing spaces, or
#              - a glue character.
#            "Trailing spaces" is written ($ZW* $SP)*, so zero width spaces may
#            sit in front of each space.
#
$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+;

#
#  Closings  Sequences that follow words, and that should not be separated from them,
#            Rule LB 8, 11, 15
#
#            Zero or more repetitions of one of four alternatives:
#              - leading spaces, then a closing punctuation (optionally followed
#                by spaces and a non-starter), an exclamation, an infix separator
#                or a break symbol, then any combining marks;
#              - a break-after character;
#              - a hyphen;
#              - a non-starter.
#            Only the first alternative may be preceded by spaces.
$Closings =  (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)?  |  $EX  | $IS  | $SY) $CM*) | $BAcm | $HYcm  | $NScm)*;

# Closers that may trail a single word inside $Word15.  Closing punctuation,
# exclamation, infix and break-symbol characters may be preceded by plain
# spaces; break-after, hyphen and non-starter characters may not.
$WordClosings = ( $SP* ($CLcm | $EXcm | $IScm | $SYcm)
                | $BAcm
                | $HYcm
                | $NScm
                )*;

#
#  Words.  Includes mixed Alpha-numerics.
#          Rules 11a, 16, 17, 19, more or less.
#
#  $Number, in order: optional prefix, optional opening punctuation or hyphen,
#          one digit, any further digits or infix separators, optional closing
#          punctuation, optional postfix.
#          Note that $NU, $IS and $CL are used here as bare classes, i.e. without
#          the trailing $CM* that the ...cm variables carry.
#
$Number         =  $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers
                                                                       # Regex form, rather than rule 18

# Alpha-numeric.   16, 17
#
# $Word is one of five alternatives, in source order:
#    1. a run of alphabetic and/or numeric characters, then any inseparables
#    2. one ideograph, then an optional postfix or any inseparables
#    3. one or more combining marks with no base, handled the same way as 2
#    4. a run of inseparables on its own
#    5. a single contingent-break character
$Word   =  ($ALcm | $NUcm)+  $INcm*  |
           $IDcm ($POcm? | $INcm*)   |
           $CM+  ($POcm? | $INcm*)   |                      # CM with no base is like ID  (LB 7a)
           $INcm+                    |
           $CB;                                             # Deviation from Unicode spec for $CB
                                                            #   We treat as a single char word

# Dashes (Rule 11a): any number of Break_Both characters, each of which may be
# trailed by spaces (with zero width spaces allowed in front of each space).
$Dashes = ($B2cm ($ZW* $SP)*)*;


# A hyphen, optionally followed by a whole number: digits and infix separators,
# then an optional closing punctuation and an optional postfix.
$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?;       # For Rule LB15, don't break before hyphen-minus.
                                                            #  We also need to match a whole number, if that
                                                            #  is what follows the '-'


#
#  $Word15   A single word together with everything that sticks to it (Rule 15).
#
#            After optional $Openings, one of three alternatives is matched:
#              1. break-before characters, further openings, an optional $Word,
#                 $Number or $Dashes, then any run of break-after characters,
#                 $HYMinus sequences and non-starters;
#              2. break-before characters and one arbitrary character outside
#                 [:Cc:], with its combining marks;
#              3. break-before characters and one arbitrary character, [:Cc:]
#                 included.
#            Optional $WordClosings follow in every case.
#
#            The sets in alternatives 2 and 3 exclude the hard line endings, $ZW,
#            $SP and $GL.  Inside [ ] every item is a set member, so $SP is listed
#            by itself.  The sequence form ($ZW* $SP) used in the other
#            definitions must not appear there: its '(' '*' ')' would be read as
#            literal characters and added to the exclusion list.
#
$Word15 = $Openings? (
             ($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) |   # Rule 15. Stuff sticks around words.
             $BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW $SP $GL ] $CM*  |                          # Allow characters that don't meet the
             $BBcm* [^$BK $CR $LF $NL $ZW $SP $GL ]                                          #  more elaborate definitions for WORD
             )  $WordClosings?;                                                             #  to be glued.

# A $Word15, optionally extended by further $Word15 units.  Each further unit is
# joined on by either a glue character (which may be preceded by spaces) or a
# quotation mark.
$GluedWord  = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
 #$GluedWord  = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*;  # Disabled alternative form, kept for reference.
                                                                    # Rules 13, 14

#
#  The actual rule, a combination of everything defined above.
#
#  Two forward rules share the same body and differ only in their ending:
#  $EndingsSoft (spaces / zero width spaces) versus $EndingsHard (the same,
#  closed off by a hard line terminator).
#  NOTE(review): the trailing {0} and {100} presumably tag each rule with a
#  status value, so callers can tell soft from hard breaks.  Confirm against the
#  break iterator rule syntax for this ICU version.
#
$Openings? $GluedWord  $Closings $EndingsSoft{0};
$Openings? $GluedWord  $Closings $EndingsHard{100};
# $GluedWord;


#
#  Reverse Rules.
#
#     Back up to a hard break or a space that will cause a boundary.
#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
#     containing a space that may inhibit a break from occurring.
#
#     $SpaceGlue and $ClumpingChars are referenced only by the commented-out
#     rule below.  The rule actually in effect is the final "!.*;" line.
#     NOTE(review): the leading '!' presumably marks a rule as a reverse rule.
#     Confirm against the break iterator rule syntax for this ICU version.
#

$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($CM* $SP)) | (($CM* $SP)+ $OP);
$ClumpingChars = [^$SP $BK $CR $LF];

#!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
!.*;