5f352ade23
X-SVN-Rev: 12706
168 lines
5.8 KiB
Plaintext
168 lines
5.8 KiB
Plaintext
# Copyright (c) 2002-2003 International Business Machines Corporation and
|
|
# others. All Rights Reserved.
|
|
#
|
|
# file: line.txt
|
|
#
|
|
# Line Breaking Rules
|
|
# Implement default line breaking as defined by Unicode TR 14.
|
|
#
|
|
|
|
# Known Deviations from TR14:
|
|
# LB 7a The Sequence SP CM+ is not treated as an ID.
|
|
# The SP in SP CM is not distinguished from any other SP.
|
|
# LB 14a, break before and after CB, is not implemented.
|
|
|
|
|
|
#
|
|
# Character Classes defined by TR 14.
|
|
#
|
|
|
|
# Each variable below is a Unicode set selecting the characters whose
# LineBreak property has the named value. The variable names are the
# two-letter class abbreviations used throughout the rest of this file.
$AI = [:LineBreak = Ambiguous:];
|
|
$AL = [:LineBreak = Alphabetic:];
|
|
$BA = [:LineBreak = Break_After:];
|
|
$BB = [:LineBreak = Break_Before:];
|
|
$BK = [:LineBreak = Mandatory_Break:];
|
|
$B2 = [:LineBreak = Break_Both:];
|
|
$CB = [:LineBreak = Contingent_Break:];
|
|
$CL = [:LineBreak = Close_Punctuation:];
|
|
$CM = [:LineBreak = Combining_Mark:];
|
|
$CR = [:LineBreak = Carriage_Return:];
|
|
$EX = [:LineBreak = Exclamation:];
|
|
$GL = [:LineBreak = Glue:];
|
|
$HY = [:LineBreak = Hyphen:];
|
|
$ID = [:LineBreak = Ideographic:];
|
|
# NOTE(review): "Inseperable" (sic) is part of the property value name, not
# comment text. Confirm that the Unicode data in use accepts "Inseparable"
# before correcting the spelling.
$IN = [:LineBreak = Inseperable:];
|
|
$IS = [:LineBreak = Infix_Numeric:];
|
|
$LF = [:LineBreak = Line_Feed:];
|
|
$NL = [:LineBreak = Next_Line:];
|
|
$NS = [:LineBreak = Nonstarter:];
|
|
$NU = [:LineBreak = Numeric:];
|
|
$OP = [:LineBreak = Open_Punctuation:];
|
|
$PO = [:LineBreak = Postfix_Numeric:];
|
|
$PR = [:LineBreak = Prefix_Numeric:];
|
|
$QU = [:LineBreak = Quotation:];
|
|
$SA = [:LineBreak = Complex_Context:];
|
|
# $SG is defined here but is not referenced by any rule in this file.
$SG = [:LineBreak = Surrogate:];
|
|
$SP = [:LineBreak = Space:];
|
|
$SY = [:LineBreak = Break_Symbols:];
|
|
$XX = [:LineBreak = Unknown:];
|
|
$ZW = [:LineBreak = ZWSpace:];
|
|
|
|
|
|
|
|
#
|
|
# Rule LB1. By default, treat AI (characters with ambiguous East Asian width),
|
|
# SA (South East Asian: Thai, Lao, Khmer)
|
|
# XX (Unknown, unassigned)
|
|
# as $AL (Alphabetic)
|
|
#
|
|
# $ALPlus is the union of AL with AI, SA and XX. It is the set that $ALcm
# (and therefore $Word) is built on, in place of the bare $AL.
$ALPlus = $AL | $AI | $SA | $XX;
|
|
|
|
#
|
|
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
|
#
|
|
# Each $XXcm is one character of class XX followed by zero or more
# combining marks ($CM*).
#   - $ALcm is built on $ALPlus (AL | AI | SA | XX), not on $AL alone.
#   - No cm variant is defined for BK, CB, CR, LF, NL, SG or ZW.
#   - $SPcm is defined but not referenced by any rule in this file; the
#     rules below use $SP directly.
$ALcm = $ALPlus $CM*;
|
|
$BAcm = $BA $CM*;
|
|
$BBcm = $BB $CM*;
|
|
$B2cm = $B2 $CM*;
|
|
$CLcm = $CL $CM*;
|
|
$EXcm = $EX $CM*;
|
|
$GLcm = $GL $CM*;
|
|
$HYcm = $HY $CM*;
|
|
$IDcm = $ID $CM*;
|
|
$INcm = $IN $CM*;
|
|
$IScm = $IS $CM*;
|
|
$NScm = $NS $CM*;
|
|
$NUcm = $NU $CM*;
|
|
$OPcm = $OP $CM*;
|
|
$POcm = $PO $CM*;
|
|
$PRcm = $PR $CM*;
|
|
$QUcm = $QU $CM*;
|
|
$SPcm = $SP $CM*;
|
|
$SYcm = $SY $CM*;
|
|
|
|
|
|
# New Lines. Always break after, never break before.
|
|
# Rule LB 3
|
|
#
|
|
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
|
# Because we never break before these things, $EndingsSoft or $EndingsHard
|
|
# appears at the end of each line break rule.
|
|
#
|
|
# $NLF: a single hard line terminator. The two-character sequence CR LF is
# listed as its own alternative alongside the single-character classes.
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
|
|
# $EndingsSoft: any run of spaces and zero-width spaces (each $SP may be
# preceded by $ZW*), followed by optional trailing $ZW. Every part is
# optional, so this can match the empty string.
$EndingsSoft = ($ZW* $SP)* $ZW*;
|
|
# $EndingsHard: the same space / zero-width-space run as $EndingsSoft,
# followed by a mandatory $NLF.
$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
|
|
|
|
|
|
#
|
|
# Openings Sequences that can precede Words, and that should not be separated from them.
|
|
# Rules LB 9, 10
|
|
#
|
|
# $Openings: one or more repetitions of either
#   - an optional quotation mark (with trailing spaces), then an opening
#     punctuation mark (with trailing spaces), or
#   - a glue character.
# "Spaces" here is ($ZW* $SP)*: zero or more $SP, each optionally preceded
# by zero-width spaces.
$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+;
|
|
|
|
#
|
|
# Closings Sequences that follow words, and that should not be separated from them,
|
|
# Rule LB 8, 11, 15
|
|
# $Closings: zero or more repetitions of one of
#   - optional spaces ($ZW* $SP)*, then CL (optionally followed by spaces
#     and an NS), or a bare EX, IS or SY, then trailing $CM*;
#   - $BAcm, $HYcm or $NScm.
# The whole expression is starred, so it can match the empty string.
$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*;
|
|
|
|
# $WordClosings: the variant used inside $Word15. Unlike $Closings it
# allows only plain $SP* (no $ZW) before CL / EX / IS / SY, and it has no
# "CL spaces NS" form.
$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
|
|
|
|
#
|
|
# Words. Includes mixed Alpha-numerics.
|
|
# Rules 11a, 16, 17, 19, more or less.
|
|
#
|
|
# $Number: optional prefix (PR), optional OP or HY, one required NU, then
# any mix of NU and IS, an optional CL and an optional postfix (PO).
# The NU / IS / CL in the middle are the bare classes, without $CM*.
$Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers
|
|
# Regex form, rather than rule 18
|
|
|
|
# Alpha-numeric. 16, 17
|
|
# $Word is one of five alternatives:
#   1. a run of alphabetic / numeric characters, then any number of IN;
#   2. one ideograph, then either an optional PO or any number of IN;
#   3. a run of base-less combining marks, with the same tail as (2);
#   4. a run of one or more IN on its own;
#   5. a single CB character.
$Word = ($ALcm | $NUcm)+ $INcm* |
|
|
$IDcm ($POcm? | $INcm*) |
|
|
$CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a)
|
|
$INcm+ |
|
|
$CB; # Deviation from Unicode spec for $CB
|
|
# We treat as a single char word
|
|
|
|
# $Dashes: zero or more B2 characters, each optionally followed by spaces
# ($ZW* $SP)*. Because of the outer '*', this can match the empty string.
$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a
|
|
|
|
|
|
|
|
# $HYMinus: a hyphen, optionally followed by a complete number (NU, then
# any mix of NU / IS, an optional CL and an optional PO).
$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rule LB15, don't break before Hyphen-minus;
|
|
# we also need to match a whole number, if that
|
|
# is what follows the '-'
|
|
|
|
|
|
|
|
# $Word15: optional $Openings, then one of three alternatives, then an
# optional $WordClosings.
#   1. $BBcm*, optional $Openings, an optional $Word / $Number / $Dashes,
#      then any number of $BAcm / $HYMinus / $NScm. Every component is
#      optional, so this alternative can match the empty string.
#   2. $BBcm*, then one character from a negated set that also excludes
#      [:Cc:], then $CM*.
#   3. $BBcm*, then one character from the same negated set without the
#      [:Cc:] exclusion, and with no trailing $CM*.
# NOTE(review): both negated sets list "($ZW* $SP)" inside the square
# brackets. How the rule compiler interprets a parenthesised sequence
# inside a set is not evident from this file; confirm against the RBBI
# rule syntax before editing these sets.
$Word15 = $Openings? (
|
|
($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
|
|
$BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the
|
|
$BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD
|
|
) $WordClosings?; # to be glued.
|
|
|
|
# $GluedWord: optional $Openings and one $Word15, followed by any number
# of further $Word15 units, each joined to the previous one by either
# (optional spaces + a glue character) or a quotation mark.
$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
|
# Disabled alternative that joined units with spaces followed by $Openings:
#$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*; # "Glue" will stick anything below it together.
|
|
# Rules 13, 14
|
|
|
|
#
|
|
# The actual rule, a combination of everything defined above.
|
|
#
|
|
# The two forward rules differ only in their ending and in the {n} tag.
# The first ends in $EndingsSoft (spaces / zero-width spaces only); the
# second ends in $EndingsHard (the same run plus a line terminator).
# NOTE(review): {0} and {100} look like rule status tags, presumably
# matching ICU's UBRK_LINE_SOFT (0) and UBRK_LINE_HARD (100); confirm
# against ubrk.h.
$Openings? $GluedWord $Closings $EndingsSoft{0};
|
|
$Openings? $GluedWord $Closings $EndingsHard{100};
|
|
# $GluedWord;
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
# Reverse Rules.
|
|
#
|
|
# Back up to a hard break or a space that will cause a boundary.
|
|
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
|
|
# containing a space that may inhibit a break from occurring.
|
|
#
|
|
|
|
# $SpaceGlue: either one of ZW / CL / IS / NS / OP followed by ($CM* $SP),
# or one or more ($CM* $SP) followed by OP.
# In this file it is referenced only by the commented-out reverse rule
# below.
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP);
|
|
# $ClumpingChars: any character other than SP, BK, CR or LF. Like
# $SpaceGlue, it is referenced only by the commented-out reverse rule.
$ClumpingChars = [^$SP $BK $CR $LF];
|
|
|
|
# Disabled reverse rule built from $SpaceGlue and $ClumpingChars:
#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
|
|
# Active reverse rule. NOTE(review): the leading '!' presumably marks this
# as a reverse rule, per the "Reverse Rules" heading; the pattern itself
# is simply ".*".
!.*; |