ICU-11556 Line Break rules update for L2/16-043R, don't break CA$; also LB rules refactored for reduced memory consumption.
X-SVN-Rev: 38643
This commit is contained in:
parent
25b5619604
commit
ac9c717990
@ -24,41 +24,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like the following, with correct breaks as shown:
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -123,73 +88,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -204,12 +107,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -255,26 +154,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -305,19 +202,19 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -328,11 +225,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -344,83 +241,88 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -432,37 +334,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -481,16 +356,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -555,21 +420,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -620,14 +475,16 @@ $IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
|
@ -7,7 +7,7 @@
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
@ -19,6 +19,9 @@
|
||||
# This tailors the line break behavior for Finnish, while otherwise behaving
|
||||
# per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
@ -26,41 +29,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like the following, with correct breaks as shown:
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -70,7 +38,7 @@ $EM = [\U0001F3FB-\U0001F3FF];
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
@ -126,75 +94,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$HHcm = $HH $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$HH $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -209,12 +113,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -260,26 +160,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -310,19 +208,19 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -333,11 +231,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -350,85 +248,90 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BAcm | $HYcm | $HHcm | $NScm);
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -440,38 +343,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $HH;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -490,16 +365,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -564,21 +429,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -615,7 +470,7 @@ $AL ($HY | $HH) / $SP;
|
||||
# LB 21
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
@ -632,14 +487,16 @@ $IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
|
@ -31,41 +31,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like the following, with correct breaks as shown:
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -131,75 +96,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NSXcm = $NSX $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NSX $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -214,12 +115,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -265,26 +162,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -315,21 +210,21 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -340,11 +235,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -356,84 +251,89 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before NSXcm, so don't include it
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
# DO allow breaks here before NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
# $INcm $INcm; # delete this rule for CSS loose
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -445,38 +345,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -495,16 +367,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -569,21 +431,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -636,14 +488,16 @@ $IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
|
@ -11,7 +11,7 @@
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
@ -38,41 +38,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -142,83 +107,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BAXcm = $BAX $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$EXXcm = $EXX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NSXcm = $NSX $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$POXcm = $POX $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$PRXcm = $PRX $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BAX $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$EXX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NSX $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$POX $CM+;
|
||||
$PR $CM+;
|
||||
$PRX $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -233,12 +126,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -284,26 +173,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
#
|
||||
@ -334,21 +221,21 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -359,11 +246,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -375,88 +262,93 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
# $INcm $INcm; # delete this rule for CSS loose
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
# Do not include $POX here
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
# Do not include $PRX here
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
($POcm | $POXcm) ($ALcm | $HLcm);
|
||||
($PR | $PO | $POX) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO | $POX); # TODO: should this be ($PR | $PRX | $PO)
|
||||
^$CM+ ($PR | $PO | $POX); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
# Here do not include $PRX at the beginning or $POX at the end
|
||||
($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $PRXcm | $POcm)?;
|
||||
(($PR | $PO | $POX) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PRX | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat korean Syllable Block the same as ID (don't break it)
|
||||
# Do not include $POX or $PRX here
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -468,42 +360,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BAX;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $EXX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $POX;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $PRX;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -522,16 +382,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR $PRX ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -597,21 +447,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -651,7 +491,7 @@ $CAN_CM $CM* $QU; # QU x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
[^$CM $CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
@ -661,19 +501,23 @@ $IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
# Do not include $POX here
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
# Do not include $PRX here
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* ($PO | $POX);
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
# Do not include $PRX here
|
||||
($ALPlus | $HL) $CM* ($PR | $PO | $POX);
|
||||
($PR | $PO | $POX) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
# Here do not include $POX at the beginning or $PRX at the end
|
||||
@ -688,7 +532,7 @@ $JT $CM* ($H3 | $JT);
|
||||
# Do not include $POX or $PRX here
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
@ -708,6 +552,7 @@ $OP $CM* ($ALPlus | $HL | $NU);
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
@ -18,52 +19,16 @@
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior both for Finnish and to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS).
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -72,8 +37,8 @@ $EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [[:LineBreak = Break_After:] - [\u2010]];
|
||||
$HH = [\u2010];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
@ -130,77 +95,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$HHcm = $HH $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NSXcm = $NSX $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$HH $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NSX $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -215,12 +114,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $NSX $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -266,26 +161,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -316,21 +209,21 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -341,11 +234,12 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
@ -357,86 +251,91 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before NSXcm, so don't include it
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
# DO allow breaks here before NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -448,39 +347,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HH;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -499,16 +369,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -573,21 +433,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -639,18 +489,20 @@ $HL $CM* $SY;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
@ -684,8 +536,9 @@ $OP $CM* ($ALPlus | $HL | $NU);
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
@ -719,7 +572,7 @@ $SP+ $CM* ($CL | $CP);
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
($HY | $BA | $HH) $CM* $HL;
|
||||
$CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
@ -744,6 +597,6 @@ $dictionary $dictionary;
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -28,41 +28,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -127,73 +92,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -208,12 +111,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -259,26 +158,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -309,19 +206,19 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -332,11 +229,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -348,83 +245,88 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -436,37 +338,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -485,16 +360,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -559,21 +424,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -624,14 +479,16 @@ $IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
|
@ -29,41 +29,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -130,77 +95,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BAXcm = $BAX $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NSXcm = $NSX $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BAX $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NSX $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -215,12 +114,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -266,26 +161,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -316,21 +209,21 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -341,11 +234,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -358,83 +251,88 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -446,39 +344,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BAX;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -497,16 +366,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -571,21 +430,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -638,14 +487,16 @@ $IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
|
@ -7,11 +7,12 @@
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# Tailored as noted in 2nd paragraph below.
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
@ -28,41 +29,6 @@
|
||||
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
@ -71,8 +37,8 @@ $EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [[:LineBreak = Break_After:]-[\u2010]];
|
||||
$HH = [\u2010];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
@ -128,75 +94,11 @@ $dictionary = [:LineBreak = Complex_Context:];
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$HHcm = $HH $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$CPcm = $CP $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$RIcm = $RI $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$HH $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$RI $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
@ -211,12 +113,8 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -262,26 +160,24 @@ $CAN_CM $CM+; # Stick together any combining sequences that d
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
^$CM+ $GLcm;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
@ -312,19 +208,19 @@ $CAN_CM $CM* $SY;
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QUcm $SP* $OPcm;
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CLcm | $CPcm) $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
@ -335,11 +231,11 @@ $LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
@ -352,85 +248,90 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SYcm $HLcm;
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat korean Syllable Block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
@ -442,38 +343,10 @@ $EB $CM* $EM;
|
||||
|
||||
!!reverse;
|
||||
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $HH;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
@ -492,16 +365,6 @@ $AL_FOLLOW $CM+ / (
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
@ -566,21 +429,11 @@ $EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
@ -634,14 +487,16 @@ $IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
@ -675,14 +530,14 @@ $OP $CM* ($ALPlus | $HL | $NU);
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA | $HH)$CM* $HL));
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
@ -735,6 +590,6 @@ $dictionary $dictionary;
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -3515,25 +3515,36 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
}
|
||||
|
||||
|
||||
// LB 23 ID x PO
|
||||
// AL x NU
|
||||
// HL x NU
|
||||
// NU x AL
|
||||
if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
|
||||
(fAL->contains(prevChar) && fNU->contains(thisChar)) ||
|
||||
(fHL->contains(prevChar) && fNU->contains(thisChar)) ||
|
||||
(fNU->contains(prevChar) && fAL->contains(thisChar)) ||
|
||||
(fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
|
||||
// LB 23 (AL | HL) x NU
|
||||
// NU x (AL | HL)
|
||||
if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
|
||||
// PR x (ID | EB | EM)
|
||||
// (ID | EB | EM) x PO
|
||||
if (fPR->contains(prevChar) &&
|
||||
(fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
|
||||
fPO->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 24 Do not break between prefix and letters or ideographs.
|
||||
// PR x ID
|
||||
// PR x (AL | HL)
|
||||
// PO x (AL | HL)
|
||||
if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
|
||||
(fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
|
||||
(fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
|
||||
// (PR | PO) x (AL | HL)
|
||||
// (AL | HL) x (PR | PO)
|
||||
if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
|
||||
(fAL->contains(thisChar) || fHL->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
|
||||
(fPR->contains(thisChar) || fPO->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
822
icu4c/source/test/testdata/LineBreakTest.txt
vendored
822
icu4c/source/test/testdata/LineBreakTest.txt
vendored
File diff suppressed because it is too large
Load Diff
13
icu4c/source/test/testdata/break_rules/line.txt
vendored
13
icu4c/source/test/testdata/break_rules/line.txt
vendored
@ -153,13 +153,14 @@ LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
@ -161,13 +161,14 @@ LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
@ -181,13 +181,14 @@ LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: (PO | POX) CM* (AL | HL);
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO | POX) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO | POX);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
|
||||
|
@ -167,13 +167,14 @@ LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
@ -175,13 +175,14 @@ LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
65
icu4c/source/test/testdata/rbbitst.txt
vendored
65
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -15,11 +15,11 @@
|
||||
# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
|
||||
# <data> ... </data> test data. May span multiple lines.
|
||||
# <> Break position, status == 0
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
# <nnn> Break position, status == nnn
|
||||
# \ Escape. Normal ICU unescape applied.
|
||||
# \ Escape. Normal ICU unescape applied.
|
||||
# \ at end of line -> Line Continuation. Remove both the backslash and the new line
|
||||
#
|
||||
#
|
||||
# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended.
|
||||
# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended
|
||||
#
|
||||
@ -49,7 +49,7 @@
|
||||
|
||||
<locale>
|
||||
|
||||
# Temp debugging tests
|
||||
# Temp debugging tests
|
||||
<sent>
|
||||
<data>•\u00c0.•</data>
|
||||
|
||||
@ -89,7 +89,7 @@
|
||||
# LVT : \uAC01
|
||||
|
||||
<data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT
|
||||
<data>•\u1100\u1161•\u1100\u1161•</data>
|
||||
<data>•\u1100\u1161•\u1100\u1161•</data>
|
||||
<data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data>
|
||||
<data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data>
|
||||
<data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data>
|
||||
@ -97,7 +97,7 @@
|
||||
|
||||
|
||||
# Hindi combining chars. (An old test)
|
||||
# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters
|
||||
# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters
|
||||
#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
|
||||
#•\u0939•\u094c•\u0964•</data>
|
||||
#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
|
||||
@ -209,7 +209,7 @@
|
||||
|
||||
# Words with interior formatting characters
|
||||
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
|
||||
|
||||
|
||||
# to test for bug #4097779
|
||||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
@ -235,13 +235,13 @@
|
||||
<data>•\u06c9<200>\uc799\ufffa•</data>
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# Try some words from other scripts.
|
||||
#
|
||||
#
|
||||
|
||||
# Try some words from other scripts.
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
#
|
||||
<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data>
|
||||
|
||||
<data>•\u0301•A<200></data>
|
||||
@ -251,7 +251,7 @@
|
||||
# Hindi word break tests, imported from the old RBBI tests.
|
||||
# An historical note: a much earlier version of ICU break iterators had a number
|
||||
# of special case rules for Hindi, which were tested by an earlier version of
|
||||
# this test data. The current RBBI rules do not special case Hindi in
|
||||
# this test data. The current RBBI rules do not special case Hindi in
|
||||
# any way, making this test data much less significant.
|
||||
#
|
||||
<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200>
|
||||
@ -306,7 +306,7 @@ doing? •This\n<100> costs $20,00,000. •</data>
|
||||
"This isn't it." •Hi! \
|
||||
•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\
|
||||
"This isn't it." •\
|
||||
Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
|
||||
Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
|
||||
•Not on my time (el timo.)! •</data>
|
||||
|
||||
<data>•Hello. •So what!!\u2029•"But now," he said, \
|
||||
@ -351,11 +351,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•How do you do? •(fine). •</data>
|
||||
|
||||
#
|
||||
<data>•Hello.123<100></data> # Rule 6
|
||||
<data>•Hello?•123<100></data>
|
||||
|
||||
<data>•HELLO.Bye<100></data> # Rule 7
|
||||
<data>•HELLO?•Bye<100></data>
|
||||
<data>•Hello.123<100></data> # Rule 6
|
||||
<data>•Hello?•123<100></data>
|
||||
|
||||
<data>•HELLO.Bye<100></data> # Rule 7
|
||||
<data>•HELLO?•Bye<100></data>
|
||||
|
||||
<data>•Hello.goodbye<100></data> #Rule 8
|
||||
<data>•Hello. •Goodbye<100></data>
|
||||
@ -368,36 +368,36 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
#
|
||||
<data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible, work the same on all platforms. •</data>
|
||||
<data>•Another test.\u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4143071: Make sure sentences that end with digits
|
||||
# work right
|
||||
#
|
||||
<data>•Today is the 27th of May, 1998. •</data>
|
||||
<data>•Tomorrow with be 28 May 1998. •</data>
|
||||
<data>•The day after will be the 30th.\u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4152416: Make sure sentences ending with a capital
|
||||
# letter are treated correctly
|
||||
#
|
||||
<data>•The type of all primitive \<code>boolean\</code> values accessed in the target VM. •Calls to xxx will return an implementor of this interface. \u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4152117: Make sure sentence breaking is handling
|
||||
# punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
|
||||
# HERE TO MAKE SURE IT DOESN'T CROP UP]
|
||||
#
|
||||
<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc.
|
||||
<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc.
|
||||
•</data>
|
||||
|
||||
# sentence breaks for Hindi, which uses the Devanagari script
|
||||
# make sure there is sentence break after ?,danda(hindi phrase separator),
|
||||
# fullstop followed by space. (VERY old test)
|
||||
#
|
||||
#
|
||||
<data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u094d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\
|
||||
\u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u093e\n\
|
||||
<100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 means "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u0905\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u093f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data>
|
||||
|
||||
# Regression test for bug #1984, Sentence break in Arabic text.
|
||||
|
||||
|
||||
<data>\
|
||||
•\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u0627\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u0645\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u0022\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u0630\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u0649\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data>
|
||||
|
||||
@ -535,11 +535,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
|
||||
<line>
|
||||
|
||||
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
|
||||
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
|
||||
<100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data>
|
||||
|
||||
<line>
|
||||
<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
<100>How, •are, •you? •This, •costs •$20,00,000.•</data>
|
||||
|
||||
# test for bug #4068133
|
||||
@ -579,11 +579,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
# Regression for bug 836
|
||||
# Note: Unicode 5.1 changed this behavior
|
||||
# Unicode 5.2 changed it again, there is no break following the '('
|
||||
<data>•AAA(AAA •</data>
|
||||
<data>•AAA(AAA •</data>
|
||||
|
||||
# Try some words from other scripts.
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
#
|
||||
<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
|
||||
|
||||
#
|
||||
@ -599,7 +599,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
|
||||
<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
|
||||
<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
|
||||
|
||||
|
||||
# Test for #10176 (in root)
|
||||
<line>
|
||||
<data>•abc/•s •def•</data>
|
||||
@ -607,6 +607,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
|
||||
<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
|
||||
|
||||
# Ticket #11556 don't break "R$" or "JP¥"
|
||||
<locale en>
|
||||
<line>
|
||||
<data>•R$ •JP¥ •a9 •3a •H% •CA$ •Travi$ •Scott •Ke$ha •Curren$y •A$AP •Rocky•</data>
|
||||
|
||||
|
||||
|
||||
########################################################################################
|
||||
@ -915,10 +920,10 @@ Bangkok)•</data>
|
||||
|
||||
# Finnish line breaking
|
||||
#
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
|
||||
# See CLDR ticket 3029.
|
||||
# See ICU ticket 8151
|
||||
# See ICU ticket 8151
|
||||
|
||||
<locale root>
|
||||
<line>
|
||||
|
Loading…
Reference in New Issue
Block a user