ICU-20893 Line break tailorings updated to Unicode 13.
This commit is contained in:
parent
017c8b762e
commit
197e0239ab
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
@ -70,6 +70,13 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
@ -109,7 +116,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -212,7 +219,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
@ -283,16 +290,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
@ -338,15 +342,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
@ -8,7 +8,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
@ -76,6 +76,13 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
@ -115,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -218,7 +225,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
@ -292,16 +299,14 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
@ -347,15 +352,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
@ -87,6 +87,13 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
@ -126,7 +133,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -229,7 +236,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
@ -303,16 +310,14 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
@ -362,15 +367,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
@ -71,6 +71,13 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
@ -110,7 +117,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -213,7 +220,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
@ -284,16 +291,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
@ -339,15 +343,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
@ -75,6 +75,13 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
@ -114,7 +121,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
@ -217,7 +224,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
@ -291,16 +298,13 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
@ -346,15 +350,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
11
icu4c/source/test/testdata/break_rules/line.txt
vendored
11
icu4c/source/test/testdata/break_rules/line.txt
vendored
@ -6,7 +6,14 @@
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0,
|
||||
# with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
@ -172,7 +179,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.2: . CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
@ -6,7 +6,7 @@
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
@ -61,6 +61,20 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -165,11 +179,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -196,13 +206,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,7 +6,7 @@
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters.
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,16 +6,15 @@
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters.
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,20 +6,15 @@
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,20 +6,15 @@
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:96831df582da28121b19cf3faed7e84529ad0c1113b14cad0e01fabd4875c679
|
||||
size 12998991
|
||||
oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6
|
||||
size 12998988
|
||||
|
@ -6,7 +6,7 @@
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
@ -61,6 +61,20 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -165,11 +179,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -196,13 +206,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,7 +6,7 @@
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters.
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,16 +6,15 @@
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters.
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,20 +6,15 @@
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
@ -6,20 +6,15 @@
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Unicode Standard Annex #14
|
||||
# http://www.unicode.org/reports/tr14/, tailored as noted below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
|
||||
# Limitations of this monkey test rule parser require that these definitions be pulled out
|
||||
# rather than appearing in-line in LB 30.
|
||||
|
||||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
LB22: . CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP30;
|
||||
LB30.2: CP30 CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
Loading…
Reference in New Issue
Block a user