ICU-20893 Line break tailorings updated to Unicode 13.

This commit is contained in:
Andy Heninger 2019-11-22 11:54:17 -08:00
parent 017c8b762e
commit 197e0239ab
17 changed files with 288 additions and 233 deletions

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
@ -70,6 +70,13 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
@ -109,7 +116,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -212,7 +219,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -283,16 +290,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22
($ALPlus | $HL) $CM* $IN;
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
$EX $CM* $IN;
($ID | $EB | $EM) $CM* $IN;
$IN $CM* $IN;
$NU $CM* $IN;
# LB 22 Do not break before ellipses
#
$LB20NonBreaks $CM* $IN;
^$CM+ $IN;
# $LB 23
# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
@ -338,15 +342,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP;
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.

View File

@ -8,7 +8,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
@ -76,6 +76,13 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
@ -115,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -218,7 +225,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -292,16 +299,14 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22
($ALPlus | $HL) $CM* $IN;
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
$EX $CM* $IN;
($ID | $EB | $EM) $CM* $IN;
# $IN $CM* $IN; # delete this rule for CSS loose
$NU $CM* $IN;
# LB 22 Do not break before ellipses
#
[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring
^$CM+ $IN;
# $LB 23
# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
@ -347,15 +352,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP;
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
@ -87,6 +87,13 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
@ -126,7 +133,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
#
@ -229,7 +236,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -303,16 +310,14 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22
($ALPlus | $HL) $CM* $IN;
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
$EX $CM* $IN;
($ID | $EB | $EM) $CM* $IN;
# $IN $CM* $IN; # delete this rule for CSS loose
$NU $CM* $IN;
# LB 22 Do not break before ellipses
#
[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring
^$CM+ $IN;
# $LB 23
# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
@ -362,15 +367,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP;
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
@ -71,6 +71,13 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
@ -110,7 +117,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -213,7 +220,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -284,16 +291,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?;
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22
($ALPlus | $HL) $CM* $IN;
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
$EX $CM* $IN;
($ID | $EB | $EM) $CM* $IN;
$IN $CM* $IN;
$NU $CM* $IN;
# LB 22 Do not break before ellipses
#
$LB20NonBreaks $CM* $IN;
^$CM+ $IN;
# $LB 23
# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
@ -339,15 +343,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP;
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
@ -75,6 +75,13 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
@ -114,7 +121,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -217,7 +224,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -291,16 +298,13 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
# (break between HL and SY already disallowed by LB 13 above)
$SY $CM* $HL;
# LB 22
($ALPlus | $HL) $CM* $IN;
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
$EX $CM* $IN;
($ID | $EB | $EM) $CM* $IN;
$IN $CM* $IN;
$NU $CM* $IN;
# LB 22 Do not break before ellipses
#
$LB20NonBreaks $CM* $IN;
^$CM+ $IN;
# $LB 23
# LB 23
#
($ALPlus | $HL) $CM* $NU;
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
@ -346,15 +350,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP;
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.

View File

@ -6,7 +6,14 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0,
# with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
# there is a boundary preceding the hyphen. See rule 20.9
#
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
# It sets characters of class CJ to behave like NS.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -172,7 +179,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.2: . CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);

View File

@ -6,7 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -61,6 +61,20 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -165,11 +179,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -196,13 +206,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,7 +6,7 @@
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters.
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,16 +6,15 @@
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters.
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,20 +6,15 @@
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,20 +6,15 @@
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), whatever precedes it.
# NOTE(review): the catch-all rule `LB22: . CM* IN` appears alongside the per-class rules
# LB22.1 - LB22.5; assuming `.` matches any character, it subsumes all of them.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96831df582da28121b19cf3faed7e84529ad0c1113b14cad0e01fabd4875c679
size 12998991
oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6
size 12998988

View File

@ -6,7 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -61,6 +61,20 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14:
# OP and CP with the East_Asian_Width (ea) Fullwidth (F), Wide (W) and Halfwidth (H)
# characters removed.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -165,11 +179,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), whatever precedes it.
# NOTE(review): the catch-all rule `LB22: . CM* IN` appears alongside the per-class rules
# LB22.1 - LB22.5; assuming `.` matches any character, it subsumes all of them.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -196,13 +206,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,7 +6,7 @@
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14:
# OP and CP with the East_Asian_Width (ea) Fullwidth (F), Wide (W) and Halfwidth (H)
# characters removed.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), except after another IN.
# NOTE(review): assuming [^IN] matches any character outside the IN class, the rule
# `LB22: [^IN] CM* IN` subsumes the per-class rules LB22.1 - LB22.5 listed with it.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters.
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,16 +6,15 @@
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14:
# OP and CP with the East_Asian_Width (ea) Fullwidth (F), Wide (W) and Halfwidth (H)
# characters removed.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), except after another IN.
# NOTE(review): assuming [^IN] matches any character outside the IN class, the rule
# `LB22: [^IN] CM* IN` subsumes the per-class rules LB22.1 - LB22.5 listed with it.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters.
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,20 +6,15 @@
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14:
# OP and CP with the East_Asian_Width (ea) Fullwidth (F), Wide (W) and Halfwidth (H)
# characters removed.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), whatever precedes it.
# NOTE(review): the catch-all rule `LB22: . CM* IN` appears alongside the per-class rules
# LB22.1 - LB22.5; assuming `.` matches any character, it subsumes all of them.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;

View File

@ -6,20 +6,15 @@
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
# Unicode Standard Annex #14
# http://www.unicode.org/reports/tr14/, tailored as noted below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14:
# OP and CP with the East_Asian_Width (ea) Fullwidth (F), Wide (W) and Halfwidth (H)
# characters removed.
# Limitations of this monkey test rule parser require that these definitions be pulled out
# rather than appearing in-line in LB 30.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
# LB22 No break before IN (the Inseparable class), whatever precedes it.
# NOTE(review): the catch-all rule `LB22: . CM* IN` appears alongside the per-class rules
# LB22.1 - LB22.5; assuming `.` matches any character, it subsumes all of them.
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB22: . CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 No break between AL, HL or NU and a following opening punctuation, nor between a
# closing punctuation and a following AL, HL or NU.
# LB30 is adjusted for unattached leading CM being treated as AL.
# NOTE(review): LB30.1 / LB30.2 appear in two forms, one using OP / CP and one using the
# narrower OP30 / CP30 sets (OP / CP minus East Asian Fullwidth, Wide and Halfwidth).
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
LB30.1: (AL | CM | HL | NU) CM* OP30;
LB30.2: CP30 CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
# LB30a.1 / LB30a.2 place a break (÷) after an RI pair before any class not in the bracketed
# list; LB30a.3 keeps the pair together with a following character from the list.
# The second group of three rules adds IN to those lists.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;