ICU-13441 For zh/ja, tailor linebreak classes for quotations such as “ 201C and ” 201D
This commit is contained in:
parent
776b9d7f2b
commit
46a888be87
@ -39,7 +39,7 @@ BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt\
|
||||
|
||||
|
||||
# List of break iterator files (brk).
|
||||
BRK_SOURCE = char.txt line.txt line_loose.txt line_loose_cj.txt\
|
||||
BRK_SOURCE = char.txt line.txt line_cj.txt line_loose.txt line_loose_cj.txt\
|
||||
line_normal.txt line_normal_cj.txt sent.txt sent_el.txt title.txt\
|
||||
word.txt word_POSIX.txt
|
||||
|
||||
|
@ -6,6 +6,6 @@ ja{
|
||||
line:process(dependency){"line_normal.brk"}
|
||||
line_loose:process(dependency){"line_loose_cj.brk"}
|
||||
line_normal:process(dependency){"line_normal_cj.brk"}
|
||||
line_strict:process(dependency){"line.brk"}
|
||||
line_strict:process(dependency){"line_cj.brk"}
|
||||
}
|
||||
}
|
||||
|
338
icu4c/source/data/brkitr/rules/line_cj.txt
Normal file
338
icu4c/source/data/brkitr/rules/line_cj.txt
Normal file
@ -0,0 +1,338 @@
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
# NS includes CJ for CSS strict line breaking.
|
||||
$NS = [[:LineBreak = Nonstarter:] $CJ];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
|
||||
$dictionary = [$SA];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous East Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 6 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
@ -26,6 +26,7 @@
|
||||
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
|
||||
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
|
||||
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
|
||||
#
|
||||
@ -45,7 +46,7 @@ $BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
@ -70,12 +71,12 @@ $NL = [:LineBreak = Next_Line:];
|
||||
$NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
|
||||
$NS = [[:LineBreak = Nonstarter:] - $NSX];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
|
||||
$PO = [[:LineBreak = Postfix_Numeric:] - $POX];
|
||||
$PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
|
||||
$PR = [[:LineBreak = Prefix_Numeric:] - $PRX];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
|
@ -18,6 +18,7 @@
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
@ -36,7 +37,7 @@ $BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
@ -60,10 +61,10 @@ $NL = [:LineBreak = Next_Line:];
|
||||
$NSX = [\u301C \u30A0];
|
||||
$NS = [[:LineBreak = Nonstarter:] - $NSX];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
|
@ -3,9 +3,9 @@
|
||||
zh{
|
||||
Version{"2.1.19.14"}
|
||||
boundaries{
|
||||
line:process(dependency){"line.brk"}
|
||||
line:process(dependency){"line_cj.brk"}
|
||||
line_loose:process(dependency){"line_loose_cj.brk"}
|
||||
line_normal:process(dependency){"line_normal_cj.brk"}
|
||||
line_strict:process(dependency){"line.brk"}
|
||||
line_strict:process(dependency){"line_cj.brk"}
|
||||
}
|
||||
}
|
||||
|
@ -3,9 +3,9 @@
|
||||
zh_Hant{
|
||||
Version{"2.1.36.98"}
|
||||
boundaries{
|
||||
line:process(dependency){"line.brk"}
|
||||
line:process(dependency){"line_cj.brk"}
|
||||
line_loose:process(dependency){"line_loose_cj.brk"}
|
||||
line_normal:process(dependency){"line_normal_cj.brk"}
|
||||
line_strict:process(dependency){"line.brk"}
|
||||
line_strict:process(dependency){"line_cj.brk"}
|
||||
}
|
||||
}
|
||||
|
@ -21,7 +21,7 @@
|
||||
<icu:line icu:dependency="line_normal.brk"/>
|
||||
<icu:line alt="loose" icu:dependency="line_loose_cj.brk"/>
|
||||
<icu:line alt="normal" icu:dependency="line_normal_cj.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line_cj.brk"/>
|
||||
</icu:boundaries>
|
||||
</icu:breakIteratorData>
|
||||
</special>
|
||||
|
@ -19,10 +19,10 @@
|
||||
<special xmlns:icu="http://www.icu-project.org/">
|
||||
<icu:breakIteratorData>
|
||||
<icu:boundaries>
|
||||
<icu:line icu:dependency="line.brk"/>
|
||||
<icu:line icu:dependency="line_cj.brk"/>
|
||||
<icu:line alt="loose" icu:dependency="line_loose_cj.brk"/>
|
||||
<icu:line alt="normal" icu:dependency="line_normal_cj.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line_cj.brk"/>
|
||||
</icu:boundaries>
|
||||
</icu:breakIteratorData>
|
||||
</special>
|
||||
|
@ -19,10 +19,10 @@
|
||||
<special xmlns:icu="http://www.icu-project.org/">
|
||||
<icu:breakIteratorData>
|
||||
<icu:boundaries>
|
||||
<icu:line icu:dependency="line.brk"/>
|
||||
<icu:line icu:dependency="line_cj.brk"/>
|
||||
<icu:line alt="loose" icu:dependency="line_loose_cj.brk"/>
|
||||
<icu:line alt="normal" icu:dependency="line_normal_cj.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line.brk"/>
|
||||
<icu:line alt="strict" icu:dependency="line_cj.brk"/>
|
||||
</icu:boundaries>
|
||||
</icu:breakIteratorData>
|
||||
</special>
|
||||
|
@ -906,7 +906,7 @@ void RBBIMonkeyTest::testMonkey() {
|
||||
UnicodeString params(fParams);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
|
||||
const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
|
||||
"line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
|
||||
NULL };
|
||||
CharString testNameFromParams;
|
||||
|
207
icu4c/source/test/testdata/break_rules/line_cj.txt
vendored
Normal file
207
icu4c/source/test/testdata/break_rules/line_cj.txt
vendored
Normal file
@ -0,0 +1,207 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: line_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
|
||||
type = line;
|
||||
locale = zh;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [:LineBreak = Ideographic:];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NS = [[:LineBreak = Nonstarter:] CJ]; # CSS Strict tailoring: CJ resolves to NS.
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches only LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
@ -30,6 +30,7 @@
|
||||
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
|
||||
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
|
||||
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
|
||||
type = line;
|
||||
@ -46,7 +47,7 @@ BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
@ -70,12 +71,12 @@ NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
|
||||
PO = [[:LineBreak = Postfix_Numeric:] - POX];
|
||||
PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
|
||||
PR = [[:LineBreak = Prefix_Numeric:] - PRX];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
|
@ -26,6 +26,7 @@
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
type = line;
|
||||
locale = ja@lb=normal;
|
||||
@ -40,7 +41,7 @@ BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
@ -63,10 +64,10 @@ NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
|
Loading…
Reference in New Issue
Block a user