ICU-7270 Line Break rule LB8 updated for UAX-14 conformance. (#41)
Includes all line break tailorings, with corresponding updates to the monkey test rules. State table builder: fix a missed table optimization that was uncovered by the new rule.
This commit is contained in:
parent
4e49234da9
commit
fa5ae3dc45
@ -303,17 +303,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
||||
}
|
||||
|
||||
void RBBIRuleBuilder::optimizeTables() {
|
||||
bool didSomething;
|
||||
do {
|
||||
didSomething = false;
|
||||
|
||||
// Begin looking for duplicates with char class 3.
|
||||
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
|
||||
// and should not have other categories merged into them.
|
||||
IntPair duplPair = {3, 0};
|
||||
// Begin looking for duplicates with char class 3.
|
||||
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
|
||||
// and should not have other categories merged into them.
|
||||
IntPair duplPair = {3, 0};
|
||||
while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
|
||||
fSetBuilder->mergeCategories(duplPair);
|
||||
fForwardTable->removeColumn(duplPair.second);
|
||||
didSomething = true;
|
||||
}
|
||||
|
||||
while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
|
||||
fSetBuilder->mergeCategories(duplPair);
|
||||
fForwardTable->removeColumn(duplPair.second);
|
||||
}
|
||||
fForwardTable->removeDuplicateStates();
|
||||
while (fForwardTable->removeDuplicateStates() > 0) {
|
||||
didSomething = true;
|
||||
}
|
||||
} while (didSomething);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -1245,12 +1245,16 @@ void RBBITableBuilder::removeSafeState(IntPair duplStates) {
|
||||
/*
|
||||
* RemoveDuplicateStates
|
||||
*/
|
||||
void RBBITableBuilder::removeDuplicateStates() {
|
||||
int32_t RBBITableBuilder::removeDuplicateStates() {
|
||||
IntPair dupls = {3, 0};
|
||||
int32_t numStatesRemoved = 0;
|
||||
|
||||
while (findDuplicateState(&dupls)) {
|
||||
// printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
|
||||
removeState(dupls);
|
||||
++numStatesRemoved;
|
||||
}
|
||||
return numStatesRemoved;
|
||||
}
|
||||
|
||||
|
||||
|
@ -66,8 +66,11 @@ public:
|
||||
*/
|
||||
void removeColumn(int32_t column);
|
||||
|
||||
/** Check for, and remove duplicate states (table rows). */
|
||||
void removeDuplicateStates();
|
||||
/**
|
||||
* Check for, and remove duplicate states (table rows).
|
||||
* @return the number of states removed.
|
||||
*/
|
||||
int32_t removeDuplicateStates();
|
||||
|
||||
/** Build the safe reverse table from the already-constructed forward table. */
|
||||
void buildSafeReverseTable(UErrorCode &status);
|
||||
|
@ -132,12 +132,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -138,12 +138,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -141,12 +141,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -151,12 +151,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -137,12 +137,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -139,12 +139,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# TODO: ZW SP* <break>
|
||||
# An engine change is required to write the reverse rule for this.
|
||||
# For now, leave the Unicode 5.2 rule, ZW <break>
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
|
@ -1283,35 +1283,28 @@ void RBBITest::TestUnicodeFiles() {
|
||||
|
||||
|
||||
// Check for test cases from the Unicode test data files that are known to fail
|
||||
// and should be skipped because ICU is not yet able to fully implement the spec.
|
||||
// See ticket #7270.
|
||||
// and should be skipped as known issues because ICU does not fully implement
|
||||
// the Unicode specifications.
|
||||
//
|
||||
// Test cases are identified by the test data sequence, which tends to be more stable
|
||||
// across Unicode versions than the test file line numbers.
|
||||
//
|
||||
// The test case with ticket "10666" is a dummy, included as an example.
|
||||
|
||||
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
|
||||
static struct TestCase {
|
||||
const char *fTicketNum;
|
||||
const char *fFileName;
|
||||
const UChar *fString;
|
||||
} badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
|
||||
{"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
|
||||
{"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
|
||||
{"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
|
||||
{"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
|
||||
{"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
|
||||
{"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
|
||||
// Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
|
||||
{"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
|
||||
{"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
|
||||
{"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
|
||||
|
||||
// Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
|
||||
{"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
|
||||
{"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
|
||||
} badTestCases[] = {
|
||||
{"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration.
|
||||
};
|
||||
|
||||
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
|
||||
const TestCase &badCase = badTestCases[n];
|
||||
if (!strcmp(fileName, badCase.fFileName) &&
|
||||
testCase == UnicodeString(badCase.fString)) {
|
||||
return logKnownIssue("7270");
|
||||
return logKnownIssue(badCase.fTicketNum);
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
@ -2550,7 +2543,7 @@ private:
|
||||
UnicodeSet *fXX;
|
||||
UnicodeSet *fEB;
|
||||
UnicodeSet *fEM;
|
||||
UnicodeSet *fZJ;
|
||||
UnicodeSet *fZWJ;
|
||||
|
||||
BreakIterator *fCharBI;
|
||||
const UnicodeString *fText;
|
||||
@ -2615,7 +2608,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
||||
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
|
||||
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
|
||||
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
|
||||
fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
|
||||
fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
@ -2627,7 +2620,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
||||
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
|
||||
|
||||
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
|
||||
fCM->addAll(*fZJ); // ZWJ behaves as a CM.
|
||||
fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
|
||||
|
||||
fSets->addElement(fBK, status);
|
||||
fSets->addElement(fCR, status);
|
||||
@ -2669,7 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
||||
fSets->addElement(fSG, status);
|
||||
fSets->addElement(fEB, status);
|
||||
fSets->addElement(fEM, status);
|
||||
fSets->addElement(fZJ, status);
|
||||
fSets->addElement(fZWJ, status);
|
||||
|
||||
|
||||
const char *rules =
|
||||
@ -2853,7 +2846,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
}
|
||||
|
||||
// LB 8 Break after zero width space
|
||||
if (fZW->contains(prevChar)) {
|
||||
// ZW SP* ÷
|
||||
// Scan backwards from prevChar for SP* ZW
|
||||
tPos = prevPos;
|
||||
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
|
||||
tPos = fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
if (fZW->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2890,7 +2889,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
{
|
||||
int32_t prevIdx = fText->moveIndex32(pos, -1);
|
||||
UChar32 prevC = fText->char32At(prevIdx);
|
||||
if (fZJ->contains(prevC)) {
|
||||
if (fZWJ->contains(prevC)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -3148,12 +3147,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB30a RI RI <break> RI
|
||||
// RI x RI
|
||||
// LB30a RI RI ÷ RI
|
||||
// RI x RI
|
||||
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
|
||||
break;
|
||||
}
|
||||
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
|
||||
// Two Regional Indicators have been paired.
|
||||
// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
|
||||
// following RI. This is a hack.
|
||||
thisChar = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3220,7 +3223,7 @@ RBBILineMonkey::~RBBILineMonkey() {
|
||||
delete fXX;
|
||||
delete fEB;
|
||||
delete fEM;
|
||||
delete fZJ;
|
||||
delete fZWJ;
|
||||
|
||||
delete fCharBI;
|
||||
delete fNumberMatcher;
|
||||
|
20
icu4c/source/test/testdata/break_rules/line.txt
vendored
20
icu4c/source/test/testdata/break_rules/line.txt
vendored
@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -66,7 +66,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -86,14 +86,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -188,8 +190,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -74,7 +74,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -94,14 +94,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -196,8 +198,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -91,7 +91,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -111,14 +111,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -217,8 +219,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -80,7 +80,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -100,14 +100,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -202,8 +204,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -82,7 +82,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -105,14 +105,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -211,8 +213,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -331,14 +331,21 @@ class RBBIRuleBuilder {
|
||||
}
|
||||
|
||||
void optimizeTables() {
|
||||
// Begin looking for duplicates with char class 3.
|
||||
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
|
||||
// and should not have other categories merged into them.
|
||||
IntPair duplPair = new IntPair(3, 0);
|
||||
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
|
||||
fSetBuilder.mergeCategories(duplPair);
|
||||
fForwardTable.removeColumn(duplPair.second);
|
||||
}
|
||||
fForwardTable.removeDuplicateStates();
|
||||
boolean didSomething;
|
||||
do {
|
||||
didSomething = false;
|
||||
// Begin looking for duplicates with char class 3.
|
||||
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
|
||||
// and should not have other categories merged into them.
|
||||
IntPair duplPair = new IntPair(3, 0);
|
||||
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
|
||||
fSetBuilder.mergeCategories(duplPair);
|
||||
fForwardTable.removeColumn(duplPair.second);
|
||||
didSomething = true;
|
||||
}
|
||||
while (fForwardTable.removeDuplicateStates() > 0) {
|
||||
didSomething = true;
|
||||
};
|
||||
} while (didSomething);
|
||||
}
|
||||
}
|
||||
|
@ -1032,14 +1032,19 @@ class RBBITableBuilder {
|
||||
|
||||
/**
|
||||
* Check for, and remove duplicate states (table rows).
|
||||
* @return the number of states removed.
|
||||
* @internal
|
||||
*/
|
||||
void removeDuplicateStates() {
|
||||
int removeDuplicateStates() {
|
||||
IntPair dupls = new IntPair(3, 0);
|
||||
int numStatesRemoved = 0;
|
||||
|
||||
while (findDuplicateState(dupls)) {
|
||||
// System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
|
||||
removeState(dupls);
|
||||
++numStatesRemoved;
|
||||
}
|
||||
return numStatesRemoved;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0
|
||||
size 12508534
|
||||
oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143
|
||||
size 12510210
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac
|
||||
oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634
|
||||
size 92857
|
||||
|
@ -651,54 +651,68 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
int fOrigPositions;
|
||||
|
||||
|
||||
// XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
|
||||
// throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
|
||||
// The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1
|
||||
// to represent a non-codepoint that is not included in any of the property sets.
|
||||
// This happens for rule 30a.
|
||||
|
||||
class XUnicodeSet extends UnicodeSet {
|
||||
XUnicodeSet(String pattern) { super(pattern); }
|
||||
@Override
|
||||
public boolean contains(int codePoint) {
|
||||
return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
|
||||
false : super.contains(codePoint);
|
||||
}
|
||||
}
|
||||
|
||||
RBBILineMonkey()
|
||||
{
|
||||
fCharProperty = UProperty.LINE_BREAK;
|
||||
fSets = new ArrayList();
|
||||
|
||||
fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
|
||||
fCR = new UnicodeSet("[\\p{Line_break=CR}]");
|
||||
fLF = new UnicodeSet("[\\p{Line_break=LF}]");
|
||||
fCM = new UnicodeSet("[\\p{Line_break=CM}]");
|
||||
fNL = new UnicodeSet("[\\p{Line_break=NL}]");
|
||||
fSG = new UnicodeSet("[\\ud800-\\udfff]");
|
||||
fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
|
||||
fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
|
||||
fGL = new UnicodeSet("[\\p{Line_break=GL}]");
|
||||
fSP = new UnicodeSet("[\\p{Line_break=SP}]");
|
||||
fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
|
||||
fBA = new UnicodeSet("[\\p{Line_break=BA}]");
|
||||
fBB = new UnicodeSet("[\\p{Line_break=BB}]");
|
||||
fHY = new UnicodeSet("[\\p{Line_break=HY}]");
|
||||
fCB = new UnicodeSet("[\\p{Line_break=CB}]");
|
||||
fCL = new UnicodeSet("[\\p{Line_break=CL}]");
|
||||
fCP = new UnicodeSet("[\\p{Line_break=CP}]");
|
||||
fEX = new UnicodeSet("[\\p{Line_break=EX}]");
|
||||
fIN = new UnicodeSet("[\\p{Line_break=IN}]");
|
||||
fNS = new UnicodeSet("[\\p{Line_break=NS}]");
|
||||
fOP = new UnicodeSet("[\\p{Line_break=OP}]");
|
||||
fQU = new UnicodeSet("[\\p{Line_break=QU}]");
|
||||
fIS = new UnicodeSet("[\\p{Line_break=IS}]");
|
||||
fNU = new UnicodeSet("[\\p{Line_break=NU}]");
|
||||
fPO = new UnicodeSet("[\\p{Line_break=PO}]");
|
||||
fPR = new UnicodeSet("[\\p{Line_break=PR}]");
|
||||
fSY = new UnicodeSet("[\\p{Line_break=SY}]");
|
||||
fAI = new UnicodeSet("[\\p{Line_break=AI}]");
|
||||
fAL = new UnicodeSet("[\\p{Line_break=AL}]");
|
||||
fCJ = new UnicodeSet("[\\p{Line_break=CJ}]");
|
||||
fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
|
||||
fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
|
||||
fHL = new UnicodeSet("[\\p{Line_break=HL}]");
|
||||
fID = new UnicodeSet("[\\p{Line_break=ID}]");
|
||||
fJL = new UnicodeSet("[\\p{Line_break=JL}]");
|
||||
fJV = new UnicodeSet("[\\p{Line_break=JV}]");
|
||||
fJT = new UnicodeSet("[\\p{Line_break=JT}]");
|
||||
fRI = new UnicodeSet("[\\p{Line_break=RI}]");
|
||||
fXX = new UnicodeSet("[\\p{Line_break=XX}]");
|
||||
fEB = new UnicodeSet("[\\p{Line_break=EB}]");
|
||||
fEM = new UnicodeSet("[\\p{Line_break=EM}]");
|
||||
fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
|
||||
fBK = new XUnicodeSet("[\\p{Line_Break=BK}]");
|
||||
fCR = new XUnicodeSet("[\\p{Line_break=CR}]");
|
||||
fLF = new XUnicodeSet("[\\p{Line_break=LF}]");
|
||||
fCM = new XUnicodeSet("[\\p{Line_break=CM}]");
|
||||
fNL = new XUnicodeSet("[\\p{Line_break=NL}]");
|
||||
fSG = new XUnicodeSet("[\\ud800-\\udfff]");
|
||||
fWJ = new XUnicodeSet("[\\p{Line_break=WJ}]");
|
||||
fZW = new XUnicodeSet("[\\p{Line_break=ZW}]");
|
||||
fGL = new XUnicodeSet("[\\p{Line_break=GL}]");
|
||||
fSP = new XUnicodeSet("[\\p{Line_break=SP}]");
|
||||
fB2 = new XUnicodeSet("[\\p{Line_break=B2}]");
|
||||
fBA = new XUnicodeSet("[\\p{Line_break=BA}]");
|
||||
fBB = new XUnicodeSet("[\\p{Line_break=BB}]");
|
||||
fHY = new XUnicodeSet("[\\p{Line_break=HY}]");
|
||||
fCB = new XUnicodeSet("[\\p{Line_break=CB}]");
|
||||
fCL = new XUnicodeSet("[\\p{Line_break=CL}]");
|
||||
fCP = new XUnicodeSet("[\\p{Line_break=CP}]");
|
||||
fEX = new XUnicodeSet("[\\p{Line_break=EX}]");
|
||||
fIN = new XUnicodeSet("[\\p{Line_break=IN}]");
|
||||
fNS = new XUnicodeSet("[\\p{Line_break=NS}]");
|
||||
fOP = new XUnicodeSet("[\\p{Line_break=OP}]");
|
||||
fQU = new XUnicodeSet("[\\p{Line_break=QU}]");
|
||||
fIS = new XUnicodeSet("[\\p{Line_break=IS}]");
|
||||
fNU = new XUnicodeSet("[\\p{Line_break=NU}]");
|
||||
fPO = new XUnicodeSet("[\\p{Line_break=PO}]");
|
||||
fPR = new XUnicodeSet("[\\p{Line_break=PR}]");
|
||||
fSY = new XUnicodeSet("[\\p{Line_break=SY}]");
|
||||
fAI = new XUnicodeSet("[\\p{Line_break=AI}]");
|
||||
fAL = new XUnicodeSet("[\\p{Line_break=AL}]");
|
||||
fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]");
|
||||
fH2 = new XUnicodeSet("[\\p{Line_break=H2}]");
|
||||
fH3 = new XUnicodeSet("[\\p{Line_break=H3}]");
|
||||
fHL = new XUnicodeSet("[\\p{Line_break=HL}]");
|
||||
fID = new XUnicodeSet("[\\p{Line_break=ID}]");
|
||||
fJL = new XUnicodeSet("[\\p{Line_break=JL}]");
|
||||
fJV = new XUnicodeSet("[\\p{Line_break=JV}]");
|
||||
fJT = new XUnicodeSet("[\\p{Line_break=JT}]");
|
||||
fRI = new XUnicodeSet("[\\p{Line_break=RI}]");
|
||||
fXX = new XUnicodeSet("[\\p{Line_break=XX}]");
|
||||
fEB = new XUnicodeSet("[\\p{Line_break=EB}]");
|
||||
fEM = new XUnicodeSet("[\\p{Line_break=EM}]");
|
||||
fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
|
||||
|
||||
// Remove dictionary characters.
|
||||
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
|
||||
@ -886,7 +900,13 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
}
|
||||
|
||||
// LB 8 Break after zero width space
|
||||
if (fZW.contains(prevChar)) {
|
||||
// ZW SP* ÷
|
||||
// Scan backwards from prevChar for SP* ZW
|
||||
tPos = prevPos;
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
tPos = moveIndex32(fText, tPos, -1);
|
||||
}
|
||||
if (fZW.contains(UTF16.charAt(fText, tPos))) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1166,12 +1186,16 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
}
|
||||
|
||||
// LB 30a Break between pairs of Regional Indicators.
|
||||
// RI RI <break> RI
|
||||
// RI x RI
|
||||
// RI RI ÷ RI
|
||||
// RI x RI
|
||||
if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
|
||||
break;
|
||||
}
|
||||
if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
|
||||
// Two Regional Indicators have been paired.
|
||||
// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
|
||||
// following RI. This is a hack.
|
||||
thisChar = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -66,7 +66,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM_ ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -86,14 +86,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences such as ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -188,8 +190,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -74,7 +74,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM_ ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -94,14 +94,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences such as ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -196,8 +198,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -91,7 +91,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM_ ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -111,14 +111,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences such as ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -217,8 +219,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -80,7 +80,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM_ ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -100,14 +100,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences such as ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -202,8 +204,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CMS = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
@ -82,7 +82,7 @@ dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM_ ZWJ];
|
||||
CM = [CMS ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
@ -105,14 +105,16 @@ LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
# LB8, break after ZW SP*, precedes LB7 because both rules match sequences such as ZW SP,
|
||||
# and LB8 should take precedence.
|
||||
|
||||
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
|
||||
|
||||
# LB7 Do not break before spaces or zero width space.
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
@ -211,8 +213,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
Loading…
Reference in New Issue
Block a user