ICU-7270 Line Break rule LB8 updated for UAX-14 conformance. (#41)

Includes all line break tailorings.
Corresponding updates to monkey test rules.
State table builder, fix missed table optimization, uncovered by new rule.
This commit is contained in:
Andy Heninger 2018-08-09 11:28:55 -07:00 committed by Shane Carr
parent 4e49234da9
commit fa5ae3dc45
No known key found for this signature in database
GPG Key ID: FCED3B24AAB18B5C
27 changed files with 276 additions and 211 deletions

View File

@ -303,17 +303,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
}
// Optimize the forward state table, repeating until a complete pass changes nothing.
// Each pass merges duplicate character classes (table columns) and then removes
// duplicate states (table rows). Any change sets didSomething, which triggers
// another pass over both steps.
void RBBIRuleBuilder::optimizeTables() {
bool didSomething;
do {
didSomething = false;
// Begin looking for duplicates with char class 3.
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
// and should not have other categories merged into them.
IntPair duplPair = {3, 0};
while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
// Fold the duplicate category into the one kept, then drop its table column.
fSetBuilder->mergeCategories(duplPair);
fForwardTable->removeColumn(duplPair.second);
didSomething = true;
}
// NOTE(review): this listing is a rendered diff without +/- markers. The unconditional
// call below looks like the pre-change line that the counting loop after it replaces;
// if both were really present, states removed by this first call would not set
// didSomething. Confirm against the repository.
fForwardTable->removeDuplicateStates();
// removeDuplicateStates() returns the number of states it removed; keep calling
// until a call removes none.
while (fForwardTable->removeDuplicateStates() > 0) {
didSomething = true;
}
} while (didSomething);
}
U_NAMESPACE_END

View File

@ -1245,12 +1245,16 @@ void RBBITableBuilder::removeSafeState(IntPair duplStates) {
/*
* RemoveDuplicateStates
*/
void RBBITableBuilder::removeDuplicateStates() {
int32_t RBBITableBuilder::removeDuplicateStates() {
IntPair dupls = {3, 0};
int32_t numStatesRemoved = 0;
while (findDuplicateState(&dupls)) {
// printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
removeState(dupls);
++numStatesRemoved;
}
return numStatesRemoved;
}

View File

@ -66,8 +66,11 @@ public:
*/
void removeColumn(int32_t column);
/** Check for, and remove duplicate states (table rows). */
void removeDuplicateStates();
/**
* Check for, and remove duplicate states (table rows).
* @return the number of states removed.
*/
int32_t removeDuplicateStates();
/** Build the safe reverse table from the already-constructed forward table. */
void buildSafeReverseTable(UErrorCode &status);

View File

@ -132,12 +132,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -138,12 +138,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -141,12 +141,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -151,12 +151,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -137,12 +137,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -139,12 +139,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW];
#
# LB 8 Break after zero width space
# TODO: ZW SP* <break>
# An engine change is required to write the reverse rule for this.
# For now, leave the Unicode 5.2 rule, ZW <break>
# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#

View File

@ -1283,35 +1283,28 @@ void RBBITest::TestUnicodeFiles() {
// Check for test cases from the Unicode test data files that are known to fail
// and should be skipped because ICU is not yet able to fully implement the spec.
// See ticket #7270.
// and should be skipped as known issues because ICU does not fully implement
// the Unicode specifications.
//
// Test cases are identified by the test data sequence, which tends to be more stable
// across Unicode versions than the test file line numbers.
//
// The test case with ticket "10666" is a dummy, included as an example.
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
static struct TestCase {
const char *fTicketNum;
const char *fFileName;
const UChar *fString;
} badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
{"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
{"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
{"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
{"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
{"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
{"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
// Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
{"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
{"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
{"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
// Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
{"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
{"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
} badTestCases[] = {
{"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration.
};
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
const TestCase &badCase = badTestCases[n];
if (!strcmp(fileName, badCase.fFileName) &&
testCase == UnicodeString(badCase.fString)) {
return logKnownIssue("7270");
return logKnownIssue(badCase.fTicketNum);
}
}
return FALSE;
@ -2550,7 +2543,7 @@ private:
UnicodeSet *fXX;
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZJ;
UnicodeSet *fZWJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
@ -2615,7 +2608,7 @@ RBBILineMonkey::RBBILineMonkey() :
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
if (U_FAILURE(status)) {
deferredStatus = status;
@ -2627,7 +2620,7 @@ RBBILineMonkey::RBBILineMonkey() :
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
fCM->addAll(*fZJ); // ZWJ behaves as a CM.
fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
@ -2669,7 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() :
fSets->addElement(fSG, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
fSets->addElement(fZJ, status);
fSets->addElement(fZWJ, status);
const char *rules =
@ -2853,7 +2846,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 8 Break after zero width space
if (fZW->contains(prevChar)) {
// ZW SP* ÷
// Scan backwards from prevChar for SP* ZW
tPos = prevPos;
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
tPos = fText->moveIndex32(tPos, -1);
}
if (fZW->contains(fText->char32At(tPos))) {
break;
}
@ -2890,7 +2889,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
if (fZJ->contains(prevC)) {
if (fZWJ->contains(prevC)) {
continue;
}
}
@ -3148,12 +3147,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB30a RI RI <break> RI
// LB30a RI RI ÷ RI
// RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
// Two Regional Indicators have been paired.
// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
// following RI. This is a hack.
thisChar = -1;
continue;
}
@ -3220,7 +3223,7 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fXX;
delete fEB;
delete fEM;
delete fZJ;
delete fZWJ;
delete fCharBI;
delete fNumberMatcher;

View File

@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -66,7 +66,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -86,14 +86,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -189,7 +191,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -74,7 +74,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -94,14 +94,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -197,7 +199,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -91,7 +91,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -111,14 +111,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -218,7 +220,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -80,7 +80,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -100,14 +100,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -203,7 +205,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -82,7 +82,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -105,14 +105,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -212,7 +214,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -331,6 +331,9 @@ class RBBIRuleBuilder {
}
void optimizeTables() {
boolean didSomething;
do {
didSomething = false;
// Begin looking for duplicates with char class 3.
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
// and should not have other categories merged into them.
@ -338,7 +341,11 @@ class RBBIRuleBuilder {
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
fSetBuilder.mergeCategories(duplPair);
fForwardTable.removeColumn(duplPair.second);
didSomething = true;
}
fForwardTable.removeDuplicateStates();
while (fForwardTable.removeDuplicateStates() > 0) {
didSomething = true;
};
} while (didSomething);
}
}

View File

@ -1032,14 +1032,19 @@ class RBBITableBuilder {
/**
* Check for, and remove duplicate states (table rows).
* @return the number of states removed.
* @internal
*/
void removeDuplicateStates() {
int removeDuplicateStates() {
    // Running total of states dropped; returned to the caller.
    int removedCount = 0;
    // The search for duplicate pairs starts at (3, 0).
    // NOTE(review): presumably the lower-numbered states are reserved - confirm.
    IntPair statePair = new IntPair(3, 0);
    // Each duplicate pair that findDuplicateState() reports is removed and counted.
    for (; findDuplicateState(statePair); ++removedCount) {
        // System.out.printf("Removing duplicate states (%d, %d)\n", statePair.first, statePair.second);
        removeState(statePair);
    }
    return removedCount;
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0
size 12508534
oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143
size 12510210

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac
oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634
size 92857

View File

@ -651,54 +651,68 @@ public class RBBITestMonkey extends TestFmwk {
int fOrigPositions;
// XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
// throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
// The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1
// to represent a non-codepoint that is not included in any of the property sets.
// This happens for rule 30a.
class XUnicodeSet extends UnicodeSet {
    /** Builds the set from a UnicodeSet pattern string. */
    XUnicodeSet(String pattern) {
        super(pattern);
    }

    /**
     * Membership test that answers false for values outside
     * UnicodeSet.MIN_VALUE..UnicodeSet.MAX_VALUE; in-range values are
     * delegated to UnicodeSet.contains().
     */
    @Override
    public boolean contains(int codePoint) {
        boolean outOfRange =
                codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE;
        if (outOfRange) {
            return false;
        }
        return super.contains(codePoint);
    }
}
RBBILineMonkey()
{
fCharProperty = UProperty.LINE_BREAK;
fSets = new ArrayList();
fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
fCR = new UnicodeSet("[\\p{Line_break=CR}]");
fLF = new UnicodeSet("[\\p{Line_break=LF}]");
fCM = new UnicodeSet("[\\p{Line_break=CM}]");
fNL = new UnicodeSet("[\\p{Line_break=NL}]");
fSG = new UnicodeSet("[\\ud800-\\udfff]");
fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
fGL = new UnicodeSet("[\\p{Line_break=GL}]");
fSP = new UnicodeSet("[\\p{Line_break=SP}]");
fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
fBA = new UnicodeSet("[\\p{Line_break=BA}]");
fBB = new UnicodeSet("[\\p{Line_break=BB}]");
fHY = new UnicodeSet("[\\p{Line_break=HY}]");
fCB = new UnicodeSet("[\\p{Line_break=CB}]");
fCL = new UnicodeSet("[\\p{Line_break=CL}]");
fCP = new UnicodeSet("[\\p{Line_break=CP}]");
fEX = new UnicodeSet("[\\p{Line_break=EX}]");
fIN = new UnicodeSet("[\\p{Line_break=IN}]");
fNS = new UnicodeSet("[\\p{Line_break=NS}]");
fOP = new UnicodeSet("[\\p{Line_break=OP}]");
fQU = new UnicodeSet("[\\p{Line_break=QU}]");
fIS = new UnicodeSet("[\\p{Line_break=IS}]");
fNU = new UnicodeSet("[\\p{Line_break=NU}]");
fPO = new UnicodeSet("[\\p{Line_break=PO}]");
fPR = new UnicodeSet("[\\p{Line_break=PR}]");
fSY = new UnicodeSet("[\\p{Line_break=SY}]");
fAI = new UnicodeSet("[\\p{Line_break=AI}]");
fAL = new UnicodeSet("[\\p{Line_break=AL}]");
fCJ = new UnicodeSet("[\\p{Line_break=CJ}]");
fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
fHL = new UnicodeSet("[\\p{Line_break=HL}]");
fID = new UnicodeSet("[\\p{Line_break=ID}]");
fJL = new UnicodeSet("[\\p{Line_break=JL}]");
fJV = new UnicodeSet("[\\p{Line_break=JV}]");
fJT = new UnicodeSet("[\\p{Line_break=JT}]");
fRI = new UnicodeSet("[\\p{Line_break=RI}]");
fXX = new UnicodeSet("[\\p{Line_break=XX}]");
fEB = new UnicodeSet("[\\p{Line_break=EB}]");
fEM = new UnicodeSet("[\\p{Line_break=EM}]");
fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
fBK = new XUnicodeSet("[\\p{Line_Break=BK}]");
fCR = new XUnicodeSet("[\\p{Line_break=CR}]");
fLF = new XUnicodeSet("[\\p{Line_break=LF}]");
fCM = new XUnicodeSet("[\\p{Line_break=CM}]");
fNL = new XUnicodeSet("[\\p{Line_break=NL}]");
fSG = new XUnicodeSet("[\\ud800-\\udfff]");
fWJ = new XUnicodeSet("[\\p{Line_break=WJ}]");
fZW = new XUnicodeSet("[\\p{Line_break=ZW}]");
fGL = new XUnicodeSet("[\\p{Line_break=GL}]");
fSP = new XUnicodeSet("[\\p{Line_break=SP}]");
fB2 = new XUnicodeSet("[\\p{Line_break=B2}]");
fBA = new XUnicodeSet("[\\p{Line_break=BA}]");
fBB = new XUnicodeSet("[\\p{Line_break=BB}]");
fHY = new XUnicodeSet("[\\p{Line_break=HY}]");
fCB = new XUnicodeSet("[\\p{Line_break=CB}]");
fCL = new XUnicodeSet("[\\p{Line_break=CL}]");
fCP = new XUnicodeSet("[\\p{Line_break=CP}]");
fEX = new XUnicodeSet("[\\p{Line_break=EX}]");
fIN = new XUnicodeSet("[\\p{Line_break=IN}]");
fNS = new XUnicodeSet("[\\p{Line_break=NS}]");
fOP = new XUnicodeSet("[\\p{Line_break=OP}]");
fQU = new XUnicodeSet("[\\p{Line_break=QU}]");
fIS = new XUnicodeSet("[\\p{Line_break=IS}]");
fNU = new XUnicodeSet("[\\p{Line_break=NU}]");
fPO = new XUnicodeSet("[\\p{Line_break=PO}]");
fPR = new XUnicodeSet("[\\p{Line_break=PR}]");
fSY = new XUnicodeSet("[\\p{Line_break=SY}]");
fAI = new XUnicodeSet("[\\p{Line_break=AI}]");
fAL = new XUnicodeSet("[\\p{Line_break=AL}]");
fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]");
fH2 = new XUnicodeSet("[\\p{Line_break=H2}]");
fH3 = new XUnicodeSet("[\\p{Line_break=H3}]");
fHL = new XUnicodeSet("[\\p{Line_break=HL}]");
fID = new XUnicodeSet("[\\p{Line_break=ID}]");
fJL = new XUnicodeSet("[\\p{Line_break=JL}]");
fJV = new XUnicodeSet("[\\p{Line_break=JV}]");
fJT = new XUnicodeSet("[\\p{Line_break=JT}]");
fRI = new XUnicodeSet("[\\p{Line_break=RI}]");
fXX = new XUnicodeSet("[\\p{Line_break=XX}]");
fEB = new XUnicodeSet("[\\p{Line_break=EB}]");
fEM = new XUnicodeSet("[\\p{Line_break=EM}]");
fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
// Remove dictionary characters.
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
@ -886,7 +900,13 @@ public class RBBITestMonkey extends TestFmwk {
}
// LB 8 Break after zero width space
if (fZW.contains(prevChar)) {
// ZW SP* ÷
// Scan backwards from prevChar for SP* ZW
tPos = prevPos;
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
tPos = moveIndex32(fText, tPos, -1);
}
if (fZW.contains(UTF16.charAt(fText, tPos))) {
break;
}
@ -1166,12 +1186,16 @@ public class RBBITestMonkey extends TestFmwk {
}
// LB 30a Break between pairs of Regional Indicators.
// RI RI <break> RI
// RI RI ÷ RI
// RI x RI
if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
break;
}
if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
// Two Regional Indicators have been paired.
// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
// following RI. This is a hack.
thisChar = -1;
continue;
}

View File

@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM_ = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -66,7 +66,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM_ ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -86,14 +86,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -189,7 +191,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM_ = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -74,7 +74,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM_ ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -94,14 +94,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -197,7 +199,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM_ = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -91,7 +91,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM_ ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -111,14 +111,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -218,7 +220,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM_ = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -80,7 +80,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM_ ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -100,14 +100,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -203,7 +205,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier

View File

@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM_ = [:LineBreak = Combining_Mark:];
CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -82,7 +82,7 @@ dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM_ ZWJ];
CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -105,14 +105,16 @@ LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
# and LB8 should take precedence.
LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
# LB7 Do not break before spaces or zero width space.
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
@ -212,7 +214,7 @@ LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier