ICU-7270 Line Break rule LB8 updated for UAX-14 conformance. (#41)

Includes all line break tailorings, with corresponding updates to the monkey test rules. Also fixes a missed table optimization in the state table builder that was uncovered by the new rule.
2018-08-09 11:28:55 -07:00 · 2018-08-09 11:28:55 -07:00 · fa5ae3dc45
commit fa5ae3dc45
parent 4e49234da9
27 changed files with 276 additions and 211 deletions
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -303,17 +303,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
 }

 void RBBIRuleBuilder::optimizeTables() {
+    bool didSomething;
+    do {
+        didSomething = false;

-    // Begin looking for duplicates with char class 3.
-    // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
-    // and should not have other categories merged into them.
-    IntPair duplPair = {3, 0};
+        // Begin looking for duplicates with char class 3.
+        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+        // and should not have other categories merged into them.
+        IntPair duplPair = {3, 0};
+        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
+            fSetBuilder->mergeCategories(duplPair);
+            fForwardTable->removeColumn(duplPair.second);
+            didSomething = true;
+        }

-    while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
-        fSetBuilder->mergeCategories(duplPair);
-        fForwardTable->removeColumn(duplPair.second);
-    }
-    fForwardTable->removeDuplicateStates();
+        while (fForwardTable->removeDuplicateStates() > 0) {
+            didSomething = true;
+        }
+    } while (didSomething);
 }

 U_NAMESPACE_END
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -1245,12 +1245,16 @@ void RBBITableBuilder::removeSafeState(IntPair duplStates) {
 /*
 * RemoveDuplicateStates
 */
-void RBBITableBuilder::removeDuplicateStates() {
+int32_t RBBITableBuilder::removeDuplicateStates() {
    IntPair dupls = {3, 0};
+    int32_t numStatesRemoved = 0;
+
    while (findDuplicateState(&dupls)) {
        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
        removeState(dupls);
+        ++numStatesRemoved;
    }
+    return numStatesRemoved;
 }


--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@ -66,8 +66,11 @@ public:
     */
    void     removeColumn(int32_t column);

-    /** Check for, and remove dupicate states (table rows). */
-    void     removeDuplicateStates();
+    /**
+     * Check for, and remove duplicate states (table rows).
+     * @return the number of states removed.
+     */
+    int32_t  removeDuplicateStates();

    /** Build the safe reverse table from the already-constructed forward table. */
    void     buildSafeReverseTable(UErrorCode &status);
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@ -132,12 +132,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_fi.txt
@ -138,12 +138,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@ -141,12 +141,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@ -151,12 +151,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt
@ -137,12 +137,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@ -136,12 +136,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@ -139,12 +139,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt
@ -136,12 +136,11 @@ $CAN_CM $CM*  [$SP $ZW];

 #
 # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
 #
 $LB8Breaks    = [$LB4Breaks $ZW];
 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];

 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
 #
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -1283,35 +1283,28 @@ void RBBITest::TestUnicodeFiles() {


 // Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+//
+// The test case with ticket "10666" is a dummy, included as an example.

 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
    static struct TestCase {
+        const char *fTicketNum;
        const char *fFileName;
        const UChar *fString;
-    } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
-        {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
-        {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
-        {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
-        {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
-        {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
-        {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
-                                                        // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
-        {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
-
-                                                        // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
-        {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
-        {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
+    } badTestCases[] = {
+        {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}    // Fake example, for illustration.
    };

    for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
        const TestCase &badCase = badTestCases[n];
        if (!strcmp(fileName, badCase.fFileName) &&
                testCase == UnicodeString(badCase.fString)) {
-            return logKnownIssue("7270");
+            return logKnownIssue(badCase.fTicketNum);
        }
    }
    return FALSE;
@ -2550,7 +2543,7 @@ private:
    UnicodeSet  *fXX;
    UnicodeSet  *fEB;
    UnicodeSet  *fEM;
-    UnicodeSet  *fZJ;
+    UnicodeSet  *fZWJ;

    BreakIterator        *fCharBI;
    const UnicodeString  *fText;
@ -2615,7 +2608,7 @@ RBBILineMonkey::RBBILineMonkey() :
    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
    fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
    fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
-    fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+    fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
@ -2627,7 +2620,7 @@ RBBILineMonkey::RBBILineMonkey() :
    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.

    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
-    fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
+    fCM->addAll(*fZWJ);     // ZWJ behaves as a CM.

    fSets->addElement(fBK, status);
    fSets->addElement(fCR, status);
@ -2669,7 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() :
    fSets->addElement(fSG, status);
    fSets->addElement(fEB, status);
    fSets->addElement(fEM, status);
-    fSets->addElement(fZJ, status);
+    fSets->addElement(fZWJ, status);


    const char *rules =
@ -2853,7 +2846,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        }

        // LB 8  Break after zero width space
-        if (fZW->contains(prevChar)) {
+        //       ZW SP* ÷
+        //       Scan backwards from prevChar for SP* ZW
+        tPos = prevPos;
+        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+            tPos = fText->moveIndex32(tPos, -1);
+        }
+        if (fZW->contains(fText->char32At(tPos))) {
            break;
        }

@ -2890,7 +2889,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        {
            int32_t prevIdx = fText->moveIndex32(pos, -1);
            UChar32 prevC = fText->char32At(prevIdx);
-            if (fZJ->contains(prevC)) {
+            if (fZWJ->contains(prevC)) {
                continue;
            }
        }
@ -3148,12 +3147,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            continue;
        }

-        // LB30a    RI RI <break> RI
-        //             RI    x    RI
+        // LB30a    RI RI  ÷  RI
+        //             RI  x  RI
        if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
            break;
        }
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+            // Two Regional Indicators have been paired.
+            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+            // following RI. This is a hack.
+            thisChar = -1;
            continue;
        }

@ -3220,7 +3223,7 @@ RBBILineMonkey::~RBBILineMonkey() {
    delete fXX;
    delete fEB;
    delete fEM;
-    delete fZJ;
+    delete fZWJ;

    delete fCharBI;
    delete fNumberMatcher;
--- a/icu4c/source/test/testdata/break_rules/line.txt
+++ b/icu4c/source/test/testdata/break_rules/line.txt
@ -25,7 +25,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -66,7 +66,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -86,14 +86,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -188,8 +190,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4c/source/test/testdata/break_rules/line_loose.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose.txt
@ -32,7 +32,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -74,7 +74,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -94,14 +94,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -196,8 +198,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@ -46,7 +46,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -91,7 +91,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -111,14 +111,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -217,8 +219,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4c/source/test/testdata/break_rules/line_normal.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal.txt
@ -39,7 +39,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -80,7 +80,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -100,14 +100,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -202,8 +204,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@ -40,7 +40,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -82,7 +82,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -105,14 +105,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -211,8 +213,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@ -331,14 +331,21 @@ class RBBIRuleBuilder {
    }

    void optimizeTables() {
-        // Begin looking for duplicates with char class 3.
-        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
-        // and should not have other categories merged into them.
-        IntPair duplPair = new IntPair(3, 0);
-        while (fForwardTable.findDuplCharClassFrom(duplPair)) {
-            fSetBuilder.mergeCategories(duplPair);
-            fForwardTable.removeColumn(duplPair.second);
-        }
-        fForwardTable.removeDuplicateStates();
+        boolean didSomething;
+        do {
+            didSomething = false;
+            // Begin looking for duplicates with char class 3.
+            // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+            // and should not have other categories merged into them.
+            IntPair duplPair = new IntPair(3, 0);
+            while (fForwardTable.findDuplCharClassFrom(duplPair)) {
+                fSetBuilder.mergeCategories(duplPair);
+                fForwardTable.removeColumn(duplPair.second);
+                didSomething = true;
+            }
+            while (fForwardTable.removeDuplicateStates() > 0) {
+                didSomething = true;
+            };
+        } while (didSomething);
    }
 }
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@ -1032,14 +1032,19 @@ class RBBITableBuilder {

       /**
        *  Check for, and remove duplicate states (table rows).
+        *  @return the number of states removed.
        *  @internal
        */
-       void removeDuplicateStates() {
+       int removeDuplicateStates() {
           IntPair dupls = new IntPair(3, 0);
+           int numStatesRemoved = 0;
+
           while (findDuplicateState(dupls)) {
               // System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
               removeState(dupls);
+               ++numStatesRemoved;
           }
+           return numStatesRemoved;
       }


--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0
-size 12508534
+oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143
+size 12510210
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac
+oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634
 size 92857
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@ -651,54 +651,68 @@ public class RBBITestMonkey extends TestFmwk {
        int           fOrigPositions;


+        // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
+        // throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
+        // The LineMonkey test (ported from ICU4C) relies on this behavior: it uses a value of -1
+        // to represent a non-codepoint that is not included in any of the property sets.
+        // This happens for rule 30a.
+
+        class XUnicodeSet extends UnicodeSet {
+            XUnicodeSet(String pattern) { super(pattern); }
+            @Override
+            public boolean contains(int codePoint) {
+                return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
+                        false : super.contains(codePoint);
+            }
+        }

        RBBILineMonkey()
        {
            fCharProperty  = UProperty.LINE_BREAK;
            fSets          = new ArrayList();

-            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
-            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
-            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
-            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
-            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
-            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
-            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
-            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
-            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
-            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
-            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
-            fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
-            fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
-            fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
-            fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
-            fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
-            fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
-            fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
-            fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
-            fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
-            fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
-            fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
-            fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
-            fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
-            fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
-            fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
-            fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
-            fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
-            fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
-            fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
-            fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
-            fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
-            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
-            fID    = new UnicodeSet("[\\p{Line_break=ID}]");
-            fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
-            fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
-            fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
-            fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
-            fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
-            fEB    = new UnicodeSet("[\\p{Line_break=EB}]");
-            fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
-            fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
+            fBK    = new XUnicodeSet("[\\p{Line_Break=BK}]");
+            fCR    = new XUnicodeSet("[\\p{Line_break=CR}]");
+            fLF    = new XUnicodeSet("[\\p{Line_break=LF}]");
+            fCM    = new XUnicodeSet("[\\p{Line_break=CM}]");
+            fNL    = new XUnicodeSet("[\\p{Line_break=NL}]");
+            fSG    = new XUnicodeSet("[\\ud800-\\udfff]");
+            fWJ    = new XUnicodeSet("[\\p{Line_break=WJ}]");
+            fZW    = new XUnicodeSet("[\\p{Line_break=ZW}]");
+            fGL    = new XUnicodeSet("[\\p{Line_break=GL}]");
+            fSP    = new XUnicodeSet("[\\p{Line_break=SP}]");
+            fB2    = new XUnicodeSet("[\\p{Line_break=B2}]");
+            fBA    = new XUnicodeSet("[\\p{Line_break=BA}]");
+            fBB    = new XUnicodeSet("[\\p{Line_break=BB}]");
+            fHY    = new XUnicodeSet("[\\p{Line_break=HY}]");
+            fCB    = new XUnicodeSet("[\\p{Line_break=CB}]");
+            fCL    = new XUnicodeSet("[\\p{Line_break=CL}]");
+            fCP    = new XUnicodeSet("[\\p{Line_break=CP}]");
+            fEX    = new XUnicodeSet("[\\p{Line_break=EX}]");
+            fIN    = new XUnicodeSet("[\\p{Line_break=IN}]");
+            fNS    = new XUnicodeSet("[\\p{Line_break=NS}]");
+            fOP    = new XUnicodeSet("[\\p{Line_break=OP}]");
+            fQU    = new XUnicodeSet("[\\p{Line_break=QU}]");
+            fIS    = new XUnicodeSet("[\\p{Line_break=IS}]");
+            fNU    = new XUnicodeSet("[\\p{Line_break=NU}]");
+            fPO    = new XUnicodeSet("[\\p{Line_break=PO}]");
+            fPR    = new XUnicodeSet("[\\p{Line_break=PR}]");
+            fSY    = new XUnicodeSet("[\\p{Line_break=SY}]");
+            fAI    = new XUnicodeSet("[\\p{Line_break=AI}]");
+            fAL    = new XUnicodeSet("[\\p{Line_break=AL}]");
+            fCJ    = new XUnicodeSet("[\\p{Line_break=CJ}]");
+            fH2    = new XUnicodeSet("[\\p{Line_break=H2}]");
+            fH3    = new XUnicodeSet("[\\p{Line_break=H3}]");
+            fHL    = new XUnicodeSet("[\\p{Line_break=HL}]");
+            fID    = new XUnicodeSet("[\\p{Line_break=ID}]");
+            fJL    = new XUnicodeSet("[\\p{Line_break=JL}]");
+            fJV    = new XUnicodeSet("[\\p{Line_break=JV}]");
+            fJT    = new XUnicodeSet("[\\p{Line_break=JT}]");
+            fRI    = new XUnicodeSet("[\\p{Line_break=RI}]");
+            fXX    = new XUnicodeSet("[\\p{Line_break=XX}]");
+            fEB    = new XUnicodeSet("[\\p{Line_break=EB}]");
+            fEM    = new XUnicodeSet("[\\p{Line_break=EM}]");
+            fZWJ   = new XUnicodeSet("[\\p{Line_break=ZWJ}]");

            // Remove dictionary characters.
            // The monkey test reference implementation of line break does not replicate the dictionary behavior,
@ -886,7 +900,13 @@ public class RBBITestMonkey extends TestFmwk {
                }

                // LB 8  Break after zero width space
-                if (fZW.contains(prevChar)) {
+                //       ZW SP* ÷
+                //       Scan backwards from prevChar for SP* ZW
+                tPos = prevPos;
+                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
+                    tPos = moveIndex32(fText, tPos, -1);
+                }
+                if (fZW.contains(UTF16.charAt(fText, tPos))) {
                    break;
                }

@ -1166,12 +1186,16 @@ public class RBBITestMonkey extends TestFmwk {
                }

                // LB 30a   Break between pairs of Regional Indicators.
-                //             RI RI <break> RI
-                //             RI    x    RI
+                //             RI RI  ÷  RI
+                //                RI  x  RI
                if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
                    break;
                }
                if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
+                    // Two Regional Indicators have been paired.
+                    // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+                    // following RI. This is a hack.
+                    thisChar = -1;
                    continue;
                }

--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@ -25,7 +25,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -66,7 +66,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -86,14 +86,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -188,8 +190,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@ -32,7 +32,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -74,7 +74,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -94,14 +94,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -196,8 +198,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@ -46,7 +46,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -91,7 +91,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -111,14 +111,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -217,8 +219,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@ -39,7 +39,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -80,7 +80,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -100,14 +100,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -202,8 +204,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@ -40,7 +40,7 @@ B2 = [:LineBreak =  Break_Both:];
 CB = [:LineBreak =  Contingent_Break:];
 CJ = [:LineBreak =  Conditional_Japanese_Starter:];
 CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
 CP = [:LineBreak =  Close_Parenthesis:];
 CR = [:LineBreak =  Carriage_Return:];
 EB = [:LineBreak =  EB:];
@ -82,7 +82,7 @@ dictionary = SA;

 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
 #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];

 LB4:        BK ÷;
 LB5:        CR LF;
@ -105,14 +105,16 @@ LB15:        QU CM* SP* OP;
 LB16:        (CL | CP)CM* SP* NS;
 LB17:        B2 CM* SP* B2;

+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
 LB7.1:      [^ZW SP] CM* [SP ZW];
 LB7.2:      [ZW SP] [SP ZW];

-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
 # LB8a
 #      ZWJ x
 #      Don't match a CM on the right - let other rules pick up CM sequences, where
@ -211,8 +213,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
 LB30.2:      CP CM* (AL | HL | NU);

 # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
 LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;

 # LB30b Do not break between Emoji Base and Emoji Modifier