ICU-20876 Regex Grapheme Cluster matching with Break Iterators.

Change the implementation of grapheme cluster matching in regex to use an ICU break iterator instead of a little one-off state machine. The old implementation had fallen behind the Unicode UAX-29 specification for grapheme clusters, and could not be easily updated. The implementation follows the same general pattern that is used for finding word boundaries with an ICU break iterator. In reviewing that code, a few improvements to the handling of ICU error codes were also made. Also note that this change adds a new dependency on Break Iteration: regex patterns that previously worked with ICU builds configured with no break iteration will now fail, but only if they include \X for matching grapheme cluster boundaries.
2020-02-13 21:40:28 -08:00 · 2020-02-13 21:40:28 -08:00 · 14bcaaf58e
commit 14bcaaf58e
parent ed9ea2e7ac
4 changed files with 91 additions and 186 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -1254,11 +1254,14 @@ UBool RegexCompile::doParseActions(int32_t action)
        break;

    case doBackslashX:
+        #if  UCONFIG_NO_BREAK_ITERATION==1
+        // Grapheme Cluster Boundary requires ICU break iteration.
+        error(U_UNSUPPORTED_ERROR);
+        #endif
        fixLiterals(FALSE);
        appendOp(URX_BACKSLASH_X, 0);
        break;

-
    case doBackslashZ:
        fixLiterals(FALSE);
        appendOp(URX_DOLLAR, 0);
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -177,6 +177,7 @@ RegexMatcher::~RegexMatcher() {

    #if UCONFIG_NO_BREAK_ITERATION==0
    delete fWordBreakItr;
+    delete fGCBreakItr;
    #endif
 }

@ -222,6 +223,7 @@ void RegexMatcher::init(UErrorCode &status) {
    fDeferredStatus    = status;
    fData              = fSmallData;
    fWordBreakItr      = NULL;
+    fGCBreakItr        = NULL;

    fStack             = NULL;
    fInputText         = NULL;
@ -1854,12 +1856,15 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
    fInputUniStrMaybeMutable = TRUE;

-    if (fWordBreakItr != NULL) {
 #if UCONFIG_NO_BREAK_ITERATION==0
-        UErrorCode status = U_ZERO_ERROR;
-        fWordBreakItr->setText(fInputText, status);
-#endif
+    if (fWordBreakItr) {
+        fWordBreakItr->setText(fInputText, fDeferredStatus);
    }
+    if (fGCBreakItr) {
+        fGCBreakItr->setText(fInputText, fDeferredStatus);
+    }
+#endif
+
    return *this;
 }

@ -1876,12 +1881,14 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
        delete fInput;
        fInput = NULL;

-        if (fWordBreakItr != NULL) {
 #if UCONFIG_NO_BREAK_ITERATION==0
-            UErrorCode status = U_ZERO_ERROR;
-            fWordBreakItr->setText(input, status);
-#endif
+        if (fWordBreakItr) {
+            fWordBreakItr->setText(input, fDeferredStatus);
        }
+        if (fGCBreakItr) {
+            fGCBreakItr->setText(fInputText, fDeferredStatus);
+        }
+#endif
    }
    reset();
    fInputUniStrMaybeMutable = FALSE;
@ -2611,20 +2618,24 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
 //          parameters:   pos   - the current position in the input buffer
 //
 //--------------------------------------------------------------------------------
-UBool RegexMatcher::isUWordBoundary(int64_t pos) {
+UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
    UBool       returnVal = FALSE;
+
 #if UCONFIG_NO_BREAK_ITERATION==0
+    // Note: this point will never be reached if break iteration is configured out.
+    //       Regex patterns that would require this function will fail to compile.

    // If we haven't yet created a break iterator for this matcher, do it now.
-    if (fWordBreakItr == NULL) {
-        fWordBreakItr =
-            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
-        if (U_FAILURE(fDeferredStatus)) {
+    if (fWordBreakItr == nullptr) {
+        fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
+        if (U_FAILURE(status)) {
            return FALSE;
        }
-        fWordBreakItr->setText(fInputText, fDeferredStatus);
+        fWordBreakItr->setText(fInputText, status);
    }

+    // Note: zero width boundary tests like \b see through transparent region bounds,
+    //       which is why fLookLimit is used here, rather than fActiveLimit.
    if (pos >= fLookLimit) {
        fHitEnd = TRUE;
        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
@ -2637,6 +2648,30 @@ UBool RegexMatcher::isUWordBoundary(int64_t pos) {
    return   returnVal;
 }

+
+int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {   // Returns the boundary the character break iterator reports after pos, or pos itself when there is none.
+    int64_t result = pos;   // default result; returned unchanged when break iteration is compiled out
+
+#if UCONFIG_NO_BREAK_ITERATION==0
+    // Note: this point will never be reached if break iteration is configured out.
+    //       Regex patterns that would require this function will fail to compile.
+
+    // If we haven't yet created a break iterator for this matcher, do it now.
+    if (fGCBreakItr == nullptr) {
+        fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
+        if (U_FAILURE(status)) {
+            return pos;   // creation failed: report no advance; the error is left in status for the caller
+        }
+        fGCBreakItr->setText(fInputText, status);   // NOTE(review): status is not re-checked here before following() is called below
+    }
+    result = fGCBreakItr->following(pos);   // NOTE(review): pos is int64_t; presumably following() takes a 32-bit offset - confirm the narrowing is safe
+    if (result == BreakIterator::DONE) {
+        result = pos;   // iterator reported no boundary after pos: leave the position unchanged
+    }
+#endif
+    return result;
+}
+
 //--------------------------------------------------------------------------------
 //
 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
@ -3077,7 +3112,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {

        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
            {
-                UBool success = isUWordBoundary(fp->fInputIdx);
+                UBool success = isUWordBoundary(fp->fInputIdx, status);
                success ^= (UBool)(opValue != 0);     // flip sense for \B
                if (!success) {
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
@ -3179,99 +3214,21 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {


        case URX_BACKSLASH_X:
-            //  Match a Grapheme, as defined by Unicode TR 29.
-            //  Differs slightly from Perl, which consumes combining marks independently
-            //    of context.
-            {
+            //  Match a Grapheme, as defined by Unicode UAX 29.

-                // Fail if at end of input
-                if (fp->fInputIdx >= fActiveLimit) {
-                    fHitEnd = TRUE;
-                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
-                    break;
-                }
-
-                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
-
-                // Examine (and consume) the current char.
-                //   Dispatch into a little state machine, based on the char.
-                UChar32  c;
-                c = UTEXT_NEXT32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
-                if (sets[URX_GC_NORMAL].contains(c))  goto GC_Extend;
-                if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
-                if (sets[URX_GC_L].contains(c))       goto GC_L;
-                if (sets[URX_GC_LV].contains(c))      goto GC_V;
-                if (sets[URX_GC_LVT].contains(c))     goto GC_T;
-                if (sets[URX_GC_V].contains(c))       goto GC_V;
-                if (sets[URX_GC_T].contains(c))       goto GC_T;
-                goto GC_Extend;
-
-
-
-GC_L:
-                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-                c = UTEXT_NEXT32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                if (sets[URX_GC_L].contains(c))       goto GC_L;
-                if (sets[URX_GC_LV].contains(c))      goto GC_V;
-                if (sets[URX_GC_LVT].contains(c))     goto GC_T;
-                if (sets[URX_GC_V].contains(c))       goto GC_V;
-                (void)UTEXT_PREVIOUS32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                goto GC_Extend;
-
-GC_V:
-                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-                c = UTEXT_NEXT32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                if (sets[URX_GC_V].contains(c))       goto GC_V;
-                if (sets[URX_GC_T].contains(c))       goto GC_T;
-                (void)UTEXT_PREVIOUS32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                goto GC_Extend;
-
-GC_T:
-                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-                c = UTEXT_NEXT32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                if (sets[URX_GC_T].contains(c))       goto GC_T;
-                (void)UTEXT_PREVIOUS32(fInputText);
-                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                goto GC_Extend;
-
-GC_Extend:
-                // Combining characters are consumed here
-                for (;;) {
-                    if (fp->fInputIdx >= fActiveLimit) {
-                        break;
-                    }
-                    c = UTEXT_CURRENT32(fInputText);
-                    if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
-                        break;
-                    }
-                    (void)UTEXT_NEXT32(fInputText);
-                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                }
-                goto GC_Done;
-
-GC_Control:
-                // Most control chars stand alone (don't combine with combining chars),
-                //   except for that CR/LF sequence is a single grapheme cluster.
-                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
-                    c = UTEXT_NEXT32(fInputText);
-                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
-                }
-
-GC_Done:
-                if (fp->fInputIdx >= fActiveLimit) {
-                    fHitEnd = TRUE;
-                }
+            // Fail if at end of input
+            if (fp->fInputIdx >= fActiveLimit) {
+                fHitEnd = TRUE;
+                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
                break;
            }

-
+            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
+            if (fp->fInputIdx >= fActiveLimit) {
+                fHitEnd = TRUE;
+                fp->fInputIdx = fActiveLimit;
+            }
+            break;


        case URX_BACKSLASH_Z:          // Test for end of Input
@ -4657,7 +4614,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu

        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
            {
-                UBool success = isUWordBoundary(fp->fInputIdx);
+                UBool success = isUWordBoundary(fp->fInputIdx, status);
                success ^= (UBool)(opValue != 0);     // flip sense for \B
                if (!success) {
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
@ -4755,12 +4712,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
            break;


-
        case URX_BACKSLASH_X:
-        //  Match a Grapheme, as defined by Unicode TR 29.
-        //  Differs slightly from Perl, which consumes combining marks independently
-        //    of context.
-        {
+            //  Match a Grapheme, as defined by Unicode UAX 29.

            // Fail if at end of input
            if (fp->fInputIdx >= fActiveLimit) {
@ -4769,76 +4722,12 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
                break;
            }

-            // Examine (and consume) the current char.
-            //   Dispatch into a little state machine, based on the char.
-            UChar32  c;
-            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-            UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
-            if (sets[URX_GC_NORMAL].contains(c))  goto GC_Extend;
-            if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
-            if (sets[URX_GC_L].contains(c))       goto GC_L;
-            if (sets[URX_GC_LV].contains(c))      goto GC_V;
-            if (sets[URX_GC_LVT].contains(c))     goto GC_T;
-            if (sets[URX_GC_V].contains(c))       goto GC_V;
-            if (sets[URX_GC_T].contains(c))       goto GC_T;
-            goto GC_Extend;
-
-
-
-GC_L:
-            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-            if (sets[URX_GC_L].contains(c))       goto GC_L;
-            if (sets[URX_GC_LV].contains(c))      goto GC_V;
-            if (sets[URX_GC_LVT].contains(c))     goto GC_T;
-            if (sets[URX_GC_V].contains(c))       goto GC_V;
-            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
-            goto GC_Extend;
-
-GC_V:
-            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-            if (sets[URX_GC_V].contains(c))       goto GC_V;
-            if (sets[URX_GC_T].contains(c))       goto GC_T;
-            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
-            goto GC_Extend;
-
-GC_T:
-            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
-            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-            if (sets[URX_GC_T].contains(c))       goto GC_T;
-            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
-            goto GC_Extend;
-
-GC_Extend:
-            // Combining characters are consumed here
-            for (;;) {
-                if (fp->fInputIdx >= fActiveLimit) {
-                    break;
-                }
-                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
-                if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
-                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
-                    break;
-                }
-            }
-            goto GC_Done;
-
-GC_Control:
-            // Most control chars stand alone (don't combine with combining chars),
-            //   except for that CR/LF sequence is a single grapheme cluster.
-            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
-                fp->fInputIdx++;
-            }
-
-GC_Done:
+            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
            if (fp->fInputIdx >= fActiveLimit) {
                fHitEnd = TRUE;
+                fp->fInputIdx = fActiveLimit;
            }
            break;
-        }
-
-


        case URX_BACKSLASH_Z:          // Test for end of Input
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -66,7 +66,7 @@ class  RegexCImpl;
 class  RegexMatcher;
 class  RegexPattern;
 struct REStackFrame;
-class  RuleBasedBreakIterator;
+class  BreakIterator;
 class  UnicodeSet;
 class  UVector;
 class  UVector32;
@ -1774,7 +1774,9 @@ private:
    void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
    inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
    UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
-    UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
+    UBool                isUWordBoundary(int64_t pos, UErrorCode &status);   // perform RBBI based \b test
+    // Find a grapheme cluster boundary using a break iterator. For handling \X in regexes.
+    int64_t              followingGCBoundary(int64_t pos, UErrorCode &status);
    REStackFrame        *resetStack();
    inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
    void                 IncrementTime(UErrorCode &status);
@ -1868,7 +1870,8 @@ private:
    UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
                                           //   reported, or that permanently disables this matcher.

-    RuleBasedBreakIterator  *fWordBreakItr;
+    BreakIterator       *fWordBreakItr;
+    BreakIterator       *fGCBreakItr;
 };

 U_NAMESPACE_END
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -317,11 +317,21 @@
 "(\S+).*?(\S+).*"              "<0><1>Not-spaces</1>   <2>more-non-spaces</2>  </0>"

 # \X  consume one Grapheme Cluster.
-"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
-"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
-"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
-"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
-"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
+# Regional indicator pairs are grapheme clusters
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"  v  "<0><1>\U0001f1e6\U0001f1e8</1><2>\U0001f1ea\U0001f1ff</2></0>"
+# Grapheme Break rule 9b:  Prepend x
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"	v  "<0><1>\U000111C2x</1></0>"
+
+# Grapheme clusters that straddle a match region. Matching is pinned to the region limits,
+# giving boundaries inside grapheme clusters
+"(\X)?(\X)?(\X)?"        v      "a\u0301<r><0><1>\u0301\u0301</1><2>z\u0302</2></0></r>\u0302\u0302"
+# Same as previous test case, but without the region limits.
+"(\X)?(\X)?(\X)?"        v      "<0><1>a\u0301\u0301\u0301</1><2>z\u0302\u0302\u0302</2></0>"

 # ^ matches only at beginning of line
 ".*^(Hello)"                   "<0><1>Hello</1></0> Hello Hello Hello Goodbye"
@ -1485,7 +1495,7 @@
 # Bug ICU-20939
 # Incorrect word \b boundaries w UTF-8 input and non-ASCII text
 #
-"(?w)\b"                     2     "äää<0></0> äää"
+"(?w)\b"                     v2     "äää<0></0> äää"

 #  Random debugging, Temporary
 #