ICU-20876 Regex Grapheme Cluster matching with Break Iterators.
Change the implementation of grapheme cluster matching in regex to use an ICU break iterator instead of a little one-off state machine. The old implementation had fallen behind the Unicode UAX-29 specification for grapheme clusters, and could not be easily updated. The implementation follows the same general pattern that is used for finding word boundaries with an ICU break iterator. In reviewing that code, a few improvements to the handling of ICU error codes were also made. Also note that this change adds a new dependency on Break Iteration. Regex patterns that previously would work with ICU builds that were configured with no break iteration will now fail, but only if they include \X for matching grapheme cluster boundaries.
This commit is contained in:
parent
ed9ea2e7ac
commit
14bcaaf58e
@ -1254,11 +1254,14 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
break;
|
||||
|
||||
case doBackslashX:
|
||||
#if UCONFIG_NO_BREAK_ITERATION==1
|
||||
// Grapheme Cluster Boundary requires ICU break iteration.
|
||||
error(U_UNSUPPORTED_ERROR);
|
||||
#endif
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_X, 0);
|
||||
break;
|
||||
|
||||
|
||||
case doBackslashZ:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_DOLLAR, 0);
|
||||
|
@ -177,6 +177,7 @@ RegexMatcher::~RegexMatcher() {
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION==0
|
||||
delete fWordBreakItr;
|
||||
delete fGCBreakItr;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -222,6 +223,7 @@ void RegexMatcher::init(UErrorCode &status) {
|
||||
fDeferredStatus = status;
|
||||
fData = fSmallData;
|
||||
fWordBreakItr = NULL;
|
||||
fGCBreakItr = NULL;
|
||||
|
||||
fStack = NULL;
|
||||
fInputText = NULL;
|
||||
@ -1854,12 +1856,15 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
|
||||
// This is for compatibility for those clients who modify the input string "live" during regex operations.
|
||||
fInputUniStrMaybeMutable = TRUE;
|
||||
|
||||
if (fWordBreakItr != NULL) {
|
||||
#if UCONFIG_NO_BREAK_ITERATION==0
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fWordBreakItr->setText(fInputText, status);
|
||||
#endif
|
||||
if (fWordBreakItr) {
|
||||
fWordBreakItr->setText(fInputText, fDeferredStatus);
|
||||
}
|
||||
if (fGCBreakItr) {
|
||||
fGCBreakItr->setText(fInputText, fDeferredStatus);
|
||||
}
|
||||
#endif
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -1876,12 +1881,14 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
|
||||
delete fInput;
|
||||
fInput = NULL;
|
||||
|
||||
if (fWordBreakItr != NULL) {
|
||||
#if UCONFIG_NO_BREAK_ITERATION==0
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fWordBreakItr->setText(input, status);
|
||||
#endif
|
||||
if (fWordBreakItr) {
|
||||
fWordBreakItr->setText(input, fDeferredStatus);
|
||||
}
|
||||
if (fGCBreakItr) {
|
||||
fGCBreakItr->setText(fInputText, fDeferredStatus);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
reset();
|
||||
fInputUniStrMaybeMutable = FALSE;
|
||||
@ -2611,20 +2618,24 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
|
||||
// parameters: pos - the current position in the input buffer
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isUWordBoundary(int64_t pos) {
|
||||
UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
|
||||
UBool returnVal = FALSE;
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION==0
|
||||
// Note: this point will never be reached if break iteration is configured out.
|
||||
// Regex patterns that would require this function will fail to compile.
|
||||
|
||||
// If we haven't yet created a break iterator for this matcher, do it now.
|
||||
if (fWordBreakItr == NULL) {
|
||||
fWordBreakItr =
|
||||
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
if (fWordBreakItr == nullptr) {
|
||||
fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
fWordBreakItr->setText(fInputText, fDeferredStatus);
|
||||
fWordBreakItr->setText(fInputText, status);
|
||||
}
|
||||
|
||||
// Note: zero width boundary tests like \b see through transparent region bounds,
|
||||
// which is why fLookLimit is used here, rather than fActiveLimit.
|
||||
if (pos >= fLookLimit) {
|
||||
fHitEnd = TRUE;
|
||||
returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
|
||||
@ -2637,6 +2648,30 @@ UBool RegexMatcher::isUWordBoundary(int64_t pos) {
|
||||
return returnVal;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
//
//   followingGCBoundary   Find the grapheme cluster boundary that follows
//                         a position in the input text. Used for \X.
//
//          parameters:   pos    - the current position in the input text.
//                        status - receives any error from creating the break
//                                 iterator or from attaching the input text to it.
//
//          returns:      the boundary reported by the break iterator, or pos
//                        itself when the iterator cannot be created, when it
//                        reports BreakIterator::DONE, or when break iteration
//                        is configured out of the build.
//
//--------------------------------------------------------------------------------
int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
|
||||
// Default result: no movement. Also the only result when the #if body below is compiled out.
int64_t result = pos;
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION==0
|
||||
// Note: this point will never be reached if break iteration is configured out.
|
||||
// Regex patterns that would require this function will fail to compile.
|
||||
|
||||
// If we haven't yet created a break iterator for this matcher, do it now.
|
||||
if (fGCBreakItr == nullptr) {
|
||||
fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pos;
|
||||
}
|
||||
// NOTE(review): status from setText() is not checked before following() is called below.
fGCBreakItr->setText(fInputText, status);
|
||||
}
|
||||
// NOTE(review): pos is 64-bit; BreakIterator::following() is declared outside this view.
// If it takes a 32-bit offset, pos is narrowed here -- confirm.
result = fGCBreakItr->following(pos);
|
||||
// No boundary was found: report the starting position unchanged.
if (result == BreakIterator::DONE) {
|
||||
result = pos;
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
|
||||
@ -3077,7 +3112,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
|
||||
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
|
||||
{
|
||||
UBool success = isUWordBoundary(fp->fInputIdx);
|
||||
UBool success = isUWordBoundary(fp->fInputIdx, status);
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \B
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
@ -3179,99 +3214,21 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
|
||||
|
||||
case URX_BACKSLASH_X:
|
||||
// Match a Grapheme, as defined by Unicode TR 29.
|
||||
// Differs slightly from Perl, which consumes combining marks independently
|
||||
// of context.
|
||||
{
|
||||
// Match a Grapheme, as defined by Unicode UAX 29.
|
||||
|
||||
// Fail if at end of input
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
|
||||
// Examine (and consume) the current char.
|
||||
// Dispatch into a little state machine, based on the char.
|
||||
UChar32 c;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
|
||||
|
||||
GC_L:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_V:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_T:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_Extend:
|
||||
// Combining characters are consumed here
|
||||
for (;;) {
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
break;
|
||||
}
|
||||
c = UTEXT_CURRENT32(fInputText);
|
||||
if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
(void)UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
goto GC_Done;
|
||||
|
||||
GC_Control:
|
||||
// Most control chars stand alone (don't combine with combining chars),
|
||||
// except for that CR/LF sequence is a single grapheme cluster.
|
||||
if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
|
||||
GC_Done:
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
}
|
||||
// Fail if at end of input
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp->fInputIdx = fActiveLimit;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_Z: // Test for end of Input
|
||||
@ -4657,7 +4614,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
|
||||
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
|
||||
{
|
||||
UBool success = isUWordBoundary(fp->fInputIdx);
|
||||
UBool success = isUWordBoundary(fp->fInputIdx, status);
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \B
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
@ -4755,12 +4712,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_BACKSLASH_X:
|
||||
// Match a Grapheme, as defined by Unicode TR 29.
|
||||
// Differs slightly from Perl, which consumes combining marks independently
|
||||
// of context.
|
||||
{
|
||||
// Match a Grapheme, as defined by Unicode UAX 29.
|
||||
|
||||
// Fail if at end of input
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
@ -4769,76 +4722,12 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
break;
|
||||
}
|
||||
|
||||
// Examine (and consume) the current char.
|
||||
// Dispatch into a little state machine, based on the char.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
|
||||
|
||||
GC_L:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_V:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_T:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_Extend:
|
||||
// Combining characters are consumed here
|
||||
for (;;) {
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
break;
|
||||
}
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
|
||||
U16_BACK_1(inputBuf, 0, fp->fInputIdx);
|
||||
break;
|
||||
}
|
||||
}
|
||||
goto GC_Done;
|
||||
|
||||
GC_Control:
|
||||
// Most control chars stand alone (don't combine with combining chars),
|
||||
// except for that CR/LF sequence is a single grapheme cluster.
|
||||
if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
|
||||
fp->fInputIdx++;
|
||||
}
|
||||
|
||||
GC_Done:
|
||||
fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp->fInputIdx = fActiveLimit;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
case URX_BACKSLASH_Z: // Test for end of Input
|
||||
|
@ -66,7 +66,7 @@ class RegexCImpl;
|
||||
class RegexMatcher;
|
||||
class RegexPattern;
|
||||
struct REStackFrame;
|
||||
class RuleBasedBreakIterator;
|
||||
class BreakIterator;
|
||||
class UnicodeSet;
|
||||
class UVector;
|
||||
class UVector32;
|
||||
@ -1774,7 +1774,9 @@ private:
|
||||
void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
|
||||
inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
|
||||
UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
|
||||
UBool isUWordBoundary(int64_t pos, UErrorCode &status); // perform RBBI based \b test
|
||||
// Find a grapheme cluster boundary using a break iterator. For handling \X in regexes.
|
||||
int64_t followingGCBoundary(int64_t pos, UErrorCode &status);
|
||||
REStackFrame *resetStack();
|
||||
inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
|
||||
void IncrementTime(UErrorCode &status);
|
||||
@ -1868,7 +1870,8 @@ private:
|
||||
UErrorCode fDeferredStatus; // Save error state that cannot be immediately
|
||||
// reported, or that permanently disables this matcher.
|
||||
|
||||
RuleBasedBreakIterator *fWordBreakItr;
|
||||
BreakIterator *fWordBreakItr;
|
||||
BreakIterator *fGCBreakItr;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
22
icu4c/source/test/testdata/regextst.txt
vendored
22
icu4c/source/test/testdata/regextst.txt
vendored
@ -317,11 +317,21 @@
|
||||
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"
|
||||
|
||||
# \X consume one Grapheme Cluster.
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
|
||||
# Regional indicator pairs are grapheme clusters
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U0001f1e6\U0001f1e8</1><2>\U0001f1ea\U0001f1ff</2></0>"
|
||||
# Grapheme Break rule 9b: Prepend x
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U000111C2x</1></0>"
|
||||
|
||||
# Grapheme clusters that straddle a match region. Matching is pinned to the region limits,
|
||||
# giving boundaries inside grapheme clusters
|
||||
"(\X)?(\X)?(\X)?" v "a\u0301<r><0><1>\u0301\u0301</1><2>z\u0302</2></0></r>\u0302\u0302"
|
||||
# Same as previous test case, but without the region limits.
|
||||
"(\X)?(\X)?(\X)?" v "<0><1>a\u0301\u0301\u0301</1><2>z\u0302\u0302\u0302</2></0>"
|
||||
|
||||
# ^ matches only at beginning of line
|
||||
".*^(Hello)" "<0><1>Hello</1></0> Hello Hello Hello Goodbye"
|
||||
@ -1485,7 +1495,7 @@
|
||||
# Bug ICU-20939
|
||||
# Incorrect word \b boundaries w UTF-8 input and non-ASCII text
|
||||
#
|
||||
"(?w)\b" 2 "äää<0></0> äää"
|
||||
"(?w)\b" v2 "äää<0></0> äää"
|
||||
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
|
Loading…
Reference in New Issue
Block a user