ICU-8404 Regular Expressions split(), fix incorrect handling of trailing empty field

X-SVN-Rev: 29801
This commit is contained in:
Andy Heninger 2011-04-15 00:48:39 +00:00
parent b6db2cf8d5
commit 8148726df2
6 changed files with 174 additions and 85 deletions

View File

@ -2170,27 +2170,33 @@ int32_t RegexMatcher::split(UText *input,
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
UBool lastGroupWasNullUText = FALSE;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
if (i >= destCapacity-2) {
// Never fill the last available output string with capture group text.
// It will be filled with the last field, the remainder of the
// unsplit input text.
break;
}
i++;
lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
dest[i] = group(groupNum, dest[i], status);
}
if (nextOutputStringStart == fActiveLimit) {
// The delimiter was at the end of the string. We're done.
break;
} else if (i == destCapacity-1) {
// We're out of capture groups, and the rest of the string is more important
if (lastGroupWasNullUText) {
utext_close(dest[i]);
dest[i] = NULL;
// The delimiter was at the end of the string. We're done, but first
// we output one last empty string, for the empty field following
// the delimiter at the end of input.
if (i+1 < destCapacity) {
++i;
if (dest[i] == NULL) {
dest[i] = utext_openUChars(NULL, NULL, 0, &status);
} else {
static UChar emptyString[] = {(UChar)0};
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
}
}
}
break;
}
}
else
{

View File

@ -397,7 +397,7 @@ public:
private:
/**
* Cause a compilation error if an application accidently attempts to
* Cause a compilation error if an application accidentally attempts to
* create a matcher with a (UChar *) string as input rather than
* a UnicodeString. Avoids a dangling reference to a temporary string.
* <p>
@ -430,7 +430,7 @@ public:
/**
* Test whether a string matches a regular expression. This convenience function
* both compiles the reguluar expression and applies it in a single operation.
* both compiles the regular expression and applies it in a single operation.
* Note that if the same pattern needs to be applied repeatedly, this method will be
* less efficient than creating and reusing a RegexMatcher object.
*
@ -450,7 +450,7 @@ public:
/**
* Test whether a string matches a regular expression. This convenience function
* both compiles the reguluar expression and applies it in a single operation.
* both compiles the regular expression and applies it in a single operation.
* Note that if the same pattern needs to be applied repeatedly, this method will be
* less efficient than creating and reusing a RegexMatcher object.
*
@ -493,13 +493,26 @@ public:
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
* For the best performance on split() operations,
* <code>RegexMatcher::split</code> is perferable to this function
* Split a string into fields. Somewhat like split() from Perl or Java.
* Pattern matches identify delimiters that separate the input
* into fields. The input data between the delimiters becomes the
* fields themselves.
*
* If the delimiter pattern includes capture groups, the captured text will
* also appear in the destination array of output strings, interspersed
* with the fields. This is similar to Perl, but differs from Java,
* which ignores the presence of capture groups in the pattern.
*
* Trailing empty fields will always be returned, assuming sufficient
* destination capacity. This differs from the default behavior for Java
* and Perl where trailing empty fields are not returned.
*
* The number of strings produced by the split operation is returned.
* This count includes the strings from capture groups in the delimiter pattern.
* This behavior differs from Java, which ignores capture groups.
*
* For the best performance on split() operations,
* <code>RegexMatcher::split</code> is preferable to this function.
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object)
@ -524,13 +537,26 @@ public:
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
* Split a string into fields. Somewhat like split() from Perl or Java.
* Pattern matches identify delimiters that separate the input
* into fields. The input data between the delimiters becomes the
* fields themselves.
*
* If the delimiter pattern includes capture groups, the captured text will
* also appear in the destination array of output strings, interspersed
* with the fields. This is similar to Perl, but differs from Java,
* which ignores the presence of capture groups in the pattern.
*
* Trailing empty fields will always be returned, assuming sufficient
* destination capacity. This differs from the default behavior for Java
* and Perl where trailing empty fields are not returned.
*
* The number of strings produced by the split operation is returned.
* This count includes the strings from capture groups in the delimiter pattern.
* This behavior differs from Java, which ignores capture groups.
*
* For the best performance on split() operations,
* <code>RegexMatcher::split</code> is perferable to this function
* <code>RegexMatcher::split</code> is preferable to this function
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object)
@ -544,7 +570,7 @@ public:
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
* @return The number of destination strings used.
*
* @draft ICU 4.6
*/
@ -637,7 +663,7 @@ private:
/**
* class RegexMatcher bundles together a reular expression pattern and
* class RegexMatcher bundles together a regular expression pattern and
* input text to which the expression can be applied. It includes methods
* for testing for matches, and for find and replace operations.
*
@ -731,7 +757,7 @@ public:
private:
/**
* Cause a compilation error if an application accidently attempts to
* Cause a compilation error if an application accidentally attempts to
* create a matcher with a (UChar *) string as input rather than
* a UnicodeString. Avoids a dangling reference to a temporary string.
* <p>
@ -956,7 +982,7 @@ public:
* @return the index of the last character matched, plus one.
* The index value returned is a native index, corresponding to
* code units for the underlying encoding type, for example,
* a byte index for UTF8.
* a byte index for UTF-8.
* @stable ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
@ -976,7 +1002,7 @@ public:
* attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the index of the first character following the text
* captured by the specifed group during the previous match operation.
* captured by the specified group during the previous match operation.
* Return -1 if the capture group exists in the pattern but was not part of the match.
* The index value returned is a native index, corresponding to
* code units for the underlying encoding type, for example,
@ -1084,7 +1110,7 @@ public:
private:
/**
* Cause a compilation error if an application accidently attempts to
* Cause a compilation error if an application accidentally attempts to
* reset a matcher with a (UChar *) string as input rather than
* a UnicodeString. Avoids a dangling reference to a temporary string.
* <p>
@ -1225,7 +1251,7 @@ public:
/**
* Return true if this matcher is using anchoring bounds.
* By default, matchers use anchoring region boounds.
* By default, matchers use anchoring region bounds.
*
* @return TRUE if this matcher is using anchoring bounds.
* @stable ICU 4.0
@ -1553,7 +1579,7 @@ public:
virtual int32_t getTimeLimit() const;
/**
* Set the amount of heap storage avaliable for use by the match backtracking stack.
* Set the amount of heap storage available for use by the match backtracking stack.
* The matcher is also reset, discarding any results from previous matches.
* <p>
* ICU uses a backtracking regular expression engine, with the backtrack stack
@ -1606,7 +1632,7 @@ public:
/**
* Get the callback function for this URegularExpression.
*
* @param callback Out paramater, receives a pointer to the user-supplied
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setMatchCallback() was called.
@ -1639,7 +1665,7 @@ public:
/**
* Get the find progress callback function for this URegularExpression.
*
* @param callback Out paramater, receives a pointer to the user-supplied
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setFindProgressCallback() was called.

View File

@ -33,7 +33,7 @@
struct URegularExpression;
/**
* Structure representing a compiled regular rexpression, plus the results
* Structure representing a compiled regular expression, plus the results
* of a match operation.
* @stable ICU 3.0
*/
@ -99,7 +99,7 @@ typedef enum URegexpFlag{
/** Error on Unrecognized backslash escapes.
* If set, fail with an error on patterns that contain
* backslash-escaped ASCII letters without a known specail
* backslash-escaped ASCII letters without a known special
* meaning. If this flag is not set, these
* escaped letters represent themselves.
* @stable ICU 4.0
@ -117,13 +117,13 @@ typedef enum URegexpFlag{
*
* @param pattern The Regular Expression pattern to be compiled.
* @param patternLength The length of the pattern, or -1 if the pattern is
* NUL termintated.
* NUL terminated.
* @param flags Flags that alter the default matching behavior for
* the regular expression, UREGEX_CASE_INSENSITIVE, for
* example. For default behavior, set this parameter to zero.
* See <code>enum URegexpFlag</code>. All desired flags
* are bitwise-ORed together.
* @param pe Receives the position (line and column nubers) of any syntax
* @param pe Receives the position (line and column numbers) of any syntax
* error within the source regular expression string. If this
* information is not wanted, pass NULL for this parameter.
* @param status Receives errors detected by this function.
@ -153,7 +153,7 @@ uregex_open( const UChar *pattern,
* example. For default behavior, set this parameter to zero.
* See <code>enum URegexpFlag</code>. All desired flags
* are bitwise-ORed together.
* @param pe Receives the position (line and column nubers) of any syntax
* @param pe Receives the position (line and column numbers) of any syntax
* error within the source regular expression string. If this
* information is not wanted, pass NULL for this parameter.
* @param status Receives errors detected by this function.
@ -174,13 +174,13 @@ uregex_openUText(UText *pattern,
* is supplied as an 8 bit char * string in the default code page.
*
* @param pattern The Regular Expression pattern to be compiled,
* NUL termintated.
* NUL terminated.
* @param flags Flags that alter the default matching behavior for
* the regular expression, UREGEX_CASE_INSENSITIVE, for
* example. For default behavior, set this parameter to zero.
* See <code>enum URegexpFlag</code>. All desired flags
* are bitwise-ORed together.
* @param pe Receives the position (line and column nubers) of any syntax
* @param pe Receives the position (line and column numbers) of any syntax
* error within the source regular expression string. If this
* information is not wanted, pass NULL for this parameter.
* @param status Receives errors detected by this function.
@ -234,7 +234,7 @@ U_NAMESPACE_END
* form of the expression, and requires less memory.
* <p>
* Note that the current input string and the position of any matched text
* within it are not cloned; only the pattern itself and and the
* within it are not cloned; only the pattern itself and the
* match mode flags are copied.
* <p>
* Cloning can be particularly useful to threaded applications that perform
@ -927,7 +927,7 @@ uregex_requireEnd(const URegularExpression *regexp,
* @param replacementLength The length of the replacement string, or
* -1 if it is NUL terminated.
* @param destBuf A (UChar *) buffer that will receive the result.
* @param destCapacity The capacity of the desitnation buffer.
* @param destCapacity The capacity of the destination buffer.
* @param status A reference to a UErrorCode to receive any errors.
* @return The length of the string resulting from the find
* and replace operation. In the event that the
@ -986,7 +986,7 @@ uregex_replaceAllUText(URegularExpression *regexp,
* @param replacementLength The length of the replacement string, or
* -1 if it is NUL terminated.
* @param destBuf A (UChar *) buffer that will receive the result.
* @param destCapacity The capacity of the desitnation buffer.
* @param destCapacity The capacity of the destination buffer.
* @param status a reference to a UErrorCode to receive any errors.
* @return The length of the string resulting from the find
* and replace operation. In the event that the
@ -1172,26 +1172,23 @@ uregex_appendTailUText(URegularExpression *regexp,
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
*
* Each of the fields is copied from the input string to the destination
* buffer, and NUL terminated. The position of each field within
* the destination buffer is returned in the destFields array.
*
* Note: another choice for the design of this function would be to not
* copy the resulting fields at all, but to return indexes and
* lengths within the source text.
* Advantages would be
* o Faster. No Copying.
* o Nothing extra needed when field data may contain embedded NUL chars.
* o Less memory needed if working on large data.
* Disadvantages
* o Less consistent with C++ split, which copies into an
* array of UnicodeStrings.
* o No NUL termination, extracted fields would be less convenient
* to use in most cases.
* o Possible problems in the future, when support Unicode Normalization
* could cause the fields to not correspond exactly to
* a range of the source text.
* If the delimiter pattern includes capture groups, the captured text will
* also appear in the destination array of output strings, interspersed
* with the fields. This is similar to Perl, but differs from Java,
* which ignores the presence of capture groups in the pattern.
*
* Trailing empty fields will always be returned, assuming sufficient
* destination capacity. This differs from the default behavior for Java
* and Perl where trailing empty fields are not returned.
*
* The number of strings produced by the split operation is returned.
* This count includes the strings from capture groups in the delimiter pattern.
* This behavior differs from Java, which ignores capture groups.
*
* @param regexp The compiled regular expression.
* @param destBuf A (UChar *) buffer to receive the fields that
@ -1307,7 +1304,7 @@ uregex_getTimeLimit(const URegularExpression *regexp,
UErrorCode *status);
/**
* Set the amount of heap storage avaliable for use by the match backtracking stack.
* Set the amount of heap storage available for use by the match backtracking stack.
* <p>
* ICU uses a backtracking regular expression engine, with the backtrack stack
* maintained on the heap. This function sets the limit to the amount of memory
@ -1392,7 +1389,7 @@ uregex_setMatchCallback(URegularExpression *regexp,
* Get the callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback Out paramater, receives a pointer to the user-supplied
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setMatchCallback() was called.
@ -1464,7 +1461,7 @@ uregex_setFindProgressCallback(URegularExpression *regexp,
* Get the find progress callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback Out paramater, receives a pointer to the user-supplied
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setFindProgressCallback() was called.

View File

@ -1841,7 +1841,11 @@ int32_t RegexCImpl::split(RegularExpression *regexp,
// Set up to extract the capture group contents into the dest buffer.
destFields[i] = &destBuf[destIdx];
tStatus = U_ZERO_ERROR;
int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
int32_t t = uregex_group((URegularExpression*)regexp,
groupNum,
destFields[i],
REMAINING_CAPACITY(destIdx, destCapacity),
&tStatus);
destIdx += t + 1; // Record the space used in the output string buffer.
// +1 for the NUL that terminates the string.
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
@ -1852,7 +1856,18 @@ int32_t RegexCImpl::split(RegularExpression *regexp,
}
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done.
// The delimiter was at the end of the string.
// Output an empty string, and then we are done.
if (destIdx < destCapacity) {
destBuf[destIdx] = 0;
}
if (i < destFieldsCapacity-1) {
++i;
}
if (destIdx < destCapacity) {
destFields[i] = destBuf + destIdx;
}
++destIdx;
break;
}

View File

@ -1225,15 +1225,16 @@ static void TestRegexCAPI(void) {
/* The TEST_ASSERT_SUCCESS call above should change too... */
if(U_SUCCESS(status)) {
TEST_ASSERT(numFields == 4);
TEST_ASSERT(numFields == 5);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
TEST_ASSERT_STRING(" second", fields[2], TRUE);
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
TEST_ASSERT(fields[4] == NULL);
TEST_ASSERT_STRING("", fields[4], TRUE);
TEST_ASSERT(fields[5] == NULL);
TEST_ASSERT(fields[8] == NULL);
TEST_ASSERT(!memcmp(&fields[9],&minus1,sizeof(UChar*)));
spaceNeeded = strlen("first .tag-a. second.tag-b."); /* "." at NUL positions */
spaceNeeded = strlen("first .tag-a. second.tag-b.."); /* "." at NUL positions */
TEST_ASSERT(spaceNeeded == requiredCapacity);
}
}
@ -2134,13 +2135,15 @@ static void TestUTextAPI(void) {
const char str_taga[] = { 0x74, 0x61, 0x67, 0x2d, 0x61, 0x00 }; /* tag-a */
const char str_second[] = { 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x00 }; /* second */
const char str_tagb[] = { 0x74, 0x61, 0x67, 0x2d, 0x62, 0x00 }; /* tag-b */
const char str_empty[] = { 0x00 };
TEST_ASSERT(numFields == 4);
TEST_ASSERT(numFields == 5);
TEST_ASSERT_UTEXT(str_first, fields[0]);
TEST_ASSERT_UTEXT(str_taga, fields[1]);
TEST_ASSERT_UTEXT(str_second, fields[2]);
TEST_ASSERT_UTEXT(str_tagb, fields[3]);
TEST_ASSERT(fields[4] == NULL);
TEST_ASSERT_UTEXT(str_empty, fields[4]);
TEST_ASSERT(fields[5] == NULL);
TEST_ASSERT(fields[8] == NULL);
TEST_ASSERT(fields[9] == &patternText);
}

View File

@ -1531,7 +1531,7 @@ void RegexTest::API_Pattern() {
n = pat1->split(" Now is the time ", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==5);
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="Now");
REGEX_ASSERT(fields[2]=="is");
@ -1541,8 +1541,9 @@ void RegexTest::API_Pattern() {
n = pat1->split(" ", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==1);
REGEX_ASSERT(n==2);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="");
fields[0] = "foo";
n = pat1->split("", fields, 10, status);
@ -1559,7 +1560,7 @@ void RegexTest::API_Pattern() {
status = U_ZERO_ERROR;
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(n==7);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
@ -1571,7 +1572,7 @@ void RegexTest::API_Pattern() {
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(n==7);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
@ -1590,7 +1591,7 @@ void RegexTest::API_Pattern() {
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
REGEX_ASSERT(fields[6]=="foo");
status = U_ZERO_ERROR;
@ -1640,6 +1641,39 @@ void RegexTest::API_Pattern() {
REGEX_ASSERT(fields[4]=="20");
delete pat1;
// Test split of string with empty trailing fields
pat1 = RegexPattern::compile(",", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("a,b,c,", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==4);
REGEX_ASSERT(fields[0]=="a");
REGEX_ASSERT(fields[1]=="b");
REGEX_ASSERT(fields[2]=="c");
REGEX_ASSERT(fields[3]=="");
n = pat1->split("a,,,", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==4);
REGEX_ASSERT(fields[0]=="a");
REGEX_ASSERT(fields[1]=="");
REGEX_ASSERT(fields[2]=="");
REGEX_ASSERT(fields[3]=="");
delete pat1;
// Split Separator with zero length match.
pat1 = RegexPattern::compile(":?", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("abc", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==5);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="b");
REGEX_ASSERT(fields[3]=="c");
REGEX_ASSERT(fields[4]=="");
delete pat1;
//
// RegexPattern::pattern()
@ -2795,18 +2829,22 @@ void RegexTest::API_Pattern_UTF8() {
n = pat1->split(" Now is the time ", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==5);
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="Now");
REGEX_ASSERT(fields[2]=="is");
REGEX_ASSERT(fields[3]=="the");
REGEX_ASSERT(fields[4]=="time");
REGEX_ASSERT(fields[5]=="");
REGEX_ASSERT(fields[6]=="");
fields[2] = "*";
n = pat1->split(" ", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==1);
REGEX_ASSERT(n==2);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="");
REGEX_ASSERT(fields[2]=="*");
fields[0] = "foo";
n = pat1->split("", fields, 10, status);
@ -2822,9 +2860,10 @@ void RegexTest::API_Pattern_UTF8() {
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
fields[6] = fields[7] = "*";
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(n==7);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
@ -2832,11 +2871,13 @@ void RegexTest::API_Pattern_UTF8() {
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
REGEX_ASSERT(fields[7]=="*");
REGEX_ASSERT(status==U_ZERO_ERROR);
fields[6] = fields[7] = "*";
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(n==7);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
@ -2844,10 +2885,11 @@ void RegexTest::API_Pattern_UTF8() {
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
REGEX_ASSERT(fields[7]=="*");
status = U_ZERO_ERROR;
fields[6] = "foo";
n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]==" ");
@ -2855,7 +2897,7 @@ void RegexTest::API_Pattern_UTF8() {
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[5]==" ");
REGEX_ASSERT(fields[6]=="foo");
status = U_ZERO_ERROR;