ICU-8404 Regular Expressions split(), fix incorrect handling of trailing empty field
X-SVN-Rev: 29801
This commit is contained in:
parent
b6db2cf8d5
commit
8148726df2
@ -2170,27 +2170,33 @@ int32_t RegexMatcher::split(UText *input,
|
||||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
int32_t groupNum;
|
||||
UBool lastGroupWasNullUText = FALSE;
|
||||
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
|
||||
if (i==destCapacity-1) {
|
||||
if (i >= destCapacity-2) {
|
||||
// Never fill the last available output string with capture group text.
|
||||
// It will be filled with the last field, the remainder of the
|
||||
// unsplit input text.
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
|
||||
dest[i] = group(groupNum, dest[i], status);
|
||||
}
|
||||
|
||||
if (nextOutputStringStart == fActiveLimit) {
|
||||
// The delimiter was at the end of the string. We're done.
|
||||
break;
|
||||
} else if (i == destCapacity-1) {
|
||||
// We're out of capture groups, and the rest of the string is more important
|
||||
if (lastGroupWasNullUText) {
|
||||
utext_close(dest[i]);
|
||||
dest[i] = NULL;
|
||||
// The delimiter was at the end of the string. We're done, but first
|
||||
// we output one last empty string, for the empty field following
|
||||
// the delimiter at the end of input.
|
||||
if (i+1 < destCapacity) {
|
||||
++i;
|
||||
if (dest[i] == NULL) {
|
||||
dest[i] = utext_openUChars(NULL, NULL, 0, &status);
|
||||
} else {
|
||||
static UChar emptyString[] = {(UChar)0};
|
||||
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -397,7 +397,7 @@ public:
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidently attempts to
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
* create a matcher with a (UChar *) string as input rather than
|
||||
* a UnicodeString. Avoids a dangling reference to a temporary string.
|
||||
* <p>
|
||||
@ -430,7 +430,7 @@ public:
|
||||
|
||||
/**
|
||||
* Test whether a string matches a regular expression. This convenience function
|
||||
* both compiles the reguluar expression and applies it in a single operation.
|
||||
* both compiles the regular expression and applies it in a single operation.
|
||||
* Note that if the same pattern needs to be applied repeatedly, this method will be
|
||||
* less efficient than creating and reusing a RegexMatcher object.
|
||||
*
|
||||
@ -450,7 +450,7 @@ public:
|
||||
|
||||
/**
|
||||
* Test whether a string matches a regular expression. This convenience function
|
||||
* both compiles the reguluar expression and applies it in a single operation.
|
||||
* both compiles the regular expression and applies it in a single operation.
|
||||
* Note that if the same pattern needs to be applied repeatedly, this method will be
|
||||
* less efficient than creating and reusing a RegexMatcher object.
|
||||
*
|
||||
@ -493,13 +493,26 @@ public:
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
* For the best performance on split() operations,
|
||||
* <code>RegexMatcher::split</code> is perferable to this function
|
||||
* Split a string into fields. Somewhat like split() from Perl or Java.
|
||||
* Pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the delimiters becomes the
|
||||
* fields themselves.
|
||||
*
|
||||
* If the delimiter pattern includes capture groups, the captured text will
|
||||
* also appear in the destination array of output strings, interspersed
|
||||
* with the fields. This is similar to Perl, but differs from Java,
|
||||
* which ignores the presence of capture groups in the pattern.
|
||||
*
|
||||
* Trailing empty fields will always be returned, assuming sufficient
|
||||
* destination capacity. This differs from the default behavior for Java
|
||||
* and Perl where trailing empty fields are not returned.
|
||||
*
|
||||
* The number of strings produced by the split operation is returned.
|
||||
* This count includes the strings from capture groups in the delimiter pattern.
|
||||
* This behavior differs from Java, which ignores capture groups.
|
||||
*
|
||||
* For the best performance on split() operations,
|
||||
* <code>RegexMatcher::split</code> is preferable to this function
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object)
|
||||
@ -524,13 +537,26 @@ public:
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
* Split a string into fields. Somewhat like split() from Perl or Java.
|
||||
* Pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the delimiters becomes the
|
||||
* fields themselves.
|
||||
*
|
||||
* If the delimiter pattern includes capture groups, the captured text will
|
||||
* also appear in the destination array of output strings, interspersed
|
||||
* with the fields. This is similar to Perl, but differs from Java,
|
||||
* which ignores the presence of capture groups in the pattern.
|
||||
*
|
||||
* Trailing empty fields will always be returned, assuming sufficient
|
||||
* destination capacity. This differs from the default behavior for Java
|
||||
* and Perl where trailing empty fields are not returned.
|
||||
*
|
||||
* The number of strings produced by the split operation is returned.
|
||||
* This count includes the strings from capture groups in the delimiter pattern.
|
||||
* This behavior differs from Java, which ignores capture groups.
|
||||
*
|
||||
* For the best performance on split() operations,
|
||||
* <code>RegexMatcher::split</code> is perferable to this function
|
||||
* <code>RegexMatcher::split</code> is preferable to this function
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object)
|
||||
@ -544,7 +570,7 @@ public:
|
||||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
* @return The number of destination strings used.
|
||||
*
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
@ -637,7 +663,7 @@ private:
|
||||
|
||||
|
||||
/**
|
||||
* class RegexMatcher bundles together a reular expression pattern and
|
||||
* class RegexMatcher bundles together a regular expression pattern and
|
||||
* input text to which the expression can be applied. It includes methods
|
||||
* for testing for matches, and for find and replace operations.
|
||||
*
|
||||
@ -731,7 +757,7 @@ public:
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidently attempts to
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
* create a matcher with a (UChar *) string as input rather than
|
||||
* a UnicodeString. Avoids a dangling reference to a temporary string.
|
||||
* <p>
|
||||
@ -956,7 +982,7 @@ public:
|
||||
* @return the index of the last character matched, plus one.
|
||||
* The index value returned is a native index, corresponding to
|
||||
* code units for the underlying encoding type, for example,
|
||||
* a byte index for UTF8.
|
||||
* a byte index for UTF-8.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(UErrorCode &status) const;
|
||||
@ -976,7 +1002,7 @@ public:
|
||||
* attempted or the last match failed and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||
* @return the index of the first character following the text
|
||||
* captured by the specifed group during the previous match operation.
|
||||
* captured by the specified group during the previous match operation.
|
||||
* Return -1 if the capture group exists in the pattern but was not part of the match.
|
||||
* The index value returned is a native index, corresponding to
|
||||
* code units for the underlying encoding type, for example,
|
||||
@ -1084,7 +1110,7 @@ public:
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidently attempts to
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
* reset a matcher with a (UChar *) string as input rather than
|
||||
* a UnicodeString. Avoids a dangling reference to a temporary string.
|
||||
* <p>
|
||||
@ -1225,7 +1251,7 @@ public:
|
||||
|
||||
/**
|
||||
* Return true if this matcher is using anchoring bounds.
|
||||
* By default, matchers use anchoring region boounds.
|
||||
* By default, matchers use anchoring region bounds.
|
||||
*
|
||||
* @return TRUE if this matcher is using anchoring bounds.
|
||||
* @stable ICU 4.0
|
||||
@ -1553,7 +1579,7 @@ public:
|
||||
virtual int32_t getTimeLimit() const;
|
||||
|
||||
/**
|
||||
* Set the amount of heap storage avaliable for use by the match backtracking stack.
|
||||
* Set the amount of heap storage available for use by the match backtracking stack.
|
||||
* The matcher is also reset, discarding any results from previous matches.
|
||||
* <p>
|
||||
* ICU uses a backtracking regular expression engine, with the backtrack stack
|
||||
@ -1606,7 +1632,7 @@ public:
|
||||
/**
|
||||
* Get the callback function for this URegularExpression.
|
||||
*
|
||||
* @param callback Out paramater, receives a pointer to the user-supplied
|
||||
* @param callback Out parameter, receives a pointer to the user-supplied
|
||||
* callback function.
|
||||
* @param context Out parameter, receives the user context pointer that
|
||||
* was set when uregex_setMatchCallback() was called.
|
||||
@ -1639,7 +1665,7 @@ public:
|
||||
/**
|
||||
* Get the find progress callback function for this URegularExpression.
|
||||
*
|
||||
* @param callback Out paramater, receives a pointer to the user-supplied
|
||||
* @param callback Out parameter, receives a pointer to the user-supplied
|
||||
* callback function.
|
||||
* @param context Out parameter, receives the user context pointer that
|
||||
* was set when uregex_setFindProgressCallback() was called.
|
||||
|
@ -33,7 +33,7 @@
|
||||
|
||||
struct URegularExpression;
|
||||
/**
|
||||
* Structure representing a compiled regular rexpression, plus the results
|
||||
* Structure representing a compiled regular expression, plus the results
|
||||
* of a match operation.
|
||||
* @stable ICU 3.0
|
||||
*/
|
||||
@ -99,7 +99,7 @@ typedef enum URegexpFlag{
|
||||
|
||||
/** Error on Unrecognized backslash escapes.
|
||||
* If set, fail with an error on patterns that contain
|
||||
* backslash-escaped ASCII letters without a known specail
|
||||
* backslash-escaped ASCII letters without a known special
|
||||
* meaning. If this flag is not set, these
|
||||
* escaped letters represent themselves.
|
||||
* @stable ICU 4.0
|
||||
@ -117,13 +117,13 @@ typedef enum URegexpFlag{
|
||||
*
|
||||
* @param pattern The Regular Expression pattern to be compiled.
|
||||
* @param patternLength The length of the pattern, or -1 if the pattern is
|
||||
* NUL termintated.
|
||||
* NUL terminated.
|
||||
* @param flags Flags that alter the default matching behavior for
|
||||
* the regular expression, UREGEX_CASE_INSENSITIVE, for
|
||||
* example. For default behavior, set this parameter to zero.
|
||||
* See <code>enum URegexpFlag</code>. All desired flags
|
||||
* are bitwise-ORed together.
|
||||
* @param pe Receives the position (line and column nubers) of any syntax
|
||||
* @param pe Receives the position (line and column numbers) of any syntax
|
||||
* error within the source regular expression string. If this
|
||||
* information is not wanted, pass NULL for this parameter.
|
||||
* @param status Receives errors detected by this function.
|
||||
@ -153,7 +153,7 @@ uregex_open( const UChar *pattern,
|
||||
* example. For default behavior, set this parameter to zero.
|
||||
* See <code>enum URegexpFlag</code>. All desired flags
|
||||
* are bitwise-ORed together.
|
||||
* @param pe Receives the position (line and column nubers) of any syntax
|
||||
* @param pe Receives the position (line and column numbers) of any syntax
|
||||
* error within the source regular expression string. If this
|
||||
* information is not wanted, pass NULL for this parameter.
|
||||
* @param status Receives errors detected by this function.
|
||||
@ -174,13 +174,13 @@ uregex_openUText(UText *pattern,
|
||||
* is supplied as an 8 bit char * string in the default code page.
|
||||
*
|
||||
* @param pattern The Regular Expression pattern to be compiled,
|
||||
* NUL termintated.
|
||||
* NUL terminated.
|
||||
* @param flags Flags that alter the default matching behavior for
|
||||
* the regular expression, UREGEX_CASE_INSENSITIVE, for
|
||||
* example. For default behavior, set this parameter to zero.
|
||||
* See <code>enum URegexpFlag</code>. All desired flags
|
||||
* are bitwise-ORed together.
|
||||
* @param pe Receives the position (line and column nubers) of any syntax
|
||||
* @param pe Receives the position (line and column numbers) of any syntax
|
||||
* error within the source regular expression string. If this
|
||||
* information is not wanted, pass NULL for this parameter.
|
||||
* @param status Receives errors detected by this function.
|
||||
@ -234,7 +234,7 @@ U_NAMESPACE_END
|
||||
* form of the expression, and requires less memory.
|
||||
* <p>
|
||||
* Note that the current input string and the position of any matched text
|
||||
* within it are not cloned; only the pattern itself and and the
|
||||
* within it are not cloned; only the pattern itself and the
|
||||
* match mode flags are copied.
|
||||
* <p>
|
||||
* Cloning can be particularly useful to threaded applications that perform
|
||||
@ -927,7 +927,7 @@ uregex_requireEnd(const URegularExpression *regexp,
|
||||
* @param replacementLength The length of the replacement string, or
|
||||
* -1 if it is NUL terminated.
|
||||
* @param destBuf A (UChar *) buffer that will receive the result.
|
||||
* @param destCapacity The capacity of the desitnation buffer.
|
||||
* @param destCapacity The capacity of the destination buffer.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The length of the string resulting from the find
|
||||
* and replace operation. In the event that the
|
||||
@ -986,7 +986,7 @@ uregex_replaceAllUText(URegularExpression *regexp,
|
||||
* @param replacementLength The length of the replacement string, or
|
||||
* -1 if it is NUL terminated.
|
||||
* @param destBuf A (UChar *) buffer that will receive the result.
|
||||
* @param destCapacity The capacity of the desitnation buffer.
|
||||
* @param destCapacity The capacity of the destination buffer.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return The length of the string resulting from the find
|
||||
* and replace operation. In the event that the
|
||||
@ -1172,26 +1172,23 @@ uregex_appendTailUText(URegularExpression *regexp,
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
*
|
||||
* Each of the fields is copied from the input string to the destination
|
||||
* buffer, and NUL terminated. The position of each field within
|
||||
* the destination buffer is returned in the destFields array.
|
||||
*
|
||||
* Note: another choice for the design of this function would be to not
|
||||
* copy the resulting fields at all, but to return indexes and
|
||||
* lengths within the source text.
|
||||
* Advantages would be
|
||||
* o Faster. No Copying.
|
||||
* o Nothing extra needed when field data may contain embedded NUL chars.
|
||||
* o Less memory needed if working on large data.
|
||||
* Disadvantages
|
||||
* o Less consistent with C++ split, which copies into an
|
||||
* array of UnicodeStrings.
|
||||
* o No NUL termination, extracted fields would be less convenient
|
||||
* to use in most cases.
|
||||
* o Possible problems in the future, when support Unicode Normalization
|
||||
* could cause the fields to not correspond exactly to
|
||||
* a range of the source text.
|
||||
* If the delimiter pattern includes capture groups, the captured text will
|
||||
* also appear in the destination array of output strings, interspersed
|
||||
* with the fields. This is similar to Perl, but differs from Java,
|
||||
* which ignores the presence of capture groups in the pattern.
|
||||
*
|
||||
* Trailing empty fields will always be returned, assuming sufficient
|
||||
* destination capacity. This differs from the default behavior for Java
|
||||
* and Perl where trailing empty fields are not returned.
|
||||
*
|
||||
* The number of strings produced by the split operation is returned.
|
||||
* This count includes the strings from capture groups in the delimiter pattern.
|
||||
* This behavior differs from Java, which ignores capture groups.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param destBuf A (UChar *) buffer to receive the fields that
|
||||
@ -1307,7 +1304,7 @@ uregex_getTimeLimit(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Set the amount of heap storage avaliable for use by the match backtracking stack.
|
||||
* Set the amount of heap storage available for use by the match backtracking stack.
|
||||
* <p>
|
||||
* ICU uses a backtracking regular expression engine, with the backtrack stack
|
||||
* maintained on the heap. This function sets the limit to the amount of memory
|
||||
@ -1392,7 +1389,7 @@ uregex_setMatchCallback(URegularExpression *regexp,
|
||||
* Get the callback function for this URegularExpression.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param callback Out paramater, receives a pointer to the user-supplied
|
||||
* @param callback Out parameter, receives a pointer to the user-supplied
|
||||
* callback function.
|
||||
* @param context Out parameter, receives the user context pointer that
|
||||
* was set when uregex_setMatchCallback() was called.
|
||||
@ -1464,7 +1461,7 @@ uregex_setFindProgressCallback(URegularExpression *regexp,
|
||||
* Get the find progress callback function for this URegularExpression.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param callback Out paramater, receives a pointer to the user-supplied
|
||||
* @param callback Out parameter, receives a pointer to the user-supplied
|
||||
* callback function.
|
||||
* @param context Out parameter, receives the user context pointer that
|
||||
* was set when uregex_setFindProgressCallback() was called.
|
||||
|
@ -1841,7 +1841,11 @@ int32_t RegexCImpl::split(RegularExpression *regexp,
|
||||
// Set up to extract the capture group contents into the dest buffer.
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
tStatus = U_ZERO_ERROR;
|
||||
int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
|
||||
int32_t t = uregex_group((URegularExpression*)regexp,
|
||||
groupNum,
|
||||
destFields[i],
|
||||
REMAINING_CAPACITY(destIdx, destCapacity),
|
||||
&tStatus);
|
||||
destIdx += t + 1; // Record the space used in the output string buffer.
|
||||
// +1 for the NUL that terminates the string.
|
||||
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
@ -1852,7 +1856,18 @@ int32_t RegexCImpl::split(RegularExpression *regexp,
|
||||
}
|
||||
|
||||
if (nextOutputStringStart == inputLen) {
|
||||
// The delimiter was at the end of the string. We're done.
|
||||
// The delimiter was at the end of the string.
|
||||
// Output an empty string, and then we are done.
|
||||
if (destIdx < destCapacity) {
|
||||
destBuf[destIdx] = 0;
|
||||
}
|
||||
if (i < destFieldsCapacity-1) {
|
||||
++i;
|
||||
}
|
||||
if (destIdx < destCapacity) {
|
||||
destFields[i] = destBuf + destIdx;
|
||||
}
|
||||
++destIdx;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1225,15 +1225,16 @@ static void TestRegexCAPI(void) {
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 4);
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" second", fields[2], TRUE);
|
||||
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
|
||||
TEST_ASSERT(fields[4] == NULL);
|
||||
TEST_ASSERT_STRING("", fields[4], TRUE);
|
||||
TEST_ASSERT(fields[5] == NULL);
|
||||
TEST_ASSERT(fields[8] == NULL);
|
||||
TEST_ASSERT(!memcmp(&fields[9],&minus1,sizeof(UChar*)));
|
||||
spaceNeeded = strlen("first .tag-a. second.tag-b."); /* "." at NUL positions */
|
||||
spaceNeeded = strlen("first .tag-a. second.tag-b.."); /* "." at NUL positions */
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
}
|
||||
}
|
||||
@ -2134,13 +2135,15 @@ static void TestUTextAPI(void) {
|
||||
const char str_taga[] = { 0x74, 0x61, 0x67, 0x2d, 0x61, 0x00 }; /* tag-a */
|
||||
const char str_second[] = { 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x00 }; /* second */
|
||||
const char str_tagb[] = { 0x74, 0x61, 0x67, 0x2d, 0x62, 0x00 }; /* tag-b */
|
||||
const char str_empty[] = { 0x00 };
|
||||
|
||||
TEST_ASSERT(numFields == 4);
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_UTEXT(str_first, fields[0]);
|
||||
TEST_ASSERT_UTEXT(str_taga, fields[1]);
|
||||
TEST_ASSERT_UTEXT(str_second, fields[2]);
|
||||
TEST_ASSERT_UTEXT(str_tagb, fields[3]);
|
||||
TEST_ASSERT(fields[4] == NULL);
|
||||
TEST_ASSERT_UTEXT(str_empty, fields[4]);
|
||||
TEST_ASSERT(fields[5] == NULL);
|
||||
TEST_ASSERT(fields[8] == NULL);
|
||||
TEST_ASSERT(fields[9] == &patternText);
|
||||
}
|
||||
|
@ -1531,7 +1531,7 @@ void RegexTest::API_Pattern() {
|
||||
|
||||
n = pat1->split(" Now is the time ", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==5);
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="Now");
|
||||
REGEX_ASSERT(fields[2]=="is");
|
||||
@ -1541,8 +1541,9 @@ void RegexTest::API_Pattern() {
|
||||
|
||||
n = pat1->split(" ", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==1);
|
||||
REGEX_ASSERT(n==2);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="");
|
||||
|
||||
fields[0] = "foo";
|
||||
n = pat1->split("", fields, 10, status);
|
||||
@ -1559,7 +1560,7 @@ void RegexTest::API_Pattern() {
|
||||
status = U_ZERO_ERROR;
|
||||
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(n==7);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
@ -1571,7 +1572,7 @@ void RegexTest::API_Pattern() {
|
||||
|
||||
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(n==7);
|
||||
REGEX_ASSERT(fields[0]==" ");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
@ -1590,7 +1591,7 @@ void RegexTest::API_Pattern() {
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
REGEX_ASSERT(fields[3]=="b");
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
|
||||
REGEX_ASSERT(fields[6]=="foo");
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
@ -1640,6 +1641,39 @@ void RegexTest::API_Pattern() {
|
||||
REGEX_ASSERT(fields[4]=="20");
|
||||
delete pat1;
|
||||
|
||||
// Test split of string with empty trailing fields
|
||||
pat1 = RegexPattern::compile(",", pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
n = pat1->split("a,b,c,", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==4);
|
||||
REGEX_ASSERT(fields[0]=="a");
|
||||
REGEX_ASSERT(fields[1]=="b");
|
||||
REGEX_ASSERT(fields[2]=="c");
|
||||
REGEX_ASSERT(fields[3]=="");
|
||||
|
||||
n = pat1->split("a,,,", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==4);
|
||||
REGEX_ASSERT(fields[0]=="a");
|
||||
REGEX_ASSERT(fields[1]=="");
|
||||
REGEX_ASSERT(fields[2]=="");
|
||||
REGEX_ASSERT(fields[3]=="");
|
||||
delete pat1;
|
||||
|
||||
// Split Separator with zero length match.
|
||||
pat1 = RegexPattern::compile(":?", pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
n = pat1->split("abc", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==5);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="b");
|
||||
REGEX_ASSERT(fields[3]=="c");
|
||||
REGEX_ASSERT(fields[4]=="");
|
||||
|
||||
delete pat1;
|
||||
|
||||
//
|
||||
// RegexPattern::pattern()
|
||||
@ -2795,18 +2829,22 @@ void RegexTest::API_Pattern_UTF8() {
|
||||
|
||||
n = pat1->split(" Now is the time ", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==5);
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="Now");
|
||||
REGEX_ASSERT(fields[2]=="is");
|
||||
REGEX_ASSERT(fields[3]=="the");
|
||||
REGEX_ASSERT(fields[4]=="time");
|
||||
REGEX_ASSERT(fields[5]=="");
|
||||
REGEX_ASSERT(fields[6]=="");
|
||||
|
||||
fields[2] = "*";
|
||||
n = pat1->split(" ", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==1);
|
||||
REGEX_ASSERT(n==2);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="");
|
||||
REGEX_ASSERT(fields[2]=="*");
|
||||
|
||||
fields[0] = "foo";
|
||||
n = pat1->split("", fields, 10, status);
|
||||
@ -2822,9 +2860,10 @@ void RegexTest::API_Pattern_UTF8() {
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
fields[6] = fields[7] = "*";
|
||||
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(n==7);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
@ -2832,11 +2871,13 @@ void RegexTest::API_Pattern_UTF8() {
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[6]=="");
|
||||
REGEX_ASSERT(fields[7]=="*");
|
||||
REGEX_ASSERT(status==U_ZERO_ERROR);
|
||||
|
||||
fields[6] = fields[7] = "*";
|
||||
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(n==7);
|
||||
REGEX_ASSERT(fields[0]==" ");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
@ -2844,10 +2885,11 @@ void RegexTest::API_Pattern_UTF8() {
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[6]=="");
|
||||
REGEX_ASSERT(fields[7]=="*");
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
fields[6] = "foo";
|
||||
n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
|
||||
n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(fields[0]==" ");
|
||||
@ -2855,7 +2897,7 @@ void RegexTest::API_Pattern_UTF8() {
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
REGEX_ASSERT(fields[3]=="b");
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[5]==" ");
|
||||
REGEX_ASSERT(fields[6]=="foo");
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
|
Loading…
Reference in New Issue
Block a user