ICU-8404 Regular Expressions split(), fix incorrect handling of trailing empty field

X-SVN-Rev: 29801
2011-04-15 00:48:39 +00:00 · 2011-04-15 00:48:39 +00:00 · 8148726df2
commit 8148726df2
parent b6db2cf8d5
6 changed files with 174 additions and 85 deletions
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -2170,27 +2170,33 @@ int32_t  RegexMatcher::split(UText *input,
            // If the delimiter pattern has capturing parentheses, the captured
            //  text goes out into the next n destination strings.
            int32_t groupNum;
-            UBool lastGroupWasNullUText = FALSE;
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
-                if (i==destCapacity-1) {
+                if (i >= destCapacity-2) {
+                    // Never fill the last available output string with capture group text.
+                    // It will be filled with the last field, the remainder of the
+                    //  unsplit input text.
                    break;
                }
                i++;
-                lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
                dest[i] = group(groupNum, dest[i], status);
            }

            if (nextOutputStringStart == fActiveLimit) {
-                // The delimiter was at the end of the string.  We're done.
-                break;
-            } else if (i == destCapacity-1) {
-                // We're out of capture groups, and the rest of the string is more important
-                if (lastGroupWasNullUText) {
-                    utext_close(dest[i]);
-                    dest[i] = NULL;
+                // The delimiter was at the end of the string.  We're done, but first
+                // we output one last empty string, for the empty field following
+                //   the delimiter at the end of input.
+                if (i+1 < destCapacity) {
+                    ++i;
+                    if (dest[i] == NULL) {
+                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
+                    } else {
+                        static UChar emptyString[] = {(UChar)0};
+                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
+                    }
                }
-            }
-
+                break;
+            
+            } 
        }
        else
        {
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -397,7 +397,7 @@ public:

 private:
    /**
-     * Cause a compilation error if an application accidently attempts to
+     * Cause a compilation error if an application accidentally attempts to
     *   create a matcher with a (UChar *) string as input rather than
     *   a UnicodeString.  Avoids a dangling reference to a temporary string.
     * <p>
@ -430,7 +430,7 @@ public:

   /**
    * Test whether a string matches a regular expression.  This convenience function
-    * both compiles the reguluar expression and applies it in a single operation.
+    * both compiles the regular expression and applies it in a single operation.
    * Note that if the same pattern needs to be applied repeatedly, this method will be
    * less efficient than creating and reusing a RegexMatcher object.
    *
@ -450,7 +450,7 @@ public:

   /**
    * Test whether a string matches a regular expression.  This convenience function
-    * both compiles the reguluar expression and applies it in a single operation.
+    * both compiles the regular expression and applies it in a single operation.
    * Note that if the same pattern needs to be applied repeatedly, this method will be
    * less efficient than creating and reusing a RegexMatcher object.
    *
@ -493,13 +493,26 @@ public:


    /**
-     * Split a string into fields.  Somewhat like split() from Perl.
-     * The pattern matches identify delimiters that separate the input
-     *  into fields.  The input data between the matches becomes the
-     *  fields themselves.
-     * <p>
-     *  For the best performance on split() operations,
-     *  <code>RegexMatcher::split</code> is perferable to this function
+     * Split a string into fields.  Somewhat like split() from Perl or Java.
+     * Pattern matches identify delimiters that separate the input
+     * into fields.  The input data between the delimiters becomes the
+     * fields themselves.
+     *
+     * If the delimiter pattern includes capture groups, the captured text will
+     * also appear in the destination array of output strings, interspersed
+     * with the fields.  This is similar to Perl, but differs from Java, 
+     * which ignores the presence of capture groups in the pattern.
+     * 
+     * Trailing empty fields will always be returned, assuming sufficient
+     * destination capacity.  This differs from the default behavior for Java
+     * and Perl where trailing empty fields are not returned.
+     *
+     * The number of strings produced by the split operation is returned.
+     * This count includes the strings from capture groups in the delimiter pattern.
+     * This behavior differs from Java, which ignores capture groups.
+     *
+     * For the best performance on split() operations,
+     * <code>RegexMatcher::split</code> is preferable to this function.
     *
     * @param input   The string to be split into fields.  The field delimiters
     *                match the pattern (in the "this" object)
@ -524,13 +537,26 @@ public:


    /**
-     * Split a string into fields.  Somewhat like split() from Perl.
-     * The pattern matches identify delimiters that separate the input
-     *  into fields.  The input data between the matches becomes the
-     *  fields themselves.
-     * <p>
+     * Split a string into fields.  Somewhat like split() from Perl or Java.
+     * Pattern matches identify delimiters that separate the input
+     * into fields.  The input data between the delimiters becomes the
+     * fields themselves.
+     *
+     * If the delimiter pattern includes capture groups, the captured text will
+     * also appear in the destination array of output strings, interspersed
+     * with the fields.  This is similar to Perl, but differs from Java, 
+     * which ignores the presence of capture groups in the pattern.
+     * 
+     * Trailing empty fields will always be returned, assuming sufficient
+     * destination capacity.  This differs from the default behavior for Java
+     * and Perl where trailing empty fields are not returned.
+     *
+     * The number of strings produced by the split operation is returned.
+     * This count includes the strings from capture groups in the delimiter pattern.
+     * This behavior differs from Java, which ignores capture groups.
+     *
     *  For the best performance on split() operations,
-     *  <code>RegexMatcher::split</code> is perferable to this function
+     *  <code>RegexMatcher::split</code> is preferable to this function.
     *
     * @param input   The string to be split into fields.  The field delimiters
     *                match the pattern (in the "this" object)
@ -544,7 +570,7 @@ public:
     *                of fields, the trailing part of the input string, including any
     *                field delimiters, is placed in the last destination string.
     * @param status  A reference to a UErrorCode to receive any errors.
-     * @return        The number of fields into which the input string was split.
+     * @return        The number of destination strings used.  
     *
     * @draft ICU 4.6
     */
@ -637,7 +663,7 @@ private:


 /**
- *  class RegexMatcher bundles together a reular expression pattern and
+ *  class RegexMatcher bundles together a regular expression pattern and
 *  input text to which the expression can be applied.  It includes methods
 *  for testing for matches, and for find and replace operations.
 *
@ -731,7 +757,7 @@ public:

 private:
    /**
-     * Cause a compilation error if an application accidently attempts to
+     * Cause a compilation error if an application accidentally attempts to
     *   create a matcher with a (UChar *) string as input rather than
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
     * <p>
@ -956,7 +982,7 @@ public:
    *    @return the index of the last character matched, plus one.
    *                        The index value returned is a native index, corresponding to
    *                        code units for the underlying encoding type, for example,
-    *                        a byte index for UTF8.
+    *                        a byte index for UTF-8.
    *   @stable ICU 2.4
    */
    virtual int32_t end(UErrorCode &status) const;
@ -976,7 +1002,7 @@ public:
    *                        attempted or the last match failed and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    *    @return  the index of the first character following the text
-    *              captured by the specifed group during the previous match operation.
+    *              captured by the specified group during the previous match operation.
    *              Return -1 if the capture group exists in the pattern but was not part of the match.
    *              The index value returned is a native index, corresponding to
    *              code units for the underlying encoding type, for example,
@ -1084,7 +1110,7 @@ public:

 private:
    /**
-     * Cause a compilation error if an application accidently attempts to
+     * Cause a compilation error if an application accidentally attempts to
     *   reset a matcher with a (UChar *) string as input rather than
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
     * <p>
@ -1225,7 +1251,7 @@ public:
     
    /**
      * Return true if this matcher is using anchoring bounds.
-      * By default, matchers use anchoring region boounds.
+      * By default, matchers use anchoring region bounds.
      *
      * @return TRUE if this matcher is using anchoring bounds.
      * @stable ICU 4.0
@ -1553,7 +1579,7 @@ public:
    virtual int32_t getTimeLimit() const;

  /**
-    *  Set the amount of heap storage avaliable for use by the match backtracking stack.
+    *  Set the amount of heap storage available for use by the match backtracking stack.
    *  The matcher is also reset, discarding any results from previous matches.
    *  <p>
    *  ICU uses a backtracking regular expression engine, with the backtrack stack
@ -1606,7 +1632,7 @@ public:
  /**
    *  Get the callback function for this URegularExpression.
    *
-    *    @param   callback    Out paramater, receives a pointer to the user-supplied 
+    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
    *                         callback function.
    *    @param   context     Out parameter, receives the user context pointer that
    *                         was set when uregex_setMatchCallback() was called.
@ -1639,7 +1665,7 @@ public:
  /**
    *  Get the find progress callback function for this URegularExpression.
    *
-    *    @param   callback    Out paramater, receives a pointer to the user-supplied 
+    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
    *                         callback function.
    *    @param   context     Out parameter, receives the user context pointer that
    *                         was set when uregex_setFindProgressCallback() was called.
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@ -33,7 +33,7 @@

 struct URegularExpression;
 /**
-  * Structure representing a compiled regular rexpression, plus the results
+  * Structure representing a compiled regular expression, plus the results
  *    of a match operation.
  * @stable ICU 3.0
  */
@ -99,7 +99,7 @@ typedef enum URegexpFlag{

     /**  Error on Unrecognized backslash escapes.
       *     If set, fail with an error on patterns that contain
-       *     backslash-escaped ASCII letters without a known specail
+       *     backslash-escaped ASCII letters without a known special
       *     meaning.  If this flag is not set, these
       *     escaped letters represent themselves.
       *     @stable ICU 4.0
@ -117,13 +117,13 @@ typedef enum URegexpFlag{
  *
  * @param pattern        The Regular Expression pattern to be compiled. 
  * @param patternLength  The length of the pattern, or -1 if the pattern is
-  *                       NUL termintated.
+  *                       NUL terminated.
  * @param flags          Flags that alter the default matching behavior for
  *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
  *                       example.  For default behavior, set this parameter to zero.
  *                       See <code>enum URegexpFlag</code>.  All desired flags
  *                       are bitwise-ORed together.
-  * @param pe             Receives the position (line and column nubers) of any syntax
+  * @param pe             Receives the position (line and column numbers) of any syntax
  *                       error within the source regular expression string.  If this
  *                       information is not wanted, pass NULL for this parameter.
  * @param status         Receives error detected by this function.
@ -153,7 +153,7 @@ uregex_open( const  UChar          *pattern,
  *                       example.  For default behavior, set this parameter to zero.
  *                       See <code>enum URegexpFlag</code>.  All desired flags
  *                       are bitwise-ORed together.
-  * @param pe             Receives the position (line and column nubers) of any syntax
+  * @param pe             Receives the position (line and column numbers) of any syntax
  *                       error within the source regular expression string.  If this
  *                       information is not wanted, pass NULL for this parameter.
  * @param status         Receives error detected by this function.
@ -174,13 +174,13 @@ uregex_openUText(UText          *pattern,
  *   is supplied as an 8 bit char * string in the default code page.
  *
  * @param pattern        The Regular Expression pattern to be compiled, 
-  *                       NUL termintated.  
+  *                       NUL terminated.  
  * @param flags          Flags that alter the default matching behavior for
  *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
  *                       example.  For default behavior, set this parameter to zero.
  *                       See <code>enum URegexpFlag</code>.  All desired flags
  *                       are bitwise-ORed together.
-  * @param pe             Receives the position (line and column nubers) of any syntax
+  * @param pe             Receives the position (line and column numbers) of any syntax
  *                       error within the source regular expression string.  If this
  *                       information is not wanted, pass NULL for this parameter.
  * @param status         Receives errors detected by this function.
@ -234,7 +234,7 @@ U_NAMESPACE_END
 * form of the expression, and requires less memory.
 * <p>
 * Note that the current input string and the position of any matched text
- *  within it are not cloned; only the pattern itself and and the
+ *  within it are not cloned; only the pattern itself and the
 *  match mode flags are copied.
 * <p>
 * Cloning can be particularly useful to threaded applications that perform
@ -927,7 +927,7 @@ uregex_requireEnd(const  URegularExpression   *regexp,
  *    @param   replacementLength  The length of the replacement string, or
  *                                -1 if it is NUL terminated.
  *    @param   destBuf            A (UChar *) buffer that will receive the result.
-  *    @param   destCapacity       The capacity of the desitnation buffer.
+  *    @param   destCapacity       The capacity of the destination buffer.
  *    @param   status             A reference to a UErrorCode to receive any errors.
  *    @return                     The length of the string resulting from the find
  *                                and replace operation.  In the event that the
@ -986,7 +986,7 @@ uregex_replaceAllUText(URegularExpression *regexp,
  *    @param   replacementLength  The length of the replacement string, or
  *                                -1 if it is NUL terminated.
  *    @param   destBuf            A (UChar *) buffer that will receive the result.
-  *    @param   destCapacity       The capacity of the desitnation buffer.
+  *    @param   destCapacity       The capacity of the destination buffer.
  *    @param   status             a reference to a UErrorCode to receive any errors.
  *    @return                     The length of the string resulting from the find
  *                                and replace operation.  In the event that the
@ -1172,26 +1172,23 @@ uregex_appendTailUText(URegularExpression    *regexp,
   *  The pattern matches identify delimiters that separate the input
   *  into fields.  The input data between the matches becomes the
   *  fields themselves.
-   * <p>
+   *
   *  Each of the fields is copied from the input string to the destination
   *  buffer, and NUL terminated.  The position of each field within
   *  the destination buffer is returned in the destFields array.
   *
-   *  Note:  another choice for the design of this function would be to not
-   *         copy the resulting fields at all, but to return indexes and
-   *         lengths within the source text.  
-   *           Advantages would be
-   *             o  Faster.  No Copying.
-   *             o  Nothing extra needed when field data may contain embedded NUL chars.
-   *             o  Less memory needed if working on large data.
-   *           Disadvantages
-   *             o  Less consistent with C++ split, which copies into an
-   *                array of UnicodeStrings.
-   *             o  No NUL termination, extracted fields would be less convenient
-   *                to use in most cases.
-   *             o  Possible problems in the future, when support Unicode Normalization
-   *                could cause the fields to not correspond exactly to
-   *                a range of the source text.
+   *  If the delimiter pattern includes capture groups, the captured text will
+   *  also appear in the destination array of output strings, interspersed
+   *  with the fields.  This is similar to Perl, but differs from Java, 
+   *  which ignores the presence of capture groups in the pattern.
+   * 
+   *  Trailing empty fields will always be returned, assuming sufficient
+   *  destination capacity.  This differs from the default behavior for Java
+   *  and Perl where trailing empty fields are not returned.
+   *
+   *  The number of strings produced by the split operation is returned.
+   *  This count includes the strings from capture groups in the delimiter pattern.
+   *  This behavior differs from Java, which ignores capture groups.
   * 
   *    @param   regexp      The compiled regular expression.
   *    @param   destBuf     A (UChar *) buffer to receive the fields that
@ -1307,7 +1304,7 @@ uregex_getTimeLimit(const URegularExpression      *regexp,
                          UErrorCode              *status);

 /**
- * Set the amount of heap storage avaliable for use by the match backtracking stack.
+ * Set the amount of heap storage available for use by the match backtracking stack.
 * <p>
 * ICU uses a backtracking regular expression engine, with the backtrack stack
 * maintained on the heap.  This function sets the limit to the amount of memory
@ -1392,7 +1389,7 @@ uregex_setMatchCallback(URegularExpression      *regexp,
 *  Get the callback function for this URegularExpression.
 *
 * @param   regexp      The compiled regular expression.
- * @param   callback    Out paramater, receives a pointer to the user-supplied 
+ * @param   callback    Out parameter, receives a pointer to the user-supplied 
 *                      callback function.
 * @param   context     Out parameter, receives the user context pointer that
 *                      was set when uregex_setMatchCallback() was called.
@ -1464,7 +1461,7 @@ uregex_setFindProgressCallback(URegularExpression              *regexp,
 *  Get the find progress callback function for this URegularExpression.
 *
 * @param   regexp      The compiled regular expression.
- * @param   callback    Out paramater, receives a pointer to the user-supplied 
+ * @param   callback    Out parameter, receives a pointer to the user-supplied 
 *                      callback function.
 * @param   context     Out parameter, receives the user context pointer that
 *                      was set when uregex_setFindProgressCallback() was called.
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
@ -1841,7 +1841,11 @@ int32_t RegexCImpl::split(RegularExpression     *regexp,
                // Set up to extract the capture group contents into the dest buffer.
                destFields[i] = &destBuf[destIdx];
                tStatus = U_ZERO_ERROR;
-                int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
+                int32_t t = uregex_group((URegularExpression*)regexp, 
+                                         groupNum, 
+                                         destFields[i], 
+                                         REMAINING_CAPACITY(destIdx, destCapacity), 
+                                         &tStatus);
                destIdx += t + 1;    // Record the space used in the output string buffer.
                                     //  +1 for the NUL that terminates the string.
                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
@ -1852,7 +1856,18 @@ int32_t RegexCImpl::split(RegularExpression     *regexp,
            }

            if (nextOutputStringStart == inputLen) {
-                // The delimiter was at the end of the string.  We're done.
+                // The delimiter was at the end of the string. 
+                // Output an empty string, and then we are done.
+                if (destIdx < destCapacity) {
+                    destBuf[destIdx] = 0;
+                }
+                if (i < destFieldsCapacity-1) {
+                   ++i;
+                }
+                if (destIdx < destCapacity) {
+                    destFields[i] = destBuf + destIdx;
+                }
+                ++destIdx;
                break;
            }

--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@ -1225,15 +1225,16 @@ static void TestRegexCAPI(void) {

            /* The TEST_ASSERT_SUCCESS call above should change too... */
            if(U_SUCCESS(status)) {
-                TEST_ASSERT(numFields == 4);
+                TEST_ASSERT(numFields == 5);
                TEST_ASSERT_STRING("first ",  fields[0], TRUE);
                TEST_ASSERT_STRING("tag-a",   fields[1], TRUE);
                TEST_ASSERT_STRING(" second", fields[2], TRUE);
                TEST_ASSERT_STRING("tag-b",   fields[3], TRUE);
-                TEST_ASSERT(fields[4] == NULL);
+                TEST_ASSERT_STRING("",        fields[4], TRUE);
+                TEST_ASSERT(fields[5] == NULL);
                TEST_ASSERT(fields[8] == NULL);
                TEST_ASSERT(!memcmp(&fields[9],&minus1,sizeof(UChar*)));
-                spaceNeeded = strlen("first .tag-a. second.tag-b.");  /* "." at NUL positions */
+                spaceNeeded = strlen("first .tag-a. second.tag-b..");  /* "." at NUL positions */
                TEST_ASSERT(spaceNeeded == requiredCapacity);
            }
        }
@ -2134,13 +2135,15 @@ static void TestUTextAPI(void) {
                const char str_taga[] = { 0x74, 0x61, 0x67, 0x2d, 0x61, 0x00 }; /* tag-a */
                const char str_second[] = { 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x00 }; /*  second */
                const char str_tagb[] = { 0x74, 0x61, 0x67, 0x2d, 0x62, 0x00 }; /* tag-b */
+                const char str_empty[] = { 0x00 };

-                TEST_ASSERT(numFields == 4);
+                TEST_ASSERT(numFields == 5);
                TEST_ASSERT_UTEXT(str_first,  fields[0]);
                TEST_ASSERT_UTEXT(str_taga,   fields[1]);
                TEST_ASSERT_UTEXT(str_second, fields[2]);
                TEST_ASSERT_UTEXT(str_tagb,   fields[3]);
-                TEST_ASSERT(fields[4] == NULL);
+                TEST_ASSERT_UTEXT(str_empty,  fields[4]);
+                TEST_ASSERT(fields[5] == NULL);
                TEST_ASSERT(fields[8] == NULL);
                TEST_ASSERT(fields[9] == &patternText);
            }
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -1531,7 +1531,7 @@ void RegexTest::API_Pattern() {

    n = pat1->split("    Now       is the time   ", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="Now");
    REGEX_ASSERT(fields[2]=="is");
@ -1541,8 +1541,9 @@ void RegexTest::API_Pattern() {

    n = pat1->split("     ", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==1);
+    REGEX_ASSERT(n==2);
    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="");

    fields[0] = "foo";
    n = pat1->split("", fields, 10, status);
@ -1559,7 +1560,7 @@ void RegexTest::API_Pattern() {
    status = U_ZERO_ERROR;
    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
@ -1571,7 +1572,7 @@ void RegexTest::API_Pattern() {

    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]=="  ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
@ -1590,7 +1591,7 @@ void RegexTest::API_Pattern() {
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
-    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
    REGEX_ASSERT(fields[6]=="foo");

    status = U_ZERO_ERROR;
@ -1640,6 +1641,39 @@ void RegexTest::API_Pattern() {
    REGEX_ASSERT(fields[4]=="20");
    delete pat1;

+    // Test split of string with empty trailing fields
+    pat1 = RegexPattern::compile(",", pe, status);
+    REGEX_CHECK_STATUS;
+    n = pat1->split("a,b,c,", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==4);
+    REGEX_ASSERT(fields[0]=="a");
+    REGEX_ASSERT(fields[1]=="b");
+    REGEX_ASSERT(fields[2]=="c");
+    REGEX_ASSERT(fields[3]=="");
+
+    n = pat1->split("a,,,", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==4);
+    REGEX_ASSERT(fields[0]=="a");
+    REGEX_ASSERT(fields[1]=="");
+    REGEX_ASSERT(fields[2]=="");
+    REGEX_ASSERT(fields[3]=="");
+    delete pat1;
+
+    // Split Separator with zero length match.
+    pat1 = RegexPattern::compile(":?", pe, status);
+    REGEX_CHECK_STATUS;
+    n = pat1->split("abc", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="b");
+    REGEX_ASSERT(fields[3]=="c");
+    REGEX_ASSERT(fields[4]=="");
+
+    delete pat1;

    //
    // RegexPattern::pattern()
@ -2795,18 +2829,22 @@ void RegexTest::API_Pattern_UTF8() {

    n = pat1->split("    Now       is the time   ", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="Now");
    REGEX_ASSERT(fields[2]=="is");
    REGEX_ASSERT(fields[3]=="the");
    REGEX_ASSERT(fields[4]=="time");
    REGEX_ASSERT(fields[5]=="");
+    REGEX_ASSERT(fields[6]=="");

+    fields[2] = "*";
    n = pat1->split("     ", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==1);
+    REGEX_ASSERT(n==2);
    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="");
+    REGEX_ASSERT(fields[2]=="*");

    fields[0] = "foo";
    n = pat1->split("", fields, 10, status);
@ -2822,9 +2860,10 @@ void RegexTest::API_Pattern_UTF8() {
    REGEX_CHECK_STATUS;

    status = U_ZERO_ERROR;
+    fields[6] = fields[7] = "*";
    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
@ -2832,11 +2871,13 @@ void RegexTest::API_Pattern_UTF8() {
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");
+    REGEX_ASSERT(fields[7]=="*");
    REGEX_ASSERT(status==U_ZERO_ERROR);

+    fields[6] = fields[7] = "*";
    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(n==7);
    REGEX_ASSERT(fields[0]=="  ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
@ -2844,10 +2885,11 @@ void RegexTest::API_Pattern_UTF8() {
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");
+    REGEX_ASSERT(fields[7]=="*");

    status = U_ZERO_ERROR;
    fields[6] = "foo";
-    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
+    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="  ");
@ -2855,7 +2897,7 @@ void RegexTest::API_Pattern_UTF8() {
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
-    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[5]==" ");
    REGEX_ASSERT(fields[6]=="foo");

    status = U_ZERO_ERROR;