ICU-2421 C API for regular expressions

X-SVN-Rev: 14768
2004-03-26 01:23:01 +00:00 · 2004-03-26 01:23:01 +00:00 · 3854e7ced0
commit 3854e7ced0
parent 1745685d1e
3 changed files with 264 additions and 25 deletions
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@ -563,8 +563,9 @@ uregex_appendTail(URegularExpression    *regexp,
   *                         set to NULL.
   *    @param   destCapacity The capacity of the destBuf.
   *    @param   requiredCapacity  The actual capacity required of the destBuf.
-   *                         If destCapacity is too small, requiredCapacity is the
-   *                         total capacity required to hold all of the output.
+   *                         If destCapacity is too small, requiredCapacity is set to
+   *                         the total capacity required to hold all of the output, and
+   *                         status is set to U_BUFFER_OVERFLOW_ERROR.
   *    @param   destFields  An array to be filled with the position of each
   *                         of the extracted fields within destBuf.
   *    @param   destFieldsCapacity  The number of elements in the destFields array.
@ -574,7 +575,9 @@ uregex_appendTail(URegularExpression    *regexp,
   *                input, including any field delimiters, is treated as if it
   *                were the last field - it is copied to the destBuf, and
   *                its position is in the destBuf is stored in the last element
-   *                of destFields.
+   *                of destFields.  This behavior mimics that of Perl.  It is not
+   *                an error condition, and no error status is returned when all
+   *                destFields positions are used.
   * @param status  A reference to a UErrorCode to receive any errors.
   * @return        The number of fields into which the input string was split.
   * @draft ICU 3.0
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
@ -56,7 +56,7 @@ URegularExpression::URegularExpression() {
 //   validateRE    Do boilerplate style checks on API function parameters.
 //                 Return TRUE if they look OK.
 //----------------------------------------------------------------------------------------
-static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
+static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
    if (U_FAILURE(*status)) {
        return FALSE;
    }
@ -65,6 +65,10 @@ static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
+    if (requiresText && re->fText == NULL) {
+        *status = U_REGEX_INVALID_STATE;
+        return FALSE;
+    }
    return TRUE;
 }

@ -182,7 +186,7 @@ uregex_openC( const char           *pattern,
 U_CAPI void  U_EXPORT2
 uregex_close(URegularExpression  *re) {
    UErrorCode  status = U_ZERO_ERROR;
-    if (validateRE(re, &status) == FALSE) {
+    if (validateRE(re, &status, FALSE) == FALSE) {
        return;
    }

@ -205,7 +209,7 @@ uregex_close(URegularExpression  *re) {
 //----------------------------------------------------------------------------------------
 U_CAPI URegularExpression * U_EXPORT2 
 uregex_clone(const URegularExpression *source, UErrorCode *status)  {
-    if (validateRE(source, status) == FALSE) {
+    if (validateRE(source, status, FALSE) == FALSE) {
        return NULL;
    }

@ -248,7 +252,7 @@ uregex_pattern(const  URegularExpression *regexp,
               int32_t            *patLength,
               UErrorCode         *status)  {
    
-    if (validateRE(regexp, status) == FALSE) {
+    if (validateRE(regexp, status, FALSE) == FALSE) {
        return NULL;
    }
    if (patLength != NULL) {
@ -265,7 +269,7 @@ uregex_pattern(const  URegularExpression *regexp,
 //----------------------------------------------------------------------------------------
 U_CAPI int32_t U_EXPORT2 
 uregex_flags(const URegularExpression *regexp, UErrorCode *status)  {
-    if (validateRE(regexp, status) == FALSE) {
+    if (validateRE(regexp, status, FALSE) == FALSE) {
        return 0;
    }
    int32_t flags = regexp->fPat->flags();
@ -283,7 +287,7 @@ uregex_setText(URegularExpression *regexp,
               const UChar        *text,
               int32_t             textLength,
               UErrorCode         *status)  {
-    if (validateRE(regexp, status) == FALSE) {
+    if (validateRE(regexp, status, FALSE) == FALSE) {
        return;
    }
    if (text == NULL || textLength < -1) {
@ -309,7 +313,7 @@ U_CAPI const UChar * U_EXPORT2
 uregex_getText(URegularExpression *regexp,
               int32_t            *textLength,
               UErrorCode         *status)  {
-    if (validateRE(regexp, status) == FALSE) {
+    if (validateRE(regexp, status, FALSE) == FALSE) {
        return NULL;
    }
    if (textLength != NULL) {
@ -394,7 +398,7 @@ uregex_findNext(URegularExpression *regexp,
 U_CAPI int32_t U_EXPORT2 
 uregex_groupCount(URegularExpression *regexp,
                  UErrorCode         *status)  {
-    if (validateRE(regexp, status) == FALSE) {
+    if (validateRE(regexp, status, FALSE) == FALSE) {
        return 0;
    }
    int32_t  result = regexp->fMatcher->groupCount();
@ -718,12 +722,26 @@ int32_t RegexCImpl::appendReplacement(URegularExpression    *regexp,
                       replacementText);

                if (escapedChar != (UChar32)0xFFFFFFFF) {
-                    if (capacityRemaining > 0) {
-                        dest[resultLen] = c;
-                        capacityRemaining--;
+                    if (escapedChar <= 0xffff) {
+                        if (capacityRemaining > 0) {
+                            dest[resultLen] = (UChar)escapedChar;
+                            capacityRemaining--;
+                        }
+                        resultLen++;
+                    } else {
+                        if (capacityRemaining > 0) {
+                            dest[resultLen] = U16_LEAD(escapedChar);
+                            capacityRemaining--;
+                        }
+                        resultLen++;
+                        if (capacityRemaining > 0) {
+                            dest[resultLen] = U16_TRAIL(escapedChar);
+                            capacityRemaining--;
+                        }
+                        resultLen++;
                    }
-                    resultLen++;
-                    continue;
+
+                continue;
                }
                // Note:  if the \u escape was invalid, just fall through and
                //        treat it as a plain \<anything> escape.
@ -975,8 +993,7 @@ static void copyString(UChar        *destBuffer,    //  Destination buffer.
    int32_t  di = *destIndex;
    UChar    c;

-    for (si=0; si<srcLen;
-    si++) {
+    for (si=0; si<srcLen;  si++) {
        c = srcPtr[si];
        if (di < destCapacity) {
            destBuffer[di] = c;
@ -1040,23 +1057,31 @@ uregex_split(   URegularExpression      *regexp,
            //    capture groups of the delimiter expression, in which case we will discard the
            //    last capture group saved in favor of the unprocessed remainder of the
            //    input string.)
-            i = destFieldsCapacity-1;
            int32_t remainingLength = inputLen-nextOutputStringStart;
            if (remainingLength > 0) {
-                destFields[i] = &destBuf[destIdx];
-                copyString(destBuf, destCapacity, &destIdx, regexp->fText, remainingLength);
            }
+            if (i >= destFieldsCapacity) {
+                // No fields are left.  Recycle the last one for holding the trailing part of
+                //   the input string.
+                i = destFieldsCapacity-1;
+                destIdx = destFields[i] - destFields[0];
+            }
+            
+            destFields[i] = &destBuf[destIdx];
+            copyString(destBuf, destCapacity, &destIdx, 
+                &regexp->fText[nextOutputStringStart], remainingLength);
            break;
        }
+        
        if (regexp->fMatcher->find()) {
            // We found another delimiter.  Move everything from where we started looking
            //  up until the start of the delimiter into the next output string.
            int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
            destFields[i] = &destBuf[destIdx];
            copyString(destBuf, destCapacity, &destIdx, 
-                                &regexp->fText[nextOutputStringStart], fieldLen);
+                &regexp->fText[nextOutputStringStart], fieldLen);
            nextOutputStringStart =  regexp->fMatcher->end(*status);
-
+            
            // If the delimiter pattern has capturing parentheses, the captured
            //  text goes out into the next n destination strings.
            int32_t groupNum;
@ -1066,7 +1091,7 @@ uregex_split(   URegularExpression      *regexp,
                    break;
                }
                i++;
-
+                
                // Set up to extract the capture group contents into the dest buffer.
                UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow
                                                      //  error while extracting this group.
@ -1106,6 +1131,9 @@ uregex_split(   URegularExpression      *regexp,
    if (requiredCapacity != NULL) {
        *requiredCapacity = destIdx;
    }
+    if (*requiredCapacity > destCapacity) {
+        *status = U_BUFFER_OVERFLOW_ERROR;
+    }
    return i+1;
 }

--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@ -228,12 +228,20 @@ void TestRegexCAPI(void) {
        UChar  text1[50];
        UChar  text2[50];
        UBool  result;
+
        u_uastrncpy(text1, "abcccd",  sizeof(text1)/2);
        u_uastrncpy(text2, "abcccxd", sizeof(text2)/2);
        status = U_ZERO_ERROR;
        u_uastrncpy(pat, "abc*d", sizeof(pat)/2);
        re = uregex_open(pat, -1, 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);

+        /* Operation before doing a setText should fail... */
+        status = U_ZERO_ERROR;
+        uregex_lookingAt(re, 0, &status);
+        TEST_ASSERT( status== U_REGEX_INVALID_STATE);
+
+        status = U_ZERO_ERROR;
        uregex_setText(re, text1, -1, &status);
        result = uregex_lookingAt(re, 0, &status);
        TEST_ASSERT(result == TRUE);
@ -644,9 +652,47 @@ void TestRegexCAPI(void) {
    /*
     *  appendReplacement()
     */
+    {
+        UChar    text[100];
+        UChar    repl[100];
+        UChar    buf[100];
+        UChar   *bufPtr;
+        int32_t  bufCap;
+
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC(".*", 0, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        u_uastrncpy(text, "whatever",  sizeof(text)/2);
+        u_uastrncpy(repl, "some other", sizeof(repl)/2);
+        uregex_setText(re, text, -1, &status);
+
+        /* match covers whole target string */
+        uregex_find(re, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        bufPtr = buf;
+        bufCap = sizeof(buf) / 2;
+        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_STRING("some other", buf, TRUE);
+
+        /* Replacement text has \u \U escapes */
+        uregex_find(re, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        bufPtr = buf;
+        bufCap = sizeof(buf) / 2;
+        u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2);
+        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
+        TEST_ASSERT_SUCCESS(status);
+        /* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE);  TODO:  */
+
+
+    }
+

    /*
-     *  appendTail()
+     *  appendTail().   Checked in ReplaceFirst(), replaceAll().
     */

    /*
@ -660,15 +706,21 @@ void TestRegexCAPI(void) {
        int32_t  numFields;
        int32_t  requiredCapacity;
        int32_t  spaceNeeded;
+        int32_t  sz;

        u_uastrncpy(textToSplit, "first : second:  third",  sizeof(textToSplit)/2);
        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);

        status = U_ZERO_ERROR;
        re = uregex_openC(":", 0, NULL, &status);
+
+
+        /*  Simple split */ 
+
        uregex_setText(re, textToSplit, -1, &status);
        TEST_ASSERT_SUCCESS(status);

+        memset(fields, -1, sizeof(fields));
        numFields = 
            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
        TEST_ASSERT_SUCCESS(status);
@ -683,7 +735,163 @@ void TestRegexCAPI(void) {
                      numFields;          /* Each field gets a NUL terminator */ 

        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+    
+        /*  Split with too few output strings available */
+        status = U_ZERO_ERROR;
+        re = uregex_openC(":", 0, NULL, &status);
+        uregex_setText(re, textToSplit, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 2);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING(" second:  third", fields[1], TRUE);
+        TEST_ASSERT(fields[2] == (UChar *)-1);
+
+        spaceNeeded = u_strlen(textToSplit) -
+                      (numFields - 1)  +  /* Field delimiters do not appear in output */
+                      numFields;          /* Each field gets a NUL terminator */ 
+
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+        /* Split with a range of output buffer sizes.  */
+        spaceNeeded = u_strlen(textToSplit) -
+            (numFields - 1)  +  /* Field delimiters do not appear in output */
+            numFields;          /* Each field gets a NUL terminator */ 
+                
+        for (sz=0; sz < spaceNeeded+1; sz++) {
+            memset(fields, -1, sizeof(fields));
+            status = U_ZERO_ERROR;
+            numFields = 
+                uregex_split(re, buf, sz, &requiredCapacity, fields, 10, &status);
+            if (sz >= spaceNeeded) {
+                TEST_ASSERT_SUCCESS(status);
+                TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+                TEST_ASSERT_STRING(" second", fields[1], TRUE);
+                TEST_ASSERT_STRING("  third", fields[2], TRUE);
+            } else {
+                TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
+            }
+            TEST_ASSERT(numFields == 3);
+            TEST_ASSERT(fields[3] == NULL);
+            TEST_ASSERT(spaceNeeded == requiredCapacity);
+        }
+    uregex_close(re);
    }
+
+
+
+
+    /* Split(), part 2.  Patterns with capture groups.  The capture group text
+     *                   comes out as additional fields.  */
+    {
+        UChar    textToSplit[80];
+        UChar    buf[200];
+        UChar    *fields[10];
+        int32_t  numFields;
+        int32_t  requiredCapacity;
+        int32_t  spaceNeeded;
+        int32_t  sz;
+
+        u_uastrncpy(textToSplit, "first <tag-a> second<tag-b>  third",  sizeof(textToSplit)/2);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC("<(.*?)>", 0, NULL, &status);
+
+        uregex_setText(re, textToSplit, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 5);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING("tag-a",   fields[1], TRUE);
+        TEST_ASSERT_STRING(" second", fields[2], TRUE);
+        TEST_ASSERT_STRING("tag-b",   fields[3], TRUE);
+        TEST_ASSERT_STRING("  third", fields[4], TRUE);
+        TEST_ASSERT(fields[5] == NULL);
+        spaceNeeded = strlen("first .tag-a. second.tag-b.  third.");  // "." at NUL positions
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+    
+        /*  Split with too few output strings available (2) */
+        status = U_ZERO_ERROR;
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 2);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING(" second<tag-b>  third", fields[1], TRUE);
+        TEST_ASSERT(fields[2] == (UChar *)-1);
+
+        spaceNeeded = strlen("first . second<tag-b>  third.");  // "." at NUL positions
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+        /*  Split with too few output strings available (3) */
+        status = U_ZERO_ERROR;
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 3, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 3);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING("tag-a",   fields[1], TRUE);
+        TEST_ASSERT_STRING(" second<tag-b>  third", fields[2], TRUE);
+        TEST_ASSERT(fields[3] == (UChar *)-1);
+
+        spaceNeeded = strlen("first .tag-a. second<tag-b>  third.");  // "." at NUL positions
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+        /*  Split with just enough output strings available (5) */
+        status = U_ZERO_ERROR;
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 5, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 5);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING("tag-a",   fields[1], TRUE);
+        TEST_ASSERT_STRING(" second", fields[2], TRUE);
+        TEST_ASSERT_STRING("tag-b",   fields[3], TRUE);
+        TEST_ASSERT_STRING("  third", fields[4], TRUE);
+        TEST_ASSERT(fields[5] == (UChar *)-1);
+
+        spaceNeeded = strlen("first .tag-a. second.tag-b.  third.");  // "." at NUL positions
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+
+        /* Split, end of text is a field delimiter.   */
+        status = U_ZERO_ERROR;
+        sz = strlen("first <tag-a> second<tag-b>");
+        uregex_setText(re, textToSplit, sz, &status);
+        TEST_ASSERT_SUCCESS(status);
+        memset(fields, -1, sizeof(fields));
+        numFields = 
+            uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 9, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(numFields == 4);
+        TEST_ASSERT_STRING("first ",  fields[0], TRUE);
+        TEST_ASSERT_STRING("tag-a",   fields[1], TRUE);
+        TEST_ASSERT_STRING(" second", fields[2], TRUE);
+        TEST_ASSERT_STRING("tag-b",   fields[3], TRUE);
+        TEST_ASSERT(fields[4] == NULL);
+        TEST_ASSERT(fields[8] == NULL);
+        TEST_ASSERT(fields[9] == (UChar *)-1);
+        spaceNeeded = strlen("first .tag-a. second.tag-b.");  // "." at NUL positions
+        TEST_ASSERT(spaceNeeded == requiredCapacity);
+
+        uregex_close(re);
+    }
+
+
+
 }

 #endif   /*  !UCONFIG_NO_REGULAR_EXPRESSIONS */