ICU-2421 C API for regular expressions

X-SVN-Rev: 14768
This commit is contained in:
Andy Heninger 2004-03-26 01:23:01 +00:00
parent 1745685d1e
commit 3854e7ced0
3 changed files with 264 additions and 25 deletions

View File

@ -563,8 +563,9 @@ uregex_appendTail(URegularExpression *regexp,
* set to NULL.
* @param destCapacity The capacity of the destBuf.
* @param requiredCapacity The actual capacity required of the destBuf.
* If destCapacity is too small, requiredCapacity is the
* total capacity required to hold all of the output.
* If destCapacity is too small, requiredCapacity will be set to
* the total capacity required to hold all of the output, and
* status will be set to U_BUFFER_OVERFLOW_ERROR.
* @param destFields An array to be filled with the position of each
* of the extracted fields within destBuf.
* @param destFieldsCapacity The number of elements in the destFields array.
@ -574,7 +575,9 @@ uregex_appendTail(URegularExpression *regexp,
* input, including any field delimiters, is treated as if it
* were the last field - it is copied to the destBuf, and
* its position in the destBuf is stored in the last element
* of destFields.
* of destFields. This behavior mimics that of Perl. It is not
* an error condition, and no error status is returned when all destField
* positions are used.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
* @draft ICU 3.0

View File

@ -56,7 +56,7 @@ URegularExpression::URegularExpression() {
// validateRE Do boilerplate style checks on API function parameters.
// Return TRUE if they look OK.
//----------------------------------------------------------------------------------------
static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
if (U_FAILURE(*status)) {
return FALSE;
}
@ -65,6 +65,10 @@ static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return FALSE;
}
if (requiresText && re->fText == NULL) {
*status = U_REGEX_INVALID_STATE;
return FALSE;
}
return TRUE;
}
@ -182,7 +186,7 @@ uregex_openC( const char *pattern,
U_CAPI void U_EXPORT2
uregex_close(URegularExpression *re) {
UErrorCode status = U_ZERO_ERROR;
if (validateRE(re, &status) == FALSE) {
if (validateRE(re, &status, FALSE) == FALSE) {
return;
}
@ -205,7 +209,7 @@ uregex_close(URegularExpression *re) {
//----------------------------------------------------------------------------------------
U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *source, UErrorCode *status) {
if (validateRE(source, status) == FALSE) {
if (validateRE(source, status, FALSE) == FALSE) {
return NULL;
}
@ -248,7 +252,7 @@ uregex_pattern(const URegularExpression *regexp,
int32_t *patLength,
UErrorCode *status) {
if (validateRE(regexp, status) == FALSE) {
if (validateRE(regexp, status, FALSE) == FALSE) {
return NULL;
}
if (patLength != NULL) {
@ -265,7 +269,7 @@ uregex_pattern(const URegularExpression *regexp,
//----------------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
if (validateRE(regexp, status) == FALSE) {
if (validateRE(regexp, status, FALSE) == FALSE) {
return 0;
}
int32_t flags = regexp->fPat->flags();
@ -283,7 +287,7 @@ uregex_setText(URegularExpression *regexp,
const UChar *text,
int32_t textLength,
UErrorCode *status) {
if (validateRE(regexp, status) == FALSE) {
if (validateRE(regexp, status, FALSE) == FALSE) {
return;
}
if (text == NULL || textLength < -1) {
@ -309,7 +313,7 @@ U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp,
int32_t *textLength,
UErrorCode *status) {
if (validateRE(regexp, status) == FALSE) {
if (validateRE(regexp, status, FALSE) == FALSE) {
return NULL;
}
if (textLength != NULL) {
@ -394,7 +398,7 @@ uregex_findNext(URegularExpression *regexp,
U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression *regexp,
UErrorCode *status) {
if (validateRE(regexp, status) == FALSE) {
if (validateRE(regexp, status, FALSE) == FALSE) {
return 0;
}
int32_t result = regexp->fMatcher->groupCount();
@ -718,12 +722,26 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
replacementText);
if (escapedChar != (UChar32)0xFFFFFFFF) {
if (capacityRemaining > 0) {
dest[resultLen] = c;
capacityRemaining--;
if (escapedChar <= 0xffff) {
if (capacityRemaining > 0) {
dest[resultLen] = (UChar)escapedChar;
capacityRemaining--;
}
resultLen++;
} else {
if (capacityRemaining > 0) {
dest[resultLen] = U16_LEAD(escapedChar);
capacityRemaining--;
}
resultLen++;
if (capacityRemaining > 0) {
dest[resultLen] = U16_TRAIL(escapedChar);
capacityRemaining--;
}
resultLen++;
}
resultLen++;
continue;
continue;
}
// Note: if the \u escape was invalid, just fall through and
// treat it as a plain \<anything> escape.
@ -975,8 +993,7 @@ static void copyString(UChar *destBuffer, // Destination buffer.
int32_t di = *destIndex;
UChar c;
for (si=0; si<srcLen;
si++) {
for (si=0; si<srcLen; si++) {
c = srcPtr[si];
if (di < destCapacity) {
destBuffer[di] = c;
@ -1040,23 +1057,31 @@ uregex_split( URegularExpression *regexp,
// capture groups of the delimiter expression, in which case we will discard the
// last capture group saved in favor of the unprocessed remainder of the
// input string.)
i = destFieldsCapacity-1;
int32_t remainingLength = inputLen-nextOutputStringStart;
if (remainingLength > 0) {
destFields[i] = &destBuf[destIdx];
copyString(destBuf, destCapacity, &destIdx, regexp->fText, remainingLength);
}
if (i >= destFieldsCapacity) {
// No fields are left. Recycle the last one for holding the trailing part of
// the input string.
i = destFieldsCapacity-1;
destIdx = destFields[i] - destFields[0];
}
destFields[i] = &destBuf[destIdx];
copyString(destBuf, destCapacity, &destIdx,
&regexp->fText[nextOutputStringStart], remainingLength);
break;
}
if (regexp->fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
destFields[i] = &destBuf[destIdx];
copyString(destBuf, destCapacity, &destIdx,
&regexp->fText[nextOutputStringStart], fieldLen);
&regexp->fText[nextOutputStringStart], fieldLen);
nextOutputStringStart = regexp->fMatcher->end(*status);
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
@ -1066,7 +1091,7 @@ uregex_split( URegularExpression *regexp,
break;
}
i++;
// Set up to extract the capture group contents into the dest buffer.
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
// error while extracting this group.
@ -1106,6 +1131,9 @@ uregex_split( URegularExpression *regexp,
// Report the total number of UChars the output needs. The requiredCapacity
// pointer is optional, so only write through it when the caller supplied one.
if (requiredCapacity != NULL) {
    *requiredCapacity = destIdx;
}
// Flag overflow by comparing destIdx directly. Dereferencing requiredCapacity
// here would crash when the caller passed NULL for it.
if (destIdx > destCapacity) {
    *status = U_BUFFER_OVERFLOW_ERROR;
}
return i+1;
}

View File

@ -228,12 +228,20 @@ void TestRegexCAPI(void) {
UChar text1[50];
UChar text2[50];
UBool result;
u_uastrncpy(text1, "abcccd", sizeof(text1)/2);
u_uastrncpy(text2, "abcccxd", sizeof(text2)/2);
status = U_ZERO_ERROR;
u_uastrncpy(pat, "abc*d", sizeof(pat)/2);
re = uregex_open(pat, -1, 0, NULL, &status);
TEST_ASSERT_SUCCESS(status);
/* Operation before doing a setText should fail... */
status = U_ZERO_ERROR;
uregex_lookingAt(re, 0, &status);
TEST_ASSERT( status== U_REGEX_INVALID_STATE);
status = U_ZERO_ERROR;
uregex_setText(re, text1, -1, &status);
result = uregex_lookingAt(re, 0, &status);
TEST_ASSERT(result == TRUE);
@ -644,9 +652,47 @@ void TestRegexCAPI(void) {
/*
* appendReplacement()
*/
{
UChar text[100];
UChar repl[100];
UChar buf[100];
UChar *bufPtr;
int32_t bufCap;
status = U_ZERO_ERROR;
re = uregex_openC(".*", 0, 0, &status);
TEST_ASSERT_SUCCESS(status);
u_uastrncpy(text, "whatever", sizeof(text)/2);
u_uastrncpy(repl, "some other", sizeof(repl)/2);
uregex_setText(re, text, -1, &status);
/* match covers whole target string */
uregex_find(re, 0, &status);
TEST_ASSERT_SUCCESS(status);
bufPtr = buf;
bufCap = sizeof(buf) / 2;
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_STRING("some other", buf, TRUE);
/* Match has \u \U escapes */
uregex_find(re, 0, &status);
TEST_ASSERT_SUCCESS(status);
bufPtr = buf;
bufCap = sizeof(buf) / 2;
u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2);
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
TEST_ASSERT_SUCCESS(status);
/* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE); TODO: */
}
/*
* appendTail()
* appendTail(). Checked in ReplaceFirst(), replaceAll().
*/
/*
@ -660,15 +706,21 @@ void TestRegexCAPI(void) {
int32_t numFields;
int32_t requiredCapacity;
int32_t spaceNeeded;
int32_t sz;
u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2);
u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
status = U_ZERO_ERROR;
re = uregex_openC(":", 0, NULL, &status);
/* Simple split */
uregex_setText(re, textToSplit, -1, &status);
TEST_ASSERT_SUCCESS(status);
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
TEST_ASSERT_SUCCESS(status);
@ -683,7 +735,163 @@ void TestRegexCAPI(void) {
numFields; /* Each field gets a NUL terminator */
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split with too few output strings available */
status = U_ZERO_ERROR;
re = uregex_openC(":", 0, NULL, &status);
uregex_setText(re, textToSplit, -1, &status);
TEST_ASSERT_SUCCESS(status);
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 2);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING(" second: third", fields[1], TRUE);
TEST_ASSERT(fields[2] == (UChar *)-1);
spaceNeeded = u_strlen(textToSplit) -
(numFields - 1) + /* Field delimiters do not appear in output */
numFields; /* Each field gets a NUL terminator */
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split with a range of output buffer sizes. */
spaceNeeded = u_strlen(textToSplit) -
(numFields - 1) + /* Field delimiters do not appear in output */
numFields; /* Each field gets a NUL terminator */
for (sz=0; sz < spaceNeeded+1; sz++) {
memset(fields, -1, sizeof(fields));
status = U_ZERO_ERROR;
numFields =
uregex_split(re, buf, sz, &requiredCapacity, fields, 10, &status);
if (sz >= spaceNeeded) {
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING(" second", fields[1], TRUE);
TEST_ASSERT_STRING(" third", fields[2], TRUE);
} else {
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
}
TEST_ASSERT(numFields == 3);
TEST_ASSERT(fields[3] == NULL);
TEST_ASSERT(spaceNeeded == requiredCapacity);
}
uregex_close(re);
}
/* Split(), part 2. Patterns with capture groups. The capture group text
* comes out as additional fields. */
{
UChar textToSplit[80];
UChar buf[200];
UChar *fields[10];
int32_t numFields;
int32_t requiredCapacity;
int32_t spaceNeeded;
int32_t sz;
u_uastrncpy(textToSplit, "first <tag-a> second<tag-b> third", sizeof(textToSplit)/2);
status = U_ZERO_ERROR;
re = uregex_openC("<(.*?)>", 0, NULL, &status);
uregex_setText(re, textToSplit, -1, &status);
TEST_ASSERT_SUCCESS(status);
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 5);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
TEST_ASSERT_STRING(" second", fields[2], TRUE);
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
TEST_ASSERT_STRING(" third", fields[4], TRUE);
TEST_ASSERT(fields[5] == NULL);
spaceNeeded = strlen("first .tag-a. second.tag-b. third."); // "." at NUL positions
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split with too few output strings available (2) */
status = U_ZERO_ERROR;
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 2);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING(" second<tag-b> third", fields[1], TRUE);
TEST_ASSERT(fields[2] == (UChar *)-1);
spaceNeeded = strlen("first . second<tag-b> third."); // "." at NUL positions
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split with too few output strings available (3) */
status = U_ZERO_ERROR;
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 3, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 3);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
TEST_ASSERT_STRING(" second<tag-b> third", fields[2], TRUE);
TEST_ASSERT(fields[3] == (UChar *)-1);
spaceNeeded = strlen("first .tag-a. second<tag-b> third."); // "." at NUL positions
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split with just enough output strings available (5) */
status = U_ZERO_ERROR;
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 5, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 5);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
TEST_ASSERT_STRING(" second", fields[2], TRUE);
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
TEST_ASSERT_STRING(" third", fields[4], TRUE);
TEST_ASSERT(fields[5] == (UChar *)-1);
spaceNeeded = strlen("first .tag-a. second.tag-b. third."); // "." at NUL positions
TEST_ASSERT(spaceNeeded == requiredCapacity);
/* Split, end of text is a field delimiter. */
status = U_ZERO_ERROR;
sz = strlen("first <tag-a> second<tag-b>");
uregex_setText(re, textToSplit, sz, &status);
TEST_ASSERT_SUCCESS(status);
memset(fields, -1, sizeof(fields));
numFields =
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 9, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(numFields == 4);
TEST_ASSERT_STRING("first ", fields[0], TRUE);
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
TEST_ASSERT_STRING(" second", fields[2], TRUE);
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
TEST_ASSERT(fields[4] == NULL);
TEST_ASSERT(fields[8] == NULL);
TEST_ASSERT(fields[9] == (UChar *)-1);
spaceNeeded = strlen("first .tag-a. second.tag-b."); // "." at NUL positions
TEST_ASSERT(spaceNeeded == requiredCapacity);
uregex_close(re);
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */