ICU-2421 C API for regular expressions
X-SVN-Rev: 14768
This commit is contained in:
parent
1745685d1e
commit
3854e7ced0
@ -563,8 +563,9 @@ uregex_appendTail(URegularExpression *regexp,
|
||||
* set to NULL.
|
||||
* @param destCapacity The capacity of the destBuf.
|
||||
* @param requiredCapacity The actual capacity required of the destBuf.
|
||||
* If destCapacity is too small, requiredCapacity is the
|
||||
* total capacity required to hold all of the output.
|
||||
* If destCapacity is too small, requiredCapacity will return
|
||||
* the total capacity required to hold all of the output, and
|
||||
* a U_BUFFER_OVERFLOW_ERROR will be returned.
|
||||
* @param destFields An array to be filled with the position of each
|
||||
* of the extracted fields within destBuf.
|
||||
* @param destFieldsCapacity The number of elements in the destFields array.
|
||||
@ -574,7 +575,9 @@ uregex_appendTail(URegularExpression *regexp,
|
||||
* input, including any field delimiters, is treated as if it
|
||||
* were the last field - it is copied to the destBuf, and
|
||||
* its position in the destBuf is stored in the last element
|
||||
* of destFields.
|
||||
* of destFields. This behavior mimics that of Perl. It is not
|
||||
* an error condition, and no error status is returned when all destField
|
||||
* positions are used.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
* @draft ICU 3.0
|
||||
|
@ -56,7 +56,7 @@ URegularExpression::URegularExpression() {
|
||||
// validateRE Do boilerplate style checks on API function parameters.
|
||||
// Return TRUE if they look OK.
|
||||
//----------------------------------------------------------------------------------------
|
||||
static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
|
||||
static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
|
||||
if (U_FAILURE(*status)) {
|
||||
return FALSE;
|
||||
}
|
||||
@ -65,6 +65,10 @@ static UBool validateRE(const URegularExpression *re, UErrorCode *status) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
if (requiresText && re->fText == NULL) {
|
||||
*status = U_REGEX_INVALID_STATE;
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -182,7 +186,7 @@ uregex_openC( const char *pattern,
|
||||
U_CAPI void U_EXPORT2
|
||||
uregex_close(URegularExpression *re) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if (validateRE(re, &status) == FALSE) {
|
||||
if (validateRE(re, &status, FALSE) == FALSE) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -205,7 +209,7 @@ uregex_close(URegularExpression *re) {
|
||||
//----------------------------------------------------------------------------------------
|
||||
U_CAPI URegularExpression * U_EXPORT2
|
||||
uregex_clone(const URegularExpression *source, UErrorCode *status) {
|
||||
if (validateRE(source, status) == FALSE) {
|
||||
if (validateRE(source, status, FALSE) == FALSE) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -248,7 +252,7 @@ uregex_pattern(const URegularExpression *regexp,
|
||||
int32_t *patLength,
|
||||
UErrorCode *status) {
|
||||
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return NULL;
|
||||
}
|
||||
if (patLength != NULL) {
|
||||
@ -265,7 +269,7 @@ uregex_pattern(const URegularExpression *regexp,
|
||||
//----------------------------------------------------------------------------------------
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
int32_t flags = regexp->fPat->flags();
|
||||
@ -283,7 +287,7 @@ uregex_setText(URegularExpression *regexp,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
UErrorCode *status) {
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return;
|
||||
}
|
||||
if (text == NULL || textLength < -1) {
|
||||
@ -309,7 +313,7 @@ U_CAPI const UChar * U_EXPORT2
|
||||
uregex_getText(URegularExpression *regexp,
|
||||
int32_t *textLength,
|
||||
UErrorCode *status) {
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return NULL;
|
||||
}
|
||||
if (textLength != NULL) {
|
||||
@ -394,7 +398,7 @@ uregex_findNext(URegularExpression *regexp,
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_groupCount(URegularExpression *regexp,
|
||||
UErrorCode *status) {
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
int32_t result = regexp->fMatcher->groupCount();
|
||||
@ -718,12 +722,26 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
||||
replacementText);
|
||||
|
||||
if (escapedChar != (UChar32)0xFFFFFFFF) {
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = c;
|
||||
capacityRemaining--;
|
||||
if (escapedChar <= 0xffff) {
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = (UChar)escapedChar;
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
} else {
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = U16_LEAD(escapedChar);
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = U16_TRAIL(escapedChar);
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
}
|
||||
resultLen++;
|
||||
continue;
|
||||
|
||||
continue;
|
||||
}
|
||||
// Note: if the \u escape was invalid, just fall through and
|
||||
// treat it as a plain \<anything> escape.
|
||||
@ -975,8 +993,7 @@ static void copyString(UChar *destBuffer, // Destination buffer.
|
||||
int32_t di = *destIndex;
|
||||
UChar c;
|
||||
|
||||
for (si=0; si<srcLen;
|
||||
si++) {
|
||||
for (si=0; si<srcLen; si++) {
|
||||
c = srcPtr[si];
|
||||
if (di < destCapacity) {
|
||||
destBuffer[di] = c;
|
||||
@ -1040,23 +1057,31 @@ uregex_split( URegularExpression *regexp,
|
||||
// capture groups of the delimiter expression, in which case we will discard the
|
||||
// last capture group saved in favor of the unprocessed remainder of the
|
||||
// input string.)
|
||||
i = destFieldsCapacity-1;
|
||||
int32_t remainingLength = inputLen-nextOutputStringStart;
|
||||
if (remainingLength > 0) {
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx, regexp->fText, remainingLength);
|
||||
}
|
||||
if (i >= destFieldsCapacity) {
|
||||
// No fields are left. Recycle the last one for holding the trailing part of
|
||||
// the input string.
|
||||
i = destFieldsCapacity-1;
|
||||
destIdx = destFields[i] - destFields[0];
|
||||
}
|
||||
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx,
|
||||
&regexp->fText[nextOutputStringStart], remainingLength);
|
||||
break;
|
||||
}
|
||||
|
||||
if (regexp->fMatcher->find()) {
|
||||
// We found another delimiter. Move everything from where we started looking
|
||||
// up until the start of the delimiter into the next output string.
|
||||
int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx,
|
||||
&regexp->fText[nextOutputStringStart], fieldLen);
|
||||
&regexp->fText[nextOutputStringStart], fieldLen);
|
||||
nextOutputStringStart = regexp->fMatcher->end(*status);
|
||||
|
||||
|
||||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
int32_t groupNum;
|
||||
@ -1066,7 +1091,7 @@ uregex_split( URegularExpression *regexp,
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
|
||||
|
||||
// Set up to extract the capture group contents into the dest buffer.
|
||||
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
|
||||
// error while extracting this group.
|
||||
@ -1106,6 +1131,9 @@ uregex_split( URegularExpression *regexp,
|
||||
if (requiredCapacity != NULL) {
|
||||
*requiredCapacity = destIdx;
|
||||
}
|
||||
if (*requiredCapacity > destCapacity) {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return i+1;
|
||||
}
|
||||
|
||||
|
@ -228,12 +228,20 @@ void TestRegexCAPI(void) {
|
||||
UChar text1[50];
|
||||
UChar text2[50];
|
||||
UBool result;
|
||||
|
||||
u_uastrncpy(text1, "abcccd", sizeof(text1)/2);
|
||||
u_uastrncpy(text2, "abcccxd", sizeof(text2)/2);
|
||||
status = U_ZERO_ERROR;
|
||||
u_uastrncpy(pat, "abc*d", sizeof(pat)/2);
|
||||
re = uregex_open(pat, -1, 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* Operation before doing a setText should fail... */
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT( status== U_REGEX_INVALID_STATE);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
result = uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
@ -644,9 +652,47 @@ void TestRegexCAPI(void) {
|
||||
/*
|
||||
* appendReplacement()
|
||||
*/
|
||||
{
|
||||
UChar text[100];
|
||||
UChar repl[100];
|
||||
UChar buf[100];
|
||||
UChar *bufPtr;
|
||||
int32_t bufCap;
|
||||
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(".*", 0, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
u_uastrncpy(text, "whatever", sizeof(text)/2);
|
||||
u_uastrncpy(repl, "some other", sizeof(repl)/2);
|
||||
uregex_setText(re, text, -1, &status);
|
||||
|
||||
/* match covers whole target string */
|
||||
uregex_find(re, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = sizeof(buf) / 2;
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("some other", buf, TRUE);
|
||||
|
||||
/* Match has \u \U escapes */
|
||||
uregex_find(re, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = sizeof(buf) / 2;
|
||||
u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2);
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
/* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE); TODO: */
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* appendTail()
|
||||
* appendTail(). Checked in ReplaceFirst(), replaceAll().
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -660,15 +706,21 @@ void TestRegexCAPI(void) {
|
||||
int32_t numFields;
|
||||
int32_t requiredCapacity;
|
||||
int32_t spaceNeeded;
|
||||
int32_t sz;
|
||||
|
||||
u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2);
|
||||
u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(":", 0, NULL, &status);
|
||||
|
||||
|
||||
/* Simple split */
|
||||
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
@ -683,7 +735,163 @@ void TestRegexCAPI(void) {
|
||||
numFields; /* Each field gets a NUL terminator */
|
||||
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
|
||||
/* Split with too few output strings available */
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(":", 0, NULL, &status);
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 2);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING(" second: third", fields[1], TRUE);
|
||||
TEST_ASSERT(fields[2] == (UChar *)-1);
|
||||
|
||||
spaceNeeded = u_strlen(textToSplit) -
|
||||
(numFields - 1) + /* Field delimiters do not appear in output */
|
||||
numFields; /* Each field gets a NUL terminator */
|
||||
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
/* Split with a range of output buffer sizes. */
|
||||
spaceNeeded = u_strlen(textToSplit) -
|
||||
(numFields - 1) + /* Field delimiters do not appear in output */
|
||||
numFields; /* Each field gets a NUL terminator */
|
||||
|
||||
for (sz=0; sz < spaceNeeded+1; sz++) {
|
||||
memset(fields, -1, sizeof(fields));
|
||||
status = U_ZERO_ERROR;
|
||||
numFields =
|
||||
uregex_split(re, buf, sz, &requiredCapacity, fields, 10, &status);
|
||||
if (sz >= spaceNeeded) {
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING(" second", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" third", fields[2], TRUE);
|
||||
} else {
|
||||
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
TEST_ASSERT(numFields == 3);
|
||||
TEST_ASSERT(fields[3] == NULL);
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
}
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Split(), part 2. Patterns with capture groups. The capture group text
|
||||
* comes out as additional fields. */
|
||||
{
|
||||
UChar textToSplit[80];
|
||||
UChar buf[200];
|
||||
UChar *fields[10];
|
||||
int32_t numFields;
|
||||
int32_t requiredCapacity;
|
||||
int32_t spaceNeeded;
|
||||
int32_t sz;
|
||||
|
||||
u_uastrncpy(textToSplit, "first <tag-a> second<tag-b> third", sizeof(textToSplit)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC("<(.*?)>", 0, NULL, &status);
|
||||
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 10, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" second", fields[2], TRUE);
|
||||
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
|
||||
TEST_ASSERT_STRING(" third", fields[4], TRUE);
|
||||
TEST_ASSERT(fields[5] == NULL);
|
||||
spaceNeeded = strlen("first .tag-a. second.tag-b. third."); // "." at NUL positions
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
|
||||
/* Split with too few output strings available (2) */
|
||||
status = U_ZERO_ERROR;
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 2);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING(" second<tag-b> third", fields[1], TRUE);
|
||||
TEST_ASSERT(fields[2] == (UChar *)-1);
|
||||
|
||||
spaceNeeded = strlen("first . second<tag-b> third."); // "." at NUL positions
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
/* Split with too few output strings available (3) */
|
||||
status = U_ZERO_ERROR;
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 3, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 3);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" second<tag-b> third", fields[2], TRUE);
|
||||
TEST_ASSERT(fields[3] == (UChar *)-1);
|
||||
|
||||
spaceNeeded = strlen("first .tag-a. second<tag-b> third."); // "." at NUL positions
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
/* Split with just enough output strings available (5) */
|
||||
status = U_ZERO_ERROR;
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 5, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" second", fields[2], TRUE);
|
||||
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
|
||||
TEST_ASSERT_STRING(" third", fields[4], TRUE);
|
||||
TEST_ASSERT(fields[5] == (UChar *)-1);
|
||||
|
||||
spaceNeeded = strlen("first .tag-a. second.tag-b. third."); // "." at NUL positions
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
|
||||
/* Split, end of text is a field delimiter. */
|
||||
status = U_ZERO_ERROR;
|
||||
sz = strlen("first <tag-a> second<tag-b>");
|
||||
uregex_setText(re, textToSplit, sz, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
memset(fields, -1, sizeof(fields));
|
||||
numFields =
|
||||
uregex_split(re, buf, sizeof(buf)/2, &requiredCapacity, fields, 9, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(numFields == 4);
|
||||
TEST_ASSERT_STRING("first ", fields[0], TRUE);
|
||||
TEST_ASSERT_STRING("tag-a", fields[1], TRUE);
|
||||
TEST_ASSERT_STRING(" second", fields[2], TRUE);
|
||||
TEST_ASSERT_STRING("tag-b", fields[3], TRUE);
|
||||
TEST_ASSERT(fields[4] == NULL);
|
||||
TEST_ASSERT(fields[8] == NULL);
|
||||
TEST_ASSERT(fields[9] == (UChar *)-1);
|
||||
spaceNeeded = strlen("first .tag-a. second.tag-b."); // "." at NUL positions
|
||||
TEST_ASSERT(spaceNeeded == requiredCapacity);
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
Loading…
Reference in New Issue
Block a user