ICU-4521 UText-based regex implementation
X-SVN-Rev: 27482
This commit is contained in:
parent
7a93a3c3e2
commit
8216117f21
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -54,6 +54,8 @@ icu4c/source/data/in/nfkc.nrm -text
|
||||
icu4c/source/data/in/nfkc_cf.nrm -text
|
||||
icu4c/source/data/in/unorm.icu -text
|
||||
icu4c/source/data/locales/pool.res -text
|
||||
icu4c/source/i18n/regextxt.cpp -text
|
||||
icu4c/source/i18n/regextxt.h -text
|
||||
icu4c/source/samples/ucnv/data02.bin -text
|
||||
icu4c/source/test/perf/README -text
|
||||
icu4c/source/test/testdata/TestFont1.otf -text
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2004-2009, International Business Machines
|
||||
* Copyright (C) 2004-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -136,6 +136,7 @@
|
||||
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/rep.h"
|
||||
@ -674,6 +675,148 @@ utext_extract(UText *ut,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Compare two UTexts (binary order). The comparison begins at each source text's
|
||||
* iteration position. The iteration position of each UText will be left following
|
||||
* the last character compared.
|
||||
*
|
||||
* The comparison is done in code point order; unlike u_strCompare, you
|
||||
* cannot choose to use code unit order. This is because the characters
|
||||
* in a UText are accessed one code point at a time, and may not be from a UTF-16
|
||||
* context.
|
||||
*
|
||||
* This function works with strings of different explicitly specified lengths
|
||||
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
|
||||
* A length argument of -1 signifies that as much of the string should be used as
|
||||
* is necessary to compare with the other string. If both length arguments are -1,
|
||||
* the entire remaining portions of both strings are used.
|
||||
*
|
||||
* @param s1 First source string.
|
||||
* @param length1 Length of first source string in UTF-32 code points.
|
||||
*
|
||||
* @param s2 Second source string.
|
||||
* @param length2 Length of second source string in UTF-32 code points.
|
||||
*
|
||||
* @return <0 or 0 or >0 as usual for string comparisons
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL int32_t U_EXPORT2
|
||||
utext_compare(UText *s1, int32_t length1,
|
||||
UText *s2, int32_t length2);
|
||||
|
||||
/**
|
||||
* Compare two UTexts (binary order). The comparison begins at each source text's
|
||||
* iteration position. The iteration position of each UText will be left following
|
||||
* the last character compared. This method differs from utext_compare in that
|
||||
* it accepts native limits rather than lengths for each string.
|
||||
*
|
||||
* The comparison is done in code point order; unlike u_strCompare, you
|
||||
* cannot choose to use code unit order. This is because the characters
|
||||
* in a UText are accessed one code point at a time, and may not be from a UTF-16
|
||||
* context.
|
||||
*
|
||||
* This function works with strings of different explicitly specified lengths
|
||||
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
|
||||
* A limit argument of -1 signifies that as much of the string should be used as
|
||||
* is necessary to compare with the other string. If both limit arguments are -1,
|
||||
* the entire remaining portions of both strings are used.
|
||||
*
|
||||
* @param s1 First source string.
|
||||
* @param limit1 Native index just past the last character in the first source string to be considered.
|
||||
*
|
||||
* @param s2 Second source string.
|
||||
* @param limit2 Native index just past the last character in the second source string to be considered.
|
||||
*
|
||||
* @return <0 or 0 or >0 as usual for string comparisons
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL int32_t U_EXPORT2
|
||||
utext_compareNativeLimit(UText *s1, int64_t limit1,
|
||||
UText *s2, int64_t limit2);
|
||||
|
||||
/**
|
||||
* Compare two UTexts case-insensitively using full case folding. The comparison
|
||||
* begins at each source text's iteration position. The iteration position of each
|
||||
* UText will be left following the last character compared.
|
||||
*
|
||||
* The comparison is done in code point order; this is because the characters
|
||||
* in a UText are accessed one code point at a time, and may not be from a UTF-16
|
||||
* context.
|
||||
*
|
||||
* This function works with strings of different explicitly specified lengths
|
||||
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
|
||||
* A length argument of -1 signifies that as much of the string should be used as
|
||||
* is necessary to compare with the other string. If both length arguments are -1,
|
||||
* the entire remaining portions of both strings are used.
|
||||
*
|
||||
* @param s1 First source string.
|
||||
* @param length1 Length of first source string in UTF-32 code points.
|
||||
*
|
||||
* @param s2 Second source string.
|
||||
* @param length2 Length of second source string in UTF-32 code points.
|
||||
*
|
||||
* @param options A bit set of options:
|
||||
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
|
||||
* Comparison in code point order with default case folding.
|
||||
*
|
||||
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
*
|
||||
* @param pErrorCode Must be a valid pointer to an error code value,
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @return <0 or 0 or >0 as usual for string comparisons
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL int32_t U_EXPORT2
|
||||
utext_caseCompare(UText *s1, int32_t length1,
|
||||
UText *s2, int32_t length2,
|
||||
uint32_t options, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Compare two UTexts case-insensitively using full case folding. The comparison
|
||||
* begins at each source text's iteration position. The iteration position of each
|
||||
* UText will be left following the last character compared. This method differs from
|
||||
* utext_caseCompare in that it accepts native limits rather than lengths for each
|
||||
* string.
|
||||
*
|
||||
* The comparison is done in code point order; this is because the characters
|
||||
* in a UText are accessed one code point at a time, and may not be from a UTF-16
|
||||
* context.
|
||||
*
|
||||
* This function works with strings of different explicitly specified lengths
|
||||
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
|
||||
* A limit argument of -1 signifies that as much of the string should be used as
|
||||
* is necessary to compare with the other string. If both limit arguments are -1,
|
||||
* the entire remaining portions of both strings are used.
|
||||
*
|
||||
* @param s1 First source string.
|
||||
* @param limit1 Native index just past the last character in the first source string to be considered.
|
||||
*
|
||||
* @param s2 Second source string.
|
||||
* @param limit2 Native index just past the last character in the second source string to be considered.
|
||||
*
|
||||
* @param options A bit set of options:
|
||||
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
|
||||
* Comparison in code point order with default case folding.
|
||||
*
|
||||
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
*
|
||||
* @param pErrorCode Must be a valid pointer to an error code value,
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @return <0 or 0 or >0 as usual for string comparisons
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL int32_t U_EXPORT2
|
||||
utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
|
||||
UText *s2, int64_t limit2,
|
||||
uint32_t options, UErrorCode *pErrorCode);
|
||||
|
||||
|
||||
/************************************************************************************
|
||||
*
|
||||
* #define inline versions of selected performance-critical text access functions
|
||||
@ -689,6 +832,19 @@ utext_extract(UText *ut,
|
||||
*
|
||||
************************************************************************************/
|
||||
|
||||
/**
|
||||
* inline version of utext_current32(), for performance-critical situations.
|
||||
*
|
||||
* Get the code point at the current iteration position of the UText.
|
||||
* Returns U_SENTINEL (-1) if the position is at the end of the
|
||||
* text.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
#define UTEXT_CURRENT32(ut) \
|
||||
((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \
|
||||
((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut))
|
||||
|
||||
/**
|
||||
* inline version of utext_next32(), for performance-critical situations.
|
||||
*
|
||||
@ -1291,8 +1447,8 @@ struct UTextFuncs {
|
||||
* (private) Spare function pointer
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UTextClose *spare1;
|
||||
|
||||
/**
|
||||
* (private) Spare function pointer
|
||||
* @internal
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Copyright (C) 2005-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -23,6 +23,7 @@
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
#include "putilimp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
@ -450,6 +451,361 @@ utext_equals(const UText *a, const UText *b) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utext_compare(UText *s1, int32_t length1,
|
||||
UText *s2, int32_t length2) {
|
||||
UChar32 c1, c2;
|
||||
|
||||
if(length1<0 && length2<0) {
|
||||
/* strcmp style, go until end of string */
|
||||
for(;;) {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
if(c1 != c2) {
|
||||
break;
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(length1 < 0) {
|
||||
length1 = INT32_MIN;
|
||||
} else if (length2 < 0) {
|
||||
length2 = INT32_MIN;
|
||||
}
|
||||
|
||||
/* memcmp/UnicodeString style, both length-specified */
|
||||
while((length1 > 0 || length1 == INT32_MIN) && (length2 > 0 || length2 == INT32_MIN)) {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
|
||||
if(c1 != c2) {
|
||||
break;
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (length1 != INT32_MIN) {
|
||||
length1 -= 1;
|
||||
}
|
||||
if (length2 != INT32_MIN) {
|
||||
length2 -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(length1 <= 0 && length1 != INT32_MIN) {
|
||||
if(length2 <= 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else if(length2 <= 0 && length2 != INT32_MIN) {
|
||||
if (length1 <= 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utext_compareNativeLimit(UText *s1, int64_t limit1,
|
||||
UText *s2, int64_t limit2) {
|
||||
UChar32 c1, c2;
|
||||
|
||||
if(limit1<0 && limit2<0) {
|
||||
/* strcmp style, go until end of string */
|
||||
for(;;) {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
if(c1 != c2) {
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* memcmp/UnicodeString style, both length-specified */
|
||||
int64_t index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
|
||||
int64_t index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
|
||||
|
||||
while((limit1 < 0 || index1 < limit1) && (limit2 < 0 || index2 < limit2)) {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
|
||||
if(c1 != c2) {
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (limit1 >= 0) {
|
||||
index1 = UTEXT_GETNATIVEINDEX(s1);
|
||||
}
|
||||
if (limit2 >= 0) {
|
||||
index2 = UTEXT_GETNATIVEINDEX(s2);
|
||||
}
|
||||
}
|
||||
|
||||
if(limit1 >= 0 && index1 >= limit1) {
|
||||
if(index2 >= limit2) {
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
if(index1 >= limit1) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utext_caseCompare(UText *s1, int32_t length1,
|
||||
UText *s2, int32_t length2,
|
||||
uint32_t options, UErrorCode *pErrorCode) {
|
||||
const UCaseProps *csp;
|
||||
|
||||
/* case folding variables */
|
||||
const UChar *p;
|
||||
int32_t length;
|
||||
|
||||
/* case folding buffers, only use current-level start/limit */
|
||||
UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
|
||||
int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
|
||||
|
||||
/* current code points */
|
||||
UChar32 c1, c2;
|
||||
uint8_t cLength1, cLength2;
|
||||
|
||||
/* argument checking */
|
||||
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s1==NULL || s2==NULL) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
csp=ucase_getSingleton(pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* for variable-length strings */
|
||||
if(length1 < 0) {
|
||||
length1 = INT32_MIN;
|
||||
}
|
||||
if (length2 < 0) {
|
||||
length2 = INT32_MIN;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
|
||||
|
||||
/* comparison loop */
|
||||
while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) &&
|
||||
(foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) {
|
||||
if(foldOffset1 < foldLength1) {
|
||||
U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
|
||||
cLength1 = 0;
|
||||
} else {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
if (c1 != U_SENTINEL) {
|
||||
cLength1 = U16_LENGTH(c1);
|
||||
|
||||
length = ucase_toFullFolding(csp, c1, &p, options);
|
||||
if(length >= 0) {
|
||||
if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
|
||||
u_memcpy(fold1, p, length);
|
||||
foldOffset1 = 0;
|
||||
foldLength1 = length;
|
||||
U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
|
||||
} else {
|
||||
c1 = length;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(length1 != INT32_MIN) {
|
||||
length1 -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(foldOffset2 < foldLength2) {
|
||||
U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
|
||||
cLength2 = 0;
|
||||
} else {
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
if (c2 != U_SENTINEL) {
|
||||
cLength2 = U16_LENGTH(c2);
|
||||
|
||||
length = ucase_toFullFolding(csp, c2, &p, options);
|
||||
if(length >= 0) {
|
||||
if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
|
||||
u_memcpy(fold2, p, length);
|
||||
foldOffset2 = 0;
|
||||
foldLength2 = length;
|
||||
U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
|
||||
} else {
|
||||
c2 = length;
|
||||
}
|
||||
}
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0; // end of both strings at once
|
||||
}
|
||||
|
||||
if(length2 != INT32_MIN) {
|
||||
length2 -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(c1 != c2) {
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
}
|
||||
}
|
||||
|
||||
/* By now at least one of the strings is out of characters */
|
||||
length1 += foldLength1 - foldOffset1;
|
||||
length2 += foldLength2 - foldOffset2;
|
||||
|
||||
if(length1 <= 0 && length1 != INT32_MIN) {
|
||||
if(length2 <= 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
if (length1 <= 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
|
||||
UText *s2, int64_t limit2,
|
||||
uint32_t options, UErrorCode *pErrorCode) {
|
||||
const UCaseProps *csp;
|
||||
|
||||
/* case folding variables */
|
||||
const UChar *p;
|
||||
int32_t length;
|
||||
|
||||
/* case folding buffers, only use current-level start/limit */
|
||||
UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
|
||||
int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
|
||||
|
||||
/* current code points */
|
||||
UChar32 c1, c2;
|
||||
|
||||
/* native indexes into s1 and s2 */
|
||||
int64_t index1, index2;
|
||||
|
||||
/* argument checking */
|
||||
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s1==NULL || s2==NULL) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
csp=ucase_getSingleton(pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
|
||||
index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
|
||||
|
||||
foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
|
||||
|
||||
/* comparison loop */
|
||||
while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) &&
|
||||
(foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) {
|
||||
if(foldOffset1 < foldLength1) {
|
||||
U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
|
||||
} else {
|
||||
c1 = UTEXT_NEXT32(s1);
|
||||
if (c1 != U_SENTINEL) {
|
||||
length = ucase_toFullFolding(csp, c1, &p, options);
|
||||
if(length >= 0) {
|
||||
if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
|
||||
u_memcpy(fold1, p, length);
|
||||
foldOffset1 = 0;
|
||||
foldLength1 = length;
|
||||
U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
|
||||
} else {
|
||||
c1 = length;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (limit1 >= 0) {
|
||||
index1 = UTEXT_GETNATIVEINDEX(s1);
|
||||
}
|
||||
}
|
||||
|
||||
if(foldOffset2 < foldLength2) {
|
||||
U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
|
||||
} else {
|
||||
c2 = UTEXT_NEXT32(s2);
|
||||
if (c2 != U_SENTINEL) {
|
||||
length = ucase_toFullFolding(csp, c2, &p, options);
|
||||
if(length >= 0) {
|
||||
if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
|
||||
u_memcpy(fold2, p, length);
|
||||
foldOffset2 = 0;
|
||||
foldLength2 = length;
|
||||
U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
|
||||
} else {
|
||||
c2 = length;
|
||||
}
|
||||
}
|
||||
} else if(c1 == U_SENTINEL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (limit2 >= 0) {
|
||||
index2 = UTEXT_GETNATIVEINDEX(s2);
|
||||
}
|
||||
}
|
||||
|
||||
if(c1 != c2) {
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
}
|
||||
}
|
||||
|
||||
/* By now at least one of the strings is out of characters */
|
||||
index1 -= foldLength1 - foldOffset1;
|
||||
index2 -= foldLength2 - foldOffset2;
|
||||
|
||||
if(limit1 >= 0 && index1 >= limit1) {
|
||||
if(index2 >= limit2) {
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
if(index1 >= limit1) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
utext_isWritable(const UText *ut)
|
||||
{
|
||||
@ -800,7 +1156,7 @@ shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
|
||||
adjustPointer(dest, &dest->p, src);
|
||||
adjustPointer(dest, &dest->q, src);
|
||||
adjustPointer(dest, &dest->r, src);
|
||||
adjustPointer(dest, (const void **)&dest->chunkContents, src);
|
||||
adjustPointer(dest, (const void **)&dest->chunkContents, src);
|
||||
|
||||
return dest;
|
||||
}
|
||||
@ -932,7 +1288,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) {
|
||||
if (ix>length) {
|
||||
if (length>=0) {
|
||||
ix=length;
|
||||
} else if (ix>ut->c) {
|
||||
} else if (ix>=ut->c) {
|
||||
// Zero terminated string, and requested index is beyond
|
||||
// the region that has already been scanned.
|
||||
// Scan up to either the end of the string or to the
|
||||
@ -1415,7 +1771,7 @@ utext_strFromUTF8(UChar *dest,
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(ch<=0xFFFF){
|
||||
if(U_IS_BMP(ch)){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=UTF16_LEAD(ch);
|
||||
@ -1438,7 +1794,7 @@ utext_strFromUTF8(UChar *dest,
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
reqLength+=UTF_CHAR_LENGTH(ch);
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1485,7 +1841,7 @@ utf8TextExtract(UText *ut,
|
||||
int i;
|
||||
if (start32 < ut->chunkNativeLimit) {
|
||||
for (i=0; i<3; i++) {
|
||||
if (U8_IS_LEAD(buf[start32]) || start32==0) {
|
||||
if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
|
||||
break;
|
||||
}
|
||||
start32--;
|
||||
@ -1494,7 +1850,7 @@ utf8TextExtract(UText *ut,
|
||||
|
||||
if (limit32 < ut->chunkNativeLimit) {
|
||||
for (i=0; i<3; i++) {
|
||||
if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
|
||||
if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
|
||||
break;
|
||||
}
|
||||
limit32--;
|
||||
@ -1506,6 +1862,7 @@ utf8TextExtract(UText *ut,
|
||||
utext_strFromUTF8(dest, destCapacity, &destLength,
|
||||
(const char *)ut->context+start32, limit32-start32,
|
||||
pErrorCode);
|
||||
utf8TextAccess(ut, limit32, TRUE);
|
||||
return destLength;
|
||||
}
|
||||
|
||||
@ -1870,6 +2227,8 @@ repTextExtract(UText *ut,
|
||||
}
|
||||
UnicodeString buffer(dest, 0, destCapacity); // writable alias
|
||||
rep->extractBetween(start32, limit32, buffer);
|
||||
repTextAccess(ut, limit32, TRUE);
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, length, status);
|
||||
}
|
||||
|
||||
@ -2138,6 +2497,9 @@ unistrTextExtract(UText *t,
|
||||
trimmedLength=destCapacity;
|
||||
}
|
||||
us->extract(start32, trimmedLength, dest);
|
||||
t->chunkOffset = start32+trimmedLength;
|
||||
} else {
|
||||
t->chunkOffset = start32;
|
||||
}
|
||||
u_terminateUChars(dest, destCapacity, length, pErrorCode);
|
||||
return length;
|
||||
@ -2528,7 +2890,7 @@ ucstrTextExtract(UText *ut,
|
||||
if (strLength>=0) {
|
||||
// We have filled the destination buffer, and the string length is known.
|
||||
// Cut the loop short. There is no need to scan string termination.
|
||||
di = strLength;
|
||||
di = limit32 - start32;
|
||||
si = limit32;
|
||||
break;
|
||||
}
|
||||
@ -2548,7 +2910,7 @@ ucstrTextExtract(UText *ut,
|
||||
}
|
||||
|
||||
// Put iteration position at the point just following the extracted text
|
||||
ut->chunkOffset = si;
|
||||
ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
|
||||
|
||||
// Add a terminating NUL if space in the buffer permits,
|
||||
// and set the error status as required.
|
||||
@ -2754,15 +3116,18 @@ charIterTextExtract(UText *ut,
|
||||
int32_t limit32 = pinIndex(limit, length);
|
||||
int32_t desti = 0;
|
||||
int32_t srci;
|
||||
int32_t copyLimit;
|
||||
|
||||
CharacterIterator *ci = (CharacterIterator *)ut->context;
|
||||
ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
|
||||
srci = ci->getIndex();
|
||||
copyLimit = srci;
|
||||
while (srci<limit32) {
|
||||
UChar32 c = ci->next32PostInc();
|
||||
int32_t len = U16_LENGTH(c);
|
||||
if (desti+len <= destCapacity) {
|
||||
U16_APPEND_UNSAFE(dest, desti, c);
|
||||
copyLimit = srci+len;
|
||||
} else {
|
||||
desti += len;
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
@ -2770,6 +3135,8 @@ charIterTextExtract(UText *ut,
|
||||
srci += len;
|
||||
}
|
||||
|
||||
charIterTextAccess(ut, copyLimit, TRUE);
|
||||
|
||||
u_terminateUChars(dest, destCapacity, desti, status);
|
||||
return desti;
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ translit.o utrans.o esctrn.o unesctrn.o funcrepl.o strrepl.o tridpars.o \
|
||||
cpdtrans.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
|
||||
nultrans.o remtrans.o casetrn.o titletrn.o tolowtrn.o toupptrn.o anytrans.o \
|
||||
name2uni.o uni2name.o nortrans.o quant.o transreg.o brktrans.o \
|
||||
regexcmp.o rematch.o repattrn.o regexst.o udatpg.o uregex.o uregexc.o \
|
||||
regexcmp.o rematch.o repattrn.o regexst.o regextxt.o udatpg.o uregex.o uregexc.o \
|
||||
ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
|
||||
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
|
||||
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
|
||||
|
@ -3602,6 +3602,14 @@
|
||||
RelativePath=".\regexst.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regextxt.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regextxt.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\rematch.cpp"
|
||||
>
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// file: regexcmp.cpp
|
||||
//
|
||||
// Copyright (C) 2002-2009 International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2010 International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the ICU regular expression compiler, which is responsible
|
||||
@ -13,6 +13,7 @@
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
@ -21,6 +22,7 @@
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "util.h"
|
||||
#include "putilimp.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uvectr32.h"
|
||||
@ -33,6 +35,7 @@
|
||||
// generated by a Perl script.
|
||||
#include "regexcmp.h"
|
||||
#include "regexst.h"
|
||||
#include "regextxt.h"
|
||||
|
||||
|
||||
|
||||
@ -47,11 +50,13 @@ U_NAMESPACE_BEGIN
|
||||
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
|
||||
fParenStack(status), fSetStack(status), fSetOpStack(status)
|
||||
{
|
||||
// Lazy init of all shared global sets (needed for init()'s empty text)
|
||||
RegexStaticSets::initGlobals(&status);
|
||||
|
||||
fStatus = &status;
|
||||
|
||||
fRXPat = rxp;
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
@ -97,6 +102,24 @@ void RegexCompile::compile(
|
||||
const UnicodeString &pat, // Source pat to be compiled.
|
||||
UParseError &pp, // Error position info
|
||||
UErrorCode &e) // Error Code
|
||||
{
|
||||
UText patternText = UTEXT_INITIALIZER;
|
||||
utext_openConstUnicodeString(&patternText, &pat, &e);
|
||||
|
||||
if (U_SUCCESS(e)) {
|
||||
compile(&patternText, pp, e);
|
||||
utext_close(&patternText);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// compile, UText mode
|
||||
// All the work is actually done here.
|
||||
//
|
||||
void RegexCompile::compile(
|
||||
UText *pat, // Source pat to be compiled.
|
||||
UParseError &pp, // Error position info
|
||||
UErrorCode &e) // Error Code
|
||||
{
|
||||
fStatus = &e;
|
||||
fParseErr = &pp;
|
||||
@ -108,16 +131,16 @@ void RegexCompile::compile(
|
||||
}
|
||||
|
||||
// There should be no pattern stuff in the RegexPattern object. They can not be reused.
|
||||
U_ASSERT(fRXPat->fPattern.length() == 0);
|
||||
U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
|
||||
|
||||
// Prepare the RegexPattern object to receive the compiled pattern.
|
||||
fRXPat->fPattern = pat;
|
||||
fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
|
||||
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
|
||||
|
||||
|
||||
// Initialize the pattern scanning state machine
|
||||
fPatternLength = pat.length();
|
||||
fPatternLength = utext_nativeLength(pat);
|
||||
uint16_t state = 1;
|
||||
const RegexTableEl *tableEl;
|
||||
nextChar(fC); // Fetch the first char from the pattern string.
|
||||
@ -250,34 +273,13 @@ void RegexCompile::compile(
|
||||
// The pattern has now been read and processed, and the compiled code generated.
|
||||
//
|
||||
|
||||
// Back-reference fixup
|
||||
//
|
||||
int32_t loc;
|
||||
for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
|
||||
int32_t op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
int32_t opType = URX_TYPE(op);
|
||||
if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
|
||||
int32_t where = URX_VAL(op);
|
||||
if (where > fRXPat->fGroupMap->size()) {
|
||||
error(U_REGEX_INVALID_BACK_REF);
|
||||
break;
|
||||
}
|
||||
where = fRXPat->fGroupMap->elementAti(where-1);
|
||||
op = URX_BUILD(opType, where);
|
||||
fRXPat->fCompiledPat->setElementAt(op, loc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Compute the number of digits required for the largest capture group number.
|
||||
//
|
||||
fRXPat->fMaxCaptureDigits = 1;
|
||||
int32_t n = 10;
|
||||
for (;;) {
|
||||
if (n > fRXPat->fGroupMap->size()) {
|
||||
break;
|
||||
}
|
||||
int32_t groupCount = fRXPat->fGroupMap->size();
|
||||
while (n <= groupCount) {
|
||||
fRXPat->fMaxCaptureDigits++;
|
||||
n *= 10;
|
||||
}
|
||||
@ -286,10 +288,15 @@ void RegexCompile::compile(
|
||||
// The pattern's fFrameSize so far has accumulated the requirements for
|
||||
// storage for capture parentheses, counters, etc. that are encountered
|
||||
// in the pattern. Add space for the two variables that are always
|
||||
// present in the saved state: the input string position and the
|
||||
// position in the compiled pattern.
|
||||
// present in the saved state: the input string position (int64_t) and
|
||||
// the position in the compiled pattern.
|
||||
//
|
||||
fRXPat->fFrameSize+=2;
|
||||
fRXPat->fFrameSize+=3;
|
||||
|
||||
//
|
||||
// Optimization pass 1: NOPs, back-references, and case-folding
|
||||
//
|
||||
stripNOPs();
|
||||
|
||||
//
|
||||
// Get bounds for the minimum and maximum length of a string that this
|
||||
@ -299,10 +306,9 @@ void RegexCompile::compile(
|
||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
|
||||
//
|
||||
// Optimization passes
|
||||
// Optimization pass 2: match start type
|
||||
//
|
||||
matchStartType();
|
||||
stripNOPs();
|
||||
|
||||
//
|
||||
// Set up fast latin-1 range sets
|
||||
@ -426,19 +432,19 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
// is an '|' alternation within the parens.
|
||||
//
|
||||
// Each capture group gets three slots in the save stack frame:
|
||||
// 0: Capture Group start position (in input string being matched.)
|
||||
// 1: Capture Group end positino.
|
||||
// 2: Start of Match-in-progress.
|
||||
// Each capture group gets three double-width slots in the save stack frame:
|
||||
// 0-1: Capture Group start position (in input string being matched.)
|
||||
// 2-3: Capture Group end position.
|
||||
// 4-5: Start of Match-in-progress.
|
||||
// The first two locations are for a completed capture group, and are
|
||||
// referred to by back references and the like.
|
||||
// The third location stores the capture start position when an START_CAPTURE is
|
||||
// encountered. This will be promoted to a completed capture when (and if) the corresponding
|
||||
// END_CAPure is encountered.
|
||||
// END_CAPTURE is encountered.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
|
||||
fRXPat->fFrameSize += 3;
|
||||
int32_t varsLoc = fRXPat->fFrameSize; // Reserve six slots (three double-width) in match stack frame.
|
||||
fRXPat->fFrameSize += 6;
|
||||
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
|
||||
fRXPat->fCompiledPat->addElement(cop, *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
@ -532,10 +538,10 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// 8. code for parenthesized stuff.
|
||||
// 9. LA_END
|
||||
//
|
||||
// Two data slots are reserved, for saving the stack ptr and the input position.
|
||||
// Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
|
||||
{
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 2;
|
||||
fRXPat->fDataSize += 3;
|
||||
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
@ -576,9 +582,10 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
|
||||
// 7. END_LA // Restore match region, in case look-ahead was using
|
||||
// an alternate (transparent) region.
|
||||
// Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
|
||||
{
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 2;
|
||||
fRXPat->fDataSize += 3;
|
||||
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
@ -617,12 +624,12 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// Allocate a block of matcher data, to contain (when running a match)
|
||||
// 0: Stack ptr on entry
|
||||
// 1: Input Index on entry
|
||||
// 2: Start index of match current match attempt.
|
||||
// 3: Original Input String len.
|
||||
// 2-3: Start index of the current match attempt.
|
||||
// 4-5: Original Input String len.
|
||||
|
||||
// Allocate data space
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 4;
|
||||
fRXPat->fDataSize += 6;
|
||||
|
||||
// Emit URX_LB_START
|
||||
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
|
||||
@ -670,12 +677,12 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// Allocate a block of matcher data, to contain (when running a match)
|
||||
// 0: Stack ptr on entry
|
||||
// 1: Input Index on entry
|
||||
// 2: Start index of match current match attempt.
|
||||
// 3: Original Input String len.
|
||||
// 2-3: Start index of the current match attempt.
|
||||
// 4-5: Original Input String len.
|
||||
|
||||
// Allocate data space
|
||||
int32_t dataLoc = fRXPat->fDataSize;
|
||||
fRXPat->fDataSize += 4;
|
||||
fRXPat->fDataSize += 6;
|
||||
|
||||
// Emit URX_LB_START
|
||||
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
|
||||
@ -764,7 +771,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
|
||||
fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
|
||||
frameLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
|
||||
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
|
||||
break;
|
||||
@ -784,7 +791,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
}
|
||||
fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
|
||||
frameLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
|
||||
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
|
||||
break;
|
||||
@ -801,7 +808,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// Emit the code sequence that can handle it.
|
||||
insertOp(topLoc);
|
||||
frameLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
|
||||
int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc);
|
||||
@ -907,7 +914,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
|
||||
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
|
||||
break;
|
||||
@ -927,7 +934,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
}
|
||||
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
|
||||
break;
|
||||
@ -945,7 +952,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
|
||||
insertOp(saveStateLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
fRXPat->fFrameSize += 2; // double-width index
|
||||
|
||||
int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
|
||||
@ -1658,7 +1665,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
}
|
||||
|
||||
|
||||
case doSetNegate:
|
||||
case doSetNegate:
|
||||
// Scanned a '^' at the start of a set.
|
||||
// Push the negation operator onto the set op stack.
|
||||
// A twist for case-insensitive matching:
|
||||
@ -1771,16 +1778,11 @@ void RegexCompile::literalChar(UChar32 c) {
|
||||
// Emit a OneChar op into the compiled pattern.
|
||||
emitONE_CHAR(c);
|
||||
|
||||
// Also add it to the string pool, in case we get a second adjacent literal
|
||||
// and want to change form ONE_CHAR to STRING
|
||||
// Mark that we might actually be starting a string here
|
||||
fStringOpStart = fRXPat->fLiteralText.length();
|
||||
fRXPat->fLiteralText.append(c);
|
||||
return;
|
||||
}
|
||||
|
||||
// We are adding onto an existing string
|
||||
fRXPat->fLiteralText.append(c);
|
||||
|
||||
op = fRXPat->fCompiledPat->lastElementi();
|
||||
opType = URX_TYPE(op);
|
||||
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
|
||||
@ -1801,6 +1803,8 @@ void RegexCompile::literalChar(UChar32 c) {
|
||||
// The most recently emitted op is a ONECHAR.
|
||||
// We've now received another adjacent char. Change the ONECHAR op
|
||||
// to a string op.
|
||||
fRXPat->fLiteralText.append(URX_VAL(op));
|
||||
|
||||
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
|
||||
op = URX_BUILD(URX_STRING_I, fStringOpStart);
|
||||
} else {
|
||||
@ -1812,6 +1816,9 @@ void RegexCompile::literalChar(UChar32 c) {
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
|
||||
// We are adding onto an existing string
|
||||
fRXPat->fLiteralText.append(c);
|
||||
|
||||
// The pattern contains a URX_STRING / URX_STRING_LEN. Update the
|
||||
// string length to reflect the new char we just added to the string.
|
||||
stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
|
||||
@ -1834,7 +1841,7 @@ void RegexCompile::emitONE_CHAR(UChar32 c) {
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
|
||||
u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
|
||||
// We have a cased character, and are in case insensitive matching mode.
|
||||
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
//c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now
|
||||
op = URX_BUILD(URX_ONECHAR_I, c);
|
||||
} else {
|
||||
// Uncased char, or case sensitive match mode.
|
||||
@ -2245,7 +2252,6 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
||||
// ignored strings, that would be better.)
|
||||
theSet->removeAllStrings();
|
||||
int32_t setSize = theSet->size();
|
||||
UChar32 firstSetChar = theSet->charAt(0);
|
||||
|
||||
switch (setSize) {
|
||||
case 0:
|
||||
@ -2261,7 +2267,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
||||
// The set contains only a single code point. Put it into
|
||||
// the compiled pattern as a single char operation rather
|
||||
// than a set, and discard the set itself.
|
||||
literalChar(firstSetChar);
|
||||
literalChar(theSet->charAt(0));
|
||||
delete theSet;
|
||||
}
|
||||
break;
|
||||
@ -3378,6 +3384,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
// code generation to provide locations that may be patched later.
|
||||
// Many end up unneeded, and are removed by this function.
|
||||
//
|
||||
// In order to minimize the number of passes through the pattern,
|
||||
// back-reference fixup is also performed here (adjusting
|
||||
// back-reference operands to point to the correct frame offsets).
|
||||
//
|
||||
// In addition, case-insensitive character and string literals are
|
||||
// now case-folded here, rather than when first parsed or at match
|
||||
// time.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::stripNOPs() {
|
||||
|
||||
@ -3400,6 +3414,9 @@ void RegexCompile::stripNOPs() {
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString caseStringBuffer;
|
||||
int32_t stringDelta = 0;
|
||||
|
||||
// Make a second pass over the code, removing the NOPs by moving following
|
||||
// code up, and patching operands that refer to code locations that
|
||||
// are being moved. The array of offsets from the first step is used
|
||||
@ -3432,12 +3449,69 @@ void RegexCompile::stripNOPs() {
|
||||
break;
|
||||
}
|
||||
|
||||
case URX_ONECHAR_I:
|
||||
{
|
||||
UChar32 c = URX_VAL(op);
|
||||
if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
|
||||
// We have a cased character to fold
|
||||
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
op = URX_BUILD(URX_ONECHAR_I, c);
|
||||
}
|
||||
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
break;
|
||||
}
|
||||
case URX_STRING_I:
|
||||
{
|
||||
op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta);
|
||||
|
||||
src++;
|
||||
int32_t lengthOp = fRXPat->fCompiledPat->elementAti(src);
|
||||
|
||||
caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp));
|
||||
caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT);
|
||||
|
||||
int32_t newLen = caseStringBuffer.length();
|
||||
if (newLen <= URX_VAL(lengthOp)) {
|
||||
// don't shift if we don't have to, take the tiny memory hit of a smaller string
|
||||
fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer);
|
||||
} else {
|
||||
// shift other strings over...at least UnicodeString handles this for us!
|
||||
fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer);
|
||||
stringDelta += newLen - URX_VAL(lengthOp);
|
||||
}
|
||||
lengthOp = URX_BUILD(URX_STRING_LEN, newLen);
|
||||
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1);
|
||||
dst += 2;
|
||||
break;
|
||||
}
|
||||
case URX_BACKREF:
|
||||
case URX_BACKREF_I:
|
||||
{
|
||||
int32_t where = URX_VAL(op);
|
||||
if (where > fRXPat->fGroupMap->size()) {
|
||||
error(U_REGEX_INVALID_BACK_REF);
|
||||
break;
|
||||
}
|
||||
where = fRXPat->fGroupMap->elementAti(where-1);
|
||||
op = URX_BUILD(opType, where);
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
|
||||
fRXPat->fNeedsAltInput = TRUE;
|
||||
break;
|
||||
}
|
||||
case URX_STRING:
|
||||
op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta);
|
||||
// continue
|
||||
case URX_RESERVED_OP:
|
||||
case URX_RESERVED_OP_N:
|
||||
case URX_BACKTRACK:
|
||||
case URX_END:
|
||||
case URX_ONECHAR:
|
||||
case URX_STRING:
|
||||
case URX_STRING_LEN:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
@ -3460,13 +3534,9 @@ void RegexCompile::stripNOPs() {
|
||||
case URX_DOTANY_UNIX:
|
||||
case URX_STO_SP:
|
||||
case URX_LD_SP:
|
||||
case URX_BACKREF:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_LA_START:
|
||||
case URX_LA_END:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_STRING_I:
|
||||
case URX_BACKREF_I:
|
||||
case URX_DOLLAR_M:
|
||||
case URX_CARET_M:
|
||||
case URX_CARET_M_UNIX:
|
||||
@ -3510,14 +3580,14 @@ void RegexCompile::error(UErrorCode e) {
|
||||
fParseErr->line = fLineNum;
|
||||
fParseErr->offset = fCharNum;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
|
||||
|
||||
// Fill in the context.
|
||||
// Note: extractBetween() pins supplied indices to the string bounds.
|
||||
uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
|
||||
uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
|
||||
fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
|
||||
fParseErr->preContext, 0);
|
||||
fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
|
||||
fParseErr->postContext, 0);
|
||||
utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
|
||||
utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3557,18 +3627,18 @@ static const UChar chLS = 0x2028; // Unicode Line Separator
|
||||
//------------------------------------------------------------------------------
|
||||
UChar32 RegexCompile::nextCharLL() {
|
||||
UChar32 ch;
|
||||
UnicodeString &pattern = fRXPat->fPattern;
|
||||
|
||||
if (fPeekChar != -1) {
|
||||
ch = fPeekChar;
|
||||
fPeekChar = -1;
|
||||
return ch;
|
||||
}
|
||||
if (fPatternLength==0 || fNextIndex >= fPatternLength) {
|
||||
return (UChar32)-1;
|
||||
|
||||
// assume we're already in the right place
|
||||
ch = UTEXT_NEXT32(fRXPat->fPattern);
|
||||
if (ch == U_SENTINEL) {
|
||||
return ch;
|
||||
}
|
||||
ch = pattern.char32At(fNextIndex);
|
||||
fNextIndex = pattern.moveIndex32(fNextIndex, 1);
|
||||
|
||||
if (ch == chCR ||
|
||||
ch == chNEL ||
|
||||
@ -3613,7 +3683,7 @@ UChar32 RegexCompile::peekCharLL() {
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
|
||||
fScanIndex = fNextIndex;
|
||||
fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
|
||||
c.fChar = nextCharLL();
|
||||
c.fQuoted = FALSE;
|
||||
|
||||
@ -3670,8 +3740,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
// check for backslash escaped characters.
|
||||
//
|
||||
if (c.fChar == chBackSlash) {
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
|
||||
if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
|
||||
//
|
||||
// A '\' sequence that is handled by ICU's standard unescapeAt function.
|
||||
@ -3680,19 +3749,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
//
|
||||
nextCharLL(); // get & discard the peeked char.
|
||||
c.fQuoted = TRUE;
|
||||
c.fChar = fRXPat->fPattern.unescapeAt(endX);
|
||||
if (startX == endX) {
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
|
||||
if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
|
||||
int32_t endIndex = pos;
|
||||
c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, fPatternLength, (void *)fRXPat->fPattern->chunkContents);
|
||||
|
||||
if (endIndex == pos) {
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
}
|
||||
fCharNum += endIndex - pos;
|
||||
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
|
||||
} else {
|
||||
int32_t offset = 0;
|
||||
struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
|
||||
|
||||
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
|
||||
c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
|
||||
|
||||
if (offset == 0) {
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
} else if (context.lastOffset == offset) {
|
||||
UTEXT_PREVIOUS32(fRXPat->fPattern);
|
||||
} else if (context.lastOffset != offset-1) {
|
||||
utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
|
||||
}
|
||||
fCharNum += offset;
|
||||
}
|
||||
fCharNum += endX - startX;
|
||||
fNextIndex = endX;
|
||||
}
|
||||
else if (peekCharLL() == chDigit0) {
|
||||
// Octal Escape, using Java Regexp Conventions
|
||||
// which are \0 followed by 1-3 octal digits.
|
||||
// Different from ICU Unescape handling of Octal, which does not
|
||||
// require the leading 0.
|
||||
// Java also has the convention of only consuning 2 octal digits if
|
||||
// Java also has the convention of only consuming 2 octal digits if
|
||||
// the three digit number would be > 0xff
|
||||
//
|
||||
c.fChar = 0;
|
||||
@ -3873,13 +3962,13 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
|
||||
// Save the scanner state.
|
||||
// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
|
||||
int32_t savedScanIndex = fScanIndex;
|
||||
int32_t savedNextIndex = fNextIndex;
|
||||
int64_t savedScanIndex = fScanIndex;
|
||||
int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
|
||||
UBool savedQuoteMode = fQuoteMode;
|
||||
UBool savedInBackslashQuote = fInBackslashQuote;
|
||||
UBool savedEOLComments = fEOLComments;
|
||||
int32_t savedLineNum = fLineNum;
|
||||
int32_t savedCharNum = fCharNum;
|
||||
int64_t savedLineNum = fLineNum;
|
||||
int64_t savedCharNum = fCharNum;
|
||||
UChar32 savedLastChar = fLastChar;
|
||||
UChar32 savedPeekChar = fPeekChar;
|
||||
RegexPatternChar savedfC = fC;
|
||||
@ -3926,7 +4015,6 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
// The main scanner will retry the input as a normal set expression,
|
||||
// not a [:Property:] expression.
|
||||
fScanIndex = savedScanIndex;
|
||||
fNextIndex = savedNextIndex;
|
||||
fQuoteMode = savedQuoteMode;
|
||||
fInBackslashQuote = savedInBackslashQuote;
|
||||
fEOLComments = savedEOLComments;
|
||||
@ -3935,6 +4023,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
fLastChar = savedLastChar;
|
||||
fPeekChar = savedPeekChar;
|
||||
fC = savedfC;
|
||||
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
|
||||
}
|
||||
return uset;
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// regexcmp.h
|
||||
//
|
||||
// Copyright (C) 2002-2008, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2010, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexCompile
|
||||
@ -54,6 +54,7 @@ public:
|
||||
RegexCompile(RegexPattern *rp, UErrorCode &e);
|
||||
|
||||
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
|
||||
void compile(UText *pat, UParseError &pp, UErrorCode &e);
|
||||
|
||||
|
||||
virtual ~RegexCompile();
|
||||
@ -102,7 +103,7 @@ private:
|
||||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
void insertOp(int32_t where); // Open up a slot for a new op in the
|
||||
// generated code at the specified location.
|
||||
void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code,
|
||||
void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code,
|
||||
// taking case mode into account.
|
||||
int32_t minMatchLength(int32_t start,
|
||||
int32_t end);
|
||||
@ -124,16 +125,14 @@ private:
|
||||
//
|
||||
// Data associated with low level character scanning
|
||||
//
|
||||
int32_t fScanIndex; // Index of current character being processed
|
||||
int64_t fScanIndex; // Index of current character being processed
|
||||
// in the rule input string.
|
||||
int32_t fNextIndex; // Index of the next character, which
|
||||
// is the first character not yet scanned.
|
||||
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
|
||||
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
|
||||
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
|
||||
// end of line comments, in favor of (?#...) comments.
|
||||
int32_t fLineNum; // Line number in input file.
|
||||
int32_t fCharNum; // Char position within the line.
|
||||
int64_t fLineNum; // Line number in input file.
|
||||
int64_t fCharNum; // Char position within the line.
|
||||
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
||||
// as a single line, not two.
|
||||
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
|
||||
@ -168,7 +167,7 @@ private:
|
||||
// holds the start index within RegexPattern.
|
||||
// fLiteralText where the string is being stored.
|
||||
|
||||
int32_t fPatternLength; // Length of the input pattern string.
|
||||
int64_t fPatternLength; // Length of the input pattern string.
|
||||
|
||||
UVector32 fParenStack; // parentheses stack. Each frame consists of
|
||||
// the positions of compiled pattern operations
|
||||
@ -196,7 +195,7 @@ private:
|
||||
// -1 for the upper interval value means none
|
||||
// was specified (unlimited occurrences.)
|
||||
|
||||
int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
|
||||
int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
|
||||
// pattern, valid while remainder of name is
|
||||
// scanned.
|
||||
|
||||
@ -208,7 +207,6 @@ private:
|
||||
UChar32 fLastSetLiteral; // The last single code point added to a set.
|
||||
// needed when "-y" is scanned, and we need
|
||||
// to turn "x-y" into a range.
|
||||
|
||||
};
|
||||
|
||||
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
|
||||
|
@ -1,5 +1,5 @@
|
||||
//
|
||||
// Copyright (C) 2002-2007 International Business Machines Corporation
|
||||
// Copyright (C) 2002-2010 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
// file: regeximp.h
|
||||
@ -279,11 +279,12 @@ enum {
|
||||
// Match Engine State Stack Frame Layout.
|
||||
//
|
||||
struct REStackFrame {
|
||||
int32_t fInputIdx; // Position of next character in the input string
|
||||
int64_t fInputIdx; // Position of next character in the input string
|
||||
int32_t fPatIdx; // Position of next Op in the compiled pattern
|
||||
int32_t fExtra[2]; // Extra state, for capture group start/ends
|
||||
// atomic parentheses, repeat counts, etc.
|
||||
// Locations assigned at pattern compile time.
|
||||
// Note that this will likely end up longer than 64 bits.
|
||||
};
|
||||
|
||||
//
|
||||
@ -307,7 +308,6 @@ enum StartOfMatch {
|
||||
(v)==START_STRING? "START_STRING" : \
|
||||
"ILLEGAL")
|
||||
|
||||
|
||||
//
|
||||
// 8 bit set, to fast-path latin-1 set membership tests.
|
||||
//
|
||||
@ -347,7 +347,6 @@ inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
|
||||
uprv_memcpy(d, s.d, sizeof(d));
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// regexst.cpp
|
||||
//
|
||||
// Copyright (C) 2004-2008, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2004-2010, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains class RegexStaticSets
|
||||
@ -214,6 +214,10 @@ fRuleDigitsAlias(NULL)
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
fRuleSets[i].compact();
|
||||
}
|
||||
|
||||
// Finally, initialize an empty string for utility purposes
|
||||
fEmptyText = utext_openUChars(NULL, NULL, 0, status);
|
||||
|
||||
return; // If we reached this point, everything is fine so just exit
|
||||
|
||||
ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
|
||||
@ -233,6 +237,8 @@ RegexStaticSets::~RegexStaticSets() {
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
fRuleDigitsAlias = NULL;
|
||||
|
||||
utext_close(fEmptyText);
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// regexst.h
|
||||
//
|
||||
// Copyright (C) 2003-2008, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2003-2010, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexStaticSets
|
||||
@ -19,6 +19,7 @@
|
||||
#define REGEXST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/utext.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "regeximp.h"
|
||||
@ -45,7 +46,7 @@ public:
|
||||
UnicodeSet fUnescapeCharSet; // Set of chars handled by unescape when
|
||||
// encountered with a \ in a pattern.
|
||||
UnicodeSet *fRuleDigitsAlias;
|
||||
UnicodeString fEmptyString; // An empty string, to be used when a matcher
|
||||
UText *fEmptyText; // An empty string, to be used when a matcher
|
||||
// is created with no input.
|
||||
|
||||
};
|
||||
|
45
icu4c/source/i18n/regextxt.cpp
Normal file
45
icu4c/source/i18n/regextxt.cpp
Normal file
@ -0,0 +1,45 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2008-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
//
|
||||
// file: regextxt.cpp
|
||||
//
|
||||
// This file contains utility code for supporting UText in the regular expression engine.
|
||||
//
|
||||
|
||||
#include "regextxt.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
U_CFUNC UChar U_CALLCONV
|
||||
uregex_utext_unescape_charAt(int32_t offset, void *ct) {
|
||||
struct URegexUTextUnescapeCharContext *context = (struct URegexUTextUnescapeCharContext *)ct;
|
||||
UChar32 c;
|
||||
if (offset == context->lastOffset + 1) {
|
||||
c = UTEXT_NEXT32(context->text);
|
||||
context->lastOffset++;
|
||||
} else if (offset == context->lastOffset) {
|
||||
c = UTEXT_PREVIOUS32(context->text);
|
||||
UTEXT_NEXT32(context->text);
|
||||
} else {
|
||||
utext_moveIndex32(context->text, offset - context->lastOffset - 1);
|
||||
c = UTEXT_NEXT32(context->text);
|
||||
context->lastOffset = offset;
|
||||
}
|
||||
|
||||
// !!!: Doesn't handle characters outside the BMP; any supplementary code point is returned as 0.
|
||||
if (U_IS_BMP(c)) {
|
||||
return (UChar)c;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC UChar U_CALLCONV
|
||||
uregex_ucstr_unescape_charAt(int32_t offset, void *context) {
|
||||
return ((UChar *)context)[offset];
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
48
icu4c/source/i18n/regextxt.h
Normal file
48
icu4c/source/i18n/regextxt.h
Normal file
@ -0,0 +1,48 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2008-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
//
|
||||
// file: regextxt.h
|
||||
//
|
||||
// This file contains utility code for supporting UText in the regular expression engine.
|
||||
//
|
||||
// This class is internal to the regular expression implementation.
|
||||
// For the public Regular Expression API, see the file "unicode/regex.h"
|
||||
//
|
||||
|
||||
#ifndef _REGEXTXT_H
|
||||
#define _REGEXTXT_H
|
||||
|
||||
#include <unicode/utypes.h>
|
||||
#include <unicode/utext.h>
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define UTEXT_USES_U16(ut) (NULL==((ut)->pFuncs->mapNativeIndexToUTF16))
|
||||
|
||||
#if 0
|
||||
#define REGEX_DISABLE_CHUNK_MODE 1
|
||||
#endif
|
||||
|
||||
#ifdef REGEX_DISABLE_CHUNK_MODE
|
||||
# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) (FALSE)
|
||||
#else
|
||||
# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) ((0==((ut)->chunkNativeStart))&&((len)==((ut)->chunkNativeLimit))&&((len)==((ut)->nativeIndexingLimit)))
|
||||
#endif
|
||||
|
||||
struct URegexUTextUnescapeCharContext {
|
||||
UText *text;
|
||||
int32_t lastOffset;
|
||||
};
|
||||
#define U_REGEX_UTEXT_UNESCAPE_CONTEXT(text) { (text), -1 }
|
||||
|
||||
U_CFUNC UChar U_CALLCONV
|
||||
uregex_utext_unescape_charAt(int32_t offset, void * /* struct URegexUTextUnescapeCharContext* */ context);
|
||||
U_CFUNC UChar U_CALLCONV
|
||||
uregex_ucstr_unescape_charAt(int32_t offset, void * /* UChar* */ context);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@ -3,7 +3,7 @@
|
||||
//
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2009 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2010 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
@ -29,11 +29,11 @@ U_NAMESPACE_BEGIN
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
RegexPattern::RegexPattern() {
|
||||
// Init all of this instance's data.
|
||||
init();
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
u_init(&status);
|
||||
|
||||
// Lazy init of all shared global sets.
|
||||
RegexStaticSets::initGlobals(&fDeferredStatus);
|
||||
// Init all of this instances data.
|
||||
init();
|
||||
}
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
//
|
||||
// Assignmenet Operator
|
||||
// Assignment Operator
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
@ -68,7 +68,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
init();
|
||||
|
||||
// Copy simple fields
|
||||
fPattern = other.fPattern;
|
||||
fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
|
||||
fFlags = other.fFlags;
|
||||
fLiteralText = other.fLiteralText;
|
||||
fDeferredStatus = other.fDeferredStatus;
|
||||
@ -85,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
*fInitialChars = *other.fInitialChars;
|
||||
fInitialChar = other.fInitialChar;
|
||||
*fInitialChars8 = *other.fInitialChars8;
|
||||
fNeedsAltInput = other.fNeedsAltInput;
|
||||
|
||||
// Copy the pattern. It's just values, nothing deep to copy.
|
||||
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
|
||||
@ -126,7 +127,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
void RegexPattern::init() {
|
||||
fPattern.remove();
|
||||
fFlags = 0;
|
||||
fCompiledPat = 0;
|
||||
fLiteralText.remove();
|
||||
@ -146,7 +146,9 @@ void RegexPattern::init() {
|
||||
fInitialChars = NULL;
|
||||
fInitialChar = 0;
|
||||
fInitialChars8 = NULL;
|
||||
fNeedsAltInput = FALSE;
|
||||
|
||||
fPattern = NULL; // will be set later
|
||||
fCompiledPat = new UVector32(fDeferredStatus);
|
||||
fGroupMap = new UVector32(fDeferredStatus);
|
||||
fSets = new UVector(fDeferredStatus);
|
||||
@ -192,6 +194,9 @@ void RegexPattern::zap() {
|
||||
fInitialChars = NULL;
|
||||
delete fInitialChars8;
|
||||
fInitialChars8 = NULL;
|
||||
if (fPattern != NULL) {
|
||||
utext_close(fPattern);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -220,13 +225,27 @@ RegexPattern *RegexPattern::clone() const {
|
||||
//
|
||||
// operator == (comparison) Consider two patterns to be == if the
|
||||
// pattern strings and the flags are the same.
|
||||
// Note that pattern strings with the same
|
||||
// characters can still be considered different.
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
UBool RegexPattern::operator ==(const RegexPattern &other) const {
|
||||
UBool r = this->fFlags == other.fFlags &&
|
||||
this->fPattern == other.fPattern &&
|
||||
this->fDeferredStatus == other.fDeferredStatus;
|
||||
return r;
|
||||
if (this->fPattern == NULL) {
|
||||
if (other.fPattern == NULL) {
|
||||
return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
} else {
|
||||
if (other.fPattern == NULL) {
|
||||
return FALSE;
|
||||
} else {
|
||||
UTEXT_SETNATIVEINDEX(this->fPattern, 0);
|
||||
UTEXT_SETNATIVEINDEX(other.fPattern, 0);
|
||||
return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus &&
|
||||
utext_equals(this->fPattern, other.fPattern);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
@ -240,7 +259,57 @@ RegexPattern::compile(const UnicodeString ®ex,
|
||||
UParseError &pe,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
|
||||
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
|
||||
UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
|
||||
|
||||
if ((flags & ~allFlags) != 0) {
|
||||
status = U_REGEX_INVALID_FLAG;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ((flags & UREGEX_CANON_EQ) != 0) {
|
||||
status = U_REGEX_UNIMPLEMENTED;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
RegexPattern *This = new RegexPattern;
|
||||
if (This == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
if (U_FAILURE(This->fDeferredStatus)) {
|
||||
status = This->fDeferredStatus;
|
||||
delete This;
|
||||
return NULL;
|
||||
}
|
||||
This->fFlags = flags;
|
||||
|
||||
RegexCompile compiler(This, status);
|
||||
compiler.compile(regex, pe, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
This = NULL;
|
||||
}
|
||||
|
||||
return This;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// compile, UText mode
|
||||
//
|
||||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile(UText *regex,
|
||||
uint32_t flags,
|
||||
UParseError &pe,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
@ -294,20 +363,43 @@ RegexPattern::compile(const UnicodeString ®ex,
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// compile with default flags, UText mode
|
||||
//
|
||||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile(UText *regex,
|
||||
UParseError &pe,
|
||||
UErrorCode &err)
|
||||
{
|
||||
return compile(regex, 0, pe, err);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// compile with no UParseError parameter.
|
||||
//
|
||||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile( const UnicodeString ®ex,
|
||||
uint32_t flags,
|
||||
UErrorCode &err)
|
||||
RegexPattern::compile(const UnicodeString ®ex,
|
||||
uint32_t flags,
|
||||
UErrorCode &err)
|
||||
{
|
||||
UParseError pe;
|
||||
return compile(regex, flags, pe, err);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// compile with no UParseError parameter, UText mode
|
||||
//
|
||||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile(UText *regex,
|
||||
uint32_t flags,
|
||||
UErrorCode &err)
|
||||
{
|
||||
UParseError pe;
|
||||
return compile(regex, flags, pe, err);
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
@ -327,8 +419,21 @@ uint32_t RegexPattern::flags() const {
|
||||
RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
|
||||
UErrorCode &status) const {
|
||||
RegexMatcher *retMatcher = matcher(status);
|
||||
retMatcher->fDeferredStatus = status;
|
||||
if (retMatcher != NULL) {
|
||||
retMatcher->fDeferredStatus = status;
|
||||
retMatcher->reset(input);
|
||||
}
|
||||
return retMatcher;
|
||||
}
|
||||
|
||||
//
|
||||
// matcher, UText mode
|
||||
//
|
||||
RegexMatcher *RegexPattern::matcher(UText *input,
|
||||
UErrorCode &status) const {
|
||||
RegexMatcher *retMatcher = matcher(status);
|
||||
if (retMatcher != NULL) {
|
||||
retMatcher->fDeferredStatus = status;
|
||||
retMatcher->reset(input);
|
||||
}
|
||||
return retMatcher;
|
||||
@ -399,6 +504,31 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// matches, UText mode
|
||||
//
|
||||
UBool U_EXPORT2 RegexPattern::matches(UText *regex,
|
||||
UText *input,
|
||||
UParseError &pe,
|
||||
UErrorCode &status) {
|
||||
|
||||
if (U_FAILURE(status)) {return FALSE;}
|
||||
|
||||
UBool retVal;
|
||||
RegexPattern *pat = NULL;
|
||||
RegexMatcher *matcher = NULL;
|
||||
|
||||
pat = RegexPattern::compile(regex, 0, pe, status);
|
||||
matcher = pat->matcher(input, status);
|
||||
retVal = matcher->matches(status);
|
||||
|
||||
delete matcher;
|
||||
delete pat;
|
||||
return retVal;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
@ -407,12 +537,43 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
UnicodeString RegexPattern::pattern() const {
|
||||
return fPattern;
|
||||
if (fPattern == NULL) {
|
||||
return UnicodeString();
|
||||
} else {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int64_t nativeLen = utext_nativeLength(fPattern);
|
||||
int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
|
||||
UnicodeString result;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *resultChars = result.getBuffer(len16);
|
||||
utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
|
||||
result.releaseBuffer(len16);
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// patternText
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
UText *RegexPattern::patternText() const {
|
||||
if (fPattern != NULL) {
|
||||
return fPattern;
|
||||
} else {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
RegexStaticSets::initGlobals(&status);
|
||||
return RegexStaticSets::gStaticSets->fEmptyText;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// split
|
||||
@ -421,7 +582,28 @@ UnicodeString RegexPattern::pattern() const {
|
||||
int32_t RegexPattern::split(const UnicodeString &input,
|
||||
UnicodeString dest[],
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status) const
|
||||
UErrorCode &status) const
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
};
|
||||
|
||||
RegexMatcher m(this);
|
||||
int32_t r = 0;
|
||||
// Check m's status to make sure all is ok.
|
||||
if (U_SUCCESS(m.fDeferredStatus)) {
|
||||
r = m.split(input, dest, destCapacity, status);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
//
|
||||
// split, UText mode
|
||||
//
|
||||
int32_t RegexPattern::split(UText *input,
|
||||
UText *dest[],
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status) const
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
@ -572,17 +754,24 @@ RegexPatternDump(const RegexPattern *This) {
|
||||
int i;
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
|
||||
for (i=0; i<This->fPattern.length(); i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
|
||||
UChar32 c = utext_next32From(This->fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
|
||||
|
||||
c = UTEXT_NEXT32(This->fPattern);
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n"));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
|
||||
if (This->fStartType == START_STRING) {
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
|
||||
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates.
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
|
||||
|
||||
} else if (This->fStartType == START_SET) {
|
||||
int32_t numSetChars = This->fInitialChars->size();
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2009, International Business Machines
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: regex.h
|
||||
@ -48,6 +48,7 @@
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
#include "unicode/uregex.h"
|
||||
@ -187,6 +188,35 @@ public:
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object. These compile methods, rather than the constructors, are the usual
|
||||
* way that RegexPattern objects are created.
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* <p>All pattern match mode flags are set to their default values.</p>
|
||||
*
|
||||
* <p>Note that it is often more convenient to construct a RegexMatcher directly
|
||||
* from a pattern string rather than separately compiling the pattern and
|
||||
* then creating a RegexMatcher object from the pattern.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param pe Receives the position (line and column numbers) of any error
|
||||
* within the regular expression.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
static RegexPattern * U_EXPORT2 compile( UText *regex,
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. These compile methods,
|
||||
@ -204,7 +234,7 @@ public:
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param flags The match mode flags to be used.
|
||||
* @param pe Receives the position (line and column nubers) of any error
|
||||
* @param pe Receives the position (line and column numbers) of any error
|
||||
* within the regular expression.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
@ -217,6 +247,36 @@ public:
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. These compile methods,
|
||||
* rather than the constructors, are the usual way that RegexPattern objects
|
||||
* are created.
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* <p>Note that it is often more convenient to construct a RegexMatcher directly
|
||||
* from a pattern string rather than separately compiling the pattern and
|
||||
* then creating a RegexMatcher object from the pattern.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param flags The match mode flags to be used.
|
||||
* @param pe Receives the position (line and column numbers) of any error
|
||||
* within the regular expression.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
static RegexPattern * U_EXPORT2 compile( UText *regex,
|
||||
uint32_t flags,
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. These compile methods,
|
||||
@ -244,6 +304,33 @@ public:
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. These compile methods,
|
||||
* rather than the constructors, are the usual way that RegexPattern objects
|
||||
* are created.
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* <p>Note that it is often more convenient to construct a RegexMatcher directly
|
||||
* from a pattern string rather than separately compiling the pattern and
|
||||
* then creating a RegexMatcher object from the pattern.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param flags The match mode flags to be used.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
static RegexPattern * U_EXPORT2 compile( UText *regex,
|
||||
uint32_t flags,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the match mode flags that were used when compiling this pattern.
|
||||
* @return the match mode flags
|
||||
@ -271,6 +358,27 @@ public:
|
||||
virtual RegexMatcher *matcher(const UnicodeString &input,
|
||||
UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Creates a RegexMatcher that will match the given input against this pattern. The
|
||||
* RegexMatcher can then be used to perform match, find or replace operations
|
||||
* on the input. Note that a RegexPattern object must not be deleted while
|
||||
* RegexMatchers created from it still exist and might possibly be used again.
|
||||
* <p>
|
||||
* The matcher will make a shallow clone of the supplied input text, and all regexp
|
||||
* pattern matching operations happen on this clone. While read-only operations on
|
||||
* the supplied text are permitted, it is critical that the underlying string not be
|
||||
* altered or deleted before use by the regular expression operations is complete.
|
||||
*
|
||||
* @param input The input text to which the regular expression will be applied.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A RegexMatcher object for this pattern and input.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual RegexMatcher *matcher(UText *input,
|
||||
UErrorCode &status) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
@ -280,6 +388,8 @@ private:
|
||||
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
|
||||
* using one of the aliasing constructors, such as
|
||||
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
|
||||
* or in a UText, using
|
||||
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
@ -318,17 +428,54 @@ public:
|
||||
*/
|
||||
static UBool U_EXPORT2 matches(const UnicodeString ®ex,
|
||||
const UnicodeString &input,
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Test whether a string matches a regular expression. This convenience function
|
||||
* both compiles the regular expression and applies it in a single operation.
|
||||
* Note that if the same pattern needs to be applied repeatedly, this method will be
|
||||
* less efficient than creating and reusing a RegexMatcher object.
|
||||
*
|
||||
* @param regex The regular expression
|
||||
* @param input The string data to be matched
|
||||
* @param pe Receives the position of any syntax errors within the regular expression
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return True if the regular expression exactly matches the full input string.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
static UBool U_EXPORT2 matches(UText *regex,
|
||||
UText *input,
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the regular expression from which this pattern was compiled.
|
||||
* @stable ICU 2.4
|
||||
* Returns the regular expression from which this pattern was compiled. This method will work
|
||||
* even if the pattern was compiled from a UText.
|
||||
*
|
||||
* Note: If the pattern was originally compiled from a UText, and that UText was modified,
|
||||
* the returned string may no longer reflect the RegexPattern object.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString pattern() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the regular expression from which this pattern was compiled. This method will work
|
||||
* even if the pattern was compiled from a UnicodeString.
|
||||
*
|
||||
* Note: This is the original input, not a clone. If the pattern was originally compiled from a
|
||||
* UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
|
||||
* object.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *patternText() const;
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
@ -360,6 +507,37 @@ public:
|
||||
UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
* For the best performance on split() operations,
|
||||
* <code>RegexMatcher::split</code> is preferable to this function.
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object)
|
||||
* @param dest An array of mutable UText structs to receive the results of the split.
|
||||
* If a field is NULL, a new UText is allocated to contain the results for
|
||||
* that field. This new UText is not guaranteed to be mutable.
|
||||
* @param destCapacity The number of elements in the destination array.
|
||||
* If the number of fields found is less than destCapacity, the
|
||||
* extra strings in the destination array are not altered.
|
||||
* If the number of destination strings is less than the number
|
||||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual int32_t split(UText *input,
|
||||
UText *dest[],
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
@ -378,7 +556,7 @@ private:
|
||||
//
|
||||
// Implementation Data
|
||||
//
|
||||
UnicodeString fPattern; // The original pattern string.
|
||||
UText *fPattern; // The original pattern string.
|
||||
uint32_t fFlags; // The flags used when compiling the pattern.
|
||||
//
|
||||
UVector32 *fCompiledPat; // The compiled pattern p-code.
|
||||
@ -421,6 +599,7 @@ private:
|
||||
UnicodeSet *fInitialChars;
|
||||
UChar32 fInitialChar;
|
||||
Regex8BitSet *fInitialChars8;
|
||||
UBool fNeedsAltInput;
|
||||
|
||||
friend class RegexCompile;
|
||||
friend class RegexMatcher;
|
||||
@ -468,6 +647,23 @@ public:
|
||||
*/
|
||||
RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct a RegexMatcher for a regular expression.
|
||||
* This is a convenience method that avoids the need to explicitly create
|
||||
* a RegexPattern object. Note that if several RegexMatchers need to be
|
||||
* created for the same expression, it will be more efficient to
|
||||
* separately create and cache a RegexPattern object, and use
|
||||
* its matcher() method to create the RegexMatcher objects.
|
||||
*
|
||||
* @param regexp The regular expression to be compiled.
|
||||
* @param flags Regular expression options, such as case insensitive matching.
|
||||
* @see UREGEX_CASE_INSENSITIVE
|
||||
* @param status Any errors are reported by setting this UErrorCode variable.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct a RegexMatcher for a regular expression.
|
||||
* This is a convenience method that avoids the need to explicitly create
|
||||
@ -492,6 +688,30 @@ public:
|
||||
RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
|
||||
uint32_t flags, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct a RegexMatcher for a regular expression.
|
||||
* This is a convenience method that avoids the need to explicitly create
|
||||
* a RegexPattern object. Note that if several RegexMatchers need to be
|
||||
* created for the same expression, it will be more efficient to
|
||||
* separately create and cache a RegexPattern object, and use
|
||||
* its matcher() method to create the RegexMatcher objects.
|
||||
* <p>
|
||||
* The matcher will make a shallow clone of the supplied input text, and all regexp
|
||||
* pattern matching operations happen on this clone. While read-only operations on
|
||||
* the supplied text are permitted, it is critical that the underlying string not be
|
||||
* altered or deleted before use by the regular expression operations is complete.
|
||||
*
|
||||
* @param regexp The Regular Expression to be compiled.
|
||||
* @param input The string to match. The matcher retains a shallow clone of the text.
|
||||
* @param flags Regular expression options, such as case insensitive matching.
|
||||
* @see UREGEX_CASE_INSENSITIVE
|
||||
* @param status Any errors are reported by setting this UErrorCode variable.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
RegexMatcher(UText *regexp, UText *input,
|
||||
uint32_t flags, UErrorCode &status);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
@ -501,6 +721,8 @@ private:
|
||||
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
|
||||
* using one of the aliasing constructors, such as
|
||||
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
|
||||
* or in a UText, using
|
||||
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
@ -525,6 +747,7 @@ public:
|
||||
*/
|
||||
virtual UBool matches(UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Resets the matcher, then attempts to match the input beginning
|
||||
* at the specified startIndex, and extending to the end of the input.
|
||||
@ -538,8 +761,6 @@ public:
|
||||
virtual UBool matches(int32_t startIndex, UErrorCode &status);
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, starting from the beginning of the region,
|
||||
* against the pattern. Like the matches() method, this function
|
||||
@ -571,6 +792,7 @@ public:
|
||||
*/
|
||||
virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Find the next pattern match in the input string.
|
||||
* The find begins searching the input at the location following the end of
|
||||
@ -610,6 +832,22 @@ public:
|
||||
virtual UnicodeString group(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text matched by the previous match.
|
||||
* If the pattern can match an empty string, an empty string may be returned.
|
||||
* @param dest A mutable UText in which the matching text is placed.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed.
|
||||
* @return A string containing the matched input text. If a pre-allocated UText
|
||||
* was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *group(UText *dest, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text captured by the given group
|
||||
* during the previous match operation. Group(0) is the entire match.
|
||||
@ -625,6 +863,24 @@ public:
|
||||
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text captured by the given group
|
||||
* during the previous match operation. Group(0) is the entire match.
|
||||
*
|
||||
* @param groupNum the capture group number
|
||||
* @param dest A mutable UText in which the matching text is placed.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed.
|
||||
* @return A string containing the matched input text. If a pre-allocated UText
|
||||
* was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the number of capturing groups in this matcher's pattern.
|
||||
* @return the number of capture groups
|
||||
@ -726,11 +982,31 @@ public:
|
||||
* Because no copy of the string is made, it is essential that the
|
||||
* caller not delete the string until after regexp operations on it
|
||||
* are done.
|
||||
* Note that while a reset on the matcher with an input string that is then
|
||||
* modified across/during matcher operations may be supported currently for UnicodeString,
|
||||
* this was not originally intended behavior, and support for this is not guaranteed
|
||||
* in upcoming versions of ICU.
|
||||
* @return this RegexMatcher.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher &reset(const UnicodeString &input);
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
||||
* to be reused, which is more efficient than creating a new RegexMatcher for
|
||||
* each input string to be processed.
|
||||
* @param input The new string on which subsequent pattern matches will operate.
|
||||
* The matcher makes a shallow clone of the given text; ownership of the
|
||||
* original string remains with the caller. Because no deep copy of the
|
||||
* text is made, it is essential that the caller not modify the string
|
||||
* until after regexp operations on it are done.
|
||||
* @return this RegexMatcher.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual RegexMatcher &reset(UText *input);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Cause a compilation error if an application accidentally attempts to
|
||||
@ -740,6 +1016,8 @@ private:
|
||||
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
|
||||
* using one of the aliasing constructors, such as
|
||||
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
|
||||
* or in a UText, using
|
||||
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
@ -747,13 +1025,34 @@ private:
|
||||
public:
|
||||
|
||||
/**
|
||||
* Returns the input string being matched. The returned string is not a copy,
|
||||
* but the live input string. It should not be altered or deleted.
|
||||
* Returns the input string being matched. Ownership of the string belongs to
|
||||
* the matcher; it should not be altered or deleted. This method will work even if the input
|
||||
* was originally supplied as a UText.
|
||||
* @return the input string
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual const UnicodeString &input() const;
|
||||
|
||||
/**
|
||||
* Returns the input string being matched. This is the live input text; it should not be
|
||||
* altered or deleted. This method will work even if the input was originally supplied as
|
||||
* a UnicodeString.
|
||||
* @return the input text
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *inputText() const;
|
||||
|
||||
/**
|
||||
* Returns the input string being matched, either by copying it into the provided
|
||||
* UText parameter or by returning a shallow clone of the live input. Note that copying
|
||||
* the entire input may cause significant performance and memory issues.
|
||||
* @param dest The UText into which the input should be copied, or NULL to create a new UText
|
||||
* @return dest if non-NULL, a shallow copy of the input text otherwise
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *getInput(UText *dest) const;
|
||||
|
||||
|
||||
/** Sets the limits of this matcher's region.
|
||||
@ -838,6 +1137,7 @@ public:
|
||||
*/
|
||||
virtual UBool hasAnchoringBounds() const;
|
||||
|
||||
|
||||
/**
|
||||
* Set whether this matcher is using Anchoring Bounds for its region.
|
||||
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
|
||||
@ -852,6 +1152,7 @@ public:
|
||||
*/
|
||||
virtual RegexMatcher &useAnchoringBounds(UBool b);
|
||||
|
||||
|
||||
/**
|
||||
* Return TRUE if the most recent matching operation touched the
|
||||
* end of the text being processed. In this case, additional input text could
|
||||
@ -878,9 +1179,6 @@ public:
|
||||
virtual UBool requireEnd() const;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the pattern that is interpreted by this matcher.
|
||||
* @return the RegexPattern for this RegexMatcher
|
||||
@ -908,6 +1206,29 @@ public:
|
||||
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Replaces every substring of the input that matches the pattern
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace-all operation.
|
||||
*
|
||||
* This method first resets this matcher. It then scans the input string
|
||||
* looking for matches of the pattern. Input that is not part of any
|
||||
* match is left unchanged; each match is replaced in the result by the
|
||||
* replacement string. The replacement string may contain references to
|
||||
* capture groups.
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param dest a mutable UText in which the results are placed.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a UText containing the results of the find and replace.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches
|
||||
* the pattern with the replacement string. This is a convenience
|
||||
@ -930,6 +1251,34 @@ public:
|
||||
*/
|
||||
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches
|
||||
* the pattern with the replacement string. This is a convenience
|
||||
* function that provides a complete find-and-replace operation.
|
||||
*
|
||||
* <p>This function first resets this RegexMatcher. It then scans the input string
|
||||
* looking for a match of the pattern. Input that is not part
|
||||
* of the match is appended directly to the result string; the match is replaced
|
||||
* in the result by the replacement string. The replacement string may contain
|
||||
* references to captured groups.</p>
|
||||
*
|
||||
* <p>The state of the matcher (the position at which a subsequent find()
|
||||
* would begin) after completing a replaceFirst() is not specified. The
|
||||
* RegexMatcher should be reset before doing additional find() operations.</p>
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param dest a mutable UText in which the results are placed.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a UText containing the results of the find and replace.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
* incremental find-and-replace.
|
||||
@ -961,6 +1310,37 @@ public:
|
||||
const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
* incremental find-and-replace.
|
||||
*
|
||||
* <p>The input string, starting from the end of the previous replacement and ending at
|
||||
* the start of the current match, is appended to the destination string. Then the
|
||||
* replacement string is appended to the output string,
|
||||
* including handling any substitutions of captured text.</p>
|
||||
*
|
||||
* <p>For simple, prepackaged, non-incremental find-and-replace
|
||||
* operations, see replaceFirst() or replaceAll().</p>
|
||||
*
|
||||
* @param dest A mutable UText to which the results of the find-and-replace are appended.
|
||||
* Must not be NULL.
|
||||
* @param replacement A UText that provides the text to be substituted for
|
||||
* the input text that matched the regexp pattern. The replacement
|
||||
* text may contain references to captured text from the input.
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
|
||||
* if the replacement text specifies a capture group that
|
||||
* does not exist in the pattern.
|
||||
*
|
||||
* @return this RegexMatcher
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual RegexMatcher &appendReplacement(UText *dest,
|
||||
UText *replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* As the final step in a find-and-replace operation, append the remainder
|
||||
* of the input string, starting at the position following the last appendReplacement(),
|
||||
@ -974,13 +1354,26 @@ public:
|
||||
virtual UnicodeString &appendTail(UnicodeString &dest);
|
||||
|
||||
|
||||
/**
|
||||
* As the final step in a find-and-replace operation, append the remainder
|
||||
* of the input string, starting at the position following the last appendReplacement(),
|
||||
* to the destination string. <code>appendTail()</code> is intended to be invoked after one
|
||||
* or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
|
||||
*
|
||||
* @param dest A mutable UText to which the results of the find-and-replace are appended.
|
||||
* Must not be NULL.
|
||||
* @return the destination UText.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual UText *appendTail(UText *dest);
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object). This matcher
|
||||
@ -1004,6 +1397,35 @@ public:
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object). This matcher
|
||||
* will be reset to this input string.
|
||||
* @param dest An array of mutable UText structs to receive the results of the split.
|
||||
* If a field is NULL, a new UText is allocated to contain the results for
|
||||
* that field. This new UText is not guaranteed to be mutable.
|
||||
* @param destCapacity The number of elements in the destination array.
|
||||
* If the number of fields found is less than destCapacity, the
|
||||
* extra strings in the destination array are not altered.
|
||||
* If the number of destination strings is less than the number
|
||||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
virtual int32_t split(UText *input,
|
||||
UText *dest[],
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Set a processing time limit for match operations with this Matcher.
|
||||
*
|
||||
@ -1086,7 +1508,6 @@ public:
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get the callback function for this URegularExpression.
|
||||
*
|
||||
@ -1132,7 +1553,7 @@ private:
|
||||
RegexMatcher(const RegexMatcher &other);
|
||||
RegexMatcher &operator =(const RegexMatcher &rhs);
|
||||
void init(UErrorCode &status); // Common initialization
|
||||
void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2.
|
||||
void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
|
||||
|
||||
friend class RegexPattern;
|
||||
friend class RegexCImpl;
|
||||
@ -1145,34 +1566,43 @@ private:
|
||||
// MatchAt This is the internal interface to the match engine itself.
|
||||
// Match status comes back in matcher member variables.
|
||||
//
|
||||
void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
|
||||
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
|
||||
void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
|
||||
inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
|
||||
UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
|
||||
REStackFrame *resetStack();
|
||||
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
|
||||
void IncrementTime(UErrorCode &status);
|
||||
|
||||
int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
|
||||
|
||||
UBool findUsingChunk();
|
||||
void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
|
||||
UBool isChunkWordBoundary(int32_t pos);
|
||||
|
||||
const RegexPattern *fPattern;
|
||||
RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
|
||||
// should delete it when through.
|
||||
|
||||
const UnicodeString *fInput; // The text being matched. Is never NULL.
|
||||
const UnicodeString *fInput; // The string being matched. Only used for input()
|
||||
UText *fInputText; // The text being matched. Is never NULL.
|
||||
UText *fAltInputText; // A shallow copy of the text being matched.
|
||||
// Only created if the pattern contains backreferences.
|
||||
int64_t fInputLength; // Full length of the input text.
|
||||
int32_t fFrameSize; // The size of a frame in the backtrack stack.
|
||||
|
||||
int32_t fRegionStart; // Start of the input region, default = 0.
|
||||
int32_t fRegionLimit; // End of input region, default to input.length.
|
||||
int64_t fRegionStart; // Start of the input region, default = 0.
|
||||
int64_t fRegionLimit; // End of input region, default to input.length.
|
||||
|
||||
int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
|
||||
int32_t fAnchorLimit; // See useAnchoringBounds
|
||||
int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
|
||||
int64_t fAnchorLimit; // See useAnchoringBounds
|
||||
|
||||
int32_t fLookStart; // Region bounds for look-ahead/behind and
|
||||
int32_t fLookLimit; // and other boundary tests. See
|
||||
int64_t fLookStart; // Region bounds for look-ahead/behind and
|
||||
int64_t fLookLimit; // and other boundary tests. See
|
||||
// useTransparentBounds
|
||||
|
||||
int32_t fActiveStart; // Currently active bounds for matching.
|
||||
int32_t fActiveLimit; // Usually is the same as region, but
|
||||
int64_t fActiveStart; // Currently active bounds for matching.
|
||||
int64_t fActiveLimit; // Usually is the same as region, but
|
||||
// is changed to fLookStart/Limit when
|
||||
// entering look around regions.
|
||||
|
||||
@ -1180,13 +1610,13 @@ private:
|
||||
UBool fAnchoringBounds; // True if using anchoring bounds.
|
||||
|
||||
UBool fMatch; // True if the last attempted match was successful.
|
||||
int32_t fMatchStart; // Position of the start of the most recent match
|
||||
int32_t fMatchEnd; // First position after the end of the most recent match
|
||||
int64_t fMatchStart; // Position of the start of the most recent match
|
||||
int64_t fMatchEnd; // First position after the end of the most recent match
|
||||
// Zero if no previous match, even when a region
|
||||
// is active.
|
||||
int32_t fLastMatchEnd; // First position after the end of the previous match,
|
||||
int64_t fLastMatchEnd; // First position after the end of the previous match,
|
||||
// or -1 if there was no previous match.
|
||||
int32_t fAppendPosition; // First position after the end of the previous
|
||||
int64_t fAppendPosition; // First position after the end of the previous
|
||||
// appendReplacement(). As described by the
|
||||
// JavaDoc for Java Matcher, where it is called
|
||||
// "append position"
|
||||
@ -1218,6 +1648,8 @@ private:
|
||||
// NULL if there is no callback.
|
||||
const void *fCallbackContext; // User Context ptr for callback function.
|
||||
|
||||
UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
|
||||
|
||||
UBool fTraceDebug; // Set true for debug tracing of match engine.
|
||||
|
||||
UErrorCode fDeferredStatus; // Save error state that cannot be immediately
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (C) 2004-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: regex.h
|
||||
* file name: uregex.h
|
||||
* encoding: US-ASCII
|
||||
* indentation:4
|
||||
*
|
||||
@ -23,6 +23,7 @@
|
||||
#ifndef UREGEX_H
|
||||
#define UREGEX_H
|
||||
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
@ -113,6 +114,7 @@ typedef enum URegexpFlag{
|
||||
* The resulting regular expression handle can then be used to perform various
|
||||
* matching operations.
|
||||
*
|
||||
*
|
||||
* @param pattern The Regular Expression pattern to be compiled.
|
||||
* @param patternLength The length of the pattern, or -1 if the pattern is
|
||||
* NUL terminated.
|
||||
@ -135,6 +137,35 @@ uregex_open( const UChar *pattern,
|
||||
UParseError *pe,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Open (compile) an ICU regular expression. Compiles the regular expression in
|
||||
* string form into an internal representation using the specified match mode flags.
|
||||
* The resulting regular expression handle can then be used to perform various
|
||||
* matching operations.
|
||||
* <p>
|
||||
* The contents of the pattern UText will be extracted and saved. Ownership of the
|
||||
* UText struct itself remains with the caller. This is to match the behavior of
|
||||
* uregex_open().
|
||||
*
|
||||
* @param pattern The Regular Expression pattern to be compiled.
|
||||
* @param flags Flags that alter the default matching behavior for
|
||||
* the regular expression, UREGEX_CASE_INSENSITIVE, for
|
||||
* example. For default behavior, set this parameter to zero.
|
||||
* See <code>enum URegexpFlag</code>. All desired flags
|
||||
* are bitwise-ORed together.
|
||||
* @param pe Receives the position (line and column numbers) of any syntax
|
||||
* error within the source regular expression string. If this
|
||||
* information is not wanted, pass NULL for this parameter.
|
||||
* @param status Receives error detected by this function.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL URegularExpression * U_EXPORT2
|
||||
uregex_openUText(UText *pattern,
|
||||
uint32_t flags,
|
||||
UParseError *pe,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Open (compile) an ICU regular expression. The resulting regular expression
|
||||
* handle can then be used to perform various matching operations.
|
||||
@ -219,7 +250,8 @@ U_STABLE URegularExpression * U_EXPORT2
|
||||
uregex_clone(const URegularExpression *regexp, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Return a pointer to the source form of the pattern for this regular expression.
|
||||
* Returns a pointer to the source form of the pattern for this regular expression.
|
||||
* This function will work even if the pattern was originally specified as a UText.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param patLength This output parameter will be set to the length of the
|
||||
@ -235,9 +267,24 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status);
|
||||
* @stable ICU 3.0
|
||||
*/
|
||||
U_STABLE const UChar * U_EXPORT2
|
||||
uregex_pattern(const URegularExpression *regexp,
|
||||
int32_t *patLength,
|
||||
UErrorCode *status);
|
||||
uregex_pattern(const URegularExpression *regexp,
|
||||
int32_t *patLength,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Returns the source text of the pattern for this regular expression.
|
||||
* This function will work even if the pattern was originally specified as a UChar string.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status Receives errors detected by this function.
|
||||
* @return the pattern text. The storage for the text is owned by the regular expression
|
||||
* object, and must not be altered or deleted.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_patternUText(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
@ -279,10 +326,36 @@ uregex_setText(URegularExpression *regexp,
|
||||
int32_t textLength,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Set the subject text string upon which the regular expression will look for matches.
|
||||
* This function may be called any number of times, allowing the regular
|
||||
* expression pattern to be applied to different strings.
|
||||
* <p>
|
||||
* Regular expression matching operations work directly on the application's
|
||||
* string data; only a shallow clone is made. The subject string data must not be
|
||||
* altered after calling this function until after all regular expression
|
||||
* operations involving this string data are completed.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param text The subject text string.
|
||||
* @param status Receives errors detected by this function.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL void U_EXPORT2
|
||||
uregex_setUText(URegularExpression *regexp,
|
||||
UText *text,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the subject text that is currently associated with this
|
||||
* regular expression object. This simply returns whatever string
|
||||
* pointer was previously supplied via uregex_setText().
|
||||
* regular expression object. If the input was supplied using uregex_setText(),
|
||||
* that pointer will be returned. Otherwise, the characters in the input will
|
||||
* be extracted to a buffer and returned. In either case, ownership remains
|
||||
* with the regular expression object.
|
||||
*
|
||||
* This function will work even if the input was originally specified as a UText.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param textLength The length of the string is returned in this output parameter.
|
||||
@ -291,7 +364,7 @@ uregex_setText(URegularExpression *regexp,
|
||||
* the text is known in advance to be a NUL terminated
|
||||
* string.
|
||||
* @param status Receives errors detected by this function.
|
||||
* @return Poiner to the subject text string currently associated with
|
||||
* @return Pointer to the subject text string currently associated with
|
||||
* this regular expression.
|
||||
* @stable ICU 3.0
|
||||
*/
|
||||
@ -300,6 +373,28 @@ uregex_getText(URegularExpression *regexp,
|
||||
int32_t *textLength,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the subject text that is currently associated with this
|
||||
* regular expression object.
|
||||
*
|
||||
* This function will work even if the input was originally specified as a UChar string.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param dest A mutable UText in which to store the current input.
|
||||
* If NULL, a new UText will be created as an immutable shallow clone
|
||||
* of the actual input string.
|
||||
* @param status Receives errors detected by this function.
|
||||
* @return The subject text currently associated with this regular expression.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_getUText(URegularExpression *regexp,
|
||||
UText *dest,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Attempts to match the input string against the pattern.
|
||||
* To succeed, the match must extend to the end of the string,
|
||||
@ -428,6 +523,29 @@ uregex_group(URegularExpression *regexp,
|
||||
int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
/** Extract the string for the specified matching expression or subexpression.
|
||||
* Group #0 is the complete string of matched text.
|
||||
* Group #1 is the text matched by the first set of capturing parentheses.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param groupNum The capture group to extract. Group 0 is the complete
|
||||
* match. The value of this parameter must be
|
||||
* less than or equal to the number of capture groups in
|
||||
* the pattern.
|
||||
* @param dest Mutable UText to receive the matching string data.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The matching string data. If a pre-allocated UText was provided,
|
||||
* it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_groupUText(URegularExpression *regexp,
|
||||
int32_t groupNum,
|
||||
UText *dest,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the start of the text matched by the
|
||||
@ -676,6 +794,32 @@ uregex_replaceAll(URegularExpression *regexp,
|
||||
int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Replaces every substring of the input that matches the pattern
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace-all operation.
|
||||
*
|
||||
* This method scans the input string looking for matches of the pattern.
|
||||
* Input that is not part of any match is copied unchanged to the
|
||||
* destination buffer. Matched regions are replaced in the output
|
||||
* buffer by the replacement string. The replacement string may contain
|
||||
* references to capture groups; these take the form of $1, $2, etc.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param replacement A string containing the replacement text.
|
||||
* @param dest A mutable UText that will receive the result.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A UText containing the results of the find and replace.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_replaceAllUText(URegularExpression *regexp,
|
||||
UText *replacement,
|
||||
UText *dest,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches the pattern
|
||||
@ -709,6 +853,33 @@ uregex_replaceFirst(URegularExpression *regexp,
|
||||
int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches the pattern
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace operation.
|
||||
*
|
||||
* This method scans the input string looking for a match of the pattern.
|
||||
* All input that is not part of the match is copied unchanged to the
|
||||
* destination buffer. The matched region is replaced in the output
|
||||
* buffer by the replacement string. The replacement string may contain
|
||||
* references to capture groups; these take the form of $1, $2, etc.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param replacement A string containing the replacement text.
|
||||
* @param dest A mutable UText that will receive the result.
|
||||
* If NULL, a new UText will be created (which may not be mutable).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A UText containing the results of the find and replace.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_replaceFirstUText(URegularExpression *regexp,
|
||||
UText *replacement,
|
||||
UText *dest,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
@ -758,11 +929,40 @@ uregex_replaceFirst(URegularExpression *regexp,
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uregex_appendReplacement(URegularExpression *regexp,
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status);
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
* incremental find-and-replace.
|
||||
*
|
||||
* <p>The input string, starting from the end of the previous match and ending at
|
||||
* the start of the current match, is appended to the destination string. Then the
|
||||
* replacement string is appended to the output string,
|
||||
* including handling any substitutions of captured text.</p>
|
||||
*
|
||||
* <p>For simple, prepackaged, non-incremental find-and-replace
|
||||
* operations, see replaceFirst() or replaceAll().</p>
|
||||
*
|
||||
* @param regexp The regular expression object.
|
||||
* @param replacementText The string that will replace the matched portion of the
|
||||
* input string as it is copied to the destination buffer.
|
||||
* The replacement text may contain references ($1, for
|
||||
* example) to capture groups from the match.
|
||||
* @param dest A mutable UText that will receive the result. Must not be NULL.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL void U_EXPORT2
|
||||
uregex_appendReplacementUText(URegularExpression *regexp,
|
||||
UText *replacementText,
|
||||
UText *dest,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
@ -796,6 +996,26 @@ uregex_appendTail(URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* As the final step in a find-and-replace operation, append the remainder
|
||||
* of the input string, starting at the position following the last match,
|
||||
* to the destination string. <code>uregex_appendTailUText()</code> is intended
|
||||
* to be invoked after one or more invocations of the
|
||||
* <code>uregex_appendReplacementUText()</code> function.
|
||||
*
|
||||
* @param regexp The regular expression object. This is needed to
|
||||
* obtain the input string along with the position
|
||||
* of the last match within it.
|
||||
* @param dest A mutable UText that will receive the result. Must not be NULL.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The destination UText.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL UText * U_EXPORT2
|
||||
uregex_appendTailUText(URegularExpression *regexp,
|
||||
UText *dest);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
@ -808,6 +1028,22 @@ uregex_appendTail(URegularExpression *regexp,
|
||||
* buffer, and NUL terminated. The position of each field within
|
||||
* the destination buffer is returned in the destFields array.
|
||||
*
|
||||
* Note: another choice for the design of this function would be to not
|
||||
* copy the resulting fields at all, but to return indexes and
|
||||
* lengths within the source text.
|
||||
* Advantages would be
|
||||
* o Faster. No Copying.
|
||||
* o Nothing extra needed when field data may contain embedded NUL chars.
|
||||
* o Less memory needed if working on large data.
|
||||
* Disadvantages
|
||||
* o Less consistent with C++ split, which copies into an
|
||||
* array of UnicodeStrings.
|
||||
* o No NUL termination, extracted fields would be less convenient
|
||||
* to use in most cases.
|
||||
* o Possible problems in the future, when support for Unicode Normalization
|
||||
* could cause the fields to not correspond exactly to
|
||||
* a range of the source text.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param destBuf A (UChar *) buffer to receive the fields that
|
||||
* are extracted from the input string. These
|
||||
@ -846,6 +1082,39 @@ uregex_split( URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
* The behavior of this function is not very closely aligned with uregex_split();
|
||||
* instead, it is based on (and implemented directly on top of) the C++ split method.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param destFields An array of mutable UText structs to receive the results of the split.
|
||||
* If a field is NULL, a new UText is allocated to contain the results for
|
||||
* that field. This new UText is not guaranteed to be mutable.
|
||||
* @param destFieldsCapacity The number of elements in the destination array.
|
||||
* If the number of fields found is less than destCapacity, the
|
||||
* extra strings in the destination array are not altered.
|
||||
* If the number of destination strings is less than the number
|
||||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* This behavior mimics that of Perl. It is not an error condition, and no
|
||||
* error status is returned when all destField positions are used.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
*
|
||||
* @internal ICU 4.4 technology preview
|
||||
*/
|
||||
U_INTERNAL int32_t U_EXPORT2
|
||||
uregex_splitUText(URegularExpression *regexp,
|
||||
UText *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004-2009, International Business Machines
|
||||
* Copyright (C) 2004-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: regex.cpp
|
||||
@ -20,8 +20,14 @@
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#include "regextxt.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
|
||||
|
||||
struct RegularExpression: public UMemory {
|
||||
public:
|
||||
RegularExpression();
|
||||
@ -35,9 +41,7 @@ public:
|
||||
const UChar *fText; // Text from setText()
|
||||
int32_t fTextLength; // Length provided by user with setText(), which
|
||||
// may be -1.
|
||||
|
||||
UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
|
||||
// TODO: regexp engine should not depend on UnicodeString.
|
||||
UBool fOwnsText;
|
||||
};
|
||||
|
||||
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
|
||||
@ -51,6 +55,7 @@ RegularExpression::RegularExpression() {
|
||||
fMatcher = NULL;
|
||||
fText = NULL;
|
||||
fTextLength = 0;
|
||||
fOwnsText = FALSE;
|
||||
}
|
||||
|
||||
RegularExpression::~RegularExpression() {
|
||||
@ -61,6 +66,9 @@ RegularExpression::~RegularExpression() {
|
||||
uprv_free(fPatString);
|
||||
uprv_free(fPatRefCount);
|
||||
}
|
||||
if (fOwnsText && fText!=NULL) {
|
||||
uprv_free((void *)fText);
|
||||
}
|
||||
fMagic = 0;
|
||||
}
|
||||
|
||||
@ -81,7 +89,8 @@ static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool r
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
if (requiresText && re->fText == NULL) {
|
||||
// !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
|
||||
if (requiresText && re->fText == NULL && !re->fOwnsText) {
|
||||
*status = U_REGEX_INVALID_STATE;
|
||||
return FALSE;
|
||||
}
|
||||
@ -127,23 +136,110 @@ uregex_open( const UChar *pattern,
|
||||
|
||||
//
|
||||
// Make a copy of the pattern string, so we can return it later if asked.
|
||||
// For compiling the pattern, we will use a read-only-aliased UnicodeString
|
||||
// of this local copy, to avoid making even more copies.
|
||||
// For compiling the pattern, we will use a UText wrapper around
|
||||
// this local copy, to avoid making even more copies.
|
||||
//
|
||||
re->fPatString = patBuf;
|
||||
re->fPatStringLen = patternLength;
|
||||
u_memcpy(patBuf, pattern, actualPatLen);
|
||||
patBuf[actualPatLen] = 0;
|
||||
UnicodeString patString(patternLength==-1, patBuf, patternLength);
|
||||
|
||||
UText patText = UTEXT_INITIALIZER;
|
||||
utext_openUChars(&patText, patBuf, patternLength, status);
|
||||
|
||||
//
|
||||
// Compile the pattern
|
||||
//
|
||||
if (pe != NULL) {
|
||||
re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
|
||||
re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
|
||||
} else {
|
||||
re->fPat = RegexPattern::compile(patString, flags, *status);
|
||||
re->fPat = RegexPattern::compile(&patText, flags, *status);
|
||||
}
|
||||
utext_close(&patText);
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
goto ErrorExit;
|
||||
}
|
||||
|
||||
//
|
||||
// Create the matcher object
|
||||
//
|
||||
re->fMatcher = re->fPat->matcher(*status);
|
||||
if (U_SUCCESS(*status)) {
|
||||
return (URegularExpression*)re;
|
||||
}
|
||||
|
||||
ErrorExit:
|
||||
delete re;
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_openUText
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
U_CAPI URegularExpression * U_EXPORT2
|
||||
uregex_openUText(UText *pattern,
|
||||
uint32_t flags,
|
||||
UParseError *pe,
|
||||
UErrorCode *status) {
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
if (pattern == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int64_t patternNativeLength = utext_nativeLength(pattern);
|
||||
|
||||
if (patternNativeLength == 0) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
RegularExpression *re = new RegularExpression;
|
||||
|
||||
UErrorCode lengthStatus = U_ZERO_ERROR;
|
||||
int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
|
||||
|
||||
int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
|
||||
UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
|
||||
if (re == NULL || refC == NULL || patBuf == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete re;
|
||||
uprv_free(refC);
|
||||
uprv_free(patBuf);
|
||||
return NULL;
|
||||
}
|
||||
re->fPatRefCount = refC;
|
||||
*re->fPatRefCount = 1;
|
||||
|
||||
//
|
||||
// Make a copy of the pattern string, so we can return it later if asked.
|
||||
// For compiling the pattern, we will use a read-only UText wrapper
|
||||
// around this local copy, to avoid making even more copies.
|
||||
//
|
||||
re->fPatString = patBuf;
|
||||
re->fPatStringLen = pattern16Length;
|
||||
utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
|
||||
|
||||
UText patText = UTEXT_INITIALIZER;
|
||||
utext_openUChars(&patText, patBuf, pattern16Length, status);
|
||||
|
||||
//
|
||||
// Compile the pattern
|
||||
//
|
||||
if (pe != NULL) {
|
||||
re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
|
||||
} else {
|
||||
re->fPat = RegexPattern::compile(&patText, flags, *status);
|
||||
}
|
||||
utext_close(&patText);
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
goto ErrorExit;
|
||||
}
|
||||
@ -222,8 +318,8 @@ uregex_clone(const URegularExpression *source2, UErrorCode *status) {
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI const UChar * U_EXPORT2
|
||||
uregex_pattern(const URegularExpression *regexp2,
|
||||
int32_t *patLength,
|
||||
UErrorCode *status) {
|
||||
int32_t *patLength,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
@ -236,6 +332,20 @@ uregex_pattern(const URegularExpression *regexp2,
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_patternUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
/**
 * Return the pattern of a compiled regular expression as a UText.
 *
 * The result is whatever RegexPattern::patternText() hands back; this
 * wrapper makes no copy of its own.
 *
 * @param regexp2  The compiled regular expression.
 * @param status   In/out error code. Any failure detected by validateRE()
 *                 is reported here.
 * @return         The pattern text, or NULL when validation fails.
 */
U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp2,
                          UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;

    // Same guard uregex_pattern() applies. The pattern can be queried before
    // any input text has been set, hence requiresText == FALSE.
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return NULL;
    }
    return regexp->fPat->patternText();
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_flags
|
||||
@ -270,12 +380,48 @@ uregex_setText(URegularExpression *regexp2,
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (regexp->fOwnsText && regexp->fText != NULL) {
|
||||
uprv_free((void *)regexp->fText);
|
||||
}
|
||||
|
||||
regexp->fText = text;
|
||||
regexp->fTextLength = textLength;
|
||||
UBool isTerminated = (textLength == -1);
|
||||
regexp->fOwnsText = FALSE;
|
||||
|
||||
regexp->fTextString.setTo(isTerminated, text, textLength);
|
||||
regexp->fMatcher->reset(regexp->fTextString);
|
||||
UText input = UTEXT_INITIALIZER;
|
||||
utext_openUChars(&input, text, textLength, status);
|
||||
regexp->fMatcher->reset(&input);
|
||||
utext_close(&input); // reset() made a shallow clone, so we don't need this copy
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_setUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
/**
 * Set the subject text of the regular expression to the given UText.
 *
 * @param regexp2  The compiled regular expression.
 * @param text     The new input text; must not be NULL.
 * @param status   In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                 text is NULL.
 */
U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression *regexp2,
                UText              *text,
                UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, status, FALSE)) {
        return;
    }
    if (text == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Drop a UChar buffer that this object allocated for a previous input.
    if (re->fText != NULL && re->fOwnsText) {
        uprv_free((void *)re->fText);
    }

    // The flat UChar view of the input is built lazily, only on request.
    // fOwnsText == TRUE with fText == NULL is what lets validateRE() accept
    // this object as having text.
    re->fOwnsText   = TRUE;
    re->fTextLength = -1;
    re->fText       = NULL;

    re->fMatcher->reset(text);
}
|
||||
|
||||
|
||||
@ -293,6 +439,26 @@ uregex_getText(URegularExpression *regexp2,
|
||||
if (validateRE(regexp, status, FALSE) == FALSE) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (regexp->fText == NULL) {
|
||||
// need to fill in the text
|
||||
UText *inputText = regexp->fMatcher->inputText();
|
||||
int64_t inputNativeLength = utext_nativeLength(inputText);
|
||||
if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
|
||||
regexp->fText = inputText->chunkContents;
|
||||
regexp->fTextLength = inputNativeLength;
|
||||
regexp->fOwnsText = FALSE; // because the UText owns it
|
||||
} else {
|
||||
UErrorCode lengthStatus = U_ZERO_ERROR;
|
||||
regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
|
||||
UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
|
||||
|
||||
utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
|
||||
regexp->fText = inputChars;
|
||||
regexp->fOwnsText = TRUE; // should already be set but just in case
|
||||
}
|
||||
}
|
||||
|
||||
if (textLength != NULL) {
|
||||
*textLength = regexp->fTextLength;
|
||||
}
|
||||
@ -300,6 +466,23 @@ uregex_getText(URegularExpression *regexp2,
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_getUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
/**
 * Fetch the subject text of the regular expression as a UText.
 *
 * @param regexp2  The compiled regular expression.
 * @param dest     Passed straight through to RegexMatcher::getInput().
 * @param status   In/out error code, checked and set by validateRE().
 * @return         The result of RegexMatcher::getInput(dest), or dest
 *                 unchanged when validation fails.
 */
U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp2,
                UText              *dest,
                UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, status, FALSE)) {
        return re->fMatcher->getInput(dest);
    }
    return dest;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_matches
|
||||
@ -424,36 +607,90 @@ uregex_group(URegularExpression *regexp2,
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Pick up the range of characters from the matcher
|
||||
//
|
||||
int32_t startIx = regexp->fMatcher->start(groupNum, *status);
|
||||
int32_t endIx = regexp->fMatcher->end (groupNum, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if (destCapacity == 0 || regexp->fText != NULL) {
|
||||
// If preflighting or if we already have the text as UChars,
|
||||
// this is a little cheaper than going through uregex_groupUText()
|
||||
|
||||
//
|
||||
// Trim length based on buffer capacity
|
||||
//
|
||||
int32_t fullLength = endIx - startIx;
|
||||
int32_t copyLength = fullLength;
|
||||
if (copyLength < destCapacity) {
|
||||
dest[copyLength] = 0;
|
||||
} else if (copyLength == destCapacity) {
|
||||
*status = U_STRING_NOT_TERMINATED_WARNING;
|
||||
//
|
||||
// Pick up the range of characters from the matcher
|
||||
//
|
||||
int32_t startIx = regexp->fMatcher->start(groupNum, *status);
|
||||
int32_t endIx = regexp->fMatcher->end (groupNum, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Trim length based on buffer capacity
|
||||
//
|
||||
int32_t fullLength = endIx - startIx;
|
||||
int32_t copyLength = fullLength;
|
||||
if (copyLength < destCapacity) {
|
||||
dest[copyLength] = 0;
|
||||
} else if (copyLength == destCapacity) {
|
||||
*status = U_STRING_NOT_TERMINATED_WARNING;
|
||||
} else {
|
||||
copyLength = destCapacity;
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
//
|
||||
// Copy capture group to user's buffer
|
||||
//
|
||||
if (copyLength > 0) {
|
||||
u_memcpy(dest, &regexp->fText[startIx], copyLength);
|
||||
}
|
||||
return fullLength;
|
||||
} else {
|
||||
copyLength = destCapacity;
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status);
|
||||
int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
|
||||
utext_close(groupText);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_groupUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI UText * U_EXPORT2
|
||||
uregex_groupUText(URegularExpression *regexp2,
|
||||
int32_t groupNum,
|
||||
UText *dest,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
UErrorCode emptyTextStatus = U_ZERO_ERROR;
|
||||
return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
|
||||
}
|
||||
|
||||
//
|
||||
// Copy capture group to user's buffer
|
||||
//
|
||||
if (copyLength > 0) {
|
||||
u_memcpy(dest, &regexp->fText[startIx], copyLength);
|
||||
if (regexp->fText != NULL) {
|
||||
//
|
||||
// Pick up the range of characters from the matcher
|
||||
// and use our already-extracted characters
|
||||
//
|
||||
int32_t startIx = regexp->fMatcher->start(groupNum, *status);
|
||||
int32_t endIx = regexp->fMatcher->end (groupNum, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
UErrorCode emptyTextStatus = U_ZERO_ERROR;
|
||||
return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
|
||||
}
|
||||
|
||||
if (dest) {
|
||||
utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
|
||||
} else {
|
||||
UText groupText = UTEXT_INITIALIZER;
|
||||
utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
|
||||
dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
|
||||
utext_close(&groupText);
|
||||
}
|
||||
|
||||
return dest;
|
||||
} else {
|
||||
return regexp->fMatcher->group(groupNum, dest, *status);
|
||||
}
|
||||
return fullLength;
|
||||
}
|
||||
|
||||
|
||||
@ -582,8 +819,8 @@ uregex_hasTransparentBounds(const URegularExpression *regexp2,
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI void U_EXPORT2
|
||||
uregex_useTransparentBounds(URegularExpression *regexp2,
|
||||
UBool b,
|
||||
UErrorCode *status) {
|
||||
UBool b,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return;
|
||||
@ -599,7 +836,7 @@ uregex_useTransparentBounds(URegularExpression *regexp2,
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uregex_hasAnchoringBounds(const URegularExpression *regexp2,
|
||||
UErrorCode *status) {
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return FALSE;
|
||||
@ -615,8 +852,8 @@ uregex_hasAnchoringBounds(const URegularExpression *regexp2,
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI void U_EXPORT2
|
||||
uregex_useAnchoringBounds(URegularExpression *regexp2,
|
||||
UBool b,
|
||||
UErrorCode *status) {
|
||||
UBool b,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return;
|
||||
@ -699,8 +936,8 @@ uregex_getTimeLimit(const URegularExpression *regexp2,
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI void U_EXPORT2
|
||||
uregex_setStackLimit(URegularExpression *regexp2,
|
||||
int32_t limit,
|
||||
UErrorCode *status) {
|
||||
int32_t limit,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status)) {
|
||||
regexp->fMatcher->setStackLimit(limit, *status);
|
||||
@ -716,7 +953,7 @@ uregex_setStackLimit(URegularExpression *regexp2,
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_getStackLimit(const URegularExpression *regexp2,
|
||||
UErrorCode *status) {
|
||||
UErrorCode *status) {
|
||||
int32_t retVal = 0;
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status)) {
|
||||
@ -738,7 +975,7 @@ uregex_setMatchCallback(URegularExpression *regexp2,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status)) {
|
||||
regexp->fMatcher->setMatchCallback(callback, context, *status);
|
||||
regexp->fMatcher->setMatchCallback(callback, context, *status);
|
||||
}
|
||||
}
|
||||
|
||||
@ -810,6 +1047,30 @@ uregex_replaceAll(URegularExpression *regexp2,
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_replaceAllUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
/**
 * Replace every match in the input with the replacement text, delivering
 * the result as a UText.
 *
 * @param regexp2          The compiled regular expression.
 * @param replacementText  The replacement text; must not be NULL.
 * @param dest             Passed through to RegexMatcher::replaceAll().
 * @param status           In/out error code. Set to
 *                         U_ILLEGAL_ARGUMENT_ERROR when replacementText
 *                         is NULL.
 * @return                 The UText returned by RegexMatcher::replaceAll(),
 *                         or NULL when an argument check fails.
 */
U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression *regexp2,
                       UText              *replacementText,
                       UText              *dest,
                       UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, status)) {
        return NULL;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    return re->fMatcher->replaceAll(replacementText, dest, *status);
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_replaceFirst
|
||||
@ -847,6 +1108,30 @@ uregex_replaceFirst(URegularExpression *regexp2,
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_replaceFirstUText
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
/**
 * Replace the first match in the input with the replacement text,
 * delivering the result as a UText.
 *
 * @param regexp2          The compiled regular expression.
 * @param replacementText  The replacement text; must not be NULL.
 * @param dest             Passed through to RegexMatcher::replaceFirst().
 * @param status           In/out error code. Set to
 *                         U_ILLEGAL_ARGUMENT_ERROR when replacementText
 *                         is NULL.
 * @return                 The UText returned by
 *                         RegexMatcher::replaceFirst(), or NULL when an
 *                         argument check fails.
 */
U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression *regexp2,
                         UText              *replacementText,
                         UText              *dest,
                         UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, status)) {
        return NULL;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    return re->fMatcher->replaceFirst(replacementText, dest, *status);
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_appendReplacement
|
||||
@ -868,28 +1153,23 @@ class RegexCImpl {
|
||||
UErrorCode *status);
|
||||
|
||||
inline static int32_t appendTail(RegularExpression *regexp,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status);
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
inline static int32_t split(RegularExpression *regexp,
|
||||
UChar *destBuf,
|
||||
int32_t destCapacity,
|
||||
int32_t *requiredCapacity,
|
||||
UChar *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Call-back function for u_unescapeAt(), used when we encounter
|
||||
// \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
|
||||
//
|
||||
U_CDECL_BEGIN
|
||||
// Character-fetch callback handed to u_unescapeAt(): treats context as a
// UChar array and returns the code unit at the requested offset.
static UChar U_CALLCONV
unescape_charAt(int32_t offset, void *context) {
    UChar *text = (UChar *)context;
    return text[offset];
}
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
static const UChar BACKSLASH = 0x5c;
|
||||
static const UChar DOLLARSIGN = 0x24;
|
||||
|
||||
@ -910,11 +1190,11 @@ static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCap
|
||||
// appendReplacement, the actual implementation.
|
||||
//
|
||||
int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status) {
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status) {
|
||||
|
||||
// If we come in with a buffer overflow error, don't suppress the operation.
|
||||
// A series of appendReplacements, appendTail need to correctly preflight
|
||||
@ -958,12 +1238,29 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
||||
}
|
||||
|
||||
// Copy input string from the end of previous match to start of current match
|
||||
for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
|
||||
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
|
||||
if (regexp->fText != NULL) {
|
||||
int32_t matchStart;
|
||||
int32_t lastMatchEnd;
|
||||
if (UTEXT_USES_U16(m->fInputText)) {
|
||||
lastMatchEnd = m->fLastMatchEnd;
|
||||
matchStart = m->fMatchStart;
|
||||
} else {
|
||||
// !!!: Would like a better way to do this!
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
|
||||
status = U_ZERO_ERROR;
|
||||
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
|
||||
}
|
||||
for (i=lastMatchEnd; i<matchStart; i++) {
|
||||
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
|
||||
}
|
||||
} else {
|
||||
UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
|
||||
destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
|
||||
&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// scan the replacement text, looking for substitutions ($n) and \escapes.
|
||||
int32_t replIdx = 0;
|
||||
while (replIdx < replacementLength) {
|
||||
@ -990,7 +1287,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
||||
if (c==0x55/*U*/ || c==0x75/*u*/) {
|
||||
// We have a \udddd or \Udddddddd escape sequence.
|
||||
UChar32 escapedChar =
|
||||
u_unescapeAt(unescape_charAt,
|
||||
u_unescapeAt(uregex_ucstr_unescape_charAt,
|
||||
&replIdx, // Index is updated by unescapeAt
|
||||
replacementLength, // Length of replacement text
|
||||
(void *)replacementText);
|
||||
@ -1050,11 +1347,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
||||
}
|
||||
|
||||
// Finally, append the capture group data to the destination.
|
||||
int32_t capacityRemaining = capacity - destIdx;
|
||||
if (capacityRemaining < 0) {
|
||||
capacityRemaining = 0;
|
||||
}
|
||||
destIdx += uregex_group((URegularExpression*)regexp, groupNum, dest+destIdx, capacityRemaining, status);
|
||||
destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
|
||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// Ignore buffer overflow when extracting the group. We need to
|
||||
// continue on to get full size of the untruncated result. We will
|
||||
@ -1105,20 +1398,33 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
||||
}
|
||||
|
||||
//
|
||||
// appendReplacement the acutal API function,
|
||||
// appendReplacement the actual API function,
|
||||
//
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_appendReplacement(URegularExpression *regexp2,
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status) {
|
||||
const UChar *replacementText,
|
||||
int32_t replacementLength,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status) {
|
||||
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
return RegexCImpl::appendReplacement(
|
||||
regexp, replacementText, replacementLength,destBuf, destCapacity, status);
|
||||
}
|
||||
|
||||
//
|
||||
// uregex_appendReplacementUText...can just use the normal C++ method
|
||||
//
|
||||
/**
 * UText flavor of appendReplacement; delegates to the C++
 * RegexMatcher::appendReplacement() method.
 *
 * @param regexp2   The compiled regular expression.
 * @param replText  The replacement text; must not be NULL.
 * @param dest      The UText that receives the appended output.
 * @param status    In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                  replText is NULL; validateRE() failures are also
 *                  reported here.
 */
U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression *regexp2,
                              UText              *replText,
                              UText              *dest,
                              UErrorCode         *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;

    // Guard the handle before touching fMatcher, as the sibling
    // uregex_replaceAllUText / uregex_replaceFirstUText wrappers do.
    if (validateRE(regexp, status) == FALSE) {
        return;
    }
    if (replText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    regexp->fMatcher->appendReplacement(dest, replText, *status);
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
@ -1126,9 +1432,9 @@ uregex_appendReplacement(URegularExpression *regexp2,
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
int32_t RegexCImpl::appendTail(RegularExpression *regexp,
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status)
|
||||
UChar **destBuf,
|
||||
int32_t *destCapacity,
|
||||
UErrorCode *status)
|
||||
{
|
||||
|
||||
// If we come in with a buffer overflow error, don't suppress the operation.
|
||||
@ -1154,46 +1460,62 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp,
|
||||
|
||||
RegexMatcher *m = regexp->fMatcher;
|
||||
|
||||
int32_t srcIdx;
|
||||
if (m->fMatch) {
|
||||
// The most recent call to find() succeeded.
|
||||
srcIdx = m->fMatchEnd;
|
||||
} else {
|
||||
// The last call to find() on this matcher failed().
|
||||
// Look back to the end of the last find() that succeeded for src index.
|
||||
srcIdx = m->fLastMatchEnd;
|
||||
if (srcIdx == -1) {
|
||||
// There has been no successful match with this matcher.
|
||||
// We want to copy the whole string.
|
||||
srcIdx = 0;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t destIdx = 0;
|
||||
int32_t destCap = *destCapacity;
|
||||
UChar *dest = *destBuf;
|
||||
|
||||
for (;;) {
|
||||
if (srcIdx == regexp->fTextLength) {
|
||||
break;
|
||||
}
|
||||
UChar c = regexp->fText[srcIdx];
|
||||
if (c == 0 && regexp->fTextLength == -1) {
|
||||
break;
|
||||
}
|
||||
if (destIdx < destCap) {
|
||||
dest[destIdx] = c;
|
||||
if (regexp->fText != NULL) {
|
||||
int32_t srcIdx;
|
||||
int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
|
||||
if (nativeIdx == -1) {
|
||||
srcIdx = 0;
|
||||
} else if (UTEXT_USES_U16(m->fInputText)) {
|
||||
srcIdx = nativeIdx;
|
||||
} else {
|
||||
// We've overflowed the dest buffer.
|
||||
// If the total input string length is known, we can
|
||||
// compute the total buffer size needed without scanning through the string.
|
||||
if (regexp->fTextLength > 0) {
|
||||
destIdx += (regexp->fTextLength - srcIdx);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
if (srcIdx == regexp->fTextLength) {
|
||||
break;
|
||||
}
|
||||
UChar c = regexp->fText[srcIdx];
|
||||
if (c == 0 && regexp->fTextLength == -1) {
|
||||
regexp->fTextLength = srcIdx;
|
||||
break;
|
||||
}
|
||||
if (destIdx < destCap) {
|
||||
dest[destIdx] = c;
|
||||
} else {
|
||||
// We've overflowed the dest buffer.
|
||||
// If the total input string length is known, we can
|
||||
// compute the total buffer size needed without scanning through the string.
|
||||
if (regexp->fTextLength > 0) {
|
||||
destIdx += (regexp->fTextLength - srcIdx);
|
||||
break;
|
||||
}
|
||||
}
|
||||
srcIdx++;
|
||||
destIdx++;
|
||||
}
|
||||
srcIdx++;
|
||||
destIdx++;
|
||||
} else {
|
||||
int64_t srcIdx;
|
||||
if (m->fMatch) {
|
||||
// The most recent call to find() succeeded.
|
||||
srcIdx = m->fMatchEnd;
|
||||
} else {
|
||||
// The last call to find() on this matcher failed().
|
||||
// Look back to the end of the last find() that succeeded for src index.
|
||||
srcIdx = m->fLastMatchEnd;
|
||||
if (srcIdx == -1) {
|
||||
// There has been no successful match with this matcher.
|
||||
// We want to copy the whole string.
|
||||
srcIdx = 0;
|
||||
}
|
||||
}
|
||||
|
||||
destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
|
||||
}
|
||||
|
||||
//
|
||||
@ -1228,6 +1550,9 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp,
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// appendTail the actual API function
|
||||
//
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_appendTail(URegularExpression *regexp2,
|
||||
UChar **destBuf,
|
||||
@ -1238,6 +1563,17 @@ uregex_appendTail(URegularExpression *regexp2,
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// uregex_appendTailUText...can just use the normal C++ method
|
||||
//
|
||||
/**
 * UText flavor of appendTail; delegates to the C++
 * RegexMatcher::appendTail() method.
 *
 * @param regexp2  The compiled regular expression.
 * @param dest     Passed through to RegexMatcher::appendTail().
 * @return         Whatever RegexMatcher::appendTail(dest) returns.
 */
U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp2,
                       UText              *dest) {
    RegexMatcher *matcher = ((RegularExpression*)regexp2)->fMatcher;
    return matcher->appendTail(dest);
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// copyString Internal utility to copy a string to an output buffer,
|
||||
@ -1280,75 +1616,67 @@ static void copyString(UChar *destBuffer, // Destination buffer.
|
||||
// uregex_split
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_split( URegularExpression *regexp2,
|
||||
UChar *destBuf,
|
||||
int32_t destCapacity,
|
||||
int32_t *requiredCapacity,
|
||||
UChar *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
if (destBuf == NULL && destCapacity > 0 ||
|
||||
destCapacity < 0 ||
|
||||
destFields == NULL ||
|
||||
destFieldsCapacity < 1 ) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t RegexCImpl::split(RegularExpression *regexp,
|
||||
UChar *destBuf,
|
||||
int32_t destCapacity,
|
||||
int32_t *requiredCapacity,
|
||||
UChar *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status) {
|
||||
//
|
||||
// Reset for the input text
|
||||
//
|
||||
regexp->fMatcher->reset();
|
||||
int32_t inputLen = regexp->fTextString.length();
|
||||
int32_t nextOutputStringStart = 0;
|
||||
UText *inputText = regexp->fMatcher->fInputText;
|
||||
int64_t nextOutputStringStart = 0;
|
||||
int64_t inputLen = regexp->fMatcher->fInputLength;
|
||||
if (inputLen == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Loop through the input text, searching for the delimiter pattern
|
||||
//
|
||||
int32_t i; // Index of the field being processed.
|
||||
int32_t destIdx = 0; // Next available position in destBuf;
|
||||
int32_t numCaptureGroups = regexp->fMatcher->groupCount();
|
||||
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
|
||||
for (i=0; ; i++) {
|
||||
if (i>=destFieldsCapacity-1) {
|
||||
// There are one or zero output string left.
|
||||
// There are one or zero output strings left.
|
||||
// Fill the last output string with whatever is left from the input, then exit the loop.
|
||||
// ( i will be == destFieldsCapacity if we filled the output array while processing
|
||||
// capture groups of the delimiter expression, in which case we will discard the
|
||||
// last capture group saved in favor of the unprocessed remainder of the
|
||||
// input string.)
|
||||
int32_t remainingLength = inputLen-nextOutputStringStart;
|
||||
if (remainingLength > 0) {
|
||||
}
|
||||
if (i >= destFieldsCapacity) {
|
||||
// No fields are left. Recycle the last one for holding the trailing part of
|
||||
// the input string.
|
||||
i = destFieldsCapacity-1;
|
||||
destIdx = (int32_t)(destFields[i] - destFields[0]);
|
||||
}
|
||||
if (inputLen > nextOutputStringStart) {
|
||||
if (i != destFieldsCapacity-1) {
|
||||
// No fields are left. Recycle the last one for holding the trailing part of
|
||||
// the input string.
|
||||
i = destFieldsCapacity-1;
|
||||
destIdx = (int32_t)(destFields[i] - destFields[0]);
|
||||
}
|
||||
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx,
|
||||
&regexp->fText[nextOutputStringStart], remainingLength);
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
|
||||
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (regexp->fMatcher->find()) {
|
||||
// We found another delimiter. Move everything from where we started looking
|
||||
// up until the start of the delimiter into the next output string.
|
||||
int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx,
|
||||
&regexp->fText[nextOutputStringStart], fieldLen);
|
||||
nextOutputStringStart = regexp->fMatcher->end(*status);
|
||||
|
||||
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
|
||||
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
|
||||
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
tStatus = U_ZERO_ERROR;
|
||||
} else {
|
||||
*status = tStatus;
|
||||
}
|
||||
nextOutputStringStart = regexp->fMatcher->fMatchEnd;
|
||||
|
||||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
@ -1361,16 +1689,16 @@ uregex_split( URegularExpression *regexp2,
|
||||
i++;
|
||||
|
||||
// Set up to extract the capture group contents into the dest buffer.
|
||||
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
|
||||
// error while extracting this group.
|
||||
int32_t remainingCapacity = destCapacity - destIdx;
|
||||
if (remainingCapacity < 0) {
|
||||
remainingCapacity = 0;
|
||||
}
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
int32_t t = uregex_group(regexp2, groupNum, destFields[i], remainingCapacity, &tStatus);
|
||||
tStatus = U_ZERO_ERROR;
|
||||
int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
|
||||
destIdx += t + 1; // Record the space used in the output string buffer.
|
||||
// +1 for the NUL that terminates the string.
|
||||
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
tStatus = U_ZERO_ERROR;
|
||||
} else {
|
||||
*status = tStatus;
|
||||
}
|
||||
}
|
||||
|
||||
if (nextOutputStringStart == inputLen) {
|
||||
@ -1384,8 +1712,8 @@ uregex_split( URegularExpression *regexp2,
|
||||
// We ran off the end of the input while looking for the next delimiter.
|
||||
// All the remaining text goes into the current output string.
|
||||
destFields[i] = &destBuf[destIdx];
|
||||
copyString(destBuf, destCapacity, &destIdx,
|
||||
&regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
|
||||
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
|
||||
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1405,6 +1733,45 @@ uregex_split( URegularExpression *regexp2,
|
||||
return i+1;
|
||||
}
|
||||
|
||||
//
|
||||
// uregex_split The actual API function
|
||||
//
|
||||
// Checks the handle with validateRE() and the output-buffer arguments,
|
||||
// then hands the work to RegexCImpl::split().
|
||||
// Returns 0 when validateRE() rejects the handle, or when the arguments
|
||||
// are inconsistent (in which case *status is set to U_ILLEGAL_ARGUMENT_ERROR).
|
||||
//
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_split(URegularExpression *regexp2,
|
||||
UChar *destBuf,
|
||||
int32_t destCapacity,
|
||||
int32_t *requiredCapacity,
|
||||
UChar *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
// A NULL destBuf is only acceptable together with a zero capacity.
|
||||
// The && term is grouped explicitly rather than relying on precedence.
|
||||
if ((destBuf == NULL && destCapacity > 0) ||
|
||||
destCapacity < 0 ||
|
||||
destFields == NULL ||
|
||||
destFieldsCapacity < 1 ) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// uregex_splitUText...can just use the normal C++ method
|
||||
//
|
||||
// The handle goes through validateRE() first, as uregex_split() does, so a
|
||||
// handle that validateRE() rejects yields 0 instead of being dereferenced.
|
||||
// Otherwise the matcher's own input text is split into destFields.
|
||||
//
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uregex_splitUText(URegularExpression *regexp2,
|
||||
UText *destFields[],
|
||||
int32_t destFieldsCapacity,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
|
||||
}
|
||||
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2004-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2004-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
@ -26,6 +26,7 @@
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "cintltst.h"
|
||||
|
||||
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
|
||||
@ -86,11 +87,34 @@ static void test_assert_string(const char *expected, const UChar *actual, UBool
|
||||
#define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__)
|
||||
|
||||
|
||||
static void test_assert_utext(const char *expected, UText *actual, const char *file, int line) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText expectedText = UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&expectedText, expected, -1, &status);
|
||||
utext_setNativeIndex(actual, 0);
|
||||
if (utext_compare(&expectedText, -1, actual, -1) != 0) {
|
||||
UChar32 c;
|
||||
log_err("Failure at file %s, line %d, expected \"%s\", got \"", file, line, expected);
|
||||
c = utext_next32From(actual, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (0x20<c && c <0x7e) {
|
||||
log_err("%c", c);
|
||||
} else {
|
||||
log_err("%#x", c);
|
||||
}
|
||||
c = UTEXT_NEXT32(actual);
|
||||
}
|
||||
log_err("\"\n");
|
||||
}
|
||||
}
|
||||
|
||||
#define TEST_ASSERT_UTEXT(expected, actual) test_assert_utext(expected, actual, __FILE__, __LINE__)
|
||||
|
||||
|
||||
|
||||
static void TestRegexCAPI(void);
|
||||
static void TestBug4315(void);
|
||||
static void TestUTextAPI(void);
|
||||
|
||||
void addURegexTest(TestNode** root);
|
||||
|
||||
@ -98,6 +122,7 @@ void addURegexTest(TestNode** root)
|
||||
{
|
||||
addTest(root, &TestRegexCAPI, "regex/TestRegexCAPI");
|
||||
addTest(root, &TestBug4315, "regex/TestBug4315");
|
||||
addTest(root, &TestUTextAPI, "regex/TestUTextAPI");
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1319,4 +1344,697 @@ static void TestBug4315(void) {
|
||||
uregex_close(theRegEx);
|
||||
}
|
||||
|
||||
/* Based on TestRegexCAPI() */
|
||||
static void TestUTextAPI(void) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
URegularExpression *re;
|
||||
UText patternText = UTEXT_INITIALIZER;
|
||||
UChar pat[200];
|
||||
|
||||
/* Minimalist open/close */
|
||||
utext_openUTF8(&patternText, "abc*", -1, &status);
|
||||
re = uregex_openUText(&patternText, 0, 0, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Failed to open regular expression, line %d, error is \"%s\"\n", __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
uregex_close(re);
|
||||
|
||||
/* Open with all flag values set */
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openUText(&patternText,
|
||||
UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD,
|
||||
0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
uregex_close(re);
|
||||
|
||||
/* Open with an invalid flag */
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openUText(&patternText, 0x40000000, 0, &status);
|
||||
TEST_ASSERT(status == U_REGEX_INVALID_FLAG);
|
||||
uregex_close(re);
|
||||
|
||||
/* open with an invalid parameter */
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openUText(NULL,
|
||||
UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status);
|
||||
TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR && re == NULL);
|
||||
|
||||
/*
|
||||
* clone
|
||||
*/
|
||||
{
|
||||
/* Clone a compiled expression several times, close the original, and
|
||||
* check that the clones can still be used for matching. */
|
||||
URegularExpression *clone1;
|
||||
URegularExpression *clone2;
|
||||
URegularExpression *clone3;
|
||||
UChar testString1[30];
|
||||
UChar testString2[30];
|
||||
UBool result;
|
||||
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openUText(&patternText, 0, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
clone1 = uregex_clone(re, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(clone1 != NULL);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
clone2 = uregex_clone(re, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(clone2 != NULL);
|
||||
uregex_close(re);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
clone3 = uregex_clone(clone2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(clone3 != NULL);
|
||||
|
||||
/* Bound each copy by its own destination buffer, not by the larger pat[] array. */
|
||||
u_uastrncpy(testString1, "abcccd", sizeof(testString1)/2);
|
||||
u_uastrncpy(testString2, "xxxabcccd", sizeof(testString2)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setText(clone1, testString1, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
result = uregex_lookingAt(clone1, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(result==TRUE);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setText(clone2, testString2, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
result = uregex_lookingAt(clone2, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(result==FALSE);
|
||||
result = uregex_find(clone2, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(result==TRUE);
|
||||
|
||||
uregex_close(clone1);
|
||||
uregex_close(clone2);
|
||||
uregex_close(clone3);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* pattern() and patternText()
|
||||
*/
|
||||
{
|
||||
const UChar *resultPat;
|
||||
int32_t resultLen;
|
||||
UText *resultText;
|
||||
u_uastrncpy(pat, "hello", sizeof(pat)/2); /* for comparison */
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
utext_openUTF8(&patternText, "hello", -1, &status);
|
||||
re = uregex_open(pat, -1, 0, NULL, &status);
|
||||
resultPat = uregex_pattern(re, &resultLen, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS above should change too... */
|
||||
if (U_SUCCESS(status)) {
|
||||
TEST_ASSERT(resultLen == -1);
|
||||
TEST_ASSERT(u_strcmp(resultPat, pat) == 0);
|
||||
}
|
||||
|
||||
resultText = uregex_patternUText(re, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("hello", resultText);
|
||||
|
||||
uregex_close(re);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_open(pat, 3, 0, NULL, &status);
|
||||
resultPat = uregex_pattern(re, &resultLen, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS above should change too... */
|
||||
if (U_SUCCESS(status)) {
|
||||
TEST_ASSERT(resultLen == 3);
|
||||
TEST_ASSERT(u_strncmp(resultPat, pat, 3) == 0);
|
||||
TEST_ASSERT(u_strlen(resultPat) == 3);
|
||||
}
|
||||
|
||||
resultText = uregex_patternUText(re, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("hel", resultText);
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
/*
|
||||
* setUText() and lookingAt()
|
||||
*/
|
||||
{
|
||||
UText text1 = UTEXT_INITIALIZER;
|
||||
UText text2 = UTEXT_INITIALIZER;
|
||||
UBool result;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
utext_openUTF8(&text1, "abcccd", -1, &status);
|
||||
utext_openUTF8(&text2, "abcccxd", -1, &status);
|
||||
|
||||
utext_openUTF8(&patternText, "abc*d", -1, &status);
|
||||
re = uregex_openUText(&patternText, 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* Operation before doing a setText should fail... */
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT( status== U_REGEX_INVALID_STATE);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setUText(re, &text1, &status);
|
||||
result = uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setUText(re, &text2, &status);
|
||||
result = uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT(result == FALSE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setUText(re, &text1, &status);
|
||||
result = uregex_lookingAt(re, 0, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
uregex_close(re);
|
||||
utext_close(&text1);
|
||||
utext_close(&text2);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* getText() and getUText()
|
||||
*/
|
||||
{
|
||||
/* Set the input first as a UText and then as a UChar string, reading it
|
||||
* back through both uregex_getUText() and uregex_getText() each time. */
|
||||
UText text1 = UTEXT_INITIALIZER;
|
||||
UText text2 = UTEXT_INITIALIZER;
|
||||
UChar text2Chars[20];
|
||||
UText *resultText;
|
||||
const UChar *result;
|
||||
int32_t textLength;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
utext_openUTF8(&text1, "abcccd", -1, &status);
|
||||
/* The bound is the UChar destination array; sizeof(text2) would measure the UText struct. */
|
||||
u_uastrncpy(text2Chars, "abcccxd", sizeof(text2Chars)/2);
|
||||
utext_openUChars(&text2, text2Chars, -1, &status);
|
||||
|
||||
utext_openUTF8(&patternText, "abc*d", -1, &status);
|
||||
re = uregex_openUText(&patternText, 0, NULL, &status);
|
||||
|
||||
/* First set a UText */
|
||||
uregex_setUText(re, &text1, &status);
|
||||
resultText = uregex_getUText(re, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(resultText != &text1);
|
||||
utext_setNativeIndex(resultText, 0);
|
||||
utext_setNativeIndex(&text1, 0);
|
||||
TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
|
||||
utext_close(resultText);
|
||||
|
||||
result = uregex_getText(re, &textLength, &status); /* flattens UText into buffer */
|
||||
TEST_ASSERT(textLength == -1 || textLength == 6);
|
||||
resultText = uregex_getUText(re, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(resultText != &text1);
|
||||
utext_setNativeIndex(resultText, 0);
|
||||
utext_setNativeIndex(&text1, 0);
|
||||
TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
|
||||
utext_close(resultText);
|
||||
|
||||
/* Then set a UChar * */
|
||||
uregex_setText(re, text2Chars, 7, &status);
|
||||
resultText = uregex_getUText(re, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
utext_setNativeIndex(resultText, 0);
|
||||
utext_setNativeIndex(&text2, 0);
|
||||
TEST_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
|
||||
utext_close(resultText);
|
||||
result = uregex_getText(re, &textLength, &status);
|
||||
TEST_ASSERT(textLength == 7);
|
||||
|
||||
uregex_close(re);
|
||||
utext_close(&text1);
|
||||
utext_close(&text2);
|
||||
}
|
||||
|
||||
/*
|
||||
* matches()
|
||||
*/
|
||||
{
|
||||
UText text1 = UTEXT_INITIALIZER;
|
||||
UBool result;
|
||||
UText nullText = UTEXT_INITIALIZER;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
utext_openUTF8(&text1, "abcccde", -1, &status);
|
||||
utext_openUTF8(&patternText, "abc*d", -1, &status);
|
||||
re = uregex_openUText(&patternText, 0, NULL, &status);
|
||||
|
||||
uregex_setUText(re, &text1, &status);
|
||||
result = uregex_matches(re, 0, &status);
|
||||
TEST_ASSERT(result == FALSE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
uregex_close(re);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(".?", 0, NULL, &status);
|
||||
uregex_setUText(re, &text1, &status);
|
||||
result = uregex_matches(re, 7, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
utext_openUTF8(&nullText, "", -1, &status);
|
||||
uregex_setUText(re, &nullText, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
result = uregex_matches(re, 0, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
uregex_close(re);
|
||||
utext_close(&text1);
|
||||
utext_close(&nullText);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* lookingAt() Used in setText test.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* find(), findNext, start, end, reset
|
||||
*/
|
||||
{
|
||||
UChar text1[50];
|
||||
UBool result;
|
||||
u_uastrncpy(text1, "012rx5rx890rxrx...", sizeof(text1)/2);
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC("rx", 0, NULL, &status);
|
||||
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
result = uregex_find(re, 0, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 3);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 5);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
result = uregex_find(re, 9, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 11);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 13);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
result = uregex_find(re, 14, &status);
|
||||
TEST_ASSERT(result == FALSE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_reset(re, 0, &status);
|
||||
|
||||
result = uregex_findNext(re, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 3);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 5);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
result = uregex_findNext(re, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 6);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 8);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_reset(re, 12, &status);
|
||||
|
||||
result = uregex_findNext(re, &status);
|
||||
TEST_ASSERT(result == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 13);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 15);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
result = uregex_findNext(re, &status);
|
||||
TEST_ASSERT(result == FALSE);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
/*
|
||||
* group()
|
||||
*/
|
||||
{
|
||||
UChar text1[80];
|
||||
UText *actual;
|
||||
UBool result;
|
||||
u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC("abc(.*?)def", 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
result = uregex_find(re, 0, &status);
|
||||
TEST_ASSERT(result==TRUE);
|
||||
|
||||
/* Capture Group 0, the full match. Should succeed. */
|
||||
status = U_ZERO_ERROR;
|
||||
actual = uregex_groupUText(re, 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("abc interior def", actual);
|
||||
utext_close(actual);
|
||||
|
||||
/* Capture group #1. Should succeed. */
|
||||
status = U_ZERO_ERROR;
|
||||
actual = uregex_groupUText(re, 1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT(" interior ", actual);
|
||||
utext_close(actual);
|
||||
|
||||
/* Capture group out of range. Error. */
|
||||
status = U_ZERO_ERROR;
|
||||
actual = uregex_groupUText(re, 2, NULL, &status);
|
||||
TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
TEST_ASSERT(utext_nativeLength(actual) == 0);
|
||||
utext_close(actual);
|
||||
|
||||
uregex_close(re);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* replaceFirst()
|
||||
*/
|
||||
{
|
||||
UChar text1[80];
|
||||
UChar text2[80];
|
||||
UText replText = UTEXT_INITIALIZER;
|
||||
UText *result;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
|
||||
u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
|
||||
utext_openUTF8(&replText, "<$1>", -1, &status);
|
||||
|
||||
re = uregex_openC("x(.*?)x", 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* Normal case, with match */
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
result = uregex_replaceFirstUText(re, &replText, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
|
||||
utext_close(result);
|
||||
|
||||
/* No match. Text should copy to output with no changes. */
|
||||
uregex_setText(re, text2, -1, &status);
|
||||
result = uregex_replaceFirstUText(re, &replText, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("No match here.", result);
|
||||
utext_close(result);
|
||||
|
||||
/* Unicode escapes */
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
|
||||
result = uregex_replaceFirstUText(re, &replText, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
|
||||
utext_close(result);
|
||||
|
||||
uregex_close(re);
|
||||
utext_close(&replText);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* replaceAll()
|
||||
*/
|
||||
{
|
||||
UChar text1[80];
|
||||
UChar text2[80];
|
||||
UText replText = UTEXT_INITIALIZER;
|
||||
UText *result;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
|
||||
u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
|
||||
utext_openUTF8(&replText, "<$1>", -1, &status);
|
||||
|
||||
re = uregex_openC("x(.*?)x", 0, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* Normal case, with match */
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
result = uregex_replaceAllUText(re, &replText, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
|
||||
utext_close(result);
|
||||
|
||||
/* No match. Text should copy to output with no changes. */
|
||||
uregex_setText(re, text2, -1, &status);
|
||||
result = uregex_replaceAllUText(re, &replText, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_UTEXT("No match here.", result);
|
||||
utext_close(result);
|
||||
|
||||
uregex_close(re);
|
||||
utext_close(&replText);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* appendReplacement()
|
||||
*/
|
||||
{
|
||||
UChar text[100];
|
||||
UChar repl[100];
|
||||
UChar buf[100];
|
||||
UChar *bufPtr;
|
||||
int32_t bufCap;
|
||||
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(".*", 0, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
u_uastrncpy(text, "whatever", sizeof(text)/2);
|
||||
u_uastrncpy(repl, "some other", sizeof(repl)/2);
|
||||
uregex_setText(re, text, -1, &status);
|
||||
|
||||
/* match covers whole target string */
|
||||
uregex_find(re, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = sizeof(buf) / 2;
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("some other", buf, TRUE);
|
||||
|
||||
/* Match has \u \U escapes */
|
||||
uregex_find(re, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = sizeof(buf) / 2;
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2);
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* appendReplacement(), appendTail() checked in replaceFirst(), replaceAll().
|
||||
*/
|
||||
|
||||
/*
|
||||
* splitUText()
|
||||
*/
|
||||
{
|
||||
UChar textToSplit[80];
|
||||
UChar text2[80];
|
||||
UText *fields[10];
|
||||
int32_t numFields;
|
||||
|
||||
u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2);
|
||||
u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(":", 0, NULL, &status);
|
||||
|
||||
|
||||
/* Simple split */
|
||||
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if (U_SUCCESS(status)) {
|
||||
memset(fields, 0, sizeof(fields));
|
||||
numFields = uregex_splitUText(re, fields, 10, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 3);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT(" second", fields[1]);
|
||||
TEST_ASSERT_UTEXT(" third", fields[2]);
|
||||
TEST_ASSERT(fields[3] == NULL);
|
||||
}
|
||||
}
|
||||
|
||||
uregex_close(re);
|
||||
|
||||
|
||||
/* Split with too few output strings available */
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC(":", 0, NULL, &status);
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
fields[0] = NULL;
|
||||
fields[1] = NULL;
|
||||
fields[2] = &patternText;
|
||||
numFields = uregex_splitUText(re, fields, 2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 2);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT(" second: third", fields[1]);
|
||||
TEST_ASSERT(fields[2] == &patternText);
|
||||
}
|
||||
}
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
/* splitUText(), part 2. Patterns with capture groups. The capture group text
|
||||
* comes out as additional fields. */
|
||||
{
|
||||
UChar textToSplit[80];
|
||||
UText *fields[10];
|
||||
int32_t numFields;
|
||||
|
||||
u_uastrncpy(textToSplit, "first <tag-a> second<tag-b> third", sizeof(textToSplit)/2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
re = uregex_openC("<(.*?)>", 0, NULL, &status);
|
||||
|
||||
uregex_setText(re, textToSplit, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
memset(fields, 0, sizeof(fields));
|
||||
numFields = uregex_splitUText(re, fields, 10, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT("tag-a", fields[1]);
|
||||
TEST_ASSERT_UTEXT(" second", fields[2]);
|
||||
TEST_ASSERT_UTEXT("tag-b", fields[3]);
|
||||
TEST_ASSERT_UTEXT(" third", fields[4]);
|
||||
TEST_ASSERT(fields[5] == NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Split with too few output strings available (2) */
|
||||
status = U_ZERO_ERROR;
|
||||
fields[0] = NULL;
|
||||
fields[1] = NULL;
|
||||
fields[2] = &patternText;
|
||||
numFields = uregex_splitUText(re, fields, 2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 2);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT(" second<tag-b> third", fields[1]);
|
||||
TEST_ASSERT(fields[2] == &patternText);
|
||||
}
|
||||
|
||||
/* Split with too few output strings available (3) */
|
||||
status = U_ZERO_ERROR;
|
||||
fields[0] = NULL;
|
||||
fields[1] = NULL;
|
||||
fields[2] = NULL;
|
||||
fields[3] = &patternText;
|
||||
numFields = uregex_splitUText(re, fields, 3, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 3);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT("tag-a", fields[1]);
|
||||
TEST_ASSERT_UTEXT(" second<tag-b> third", fields[2]);
|
||||
TEST_ASSERT(fields[3] == &patternText);
|
||||
}
|
||||
|
||||
/* Split with just enough output strings available (5) */
|
||||
status = U_ZERO_ERROR;
|
||||
fields[0] = NULL;
|
||||
fields[1] = NULL;
|
||||
fields[2] = NULL;
|
||||
fields[3] = NULL;
|
||||
fields[4] = NULL;
|
||||
fields[5] = &patternText;
|
||||
numFields = uregex_splitUText(re, fields, 5, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 5);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT("tag-a", fields[1]);
|
||||
TEST_ASSERT_UTEXT(" second", fields[2]);
|
||||
TEST_ASSERT_UTEXT("tag-b", fields[3]);
|
||||
TEST_ASSERT_UTEXT(" third", fields[4]);
|
||||
TEST_ASSERT(fields[5] == &patternText);
|
||||
}
|
||||
|
||||
/* Split, end of text is a field delimiter. */
|
||||
status = U_ZERO_ERROR;
|
||||
uregex_setText(re, textToSplit, strlen("first <tag-a> second<tag-b>"), &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
memset(fields, 0, sizeof(fields));
|
||||
fields[9] = &patternText;
|
||||
numFields = uregex_splitUText(re, fields, 9, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
/* The TEST_ASSERT_SUCCESS call above should change too... */
|
||||
if(U_SUCCESS(status)) {
|
||||
TEST_ASSERT(numFields == 4);
|
||||
TEST_ASSERT_UTEXT("first ", fields[0]);
|
||||
TEST_ASSERT_UTEXT("tag-a", fields[1]);
|
||||
TEST_ASSERT_UTEXT(" second", fields[2]);
|
||||
TEST_ASSERT_UTEXT("tag-b", fields[3]);
|
||||
TEST_ASSERT(fields[4] == NULL);
|
||||
TEST_ASSERT(fields[8] == NULL);
|
||||
TEST_ASSERT(fields[9] == &patternText);
|
||||
}
|
||||
}
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2005-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2005-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
@ -58,6 +58,8 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
|
||||
if (exec) Ticket5560(); break;
|
||||
case 4: name = "Ticket6847";
|
||||
if (exec) Ticket6847(); break;
|
||||
case 5: name = "ComparisonTest";
|
||||
if (exec) ComparisonTest(); break;
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -836,6 +838,476 @@ void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCoun
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// ComparisonTest() Check the string comparison functions. Based on UnicodeStringTest::TestCompare()
|
||||
//
|
||||
void UTextTest::ComparisonTest()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString test1Str("this is a test");
|
||||
UnicodeString test2Str("this is a test");
|
||||
UnicodeString test3Str("this is a test of the emergency broadcast system");
|
||||
UnicodeString test4Str("never say, \"this is a test\"!!");
|
||||
|
||||
UText test1 = UTEXT_INITIALIZER;
|
||||
UText test2 = UTEXT_INITIALIZER;
|
||||
UText test3 = UTEXT_INITIALIZER;
|
||||
UText test4 = UTEXT_INITIALIZER;
|
||||
|
||||
UChar uniChars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
|
||||
0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
|
||||
char chars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
|
||||
0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
|
||||
|
||||
UText uniCharText = UTEXT_INITIALIZER;
|
||||
UText charText = UTEXT_INITIALIZER;
|
||||
|
||||
utext_openUnicodeString(&test1, &test1Str, &status);
|
||||
utext_openUnicodeString(&test2, &test2Str, &status);
|
||||
utext_openUnicodeString(&test3, &test3Str, &status);
|
||||
utext_openUnicodeString(&test4, &test4Str, &status);
|
||||
|
||||
utext_openUChars(&uniCharText, uniChars, -1, &status);
|
||||
utext_openUTF8(&charText, chars, -1, &status);
|
||||
|
||||
TEST_SUCCESS(status);
|
||||
|
||||
// test utext_compare(), simple
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, -1, &test2, -1) != 0) errln("utext_compare() failed, simple setup");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test1, -1, &test3, -1) >= 0) errln("utext_compare() failed, simple setup");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compare(&test1, -1, &test4, -1) <= 0) errln("utext_compare() failed, simple setup");
|
||||
|
||||
// test utext_compareNativeLimit(), simple
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, -1, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, simple setup");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test1, -1, &test3, -1) >= 0) errln("utext_compareNativeLimit() failed, simple setup");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compareNativeLimit(&test1, -1, &test4, -1) <= 0) errln("utext_compareNativeLimit() failed, simple setup");
|
||||
|
||||
// test utext_compare(), one explicit length
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test3, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 12);
|
||||
if (utext_compare(&test4, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length and offset");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test3, 18, &test2, -1) <= 0) errln("utext_compare() failed, one explicit length");
|
||||
|
||||
// test utext_compareNativeLimit(), one explicit length
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test3, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 12);
|
||||
if (utext_compareNativeLimit(&test4, 26, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length and limit");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test3, 18, &test2, -1) <= 0) errln("utext_compareNativeLimit() failed, one explicit length");
|
||||
|
||||
// test utext_compare(), UChar-based UText
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test2, -1, &uniCharText, -1) != 0) errln("utext_compare() failed, UChar-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compare() failed, UChar-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compare(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compare() failed, UChar-based UText");
|
||||
|
||||
// test utext_compareNativeLimit(), UChar-based UText
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test2, -1, &uniCharText, -1) != 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compareNativeLimit(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
|
||||
|
||||
// test utext_compare(), UTF8-based UText
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test2, -1, &charText, -1) != 0) errln("utext_compare() failed, UTF8-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test3, -1, &charText, -1) <= 0) errln("utext_compare() failed, UTF8-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compare(&test4, -1, &charText, -1) >= 0) errln("utext_compare() failed, UTF8-based UText");
|
||||
|
||||
// test utext_compareNativeLimit(), UTF8-based UText
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test2, -1, &charText, -1) != 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test3, -1, &charText, -1) <= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
|
||||
UTEXT_SETNATIVEINDEX(&charText, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 0);
|
||||
if (utext_compareNativeLimit(&test4, -1, &charText, -1) >= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
|
||||
|
||||
// test utext_compare(), length
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, -1, &test2, 4) != 0) errln("utext_compare() failed, one length");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, 5, &test2, 4) <= 0) errln("utext_compare() failed, both lengths");
|
||||
|
||||
// test utext_compareNativeLimit(), limit
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, -1, &test2, 4) != 0) errln("utext_compareNativeLimit() failed, one limit");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, 5, &test2, 4) <= 0) errln("utext_compareNativeLimit() failed, both limits");
|
||||
|
||||
// test utext_compare(), both explicit offsets and lengths
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, 14, &test2, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compare(&test1, 14, &test3, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 12);
|
||||
if (utext_compare(&test1, 14, &test4, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compare(&test1, 4, &test2, 4) >= 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 22);
|
||||
if (utext_compare(&test1, 4, &test3, 9) <= 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 22);
|
||||
if (utext_compare(&test1, 4, &test4, 4) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
|
||||
|
||||
// test utext_compareNativeLimit(), both explicit offsets and limits
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test2, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 0);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test3, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 0);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 12);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test2, 0);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test2, 4) >= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test3, 22);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test3, 31) <= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
UTEXT_SETNATIVEINDEX(&test1, 10);
|
||||
UTEXT_SETNATIVEINDEX(&test4, 22);
|
||||
if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
|
||||
|
||||
/* test caseCompare() */
|
||||
{
|
||||
static const UChar
|
||||
_mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0x130, 0x49, 0xfb03, 0xd93f, 0xdfff, 0 },
|
||||
_otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
|
||||
_otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69, 0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
|
||||
_different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130, 0x49, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
|
||||
|
||||
UText
|
||||
mixed = UTEXT_INITIALIZER,
|
||||
otherDefault = UTEXT_INITIALIZER,
|
||||
otherExcludeSpecialI = UTEXT_INITIALIZER,
|
||||
different = UTEXT_INITIALIZER;
|
||||
|
||||
utext_openUChars(&mixed, _mixed, -1, &status);
|
||||
utext_openUChars(&otherDefault, _otherDefault, -1, &status);
|
||||
utext_openUChars(&otherExcludeSpecialI, _otherExcludeSpecialI, -1, &status);
|
||||
utext_openUChars(&different, _different, -1, &status);
|
||||
|
||||
TEST_SUCCESS(status);
|
||||
|
||||
int32_t result;
|
||||
|
||||
/* test default options */
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherDefault, 0);
|
||||
result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (other, default) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherDefault, 0);
|
||||
result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (other, default) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
/* test excluding special I */
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
|
||||
result = utext_caseCompare(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
|
||||
result = utext_caseCompareNativeLimit(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherDefault, 0);
|
||||
result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
|
||||
if (0 == result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&otherDefault, 0);
|
||||
result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
|
||||
if (0 == result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
/* test against different string */
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&different, 0);
|
||||
result = utext_caseCompare(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 >= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (different, default) gives %ld (should be positive) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 0);
|
||||
UTEXT_SETNATIVEINDEX(&different, 0);
|
||||
result = utext_caseCompareNativeLimit(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 >= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (different, default) gives %ld (should be positive) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
/* test caseCompare() - include the folded sharp s (U+00df) with different lengths */
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 1);
|
||||
UTEXT_SETNATIVEINDEX(&different, 1);
|
||||
result = utext_caseCompare(&mixed, 4, &different, 5, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 1);
|
||||
UTEXT_SETNATIVEINDEX(&different, 1);
|
||||
result = utext_caseCompareNativeLimit(&mixed, 5, &different, 6, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 != result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
/* test caseCompare() - stop in the middle of the sharp s (U+00df) */
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 1);
|
||||
UTEXT_SETNATIVEINDEX(&different, 1);
|
||||
result = utext_caseCompare(&mixed, 4, &different, 4, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 >= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(&mixed, 1);
|
||||
UTEXT_SETNATIVEINDEX(&different, 1);
|
||||
result = utext_caseCompareNativeLimit(&mixed, 5, &different, 5, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 >= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
}
|
||||
|
||||
/* test surrogates in comparison */
|
||||
{
|
||||
static const UChar
|
||||
_before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x65, 0x00 },
|
||||
_after[] = { 0x65, 0xd800, 0xdc00, 0x65, 0x00 };
|
||||
|
||||
UText
|
||||
before = UTEXT_INITIALIZER,
|
||||
after = UTEXT_INITIALIZER;
|
||||
|
||||
utext_openUChars(&before, _before, -1, &status);
|
||||
utext_openUChars(&after, _after, -1, &status);
|
||||
|
||||
TEST_SUCCESS(status);
|
||||
int32_t result;
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_compare(&before, -1, &after, -1);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_compare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_compare(&before, 3, &after, 3);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_compare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_caseCompare(&before, 3, &after, 3, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
utext_close(&before);
|
||||
utext_close(&after);
|
||||
}
|
||||
|
||||
/* test surrogates at end of string */
|
||||
{
|
||||
static const UChar
|
||||
_before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x00 },
|
||||
_after[] = { 0x65, 0xd800, 0xdc00, 0x00 };
|
||||
|
||||
UText
|
||||
before = UTEXT_INITIALIZER,
|
||||
after = UTEXT_INITIALIZER;
|
||||
|
||||
utext_openUChars(&before, _before, -1, &status);
|
||||
utext_openUChars(&after, _after, -1, &status);
|
||||
|
||||
TEST_SUCCESS(status);
|
||||
int32_t result;
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_compare(&before, -1, &after, -1);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_compare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
UTEXT_SETNATIVEINDEX(&before, 1);
|
||||
UTEXT_SETNATIVEINDEX(&after, 1);
|
||||
result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
|
||||
if (0 <= result || U_FAILURE(status)) {
|
||||
errln("error: utext_caseCompare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", result, u_errorName(status));
|
||||
}
|
||||
|
||||
utext_close(&before);
|
||||
utext_close(&after);
|
||||
}
|
||||
|
||||
/* test empty strings */
|
||||
{
|
||||
UChar zero16 = 0;
|
||||
char zero8 = 0;
|
||||
UText emptyUChar = UTEXT_INITIALIZER;
|
||||
UText emptyUTF8 = UTEXT_INITIALIZER;
|
||||
UText nullUChar = UTEXT_INITIALIZER;
|
||||
UText nullUTF8 = UTEXT_INITIALIZER;
|
||||
|
||||
utext_openUChars(&emptyUChar, &zero16, -1, &status);
|
||||
utext_openUTF8(&emptyUTF8, &zero8, -1, &status);
|
||||
utext_openUChars(&nullUChar, NULL, 0, &status);
|
||||
utext_openUTF8(&nullUTF8, NULL, 0, &status);
|
||||
|
||||
if (utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
|
||||
errln("error: utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0) {
|
||||
errln("error: utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0");
|
||||
}
|
||||
if (utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0) {
|
||||
errln("error: utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0");
|
||||
}
|
||||
if (utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
|
||||
if (utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0");
|
||||
}
|
||||
if (utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0");
|
||||
}
|
||||
if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
if (utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0) {
|
||||
errln("error: utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0");
|
||||
}
|
||||
|
||||
if (utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
|
||||
if (utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
if (utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
|
||||
errln("error: utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
|
||||
}
|
||||
|
||||
utext_close(&emptyUChar);
|
||||
utext_close(&emptyUTF8);
|
||||
utext_close(&nullUChar);
|
||||
utext_close(&nullUTF8);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// ErrorTest() Check various error and edge cases.
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2005-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2005-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
@ -33,6 +33,7 @@ public:
|
||||
void FreezeTest();
|
||||
void Ticket5560();
|
||||
void Ticket6847();
|
||||
void ComparisonTest();
|
||||
|
||||
private:
|
||||
struct m { // Map between native indices & code points.
|
||||
|
Loading…
Reference in New Issue
Block a user