ICU-4521 UText-based regex implementation

X-SVN-Rev: 27482
This commit is contained in:
Michael Grady 2010-02-03 02:59:35 +00:00
parent 7a93a3c3e2
commit 8216117f21
21 changed files with 9262 additions and 855 deletions

2
.gitattributes vendored
View File

@ -54,6 +54,8 @@ icu4c/source/data/in/nfkc.nrm -text
icu4c/source/data/in/nfkc_cf.nrm -text
icu4c/source/data/in/unorm.icu -text
icu4c/source/data/locales/pool.res -text
icu4c/source/i18n/regextxt.cpp -text
icu4c/source/i18n/regextxt.h -text
icu4c/source/samples/ucnv/data02.bin -text
icu4c/source/test/perf/README -text
icu4c/source/test/testdata/TestFont1.otf -text

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2009, International Business Machines
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -136,6 +136,7 @@
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#include "unicode/rep.h"
@ -674,6 +675,148 @@ utext_extract(UText *ut,
UErrorCode *status);
/**
* Compare two UTexts (binary order). The comparison begins at each source text's
* iteration position. The iteration position of each UText will be left following
* the last character compared.
*
* The comparison is done in code point order; unlike u_strCompare, you
* cannot choose to use code unit order. This is because the characters
* in a UText are accessed one code point at a time, and may not be from a UTF-16
* context.
*
* This function works with strings of different explicitly specified lengths,
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
* A length argument of -1 signifies that as much of the string should be used as
* is necessary to compare with the other string. If both length arguments are -1,
* the entire remaining portions of both strings are used.
*
* @param s1 First source string.
* @param length1 Length of first source string in UTF-32 code points,
* or a negative value for "no explicit length".
*
* @param s2 Second source string.
* @param length2 Length of second source string in UTF-32 code points,
* or a negative value for "no explicit length".
*
* @return <0 or 0 or >0 as usual for string comparisons
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL int32_t U_EXPORT2
utext_compare(UText *s1, int32_t length1,
UText *s2, int32_t length2);
/**
* Compare two UTexts (binary order). The comparison begins at each source text's
* iteration position. The iteration position of each UText will be left following
* the last character compared. This method differs from utext_compare in that
* it accepts native limits rather than lengths for each string.
*
* The comparison is done in code point order; unlike u_strCompare, you
* cannot choose to use code unit order. This is because the characters
* in a UText are accessed one code point at a time, and may not be from a UTF-16
* context.
*
* This function works with strings of different explicitly specified limits,
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
* A limit argument of -1 signifies that as much of the string should be used as
* is necessary to compare with the other string. If both limit arguments are -1,
* the entire remaining portions of both strings are used.
*
* @param s1 First source string.
* @param limit1 Native index in the first source string at which the comparison
* stops (exclusive: a character is only read while the current native
* index is below this limit), or a negative value for "no limit".
*
* @param s2 Second source string.
* @param limit2 Native index in the second source string at which the comparison
* stops (exclusive), or a negative value for "no limit".
*
* @return <0 or 0 or >0 as usual for string comparisons
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL int32_t U_EXPORT2
utext_compareNativeLimit(UText *s1, int64_t limit1,
UText *s2, int64_t limit2);
/**
* Compare two UTexts case-insensitively using full case folding. The comparison
* begins at each source text's iteration position. The iteration position of each
* UText will be left following the last character compared.
*
* The comparison is done in code point order; this is because the characters
* in a UText are accessed one code point at a time, and may not be from a UTF-16
* context.
*
* This function works with strings of different explicitly specified lengths,
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
* A length argument of -1 signifies that as much of the string should be used as
* is necessary to compare with the other string. If both length arguments are -1,
* the entire remaining portions of both strings are used.
*
* @param s1 First source string.
* @param length1 Length of first source string in UTF-32 code points,
* or a negative value for "no explicit length".
*
* @param s2 Second source string.
* @param length2 Length of second source string in UTF-32 code points,
* or a negative value for "no explicit length".
*
* @param options A bit set of options:
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
* Comparison in code point order with default case folding.
*
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
*
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @return <0 or 0 or >0 as usual for string comparisons
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL int32_t U_EXPORT2
utext_caseCompare(UText *s1, int32_t length1,
UText *s2, int32_t length2,
uint32_t options, UErrorCode *pErrorCode);
/**
* Compare two UTexts case-insensitively using full case folding. The comparison
* begins at each source text's iteration position. The iteration position of each
* UText will be left following the last character compared. This method differs from
* utext_caseCompare in that it accepts native limits rather than lengths for each
* string.
*
* The comparison is done in code point order; this is because the characters
* in a UText are accessed one code point at a time, and may not be from a UTF-16
* context.
*
* This function works with strings of different explicitly specified limits,
* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
* A limit argument of -1 signifies that as much of the string should be used as
* is necessary to compare with the other string. If both limit arguments are -1,
* the entire remaining portions of both strings are used.
*
* @param s1 First source string.
* @param limit1 Native index in the first source string at which the comparison
* stops (exclusive: a character is only read while the current native
* index is below this limit), or a negative value for "no limit".
*
* @param s2 Second source string.
* @param limit2 Native index in the second source string at which the comparison
* stops (exclusive), or a negative value for "no limit".
*
* @param options A bit set of options:
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
* Comparison in code point order with default case folding.
*
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
*
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @return <0 or 0 or >0 as usual for string comparisons
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL int32_t U_EXPORT2
utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
UText *s2, int64_t limit2,
uint32_t options, UErrorCode *pErrorCode);
/************************************************************************************
*
* #define inline versions of selected performance-critical text access functions
@ -689,6 +832,19 @@ utext_extract(UText *ut,
*
************************************************************************************/
/**
* inline version of utext_current32(), for performance-critical situations.
*
* Get the code point at the current iteration position of the UText.
* Returns U_SENTINEL (-1) if the position is at the end of the
* text.
*
* Fast path: when the current chunk offset lies inside the current chunk and
* the code unit found there is below 0xd800, that code unit is returned
* directly. In every other case the macro falls back to calling
* utext_current32().
*
* Note: the argument is expanded several times, so it must be an expression
* without side effects.
*
* @internal ICU 4.4 technology preview
*/
#define UTEXT_CURRENT32(ut) \
((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \
((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut))
/**
* inline version of utext_next32(), for performance-critical situations.
*
@ -1291,8 +1447,8 @@ struct UTextFuncs {
* (private) Spare function pointer
* @internal
*/
UTextClose *spare1;
/**
* (private) Spare function pointer
* @internal

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2009, International Business Machines
* Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -23,6 +23,7 @@
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "putilimp.h"
U_NAMESPACE_USE
@ -450,6 +451,361 @@ utext_equals(const UText *a, const UText *b) {
return TRUE;
}
U_CAPI int32_t U_EXPORT2
utext_compare(UText *s1, int32_t length1,
              UText *s2, int32_t length2) {
    /*
     * Binary (code point order) comparison of two UTexts, starting at each
     * text's current iteration position.
     *
     * length1/length2 count code points; a negative value means "no explicit
     * length" for that side. Returns <0, 0 or >0.
     */
    UChar32 cp1, cp2;

    /* Neither side has a length: plain strcmp-style walk to the end of text. */
    if (length1 < 0 && length2 < 0) {
        for (;;) {
            cp1 = UTEXT_NEXT32(s1);
            cp2 = UTEXT_NEXT32(s2);
            if (cp1 != cp2) {
                return (int32_t)cp1 - (int32_t)cp2;
            }
            if (cp1 == U_SENTINEL) {
                return 0;   /* both texts ended together */
            }
        }
    }

    /* At least one side is length-limited (memcmp/UnicodeString style). */
    const UBool open1 = (UBool)(length1 < 0);   /* s1 has no explicit length */
    const UBool open2 = (UBool)(length2 < 0);   /* s2 has no explicit length */

    while ((open1 || length1 > 0) && (open2 || length2 > 0)) {
        cp1 = UTEXT_NEXT32(s1);
        cp2 = UTEXT_NEXT32(s2);
        if (cp1 != cp2) {
            return (int32_t)cp1 - (int32_t)cp2;
        }
        if (cp1 == U_SENTINEL) {
            return 0;       /* both texts ended together */
        }
        /* Only explicit lengths are counted down. */
        if (!open1) {
            --length1;
        }
        if (!open2) {
            --length2;
        }
    }

    /*
     * The loop only ends normally when a length-limited side is used up.
     * An open side still carries its negative length, so "<= 0" is also true
     * for it and the result is "equal".
     */
    if (!open1 && length1 <= 0) {
        return (length2 <= 0) ? 0 : -1;
    }
    return (length1 <= 0) ? 0 : 1;
}
U_CAPI int32_t U_EXPORT2
utext_compareNativeLimit(UText *s1, int64_t limit1,
                         UText *s2, int64_t limit2) {
    /*
     * Binary (code point order) comparison of two UTexts, starting at each
     * text's current iteration position and bounded by native-index limits.
     *
     * A negative limit means "no limit" for that side. Returns <0, 0 or >0.
     */
    const UBool bounded1 = (UBool)(limit1 >= 0);
    const UBool bounded2 = (UBool)(limit2 >= 0);
    UChar32 cp1, cp2;

    /* Neither side is bounded: plain strcmp-style walk to the end of text. */
    if (!bounded1 && !bounded2) {
        for (;;) {
            cp1 = UTEXT_NEXT32(s1);
            cp2 = UTEXT_NEXT32(s2);
            if (cp1 != cp2) {
                return (int32_t)cp1 - (int32_t)cp2;
            }
            if (cp1 == U_SENTINEL) {
                return 0;
            }
        }
    }

    /* Positions are only tracked for bounded sides; unbounded ones stay at 0. */
    int64_t pos1 = 0;
    int64_t pos2 = 0;
    if (bounded1) {
        pos1 = UTEXT_GETNATIVEINDEX(s1);
    }
    if (bounded2) {
        pos2 = UTEXT_GETNATIVEINDEX(s2);
    }

    while ((!bounded1 || pos1 < limit1) && (!bounded2 || pos2 < limit2)) {
        cp1 = UTEXT_NEXT32(s1);
        cp2 = UTEXT_NEXT32(s2);
        if (cp1 != cp2) {
            return (int32_t)cp1 - (int32_t)cp2;
        }
        if (cp1 == U_SENTINEL) {
            return 0;
        }
        if (bounded1) {
            pos1 = UTEXT_GETNATIVEINDEX(s1);
        }
        if (bounded2) {
            pos2 = UTEXT_GETNATIVEINDEX(s2);
        }
    }

    /*
     * A bounded side reached its limit. For an unbounded side the position is
     * still 0 and the limit is negative, so "pos >= limit" holds and the
     * texts compare equal.
     */
    if (bounded1 && pos1 >= limit1) {
        return (pos2 >= limit2) ? 0 : -1;
    }
    return (pos1 >= limit1) ? 0 : 1;
}
U_CAPI int32_t U_EXPORT2
utext_caseCompare(UText *s1, int32_t length1,
                  UText *s2, int32_t length2,
                  uint32_t options, UErrorCode *pErrorCode) {
    /*
     * Case-insensitive comparison of two UTexts using full case folding,
     * starting at each text's current iteration position.
     *
     * length1/length2 count code points read from the texts; a negative value
     * means "no explicit length" for that side. options is passed through to
     * ucase_toFullFolding(). Returns <0, 0 or >0; returns 0 without comparing
     * when pErrorCode is NULL or already a failure, and sets
     * U_ILLEGAL_ARGUMENT_ERROR when either text is NULL.
     */
    const UCaseProps *csp;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* case folding buffers: pending folded code units for each side */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;

    /* current code points */
    UChar32 c1, c2;

    /* argument checking */
    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s1==NULL || s2==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    csp=ucase_getSingleton(pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* for variable-length strings: INT32_MIN marks "no explicit length" */
    if(length1 < 0) {
        length1 = INT32_MIN;
    }
    if (length2 < 0) {
        length2 = INT32_MIN;
    }

    /* initialize */
    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;

    /* comparison loop: runs while each side has pending folded units or input left */
    while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) &&
          (foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) {
        if(foldOffset1 < foldLength1) {
            /* still consuming the folded expansion of the previous code point */
            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
        } else {
            c1 = UTEXT_NEXT32(s1);
            if (c1 != U_SENTINEL) {
                /*
                 * The folding result is interpreted as: negative = keep c1 as is,
                 * 0..UCASE_MAX_STRING_LENGTH = length of the string at p,
                 * anything larger = a single folded code point.
                 */
                length = ucase_toFullFolding(csp, c1, &p, options);
                if(length >= 0) {
                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
                        u_memcpy(fold1, p, length);
                        foldOffset1 = 0;
                        foldLength1 = length;
                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
                    } else {
                        c1 = length;
                    }
                }
            }

            if(length1 != INT32_MIN) {
                length1 -= 1;
            }
        }

        if(foldOffset2 < foldLength2) {
            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
        } else {
            c2 = UTEXT_NEXT32(s2);
            if (c2 != U_SENTINEL) {
                length = ucase_toFullFolding(csp, c2, &p, options);
                if(length >= 0) {
                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
                        u_memcpy(fold2, p, length);
                        foldOffset2 = 0;
                        foldLength2 = length;
                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
                    } else {
                        c2 = length;
                    }
                }
            } else if(c1 == U_SENTINEL) {
                return 0;   // end of both strings at once
            }

            if(length2 != INT32_MIN) {
                length2 -= 1;
            }
        }

        if(c1 != c2) {
            return (int32_t)c1-(int32_t)c2;
        }
    }

    /* By now at least one of the strings is out of characters */
    /* Pending folded units count as remaining input for the final decision. */
    length1 += foldLength1 - foldOffset1;
    length2 += foldLength2 - foldOffset2;

    if(length1 <= 0 && length1 != INT32_MIN) {
        if(length2 <= 0) {
            return 0;
        } else {
            return -1;
        }
    } else {
        if (length1 <= 0) {
            return 0;
        } else {
            return 1;
        }
    }
}
U_CAPI int32_t U_EXPORT2
utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
                             UText *s2, int64_t limit2,
                             uint32_t options, UErrorCode *pErrorCode) {
    /*
     * Case-insensitive comparison of two UTexts using full case folding,
     * starting at each text's current iteration position and bounded by
     * native-index limits.
     *
     * A negative limit means "no limit" for that side. options is passed
     * through to ucase_toFullFolding(). Returns <0, 0 or >0; returns 0 without
     * comparing when pErrorCode is NULL or already a failure, and sets
     * U_ILLEGAL_ARGUMENT_ERROR when either text is NULL.
     */
    const UCaseProps *csp;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* case folding buffers: pending folded code units for each side */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;

    /* current code points */
    UChar32 c1, c2;

    /* native indexes into s1 and s2 (only tracked for bounded sides) */
    int64_t index1, index2;

    /* argument checking */
    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s1==NULL || s2==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    csp=ucase_getSingleton(pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
    index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);

    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;

    /* comparison loop: runs while each side has pending folded units or input left */
    while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) &&
          (foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) {
        if(foldOffset1 < foldLength1) {
            /* still consuming the folded expansion of the previous code point */
            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
        } else {
            c1 = UTEXT_NEXT32(s1);
            if (c1 != U_SENTINEL) {
                /*
                 * The folding result is interpreted as: negative = keep c1 as is,
                 * 0..UCASE_MAX_STRING_LENGTH = length of the string at p,
                 * anything larger = a single folded code point.
                 */
                length = ucase_toFullFolding(csp, c1, &p, options);
                if(length >= 0) {
                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
                        u_memcpy(fold1, p, length);
                        foldOffset1 = 0;
                        foldLength1 = length;
                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
                    } else {
                        c1 = length;
                    }
                }
            }

            if (limit1 >= 0) {
                index1 = UTEXT_GETNATIVEINDEX(s1);
            }
        }

        if(foldOffset2 < foldLength2) {
            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
        } else {
            c2 = UTEXT_NEXT32(s2);
            if (c2 != U_SENTINEL) {
                length = ucase_toFullFolding(csp, c2, &p, options);
                if(length >= 0) {
                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
                        u_memcpy(fold2, p, length);
                        foldOffset2 = 0;
                        foldLength2 = length;
                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
                    } else {
                        c2 = length;
                    }
                }
            } else if(c1 == U_SENTINEL) {
                return 0;   /* end of both strings at once */
            }

            if (limit2 >= 0) {
                index2 = UTEXT_GETNATIVEINDEX(s2);
            }
        }

        if(c1 != c2) {
            return (int32_t)c1-(int32_t)c2;
        }
    }

    /* By now at least one of the strings is out of characters */
    /* Pending folded units pull a bounded side's index back below its limit. */
    index1 -= foldLength1 - foldOffset1;
    index2 -= foldLength2 - foldOffset2;

    /*
     * An unlimited side (negative limit) is tested explicitly rather than by
     * comparing its adjusted index against the negative limit: with two or
     * more pending folded units that index drops below -1 and the comparison
     * would wrongly report a difference.
     */
    if(limit1 >= 0 && index1 >= limit1) {
        /* s1 is bounded and used up */
        if(limit2 < 0 || index2 >= limit2) {
            return 0;
        } else {
            return -1;
        }
    } else {
        /* s1 is unlimited or still has input, so s2 is the side that ran out */
        if(limit1 < 0) {
            return 0;
        } else {
            return 1;
        }
    }
}
U_CAPI UBool U_EXPORT2
utext_isWritable(const UText *ut)
{
@ -800,7 +1156,7 @@ shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
adjustPointer(dest, &dest->p, src);
adjustPointer(dest, &dest->q, src);
adjustPointer(dest, &dest->r, src);
adjustPointer(dest, (const void **)&dest->chunkContents, src);
adjustPointer(dest, (const void **)&dest->chunkContents, src);
return dest;
}
@ -932,7 +1288,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) {
if (ix>length) {
if (length>=0) {
ix=length;
} else if (ix>ut->c) {
} else if (ix>=ut->c) {
// Zero terminated string, and requested index is beyond
// the region that has already been scanned.
// Scan up to either the end of the string or to the
@ -1415,7 +1771,7 @@ utext_strFromUTF8(UChar *dest,
if(ch<0){
ch = 0xfffd;
}
if(ch<=0xFFFF){
if(U_IS_BMP(ch)){
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=UTF16_LEAD(ch);
@ -1438,7 +1794,7 @@ utext_strFromUTF8(UChar *dest,
if(ch<0){
ch = 0xfffd;
}
reqLength+=UTF_CHAR_LENGTH(ch);
reqLength+=U16_LENGTH(ch);
}
}
@ -1485,7 +1841,7 @@ utf8TextExtract(UText *ut,
int i;
if (start32 < ut->chunkNativeLimit) {
for (i=0; i<3; i++) {
if (U8_IS_LEAD(buf[start32]) || start32==0) {
if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
break;
}
start32--;
@ -1494,7 +1850,7 @@ utf8TextExtract(UText *ut,
if (limit32 < ut->chunkNativeLimit) {
for (i=0; i<3; i++) {
if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
break;
}
limit32--;
@ -1506,6 +1862,7 @@ utf8TextExtract(UText *ut,
utext_strFromUTF8(dest, destCapacity, &destLength,
(const char *)ut->context+start32, limit32-start32,
pErrorCode);
utf8TextAccess(ut, limit32, TRUE);
return destLength;
}
@ -1870,6 +2227,8 @@ repTextExtract(UText *ut,
}
UnicodeString buffer(dest, 0, destCapacity); // writable alias
rep->extractBetween(start32, limit32, buffer);
repTextAccess(ut, limit32, TRUE);
return u_terminateUChars(dest, destCapacity, length, status);
}
@ -2138,6 +2497,9 @@ unistrTextExtract(UText *t,
trimmedLength=destCapacity;
}
us->extract(start32, trimmedLength, dest);
t->chunkOffset = start32+trimmedLength;
} else {
t->chunkOffset = start32;
}
u_terminateUChars(dest, destCapacity, length, pErrorCode);
return length;
@ -2528,7 +2890,7 @@ ucstrTextExtract(UText *ut,
if (strLength>=0) {
// We have filled the destination buffer, and the string length is known.
// Cut the loop short. There is no need to scan string termination.
di = strLength;
di = limit32 - start32;
si = limit32;
break;
}
@ -2548,7 +2910,7 @@ ucstrTextExtract(UText *ut,
}
// Put iteration position at the point just following the extracted text
ut->chunkOffset = si;
ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
// Add a terminating NUL if space in the buffer permits,
// and set the error status as required.
@ -2754,21 +3116,26 @@ charIterTextExtract(UText *ut,
int32_t limit32 = pinIndex(limit, length);
int32_t desti = 0;
int32_t srci;
int32_t copyLimit;
CharacterIterator *ci = (CharacterIterator *)ut->context;
ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
srci = ci->getIndex();
copyLimit = srci;
while (srci<limit32) {
UChar32 c = ci->next32PostInc();
int32_t len = U16_LENGTH(c);
if (desti+len <= destCapacity) {
U16_APPEND_UNSAFE(dest, desti, c);
copyLimit = srci+len;
} else {
desti += len;
*status = U_BUFFER_OVERFLOW_ERROR;
}
srci += len;
}
charIterTextAccess(ut, copyLimit, TRUE);
u_terminateUChars(dest, destCapacity, desti, status);
return desti;

View File

@ -76,7 +76,7 @@ translit.o utrans.o esctrn.o unesctrn.o funcrepl.o strrepl.o tridpars.o \
cpdtrans.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
nultrans.o remtrans.o casetrn.o titletrn.o tolowtrn.o toupptrn.o anytrans.o \
name2uni.o uni2name.o nortrans.o quant.o transreg.o brktrans.o \
regexcmp.o rematch.o repattrn.o regexst.o udatpg.o uregex.o uregexc.o \
regexcmp.o rematch.o repattrn.o regexst.o regextxt.o udatpg.o uregex.o uregexc.o \
ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \

View File

@ -3602,6 +3602,14 @@
RelativePath=".\regexst.h"
>
</File>
<File
RelativePath=".\regextxt.cpp"
>
</File>
<File
RelativePath=".\regextxt.h"
>
</File>
<File
RelativePath=".\rematch.cpp"
>

View File

@ -1,7 +1,7 @@
//
// file: regexcmp.cpp
//
// Copyright (C) 2002-2009 International Business Machines Corporation and others.
// Copyright (C) 2002-2010 International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression compiler, which is responsible
@ -13,6 +13,7 @@
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
@ -21,6 +22,7 @@
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "util.h"
#include "putilimp.h"
#include "cmemory.h"
#include "cstring.h"
#include "uvectr32.h"
@ -33,6 +35,7 @@
// generated by a Perl script.
#include "regexcmp.h"
#include "regexst.h"
#include "regextxt.h"
@ -47,11 +50,13 @@ U_NAMESPACE_BEGIN
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
fParenStack(status), fSetStack(status), fSetOpStack(status)
{
// Lazy init of all shared global sets (needed for init()'s empty text)
RegexStaticSets::initGlobals(&status);
fStatus = &status;
fRXPat = rxp;
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
@ -97,6 +102,24 @@ void RegexCompile::compile(
const UnicodeString &pat, // Source pat to be compiled.
UParseError &pp, // Error position info
UErrorCode &e) // Error Code
{
UText patternText = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&patternText, &pat, &e);
if (U_SUCCESS(e)) {
compile(&patternText, pp, e);
utext_close(&patternText);
}
}
//
// compile, UText mode
// All the work is actually done here.
//
void RegexCompile::compile(
UText *pat, // Source pat to be compiled.
UParseError &pp, // Error position info
UErrorCode &e) // Error Code
{
fStatus = &e;
fParseErr = &pp;
@ -108,16 +131,16 @@ void RegexCompile::compile(
}
// There should be no pattern stuff in the RegexPattern object. They can not be reused.
U_ASSERT(fRXPat->fPattern.length() == 0);
U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
// Prepare the RegexPattern object to receive the compiled pattern.
fRXPat->fPattern = pat;
fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
// Initialize the pattern scanning state machine
fPatternLength = pat.length();
fPatternLength = utext_nativeLength(pat);
uint16_t state = 1;
const RegexTableEl *tableEl;
nextChar(fC); // Fetch the first char from the pattern string.
@ -250,34 +273,13 @@ void RegexCompile::compile(
// The pattern has now been read and processed, and the compiled code generated.
//
// Back-reference fixup
//
int32_t loc;
for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
int32_t op = fRXPat->fCompiledPat->elementAti(loc);
int32_t opType = URX_TYPE(op);
if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
int32_t where = URX_VAL(op);
if (where > fRXPat->fGroupMap->size()) {
error(U_REGEX_INVALID_BACK_REF);
break;
}
where = fRXPat->fGroupMap->elementAti(where-1);
op = URX_BUILD(opType, where);
fRXPat->fCompiledPat->setElementAt(op, loc);
}
}
//
// Compute the number of digits required for the largest capture group number.
//
fRXPat->fMaxCaptureDigits = 1;
int32_t n = 10;
for (;;) {
if (n > fRXPat->fGroupMap->size()) {
break;
}
int32_t groupCount = fRXPat->fGroupMap->size();
while (n <= groupCount) {
fRXPat->fMaxCaptureDigits++;
n *= 10;
}
@ -286,10 +288,15 @@ void RegexCompile::compile(
// The pattern's fFrameSize so far has accumulated the requirements for
// storage for capture parentheses, counters, etc. that are encountered
// in the pattern. Add space for the two variables that are always
// present in the saved state: the input string position and the
// position in the compiled pattern.
// present in the saved state: the input string position (int64_t) and
// the position in the compiled pattern.
//
fRXPat->fFrameSize+=2;
fRXPat->fFrameSize+=3;
//
// Optimization pass 1: NOPs, back-references, and case-folding
//
stripNOPs();
//
// Get bounds for the minimum and maximum length of a string that this
@ -299,10 +306,9 @@ void RegexCompile::compile(
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
//
// Optimization passes
// Optimization pass 2: match start type
//
matchStartType();
stripNOPs();
//
// Set up fast latin-1 range sets
@ -426,19 +432,19 @@ UBool RegexCompile::doParseActions(int32_t action)
// - NOP, which may later be replaced by a save-state if there
// is an '|' alternation within the parens.
//
// Each capture group gets three slots in the save stack frame:
// 0: Capture Group start position (in input string being matched.)
// 1: Capture Group end positino.
// 2: Start of Match-in-progress.
// Each capture group gets three double-width slots in the save stack frame:
// 0-1: Capture Group start position (in input string being matched.)
// 2-3: Capture Group end position.
// 4-5: Start of Match-in-progress.
// The first two locations are for a completed capture group, and are
// referred to by back references and the like.
// The third location stores the capture start position when an START_CAPTURE is
// encountered. This will be promoted to a completed capture when (and if) the corresponding
// END_CAPure is encountered.
// END_CAPTURE is encountered.
{
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
fRXPat->fFrameSize += 3;
int32_t varsLoc = fRXPat->fFrameSize; // Reserve five slots in match stack frame.
fRXPat->fFrameSize += 6;
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
fRXPat->fCompiledPat->addElement(cop, *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
@ -532,10 +538,10 @@ UBool RegexCompile::doParseActions(int32_t action)
// 8. code for parenthesized stuff.
// 9. LA_END
//
// Two data slots are reserved, for saving the stack ptr and the input position.
// Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
{
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 2;
fRXPat->fDataSize += 3;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@ -576,9 +582,10 @@ UBool RegexCompile::doParseActions(int32_t action)
// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
// 7. END_LA // Restore match region, in case look-ahead was using
// an alternate (transparent) region.
// Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
{
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 2;
fRXPat->fDataSize += 3;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@ -617,12 +624,12 @@ UBool RegexCompile::doParseActions(int32_t action)
// Allocate a block of matcher data, to contain (when running a match)
// 0: Stack ptr on entry
// 1: Input Index on entry
// 2: Start index of match current match attempt.
// 3: Original Input String len.
// 2-3: Start index of match current match attempt.
// 4-5: Original Input String len.
// Allocate data space
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 4;
fRXPat->fDataSize += 6;
// Emit URX_LB_START
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
@ -670,12 +677,12 @@ UBool RegexCompile::doParseActions(int32_t action)
// Allocate a block of matcher data, to contain (when running a match)
// 0: Stack ptr on entry
// 1: Input Index on entry
// 2: Start index of match current match attempt.
// 3: Original Input String len.
// 2-3: Start index of match current match attempt.
// 4-5: Original Input String len.
// Allocate data space
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 4;
fRXPat->fDataSize += 6;
// Emit URX_LB_START
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
@ -764,7 +771,7 @@ UBool RegexCompile::doParseActions(int32_t action)
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
frameLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
break;
@ -784,7 +791,7 @@ UBool RegexCompile::doParseActions(int32_t action)
}
fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
frameLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
break;
@ -801,7 +808,7 @@ UBool RegexCompile::doParseActions(int32_t action)
// Emit the code sequence that can handle it.
insertOp(topLoc);
frameLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc);
@ -907,7 +914,7 @@ UBool RegexCompile::doParseActions(int32_t action)
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
break;
@ -927,7 +934,7 @@ UBool RegexCompile::doParseActions(int32_t action)
}
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
break;
@ -945,7 +952,7 @@ UBool RegexCompile::doParseActions(int32_t action)
if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
insertOp(saveStateLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
fRXPat->fFrameSize += 2; // double-width index
int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
@ -1658,7 +1665,7 @@ UBool RegexCompile::doParseActions(int32_t action)
}
case doSetNegate:
case doSetNegate:
// Scanned a '^' at the start of a set.
// Push the negation operator onto the set op stack.
// A twist for case-insensitive matching:
@ -1770,17 +1777,12 @@ void RegexCompile::literalChar(UChar32 c) {
// First char of a string in the pattern.
// Emit a OneChar op into the compiled pattern.
emitONE_CHAR(c);
// Also add it to the string pool, in case we get a second adjacent literal
// and want to change form ONE_CHAR to STRING
// Mark that we might actually be starting a string here
fStringOpStart = fRXPat->fLiteralText.length();
fRXPat->fLiteralText.append(c);
return;
}
// We are adding onto an existing string
fRXPat->fLiteralText.append(c);
op = fRXPat->fCompiledPat->lastElementi();
opType = URX_TYPE(op);
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
@ -1797,10 +1799,12 @@ void RegexCompile::literalChar(UChar32 c) {
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
return;
}
// The most recently emitted op is a ONECHAR.
// We've now received another adjacent char. Change the ONECHAR op
// to a string op.
fRXPat->fLiteralText.append(URX_VAL(op));
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
op = URX_BUILD(URX_STRING_I, fStringOpStart);
} else {
@ -1811,7 +1815,10 @@ void RegexCompile::literalChar(UChar32 c) {
op = URX_BUILD(URX_STRING_LEN, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
// We are adding onto an existing string
fRXPat->fLiteralText.append(c);
// The pattern contains a URX_STRING / URX_STRING_LEN. Update the
// string length to reflect the new char we just added to the string.
stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
@ -1834,7 +1841,7 @@ void RegexCompile::emitONE_CHAR(UChar32 c) {
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
// We have a cased character, and are in case insensitive matching mode.
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
//c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now
op = URX_BUILD(URX_ONECHAR_I, c);
} else {
// Uncased char, or case sensitive match mode.
@ -2245,7 +2252,6 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
// ignored strings, that would be better.)
theSet->removeAllStrings();
int32_t setSize = theSet->size();
UChar32 firstSetChar = theSet->charAt(0);
switch (setSize) {
case 0:
@ -2261,7 +2267,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
// The set contains only a single code point. Put it into
// the compiled pattern as a single char operation rather
// than a set, and discard the set itself.
literalChar(firstSetChar);
literalChar(theSet->charAt(0));
delete theSet;
}
break;
@ -2472,7 +2478,7 @@ void RegexCompile::matchStartType() {
case URX_STO_INP_LOC:
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
case URX_LD_SP:
break;
@ -3378,6 +3384,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// code generation to provide locations that may be patched later.
// Many end up unneeded, and are removed by this function.
//
// In order to minimize the number of passes through the pattern,
// back-reference fixup is also performed here (adjusting
// back-reference operands to point to the correct frame offsets).
//
// In addition, case-insensitive character and string literals are
// now case-folded here, rather than when first parsed or at match
// time.
//
//------------------------------------------------------------------------------
void RegexCompile::stripNOPs() {
@ -3399,6 +3413,9 @@ void RegexCompile::stripNOPs() {
d++;
}
}
UnicodeString caseStringBuffer;
int32_t stringDelta = 0;
// Make a second pass over the code, removing the NOPs by moving following
// code up, and patching operands that refer to code locations that
@ -3432,12 +3449,69 @@ void RegexCompile::stripNOPs() {
break;
}
case URX_ONECHAR_I:
{
UChar32 c = URX_VAL(op);
if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
// We have a cased character to fold
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
op = URX_BUILD(URX_ONECHAR_I, c);
}
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
break;
}
case URX_STRING_I:
{
op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta);
src++;
int32_t lengthOp = fRXPat->fCompiledPat->elementAti(src);
caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp));
caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT);
int32_t newLen = caseStringBuffer.length();
if (newLen <= URX_VAL(lengthOp)) {
// don't shift if we don't have to, take the tiny memory hit of a smaller string
fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer);
} else {
// shift other strings over...at least UnicodeString handles this for us!
fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer);
stringDelta += newLen - URX_VAL(lengthOp);
}
lengthOp = URX_BUILD(URX_STRING_LEN, newLen);
fRXPat->fCompiledPat->setElementAt(op, dst);
fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1);
dst += 2;
break;
}
case URX_BACKREF:
case URX_BACKREF_I:
{
int32_t where = URX_VAL(op);
if (where > fRXPat->fGroupMap->size()) {
error(U_REGEX_INVALID_BACK_REF);
break;
}
where = fRXPat->fGroupMap->elementAti(where-1);
op = URX_BUILD(opType, where);
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
fRXPat->fNeedsAltInput = TRUE;
break;
}
case URX_STRING:
op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta);
// continue
case URX_RESERVED_OP:
case URX_RESERVED_OP_N:
case URX_BACKTRACK:
case URX_END:
case URX_ONECHAR:
case URX_STRING:
case URX_STRING_LEN:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
@ -3460,13 +3534,9 @@ void RegexCompile::stripNOPs() {
case URX_DOTANY_UNIX:
case URX_STO_SP:
case URX_LD_SP:
case URX_BACKREF:
case URX_STO_INP_LOC:
case URX_LA_START:
case URX_LA_END:
case URX_ONECHAR_I:
case URX_STRING_I:
case URX_BACKREF_I:
case URX_DOLLAR_M:
case URX_CARET_M:
case URX_CARET_M_UNIX:
@ -3509,15 +3579,15 @@ void RegexCompile::error(UErrorCode e) {
*fStatus = e;
fParseErr->line = fLineNum;
fParseErr->offset = fCharNum;
UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
// Fill in the context.
// Note: extractBetween() pins supplied indices to the string bounds.
uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
fParseErr->preContext, 0);
fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
fParseErr->postContext, 0);
utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
}
}
@ -3557,18 +3627,18 @@ static const UChar chLS = 0x2028; // Unicode Line Separator
//------------------------------------------------------------------------------
UChar32 RegexCompile::nextCharLL() {
UChar32 ch;
UnicodeString &pattern = fRXPat->fPattern;
if (fPeekChar != -1) {
ch = fPeekChar;
fPeekChar = -1;
return ch;
}
if (fPatternLength==0 || fNextIndex >= fPatternLength) {
return (UChar32)-1;
// assume we're already in the right place
ch = UTEXT_NEXT32(fRXPat->fPattern);
if (ch == U_SENTINEL) {
return ch;
}
ch = pattern.char32At(fNextIndex);
fNextIndex = pattern.moveIndex32(fNextIndex, 1);
if (ch == chCR ||
ch == chNEL ||
@ -3613,7 +3683,7 @@ UChar32 RegexCompile::peekCharLL() {
//------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {
fScanIndex = fNextIndex;
fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
c.fChar = nextCharLL();
c.fQuoted = FALSE;
@ -3670,8 +3740,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
// check for backslash escaped characters.
//
if (c.fChar == chBackSlash) {
int32_t startX = fNextIndex; // start and end positions of the
int32_t endX = fNextIndex; // sequence following the '\'
int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
//
// A '\' sequence that is handled by ICU's standard unescapeAt function.
@ -3680,19 +3749,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
//
nextCharLL(); // get & discard the peeked char.
c.fQuoted = TRUE;
c.fChar = fRXPat->fPattern.unescapeAt(endX);
if (startX == endX) {
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
int32_t endIndex = pos;
c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, fPatternLength, (void *)fRXPat->fPattern->chunkContents);
if (endIndex == pos) {
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
}
fCharNum += endIndex - pos;
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
} else {
int32_t offset = 0;
struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
if (offset == 0) {
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
} else if (context.lastOffset == offset) {
UTEXT_PREVIOUS32(fRXPat->fPattern);
} else if (context.lastOffset != offset-1) {
utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
}
fCharNum += offset;
}
fCharNum += endX - startX;
fNextIndex = endX;
}
else if (peekCharLL() == chDigit0) {
// Octal Escape, using Java Regexp Conventions
// which are \0 followed by 1-3 octal digits.
// Different from ICU Unescape handling of Octal, which does not
// require the leading 0.
// Java also has the convention of only consuning 2 octal digits if
// Java also has the convention of only consuming 2 octal digits if
// the three digit number would be > 0xff
//
c.fChar = 0;
@ -3873,13 +3962,13 @@ UnicodeSet *RegexCompile::scanPosixProp() {
// Save the scanner state.
// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
int32_t savedScanIndex = fScanIndex;
int32_t savedNextIndex = fNextIndex;
int64_t savedScanIndex = fScanIndex;
int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
UBool savedQuoteMode = fQuoteMode;
UBool savedInBackslashQuote = fInBackslashQuote;
UBool savedEOLComments = fEOLComments;
int32_t savedLineNum = fLineNum;
int32_t savedCharNum = fCharNum;
int64_t savedLineNum = fLineNum;
int64_t savedCharNum = fCharNum;
UChar32 savedLastChar = fLastChar;
UChar32 savedPeekChar = fPeekChar;
RegexPatternChar savedfC = fC;
@ -3926,7 +4015,6 @@ UnicodeSet *RegexCompile::scanPosixProp() {
// The main scanner will retry the input as a normal set expression,
// not a [:Property:] expression.
fScanIndex = savedScanIndex;
fNextIndex = savedNextIndex;
fQuoteMode = savedQuoteMode;
fInBackslashQuote = savedInBackslashQuote;
fEOLComments = savedEOLComments;
@ -3935,6 +4023,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
fLastChar = savedLastChar;
fPeekChar = savedPeekChar;
fC = savedfC;
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
}
return uset;
}

View File

@ -1,7 +1,7 @@
//
// regexcmp.h
//
// Copyright (C) 2002-2008, International Business Machines Corporation and others.
// Copyright (C) 2002-2010, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
@ -54,7 +54,8 @@ public:
RegexCompile(RegexPattern *rp, UErrorCode &e);
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
void compile(UText *pat, UParseError &pp, UErrorCode &e);
virtual ~RegexCompile();
@ -102,7 +103,7 @@ private:
void fixLiterals(UBool split=FALSE); // Fix literal strings.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code,
void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code,
// taking case mode into account.
int32_t minMatchLength(int32_t start,
int32_t end);
@ -124,16 +125,14 @@ private:
//
// Data associated with low level character scanning
//
int32_t fScanIndex; // Index of current character being processed
int64_t fScanIndex; // Index of current character being processed
// in the rule input string.
int32_t fNextIndex; // Index of the next character, which
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
int32_t fLineNum; // Line number in input file.
int32_t fCharNum; // Char position within the line.
int64_t fLineNum; // Line number in input file.
int64_t fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
@ -168,8 +167,8 @@ private:
// holds the start index within RegexPattern.
// fLiteralText where the string is being stored.
int32_t fPatternLength; // Length of the input pattern string.
int64_t fPatternLength; // Length of the input pattern string.
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
// needing fixup, followed by negative value. The
@ -196,7 +195,7 @@ private:
// -1 for the upper interval value means none
// was specified (unlimited occurrences.)
int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
// pattern, valid while remainder of name is
// scanned.
@ -208,7 +207,6 @@ private:
UChar32 fLastSetLiteral; // The last single code point added to a set.
// needed when "-y" is scanned, and we need
// to turn "x-y" into a range.
};
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]

View File

@ -1,5 +1,5 @@
//
// Copyright (C) 2002-2007 International Business Machines Corporation
// Copyright (C) 2002-2010 International Business Machines Corporation
// and others. All rights reserved.
//
// file: regeximp.h
@ -279,11 +279,12 @@ enum {
// Match Engine State Stack Frame Layout.
//
struct REStackFrame {
    // Position of next character in the input string.  Kept as a 64-bit
    // value so that it can hold a UText native index; it must be declared
    // exactly once (a second, 32-bit declaration of the same member is a
    // redeclaration error).
    int64_t fInputIdx;

    // Position of next Op in the compiled pattern.
    int32_t fPatIdx;

    // Extra state, for capture group start/ends
    //   atomic parentheses, repeat counts, etc.
    //   Locations assigned at pattern compile time.
    //   Note that this will likely end up longer than 64 bits.
    int32_t fExtra[2];
};
//
@ -307,7 +308,6 @@ enum StartOfMatch {
(v)==START_STRING? "START_STRING" : \
"ILLEGAL")
//
// 8 bit set, to fast-path latin-1 set membership tests.
//
@ -347,7 +347,6 @@ inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
uprv_memcpy(d, s.d, sizeof(d));
}
U_NAMESPACE_END
#endif

View File

@ -1,7 +1,7 @@
//
// regexst.cpp
//
// Copyright (C) 2004-2008, International Business Machines Corporation and others.
// Copyright (C) 2004-2010, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains class RegexStaticSets
@ -214,6 +214,10 @@ fRuleDigitsAlias(NULL)
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
fRuleSets[i].compact();
}
// Finally, initialize an empty string for utility purposes
fEmptyText = utext_openUChars(NULL, NULL, 0, status);
return; // If we reached this point, everything is fine so just exit
ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
@ -233,6 +237,8 @@ RegexStaticSets::~RegexStaticSets() {
fPropSets[i] = NULL;
}
fRuleDigitsAlias = NULL;
utext_close(fEmptyText);
}

View File

@ -1,7 +1,7 @@
//
// regexst.h
//
// Copyright (C) 2003-2008, International Business Machines Corporation and others.
// Copyright (C) 2003-2010, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexStaticSets
@ -19,6 +19,7 @@
#define REGEXST_H
#include "unicode/utypes.h"
#include "unicode/utext.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "regeximp.h"
@ -45,7 +46,7 @@ public:
UnicodeSet fUnescapeCharSet; // Set of chars handled by unescape when
// encountered with a \ in a pattern.
UnicodeSet *fRuleDigitsAlias;
UnicodeString fEmptyString; // An empty string, to be used when a matcher
UText *fEmptyText; // An empty string, to be used when a matcher
// is created with no input.
};

View File

@ -0,0 +1,45 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2008-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//
// file: regextxt.cpp
//
// This file contains utility code for supporting UText in the regular expression engine.
//
#include "regextxt.h"
U_NAMESPACE_BEGIN
//
//  uregex_utext_unescape_charAt
//
//     Character-fetch callback handed to u_unescapeAt() when the pattern lives
//     in a UText.  `ct` is a URegexUTextUnescapeCharContext; its lastOffset
//     field records the offset most recently served, so that sequential
//     requests can be answered by stepping the UText iterator instead of
//     repositioning it.
//
//     Returns the code unit at `offset`, or 0 when the code point found there
//     is not in the BMP.
//
U_CFUNC UChar U_CALLCONV
uregex_utext_unescape_charAt(int32_t offset, void *ct) {
    struct URegexUTextUnescapeCharContext *state = (struct URegexUTextUnescapeCharContext *)ct;
    UText *text = state->text;
    UChar32 cp;

    if (offset == state->lastOffset) {
        // Same position as last time: back up one code point to re-read it,
        // then step forward again so the iterator ends where it started.
        cp = UTEXT_PREVIOUS32(text);
        UTEXT_NEXT32(text);
    } else if (offset == state->lastOffset + 1) {
        // The common, strictly sequential case.
        cp = UTEXT_NEXT32(text);
        state->lastOffset++;
    } else {
        // Arbitrary jump: move the iterator by the distance between the
        // requested offset and the position just past the last one served.
        utext_moveIndex32(text, offset - state->lastOffset - 1);
        cp = UTEXT_NEXT32(text);
        state->lastOffset = offset;
    }

    // !!!: Doesn't handle characters outside BMP
    return U_IS_BMP(cp) ? (UChar)cp : 0;
}
//
//  uregex_ucstr_unescape_charAt
//
//     Character-fetch callback for u_unescapeAt() when the pattern is
//     available as a flat UChar array: `context` is that array, and the
//     result is simply the code unit stored at `offset`.
//
U_CFUNC UChar U_CALLCONV
uregex_ucstr_unescape_charAt(int32_t offset, void *context) {
    const UChar *units = (const UChar *)context;
    return units[offset];
}
U_NAMESPACE_END

View File

@ -0,0 +1,48 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2008-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//
// file: regextxt.h
//
// This file contains utility code for supporting UText in the regular expression engine.
//
// This class is internal to the regular expression implementation.
// For the public Regular Expression API, see the file "unicode/regex.h"
//
#ifndef _REGEXTXT_H
#define _REGEXTXT_H
#include <unicode/utypes.h>
#include <unicode/utext.h>
U_NAMESPACE_BEGIN
#define UTEXT_USES_U16(ut) (NULL==((ut)->pFuncs->mapNativeIndexToUTF16))
#if 0
#define REGEX_DISABLE_CHUNK_MODE 1
#endif
#ifdef REGEX_DISABLE_CHUNK_MODE
# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) (FALSE)
#else
# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) ((0==((ut)->chunkNativeStart))&&((len)==((ut)->chunkNativeLimit))&&((len)==((ut)->nativeIndexingLimit)))
#endif
// Context passed (as void *) to uregex_utext_unescape_charAt().
struct URegexUTextUnescapeCharContext {
// The text the callback reads characters from.
UText *text;
// Offset most recently requested from the callback;
// U_REGEX_UTEXT_UNESCAPE_CONTEXT() starts it at -1.
int32_t lastOffset;
};
#define U_REGEX_UTEXT_UNESCAPE_CONTEXT(text) { (text), -1 }
U_CFUNC UChar U_CALLCONV
uregex_utext_unescape_charAt(int32_t offset, void * /* struct URegexUTextUnescapeCharContext* */ context);
U_CFUNC UChar U_CALLCONV
uregex_ucstr_unescape_charAt(int32_t offset, void * /* UChar* */ context);
U_NAMESPACE_END
#endif

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@
//
/*
***************************************************************************
* Copyright (C) 2002-2009 International Business Machines Corporation *
* Copyright (C) 2002-2010 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
@ -29,11 +29,11 @@ U_NAMESPACE_BEGIN
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern() {
// Init all of this instance's data.
init();
UErrorCode status = U_ZERO_ERROR;
u_init(&status);
// Lazy init of all shared global sets.
RegexStaticSets::initGlobals(&fDeferredStatus);
// Init all of this instances data.
init();
}
@ -52,7 +52,7 @@ RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
//--------------------------------------------------------------------------
//
// Assignmenet Operator
// Assignment Operator
//
//--------------------------------------------------------------------------
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
@ -68,7 +68,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
init();
// Copy simple fields
fPattern = other.fPattern;
fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
fFlags = other.fFlags;
fLiteralText = other.fLiteralText;
fDeferredStatus = other.fDeferredStatus;
@ -85,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
*fInitialChars = *other.fInitialChars;
fInitialChar = other.fInitialChar;
*fInitialChars8 = *other.fInitialChars8;
fNeedsAltInput = other.fNeedsAltInput;
// Copy the pattern. It's just values, nothing deep to copy.
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
@ -126,7 +127,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
//
//--------------------------------------------------------------------------
void RegexPattern::init() {
fPattern.remove();
fFlags = 0;
fCompiledPat = 0;
fLiteralText.remove();
@ -146,7 +146,9 @@ void RegexPattern::init() {
fInitialChars = NULL;
fInitialChar = 0;
fInitialChars8 = NULL;
fNeedsAltInput = FALSE;
fPattern = NULL; // will be set later
fCompiledPat = new UVector32(fDeferredStatus);
fGroupMap = new UVector32(fDeferredStatus);
fSets = new UVector(fDeferredStatus);
@ -192,6 +194,9 @@ void RegexPattern::zap() {
fInitialChars = NULL;
delete fInitialChars8;
fInitialChars8 = NULL;
if (fPattern != NULL) {
utext_close(fPattern);
}
}
@ -220,13 +225,27 @@ RegexPattern *RegexPattern::clone() const {
//
// operator == (comparison) Consider two patterns to be == if the
// pattern strings and the flags are the same.
// Note that pattern strings with the same
// characters can still be considered different.
//
//--------------------------------------------------------------------------
UBool RegexPattern::operator ==(const RegexPattern &other) const {
    // Cheap scalar state first: differing flags or deferred status always
    // means "not equal", whatever the pattern text is.
    if (this->fFlags != other.fFlags || this->fDeferredStatus != other.fDeferredStatus) {
        return FALSE;
    }

    // fPattern is a UText pointer, so comparing the pointers only tests
    // identity.  Identity (including both being NULL) is sufficient for
    // equality, and must be handled here: running the content comparison
    // below with both arguments being the same UText would advance one
    // shared iteration position from two sides.
    if (this->fPattern == other.fPattern) {
        return TRUE;
    }

    // Exactly one side has no pattern text.
    if (this->fPattern == NULL || other.fPattern == NULL) {
        return FALSE;
    }

    // The comparison starts at each text's current iteration position, so
    // rewind both before comparing the contents.
    UTEXT_SETNATIVEINDEX(this->fPattern, 0);
    UTEXT_SETNATIVEINDEX(other.fPattern, 0);
    return utext_equals(this->fPattern, other.fPattern);
}
//---------------------------------------------------------------------
@ -240,7 +259,57 @@ RegexPattern::compile(const UnicodeString &regex,
UParseError &pe,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return NULL;
}
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
if ((flags & ~allFlags) != 0) {
status = U_REGEX_INVALID_FLAG;
return NULL;
}
if ((flags & UREGEX_CANON_EQ) != 0) {
status = U_REGEX_UNIMPLEMENTED;
return NULL;
}
RegexPattern *This = new RegexPattern;
if (This == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
if (U_FAILURE(This->fDeferredStatus)) {
status = This->fDeferredStatus;
delete This;
return NULL;
}
This->fFlags = flags;
RegexCompile compiler(This, status);
compiler.compile(regex, pe, status);
if (U_FAILURE(status)) {
delete This;
This = NULL;
}
return This;
}
//
// compile, UText mode
//
RegexPattern * U_EXPORT2
RegexPattern::compile(UText *regex,
uint32_t flags,
UParseError &pe,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return NULL;
}
@ -294,20 +363,43 @@ RegexPattern::compile(const UnicodeString &regex,
}
//
// compile with default flags, UText mode
//
RegexPattern * U_EXPORT2
RegexPattern::compile(UText *regex, UParseError &pe, UErrorCode &err)
{
    // No match-mode flags were supplied: forward to the flag-taking
    // overload with every flag cleared.
    const uint32_t noFlags = 0;
    return compile(regex, noFlags, pe, err);
}
//
// compile with no UParseErr parameter.
//
RegexPattern * U_EXPORT2
RegexPattern::compile(const UnicodeString &regex,
                      uint32_t flags,
                      UErrorCode &err)
{
    // The caller does not want parse-error details; collect them in a
    // local that is dropped on return.
    UParseError pe;
    return compile(regex, flags, pe, err);
}
//
// compile with no UParseErr parameter, UText mode
//
RegexPattern * U_EXPORT2
RegexPattern::compile(UText *regex, uint32_t flags, UErrorCode &err)
{
    // The caller does not want parse-error details; collect them in a
    // local that is dropped on return.
    UParseError discardedParseError;
    return compile(regex, flags, discardedParseError, err);
}
//---------------------------------------------------------------------
//
@ -327,8 +419,21 @@ uint32_t RegexPattern::flags() const {
RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode &status) const {
    RegexMatcher *retMatcher = matcher(status);
    // matcher(status) may hand back NULL, so the new matcher must not be
    // touched (not even to record the status) before this check.
    if (retMatcher != NULL) {
        retMatcher->fDeferredStatus = status;
        retMatcher->reset(input);
    }
    return retMatcher;
}
//
// matcher, UText mode
//
RegexMatcher *RegexPattern::matcher(UText *input,
UErrorCode &status) const {
RegexMatcher *retMatcher = matcher(status);
if (retMatcher != NULL) {
retMatcher->fDeferredStatus = status;
retMatcher->reset(input);
}
return retMatcher;
@ -399,6 +504,31 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
}
//
// matches, UText mode
//
UBool U_EXPORT2 RegexPattern::matches(UText *regex,
                                      UText *input,
                                      UParseError &pe,
                                      UErrorCode &status) {
    if (U_FAILURE(status)) {return FALSE;}

    // Compile the pattern, match it against the whole input, and throw both
    // helper objects away again.  Each step can fail and yield NULL, in which
    // case the remaining steps are skipped and FALSE is returned; the failure
    // itself is reported through status by the step that failed.
    UBool         retVal  = FALSE;
    RegexMatcher *matcher = NULL;
    RegexPattern *pat     = RegexPattern::compile(regex, 0, pe, status);

    if (pat != NULL) {
        matcher = pat->matcher(input, status);
    }
    if (matcher != NULL) {
        retVal = matcher->matches(status);
    }

    delete matcher;
    delete pat;
    return retVal;
}
//---------------------------------------------------------------------
@ -407,12 +537,43 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
//
//---------------------------------------------------------------------
UnicodeString RegexPattern::pattern() const {
    // fPattern is a UText; its contents have to be extracted into a
    // UnicodeString rather than returned directly.
    if (fPattern == NULL) {
        return UnicodeString();
    }

    UErrorCode status = U_ZERO_ERROR;
    int64_t nativeLen = utext_nativeLength(fPattern);

    // Preflight with an empty buffer to learn the UTF-16 length.  The
    // buffer overflow error this reports is expected and ignored.
    int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status);

    UnicodeString result;
    UChar *resultChars = result.getBuffer(len16);
    if (resultChars == NULL) {
        // No writable buffer could be obtained; return the string as it is.
        return result;
    }

    // Real extraction.  The buffer holds exactly len16 units, so the
    // "unterminated" warning is expected and ignored.
    status = U_ZERO_ERROR;
    utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status);
    result.releaseBuffer(len16);
    return result;
}
//---------------------------------------------------------------------
//
// patternText
//
//---------------------------------------------------------------------
UText *RegexPattern::patternText() const {
    if (fPattern != NULL) {
        return fPattern;
    }

    // No pattern text has been set: hand out the shared empty text owned by
    // the regex static sets, making sure those have been initialized first.
    UErrorCode status = U_ZERO_ERROR;
    RegexStaticSets::initGlobals(&status);
    if (RegexStaticSets::gStaticSets == NULL) {
        // Initialization failed and there is nothing to dereference.
        return NULL;
    }
    return RegexStaticSets::gStaticSets->fEmptyText;
}
//---------------------------------------------------------------------
//
// split
@ -421,7 +582,28 @@ UnicodeString RegexPattern::pattern() const {
int32_t RegexPattern::split(const UnicodeString &input,
                            UnicodeString dest[],
                            int32_t destCapacity,
                            UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return 0;
    }

    // The actual splitting is done by a temporary matcher built on this
    // pattern.
    RegexMatcher m(this);
    int32_t r = 0;
    // Check m's status to make sure all is ok.
    if (U_SUCCESS(m.fDeferredStatus)) {
        r = m.split(input, dest, destCapacity, status);
    }
    return r;
}
//
// split, UText mode
//
int32_t RegexPattern::split(UText *input,
UText *dest[],
int32_t destCapacity,
UErrorCode &status) const
{
if (U_FAILURE(status)) {
return 0;
@ -572,17 +754,24 @@ RegexPatternDump(const RegexPattern *This) {
int i;
REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
for (i=0; i<This->fPattern.length(); i++) {
REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
UChar32 c = utext_next32From(This->fPattern, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
c = UTEXT_NEXT32(This->fPattern);
}
REGEX_DUMP_DEBUG_PRINTF(("\n"));
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
if (This->fStartType == START_STRING) {
REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates.
}
REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
} else if (This->fStartType == START_SET) {
int32_t numSetChars = This->fInitialChars->size();

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: regex.h
@ -48,6 +48,7 @@
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/utext.h"
#include "unicode/parseerr.h"
#include "unicode/uregex.h"
@ -187,6 +188,35 @@ public:
UParseError &pe,
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
* object. These compile methods, rather than the constructors, are the usual
* way that RegexPattern objects are created.
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* <p>All pattern match mode flags are set to their default values.</p>
*
* <p>Note that it is often more convenient to construct a RegexMatcher directly
* from a pattern string rather than separately compiling the pattern and
* then creating a RegexMatcher object from the pattern.</p>
*
* @param regex The regular expression to be compiled.
* @param pe Receives the position (line and column numbers) of any error
* within the regular expression.
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
UParseError &pe,
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
* object using the specified match mode flags. These compile methods,
@ -204,7 +234,7 @@ public:
*
* @param regex The regular expression to be compiled.
* @param flags The match mode flags to be used.
* @param pe Receives the position (line and column nubers) of any error
* @param pe Receives the position (line and column numbers) of any error
* within the regular expression.)
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
@ -215,7 +245,37 @@ public:
uint32_t flags,
UParseError &pe,
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
* object using the specified match mode flags. These compile methods,
* rather than the constructors, are the usual way that RegexPattern objects
* are created.
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* <p>Note that it is often more convenient to construct a RegexMatcher directly
* from a pattern string rather than separately compiling the pattern and
* then creating a RegexMatcher object from the pattern.</p>
*
* @param regex The regular expression to be compiled.
* @param flags The match mode flags to be used.
* @param pe Receives the position (line and column numbers) of any error
* within the regular expression.)
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
uint32_t flags,
UParseError &pe,
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
@ -244,6 +304,33 @@ public:
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
* object using the specified match mode flags. These compile methods,
* rather than the constructors, are the usual way that RegexPattern objects
* are created.
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* <p>Note that it is often more convenient to construct a RegexMatcher directly
* from a pattern string rather than separately compiling the pattern and
* then creating a RegexMatcher object from the pattern.</p>
*
* @param regex The regular expression to be compiled.
* @param flags The match mode flags to be used.
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
uint32_t flags,
UErrorCode &status);
/**
* Get the match mode flags that were used when compiling this pattern.
* @return the match mode flags
@ -270,6 +357,27 @@ public:
*/
virtual RegexMatcher *matcher(const UnicodeString &input,
UErrorCode &status) const;
/**
* Creates a RegexMatcher that will match the given input against this pattern. The
* RegexMatcher can then be used to perform match, find or replace operations
* on the input. Note that a RegexPattern object must not be deleted while
* RegexMatchers created from it still exist and might possibly be used again.
* <p>
* The matcher will make a shallow clone of the supplied input text, and all regexp
* pattern matching operations happen on this clone. While read-only operations on
* the supplied text are permitted, it is critical that the underlying string not be
* altered or deleted before use by the regular expression operations is complete.
*
* @param input The input text to which the regular expression will be applied.
* @param status A reference to a UErrorCode to receive any errors.
* @return A RegexMatcher object for this pattern and input.
*
* @internal ICU 4.4 technology preview
*/
virtual RegexMatcher *matcher(UText *input,
UErrorCode &status) const;
private:
/**
@ -280,6 +388,8 @@ private:
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
* using one of the aliasing constructors, such as
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
* or in a UText, using
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
*
* @internal
*/
@ -318,15 +428,52 @@ public:
*/
static UBool U_EXPORT2 matches(const UnicodeString &regex,
const UnicodeString &input,
UParseError &pe,
UErrorCode &status);
/**
* Test whether a string matches a regular expression. This convenience function
* both compiles the regular expression and applies it in a single operation.
* Note that if the same pattern needs to be applied repeatedly, this method will be
* less efficient than creating and reusing a RegexMatcher object.
*
* @param regex The regular expression
* @param input The string data to be matched
* @param pe Receives the position of any syntax errors within the regular expression
* @param status A reference to a UErrorCode to receive any errors.
* @return True if the regular expression exactly matches the full input string.
*
* @internal ICU 4.4 technology preview
*/
static UBool U_EXPORT2 matches(UText *regex,
UText *input,
UParseError &pe,
UErrorCode &status);
/**
* Returns the regular expression from which this pattern was compiled.
* @stable ICU 2.4
* Returns the regular expression from which this pattern was compiled. This method will work
* even if the pattern was compiled from a UText.
*
* Note: If the pattern was originally compiled from a UText, and that UText was modified,
* the returned string may no longer reflect the RegexPattern object.
* @stable ICU 2.4
*/
virtual UnicodeString pattern() const;
/**
* Returns the regular expression from which this pattern was compiled. This method will work
* even if the pattern was compiled from a UnicodeString.
*
* Note: This is the original input, not a clone. If the pattern was originally compiled from a
* UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
* object.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *patternText() const;
/**
@ -360,6 +507,37 @@ public:
UErrorCode &status) const;
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
* For the best performance on split() operations,
* <code>RegexMatcher::split</code> is preferable to this function
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object)
* @param dest An array of mutable UText structs to receive the results of the split.
* If a field is NULL, a new UText is allocated to contain the results for
* that field. This new UText is not guaranteed to be mutable.
* @param destCapacity The number of elements in the destination array.
* If the number of fields found is less than destCapacity, the
* extra strings in the destination array are not altered.
* If the number of destination strings is less than the number
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
*/
virtual int32_t split(UText *input,
UText *dest[],
int32_t destCapacity,
UErrorCode &status) const;
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
@ -378,7 +556,7 @@ private:
//
// Implementation Data
//
UnicodeString fPattern; // The original pattern string.
UText *fPattern; // The original pattern string.
uint32_t fFlags; // The flags used when compiling the pattern.
//
UVector32 *fCompiledPat; // The compiled pattern p-code.
@ -396,7 +574,7 @@ private:
// >= this value. For some patterns, this calculated
// value may be less than the true shortest
// possible match.
int32_t fFrameSize; // Size of a state stack frame in the
// execution engine.
@ -421,6 +599,7 @@ private:
UnicodeSet *fInitialChars;
UChar32 fInitialChar;
Regex8BitSet *fInitialChars8;
UBool fNeedsAltInput;
friend class RegexCompile;
friend class RegexMatcher;
@ -468,6 +647,23 @@ public:
*/
RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
/**
* Construct a RegexMatcher for a regular expression.
* This is a convenience method that avoids the need to explicitly create
* a RegexPattern object. Note that if several RegexMatchers need to be
* created for the same expression, it will be more efficient to
* separately create and cache a RegexPattern object, and use
* its matcher() method to create the RegexMatcher objects.
*
* @param regexp The regular expression to be compiled.
* @param flags Regular expression options, such as case insensitive matching.
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
*
* @internal ICU 4.4 technology preview
*/
RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
/**
* Construct a RegexMatcher for a regular expression.
* This is a convenience method that avoids the need to explicitly create
@ -492,6 +688,30 @@ public:
RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
uint32_t flags, UErrorCode &status);
/**
* Construct a RegexMatcher for a regular expression.
* This is a convenience method that avoids the need to explicitly create
* a RegexPattern object. Note that if several RegexMatchers need to be
* created for the same expression, it will be more efficient to
* separately create and cache a RegexPattern object, and use
* its matcher() method to create the RegexMatcher objects.
* <p>
* The matcher will make a shallow clone of the supplied input text, and all regexp
* pattern matching operations happen on this clone. While read-only operations on
* the supplied text are permitted, it is critical that the underlying string not be
* altered or deleted before use by the regular expression operations is complete.
*
* @param regexp The Regular Expression to be compiled.
* @param input The string to match. The matcher retains a shallow clone of the text.
* @param flags Regular expression options, such as case insensitive matching.
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
*
* @internal ICU 4.4 technology preview
*/
RegexMatcher(UText *regexp, UText *input,
uint32_t flags, UErrorCode &status);
private:
/**
* Cause a compilation error if an application accidentally attempts to
@ -501,6 +721,8 @@ private:
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
* using one of the aliasing constructors, such as
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
* or in a UText, using
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
*
* @internal
*/
@ -525,6 +747,7 @@ public:
*/
virtual UBool matches(UErrorCode &status);
/**
* Resets the matcher, then attempts to match the input beginning
* at the specified startIndex, and extending to the end of the input.
@ -538,8 +761,6 @@ public:
virtual UBool matches(int32_t startIndex, UErrorCode &status);
/**
* Attempts to match the input string, starting from the beginning of the region,
* against the pattern. Like the matches() method, this function
@ -571,6 +792,7 @@ public:
*/
virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
@ -610,6 +832,22 @@ public:
virtual UnicodeString group(UErrorCode &status) const;
/**
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param dest A mutable UText in which the matching text is placed.
* If NULL, a new UText will be created (which may not be mutable).
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return A string containing the matched input text. If a pre-allocated UText
* was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *group(UText *dest, UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
@ -625,6 +863,24 @@ public:
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
*
* @param groupNum the capture group number
* @param dest A mutable UText in which the matching text is placed.
* If NULL, a new UText will be created (which may not be mutable).
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return A string containing the matched input text. If a pre-allocated UText
* was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
@ -726,11 +982,31 @@ public:
* Because no copy of the string is made, it is essential that the
* caller not delete the string until after regexp operations on it
* are done.
* Note that while a reset on the matcher with an input string that is then
* modified across/during matcher operations may be supported currently for UnicodeString,
* this was not originally intended behavior, and support for this is not guaranteed
* in upcoming versions of ICU.
* @return this RegexMatcher.
* @stable ICU 2.4
*/
virtual RegexMatcher &reset(const UnicodeString &input);
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* @param input The new string on which subsequent pattern matches will operate.
* The matcher makes a shallow clone of the given text; ownership of the
* original string remains with the caller. Because no deep copy of the
* text is made, it is essential that the caller not modify the string
* until after regexp operations on it are done.
* @return this RegexMatcher.
*
* @internal ICU 4.4 technology preview
*/
virtual RegexMatcher &reset(UText *input);
private:
/**
* Cause a compilation error if an application accidentally attempts to
@ -740,6 +1016,8 @@ private:
* To efficiently work with UChar *strings, wrap the data in a UnicodeString
* using one of the aliasing constructors, such as
* <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
* or in a UText, using
* <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
*
* @internal
*/
@ -747,13 +1025,34 @@ private:
public:
/**
* Returns the input string being matched. The returned string is not a copy,
* but the live input string. It should not be altered or deleted.
* Returns the input string being matched. Ownership of the string belongs to
* the matcher; it should not be altered or deleted. This method will work even if the input
* was originally supplied as a UText.
* @return the input string
* @stable ICU 2.4
*/
virtual const UnicodeString &input() const;
/**
* Returns the input string being matched. This is the live input text; it should not be
* altered or deleted. This method will work even if the input was originally supplied as
* a UnicodeString.
* @return the input text
*
* @internal ICU 4.4 technology preview
*/
virtual UText *inputText() const;
/**
* Returns the input string being matched, either by copying it into the provided
* UText parameter or by returning a shallow clone of the live input. Note that copying
* the entire input may cause significant performance and memory issues.
* @param dest The UText into which the input should be copied, or NULL to create a new UText
* @return dest if non-NULL, a shallow copy of the input text otherwise
*
* @internal ICU 4.4 technology preview
*/
virtual UText *getInput(UText *dest) const;
/** Sets the limits of this matcher's region.
@ -838,6 +1137,7 @@ public:
*/
virtual UBool hasAnchoringBounds() const;
/**
* Set whether this matcher is using Anchoring Bounds for its region.
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
@ -852,6 +1152,7 @@ public:
*/
virtual RegexMatcher &useAnchoringBounds(UBool b);
/**
* Return TRUE if the most recent matching operation touched the
* end of the text being processed. In this case, additional input text could
@ -878,9 +1179,6 @@ public:
virtual UBool requireEnd() const;
/**
* Returns the pattern that is interpreted by this matcher.
* @return the RegexPattern for this RegexMatcher
@ -908,6 +1206,29 @@ public:
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input string
* looking for matches of the pattern. Input that is not part of any
* match is left unchanged; each match is replaced in the result by the
* replacement string. The replacement string may contain references to
* capture groups.
*
* @param replacement a string containing the replacement text.
* @param dest a mutable UText in which the results are placed.
* If NULL, a new UText will be created (which may not be mutable).
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
@ -929,7 +1250,35 @@ public:
* @stable ICU 2.4
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* <p>This function first resets this RegexMatcher. It then scans the input string
* looking for a match of the pattern. Input that is not part
* of the match is appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured groups.</p>
*
* <p>The state of the matcher (the position at which a subsequent find()
* would begin) after completing a replaceFirst() is not specified. The
* RegexMatcher should be reset before doing additional find() operations.</p>
*
* @param replacement a string containing the replacement text.
* @param dest a mutable UText in which the results are placed.
* If NULL, a new UText will be created (which may not be mutable).
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
/**
* Implements a replace operation intended to be used as part of an
* incremental find-and-replace.
@ -959,6 +1308,37 @@ public:
*/
virtual RegexMatcher &appendReplacement(UnicodeString &dest,
const UnicodeString &replacement, UErrorCode &status);
/**
* Implements a replace operation intended to be used as part of an
* incremental find-and-replace.
*
* <p>The input string, starting from the end of the previous replacement and ending at
* the start of the current match, is appended to the destination string. Then the
* replacement string is appended to the output string,
* including handling any substitutions of captured text.</p>
*
* <p>For simple, prepackaged, non-incremental find-and-replace
* operations, see replaceFirst() or replaceAll().</p>
*
* @param dest A mutable UText to which the results of the find-and-replace are appended.
* Must not be NULL.
* @param replacement A UText that provides the text to be substituted for
* the input text that matched the regexp pattern. The replacement
* text may contain references to captured text from the input.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
* if the replacement text specifies a capture group that
* does not exist in the pattern.
*
* @return this RegexMatcher
*
* @internal ICU 4.4 technology preview
*/
virtual RegexMatcher &appendReplacement(UText *dest,
UText *replacement, UErrorCode &status);
/**
@ -974,13 +1354,26 @@ public:
virtual UnicodeString &appendTail(UnicodeString &dest);
/**
* As the final step in a find-and-replace operation, append the remainder
* of the input string, starting at the position following the last appendReplacement(),
* to the destination string. <code>appendTail()</code> is intended to be invoked after one
* or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
*
* @param dest A mutable UText to which the results of the find-and-replace are appended.
* Must not be NULL.
* @return the destination string.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *appendTail(UText *dest);
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object). This matcher
@ -1004,6 +1397,35 @@ public:
int32_t destCapacity,
UErrorCode &status);
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object). This matcher
* will be reset to this input string.
* @param dest An array of mutable UText structs to receive the results of the split.
* If a field is NULL, a new UText is allocated to contain the results for
* that field. This new UText is not guaranteed to be mutable.
* @param destCapacity The number of elements in the destination array.
* If the number of fields found is less than destCapacity, the
* extra strings in the destination array are not altered.
* If the number of destination strings is less than the number
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
*/
virtual int32_t split(UText *input,
UText *dest[],
int32_t destCapacity,
UErrorCode &status);
/**
* Set a processing time limit for match operations with this Matcher.
*
@ -1086,7 +1508,6 @@ public:
UErrorCode &status);
/**
* Get the callback function for this URegularExpression.
*
@ -1132,7 +1553,7 @@ private:
RegexMatcher(const RegexMatcher &other);
RegexMatcher &operator =(const RegexMatcher &rhs);
void init(UErrorCode &status); // Common initialization
void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2.
void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
friend class RegexPattern;
friend class RegexCImpl;
@ -1145,34 +1566,43 @@ private:
// MatchAt This is the internal interface to the match engine itself.
// Match status comes back in matcher member variables.
//
void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
REStackFrame *resetStack();
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
void IncrementTime(UErrorCode &status);
int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
UBool findUsingChunk();
void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
UBool isChunkWordBoundary(int32_t pos);
const RegexPattern *fPattern;
RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
// should delete it when through.
const UnicodeString *fInput; // The text being matched. Is never NULL.
const UnicodeString *fInput; // The string being matched. Only used for input()
UText *fInputText; // The text being matched. Is never NULL.
UText *fAltInputText; // A shallow copy of the text being matched.
// Only created if the pattern contains backreferences.
int64_t fInputLength; // Full length of the input text.
int32_t fFrameSize; // The size of a frame in the backtrack stack.
int32_t fRegionStart; // Start of the input region, default = 0.
int32_t fRegionLimit; // End of input region, default to input.length.
int64_t fRegionStart; // Start of the input region, default = 0.
int64_t fRegionLimit; // End of input region, default to input.length.
int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
int32_t fAnchorLimit; // See useAnchoringBounds
int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
int64_t fAnchorLimit; // See useAnchoringBounds
int32_t fLookStart; // Region bounds for look-ahead/behind and
int32_t fLookLimit; // and other boundary tests. See
int64_t fLookStart; // Region bounds for look-ahead/behind and
int64_t fLookLimit; // and other boundary tests. See
// useTransparentBounds
int32_t fActiveStart; // Currently active bounds for matching.
int32_t fActiveLimit; // Usually is the same as region, but
int64_t fActiveStart; // Currently active bounds for matching.
int64_t fActiveLimit; // Usually is the same as region, but
// is changed to fLookStart/Limit when
// entering look around regions.
@ -1180,13 +1610,13 @@ private:
UBool fAnchoringBounds; // True if using anchoring bounds.
UBool fMatch; // True if the last attempted match was successful.
int32_t fMatchStart; // Position of the start of the most recent match
int32_t fMatchEnd; // First position after the end of the most recent match
int64_t fMatchStart; // Position of the start of the most recent match
int64_t fMatchEnd; // First position after the end of the most recent match
// Zero if no previous match, even when a region
// is active.
int32_t fLastMatchEnd; // First position after the end of the previous match,
int64_t fLastMatchEnd; // First position after the end of the previous match,
// or -1 if there was no previous match.
int32_t fAppendPosition; // First position after the end of the previous
int64_t fAppendPosition; // First position after the end of the previous
// appendReplacement(). As described by the
// JavaDoc for Java Matcher, where it is called
// "append position"
@ -1218,6 +1648,8 @@ private:
// NULL if there is no callback.
const void *fCallbackContext; // User Context ptr for callback function.
UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
UBool fTraceDebug; // Set true for debug tracing of match engine.
UErrorCode fDeferredStatus; // Save error state that cannot be immediately

View File

@ -3,7 +3,7 @@
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: regex.h
* file name: uregex.h
* encoding: US-ASCII
* indentation:4
*
@ -23,6 +23,7 @@
#ifndef UREGEX_H
#define UREGEX_H
#include "unicode/utext.h"
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -112,6 +113,7 @@ typedef enum URegexpFlag{
* string form into an internal representation using the specified match mode flags.
* The resulting regular expression handle can then be used to perform various
* matching operations.
*
*
* @param pattern The Regular Expression pattern to be compiled.
* @param patternLength The length of the pattern, or -1 if the pattern is
@ -134,7 +136,36 @@ uregex_open( const UChar *pattern,
uint32_t flags,
UParseError *pe,
UErrorCode *status);
/**
* Open (compile) an ICU regular expression. Compiles the regular expression in
* string form into an internal representation using the specified match mode flags.
* The resulting regular expression handle can then be used to perform various
* matching operations.
* <p>
* The contents of the pattern UText will be extracted and saved. Ownership of the
* UText struct itself remains with the caller. This is to match the behavior of
* uregex_open().
*
* @param pattern The Regular Expression pattern to be compiled.
* @param flags Flags that alter the default matching behavior for
* the regular expression, UREGEX_CASE_INSENSITIVE, for
* example. For default behavior, set this parameter to zero.
* See <code>enum URegexpFlag</code>. All desired flags
* are bitwise-ORed together.
* @param pe Receives the position (line and column numbers) of any syntax
* error within the source regular expression string. If this
* information is not wanted, pass NULL for this parameter.
* @param status Receives error detected by this function.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL URegularExpression * U_EXPORT2
uregex_openUText(UText *pattern,
uint32_t flags,
UParseError *pe,
UErrorCode *status);
/**
* Open (compile) an ICU regular expression. The resulting regular expression
* handle can then be used to perform various matching operations.
@ -219,7 +250,8 @@ U_STABLE URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *regexp, UErrorCode *status);
/**
* Return a pointer to the source form of the pattern for this regular expression.
* Returns a pointer to the source form of the pattern for this regular expression.
* This function will work even if the pattern was originally specified as a UText.
*
* @param regexp The compiled regular expression.
* @param patLength This output parameter will be set to the length of the
@ -235,9 +267,24 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status);
* @stable ICU 3.0
*/
U_STABLE const UChar * U_EXPORT2
uregex_pattern(const URegularExpression *regexp,
int32_t *patLength,
UErrorCode *status);
uregex_pattern(const URegularExpression *regexp,
int32_t *patLength,
UErrorCode *status);
/**
* Returns the source text of the pattern for this regular expression.
* This function will work even if the pattern was originally specified as a UChar string.
*
* @param regexp The compiled regular expression.
* @param status Receives errors detected by this function.
* @return the pattern text. The storage for the text is owned by the regular expression
* object, and must not be altered or deleted.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp,
UErrorCode *status);
/**
@ -279,10 +326,36 @@ uregex_setText(URegularExpression *regexp,
int32_t textLength,
UErrorCode *status);
/**
* Set the subject text string upon which the regular expression will look for matches.
* This function may be called any number of times, allowing the regular
* expression pattern to be applied to different strings.
* <p>
* Regular expression matching operations work directly on the application's
* string data; only a shallow clone is made. The subject string data must not be
* altered after calling this function until after all regular expression
* operations involving this string data are completed.
*
* @param regexp The compiled regular expression.
* @param text The subject text string.
* @param status Receives errors detected by this function.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL void U_EXPORT2
uregex_setUText(URegularExpression *regexp,
UText *text,
UErrorCode *status);
/**
* Get the subject text that is currently associated with this
* regular expression object. This simply returns whatever string
* pointer was previously supplied via uregex_setText().
* regular expression object. If the input was supplied using uregex_setText(),
* that pointer will be returned. Otherwise, the characters in the input will
* be extracted to a buffer and returned. In either case, ownership remains
* with the regular expression object.
*
* This function will work even if the input was originally specified as a UText.
*
* @param regexp The compiled regular expression.
* @param textLength The length of the string is returned in this output parameter.
@ -291,7 +364,7 @@ uregex_setText(URegularExpression *regexp,
* the text is known in advance to be a NUL terminated
* string.
* @param status Receives errors detected by this function.
* @return Poiner to the subject text string currently associated with
* @return Pointer to the subject text string currently associated with
* this regular expression.
* @stable ICU 3.0
*/
@ -299,6 +372,28 @@ U_STABLE const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp,
int32_t *textLength,
UErrorCode *status);
/**
* Get the subject text that is currently associated with this
* regular expression object.
*
* This function will work even if the input was originally specified as a UChar string.
*
* @param regexp The compiled regular expression.
* @param dest A mutable UText in which to store the current input.
* If NULL, a new UText will be created as an immutable shallow clone
* of the actual input string.
* @param status Receives errors detected by this function.
* @return The subject text currently associated with this regular expression.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp,
UText *dest,
UErrorCode *status);
/**
* Attempts to match the input string against the pattern.
@ -428,6 +523,29 @@ uregex_group(URegularExpression *regexp,
int32_t destCapacity,
UErrorCode *status);
/** Extract the string for the specified matching expression or subexpression.
* Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses.
*
* @param regexp The compiled regular expression.
* @param groupNum The capture group to extract. Group 0 is the complete
* match. The value of this parameter must be
* less than or equal to the number of capture groups in
* the pattern.
* @param dest Mutable UText to receive the matching string data.
* If NULL, a new UText will be created (which may not be mutable).
* @param status A reference to a UErrorCode to receive any errors.
* @return The matching string data. If a pre-allocated UText was provided,
* it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp,
int32_t groupNum,
UText *dest,
UErrorCode *status);
/**
* Returns the index in the input string of the start of the text matched by the
@ -676,6 +794,32 @@ uregex_replaceAll(URegularExpression *regexp,
int32_t destCapacity,
UErrorCode *status);
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method scans the input string looking for matches of the pattern.
* Input that is not part of any match is copied unchanged to the
* destination buffer. Matched regions are replaced in the output
* buffer by the replacement string. The replacement string may contain
* references to capture groups; these take the form of $1, $2, etc.
*
* @param regexp The compiled regular expression.
* @param replacement A string containing the replacement text.
* @param dest A mutable UText that will receive the result.
* If NULL, a new UText will be created (which may not be mutable).
* @param status A reference to a UErrorCode to receive any errors.
* @return A UText containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression *regexp,
UText *replacement,
UText *dest,
UErrorCode *status);
/**
* Replaces the first substring of the input that matches the pattern
@ -709,6 +853,33 @@ uregex_replaceFirst(URegularExpression *regexp,
int32_t destCapacity,
UErrorCode *status);
/**
* Replaces the first substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace operation.
*
* This method scans the input string looking for a match of the pattern.
* All input that is not part of the match is copied unchanged to the
* destination buffer. The matched region is replaced in the output
* buffer by the replacement string. The replacement string may contain
* references to capture groups; these take the form of $1, $2, etc.
*
* @param regexp The compiled regular expression.
* @param replacement A string containing the replacement text.
* @param dest A mutable UText that will receive the result.
* If NULL, a new UText will be created (which may not be mutable).
* @param status A reference to a UErrorCode to receive any errors.
* @return A UText containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression *regexp,
UText *replacement,
UText *dest,
UErrorCode *status);
/**
* Implements a replace operation intended to be used as part of an
@ -758,11 +929,40 @@ uregex_replaceFirst(URegularExpression *regexp,
*/
U_STABLE int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression *regexp,
const UChar *replacementText,
int32_t replacementLength,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
const UChar *replacementText,
int32_t replacementLength,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
/**
* Implements a replace operation intended to be used as part of an
* incremental find-and-replace.
*
* <p>The input string, starting from the end of the previous match and ending at
* the start of the current match, is appended to the destination string. Then the
* replacement string is appended to the output string,
* including handling any substitutions of captured text.</p>
*
* <p>For simple, prepackaged, non-incremental find-and-replace
* operations, see replaceFirst() or replaceAll().</p>
*
* @param regexp The regular expression object.
* @param replacementText The string that will replace the matched portion of the
* input string as it is copied to the destination buffer.
* The replacement text may contain references ($1, for
* example) to capture groups from the match.
* @param dest A mutable UText that will receive the result. Must not be NULL.
* @param status A reference to a UErrorCode to receive any errors.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL void U_EXPORT2
uregex_appendReplacementUText(URegularExpression *regexp,
UText *replacementText,
UText *dest,
UErrorCode *status);
/**
@ -794,7 +994,27 @@ uregex_appendTail(URegularExpression *regexp,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
/**
* As the final step in a find-and-replace operation, append the remainder
* of the input string, starting at the position following the last match,
* to the destination string. <code>uregex_appendTailUText()</code> is intended
* to be invoked after one or more invocations of the
* <code>uregex_appendReplacementUText()</code> function.
*
* @param regexp The regular expression object. This is needed to
* obtain the input string along with the position
* of the last match within it.
* @param dest A mutable UText that will receive the result. Must not be NULL.
* @param status A reference to a UErrorCode to receive any errors.
* @return The destination UText.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp,
UText *dest);
@ -808,6 +1028,22 @@ uregex_appendTail(URegularExpression *regexp,
* buffer, and NUL terminated. The position of each field within
* the destination buffer is returned in the destFields array.
*
* Note: another choice for the design of this function would be to not
* copy the resulting fields at all, but to return indexes and
* lengths within the source text.
* Advantages would be
* o Faster. No Copying.
* o Nothing extra needed when field data may contain embedded NUL chars.
* o Less memory needed if working on large data.
* Disadvantages
* o Less consistent with C++ split, which copies into an
* array of UnicodeStrings.
* o No NUL termination, extracted fields would be less convenient
* to use in most cases.
* o Possible problems in the future, when support for Unicode Normalization
* could cause the fields to not correspond exactly to
* a range of the source text.
*
* @param regexp The compiled regular expression.
* @param destBuf A (UChar *) buffer to receive the fields that
* are extracted from the input string. These
@ -846,6 +1082,39 @@ uregex_split( URegularExpression *regexp,
UErrorCode *status);
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
* The behavior of this function is not very closely aligned with uregex_split();
* instead, it is based on (and implemented directly on top of) the C++ split method.
*
* @param regexp The compiled regular expression.
* @param destFields An array of mutable UText structs to receive the results of the split.
* If a field is NULL, a new UText is allocated to contain the results for
* that field. This new UText is not guaranteed to be mutable.
* @param destFieldsCapacity The number of elements in the destination array.
* If the number of fields found is less than destCapacity, the
* extra strings in the destination array are not altered.
* If the number of destination strings is less than the number
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* This behavior mimics that of Perl. It is not an error condition, and no
* error status is returned when all destField positions are used.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
*/
U_INTERNAL int32_t U_EXPORT2
uregex_splitUText(URegularExpression *regexp,
UText *destFields[],
int32_t destFieldsCapacity,
UErrorCode *status);
/**

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2004-2009, International Business Machines Corporation and
* Copyright (c) 2004-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -26,6 +26,7 @@
#include "unicode/uloc.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "cintltst.h"
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
@ -86,11 +87,34 @@ static void test_assert_string(const char *expected, const UChar *actual, UBool
#define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__)
/*
 * Compare the contents of a UText against an expected string and log a test
 * failure, followed by a dump of the actual text, when they differ.
 *
 * expected   NUL-terminated expected string; opened here as a UTF-8 UText.
 * actual     UText to check. Its native index is reset to 0 before comparing.
 * file, line Location of the calling assertion, reported in the failure message.
 */
static void test_assert_utext(const char *expected, UText *actual, const char *file, int line) {
    UErrorCode  status = U_ZERO_ERROR;
    UText       expectedText = UTEXT_INITIALIZER;

    utext_openUTF8(&expectedText, expected, -1, &status);
    if (U_FAILURE(status)) {
        /* Without a valid expected text the comparison below would be meaningless. */
        log_err("Failure at file %s, line %d, could not open expected text \"%s\", error is \"%s\"\n",
                file, line, expected, u_errorName(status));
        return;
    }

    utext_setNativeIndex(actual, 0);
    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
        UChar32 c;
        log_err("Failure at file %s, line %d, expected \"%s\", got \"", file, line, expected);
        /* Dump the actual text: characters in the range tested below as-is, everything else in hex. */
        c = utext_next32From(actual, 0);
        while (c != U_SENTINEL) {
            if (0x20<c && c <0x7e) {
                log_err("%c", c);
            } else {
                log_err("%#x", c);
            }
            c = UTEXT_NEXT32(actual);
        }
        log_err("\"\n");
    }

    /* Release the UText opened above; it was previously left open on every call. */
    utext_close(&expectedText);
}
#define TEST_ASSERT_UTEXT(expected, actual) test_assert_utext(expected, actual, __FILE__, __LINE__)
static void TestRegexCAPI(void);
static void TestBug4315(void);
static void TestUTextAPI(void);
void addURegexTest(TestNode** root);
@ -98,6 +122,7 @@ void addURegexTest(TestNode** root)
{
/* Add each regex C API test function to the test tree at root, under a "regex/..." path. */
addTest(root, &TestRegexCAPI, "regex/TestRegexCAPI");
addTest(root, &TestBug4315, "regex/TestBug4315");
addTest(root, &TestUTextAPI, "regex/TestUTextAPI");
}
/*
@ -1319,4 +1344,697 @@ static void TestBug4315(void) {
uregex_close(theRegEx);
}
/*
 * Close the first count entries of fields[] and reset those slots to NULL.
 * Used after uregex_splitUText() was called with NULL slots, which it fills
 * with newly allocated UTexts.
 */
static void closeSplitFields(UText *fields[], int32_t count) {
    int32_t i;
    for (i = 0; i < count; i++) {
        utext_close(fields[i]);
        fields[i] = NULL;
    }
}

/*
 * Exercise the UText-based uregex C API: open/clone, pattern access, setting and
 * getting the subject text, matching, finding, groups, replacement and splitting.
 * Based on TestRegexCAPI()
 */
static void TestUTextAPI(void) {
    UErrorCode           status = U_ZERO_ERROR;
    URegularExpression  *re;
    UText                patternText = UTEXT_INITIALIZER;
    UChar                pat[200];

    /* Minimalist open/close */
    utext_openUTF8(&patternText, "abc*", -1, &status);
    re = uregex_openUText(&patternText, 0, 0, &status);
    if (U_FAILURE(status)) {
        log_err("Failed to open regular expression, line %d, error is \"%s\"\n", __LINE__, u_errorName(status));
        utext_close(&patternText);
        return;
    }
    uregex_close(re);

    /* Open with all flag values set */
    status = U_ZERO_ERROR;
    re = uregex_openUText(&patternText,
        UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD,
        0, &status);
    TEST_ASSERT_SUCCESS(status);
    uregex_close(re);

    /* Open with an invalid flag */
    status = U_ZERO_ERROR;
    re = uregex_openUText(&patternText, 0x40000000, 0, &status);
    TEST_ASSERT(status == U_REGEX_INVALID_FLAG);
    uregex_close(re);

    /* open with an invalid parameter */
    status = U_ZERO_ERROR;
    re = uregex_openUText(NULL,
        UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status);
    TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR && re == NULL);

    /*
     *  clone
     */
    {
        URegularExpression *clone1;
        URegularExpression *clone2;
        URegularExpression *clone3;
        UChar  testString1[30];
        UChar  testString2[30];
        UBool  result;

        status = U_ZERO_ERROR;
        re = uregex_openUText(&patternText, 0, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        clone1 = uregex_clone(re, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(clone1 != NULL);

        status = U_ZERO_ERROR;
        clone2 = uregex_clone(re, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(clone2 != NULL);
        uregex_close(re);

        status = U_ZERO_ERROR;
        clone3 = uregex_clone(clone2, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(clone3 != NULL);

        /* Capacities are those of the destination buffers themselves (30 UChars each). */
        u_uastrncpy(testString1, "abcccd", sizeof(testString1)/2);
        u_uastrncpy(testString2, "xxxabcccd", sizeof(testString2)/2);

        status = U_ZERO_ERROR;
        uregex_setText(clone1, testString1, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        result = uregex_lookingAt(clone1, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(result==TRUE);

        status = U_ZERO_ERROR;
        uregex_setText(clone2, testString2, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        result = uregex_lookingAt(clone2, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(result==FALSE);
        result = uregex_find(clone2, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(result==TRUE);

        uregex_close(clone1);
        uregex_close(clone2);
        uregex_close(clone3);
    }

    /*
     *  pattern() and patternText()
     */
    {
        const UChar  *resultPat;
        int32_t       resultLen;
        UText        *resultText;

        u_uastrncpy(pat, "hello", sizeof(pat)/2); /* for comparison */
        status = U_ZERO_ERROR;
        utext_openUTF8(&patternText, "hello", -1, &status);
        re = uregex_open(pat, -1, 0, NULL, &status);
        resultPat = uregex_pattern(re, &resultLen, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS above should change too... */
        if (U_SUCCESS(status)) {
            TEST_ASSERT(resultLen == -1);
            TEST_ASSERT(u_strcmp(resultPat, pat) == 0);
        }

        resultText = uregex_patternUText(re, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("hello", resultText);

        uregex_close(re);

        status = U_ZERO_ERROR;
        re = uregex_open(pat, 3, 0, NULL, &status);
        resultPat = uregex_pattern(re, &resultLen, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS above should change too... */
        if (U_SUCCESS(status)) {
            TEST_ASSERT(resultLen == 3);
            TEST_ASSERT(u_strncmp(resultPat, pat, 3) == 0);
            TEST_ASSERT(u_strlen(resultPat) == 3);
        }

        resultText = uregex_patternUText(re, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("hel", resultText);

        uregex_close(re);
    }

    /*
     *  setUText() and lookingAt()
     */
    {
        UText  text1 = UTEXT_INITIALIZER;
        UText  text2 = UTEXT_INITIALIZER;
        UBool  result;

        status = U_ZERO_ERROR;
        utext_openUTF8(&text1, "abcccd", -1, &status);
        utext_openUTF8(&text2, "abcccxd", -1, &status);

        utext_openUTF8(&patternText, "abc*d", -1, &status);
        re = uregex_openUText(&patternText, 0, NULL, &status);
        TEST_ASSERT_SUCCESS(status);

        /* Operation before doing a setText should fail... */
        status = U_ZERO_ERROR;
        uregex_lookingAt(re, 0, &status);
        TEST_ASSERT( status== U_REGEX_INVALID_STATE);

        status = U_ZERO_ERROR;
        uregex_setUText(re, &text1, &status);
        result = uregex_lookingAt(re, 0, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT_SUCCESS(status);

        status = U_ZERO_ERROR;
        uregex_setUText(re, &text2, &status);
        result = uregex_lookingAt(re, 0, &status);
        TEST_ASSERT(result == FALSE);
        TEST_ASSERT_SUCCESS(status);

        status = U_ZERO_ERROR;
        uregex_setUText(re, &text1, &status);
        result = uregex_lookingAt(re, 0, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT_SUCCESS(status);

        uregex_close(re);
        utext_close(&text1);
        utext_close(&text2);
    }

    /*
     *  getText() and getUText()
     */
    {
        UText         text1 = UTEXT_INITIALIZER;
        UText         text2 = UTEXT_INITIALIZER;
        UChar         text2Chars[20];
        UText        *resultText;
        const UChar  *result;
        int32_t       textLength;

        status = U_ZERO_ERROR;
        utext_openUTF8(&text1, "abcccd", -1, &status);
        /* Capacity must be that of the UChar buffer, not of the UText struct. */
        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2Chars)/2);
        utext_openUChars(&text2, text2Chars, -1, &status);

        utext_openUTF8(&patternText, "abc*d", -1, &status);
        re = uregex_openUText(&patternText, 0, NULL, &status);

        /* First set a UText */
        uregex_setUText(re, &text1, &status);
        resultText = uregex_getUText(re, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(resultText != &text1);
        utext_setNativeIndex(resultText, 0);
        utext_setNativeIndex(&text1, 0);
        TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
        utext_close(resultText);

        result = uregex_getText(re, &textLength, &status); /* flattens UText into buffer */
        TEST_ASSERT(textLength == -1 || textLength == 6);
        resultText = uregex_getUText(re, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(resultText != &text1);
        utext_setNativeIndex(resultText, 0);
        utext_setNativeIndex(&text1, 0);
        TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
        utext_close(resultText);

        /* Then set a UChar * */
        uregex_setText(re, text2Chars, 7, &status);
        resultText = uregex_getUText(re, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        utext_setNativeIndex(resultText, 0);
        utext_setNativeIndex(&text2, 0);
        TEST_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
        utext_close(resultText);
        result = uregex_getText(re, &textLength, &status);
        TEST_ASSERT(textLength == 7);

        uregex_close(re);
        utext_close(&text1);
        utext_close(&text2);
    }

    /*
     *  matches()
     */
    {
        UText  text1 = UTEXT_INITIALIZER;
        UBool  result;
        UText  nullText = UTEXT_INITIALIZER;

        status = U_ZERO_ERROR;
        utext_openUTF8(&text1, "abcccde", -1, &status);
        utext_openUTF8(&patternText, "abc*d", -1, &status);
        re = uregex_openUText(&patternText, 0, NULL, &status);

        uregex_setUText(re, &text1, &status);
        result = uregex_matches(re, 0, &status);
        TEST_ASSERT(result == FALSE);
        TEST_ASSERT_SUCCESS(status);
        uregex_close(re);

        status = U_ZERO_ERROR;
        re = uregex_openC(".?", 0, NULL, &status);
        uregex_setUText(re, &text1, &status);
        result = uregex_matches(re, 7, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT_SUCCESS(status);

        status = U_ZERO_ERROR;
        utext_openUTF8(&nullText, "", -1, &status);
        uregex_setUText(re, &nullText, &status);
        TEST_ASSERT_SUCCESS(status);
        result = uregex_matches(re, 0, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT_SUCCESS(status);

        uregex_close(re);
        utext_close(&text1);
        utext_close(&nullText);
    }

    /*
     *  lookingAt()    Used in setText test.
     */

    /*
     *  find(), findNext, start, end, reset
     */
    {
        UChar  text1[50];
        UBool  result;

        u_uastrncpy(text1, "012rx5rx890rxrx...", sizeof(text1)/2);
        status = U_ZERO_ERROR;
        re = uregex_openC("rx", 0, NULL, &status);

        uregex_setText(re, text1, -1, &status);
        result = uregex_find(re, 0, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT(uregex_start(re, 0, &status) == 3);
        TEST_ASSERT(uregex_end(re, 0, &status) == 5);
        TEST_ASSERT_SUCCESS(status);

        result = uregex_find(re, 9, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT(uregex_start(re, 0, &status) == 11);
        TEST_ASSERT(uregex_end(re, 0, &status) == 13);
        TEST_ASSERT_SUCCESS(status);

        result = uregex_find(re, 14, &status);
        TEST_ASSERT(result == FALSE);
        TEST_ASSERT_SUCCESS(status);

        status = U_ZERO_ERROR;
        uregex_reset(re, 0, &status);

        result = uregex_findNext(re, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT(uregex_start(re, 0, &status) == 3);
        TEST_ASSERT(uregex_end(re, 0, &status) == 5);
        TEST_ASSERT_SUCCESS(status);

        result = uregex_findNext(re, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT(uregex_start(re, 0, &status) == 6);
        TEST_ASSERT(uregex_end(re, 0, &status) == 8);
        TEST_ASSERT_SUCCESS(status);

        status = U_ZERO_ERROR;
        uregex_reset(re, 12, &status);

        result = uregex_findNext(re, &status);
        TEST_ASSERT(result == TRUE);
        TEST_ASSERT(uregex_start(re, 0, &status) == 13);
        TEST_ASSERT(uregex_end(re, 0, &status) == 15);
        TEST_ASSERT_SUCCESS(status);

        result = uregex_findNext(re, &status);
        TEST_ASSERT(result == FALSE);
        TEST_ASSERT_SUCCESS(status);

        uregex_close(re);
    }

    /*
     *  group()
     */
    {
        UChar   text1[80];
        UText  *actual;
        UBool   result;

        u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);

        status = U_ZERO_ERROR;
        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
        TEST_ASSERT_SUCCESS(status);

        uregex_setText(re, text1, -1, &status);
        result = uregex_find(re, 0, &status);
        TEST_ASSERT(result==TRUE);

        /*  Capture Group 0, the full match.  Should succeed.  */
        status = U_ZERO_ERROR;
        actual = uregex_groupUText(re, 0, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("abc interior def", actual);
        utext_close(actual);

        /*  Capture group #1.  Should succeed. */
        status = U_ZERO_ERROR;
        actual = uregex_groupUText(re, 1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT(" interior ", actual);
        utext_close(actual);

        /*  Capture group out of range.  Error. */
        status = U_ZERO_ERROR;
        actual = uregex_groupUText(re, 2, NULL, &status);
        TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
        TEST_ASSERT(utext_nativeLength(actual) == 0);
        utext_close(actual);

        uregex_close(re);
    }

    /*
     *  replaceFirst()
     */
    {
        UChar   text1[80];
        UChar   text2[80];
        UText   replText = UTEXT_INITIALIZER;
        UText  *result;

        status = U_ZERO_ERROR;
        u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
        u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
        utext_openUTF8(&replText, "<$1>", -1, &status);

        re = uregex_openC("x(.*?)x", 0, NULL, &status);
        TEST_ASSERT_SUCCESS(status);

        /*  Normal case, with match */
        uregex_setText(re, text1, -1, &status);
        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
        utext_close(result);

        /* No match.  Text should copy to output with no changes.  */
        uregex_setText(re, text2, -1, &status);
        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("No match here.", result);
        utext_close(result);

        /* Unicode escapes */
        uregex_setText(re, text1, -1, &status);
        utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
        utext_close(result);

        uregex_close(re);
        utext_close(&replText);
    }

    /*
     *  replaceAll()
     */
    {
        UChar   text1[80];
        UChar   text2[80];
        UText   replText = UTEXT_INITIALIZER;
        UText  *result;

        status = U_ZERO_ERROR;
        u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
        u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
        utext_openUTF8(&replText, "<$1>", -1, &status);

        re = uregex_openC("x(.*?)x", 0, NULL, &status);
        TEST_ASSERT_SUCCESS(status);

        /*  Normal case, with match */
        uregex_setText(re, text1, -1, &status);
        result = uregex_replaceAllUText(re, &replText, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
        utext_close(result);

        /* No match.  Text should copy to output with no changes.  */
        uregex_setText(re, text2, -1, &status);
        result = uregex_replaceAllUText(re, &replText, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_UTEXT("No match here.", result);
        utext_close(result);

        uregex_close(re);
        utext_close(&replText);
    }

    /*
     *  appendReplacement()
     */
    {
        UChar    text[100];
        UChar    repl[100];
        UChar    buf[100];
        UChar   *bufPtr;
        int32_t  bufCap;

        status = U_ZERO_ERROR;
        re = uregex_openC(".*", 0, 0, &status);
        TEST_ASSERT_SUCCESS(status);

        u_uastrncpy(text, "whatever", sizeof(text)/2);
        u_uastrncpy(repl, "some other", sizeof(repl)/2);
        uregex_setText(re, text, -1, &status);

        /* match covers whole target string */
        uregex_find(re, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        bufPtr = buf;
        bufCap = sizeof(buf) / 2;
        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_STRING("some other", buf, TRUE);

        /* Match has \u \U escapes */
        uregex_find(re, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        bufPtr = buf;
        bufCap = sizeof(buf) / 2;
        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2);
        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);

        uregex_close(re);
    }

    /*
     *  appendReplacement(), appendTail() checked in replaceFirst(), replaceAll().
     */

    /*
     *  splitUText()
     */
    {
        UChar     textToSplit[80];
        UChar     text2[80];
        UText    *fields[10];
        int32_t   numFields;

        u_uastrncpy(textToSplit, "first : second:  third", sizeof(textToSplit)/2);
        u_uastrncpy(text2, "No match here.", sizeof(text2)/2);

        status = U_ZERO_ERROR;
        re = uregex_openC(":", 0, NULL, &status);

        /*  Simple split */
        uregex_setText(re, textToSplit, -1, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if (U_SUCCESS(status)) {
            memset(fields, 0, sizeof(fields));
            numFields = uregex_splitUText(re, fields, 10, &status);
            TEST_ASSERT_SUCCESS(status);

            /* The TEST_ASSERT_SUCCESS call above should change too... */
            if(U_SUCCESS(status)) {
                TEST_ASSERT(numFields == 3);
                TEST_ASSERT_UTEXT("first ",  fields[0]);
                TEST_ASSERT_UTEXT(" second", fields[1]);
                TEST_ASSERT_UTEXT("  third", fields[2]);
                TEST_ASSERT(fields[3] == NULL);
            }
            /* The NULL slots were filled with newly allocated UTexts; release them. */
            closeSplitFields(fields, numFields);
        }

        uregex_close(re);

        /*  Split with too few output strings available */
        status = U_ZERO_ERROR;
        re = uregex_openC(":", 0, NULL, &status);
        uregex_setText(re, textToSplit, -1, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            fields[0] = NULL;
            fields[1] = NULL;
            fields[2] = &patternText;   /* sentinel: must not be touched by the split */
            numFields = uregex_splitUText(re, fields, 2, &status);
            TEST_ASSERT_SUCCESS(status);

            /* The TEST_ASSERT_SUCCESS call above should change too... */
            if(U_SUCCESS(status)) {
                TEST_ASSERT(numFields == 2);
                TEST_ASSERT_UTEXT("first ", fields[0]);
                TEST_ASSERT_UTEXT(" second:  third", fields[1]);
                TEST_ASSERT(fields[2] == &patternText);
            }
            closeSplitFields(fields, numFields);
        }

        uregex_close(re);
    }

    /* splitUText(), part 2.  Patterns with capture groups.  The capture group text
     *                   comes out as additional fields.  */
    {
        UChar     textToSplit[80];
        UText    *fields[10];
        int32_t   numFields;

        u_uastrncpy(textToSplit, "first <tag-a> second<tag-b>  third", sizeof(textToSplit)/2);

        status = U_ZERO_ERROR;
        re = uregex_openC("<(.*?)>", 0, NULL, &status);

        uregex_setText(re, textToSplit, -1, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            memset(fields, 0, sizeof(fields));
            numFields = uregex_splitUText(re, fields, 10, &status);
            TEST_ASSERT_SUCCESS(status);

            /* The TEST_ASSERT_SUCCESS call above should change too... */
            if(U_SUCCESS(status)) {
                TEST_ASSERT(numFields == 5);
                TEST_ASSERT_UTEXT("first ",  fields[0]);
                TEST_ASSERT_UTEXT("tag-a",   fields[1]);
                TEST_ASSERT_UTEXT(" second", fields[2]);
                TEST_ASSERT_UTEXT("tag-b",   fields[3]);
                TEST_ASSERT_UTEXT("  third", fields[4]);
                TEST_ASSERT(fields[5] == NULL);
            }
            closeSplitFields(fields, numFields);
        }

        /*  Split with too few output strings available (2) */
        status = U_ZERO_ERROR;
        fields[0] = NULL;
        fields[1] = NULL;
        fields[2] = &patternText;
        numFields = uregex_splitUText(re, fields, 2, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            TEST_ASSERT(numFields == 2);
            TEST_ASSERT_UTEXT("first ", fields[0]);
            TEST_ASSERT_UTEXT(" second<tag-b>  third", fields[1]);
            TEST_ASSERT(fields[2] == &patternText);
        }
        closeSplitFields(fields, numFields);

        /*  Split with too few output strings available (3) */
        status = U_ZERO_ERROR;
        fields[0] = NULL;
        fields[1] = NULL;
        fields[2] = NULL;
        fields[3] = &patternText;
        numFields = uregex_splitUText(re, fields, 3, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            TEST_ASSERT(numFields == 3);
            TEST_ASSERT_UTEXT("first ", fields[0]);
            TEST_ASSERT_UTEXT("tag-a", fields[1]);
            TEST_ASSERT_UTEXT(" second<tag-b>  third", fields[2]);
            TEST_ASSERT(fields[3] == &patternText);
        }
        closeSplitFields(fields, numFields);

        /*  Split with just enough output strings available (5) */
        status = U_ZERO_ERROR;
        fields[0] = NULL;
        fields[1] = NULL;
        fields[2] = NULL;
        fields[3] = NULL;
        fields[4] = NULL;
        fields[5] = &patternText;
        numFields = uregex_splitUText(re, fields, 5, &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            TEST_ASSERT(numFields == 5);
            TEST_ASSERT_UTEXT("first ",  fields[0]);
            TEST_ASSERT_UTEXT("tag-a",   fields[1]);
            TEST_ASSERT_UTEXT(" second", fields[2]);
            TEST_ASSERT_UTEXT("tag-b",   fields[3]);
            TEST_ASSERT_UTEXT("  third", fields[4]);
            TEST_ASSERT(fields[5] == &patternText);
        }
        closeSplitFields(fields, numFields);

        /* Split, end of text is a field delimiter.   */
        status = U_ZERO_ERROR;
        uregex_setText(re, textToSplit, (int32_t)strlen("first <tag-a> second<tag-b>"), &status);
        TEST_ASSERT_SUCCESS(status);

        /* The TEST_ASSERT_SUCCESS call above should change too... */
        if(U_SUCCESS(status)) {
            memset(fields, 0, sizeof(fields));
            fields[9] = &patternText;
            numFields = uregex_splitUText(re, fields, 9, &status);
            TEST_ASSERT_SUCCESS(status);

            /* The TEST_ASSERT_SUCCESS call above should change too... */
            if(U_SUCCESS(status)) {
                TEST_ASSERT(numFields == 4);
                TEST_ASSERT_UTEXT("first ",  fields[0]);
                TEST_ASSERT_UTEXT("tag-a",   fields[1]);
                TEST_ASSERT_UTEXT(" second", fields[2]);
                TEST_ASSERT_UTEXT("tag-b",   fields[3]);
                TEST_ASSERT(fields[4] == NULL);
                TEST_ASSERT(fields[8] == NULL);
                TEST_ASSERT(fields[9] == &patternText);
            }
            closeSplitFields(fields, numFields);
        }

        uregex_close(re);
    }

    /* patternText was (re)opened throughout this test and is no longer needed. */
    utext_close(&patternText);
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2005-2009, International Business Machines Corporation and
* Copyright (c) 2005-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -58,6 +58,8 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
if (exec) Ticket5560(); break;
case 4: name = "Ticket6847";
if (exec) Ticket6847(); break;
case 5: name = "ComparisonTest";
if (exec) ComparisonTest(); break;
default: name = ""; break;
}
}
@ -836,6 +838,476 @@ void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCoun
}
//
// ComparisonTest() Check the string comparison functions. Based on UnicodeStringTest::TestCompare()
//
void UTextTest::ComparisonTest()
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString test1Str("this is a test");
UnicodeString test2Str("this is a test");
UnicodeString test3Str("this is a test of the emergency broadcast system");
UnicodeString test4Str("never say, \"this is a test\"!!");
UText test1 = UTEXT_INITIALIZER;
UText test2 = UTEXT_INITIALIZER;
UText test3 = UTEXT_INITIALIZER;
UText test4 = UTEXT_INITIALIZER;
UChar uniChars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
char chars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
UText uniCharText = UTEXT_INITIALIZER;
UText charText = UTEXT_INITIALIZER;
utext_openUnicodeString(&test1, &test1Str, &status);
utext_openUnicodeString(&test2, &test2Str, &status);
utext_openUnicodeString(&test3, &test3Str, &status);
utext_openUnicodeString(&test4, &test4Str, &status);
utext_openUChars(&uniCharText, uniChars, -1, &status);
utext_openUTF8(&charText, chars, -1, &status);
TEST_SUCCESS(status);
// test utext_compare(), simple
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, -1, &test2, -1) != 0) errln("utext_compare() failed, simple setup");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test1, -1, &test3, -1) >= 0) errln("utext_compare() failed, simple setup");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compare(&test1, -1, &test4, -1) <= 0) errln("utext_compare() failed, simple setup");
// test utext_compareNativeLimit(), simple
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, -1, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, simple setup");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test1, -1, &test3, -1) >= 0) errln("utext_compareNativeLimit() failed, simple setup");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compareNativeLimit(&test1, -1, &test4, -1) <= 0) errln("utext_compareNativeLimit() failed, simple setup");
// test utext_compare(), one explicit length
// utext_compare() starts at each UText's current iteration position and leaves
// it after the last character compared, so both operands are rewound (or
// positioned) explicitly before every case.
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test3, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test4, 12);
if (utext_compare(&test4, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length and offset");
// Rewind test2 -- the operand actually passed below -- rather than test1.
// Without this, test2 is still positioned where the previous comparison left
// it, and the case would compare against an exhausted text instead of the
// whole of test2.
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test3, 18, &test2, -1) <= 0) errln("utext_compare() failed, one explicit length");
// test utext_compareNativeLimit(), one explicit length
// Both operands are rewound (or positioned) explicitly before every case so
// that no case depends on where an earlier comparison stopped.
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test3, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test4, 12);
if (utext_compareNativeLimit(&test4, 26, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length and limit");
// Rewind test2 -- the operand actually passed below -- rather than test1,
// which this case never uses. Otherwise test2 would presumably still sit at
// the position reached by the previous comparison.
UTEXT_SETNATIVEINDEX(&test2, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test3, 18, &test2, -1) <= 0) errln("utext_compareNativeLimit() failed, one explicit length");
// test utext_compare(), UChar-based UText
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test2, -1, &uniCharText, -1) != 0) errln("utext_compare() failed, UChar-based UText");
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compare() failed, UChar-based UText");
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compare(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compare() failed, UChar-based UText");
// test utext_compareNativeLimit(), UChar-based UText
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test2, -1, &uniCharText, -1) != 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
UTEXT_SETNATIVEINDEX(&uniCharText, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compareNativeLimit(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
// test utext_compare(), UTF8-based UText
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test2, -1, &charText, -1) != 0) errln("utext_compare() failed, UTF8-based UText");
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test3, -1, &charText, -1) <= 0) errln("utext_compare() failed, UTF8-based UText");
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compare(&test4, -1, &charText, -1) >= 0) errln("utext_compare() failed, UTF8-based UText");
// test utext_compareNativeLimit(), UTF8-based UText
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test2, -1, &charText, -1) != 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test3, -1, &charText, -1) <= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
UTEXT_SETNATIVEINDEX(&charText, 0);
UTEXT_SETNATIVEINDEX(&test4, 0);
if (utext_compareNativeLimit(&test4, -1, &charText, -1) >= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
// test utext_compare(), length
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, -1, &test2, 4) != 0) errln("utext_compare() failed, one length");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, 5, &test2, 4) <= 0) errln("utext_compare() failed, both lengths");
// test utext_compareNativeLimit(), limit
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, -1, &test2, 4) != 0) errln("utext_compareNativeLimit() failed, one limit");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, 5, &test2, 4) <= 0) errln("utext_compareNativeLimit() failed, both limits");
// test utext_compare(), both explicit offsets and lengths
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, 14, &test2, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compare(&test1, 14, &test3, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test4, 12);
if (utext_compare(&test1, 14, &test4, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compare(&test1, 4, &test2, 4) >= 0) errln("utext_compare() failed, both explicit offsets and lengths");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test3, 22);
if (utext_compare(&test1, 4, &test3, 9) <= 0) errln("utext_compare() failed, both explicit offsets and lengths");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test4, 22);
if (utext_compare(&test1, 4, &test4, 4) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
// test utext_compareNativeLimit(), both explicit offsets and limits
// The second argument of each pair is a native limit (an end index), not a
// length, which is why these values differ from the utext_compare() cases
// that use the same start positions.
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, 14, &test2, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test3, 0);
if (utext_compareNativeLimit(&test1, 14, &test3, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
UTEXT_SETNATIVEINDEX(&test1, 0);
UTEXT_SETNATIVEINDEX(&test4, 12);
if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test2, 0);
if (utext_compareNativeLimit(&test1, 14, &test2, 4) >= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test3, 22);
if (utext_compareNativeLimit(&test1, 14, &test3, 31) <= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
UTEXT_SETNATIVEINDEX(&test1, 10);
UTEXT_SETNATIVEINDEX(&test4, 22);
if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
// This was the last use of the function-level UTexts: the remaining scoped
// sections open and close their own. Release them here so that anything the
// text providers attached to these stack UTexts is freed, matching the
// utext_close() calls made by the later sections.
utext_close(&test1);
utext_close(&test2);
utext_close(&test3);
utext_close(&test4);
utext_close(&uniCharText);
utext_close(&charText);
/* test caseCompare() */
/*
 * Exercises utext_caseCompare() and utext_caseCompareNativeLimit() on
 * UChar-based UTexts: default folding, folding with
 * U_FOLD_CASE_EXCLUDE_SPECIAL_I, a string that differs, and ranges that end
 * on or inside the folding of the sharp s (U+00df).
 *
 * Both operands are repositioned before every call. errln() is used
 * printf-style here and the messages print the result with %ld, so the
 * int32_t result is cast to long to match the conversion specifier.
 */
{
    static const UChar
        _mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0x130, 0x49, 0xfb03, 0xd93f, 0xdfff, 0 },
        _otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
        _otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69, 0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
        _different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130, 0x49, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };

    UText
        mixed = UTEXT_INITIALIZER,
        otherDefault = UTEXT_INITIALIZER,
        otherExcludeSpecialI = UTEXT_INITIALIZER,
        different = UTEXT_INITIALIZER;

    utext_openUChars(&mixed, _mixed, -1, &status);
    utext_openUChars(&otherDefault, _otherDefault, -1, &status);
    utext_openUChars(&otherExcludeSpecialI, _otherExcludeSpecialI, -1, &status);
    utext_openUChars(&different, _different, -1, &status);
    TEST_SUCCESS(status);

    int32_t result;

    /* test default options */
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherDefault, 0);
    result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherDefault, 0);
    result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }

    /* test excluding special I */
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
    result = utext_caseCompare(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
    result = utext_caseCompareNativeLimit(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }
    /* the default-folded counterpart must NOT match once special I handling is excluded */
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherDefault, 0);
    result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
    if (0 == result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&otherDefault, 0);
    result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
    if (0 == result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
    }

    /* test against different string */
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&different, 0);
    result = utext_caseCompare(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 >= result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 0);
    UTEXT_SETNATIVEINDEX(&different, 0);
    result = utext_caseCompareNativeLimit(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 >= result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
    }

    /* test caseCompare() - include the folded sharp s (U+00df) with different lengths */
    UTEXT_SETNATIVEINDEX(&mixed, 1);
    UTEXT_SETNATIVEINDEX(&different, 1);
    result = utext_caseCompare(&mixed, 4, &different, 5, U_FOLD_CASE_DEFAULT, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 1);
    UTEXT_SETNATIVEINDEX(&different, 1);
    result = utext_caseCompareNativeLimit(&mixed, 5, &different, 6, U_FOLD_CASE_DEFAULT, &status);
    if (0 != result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
    }

    /* test caseCompare() - stop in the middle of the sharp s (U+00df) */
    UTEXT_SETNATIVEINDEX(&mixed, 1);
    UTEXT_SETNATIVEINDEX(&different, 1);
    result = utext_caseCompare(&mixed, 4, &different, 4, U_FOLD_CASE_DEFAULT, &status);
    if (0 >= result || U_FAILURE(status)) {
        errln("error: utext_caseCompare (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
    }
    UTEXT_SETNATIVEINDEX(&mixed, 1);
    UTEXT_SETNATIVEINDEX(&different, 1);
    result = utext_caseCompareNativeLimit(&mixed, 5, &different, 5, U_FOLD_CASE_DEFAULT, &status);
    if (0 >= result || U_FAILURE(status)) {
        errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
    }

    /* release the UTexts opened by this section, as the sibling sections do */
    utext_close(&mixed);
    utext_close(&otherDefault);
    utext_close(&otherExcludeSpecialI);
    utext_close(&different);
}
/* test surrogates in comparison */
/*
 * "before" holds an unpaired lead surrogate followed by a surrogate pair,
 * "after" holds a surrogate pair in the same position; both are compared from
 * native index 1 and "before" is expected to order first.
 *
 * errln() is used printf-style here and the messages print the result with
 * %ld, so the int32_t result is cast to long to match the conversion
 * specifier.
 */
{
    static const UChar
        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x65, 0x00 },
        _after[] = { 0x65, 0xd800, 0xdc00, 0x65, 0x00 };

    UText
        before = UTEXT_INITIALIZER,
        after = UTEXT_INITIALIZER;

    utext_openUChars(&before, _before, -1, &status);
    utext_openUChars(&after, _after, -1, &status);
    TEST_SUCCESS(status);

    int32_t result;

    /* binary comparison, to end of text */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_compare(&before, -1, &after, -1);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_compare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    /* binary comparison, explicit lengths */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_compare(&before, 3, &after, 3);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_compare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    /* case-insensitive comparison, to end of text */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_caseCompare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    /* case-insensitive comparison, explicit lengths */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_caseCompare(&before, 3, &after, 3, U_FOLD_CASE_DEFAULT, &status);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_caseCompare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    utext_close(&before);
    utext_close(&after);
}
/* test surrogates at end of string */
/*
 * Same data as the in-string surrogate case but with the surrogate pair as
 * the final code point, so the comparison runs into the end of both texts.
 *
 * errln() is used printf-style here and the messages print the result with
 * %ld, so the int32_t result is cast to long to match the conversion
 * specifier.
 */
{
    static const UChar
        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x00 },
        _after[] = { 0x65, 0xd800, 0xdc00, 0x00 };

    UText
        before = UTEXT_INITIALIZER,
        after = UTEXT_INITIALIZER;

    utext_openUChars(&before, _before, -1, &status);
    utext_openUChars(&after, _after, -1, &status);
    TEST_SUCCESS(status);

    int32_t result;

    /* binary comparison */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_compare(&before, -1, &after, -1);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_compare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    /* case-insensitive comparison */
    UTEXT_SETNATIVEINDEX(&before, 1);
    UTEXT_SETNATIVEINDEX(&after, 1);
    result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
    if (0 <= result || U_FAILURE(status)) {
        errln("error: utext_caseCompare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
    }

    utext_close(&before);
    utext_close(&after);
}
/* test empty strings */
/*
 * Opens four empty texts -- a zero-length UChar string, a zero-length UTF-8
 * string, and a NULL pointer with length 0 for each provider -- and checks
 * that every pairing compares equal under all four comparison functions.
 * The UTexts are freshly opened and never advanced, so no repositioning is
 * done between calls.
 */
{
    UChar zero16 = 0;
    char zero8 = 0;
    UText emptyUChar = UTEXT_INITIALIZER;
    UText emptyUTF8 = UTEXT_INITIALIZER;
    UText nullUChar = UTEXT_INITIALIZER;
    UText nullUTF8 = UTEXT_INITIALIZER;

    utext_openUChars(&emptyUChar, &zero16, -1, &status);
    utext_openUTF8(&emptyUTF8, &zero8, -1, &status);
    utext_openUChars(&nullUChar, NULL, 0, &status);
    utext_openUTF8(&nullUTF8, NULL, 0, &status);
    /* report a failed open here rather than as a confusing comparison mismatch */
    TEST_SUCCESS(status);

    /* utext_compare() */
    if (utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
        errln("error: utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0");
    }
    if (utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0) {
        errln("error: utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0");
    }
    if (utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0");
    }
    if (utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0) {
        errln("error: utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0");
    }
    if (utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0");
    }
    if (utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0");
    }

    /* utext_compareNativeLimit() */
    if (utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
        errln("error: utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0");
    }
    if (utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0) {
        errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0");
    }
    if (utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0");
    }
    if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0) {
        errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0");
    }
    if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0");
    }
    if (utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0) {
        errln("error: utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0");
    }

    /* utext_caseCompare() */
    if (utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
    }
    if (utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
    }
    if (utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
    }

    /* utext_caseCompareNativeLimit() */
    if (utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
    }
    if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
    }
    if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
    }
    if (utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
        errln("error: utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
    }
    /* the case-insensitive calls report through status; make sure none of them set an error */
    TEST_SUCCESS(status);

    utext_close(&emptyUChar);
    utext_close(&emptyUTF8);
    utext_close(&nullUChar);
    utext_close(&nullUTF8);
}
}
//
// ErrorTest() Check various error and edge cases.

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2005-2009, International Business Machines Corporation and
* Copyright (c) 2005-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -33,6 +33,7 @@ public:
void FreezeTest();
void Ticket5560();
void Ticket6847();
void ComparisonTest();
private:
struct m { // Map between native indices & code points.