ICU-105 Regular Expressions, changes from code review
X-SVN-Rev: 10294
This commit is contained in:
parent
bf1f6b1213
commit
24bf088281
@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
||||
"U_REGEX_PROPERTY_SYNTAX",
|
||||
"U_REGEX_UNIMPLEMENTED",
|
||||
"U_REGEX_MISMATCHED_PAREN",
|
||||
"U_REGEX_MATCH_MODE_ERROR"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
@ -500,18 +500,17 @@ typedef enum UErrorCode {
|
||||
/*
|
||||
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
|
||||
*/
|
||||
U_REGEX_ERROR_START=0x10300,
|
||||
U_REGEX_INTERNAL_ERROR,
|
||||
U_REGEX_RULE_SYNTAX,
|
||||
U_REGEX_INVALID_STATE,
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE,
|
||||
U_REGEX_PROPERTY_SYNTAX,
|
||||
U_REGEX_UNIMPLEMENTED,
|
||||
U_REGEX_MISMATCHED_PAREN,
|
||||
U_REGEX_MATCH_MODE_ERROR,
|
||||
U_REGEX_ERROR_LIMIT,
|
||||
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
|
||||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
|
||||
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
} UErrorCode;
|
||||
|
||||
/* Use the following to determine if an UErrorCode represents */
|
||||
|
@ -28,8 +28,6 @@
|
||||
#include "ucln_in.h"
|
||||
#include "mutex.h"
|
||||
|
||||
#include "stdio.h" // TODO: Get rid of this
|
||||
|
||||
#include "regeximp.h"
|
||||
#include "regexcst.h" // Contains state table for the regex pattern parser.
|
||||
// generated by a Perl script.
|
||||
@ -40,7 +38,6 @@
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const char RegexCompile::fgClassID=0;
|
||||
static const int RESCAN_DEBUG = 0;
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
||||
|
||||
//
|
||||
// Set up the constant (static) Unicode Sets.
|
||||
// TODO: something cleaner for that -128 constant.
|
||||
//
|
||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
|
||||
@ -282,14 +280,12 @@ void RegexCompile::compile(
|
||||
// the search will stop there, if not before.
|
||||
//
|
||||
tableEl = &gRuleParseStateTable[state];
|
||||
if (RESCAN_DEBUG) {
|
||||
printf( "char, line, col = (\'%c\', %d, %d) state=%s ",
|
||||
REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ",
|
||||
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
|
||||
}
|
||||
|
||||
for (;;) { // loop through table rows belonging to this state, looking for one
|
||||
// that matches the current input char.
|
||||
if (RESCAN_DEBUG) { printf( ".");}
|
||||
REGEX_SCAN_DEBUG_PRINTF( ".");
|
||||
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
|
||||
// Table row specified an individual character, not a set, and
|
||||
// the input character is not quoted, and
|
||||
@ -323,7 +319,7 @@ void RegexCompile::compile(
|
||||
// No match on this row, advance to the next row for this state,
|
||||
tableEl++;
|
||||
}
|
||||
if (RESCAN_DEBUG) { printf( "\n");}
|
||||
REGEX_SCAN_DEBUG_PRINTF("\n");
|
||||
|
||||
//
|
||||
// We've found the row of the state table that matches the current input
|
||||
@ -340,7 +336,7 @@ void RegexCompile::compile(
|
||||
fStackPtr++;
|
||||
if (fStackPtr >= kStackSize) {
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
// printf( "RegexCompile::parse() - state stack overflow.\n");
|
||||
REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
|
||||
fStackPtr--;
|
||||
}
|
||||
fStack[fStackPtr] = tableEl->fPushState;
|
||||
@ -369,6 +365,36 @@ void RegexCompile::compile(
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// The pattern has now been read and processed, and the compiled code generated.
|
||||
//
|
||||
|
||||
//
|
||||
// Compute the number of digits required for the largest capture group number.
|
||||
//
|
||||
fRXPat->fMaxCaptureDigits = 1;
|
||||
int32_t n = 10;
|
||||
for (;;) {
|
||||
if (n > fRXPat->fNumCaptureGroups) {
|
||||
break;
|
||||
}
|
||||
fRXPat->fMaxCaptureDigits++;
|
||||
n *= 10;
|
||||
}
|
||||
|
||||
//
|
||||
// A stupid bit of non-sense to prevent code coverage testing from complaining
|
||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||
// even though, without the #define set, it will do nothing.
|
||||
//
|
||||
#ifndef REGEX_DUMP_DEBUG
|
||||
static UBool phonyDumpDone = FALSE;
|
||||
if (phonyDumpDone==FALSE) {
|
||||
fRXPat->dump();
|
||||
phonyDumpDone = TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1094,27 +1120,39 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
||||
if (theSet == NULL) {
|
||||
return;
|
||||
}
|
||||
if (theSet->size() > 1) {
|
||||
// The set contains two or more chars.
|
||||
int32_t setSize = theSet->size();
|
||||
UChar32 firstSetChar = theSet->charAt(0);
|
||||
if (firstSetChar == -1) {
|
||||
// Sets that contain only strings, but no individual chars,
|
||||
// will end up here. TODO: figure out what to do with sets containing strings.
|
||||
setSize = 0;
|
||||
}
|
||||
|
||||
switch (setSize) {
|
||||
case 0: // Set of no elements. Always fails to match.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
{
|
||||
// The set contains only a single code point. Put it into
|
||||
// the compiled pattern as a single char operation rather
|
||||
// than a set, and discard the set itself.
|
||||
int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
|
||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||
delete theSet;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
{
|
||||
// The set contains two or more chars. (the normal case)
|
||||
// Put it into the compiled pattern as a set.
|
||||
int32_t setNumber = fRXPat->fSets->size();
|
||||
fRXPat->fSets->addElement(theSet, *fStatus);
|
||||
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
|
||||
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
|
||||
}
|
||||
else
|
||||
{
|
||||
// The set contains only a single code point. Put it into
|
||||
// the compiled pattern as a single char operation rather
|
||||
// than a set, and discard the set itself.
|
||||
UChar32 c = theSet->charAt(0);
|
||||
if (c == -1) {
|
||||
// Set contained no chars. Stuff an invalid char that can't match.
|
||||
c = 0x1fffff;
|
||||
}
|
||||
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
|
||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||
delete theSet;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
|
||||
if (U_FAILURE(localStatus)) {
|
||||
// TODO: Get more accurate position of the error from UnicodeSet's return info.
|
||||
// UnicodeSet appears to not be reporting correctly at this time.
|
||||
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
||||
REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
||||
error(localStatus);
|
||||
delete uset;
|
||||
return NULL;
|
||||
|
@ -28,8 +28,6 @@
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
static const UBool REGEX_DEBUG = TRUE;
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// class RegexCompile Contains the regular expression compiler.
|
||||
|
@ -13,13 +13,45 @@
|
||||
#define _REGEXIMP_H
|
||||
|
||||
|
||||
//
//  Debugging support.  Enable one or more of the #defines immediately following.
//
//    REGEX_SCAN_DEBUG   makes REGEX_SCAN_DEBUG_PRINTF expand to printf.
//    REGEX_DUMP_DEBUG   makes REGEX_DUMP_DEBUG_PRINTF expand to printf.
//    REGEX_RUN_DEBUG    makes REGEX_RUN_DEBUG_PRINTF expand to printf, and also
//                       switches on REGEX_DUMP_DEBUG_PRINTF.
//
//  A macro that is not enabled expands to nothing, so a call such as
//      REGEX_RUN_DEBUG_PRINTF("fmt", a, b);
//  is left behind as the parenthesized expression ("fmt", a, b); - it prints
//  nothing, but its arguments must still compile.
//
//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
//  End of #defines intended to be directly set.

#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF printf
#else
#define REGEX_SCAN_DEBUG_PRINTF
#endif

//  The dump macro is selected by a single conditional covering both switches.
//  Defining it once here, rather than defining it again inside the
//  REGEX_RUN_DEBUG section, avoids redefining a macro with a different
//  replacement list when REGEX_RUN_DEBUG is on and REGEX_DUMP_DEBUG is off.
#if defined(REGEX_DUMP_DEBUG) || defined(REGEX_RUN_DEBUG)
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_DUMP_DEBUG_PRINTF
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF printf
#else
#define REGEX_RUN_DEBUG_PRINTF
#endif

//  printf is only needed when at least one of the debug switches is on.
#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
#include <stdio.h>
#endif
|
||||
|
||||
|
||||
//
|
||||
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
|
||||
// of the entries.
|
||||
//
|
||||
enum {
|
||||
URX_RESERVED_OP = 0,
|
||||
URX_UNUSED1 = 1,
|
||||
URX_BACKTRACK = 1,
|
||||
URX_END = 2,
|
||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||
URX_STRING = 4, // Value field is index of string start
|
||||
@ -52,7 +84,7 @@ enum {
|
||||
// Used for debug printing only.
|
||||
#define URX_OPCODE_NAMES \
|
||||
"URX_RESERVED_OP", \
|
||||
"URX_UNUSED1", \
|
||||
"URX_BACKTRACK", \
|
||||
"END", \
|
||||
"ONECHAR", \
|
||||
"STRING", \
|
||||
|
@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
|
||||
|
||||
|
||||
|
||||
UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const {
|
||||
int32_t s = start(group, status);
|
||||
int32_t e = end(group, status);
|
||||
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
|
||||
int32_t s = start(groupNum, status);
|
||||
int32_t e = end(groupNum, status);
|
||||
|
||||
// Note: calling start() and end() above will do all necessary checking that
|
||||
// the group number is OK and that a match exists. status will be set.
|
||||
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
int32_t opType; // the opcode
|
||||
int32_t opValue; // and the operand value.
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
{
|
||||
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||
printf("Original Pattern: ");
|
||||
int i;
|
||||
for (i=0; i<fPattern->fPattern.length(); i++) {
|
||||
printf("%c", fPattern->fPattern.charAt(i));
|
||||
}
|
||||
printf("\n");
|
||||
printf("Input String: ");
|
||||
for (i=0; i<fInput->length(); i++) {
|
||||
UChar c = fInput->charAt(i);
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
}
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf("PatLoc inputIdx char\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
op = pat->elementAti(patIdx);
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
// printf("%d %d \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx));
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
|
||||
fPattern->dumpOp(patIdx);
|
||||
#endif
|
||||
patIdx++;
|
||||
|
||||
switch (opType) {
|
||||
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKTRACK:
|
||||
// Force a backtrack. In some circumstances, the pattern compiler
|
||||
// will notice that the pattern can't possibly match anything, and will
|
||||
// emit one of these at that point.
|
||||
backTrack(inputIdx, patIdx);
|
||||
break;
|
||||
|
||||
|
||||
case URX_ONECHAR:
|
||||
{
|
||||
UChar32 inputChar = fInput->char32At(inputIdx);
|
||||
@ -909,6 +942,11 @@ breakFromLoop:
|
||||
fLastMatchEnd = fMatchEnd;
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = inputIdx;
|
||||
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -18,8 +18,6 @@
|
||||
#include "regexcmp.h"
|
||||
#include "regeximp.h"
|
||||
|
||||
#include "stdio.h" // TODO: get rid of this...
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
@ -197,7 +195,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
|
||||
//---------------------------------------------------------------------
|
||||
RegexPattern *RegexPattern::compile(
|
||||
const UnicodeString ®ex,
|
||||
int32_t flags,
|
||||
uint32_t flags,
|
||||
UParseError &pe,
|
||||
UErrorCode &status) {
|
||||
|
||||
@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString ®ex,
|
||||
// flags
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
int32_t RegexPattern::flags() const {
|
||||
uint32_t RegexPattern::flags() const {
|
||||
return fFlags;
|
||||
}
|
||||
|
||||
@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// split
|
||||
// TODO: perl returns captured strings intermixed with the
|
||||
// fields. Should we do this too?
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
int32_t RegexPattern::split(const UnicodeString &input,
|
||||
@ -383,10 +379,28 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
||||
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
|
||||
dest[i].setTo(input, nextOutputStringStart, fieldLen);
|
||||
nextOutputStringStart = fMatcher->fMatchEnd;
|
||||
|
||||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
int32_t groupNum;
|
||||
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
|
||||
if (i==destCapacity-1) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
dest[i] = fMatcher->group(groupNum, status);
|
||||
}
|
||||
|
||||
if (nextOutputStringStart == inputLen) {
|
||||
// The delimiter was at the end of the string. We're done.
|
||||
break;
|
||||
}
|
||||
|
||||
if (i==destCapacity-1) {
|
||||
// We've filled up the last output string with capture group data.
|
||||
// Give back the last string, to be used for the remainder of the input.
|
||||
i--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -410,35 +424,16 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
||||
//---------------------------------------------------------------------
|
||||
static const char *opNames[] = {URX_OPCODE_NAMES};
|
||||
|
||||
void RegexPattern::dump() {
|
||||
int index;
|
||||
int i;
|
||||
UChar c;
|
||||
int32_t op;
|
||||
int32_t pinnedType;
|
||||
int32_t type;
|
||||
int32_t val;
|
||||
int32_t stringStart;
|
||||
|
||||
|
||||
printf("Original Pattern: ");
|
||||
for (i=0; i<fPattern.length(); i++) {
|
||||
printf("%c", fPattern.charAt(i));
|
||||
}
|
||||
printf("\n");
|
||||
printf("Pattern Valid?: %s\n", fBadState? "no" : "yes");
|
||||
printf("\nIndex Binary Type Operand\n"
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; ; index++) {
|
||||
op = fCompiledPat->elementAti(index);
|
||||
val = URX_VAL(op);
|
||||
type = URX_TYPE(op);
|
||||
pinnedType = type;
|
||||
void RegexPattern::dumpOp(int32_t index) const {
|
||||
int32_t op = fCompiledPat->elementAti(index);
|
||||
int32_t val = URX_VAL(op);
|
||||
int32_t type = URX_TYPE(op);
|
||||
int32_t pinnedType = type;
|
||||
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
|
||||
pinnedType = 0;
|
||||
}
|
||||
|
||||
printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
|
||||
REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index, op, opNames[pinnedType]);
|
||||
switch (type) {
|
||||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
@ -446,12 +441,12 @@ void RegexPattern::dump() {
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_END:
|
||||
// Types with no operand field of interest.
|
||||
break;
|
||||
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_SETREF:
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_STATE_SAVE:
|
||||
case URX_JMP:
|
||||
@ -461,37 +456,70 @@ void RegexPattern::dump() {
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_STRING_LEN:
|
||||
// types with an integer operand field.
|
||||
printf("%d", val);
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
break;
|
||||
|
||||
case URX_ONECHAR:
|
||||
printf("%c", val<256?val:'?');
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
|
||||
break;
|
||||
|
||||
case URX_STRING:
|
||||
stringStart = val;
|
||||
break;
|
||||
|
||||
case URX_STRING_LEN:
|
||||
for (i=stringStart; i<stringStart+val; i++) {
|
||||
c = fLiteralText[i];
|
||||
if (c >= 256) {c = '?';};
|
||||
printf("%c", c);
|
||||
{
|
||||
int32_t lengthOp = fCompiledPat->elementAti(index+1);
|
||||
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
|
||||
int32_t length = URX_VAL(lengthOp);
|
||||
int32_t i;
|
||||
for (i=val; i<val+length; i++) {
|
||||
UChar c = fLiteralText[i];
|
||||
if (c < 32 || c >= 256) {c = '.';}
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", c);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_END:
|
||||
goto breakFromLoop;
|
||||
case URX_SETREF:
|
||||
{
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d ", val);
|
||||
UnicodeString s;
|
||||
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
|
||||
set->toPattern(s, TRUE);
|
||||
for (int32_t i=0; i<s.length(); i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
default:
|
||||
printf("??????");
|
||||
REGEX_DUMP_DEBUG_PRINTF("??????");
|
||||
break;
|
||||
}
|
||||
printf("\n");
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||
}
|
||||
breakFromLoop:
|
||||
printf("\n\n");
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void RegexPattern::dump() const {
|
||||
int index;
|
||||
int i;
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF("Original Pattern: ");
|
||||
for (i=0; i<fPattern.length(); i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
|
||||
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||
dumpOp(index);
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n\n");
|
||||
};
|
||||
|
||||
const char RegexPattern::fgClassID = 0;
|
||||
|
@ -81,6 +81,8 @@ enum {
|
||||
* to be applied to input text, and a few convenience methods for simple common
|
||||
* uses of regular expressions.
|
||||
*
|
||||
* <p>Class RegexPattern is not intended to be subclassed.</p>
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
class U_I18N_API RegexPattern: public UObject {
|
||||
@ -192,7 +194,7 @@ public:
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static RegexPattern *compile( const UnicodeString ®ex,
|
||||
int32_t flags,
|
||||
uint32_t flags,
|
||||
UParseError &pe,
|
||||
UErrorCode &status);
|
||||
|
||||
@ -202,7 +204,7 @@ public:
|
||||
* @return the match mode flags
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t flags() const;
|
||||
virtual uint32_t flags() const;
|
||||
|
||||
/*
|
||||
* Creates a RegexMatcher that will match the given input against this pattern. The
|
||||
@ -275,7 +277,7 @@ public:
|
||||
//
|
||||
// dump Debug function, displays the compiled form of a pattern.
|
||||
//
|
||||
void dump();
|
||||
void dump() const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
@ -291,14 +293,12 @@ public:
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
|
||||
static const char fgClassID;
|
||||
|
||||
private:
|
||||
//
|
||||
// Implementation Data
|
||||
//
|
||||
UnicodeString fPattern; // The original pattern string.
|
||||
int32_t fFlags; // The flags used when compiling the pattern.
|
||||
uint32_t fFlags; // The flags used when compiling the pattern.
|
||||
//
|
||||
UVector *fCompiledPat; // The compiled pattern.
|
||||
UnicodeString fLiteralText; // Any literal string data from the pattern,
|
||||
@ -317,6 +317,12 @@ private:
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
*/
|
||||
static const char fgClassID;
|
||||
|
||||
friend class RegexCompile;
|
||||
friend class RegexMatcher;
|
||||
|
||||
@ -325,6 +331,7 @@ private:
|
||||
//
|
||||
void init(); // Common initialization, for use by constructors.
|
||||
void zap(); // Common cleanup
|
||||
void dumpOp(int32_t index) const;
|
||||
|
||||
|
||||
|
||||
@ -343,6 +350,8 @@ private:
|
||||
* input text to which the expression can be applied. It includes methods
|
||||
* for testing for matches, and for find and replace operations.
|
||||
*
|
||||
* <p>Class RegexMatcher is not intended to be subclassed.</p>
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
class U_I18N_API RegexMatcher: public UObject {
|
||||
@ -355,6 +364,227 @@ public:
|
||||
*/
|
||||
virtual ~RegexMatcher();
|
||||
|
||||
|
||||
/**
|
||||
* Attempts to match the entire input string against the pattern.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool matches(UErrorCode &status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, starting from the beginning, against the pattern.
|
||||
* Like the matches() method, this function always starts at the beginning of the input string;
|
||||
* unlike that function, it does not require that the entire input string be matched.
|
||||
*
|
||||
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
|
||||
* <code>end()</code>, and <code>group()</code> functions.</p>
|
||||
*
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match at the start of the input string.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool lookingAt(UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Find the next pattern match in the input string.
|
||||
* The find begins searching the input at the location following the end of
|
||||
* the previous match, or at the start of the string if there is no previous match.
|
||||
* If a match is found, <code>start(), end()</code> and <code>group()</code>
|
||||
* will provide more information regarding the match.
|
||||
* <p>Note that if the input string is changed by the application,
|
||||
* use find(startPos, status) instead of find(), because the saved starting
|
||||
* position may not be valid with the altered input string.</p>
|
||||
* @return TRUE if a match is found.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool find();
|
||||
|
||||
|
||||
/**
|
||||
* Resets this RegexMatcher and then attempts to find the next substring of the
|
||||
* input string that matches the pattern, starting at the specified index.
|
||||
*
|
||||
* @param start the position in the input string to begin the search
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if a match is found.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool find(int32_t start, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text matched by the previous match.
|
||||
* If the pattern can match an empty string, an empty string may be returned.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed.
|
||||
* @return a string containing the matched input text.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString group(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text captured by the given group
|
||||
* during the previous match operation. Group(0) is the entire match.
|
||||
*
|
||||
* @param groupNum the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
|
||||
* @return the captured text
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the number of capturing groups in this matcher's pattern.
|
||||
* @return the number of capture groups
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t groupCount() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the start of the text matched
|
||||
* during the previous match operation.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return The position in the input string of the start of the last match.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t start(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the start of the text matched by the
|
||||
* specified capture group during the previous match operation. Return -1 if
|
||||
* the capture group exists in the pattern, but was not part of the last match.
|
||||
*
|
||||
* @param group the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed, and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||
* @return the start position of substring matched by the specified group.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t start(int group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the character following the
|
||||
* text matched during the previous match operation.
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed.
|
||||
* @return the index of the last character matched, plus one.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the character following the
|
||||
* text matched by the specified capture group during the previous match operation.
|
||||
* @param group the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||
* @return the index of the last character, plus one, of the text
|
||||
* captured by the specified group during the previous match operation.
|
||||
* Return -1 if the capture group was not part of the match.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(int group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher. The effect is to remove any memory of previous matches,
|
||||
* and to cause subsequent find() operations to begin at the beginning of
|
||||
* the input string.
|
||||
*
|
||||
* @return this RegexMatcher.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher &reset();
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
||||
* to be reused, which is more efficient than creating a new RegexMatcher for
|
||||
* each input string to be processed.
|
||||
* @return this RegexMatcher.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher &reset(const UnicodeString &input);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the input string being matched. The returned string is not a copy,
|
||||
* but the live input string. It should not be altered or deleted.
|
||||
* @return the input string
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual const UnicodeString &input() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the pattern that is interpreted by this matcher.
|
||||
* @return the RegexPattern for this RegexMatcher
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual const RegexPattern &pattern() const;
|
||||
|
||||
|
||||
/**
|
||||
* Replaces every substring of the input that matches the pattern
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace-all operation.
|
||||
*
|
||||
* This method first resets this matcher. It then scans the input string
|
||||
* looking for matches of the pattern. Input that is not part of any
|
||||
* match is left unchanged; each match is replaced in the result by the
|
||||
* replacement string. The replacement string may contain references to
|
||||
* capture groups.
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a string containing the results of the find and replace.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches
|
||||
* the pattern with the replacement string. This is a convenience
|
||||
* function that provides a complete find-and-replace operation.
|
||||
*
|
||||
* <p>This function first resets this RegexMatcher. It then scans the input string
|
||||
* looking for a match of the pattern. Input that is not part
|
||||
* of the match is appended directly to the result string; the match is replaced
|
||||
* in the result by the replacement string. The replacement string may contain
|
||||
* references to captured groups.</p>
|
||||
*
|
||||
* <p>The state of the matcher (the position at which a subsequent find()
|
||||
* would begin) after completing a replaceFirst() is not specified. The
|
||||
* RegexMatcher should be reset before doing additional find() operations.</p>
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a string containing the results of the find and replace.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
* incremental find-and-replace.
|
||||
@ -399,219 +629,6 @@ public:
|
||||
virtual UnicodeString &appendTail(UnicodeString &dest);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the character following the
|
||||
* text matched during the previous match operation.
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed.
|
||||
* @return the index of the last character matched, plus one.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the character following the
|
||||
* text matched by the specified capture group during the previous match operation.
|
||||
* @param group the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||
* @return the index of the last character, plus one, of the text
|
||||
* captured by the specified group during the previous match operation.
|
||||
* Return -1 if the capture group was not part of the match.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(int group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Find the next pattern match in the input string.
|
||||
* The find begins searching the input at the location following the end of
|
||||
* the previous match, or at the start of the string if there is no previous match.
|
||||
* If a match is found, <code>start(), end()</code> and <code>group()</code>
|
||||
* will provide more information regarding the match.
|
||||
* @return TRUE if a match is found.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool find();
|
||||
|
||||
|
||||
/**
|
||||
* Resets this RegexMatcher and then attempts to find the next substring of the
|
||||
* input string that matches the pattern, starting at the specified index.
|
||||
*
|
||||
* @param start the position in the input string to begin the search
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if a match is found.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool find(int32_t start, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text matched by the previous match.
|
||||
* If the pattern can match an empty string, an empty string may be returned.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed.
|
||||
* @return a string containing the matched input text.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString group(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns a string containing the text captured by the given group
|
||||
* during the previous match operation. Group(0) is the entire match.
|
||||
*
|
||||
* @param group the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||
* has been attempted or the last match failed and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
|
||||
* @return the captured text
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString group(int32_t group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the number of capturing groups in this matcher's pattern.
|
||||
* @return the number of capture groups
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t groupCount() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the input string being matched. The returned string is not a copy,
|
||||
* but the live input string. It should not be altered or deleted.
|
||||
* @return the input string
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual const UnicodeString &input() const;
|
||||
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, starting from the beginning, against the pattern.
|
||||
* Like the matches() method, this function always starts at the beginning of the input string;
|
||||
* unlike that function, it does not require that the entire input string be matched.
|
||||
*
|
||||
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
|
||||
* <code>end()</code>, and <code>group()</code> functions.</p>
|
||||
*
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match at the start of the input string.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool lookingAt(UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Attempts to match the entire input string against the pattern.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UBool matches(UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the pattern that is interpreted by this matcher.
|
||||
* @return the RegexPattern for this RegexMatcher
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual const RegexPattern &pattern() const;
|
||||
|
||||
|
||||
/**
|
||||
* Replaces every substring of the input that matches the pattern
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace-all operation.
|
||||
*
|
||||
* This method first resets this matcher. It then scans the input string
|
||||
* looking for matches of the pattern. Input that is not part of any
|
||||
* match is left unchanged; each match is replaced in the result by the
|
||||
* replacement string. The replacement string may contain references to
|
||||
* capture groups.
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a string containing the results of the find and replace.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Replaces the first substring of the input that matches
|
||||
* the pattern with the replacement string. This is a convenience
|
||||
* function that provides a complete find-and-replace operation.
|
||||
*
|
||||
* This function first resets this RegexMatcher. It then scans the input string
|
||||
* looking for a match of the pattern. Input that is not part
|
||||
* of the match is appended directly to the result string; the match is replaced
|
||||
* in the result by the replacement string. The replacement string may contain
|
||||
* references to captured groups.
|
||||
*
|
||||
* @param replacement a string containing the replacement text.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return a string containing the results of the find and replace.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher. The effect is to remove any memory of previous matches,
|
||||
* and to cause subsequent find() operations to begin at the beginning of
|
||||
* the input string.
|
||||
*
|
||||
* @return this RegexMatcher.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher &reset();
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
||||
* to be reused, which is more efficient than creating a new RegexMatcher for
|
||||
* each input string to be processed.
|
||||
* @param input the new input string to be matched.
* @return this RegexMatcher.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher &reset(const UnicodeString &input);
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the start of the text matched
|
||||
* during the previous match operation.
|
||||
* @param status a reference to a UErrorCode to receive any errors.
|
||||
* @return The position in the input string of the start of the last match.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t start(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the start of the text matched by the
|
||||
* specified capture group during the previous match operation. Return -1 if
|
||||
* the capture group exists in the pattern, but was not part of the last match.
|
||||
*
|
||||
* @param group the capture group number
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed, and
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||
* @return the start position of substring matched by the specified group.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t start(int group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
@ -626,8 +643,6 @@ public:
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
|
||||
static const char fgClassID;
|
||||
|
||||
private:
|
||||
// Constructors and other object boilerplate are private.
|
||||
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
|
||||
@ -658,6 +673,13 @@ private:
|
||||
UVector *fCaptureStarts;
|
||||
UVector *fCaptureEnds;
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
*/
|
||||
static const char fgClassID;
|
||||
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -368,7 +368,7 @@ void RegexTest::Basic() {
|
||||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
|
||||
REGEX_FIND("[{ab}]", "a");
|
||||
}
|
||||
exit(1);
|
||||
#endif
|
||||
@ -436,6 +436,9 @@ void RegexTest::Basic() {
|
||||
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
|
||||
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
|
||||
|
||||
// Set contains only a string, no individual chars.
|
||||
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
|
||||
|
||||
//
|
||||
// OR operator in patterns
|
||||
//
|
||||
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {
|
||||
|
||||
delete pat1;
|
||||
|
||||
// split, with a pattern with (capture)
|
||||
pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(fields[0]=="");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
REGEX_ASSERT(fields[3]=="b");
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[6]=="");
|
||||
|
||||
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==6);
|
||||
REGEX_ASSERT(fields[0]==" ");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
REGEX_ASSERT(fields[3]=="b");
|
||||
REGEX_ASSERT(fields[4]=="the time");
|
||||
REGEX_ASSERT(fields[5]=="c");
|
||||
REGEX_ASSERT(fields[6]=="");
|
||||
|
||||
n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==4);
|
||||
REGEX_ASSERT(fields[0]==" ");
|
||||
REGEX_ASSERT(fields[1]=="a");
|
||||
REGEX_ASSERT(fields[2]=="Now is ");
|
||||
REGEX_ASSERT(fields[3]=="the time<c>");
|
||||
delete pat1;
|
||||
|
||||
pat1 = RegexPattern::compile("([-,])", pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
n = pat1->split("1-10,20", fields, 10, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(n==5);
|
||||
REGEX_ASSERT(fields[0]=="1");
|
||||
REGEX_ASSERT(fields[1]=="-");
|
||||
REGEX_ASSERT(fields[2]=="10");
|
||||
REGEX_ASSERT(fields[3]==",");
|
||||
REGEX_ASSERT(fields[4]=="20");
|
||||
delete pat1;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user