ICU-105 Regular Expressions, changes from code review

X-SVN-Rev: 10294
This commit is contained in:
Andy Heninger 2002-11-19 19:31:03 +00:00
parent bf1f6b1213
commit 24bf088281
9 changed files with 556 additions and 353 deletions

View File

@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_PROPERTY_SYNTAX", "U_REGEX_PROPERTY_SYNTAX",
"U_REGEX_UNIMPLEMENTED", "U_REGEX_UNIMPLEMENTED",
"U_REGEX_MISMATCHED_PAREN", "U_REGEX_MISMATCHED_PAREN",
"U_REGEX_MATCH_MODE_ERROR"
}; };
U_CAPI const char * U_EXPORT2 U_CAPI const char * U_EXPORT2

View File

@ -500,18 +500,17 @@ typedef enum UErrorCode {
/* /*
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
*/ */
U_REGEX_ERROR_START=0x10300, U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_INVALID_STATE, U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
U_REGEX_BAD_ESCAPE_SEQUENCE, U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_MATCH_MODE_ERROR, U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_REGEX_ERROR_LIMIT,
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
} UErrorCode; } UErrorCode;
/* Use the following to determine if an UErrorCode represents */ /* Use the following to determine if an UErrorCode represents */

View File

@ -28,8 +28,6 @@
#include "ucln_in.h" #include "ucln_in.h"
#include "mutex.h" #include "mutex.h"
#include "stdio.h" // TODO: Get rid of this
#include "regeximp.h" #include "regeximp.h"
#include "regexcst.h" // Contains state table for the regex pattern parser. #include "regexcst.h" // Contains state table for the regex pattern parser.
// generated by a Perl script. // generated by a Perl script.
@ -40,7 +38,6 @@
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
const char RegexCompile::fgClassID=0; const char RegexCompile::fgClassID=0;
static const int RESCAN_DEBUG = 0;
//---------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------
// //
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
// //
// Set up the constant (static) Unicode Sets. // Set up the constant (static) Unicode Sets.
// TODO: something cleaner for that -128 constant.
// //
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status); ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status); ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
@ -282,14 +280,12 @@ void RegexCompile::compile(
// the search will stop there, if not before. // the search will stop there, if not before.
// //
tableEl = &gRuleParseStateTable[state]; tableEl = &gRuleParseStateTable[state];
if (RESCAN_DEBUG) { REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ",
printf( "char, line, col = (\'%c\', %d, %d) state=%s ", fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
}
for (;;) { // loop through table rows belonging to this state, looking for one for (;;) { // loop through table rows belonging to this state, looking for one
// that matches the current input char. // that matches the current input char.
if (RESCAN_DEBUG) { printf( ".");} REGEX_SCAN_DEBUG_PRINTF( ".");
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
// Table row specified an individual character, not a set, and // Table row specified an individual character, not a set, and
// the input character is not quoted, and // the input character is not quoted, and
@ -323,7 +319,7 @@ void RegexCompile::compile(
// No match on this row, advance to the next row for this state, // No match on this row, advance to the next row for this state,
tableEl++; tableEl++;
} }
if (RESCAN_DEBUG) { printf( "\n");} REGEX_SCAN_DEBUG_PRINTF("\n");
// //
// We've found the row of the state table that matches the current input // We've found the row of the state table that matches the current input
@ -340,7 +336,7 @@ void RegexCompile::compile(
fStackPtr++; fStackPtr++;
if (fStackPtr >= kStackSize) { if (fStackPtr >= kStackSize) {
error(U_REGEX_INTERNAL_ERROR); error(U_REGEX_INTERNAL_ERROR);
// printf( "RegexCompile::parse() - state stack overflow.\n"); REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
fStackPtr--; fStackPtr--;
} }
fStack[fStackPtr] = tableEl->fPushState; fStack[fStackPtr] = tableEl->fPushState;
@ -369,6 +365,36 @@ void RegexCompile::compile(
} }
//
// The pattern has now been read and processed, and the compiled code generated.
//
//
// Compute the number of digits required for the largest capture group number.
//
fRXPat->fMaxCaptureDigits = 1;
int32_t n = 10;
for (;;) {
if (n > fRXPat->fNumCaptureGroups) {
break;
}
fRXPat->fMaxCaptureDigits++;
n *= 10;
}
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
// even though, without the #define set, it will do nothing.
//
#ifndef REGEX_DUMP_DEBUG
static UBool phonyDumpDone = FALSE;
if (phonyDumpDone==FALSE) {
fRXPat->dump();
phonyDumpDone = TRUE;
}
#endif
} }
@ -1094,27 +1120,39 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
if (theSet == NULL) { if (theSet == NULL) {
return; return;
} }
if (theSet->size() > 1) { int32_t setSize = theSet->size();
// The set contains two or more chars. UChar32 firstSetChar = theSet->charAt(0);
// Put it into the compiled pattern as a set. if (firstSetChar == -1) {
int32_t setNumber = fRXPat->fSets->size(); // Sets that contain only strings, but no individual chars,
fRXPat->fSets->addElement(theSet, *fStatus); // will end up here. TODO: figure out what to do with sets containing strings.
int32_t setOp = URX_BUILD(URX_SETREF, setNumber); setSize = 0;
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
} }
else
{ switch (setSize) {
// The set contains only a single code point. Put it into case 0: // Set of no elements. Always fails to match.
// the compiled pattern as a single char operation rather fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
// than a set, and discard the set itself. break;
UChar32 c = theSet->charAt(0);
if (c == -1) { case 1:
// Set contained no chars. Stuff an invalid char that can't match. {
c = 0x1fffff; // The set contains only a single code point. Put it into
// the compiled pattern as a single char operation rather
// than a set, and discard the set itself.
int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
delete theSet;
}
break;
default:
{
// The set contains two or more chars. (the normal case)
// Put it into the compiled pattern as a set.
int32_t setNumber = fRXPat->fSets->size();
fRXPat->fSets->addElement(theSet, *fStatus);
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
} }
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
delete theSet;
} }
} }
@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
if (U_FAILURE(localStatus)) { if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info. // TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time. // UnicodeSet appears to not be reporting correctly at this time.
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
error(localStatus); error(localStatus);
delete uset; delete uset;
return NULL; return NULL;

View File

@ -28,8 +28,6 @@
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
static const UBool REGEX_DEBUG = TRUE;
//-------------------------------------------------------------------------------- //--------------------------------------------------------------------------------
// //
// class RegexCompile Contains the regular expression compiler. // class RegexCompile Contains the regular expression compiler.

View File

@ -13,13 +13,45 @@
#define _REGEXIMP_H #define _REGEXIMP_H
//
//  Debugging support.  Enable one or more of the #defines immediately following.
//
//     REGEX_SCAN_DEBUG   trace the pattern scanner / parser state machine.
//     REGEX_DUMP_DEBUG   make RegexPattern::dump() / dumpOp() produce output.
//     REGEX_RUN_DEBUG    trace the match engine; this also switches the dump
//                        output on, because the run-time trace prints each
//                        operation with dumpOp().
//
//  Each REGEX_xxx_DEBUG_PRINTF macro expands to printf when its option is
//  enabled and to nothing otherwise, in which case a call such as
//  REGEX_SCAN_DEBUG_PRINTF("x") is left as a parenthesized expression
//  with no output.
//
//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
//  End of #defines intended to be directly set.

#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF printf
#else
#define REGEX_SCAN_DEBUG_PRINTF
#endif

#ifdef REGEX_DUMP_DEBUG
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_DUMP_DEBUG_PRINTF
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF printf
//  Run tracing needs live dump output.  The macro was already defined just
//  above (possibly as empty), so remove that definition first; redefining a
//  macro with a different replacement list is not permitted.
#undef  REGEX_DUMP_DEBUG_PRINTF
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_RUN_DEBUG_PRINTF
#endif

//  printf is only needed when at least one of the debug options is on.
#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
#include <stdio.h>
#endif
// //
// Opcode types In the compiled form of the regexp, these are the type, or opcodes, // Opcode types In the compiled form of the regexp, these are the type, or opcodes,
// of the entries. // of the entries.
// //
enum { enum {
URX_RESERVED_OP = 0, URX_RESERVED_OP = 0,
URX_UNUSED1 = 1, URX_BACKTRACK = 1,
URX_END = 2, URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
URX_STRING = 4, // Value field is index of string start URX_STRING = 4, // Value field is index of string start
@ -52,7 +84,7 @@ enum {
// Used for debug printing only. // Used for debug printing only.
#define URX_OPCODE_NAMES \ #define URX_OPCODE_NAMES \
"URX_RESERVED_OP", \ "URX_RESERVED_OP", \
"URX_UNUSED1", \ "URX_BACKTRACK", \
"END", \ "END", \
"ONECHAR", \ "ONECHAR", \
"STRING", \ "STRING", \

View File

@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const { UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
int32_t s = start(group, status); int32_t s = start(groupNum, status);
int32_t e = end(group, status); int32_t e = end(groupNum, status);
// Note: calling start() and end() above will do all necessary checking that // Note: calling start() and end() above will do all necessary checking that
// the group number is OK and that a match exists. status will be set. // the group number is OK and that a match exists. status will be set.
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int32_t opType; // the opcode int32_t opType; // the opcode
int32_t opValue; // and the operand value. int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
{
printf("MatchAt(startIdx=%d)\n", startIdx);
printf("Original Pattern: ");
int i;
for (i=0; i<fPattern->fPattern.length(); i++) {
printf("%c", fPattern->fPattern.charAt(i));
}
printf("\n");
printf("Input String: ");
for (i=0; i<fInput->length(); i++) {
UChar c = fInput->charAt(i);
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
}
printf("\n");
printf("\n");
printf("PatLoc inputIdx char\n");
}
#endif
if (U_FAILURE(status)) { if (U_FAILURE(status)) {
return; return;
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
op = pat->elementAti(patIdx); op = pat->elementAti(patIdx);
opType = URX_TYPE(op); opType = URX_TYPE(op);
opValue = URX_VAL(op); opValue = URX_VAL(op);
// printf("%d %d \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx)); #ifdef REGEX_RUN_DEBUG
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
fPattern->dumpOp(patIdx);
#endif
patIdx++; patIdx++;
switch (opType) { switch (opType) {
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break; break;
case URX_BACKTRACK:
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
backTrack(inputIdx, patIdx);
break;
case URX_ONECHAR: case URX_ONECHAR:
{ {
UChar32 inputChar = fInput->char32At(inputIdx); UChar32 inputChar = fInput->char32At(inputIdx);
@ -909,7 +942,12 @@ breakFromLoop:
fLastMatchEnd = fMatchEnd; fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx; fMatchStart = startIdx;
fMatchEnd = inputIdx; fMatchEnd = inputIdx;
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
} }
else
{
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
}
return; return;
} }

View File

@ -18,8 +18,6 @@
#include "regexcmp.h" #include "regexcmp.h"
#include "regeximp.h" #include "regeximp.h"
#include "stdio.h" // TODO: get rid of this...
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
@ -197,7 +195,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
//--------------------------------------------------------------------- //---------------------------------------------------------------------
RegexPattern *RegexPattern::compile( RegexPattern *RegexPattern::compile(
const UnicodeString &regex, const UnicodeString &regex,
int32_t flags, uint32_t flags,
UParseError &pe, UParseError &pe,
UErrorCode &status) { UErrorCode &status) {
@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
// flags // flags
// //
//--------------------------------------------------------------------- //---------------------------------------------------------------------
int32_t RegexPattern::flags() const { uint32_t RegexPattern::flags() const {
return fFlags; return fFlags;
} }
@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
//--------------------------------------------------------------------- //---------------------------------------------------------------------
// //
// split // split
// TODO: perl returns captured strings intermixed with the
// fields. Should we do this too?
// //
//--------------------------------------------------------------------- //---------------------------------------------------------------------
int32_t RegexPattern::split(const UnicodeString &input, int32_t RegexPattern::split(const UnicodeString &input,
@ -383,10 +379,28 @@ int32_t RegexPattern::split(const UnicodeString &input,
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart; int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
dest[i].setTo(input, nextOutputStringStart, fieldLen); dest[i].setTo(input, nextOutputStringStart, fieldLen);
nextOutputStringStart = fMatcher->fMatchEnd; nextOutputStringStart = fMatcher->fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
break;
}
i++;
dest[i] = fMatcher->group(groupNum, status);
}
if (nextOutputStringStart == inputLen) { if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done. // The delimiter was at the end of the string. We're done.
break; break;
} }
if (i==destCapacity-1) {
// We've filled up the last output string with capture group data.
// Give back the last string, to be used for the remainder of the input.
i--;
}
} }
else else
{ {
@ -410,88 +424,102 @@ int32_t RegexPattern::split(const UnicodeString &input,
//--------------------------------------------------------------------- //---------------------------------------------------------------------
static const char *opNames[] = {URX_OPCODE_NAMES}; static const char *opNames[] = {URX_OPCODE_NAMES};
void RegexPattern::dump() { void RegexPattern::dumpOp(int32_t index) const {
int32_t op = fCompiledPat->elementAti(index);
int32_t val = URX_VAL(op);
int32_t type = URX_TYPE(op);
int32_t pinnedType = type;
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
pinnedType = 0;
}
REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index, op, opNames[pinnedType]);
switch (type) {
case URX_NOP:
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_A:
case URX_BACKSLASH_G:
case URX_BACKSLASH_X:
case URX_END:
// Types with no operand field of interest.
break;
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_STATIC_SETREF:
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
case URX_STRING_LEN:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);
break;
case URX_ONECHAR:
REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
break;
case URX_STRING:
{
int32_t lengthOp = fCompiledPat->elementAti(index+1);
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
int32_t length = URX_VAL(lengthOp);
int32_t i;
for (i=val; i<val+length; i++) {
UChar c = fLiteralText[i];
if (c < 32 || c >= 256) {c = '.';}
REGEX_DUMP_DEBUG_PRINTF("%c", c);
}
}
break;
case URX_SETREF:
{
REGEX_DUMP_DEBUG_PRINTF("%d ", val);
UnicodeString s;
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
set->toPattern(s, TRUE);
for (int32_t i=0; i<s.length(); i++) {
REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
}
}
default:
REGEX_DUMP_DEBUG_PRINTF("??????");
break;
}
REGEX_DUMP_DEBUG_PRINTF("\n");
}
void RegexPattern::dump() const {
int index; int index;
int i; int i;
UChar c;
int32_t op;
int32_t pinnedType;
int32_t type;
int32_t val;
int32_t stringStart;
REGEX_DUMP_DEBUG_PRINTF("Original Pattern: ");
printf("Original Pattern: ");
for (i=0; i<fPattern.length(); i++) { for (i=0; i<fPattern.length(); i++) {
printf("%c", fPattern.charAt(i)); REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
} }
printf("\n"); REGEX_DUMP_DEBUG_PRINTF("\n");
printf("Pattern Valid?: %s\n", fBadState? "no" : "yes"); REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
printf("\nIndex Binary Type Operand\n" REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
"-------------------------------------------\n"); "-------------------------------------------\n");
for (index = 0; ; index++) { for (index = 0; index<fCompiledPat->size(); index++) {
op = fCompiledPat->elementAti(index); dumpOp(index);
val = URX_VAL(op);
type = URX_TYPE(op);
pinnedType = type;
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
pinnedType = 0;
}
printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
switch (type) {
case URX_NOP:
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_A:
case URX_BACKSLASH_G:
case URX_BACKSLASH_X:
// Types with no operand field of interest.
break;
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_SETREF:
case URX_STATIC_SETREF:
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
// types with an integer operand field.
printf("%d", val);
break;
case URX_ONECHAR:
printf("%c", val<256?val:'?');
break;
case URX_STRING:
stringStart = val;
break;
case URX_STRING_LEN:
for (i=stringStart; i<stringStart+val; i++) {
c = fLiteralText[i];
if (c >= 256) {c = '?';};
printf("%c", c);
}
break;
case URX_END:
goto breakFromLoop;
default:
printf("??????");
break;
}
printf("\n");
} }
breakFromLoop: REGEX_DUMP_DEBUG_PRINTF("\n\n");
printf("\n\n");
}; };
const char RegexPattern::fgClassID = 0; const char RegexPattern::fgClassID = 0;

View File

@ -81,6 +81,8 @@ enum {
* to be applied to input text, and a few convenience methods for simple common * to be applied to input text, and a few convenience methods for simple common
* uses of regular expressions. * uses of regular expressions.
* *
* <p>Class RegexPattern is not intended to be subclassed.</p>
*
* @draft ICU 2.4 * @draft ICU 2.4
*/ */
class U_I18N_API RegexPattern: public UObject { class U_I18N_API RegexPattern: public UObject {
@ -192,7 +194,7 @@ public:
* @draft ICU 2.4 * @draft ICU 2.4
*/ */
static RegexPattern *compile( const UnicodeString &regex, static RegexPattern *compile( const UnicodeString &regex,
int32_t flags, uint32_t flags,
UParseError &pe, UParseError &pe,
UErrorCode &status); UErrorCode &status);
@ -202,7 +204,7 @@ public:
* @return the match mode flags * @return the match mode flags
* @draft ICU 2.4 * @draft ICU 2.4
*/ */
virtual int32_t flags() const; virtual uint32_t flags() const;
/* /*
* Creates a RegexMatcher that will match the given input against this pattern. The * Creates a RegexMatcher that will match the given input against this pattern. The
@ -275,7 +277,7 @@ public:
// //
// dump Debug function, displays the compiled form of a pattern. // dump Debug function, displays the compiled form of a pattern.
// //
void dump(); void dump() const;
/** /**
* ICU "poor man's RTTI", returns a UClassID for the actual class. * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -291,14 +293,12 @@ public:
*/ */
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static const char fgClassID;
private: private:
// //
// Implementation Data // Implementation Data
// //
UnicodeString fPattern; // The original pattern string. UnicodeString fPattern; // The original pattern string.
int32_t fFlags; // The flags used when compiling the pattern. uint32_t fFlags; // The flags used when compiling the pattern.
// //
UVector *fCompiledPat; // The compiled pattern. UVector *fCompiledPat; // The compiled pattern.
UnicodeString fLiteralText; // Any literal string data from the pattern, UnicodeString fLiteralText; // Any literal string data from the pattern,
@ -317,6 +317,12 @@ private:
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word. // regex character classes, e.g. Word.
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
friend class RegexCompile; friend class RegexCompile;
friend class RegexMatcher; friend class RegexMatcher;
@ -325,6 +331,7 @@ private:
// //
void init(); // Common initialization, for use by constructors. void init(); // Common initialization, for use by constructors.
void zap(); // Common cleanup void zap(); // Common cleanup
void dumpOp(int32_t index) const;
@ -343,6 +350,8 @@ private:
* input text to which the expression can be applied. It includes methods * input text to which the expression can be applied. It includes methods
* for testing for matches, and for find and replace operations. * for testing for matches, and for find and replace operations.
* *
* <p>Class RegexMatcher is not intended to be subclassed.</p>
*
* @draft ICU 2.4 * @draft ICU 2.4
*/ */
class U_I18N_API RegexMatcher: public UObject { class U_I18N_API RegexMatcher: public UObject {
@ -355,6 +364,227 @@ public:
*/ */
virtual ~RegexMatcher(); virtual ~RegexMatcher();
/**
* Attempts to match the entire input string against the pattern.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @draft ICU 2.4
*/
virtual UBool matches(UErrorCode &status);
/**
* Attempts to match the input string, starting from the beginning, against the pattern.
* Like the matches() method, this function always starts at the beginning of the input string;
* unlike that function, it does not require that the entire input string be matched.
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match at the start of the input string.
* @draft ICU 2.4
*/
virtual UBool lookingAt(UErrorCode &status);
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
* the previous match, or at the start of the string if there is no previous match.
* If a match is found, <code>start(), end()</code> and <code>group()</code>
* will provide more information regarding the match.
* <p>Note that if the input string is changed by the application,
* use find(startPos, status) instead of find(), because the saved starting
* position may not be valid with the altered input string.</p>
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find();
/**
* Resets this RegexMatcher and then attempts to find the next substring of the
* input string that matches the pattern, starting at the specified index.
*
* @param start the position in the input string to begin the search
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find(int32_t start, UErrorCode &status);
/**
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return a string containing the matched input text.
* @draft ICU 2.4
*/
virtual UnicodeString group(UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
*
* @param groupNum the capture group number
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
* @return the captured text
* @draft ICU 2.4
*/
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @draft ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns the index in the input string of the start of the text matched
* during the previous match operation.
* @param status a reference to a UErrorCode to receive any errors.
* @return The position in the input string of the start of the last match.
* @draft ICU 2.4
*/
virtual int32_t start(UErrorCode &status) const;
/**
* Returns the index in the input string of the start of the text matched by the
* specified capture group during the previous match operation. Return -1 if
* the capture group exists in the pattern, but was not part of the last match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the start position of substring matched by the specified group.
* @draft ICU 2.4
*/
virtual int32_t start(int group, UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched during the previous match operation.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* @draft ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched by the specified capture group during the previous match operation.
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the index of the last character, plus one, of the text
* captured by the specified group during the previous match operation.
* Return -1 if the capture group was not part of the match.
* @draft ICU 2.4
*/
virtual int32_t end(int group, UErrorCode &status) const;
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of
* the input string.
*
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset();
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset(const UnicodeString &input);
/**
* Returns the input string being matched. The returned string is not a copy,
* but the live input string. It should not be altered or deleted.
* @return the input string
* @draft ICU 2.4
*/
virtual const UnicodeString &input() const;
/**
* Returns the pattern that is interpreted by this matcher.
* @return the RegexPattern for this RegexMatcher
* @draft ICU 2.4
*/
virtual const RegexPattern &pattern() const;
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input string
* looking for matches of the pattern. Input that is not part of any
* match is left unchanged; each match is replaced in the result by the
* replacement string. The replacement string may contain references to
* capture groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* <p>This function first resets this RegexMatcher. It then scans the input string
* looking for a match of the pattern. Input that is not part
* of the match is appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured groups.</p>
*
* <p>The state of the matcher (the position at which a subsequent find()
* would begin) after completing a replaceFirst() is not specified. The
* RegexMatcher should be reset before doing additional find() operations.</p>
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
/** /**
* Implements a replace operation intended to be used as part of an * Implements a replace operation intended to be used as part of an
* incremental find-and-replace. * incremental find-and-replace.
@ -398,219 +628,6 @@ public:
*/ */
virtual UnicodeString &appendTail(UnicodeString &dest); virtual UnicodeString &appendTail(UnicodeString &dest);
/**
* Returns the index in the input string of the character following the
* text matched during the previous match operation.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* @draft ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched by the specified capture group during the previous match operation.
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the index of the last character, plus one, of the text
* captured by the specified group during the previous match operation.
* Return -1 if the capture group was not part of the match.
* @draft ICU 2.4
*/
virtual int32_t end(int group, UErrorCode &status) const;
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
* the previous match, or at the start of the string if there is no previous match.
* If a match is found, <code>start(), end()</code> and <code>group()</code>
* will provide more information regarding the match.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find();
/**
* Resets this RegexMatcher and then attempts to find the next substring of the
* input string that matches the pattern, starting at the specified index.
*
* @param start the position in the input string to begin the search
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find(int32_t start, UErrorCode &status);
/**
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return a string containing the matched input text.
* @draft ICU 2.4
*/
virtual UnicodeString group(UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
* @return the captured text
* @draft ICU 2.4
*/
virtual UnicodeString group(int32_t group, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @draft ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns the input string being matched. The returned string is not a copy,
* but the live input string. It should not be altered or deleted.
* @return the input string
* @draft ICU 2.4
*/
virtual const UnicodeString &input() const;
/**
* Attempts to match the input string, starting from the beginning, against the pattern.
* Like the matches() method, this function always starts at the beginning of the input string;
* unlike that function, it does not require that the entire input string be matched.
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match at the start of the input string.
* @draft ICU 2.4
*/
virtual UBool lookingAt(UErrorCode &status);
/**
* Attempts to match the entire input string against the pattern.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @draft ICU 2.4
*/
virtual UBool matches(UErrorCode &status);
/**
* Returns the pattern that is interpreted by this matcher.
* @return the RegexPattern for this RegexMatcher
* @draft ICU 2.4
*/
virtual const RegexPattern &pattern() const;
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input string
* looking for matches of the pattern. Input that is not part of any
* match is left unchanged; each match is replaced in the result by the
* replacement string. The replacement string may contain references to
* capture groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* This function first resets this RegexMatcher. It then scans the input string
* looking for a match of the pattern. Input that is not part
* of the match is appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of
* the input string.
*
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset();
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* @param input the new input string for this matcher.
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset(const UnicodeString &input);
/**
* Returns the index in the input string of the start of the text matched
* during the previous match operation.
* @param status a reference to a UErrorCode to receive any errors.
* @return The position in the input string of the start of the last match.
* @draft ICU 2.4
*/
virtual int32_t start(UErrorCode &status) const;
/**
* Returns the index in the input string of the start of the text matched by the
* specified capture group during the previous match operation. Return -1 if
* the capture group exists in the pattern, but was not part of the last match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the start position of substring matched by the specified group.
* @draft ICU 2.4
*/
virtual int32_t start(int group, UErrorCode &status) const;
/** /**
* ICU "poor man's RTTI", returns a UClassID for the actual class. * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -626,8 +643,6 @@ public:
*/ */
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static const char fgClassID;
private: private:
// Constructors and other object boilerplate are private. // Constructors and other object boilerplate are private.
// Instances of RegexMatcher can not be assigned, copied, cloned, etc. // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
@ -658,6 +673,13 @@ private:
UVector *fCaptureStarts; UVector *fCaptureStarts;
UVector *fCaptureEnds; UVector *fCaptureEnds;
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
}; };
U_NAMESPACE_END U_NAMESPACE_END

View File

@ -368,7 +368,7 @@ void RegexTest::Basic() {
// //
#if 0 #if 0
{ {
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D"); REGEX_FIND("[{ab}]", "a");
} }
exit(1); exit(1);
#endif #endif
@ -436,6 +436,9 @@ void RegexTest::Basic() {
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
// Set contains only a string, no individual chars.
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
// //
// OR operator in patterns // OR operator in patterns
// //
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {
delete pat1; delete pat1;
// split, with a pattern with (capture)
pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==4);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="the time<c>");
delete pat1;
pat1 = RegexPattern::compile("([-,])", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("1-10,20", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==5);
REGEX_ASSERT(fields[0]=="1");
REGEX_ASSERT(fields[1]=="-");
REGEX_ASSERT(fields[2]=="10");
REGEX_ASSERT(fields[3]==",");
REGEX_ASSERT(fields[4]=="20");
delete pat1;
} }