ICU-105 Regular Expressions, changes from code review

X-SVN-Rev: 10294
This commit is contained in:
Andy Heninger 2002-11-19 19:31:03 +00:00
parent bf1f6b1213
commit 24bf088281
9 changed files with 556 additions and 353 deletions

View File

@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_PROPERTY_SYNTAX",
"U_REGEX_UNIMPLEMENTED",
"U_REGEX_MISMATCHED_PAREN",
"U_REGEX_MATCH_MODE_ERROR"
};
U_CAPI const char * U_EXPORT2

View File

@ -500,18 +500,17 @@ typedef enum UErrorCode {
/*
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
*/
U_REGEX_ERROR_START=0x10300,
U_REGEX_INTERNAL_ERROR,
U_REGEX_RULE_SYNTAX,
U_REGEX_INVALID_STATE,
U_REGEX_BAD_ESCAPE_SEQUENCE,
U_REGEX_PROPERTY_SYNTAX,
U_REGEX_UNIMPLEMENTED,
U_REGEX_MISMATCHED_PAREN,
U_REGEX_MATCH_MODE_ERROR,
U_REGEX_ERROR_LIMIT,
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
} UErrorCode;
/* Use the following to determine if an UErrorCode represents */

View File

@ -28,8 +28,6 @@
#include "ucln_in.h"
#include "mutex.h"
#include "stdio.h" // TODO: Get rid of this
#include "regeximp.h"
#include "regexcst.h" // Contains state table for the regex pattern parser.
// generated by a Perl script.
@ -40,7 +38,6 @@
U_NAMESPACE_BEGIN
const char RegexCompile::fgClassID=0;
static const int RESCAN_DEBUG = 0;
//----------------------------------------------------------------------------------------
//
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
//
// Set up the constant (static) Unicode Sets.
// TODO: something cleaner for that -128 constant.
//
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
@ -282,14 +280,12 @@ void RegexCompile::compile(
// the search will stop there, if not before.
//
tableEl = &gRuleParseStateTable[state];
if (RESCAN_DEBUG) {
printf( "char, line, col = (\'%c\', %d, %d) state=%s ",
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
}
REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ",
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
for (;;) { // loop through table rows belonging to this state, looking for one
// that matches the current input char.
if (RESCAN_DEBUG) { printf( ".");}
REGEX_SCAN_DEBUG_PRINTF( ".");
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
// Table row specified an individual character, not a set, and
// the input character is not quoted, and
@ -323,7 +319,7 @@ void RegexCompile::compile(
// No match on this row, advance to the next row for this state,
tableEl++;
}
if (RESCAN_DEBUG) { printf( "\n");}
REGEX_SCAN_DEBUG_PRINTF("\n");
//
// We've found the row of the state table that matches the current input
@ -340,7 +336,7 @@ void RegexCompile::compile(
fStackPtr++;
if (fStackPtr >= kStackSize) {
error(U_REGEX_INTERNAL_ERROR);
// printf( "RegexCompile::parse() - state stack overflow.\n");
REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
fStackPtr--;
}
fStack[fStackPtr] = tableEl->fPushState;
@ -369,6 +365,36 @@ void RegexCompile::compile(
}
//
// The pattern has now been read and processed, and the compiled code generated.
//
//
// Compute the number of digits required for the largest capture group number.
//
fRXPat->fMaxCaptureDigits = 1;
int32_t n = 10;
for (;;) {
if (n > fRXPat->fNumCaptureGroups) {
break;
}
fRXPat->fMaxCaptureDigits++;
n *= 10;
}
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
// even though, without the #define set, it will do nothing.
//
#ifndef REGEX_DUMP_DEBUG
static UBool phonyDumpDone = FALSE;
if (phonyDumpDone==FALSE) {
fRXPat->dump();
phonyDumpDone = TRUE;
}
#endif
}
@ -1094,27 +1120,39 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
if (theSet == NULL) {
return;
}
if (theSet->size() > 1) {
// The set contains two or more chars.
// Put it into the compiled pattern as a set.
int32_t setNumber = fRXPat->fSets->size();
fRXPat->fSets->addElement(theSet, *fStatus);
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
int32_t setSize = theSet->size();
UChar32 firstSetChar = theSet->charAt(0);
if (firstSetChar == -1) {
// Sets that contain only strings, but no individual chars,
// will end up here. TODO: figure out what to do with sets containing strings.
setSize = 0;
}
else
{
// The set contains only a single code point. Put it into
// the compiled pattern as a single char operation rather
// than a set, and discard the set itself.
UChar32 c = theSet->charAt(0);
if (c == -1) {
// Set contained no chars. Stuff an invalid char that can't match.
c = 0x1fffff;
switch (setSize) {
case 0: // Set of no elements. Always fails to match.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
break;
case 1:
{
// The set contains only a single code point. Put it into
// the compiled pattern as a single char operation rather
// than a set, and discard the set itself.
int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
delete theSet;
}
break;
default:
{
// The set contains two or more chars. (the normal case)
// Put it into the compiled pattern as a set.
int32_t setNumber = fRXPat->fSets->size();
fRXPat->fSets->addElement(theSet, *fStatus);
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
}
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
delete theSet;
}
}
@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time.
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
error(localStatus);
delete uset;
return NULL;

View File

@ -28,8 +28,6 @@
U_NAMESPACE_BEGIN
static const UBool REGEX_DEBUG = TRUE;
//--------------------------------------------------------------------------------
//
// class RegexCompile Contains the regular expression compiler.

View File

@ -13,13 +13,45 @@
#define _REGEXIMP_H
//
// debugging support. Enable one or more of the #defines immediately following
//
//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
// End of #defines intended to be directly set.
//
//  Each REGEX_xxx_DEBUG_PRINTF macro expands to printf when the corresponding
//  debug option is enabled, and to nothing otherwise.
//
#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF printf
#else
#define REGEX_SCAN_DEBUG_PRINTF
#endif

//  Run-time tracing (REGEX_RUN_DEBUG) displays pattern operations through the
//  dump functions, so it turns on the dump output as well.  The macro is
//  defined exactly once here; defining it empty and then redefining it as
//  printf would be an incompatible macro redefinition.
#if defined(REGEX_DUMP_DEBUG) || defined(REGEX_RUN_DEBUG)
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_DUMP_DEBUG_PRINTF
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF printf
#else
#define REGEX_RUN_DEBUG_PRINTF
#endif
#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
#include <stdio.h>
#endif
//
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
// of the entries.
//
enum {
URX_RESERVED_OP = 0,
URX_UNUSED1 = 1,
URX_BACKTRACK = 1,
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
URX_STRING = 4, // Value field is index of string start
@ -52,7 +84,7 @@ enum {
// Used for debug printing only.
#define URX_OPCODE_NAMES \
"URX_RESERVED_OP", \
"URX_UNUSED1", \
"URX_BACKTRACK", \
"END", \
"ONECHAR", \
"STRING", \

View File

@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const {
int32_t s = start(group, status);
int32_t e = end(group, status);
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
int32_t s = start(groupNum, status);
int32_t e = end(groupNum, status);
// Note: calling start() and end() above will do all necessary checking that
// the group number is OK and that a match exists. status will be set.
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int32_t opType; // the opcode
int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
    {
        // Trace header: show the start position, the original pattern and the
        // input string, followed by column titles for the lines that follow.
        printf("MatchAt(startIdx=%d)\n", startIdx);
        printf("Original Pattern: ");
        int i;
        for (i=0; i<fPattern->fPattern.length(); i++) {
            printf("%c", fPattern->fPattern.charAt(i));
        }
        printf("\n");
        printf("Input String: ");
        for (i=0; i<fInput->length(); i++) {
            UChar c = fInput->charAt(i);
            // Replace anything that does not fit in a single printable byte.
            // The upper bound must be >= 256: printf's %c converts its
            // argument to unsigned char, so 256 would be emitted as a NUL.
            if (c<32 || c>=256) {
                c = '.';
            }
            printf("%c", c);
        }
        printf("\n");
        printf("\n");
        printf("PatLoc inputIdx char\n");
    }
#endif
if (U_FAILURE(status)) {
return;
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
op = pat->elementAti(patIdx);
opType = URX_TYPE(op);
opValue = URX_VAL(op);
// printf("%d %d \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx));
#ifdef REGEX_RUN_DEBUG
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
fPattern->dumpOp(patIdx);
#endif
patIdx++;
switch (opType) {
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
case URX_BACKTRACK:
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
backTrack(inputIdx, patIdx);
break;
case URX_ONECHAR:
{
UChar32 inputChar = fInput->char32At(inputIdx);
@ -909,7 +942,12 @@ breakFromLoop:
fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx;
fMatchEnd = inputIdx;
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
}
else
{
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
}
return;
}

View File

@ -18,8 +18,6 @@
#include "regexcmp.h"
#include "regeximp.h"
#include "stdio.h" // TODO: get rid of this...
U_NAMESPACE_BEGIN
//--------------------------------------------------------------------------
@ -197,7 +195,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
//---------------------------------------------------------------------
RegexPattern *RegexPattern::compile(
const UnicodeString &regex,
int32_t flags,
uint32_t flags,
UParseError &pe,
UErrorCode &status) {
@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
// flags
//
//---------------------------------------------------------------------
int32_t RegexPattern::flags() const {
uint32_t RegexPattern::flags() const {
return fFlags;
}
@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
//---------------------------------------------------------------------
//
// split
// TODO: perl returns captured strings intermixed with the
// fields. Should we do this too?
//
//---------------------------------------------------------------------
int32_t RegexPattern::split(const UnicodeString &input,
@ -383,10 +379,28 @@ int32_t RegexPattern::split(const UnicodeString &input,
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
dest[i].setTo(input, nextOutputStringStart, fieldLen);
nextOutputStringStart = fMatcher->fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
break;
}
i++;
dest[i] = fMatcher->group(groupNum, status);
}
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done.
break;
}
if (i==destCapacity-1) {
// We've filled up the last output string with capture group data.
// Give back the last string, to be used for the remainder of the input.
i--;
}
}
else
{
@ -410,88 +424,102 @@ int32_t RegexPattern::split(const UnicodeString &input,
//---------------------------------------------------------------------
static const char *opNames[] = {URX_OPCODE_NAMES};
void RegexPattern::dump() {
//---------------------------------------------------------------------
//
//   dumpOp    Debug function.  Display one operation of the compiled
//             pattern: its index, raw value, opcode name and operand.
//
//             index   position of the operation within fCompiledPat.
//
//             All output goes through REGEX_DUMP_DEBUG_PRINTF.
//
//---------------------------------------------------------------------
void RegexPattern::dumpOp(int32_t index) const {
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);

    // Opcode types beyond the end of the opNames[] table are displayed
    // using the name in slot 0 rather than indexing out of bounds.
    int32_t pinnedType  = type;
    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index, op, opNames[pinnedType]);

    // NOTE(review): URX_BACKTRACK has no case below, so it is reported
    //               through the default branch - consider adding it to the
    //               "no operand" group.
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_FAIL:
    case URX_BACKSLASH_A:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
        // Types with no operand field of interest.
        break;

    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATIC_SETREF:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_W:
    case URX_BACKSLASH_Z:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_STRING_LEN:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;

    case URX_ONECHAR:
        // Only values below 256 are shown as themselves.
        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;

    case URX_STRING:
        {
            // The operand is the start index of the string within fLiteralText;
            // the length comes from the URX_STRING_LEN operation that follows.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF("%c", c);
            }
        }
        break;

    case URX_SETREF:
        {
            // The operand is an index into fSets; show it, then the set's pattern.
            REGEX_DUMP_DEBUG_PRINTF("%d ", val);
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
            }
        }
        // Without this break, control fell through to the default case and
        // "??????" was appended after every set.
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF("??????");
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
}
void RegexPattern::dump() const {
int index;
int i;
UChar c;
int32_t op;
int32_t pinnedType;
int32_t type;
int32_t val;
int32_t stringStart;
printf("Original Pattern: ");
REGEX_DUMP_DEBUG_PRINTF("Original Pattern: ");
for (i=0; i<fPattern.length(); i++) {
printf("%c", fPattern.charAt(i));
REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
}
printf("\n");
printf("Pattern Valid?: %s\n", fBadState? "no" : "yes");
printf("\nIndex Binary Type Operand\n"
REGEX_DUMP_DEBUG_PRINTF("\n");
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
"-------------------------------------------\n");
for (index = 0; ; index++) {
op = fCompiledPat->elementAti(index);
val = URX_VAL(op);
type = URX_TYPE(op);
pinnedType = type;
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
pinnedType = 0;
}
printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
switch (type) {
case URX_NOP:
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_A:
case URX_BACKSLASH_G:
case URX_BACKSLASH_X:
// Types with no operand field of interest.
break;
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_SETREF:
case URX_STATIC_SETREF:
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
// types with an integer operand field.
printf("%d", val);
break;
case URX_ONECHAR:
printf("%c", val<256?val:'?');
break;
case URX_STRING:
stringStart = val;
break;
case URX_STRING_LEN:
for (i=stringStart; i<stringStart+val; i++) {
c = fLiteralText[i];
if (c >= 256) {c = '?';};
printf("%c", c);
}
break;
case URX_END:
goto breakFromLoop;
default:
printf("??????");
break;
}
printf("\n");
for (index = 0; index<fCompiledPat->size(); index++) {
dumpOp(index);
}
breakFromLoop:
printf("\n\n");
REGEX_DUMP_DEBUG_PRINTF("\n\n");
};
const char RegexPattern::fgClassID = 0;

View File

@ -81,6 +81,8 @@ enum {
* to be applied to input text, and a few convenience methods for simple common
* uses of regular expressions.
*
* <p>Class RegexPattern is not intended to be subclassed.</p>
*
* @draft ICU 2.4
*/
class U_I18N_API RegexPattern: public UObject {
@ -192,7 +194,7 @@ public:
* @draft ICU 2.4
*/
static RegexPattern *compile( const UnicodeString &regex,
int32_t flags,
uint32_t flags,
UParseError &pe,
UErrorCode &status);
@ -202,7 +204,7 @@ public:
* @return the match mode flags
* @draft ICU 2.4
*/
virtual int32_t flags() const;
virtual uint32_t flags() const;
/*
* Creates a RegexMatcher that will match the given input against this pattern. The
@ -275,7 +277,7 @@ public:
//
// dump Debug function, displays the compiled form of a pattern.
//
void dump();
void dump() const;
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -291,14 +293,12 @@ public:
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static const char fgClassID;
private:
//
// Implementation Data
//
UnicodeString fPattern; // The original pattern string.
int32_t fFlags; // The flags used when compiling the pattern.
uint32_t fFlags; // The flags used when compiling the pattern.
//
UVector *fCompiledPat; // The compiled pattern.
UnicodeString fLiteralText; // Any literal string data from the pattern,
@ -317,6 +317,12 @@ private:
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
friend class RegexCompile;
friend class RegexMatcher;
@ -325,6 +331,7 @@ private:
//
void init(); // Common initialization, for use by constructors.
void zap(); // Common cleanup
void dumpOp(int32_t index) const;
@ -343,6 +350,8 @@ private:
* input text to which the expression can be applied. It includes methods
* for testing for matches, and for find and replace operations.
*
* <p>Class RegexMatcher is not intended to be subclassed.</p>
*
* @draft ICU 2.4
*/
class U_I18N_API RegexMatcher: public UObject {
@ -355,6 +364,227 @@ public:
*/
virtual ~RegexMatcher();
/**
* Attempts to match the entire input string against the pattern.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @draft ICU 2.4
*/
virtual UBool matches(UErrorCode &status);
/**
* Attempts to match the input string, starting from the beginning, against the pattern.
* Like the matches() method, this function always starts at the beginning of the input string;
* unlike that function, it does not require that the entire input string be matched.
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match at the start of the input string.
* @draft ICU 2.4
*/
virtual UBool lookingAt(UErrorCode &status);
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
* the previous match, or at the start of the string if there is no previous match.
* If a match is found, <code>start(), end()</code> and <code>group()</code>
* will provide more information regarding the match.
* <p>Note that if the input string is changed by the application,
* use find(startPos, status) instead of find(), because the saved starting
* position may not be valid with the altered input string.</p>
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find();
/**
* Resets this RegexMatcher and then attempts to find the next substring of the
* input string that matches the pattern, starting at the specified index.
*
* @param start the position in the input string to begin the search
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find(int32_t start, UErrorCode &status);
/**
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return a string containing the matched input text.
* @draft ICU 2.4
*/
virtual UnicodeString group(UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
*
* @param groupNum the capture group number
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
* @return the captured text
* @draft ICU 2.4
*/
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @draft ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns the index in the input string of the start of the text matched
* during the previous match operation.
* @param status a reference to a UErrorCode to receive any errors.
* @return The position in the input string of the start of the last match.
* @draft ICU 2.4
*/
virtual int32_t start(UErrorCode &status) const;
/**
* Returns the index in the input string of the start of the text matched by the
* specified capture group during the previous match operation. Return -1 if
* the capture group exists in the pattern, but was not part of the last match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the start position of substring matched by the specified group.
* @draft ICU 2.4
*/
virtual int32_t start(int group, UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched during the previous match operation.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* @draft ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched by the specified capture group during the previous match operation.
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the index of the last character, plus one, of the text
* captured by the specified group during the previous match operation.
* Return -1 if the capture group was not part of the match.
* @draft ICU 2.4
*/
virtual int32_t end(int group, UErrorCode &status) const;
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of
* the input string.
*
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset();
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* @param input The new string on which subsequent match operations will operate.
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset(const UnicodeString &input);
/**
* Returns the input string being matched. The returned string is not a copy,
* but the live input string. It should not be altered or deleted.
* @return the input string
* @draft ICU 2.4
*/
virtual const UnicodeString &input() const;
/**
* Returns the pattern that is interpreted by this matcher.
* @return the RegexPattern for this RegexMatcher
* @draft ICU 2.4
*/
virtual const RegexPattern &pattern() const;
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input string
* looking for matches of the pattern. Input that is not part of any
* match is left unchanged; each match is replaced in the result by the
* replacement string. The replacement string may contain references to
* capture groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* <p>This function first resets this RegexMatcher. It then scans the input string
* looking for a match of the pattern. Input that is not part
* of the match is appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured groups.</p>
*
* <p>The state of the matcher (the position at which a subsequent find()
* would begin) after completing a replaceFirst() is not specified. The
* RegexMatcher should be reset before doing additional find() operations.</p>
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
/**
* Implements a replace operation intended to be used as part of an
* incremental find-and-replace.
@ -398,219 +628,6 @@ public:
*/
virtual UnicodeString &appendTail(UnicodeString &dest);
/**
* Returns the index in the input string of the character following the
* text matched during the previous match operation.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* @draft ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
* text matched by the specified capture group during the previous match operation.
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the index of the last character, plus one, of the text
* captured by the specifed group during the previous match operation.
* Return -1 if the capture group was not part of the match.
* @draft ICU 2.4
*/
virtual int32_t end(int group, UErrorCode &status) const;
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
* the previous match, or at the start of the string if there is no previous match.
* If a match is found, <code>start(), end()</code> and <code>group()</code>
* will provide more information regarding the match.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find();
/**
* Resets this RegexMatcher and then attempts to find the next substring of the
* input string that matches the pattern, starting at the specified index.
*
* @param status the position in the input string to begin the search
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @draft ICU 2.4
*/
virtual UBool find(int32_t start, UErrorCode &status);
/*
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return a string containing the matched input text.
* @draft ICU 2.4
*/
virtual UnicodeString group(UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
* @return the captured text
* @draft ICU 2.4
*/
virtual UnicodeString group(int32_t group, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @draft ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns the input string being matched. The returned string is not a copy,
* but the live input string. It should not be altered or deleted.
* @return the input string
* @draft ICU 2.4
*/
virtual const UnicodeString &input() const;
/**
* Attempts to match the input string, starting from the beginning, against the pattern.
* Like the matches() method, this function always starts at the beginning of the input string;
* unlike that function, it does not require that the entire input string be matched.
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match at the start of the input string.
* @draft ICU 2.4
*/
virtual UBool lookingAt(UErrorCode &status);
/**
* Attempts to match the entire input string against the pattern.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @draft ICU 2.4
*/
virtual UBool matches(UErrorCode &status);
/**
* Returns the pattern that is interpreted by this matcher.
* @return the RegexPattern for this RegexMatcher
* @draft ICU 2.4
*/
virtual const RegexPattern &pattern() const;
/**
* Replaces every substring of the input that matches the pattern
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input string
* looking for matches of the pattern. Input that is not part of any
* match is left unchanged; each match is replaced in the result by the
* replacement string. The replacement string may contain references to
* capture groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
/**
* Replaces the first substring of the input that matches
* the pattern with the replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* This function first resets this RegexMatcher. It then scans the input string
* looking for a match of the pattern. Input that is not part
* of the match is appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured groups.
*
* @param replacement a string containing the replacement text.
* @param status a reference to a UErrorCode to receive any errors.
* @return a string containing the results of the find and replace.
* @draft ICU 2.4
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of
* the input string.
*
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset();
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* @return this RegexMatcher.
* @draft ICU 2.4
*/
virtual RegexMatcher &reset(const UnicodeString &input);
/**
* Returns the index in the input string of the start of the text matched
* during the previous match operation.
* @param status a reference to a UErrorCode to receive any errors.
* @return The position in the input string of the start of the last match.
* @draft ICU 2.4
*/
virtual int32_t start(UErrorCode &status) const;
/**
* Returns the index in the input string of the start of the text matched by the
* specified capture group during the previous match operation. Return -1 if
* the capture group exists in the pattern, but was not part of the last match.
*
* @param group the capture group number
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the start position of substring matched by the specified group.
* @draft ICU 2.4
*/
virtual int32_t start(int group, UErrorCode &status) const;
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -626,8 +643,6 @@ public:
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static const char fgClassID;
private:
// Constructors and other object boilerplate are private.
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
@ -658,6 +673,13 @@ private:
UVector *fCaptureStarts;
UVector *fCaptureEnds;
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
U_NAMESPACE_END

View File

@ -368,7 +368,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
REGEX_FIND("[{ab}]", "a");
}
exit(1);
#endif
@ -436,6 +436,9 @@ void RegexTest::Basic() {
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
// Set contains only a string, no individual chars.
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
//
// OR operator in patterns
//
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {
delete pat1;
// split, with a pattern with (capture)
pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]=="");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==6);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="b");
REGEX_ASSERT(fields[4]=="the time");
REGEX_ASSERT(fields[5]=="c");
REGEX_ASSERT(fields[6]=="");
n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==4);
REGEX_ASSERT(fields[0]==" ");
REGEX_ASSERT(fields[1]=="a");
REGEX_ASSERT(fields[2]=="Now is ");
REGEX_ASSERT(fields[3]=="the time<c>");
delete pat1;
pat1 = RegexPattern::compile("([-,])", pe, status);
REGEX_CHECK_STATUS;
n = pat1->split("1-10,20", fields, 10, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(n==5);
REGEX_ASSERT(fields[0]=="1");
REGEX_ASSERT(fields[1]=="-");
REGEX_ASSERT(fields[2]=="10");
REGEX_ASSERT(fields[3]==",");
REGEX_ASSERT(fields[4]=="20");
delete pat1;
}