From 24bf088281b1cec1753e936ad50ed6106acbb3e2 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 19 Nov 2002 19:31:03 +0000 Subject: [PATCH] ICU-105 Regular Expressions, changes from code review X-SVN-Rev: 10294 --- icu4c/source/common/putil.c | 1 - icu4c/source/common/unicode/utypes.h | 21 +- icu4c/source/i18n/regexcmp.cpp | 98 +++-- icu4c/source/i18n/regexcmp.h | 2 - icu4c/source/i18n/regeximp.h | 36 +- icu4c/source/i18n/rematch.cpp | 46 ++- icu4c/source/i18n/repattrn.cpp | 190 +++++----- icu4c/source/i18n/unicode/regex.h | 464 +++++++++++++----------- icu4c/source/test/intltest/regextst.cpp | 51 ++- 9 files changed, 556 insertions(+), 353 deletions(-) diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index 17bc5b4e93..7047a04cce 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = { "U_REGEX_PROPERTY_SYNTAX", "U_REGEX_UNIMPLEMENTED", "U_REGEX_MISMATCHED_PAREN", - "U_REGEX_MATCH_MODE_ERROR" }; U_CAPI const char * U_EXPORT2 diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index ffb3bc334a..40f92d0d72 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -500,18 +500,17 @@ typedef enum UErrorCode { /* * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errrs */ - U_REGEX_ERROR_START=0x10300, - U_REGEX_INTERNAL_ERROR, - U_REGEX_RULE_SYNTAX, - U_REGEX_INVALID_STATE, - U_REGEX_BAD_ESCAPE_SEQUENCE, - U_REGEX_PROPERTY_SYNTAX, - U_REGEX_UNIMPLEMENTED, - U_REGEX_MISMATCHED_PAREN, - U_REGEX_MATCH_MODE_ERROR, - U_REGEX_ERROR_LIMIT, + U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */ + U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */ + U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. 
*/ + U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */ + U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */ + U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */ + U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */ + U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */ + U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ - U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ + U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ } UErrorCode; /* Use the following to determine if an UErrorCode represents */ diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 895e4d8cef..a38fafe801 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -28,8 +28,6 @@ #include "ucln_in.h" #include "mutex.h" -#include "stdio.h" // TODO: Get rid of this - #include "regeximp.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. @@ -40,7 +38,6 @@ U_NAMESPACE_BEGIN const char RegexCompile::fgClassID=0; -static const int RESCAN_DEBUG = 0; //---------------------------------------------------------------------------------------- // @@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) // // Set up the constant (static) Unicode Sets. + // TODO: something cleaner for that -128 constant. // ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status); ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status); @@ -282,14 +280,12 @@ void RegexCompile::compile( // the search will stop there, if not before. 
// tableEl = &gRuleParseStateTable[state]; - if (RESCAN_DEBUG) { - printf( "char, line, col = (\'%c\', %d, %d) state=%s ", - fC.fChar, fLineNum, fCharNum, RegexStateNames[state]); - } + REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ", + fC.fChar, fLineNum, fCharNum, RegexStateNames[state]); for (;;) { // loop through table rows belonging to this state, looking for one // that matches the current input char. - if (RESCAN_DEBUG) { printf( ".");} + REGEX_SCAN_DEBUG_PRINTF( "."); if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { // Table row specified an individual character, not a set, and // the input character is not quoted, and @@ -323,7 +319,7 @@ void RegexCompile::compile( // No match on this row, advance to the next row for this state, tableEl++; } - if (RESCAN_DEBUG) { printf( "\n");} + REGEX_SCAN_DEBUG_PRINTF("\n"); // // We've found the row of the state table that matches the current input @@ -340,7 +336,7 @@ void RegexCompile::compile( fStackPtr++; if (fStackPtr >= kStackSize) { error(U_REGEX_INTERNAL_ERROR); - // printf( "RegexCompile::parse() - state stack overflow.\n"); + REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n"); fStackPtr--; } fStack[fStackPtr] = tableEl->fPushState; @@ -369,6 +365,36 @@ void RegexCompile::compile( } + // + // The pattern has now been read and processed, and the compiled code generated. + // + + // + // Compute the number of digits requried for the largest capture group number. + // + fRXPat->fMaxCaptureDigits = 1; + int32_t n = 10; + for (;;) { + if (n > fRXPat->fNumCaptureGroups) { + break; + } + fRXPat->fMaxCaptureDigits++; + n *= 10; + } + + // + // A stupid bit of non-sense to prevent code coverage testing from complaining + // about the pattern.dump() debug function. Go through the motions of dumping, + // even though, without the #define set, it will do nothing. 
+ // +#ifndef REGEX_DUMP_DEBUG + static UBool phonyDumpDone = FALSE; + if (phonyDumpDone==FALSE) { + fRXPat->dump(); + phonyDumpDone = TRUE; + } +#endif + } @@ -1094,27 +1120,39 @@ void RegexCompile::compileSet(UnicodeSet *theSet) if (theSet == NULL) { return; } - if (theSet->size() > 1) { - // The set contains two or more chars. - // Put it into the compiled pattern as a set. - int32_t setNumber = fRXPat->fSets->size(); - fRXPat->fSets->addElement(theSet, *fStatus); - int32_t setOp = URX_BUILD(URX_SETREF, setNumber); - fRXPat->fCompiledPat->addElement(setOp, *fStatus); + int32_t setSize = theSet->size(); + UChar32 firstSetChar = theSet->charAt(0); + if (firstSetChar == -1) { + // Sets that contain only strings, but no individual chars, + // will end up here. TODO: figure out what to with sets containing strings. + setSize = 0; } - else - { - // The set contains only a single code point. Put it into - // the compiled pattern as a single char operation rather - // than a set, and discard the set itself. - UChar32 c = theSet->charAt(0); - if (c == -1) { - // Set contained no chars. Stuff an invalid char that can't match. - c = 0x1fffff; + + switch (setSize) { + case 0: // Set of no elements. Always fails to match. + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); + break; + + case 1: + { + // The set contains only a single code point. Put it into + // the compiled pattern as a single char operation rather + // than a set, and discard the set itself. + int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar); + fRXPat->fCompiledPat->addElement(charToken, *fStatus); + delete theSet; + } + break; + + default: + { + // The set contains two or more chars. (the normal case) + // Put it into the compiled pattern as a set. 
+ int32_t setNumber = fRXPat->fSets->size(); + fRXPat->fSets->addElement(theSet, *fStatus); + int32_t setOp = URX_BUILD(URX_SETREF, setNumber); + fRXPat->fCompiledPat->addElement(setOp, *fStatus); } - int32_t charToken = URX_BUILD(URX_ONECHAR, c); - fRXPat->fCompiledPat->addElement(charToken, *fStatus); - delete theSet; } } @@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() { if (U_FAILURE(localStatus)) { // TODO: Get more accurate position of the error from UnicodeSet's return info. // UnicodeSet appears to not be reporting correctly at this time. - printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); + REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); error(localStatus); delete uset; return NULL; diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 7f11cb114a..d809b78ea6 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -28,8 +28,6 @@ U_NAMESPACE_BEGIN -static const UBool REGEX_DEBUG = TRUE; - //-------------------------------------------------------------------------------- // // class RegexCompile Contains the regular expression compiler. diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 3d28b1849f..b176ed54c0 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -13,13 +13,45 @@ #define _REGEXIMP_H +// +// debugging support. Enable one or more of the #defines immediately following +// +//#define REGEX_SCAN_DEBUG +#define REGEX_DUMP_DEBUG +//#define REGEX_RUN_DEBUG +// End of #defines inteded to be directly set. 
+ +#ifdef REGEX_SCAN_DEBUG +#define REGEX_SCAN_DEBUG_PRINTF printf +#else +#define REGEX_SCAN_DEBUG_PRINTF +#endif + +#ifdef REGEX_DUMP_DEBUG +#define REGEX_DUMP_DEBUG_PRINTF printf +#else +#define REGEX_DUMP_DEBUG_PRINTF +#endif + +#ifdef REGEX_RUN_DEBUG +#define REGEX_RUN_DEBUG_PRINTF printf +#define REGEX_DUMP_DEBUG_PRINTF printf +#else +#define REGEX_RUN_DEBUG_PRINTF +#endif + +#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG) +#include +#endif + + // // Opcode types In the compiled form of the regexp, these are the type, or opcodes, // of the entries. // enum { URX_RESERVED_OP = 0, - URX_UNUSED1 = 1, + URX_BACKTRACK = 1, URX_END = 2, URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match URX_STRING = 4, // Value field is index of string start @@ -52,7 +84,7 @@ enum { // Used for debug printing only. #define URX_OPCODE_NAMES \ "URX_RESERVED_OP", \ - "URX_UNUSED1", \ + "URX_BACKTRACK", \ "END", \ "ONECHAR", \ "STRING", \ diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 52ac4af64f..10f92708b0 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const { -UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const { - int32_t s = start(group, status); - int32_t e = end(group, status); +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { + int32_t s = start(groupNum, status); + int32_t e = end(groupNum, status); // Note: calling start() and end() above will do all necessary checking that // the group number is OK and that a match exists. status will be set. @@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { int32_t opType; // the opcode int32_t opValue; // and the operand value. 
+ #ifdef REGEX_RUN_DEBUG + { + printf("MatchAt(startIdx=%d)\n", startIdx); + printf("Original Pattern: "); + int i; + for (i=0; ifPattern.length(); i++) { + printf("%c", fPattern->fPattern.charAt(i)); + } + printf("\n"); + printf("Input String: "); + for (i=0; ilength(); i++) { + UChar c = fInput->charAt(i); + if (c<32 || c>256) { + c = '.'; + } + printf("%c", c); + } + printf("\n"); + printf("\n"); + printf("PatLoc inputIdx char\n"); + } + #endif if (U_FAILURE(status)) { return; @@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { op = pat->elementAti(patIdx); opType = URX_TYPE(op); opValue = URX_VAL(op); - // printf("%d %d \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx)); + #ifdef REGEX_RUN_DEBUG + printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx)); + fPattern->dumpOp(patIdx); + #endif patIdx++; switch (opType) { @@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { break; + case URX_BACKTRACK: + // Force a backtrack. In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. + backTrack(inputIdx, patIdx); + break; + + case URX_ONECHAR: { UChar32 inputChar = fInput->char32At(inputIdx); @@ -909,7 +942,12 @@ breakFromLoop: fLastMatchEnd = fMatchEnd; fMatchStart = startIdx; fMatchEnd = inputIdx; + REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd); } + else + { + REGEX_RUN_DEBUG_PRINTF("No match\n\n"); + } return; } diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 227bd2d628..46268e6552 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -18,8 +18,6 @@ #include "regexcmp.h" #include "regeximp.h" -#include "stdio.h" // TODO: get rid of this... 
- U_NAMESPACE_BEGIN //-------------------------------------------------------------------------- @@ -197,7 +195,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const { //--------------------------------------------------------------------- RegexPattern *RegexPattern::compile( const UnicodeString ®ex, - int32_t flags, + uint32_t flags, UParseError &pe, UErrorCode &status) { @@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString ®ex, // flags // //--------------------------------------------------------------------- -int32_t RegexPattern::flags() const { +uint32_t RegexPattern::flags() const { return fFlags; } @@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const { //--------------------------------------------------------------------- // // split -// TODO: perl returns captured strings intermixed with the -// fields. Should we do this too? // //--------------------------------------------------------------------- int32_t RegexPattern::split(const UnicodeString &input, @@ -383,10 +379,28 @@ int32_t RegexPattern::split(const UnicodeString &input, int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart; dest[i].setTo(input, nextOutputStringStart, fieldLen); nextOutputStringStart = fMatcher->fMatchEnd; + + // If the delimiter pattern has capturing parentheses, the captured + // text goes out into the next n destination strings. + int32_t groupNum; + for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) { + if (i==destCapacity-1) { + break; + } + i++; + dest[i] = fMatcher->group(groupNum, status); + } + if (nextOutputStringStart == inputLen) { // The delimiter was at the end of the string. We're done. break; } + + if (i==destCapacity-1) { + // We've filled up the last output string with capture group data. + // Give back the last string, to be used for the remainder of the input. 
+ i--; + } } else { @@ -410,88 +424,102 @@ int32_t RegexPattern::split(const UnicodeString &input, //--------------------------------------------------------------------- static const char *opNames[] = {URX_OPCODE_NAMES}; -void RegexPattern::dump() { +void RegexPattern::dumpOp(int32_t index) const { + int32_t op = fCompiledPat->elementAti(index); + int32_t val = URX_VAL(op); + int32_t type = URX_TYPE(op); + int32_t pinnedType = type; + if (pinnedType >= sizeof(opNames)/sizeof(char *)) { + pinnedType = 0; + } + + REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index, op, opNames[pinnedType]); + switch (type) { + case URX_NOP: + case URX_DOTANY: + case URX_FAIL: + case URX_BACKSLASH_A: + case URX_BACKSLASH_G: + case URX_BACKSLASH_X: + case URX_END: + // Types with no operand field of interest. + break; + + case URX_START_CAPTURE: + case URX_END_CAPTURE: + case URX_STATIC_SETREF: + case URX_STATE_SAVE: + case URX_JMP: + case URX_BACKSLASH_B: + case URX_BACKSLASH_D: + case URX_BACKSLASH_W: + case URX_BACKSLASH_Z: + case URX_CARET: + case URX_DOLLAR: + case URX_STRING_LEN: + // types with an integer operand field. 
+ REGEX_DUMP_DEBUG_PRINTF("%d", val); + break; + + case URX_ONECHAR: + REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?'); + break; + + case URX_STRING: + { + int32_t lengthOp = fCompiledPat->elementAti(index+1); + U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); + int32_t length = URX_VAL(lengthOp); + int32_t i; + for (i=val; i= 256) {c = '.';} + REGEX_DUMP_DEBUG_PRINTF("%c", c); + } + } + break; + + case URX_SETREF: + { + REGEX_DUMP_DEBUG_PRINTF("%d ", val); + UnicodeString s; + UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); + set->toPattern(s, TRUE); + for (int32_t i=0; ielementAti(index); - val = URX_VAL(op); - type = URX_TYPE(op); - pinnedType = type; - if (pinnedType >= sizeof(opNames)/sizeof(char *)) { - pinnedType = 0; - } - - printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); - switch (type) { - case URX_NOP: - case URX_DOTANY: - case URX_FAIL: - case URX_BACKSLASH_A: - case URX_BACKSLASH_G: - case URX_BACKSLASH_X: - // Types with no operand field of interest. - break; - - case URX_START_CAPTURE: - case URX_END_CAPTURE: - case URX_SETREF: - case URX_STATIC_SETREF: - case URX_STATE_SAVE: - case URX_JMP: - case URX_BACKSLASH_B: - case URX_BACKSLASH_D: - case URX_BACKSLASH_W: - case URX_BACKSLASH_Z: - case URX_CARET: - case URX_DOLLAR: - // types with an integer operand field. 
- printf("%d", val); - break; - - case URX_ONECHAR: - printf("%c", val<256?val:'?'); - break; - - case URX_STRING: - stringStart = val; - break; - - case URX_STRING_LEN: - for (i=stringStart; i= 256) {c = '?';}; - printf("%c", c); - } - break; - - case URX_END: - goto breakFromLoop; - - default: - printf("??????"); - break; - } - printf("\n"); + for (index = 0; indexsize(); index++) { + dumpOp(index); } -breakFromLoop: - printf("\n\n"); + REGEX_DUMP_DEBUG_PRINTF("\n\n"); }; const char RegexPattern::fgClassID = 0; diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 4a4d9f2843..c6d6c4e353 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -81,6 +81,8 @@ enum { * to be applied to input text, and a few convenience methods for simple common * uses of regular expressions. * + *

<p>Class RegexPattern is not intended to be subclassed.</p>

+ * * @draft ICU 2.4 */ class U_I18N_API RegexPattern: public UObject { @@ -192,7 +194,7 @@ public: * @draft ICU 2.4 */ static RegexPattern *compile( const UnicodeString ®ex, - int32_t flags, + uint32_t flags, UParseError &pe, UErrorCode &status); @@ -202,7 +204,7 @@ public: * @return the match mode flags * @draft ICU 2.4 */ - virtual int32_t flags() const; + virtual uint32_t flags() const; /* * Creates a RegexMatcher that will match the given input against this pattern. The @@ -275,7 +277,7 @@ public: // // dump Debug function, displays the compiled form of a pattern. // - void dump(); + void dump() const; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. @@ -291,14 +293,12 @@ public: */ static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } - static const char fgClassID; - private: // // Implementation Data // UnicodeString fPattern; // The original pattern string. - int32_t fFlags; // The flags used when compiling the pattern. + uint32_t fFlags; // The flags used when compiling the pattern. // UVector *fCompiledPat; // The compiled pattern. UnicodeString fLiteralText; // Any literal string data from the pattern, @@ -317,6 +317,12 @@ private: UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined // regex character classes, e.g. Word. + /** + * The address of this static class variable serves as this class's ID + * for ICU "poor man's RTTI". + */ + static const char fgClassID; + friend class RegexCompile; friend class RegexMatcher; @@ -325,6 +331,7 @@ private: // void init(); // Common initialization, for use by constructors. void zap(); // Common cleanup + void dumpOp(int32_t index) const; @@ -343,6 +350,8 @@ private: * input text to which the expression can be applied. It includes methods * for testing for matches, and for find and replace operations. * + *

<p>Class RegexMatcher is not intended to be subclassed.</p>

+ * * @draft ICU 2.4 */ class U_I18N_API RegexMatcher: public UObject { @@ -355,6 +364,227 @@ public: */ virtual ~RegexMatcher(); + + /** + * Attempts to match the entire input string against the pattern. + * @param status A reference to a UErrorCode to receive any errors. + * @return TRUE if there is a match + * @draft ICU 2.4 + */ + virtual UBool matches(UErrorCode &status); + + + + /** + * Attempts to match the input string, starting from the beginning, against the pattern. + * Like the matches() method, this function always starts at the beginning of the input string; + * unlike that function, it does not require that the entire input string be matched. + * + *

<p>If the match succeeds then more information can be obtained via the start(), + * end(), and group() functions.</p>

+ * + * @param status A reference to a UErrorCode to receive any errors. + * @return TRUE if there is a match at the start of the input string. + * @draft ICU 2.4 + */ + virtual UBool lookingAt(UErrorCode &status); + + + /** + * Find the next pattern match in the input string. + * The find begins searching the input at the location following the end of + * the previous match, or at the start of the string if there is no previous match. + * If a match is found, start(), end() and group() + * will provide more information regarding the match. + *

<p>Note that if the input string is changed by the application, + * use find(startPos, status) instead of find(), because the saved starting + * position may not be valid with the altered input string.</p>

+ * @return TRUE if a match is found. + * @draft ICU 2.4 + */ + virtual UBool find(); + + + /** + * Resets this RegexMatcher and then attempts to find the next substring of the + * input string that matches the pattern, starting at the specified index. + * + * @param start the position in the input string to begin the search + * @param status A reference to a UErrorCode to receive any errors. + * @return TRUE if a match is found. + * @draft ICU 2.4 + */ + virtual UBool find(int32_t start, UErrorCode &status); + + + /* + * Returns a string containing the text matched by the previous match. + * If the pattern can match an empty string, an empty string may be returned. + * @param status A reference to a UErrorCode to receive any errors. + * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed. + * @return a string containing the matched input text. + * @draft ICU 2.4 + */ + virtual UnicodeString group(UErrorCode &status) const; + + + /** + * Returns a string containing the text captured by the given group + * during the previous match operation. Group(0) is the entire match. + * + * @param group the capture group number + * @param status A reference to a UErrorCode to receive any errors. + * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed and + * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. + * @return the captured text + * @draft ICU 2.4 + */ + virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; + + + /** + * Returns the number of capturing groups in this matcher's pattern. + * @return the number of capture groups + * @draft ICU 2.4 + */ + virtual int32_t groupCount() const; + + + /** + * Returns the index in the input string of the start of the text matched + * during the previous match operation. + * @param status a reference to a UErrorCode to receive any errors. 
+ * @return The position in the input string of the start of the last match. + * @draft ICU 2.4 + */ + virtual int32_t start(UErrorCode &status) const; + + + /** + * Returns the index in the input string of the start of the text matched by the + * specified capture group during the previous match operation. Return -1 if + * the capture group exists in the pattern, but was not part of the last match. + * + * @param group the capture group number + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed, and + * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number + * @return the start position of substring matched by the specified group. + * @draft ICU 2.4 + */ + virtual int32_t start(int group, UErrorCode &status) const; + + + /** + * Returns the index in the input string of the character following the + * text matched during the previous match operation. + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed. + * @return the index of the last character matched, plus one. + * @draft ICU 2.4 + */ + virtual int32_t end(UErrorCode &status) const; + + + /** + * Returns the index in the input string of the character following the + * text matched by the specified capture group during the previous match operation. + * @param group the capture group number + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed and + * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number + * @return the index of the last character, plus one, of the text + * captured by the specifed group during the previous match operation. + * Return -1 if the capture group was not part of the match. 
+ * @draft ICU 2.4 + */ + virtual int32_t end(int group, UErrorCode &status) const; + + + /** + * Resets this matcher. The effect is to remove any memory of previous matches, + * and to cause subsequent find() operations to begin at the beginning of + * the input string. + * + * @return this RegexMatcher. + * @draft ICU 2.4 + */ + virtual RegexMatcher &reset(); + + + /** + * Resets this matcher with a new input string. This allows instances of RegexMatcher + * to be reused, which is more efficient than creating a new RegexMatcher for + * each input string to be processed. + * @return this RegexMatcher. + * @draft ICU 2.4 + */ + virtual RegexMatcher &reset(const UnicodeString &input); + + + /** + * Returns the input string being matched. The returned string is not a copy, + * but the live input string. It should not be altered or deleted. + * @return the input string + * @draft ICU 2.4 + */ + virtual const UnicodeString &input() const; + + + /** + * Returns the pattern that is interpreted by this matcher. + * @return the RegexPattern for this RegexMatcher + * @draft ICU 2.4 + */ + virtual const RegexPattern &pattern() const; + + + /** + * Replaces every substring of the input that matches the pattern + * with the given replacement string. This is a convenience function that + * provides a complete find-and-replace-all operation. + * + * This method first resets this matcher. It then scans the input string + * looking for matches of the pattern. Input that is not part of any + * match is left unchanged; each match is replaced in the result by the + * replacement string. The replacement string may contain references to + * capture groups. + * + * @param replacement a string containing the replacement text. + * @param status a reference to a UErrorCode to receive any errors. + * @return a string containing the results of the find and replace. 
+ * @draft ICU 2.4 + */ + virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); + + + /** + * Replaces the first substring of the input that matches + * the pattern with the replacement string. This is a convenience + * function that provides a complete find-and-replace operation. + * + *

<p>This function first resets this RegexMatcher. It then scans the input string + * looking for a match of the pattern. Input that is not part + * of the match is appended directly to the result string; the match is replaced + * in the result by the replacement string. The replacement string may contain + * references to captured groups.</p>

+ * + *

<p>The state of the matcher (the position at which a subsequent find() + * would begin) after completing a replaceFirst() is not specified. The + * RegexMatcher should be reset before doing additional find() operations.</p>

+ * + * @param replacement a string containing the replacement text. + * @param status a reference to a UErrorCode to receive any errors. + * @return a string containing the results of the find and replace. + * @draft ICU 2.4 + */ + virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); + /** * Implements a replace operation intended to be used as part of an * incremental find-and-replace. @@ -398,219 +628,6 @@ public: */ virtual UnicodeString &appendTail(UnicodeString &dest); - - /** - * Returns the index in the input string of the character following the - * text matched during the previous match operation. - * @param status A reference to a UErrorCode to receive any errors. Possible - * errors are U_REGEX_INVALID_STATE if no match has been - * attempted or the last match failed. - * @return the index of the last character matched, plus one. - * @draft ICU 2.4 - */ - virtual int32_t end(UErrorCode &status) const; - - - /** - * Returns the index in the input string of the character following the - * text matched by the specified capture group during the previous match operation. - * @param group the capture group number - * @param status A reference to a UErrorCode to receive any errors. Possible - * errors are U_REGEX_INVALID_STATE if no match has been - * attempted or the last match failed and - * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number - * @return the index of the last character, plus one, of the text - * captured by the specifed group during the previous match operation. - * Return -1 if the capture group was not part of the match. - * @draft ICU 2.4 - */ - virtual int32_t end(int group, UErrorCode &status) const; - - - /** - * Find the next pattern match in the input string. - * The find begins searching the input at the location following the end of - * the previous match, or at the start of the string if there is no previous match. 
- * If a match is found, start(), end() and group() - * will provide more information regarding the match. - * @return TRUE if a match is found. - * @draft ICU 2.4 - */ - virtual UBool find(); - - - /** - * Resets this RegexMatcher and then attempts to find the next substring of the - * input string that matches the pattern, starting at the specified index. - * - * @param status the position in the input string to begin the search - * @param status A reference to a UErrorCode to receive any errors. - * @return TRUE if a match is found. - * @draft ICU 2.4 - */ - virtual UBool find(int32_t start, UErrorCode &status); - - - /* - * Returns a string containing the text matched by the previous match. - * If the pattern can match an empty string, an empty string may be returned. - * @param status A reference to a UErrorCode to receive any errors. - * Possible errors are U_REGEX_INVALID_STATE if no match - * has been attempted or the last match failed. - * @return a string containing the matched input text. - * @draft ICU 2.4 - */ - virtual UnicodeString group(UErrorCode &status) const; - - - /** - * Returns a string containing the text captured by the given group - * during the previous match operation. Group(0) is the entire match. - * - * @param group the capture group number - * @param status A reference to a UErrorCode to receive any errors. - * Possible errors are U_REGEX_INVALID_STATE if no match - * has been attempted or the last match failed and - * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. - * @return the captured text - * @draft ICU 2.4 - */ - virtual UnicodeString group(int32_t group, UErrorCode &status) const; - - - /** - * Returns the number of capturing groups in this matcher's pattern. - * @return the number of capture groups - * @draft ICU 2.4 - */ - virtual int32_t groupCount() const; - - - /** - * Returns the input string being matched. The returned string is not a copy, - * but the live input string. It should not be altered or deleted. 
- * @return the input string - * @draft ICU 2.4 - */ - virtual const UnicodeString &input() const; - - - /** - * Attempts to match the input string, starting from the beginning, against the pattern. - * Like the matches() method, this function always starts at the beginning of the input string; - * unlike that function, it does not require that the entire input string be matched. - * - *

<p>If the match succeeds then more information can be obtained via the start(), - * end(), and group() functions.</p>

- * - * @param status A reference to a UErrorCode to receive any errors. - * @return TRUE if there is a match at the start of the input string. - * @draft ICU 2.4 - */ - virtual UBool lookingAt(UErrorCode &status); - - - /** - * Attempts to match the entire input string against the pattern. - * @param status A reference to a UErrorCode to receive any errors. - * @return TRUE if there is a match - * @draft ICU 2.4 - */ - virtual UBool matches(UErrorCode &status); - - - /** - * Returns the pattern that is interpreted by this matcher. - * @return the RegexPattern for this RegexMatcher - * @draft ICU 2.4 - */ - virtual const RegexPattern &pattern() const; - - - /** - * Replaces every substring of the input that matches the pattern - * with the given replacement string. This is a convenience function that - * provides a complete find-and-replace-all operation. - * - * This method first resets this matcher. It then scans the input string - * looking for matches of the pattern. Input that is not part of any - * match is left unchanged; each match is replaced in the result by the - * replacement string. The replacement string may contain references to - * capture groups. - * - * @param replacement a string containing the replacement text. - * @param status a reference to a UErrorCode to receive any errors. - * @return a string containing the results of the find and replace. - * @draft ICU 2.4 - */ - virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); - - - /** - * Replaces the first substring of the input that matches - * the pattern with the replacement string. This is a convenience - * function that provides a complete find-and-replace operation. - * - * This function first resets this RegexMatcher. It then scans the input string - * looking for a match of the pattern. Input that is not part - * of the match is appended directly to the result string; the match is replaced - * in the result by the replacement string. 
The replacement string may contain - * references to captured groups. - * - * @param replacement a string containing the replacement text. - * @param status a reference to a UErrorCode to receive any errors. - * @return a string containing the results of the find and replace. - * @draft ICU 2.4 - */ - virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); - - - /** - * Resets this matcher. The effect is to remove any memory of previous matches, - * and to cause subsequent find() operations to begin at the beginning of - * the input string. - * - * @return this RegexMatcher. - * @draft ICU 2.4 - */ - virtual RegexMatcher &reset(); - - - /** - * Resets this matcher with a new input string. This allows instances of RegexMatcher - * to be reused, which is more efficient than creating a new RegexMatcher for - * each input string to be processed. - * @return this RegexMatcher. - * @draft ICU 2.4 - */ - virtual RegexMatcher &reset(const UnicodeString &input); - - - /** - * Returns the index in the input string of the start of the text matched - * during the previous match operation. - * @param status a reference to a UErrorCode to receive any errors. - * @return The position in the input string of the start of the last match. - * @draft ICU 2.4 - */ - virtual int32_t start(UErrorCode &status) const; - - - /** - * Returns the index in the input string of the start of the text matched by the - * specified capture group during the previous match operation. Return -1 if - * the capture group exists in the pattern, but was not part of the last match. - * - * @param group the capture group number - * @param status A reference to a UErrorCode to receive any errors. Possible - * errors are U_REGEX_INVALID_STATE if no match has been - * attempted or the last match failed, and - * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number - * @return the start position of substring matched by the specified group. 
- * @draft ICU 2.4 - */ - virtual int32_t start(int group, UErrorCode &status) const; - /** * ICU "poor man's RTTI", returns a UClassID for the actual class. @@ -626,8 +643,6 @@ public: */ static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } - static const char fgClassID; - private: // Constructors and other object boilerplate are private. // Instances of RegexMatcher can not be assigned, copied, cloned, etc. @@ -658,6 +673,13 @@ private: UVector *fCaptureStarts; UVector *fCaptureEnds; + /** + * The address of this static class variable serves as this class's ID + * for ICU "poor man's RTTI". + */ + static const char fgClassID; + + }; U_NAMESPACE_END diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index e97651e62b..4618deda6c 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -368,7 +368,7 @@ void RegexTest::Basic() { // #if 0 { - REGEX_FIND("(?:ABC)+", "<0>ABCABCABCD"); + REGEX_FIND("[{ab}]", "a"); } exit(1); #endif @@ -436,6 +436,9 @@ void RegexTest::Basic() { REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); + // Set contains only a string, no individual chars. 
+ REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE); + // // OR operator in patterns // @@ -975,6 +978,52 @@ void RegexTest::API_Pattern() { delete pat1; + // split, with a pattern with (capture) + pat1 = RegexPattern::compile("<(\\w*)>", pe, status); + REGEX_CHECK_STATUS; + + n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(n==6); + REGEX_ASSERT(fields[0]==""); + REGEX_ASSERT(fields[1]=="a"); + REGEX_ASSERT(fields[2]=="Now is "); + REGEX_ASSERT(fields[3]=="b"); + REGEX_ASSERT(fields[4]=="the time"); + REGEX_ASSERT(fields[5]=="c"); + REGEX_ASSERT(fields[6]==""); + + n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(n==6); + REGEX_ASSERT(fields[0]==" "); + REGEX_ASSERT(fields[1]=="a"); + REGEX_ASSERT(fields[2]=="Now is "); + REGEX_ASSERT(fields[3]=="b"); + REGEX_ASSERT(fields[4]=="the time"); + REGEX_ASSERT(fields[5]=="c"); + REGEX_ASSERT(fields[6]==""); + + n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(n==4); + REGEX_ASSERT(fields[0]==" "); + REGEX_ASSERT(fields[1]=="a"); + REGEX_ASSERT(fields[2]=="Now is "); + REGEX_ASSERT(fields[3]=="the time<c>"); + delete pat1; + + pat1 = RegexPattern::compile("([-,])", pe, status); + REGEX_CHECK_STATUS; + n = pat1->split("1-10,20", fields, 10, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(n==5); + REGEX_ASSERT(fields[0]=="1"); + REGEX_ASSERT(fields[1]=="-"); + REGEX_ASSERT(fields[2]=="10"); + REGEX_ASSERT(fields[3]==","); + REGEX_ASSERT(fields[4]=="20"); + delete pat1; }