ICU-105 Regular Expressions, changes from code review

X-SVN-Rev: 10294
2002-11-19 19:31:03 +00:00 · 2002-11-19 19:31:03 +00:00 · 24bf088281
commit 24bf088281
parent bf1f6b1213
9 changed files with 556 additions and 353 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_PROPERTY_SYNTAX",
    "U_REGEX_UNIMPLEMENTED",
    "U_REGEX_MISMATCHED_PAREN",
    "U_REGEX_MATCH_MODE_ERROR"
 };
 U_CAPI const char * U_EXPORT2
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -500,18 +500,17 @@ typedef enum UErrorCode {
    /*
     * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
     */
-     U_REGEX_ERROR_START=0x10300,
+     U_REGEX_ERROR_START=0x10300,          /**< Start of codes indicating Regexp failures */
-     U_REGEX_INTERNAL_ERROR,
+     U_REGEX_INTERNAL_ERROR,               /**< An internal error (bug) was detected.     */
-     U_REGEX_RULE_SYNTAX,
+     U_REGEX_RULE_SYNTAX,                  /**< Syntax error in regexp pattern.           */
-     U_REGEX_INVALID_STATE,
+     U_REGEX_INVALID_STATE,                /**< RegexMatcher in invalid state for requested operation */
-     U_REGEX_BAD_ESCAPE_SEQUENCE,
+     U_REGEX_BAD_ESCAPE_SEQUENCE,          /**< Unrecognized backslash escape sequence in pattern */
-     U_REGEX_PROPERTY_SYNTAX,
+     U_REGEX_PROPERTY_SYNTAX,              /**< Incorrect Unicode property                */
-     U_REGEX_UNIMPLEMENTED,
+     U_REGEX_UNIMPLEMENTED,                /**< Use of regexp feature that is not yet implemented. */
-     U_REGEX_MISMATCHED_PAREN,
+     U_REGEX_MISMATCHED_PAREN,             /**< Incorrectly nested parentheses in regexp pattern. */
-     U_REGEX_MATCH_MODE_ERROR,
+     U_REGEX_ERROR_LIMIT,                  /**< This must always be the last value to indicate the limit for regexp errors */
     U_REGEX_ERROR_LIMIT,
-    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
+    U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT      /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
 } UErrorCode;
 /* Use the following to determine if an UErrorCode represents */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -28,8 +28,6 @@
 #include "ucln_in.h"
 #include "mutex.h"
 #include "stdio.h"      // TODO:  Get rid of this
 #include "regeximp.h"
 #include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
@ -40,7 +38,6 @@
 U_NAMESPACE_BEGIN
 const char       RegexCompile::fgClassID=0;
 static const int RESCAN_DEBUG = 0;
 //----------------------------------------------------------------------------------------
 //
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
    //
    //  Set up the constant (static) Unicode Sets.
    //    TODO:  something cleaner for that -128 constant.
    //
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128],   gRuleSet_rule_char_pattern,  status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern,      status);
@ -282,14 +280,12 @@ void    RegexCompile::compile(
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
-        if (RESCAN_DEBUG) {
+        REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d)    state=%s ",
-            printf( "char, line, col = (\'%c\', %d, %d)    state=%s ",
+            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
                fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
        }
        for (;;) {    // loop through table rows belonging to this state, looking for one
                      //   that matches the current input char.
-            if (RESCAN_DEBUG) { printf( ".");}
+            REGEX_SCAN_DEBUG_PRINTF( ".");
            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not quoted, and
@ -323,7 +319,7 @@ void    RegexCompile::compile(
            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
-        if (RESCAN_DEBUG) { printf( "\n");}
+        REGEX_SCAN_DEBUG_PRINTF("\n");
        //
        // We've found the row of the state table that matches the current input
@ -340,7 +336,7 @@ void    RegexCompile::compile(
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_REGEX_INTERNAL_ERROR);
-                // printf( "RegexCompile::parse() - state stack overflow.\n");
+                REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
@ -369,6 +365,36 @@ void    RegexCompile::compile(
    }
    //
    // The pattern has now been read and processed, and the compiled code generated.
    //
    //
    // Compute the number of digits required for the largest capture group number.
    //
    fRXPat->fMaxCaptureDigits = 1;
    int32_t  n = 10;
    for (;;) {
        if (n > fRXPat->fNumCaptureGroups) {
            break;
        }
        fRXPat->fMaxCaptureDigits++;
        n *= 10;
    }
    //
    // A stupid bit of non-sense to prevent code coverage testing from complaining
    //   about the pattern.dump() debug function.  Go through the motions of dumping,
    //   even though, without the #define set, it will do nothing.
    //
 #ifndef REGEX_DUMP_DEBUG
    static UBool phonyDumpDone = FALSE;
    if (phonyDumpDone==FALSE) {
        fRXPat->dump();
        phonyDumpDone = TRUE;
    }
 #endif
 }
@ -1094,27 +1120,39 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
    if (theSet == NULL) {
        return;
    }
-    if (theSet->size() > 1) {
+    int32_t  setSize = theSet->size();
-        //  The set contains two or more chars.
+    UChar32  firstSetChar = theSet->charAt(0);
-        //  Put it into the compiled pattern as a set.
+    if (firstSetChar == -1) {
-        int32_t setNumber = fRXPat->fSets->size();
+        // Sets that contain only strings, but no individual chars,
-        fRXPat->fSets->addElement(theSet, *fStatus);
+        // will end up here.   TODO:  figure out what to do with sets containing strings.
-        int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
+        setSize = 0;
        fRXPat->fCompiledPat->addElement(setOp, *fStatus);
    }
-    else
+
-    {
+    switch (setSize) {
-        // The set contains only a single code point.  Put it into
+    case 0:      // Set of no elements.   Always fails to match.  
-        //   the compiled pattern as a single char operation rather
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
-        //   than a set, and discard the set itself.
+        break;
-        UChar32  c = theSet->charAt(0);
+        
-        if (c == -1) {
+    case 1:
-            // Set contained no chars.  Stuff an invalid char that can't match.
+        {
-            c = 0x1fffff;
+            // The set contains only a single code point.  Put it into
            //   the compiled pattern as a single char operation rather
            //   than a set, and discard the set itself.
            int32_t  charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
            delete theSet;
        }
        break;
    default: 
        {
            //  The set contains two or more chars.  (the normal case)
            //  Put it into the compiled pattern as a set.
            int32_t setNumber = fRXPat->fSets->size();
            fRXPat->fSets->addElement(theSet, *fStatus);
            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
        }
        int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
        fRXPat->fCompiledPat->addElement(charToken, *fStatus);
        delete theSet;
    }
 }
@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
-        printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
+        REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        error(localStatus);
        delete uset;
        return NULL;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -28,8 +28,6 @@
 U_NAMESPACE_BEGIN
 static const UBool REGEX_DEBUG = TRUE;
 //--------------------------------------------------------------------------------
 //
 //  class RegexCompile    Contains the regular expression compiler.
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -13,13 +13,45 @@
 #define _REGEXIMP_H
 //
 //  debugging support.  Enable one or more of the #defines immediately following
 //
 //#define REGEX_SCAN_DEBUG
 #define REGEX_DUMP_DEBUG
 //#define REGEX_RUN_DEBUG
 //  End of #defines intended to be directly set.
 //
 //  Derived debug-print macros.  Each REGEX_xxx_DEBUG_PRINTF expands to printf when
 //    the corresponding debug switch is defined, and to nothing otherwise.
 //    Note that with the empty expansion the argument list is still compiled as
 //    a parenthesized expression, so debug arguments must be free of side effects.
 //
 #ifdef REGEX_SCAN_DEBUG
 #define REGEX_SCAN_DEBUG_PRINTF printf
 #else
 #define REGEX_SCAN_DEBUG_PRINTF
 #endif
 //  Run-time debugging also prints individual pattern ops, so it turns on the
 //    dump printf as well.  Deciding this in a single #if avoids defining
 //    REGEX_DUMP_DEBUG_PRINTF as empty and then redefining it as printf
 //    when only REGEX_RUN_DEBUG is set.
 #if defined(REGEX_DUMP_DEBUG) || defined(REGEX_RUN_DEBUG)
 #define REGEX_DUMP_DEBUG_PRINTF printf
 #else
 #define REGEX_DUMP_DEBUG_PRINTF
 #endif
 #ifdef REGEX_RUN_DEBUG
 #define REGEX_RUN_DEBUG_PRINTF printf
 #else
 #define REGEX_RUN_DEBUG_PRINTF
 #endif
 //  stdio is needed only when some form of debug output is enabled.
 #if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
 #include <stdio.h>
 #endif
 //
 //  Opcode types     In the compiled form of the regexp, these are the type, or opcodes,
 //                   of the entries.
 //
 enum {
     URX_RESERVED_OP   = 0,
-     URX_UNUSED1       = 1,
+     URX_BACKTRACK     = 1,
     URX_END           = 2,
     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
     URX_STRING        = 4,    // Value field is index of string start
@ -52,7 +84,7 @@ enum {
 //   Used for debug printing only.
 #define URX_OPCODE_NAMES       \
        "URX_RESERVED_OP",     \
-        "URX_UNUSED1",         \
+        "URX_BACKTRACK",       \
        "END",                 \
        "ONECHAR",             \
        "STRING",              \
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
-UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const {
+UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
-    int32_t  s = start(group, status);
+    int32_t  s = start(groupNum, status);
-    int32_t  e = end(group, status);
+    int32_t  e = end(groupNum, status);
    // Note:  calling start() and end() above will do all necessary checking that
    //        the group number is OK and that a match exists.  status will be set.
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
    int32_t     opType;                //    the opcode
    int32_t     opValue;               //    and the operand value.
    #ifdef REGEX_RUN_DEBUG
    {
        printf("MatchAt(startIdx=%d)\n", startIdx);
        printf("Original Pattern: ");
        int i;
        for (i=0; i<fPattern->fPattern.length(); i++) {
            printf("%c", fPattern->fPattern.charAt(i));
        }
        printf("\n");
        printf("Input String: ");
        for (i=0; i<fInput->length(); i++) {
            UChar c = fInput->charAt(i);
            if (c<32 || c>256) {
                c = '.';
            }
            printf("%c", c);
        }
        printf("\n");
        printf("\n");
        printf("PatLoc  inputIdx  char\n");
    }
    #endif
    if (U_FAILURE(status)) {
        return;
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
        op      = pat->elementAti(patIdx);
        opType  = URX_TYPE(op);
        opValue = URX_VAL(op);
-        // printf("%d   %d  \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx));
+        #ifdef REGEX_RUN_DEBUG
            printf("inputIdx=%d   inputChar=%c    ", inputIdx, fInput->char32At(inputIdx));
            fPattern->dumpOp(patIdx);
        #endif
        patIdx++;
        switch (opType) {
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            break;
        case URX_BACKTRACK:
            // Force a backtrack.  In some circumstances, the pattern compiler
            //   will notice that the pattern can't possibly match anything, and will
            //   emit one of these at that point.
            backTrack(inputIdx, patIdx);
            break;
        case URX_ONECHAR:
            {
                UChar32 inputChar = fInput->char32At(inputIdx);
@ -909,7 +942,12 @@ breakFromLoop:
        fLastMatchEnd = fMatchEnd;
        fMatchStart   = startIdx;
        fMatchEnd     = inputIdx;
        REGEX_RUN_DEBUG_PRINTF("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd);
        }
    else
    {
        REGEX_RUN_DEBUG_PRINTF("No match\n\n");
    }
    return;
 }
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -18,8 +18,6 @@
 #include "regexcmp.h"
 #include "regeximp.h"
 #include "stdio.h"    // TODO:  get rid of this...
 U_NAMESPACE_BEGIN
 //--------------------------------------------------------------------------
@ -197,7 +195,7 @@ UBool   RegexPattern::operator ==(const RegexPattern &other) const {
 //---------------------------------------------------------------------
 RegexPattern  *RegexPattern::compile(
                             const UnicodeString &regex,
-                             int32_t              flags,
+                             uint32_t             flags,
                             UParseError          &pe,
                             UErrorCode           &status)  {
@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
 //   flags
 //
 //---------------------------------------------------------------------
-int32_t RegexPattern::flags() const {
+uint32_t RegexPattern::flags() const {
    return fFlags;
 }
@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
 //---------------------------------------------------------------------
 //
 //   split
 //            TODO:  perl returns captured strings intermixed with the
 //                   fields.  Should we do this too?
 //
 //---------------------------------------------------------------------
 int32_t  RegexPattern::split(const UnicodeString &input,
@ -383,10 +379,28 @@ int32_t  RegexPattern::split(const UnicodeString &input,
            int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
            dest[i].setTo(input, nextOutputStringStart, fieldLen);
            nextOutputStringStart = fMatcher->fMatchEnd;
            // If the delimiter pattern has capturing parentheses, the captured
            //  text goes out into the next n destination strings.
            int32_t groupNum;
            for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
                if (i==destCapacity-1) {
                    break;
                }
                i++;
                dest[i] = fMatcher->group(groupNum, status);
            }
            if (nextOutputStringStart == inputLen) {
                // The delimiter was at the end of the string.  We're done.
                break;
            }
            if (i==destCapacity-1) {
                // We've filled up the last output string with capture group data.
                //  Give back the last string, to be used for the remainder of the input.
                i--;
            }
        }
        else
        {
@ -410,88 +424,102 @@ int32_t  RegexPattern::split(const UnicodeString &input,
 //---------------------------------------------------------------------
 static const char *opNames[] = {URX_OPCODE_NAMES};
-void   RegexPattern::dump() {
+void   RegexPattern::dumpOp(int32_t index) const {
    //
    //  Debug function.  Prints a one line description of the compiled pattern
    //    operation at position "index":  the index, the raw 32 bit op in hex,
    //    the opcode name, and the operand in whatever form suits the opcode.
    //    All output goes through REGEX_DUMP_DEBUG_PRINTF.
    //
    //    index    position of the operation within fCompiledPat.
    //
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);

    // Pin unrecognized opcode types to entry 0, so that the name lookup
    //   stays within the bounds of opNames[].
    int32_t pinnedType  = type;
    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }
    REGEX_DUMP_DEBUG_PRINTF("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);

    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_FAIL:
    case URX_BACKSLASH_A:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_BACKTRACK:
        // Types with no operand field of interest.
        //   (URX_BACKTRACK is emitted by the compiler with an operand of zero.)
        break;

    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATIC_SETREF:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_W:
    case URX_BACKSLASH_Z:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_STRING_LEN:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;

    case URX_ONECHAR:
        // Only chars below 256 are printed as themselves.
        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;

    case URX_STRING:
        {
            // The operand is the start index of the string in fLiteralText;
            //   the length is carried by the URX_STRING_LEN op that follows.
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF("%c", c);
            }
        }
        break;

    case URX_SETREF:
        {
            // The operand is an index into fSets.  Print the index, followed by
            //   the set itself in pattern form.
            REGEX_DUMP_DEBUG_PRINTF("%d ", val);
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
            }
        }
        // Without this break, control fell into the default case and
        //   "??????" was appended after every set.
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF("??????");
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
 }
 void   RegexPattern::dump() const {
    int      index;
    int      i;
    UChar    c;
    int32_t  op;
    int32_t  pinnedType;
    int32_t  type;
    int32_t  val;
    int32_t  stringStart;
-
+    REGEX_DUMP_DEBUG_PRINTF("Original Pattern:  ");
    printf("Original Pattern:  ");
    for (i=0; i<fPattern.length(); i++) {
-        printf("%c", fPattern.charAt(i));
+        REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
    }
-    printf("\n");
+    REGEX_DUMP_DEBUG_PRINTF("\n");
-    printf("Pattern Valid?:     %s\n", fBadState? "no" : "yes");
+    REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?:     %s\n", fBadState? "no" : "yes");
-    printf("\nIndex   Binary     Type             Operand\n"
+    REGEX_DUMP_DEBUG_PRINTF("\nIndex   Binary     Type             Operand\n"
           "-------------------------------------------\n");
-    for (index = 0; ; index++) {
+    for (index = 0; index<fCompiledPat->size(); index++) {
-        op         = fCompiledPat->elementAti(index);
+        dumpOp(index);
        val        = URX_VAL(op);
        type       = URX_TYPE(op);
        pinnedType = type;
        if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
            pinnedType = 0;
        }
        printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
        switch (type) {
        case URX_NOP:
        case URX_DOTANY:
        case URX_FAIL:
        case URX_BACKSLASH_A:
        case URX_BACKSLASH_G:
        case URX_BACKSLASH_X:
            // Types with no operand field of interest.
            break;
        case URX_START_CAPTURE:
        case URX_END_CAPTURE:
        case URX_SETREF:
        case URX_STATIC_SETREF:
        case URX_STATE_SAVE:
        case URX_JMP:
        case URX_BACKSLASH_B:
        case URX_BACKSLASH_D:
        case URX_BACKSLASH_W:
        case URX_BACKSLASH_Z:
        case URX_CARET:
        case URX_DOLLAR:
            // types with an integer operand field.
            printf("%d", val);
            break;
        case URX_ONECHAR:
            printf("%c", val<256?val:'?');
            break;
        case URX_STRING:
            stringStart = val;
            break;
        case URX_STRING_LEN:
            for (i=stringStart; i<stringStart+val; i++) {
                c = fLiteralText[i];
                if (c >= 256) {c = '?';};
                printf("%c", c);
            }
            break;
        case URX_END:
            goto breakFromLoop;
        default:
            printf("??????");
            break;
        }
        printf("\n");
    }
-breakFromLoop:
+    REGEX_DUMP_DEBUG_PRINTF("\n\n");
    printf("\n\n");
 };
 const char RegexPattern::fgClassID = 0;
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -81,6 +81,8 @@ enum {
  * to be applied to input text, and a few convenience methods for simple common
  * uses of regular expressions.
  *
  * <p>Class RegexPattern is not intended to be subclassed.</p>
  *
  * @draft ICU 2.4
  */
 class U_I18N_API RegexPattern: public UObject {
@ -192,7 +194,7 @@ public:
    *    @draft ICU 2.4
    */
    static RegexPattern *compile( const UnicodeString &regex,
-        int32_t              flags,
+        uint32_t             flags,
        UParseError          &pe,
        UErrorCode           &status); 
@ -202,7 +204,7 @@ public:
    *     @return  the match mode flags
    *     @draft ICU 2.4
    */
-    virtual int32_t flags() const;
+    virtual uint32_t flags() const;
   /**
    *  Creates a RegexMatcher that will match the given input against this pattern.  The
@ -275,7 +277,7 @@ public:
    //
    //   dump   Debug function, displays the compiled form of a pattern.
    //
-    void dump();
+    void dump() const;
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -291,14 +293,12 @@ public:
    */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
    static const char fgClassID;
 private:
    //
    //  Implementation Data
    //
    UnicodeString   fPattern;      // The original pattern string.
-    int32_t         fFlags;        // The flags used when compiling the pattern.
+    uint32_t        fFlags;        // The flags used when compiling the pattern.
                                   //   
    UVector         *fCompiledPat; // The compiled pattern.
    UnicodeString   fLiteralText;  // Any literal string data from the pattern, 
@ -317,6 +317,12 @@ private:
    UnicodeSet    **fStaticSets;  // Ptr to static (shared) sets for predefined
                                    //   regex character classes, e.g. Word.
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
    friend class RegexCompile;
    friend class RegexMatcher;
@ -325,6 +331,7 @@ private:
    //
    void        init();            // Common initialization, for use by constructors.
    void        zap();             // Common cleanup
    void        dumpOp(int32_t index) const;
@ -343,6 +350,8 @@ private:
  *  input text to which the expression can be applied.  It includes methods
  *  for testing for matches, and for find and replace operations.
  *
  * <p>Class RegexMatcher is not intended to be subclassed.</p>
  *
  * @draft ICU 2.4
  */
  class U_I18N_API RegexMatcher: public UObject {
@ -355,6 +364,227 @@ public:
    */
    virtual ~RegexMatcher();
   /**
    *   Attempts to match the entire input string against the pattern.
    *    @param   status     A reference to a UErrorCode to receive any errors. 
    *    @return TRUE if there is a match
    *    @draft ICU 2.4
    */
    virtual UBool matches(UErrorCode &status);
   /**
    *   Attempts to match the input string, starting from the beginning, against the pattern.
    *   Like the matches() method, this function always starts at the beginning of the input string;
    *   unlike that function, it does not require that the entire input string be matched.
    *
    *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
    *     <code>end()</code>, and <code>group()</code> functions.</p>
    *
    *    @param   status     A reference to a UErrorCode to receive any errors. 
    *    @return  TRUE if there is a match at the start of the input string.
    *    @draft ICU 2.4
    */
    virtual UBool lookingAt(UErrorCode &status);
   /**
    *  Find the next pattern match in the input string.
    *  The find begins searching the input at the location following the end of
    *  the previous match, or at the start of the string if there is no previous match.
    *  If a match is found, <code>start(), end()</code> and <code>group()</code>
    *  will provide more information regarding the match.
    *  <p>Note that if the input string is changed by the application, 
    *     use find(startPos, status) instead of find(), because the saved starting
    *     position may not be valid with the altered input string.</p>
    *  @return  TRUE if a match is found.
    *  @draft ICU 2.4
    */
    virtual UBool find();
   /**
    *   Resets this RegexMatcher and then attempts to find the next substring of the 
    *   input string that matches the pattern, starting at the specified index. 
    *
    *   @param   start     the position in the input string to begin the search
    *   @param   status    A reference to a UErrorCode to receive any errors.  
    *   @return  TRUE if a match is found.
    *   @draft ICU 2.4
    */
    virtual UBool find(int32_t start, UErrorCode &status); 
   /**
    *   Returns a string containing the text matched by the previous match. 
    *   If the pattern can match an empty string, an empty string may be returned.
    *   @param   status      A reference to a UErrorCode to receive any errors.  
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    *                        has been attempted or the last match failed. 
    *   @return  a string containing the matched input text.  
    *   @draft ICU 2.4
    */
    virtual UnicodeString group(UErrorCode &status) const;
   /**
    *    Returns a string containing the text captured by the given group
    *    during the previous match operation.  Group(0) is the entire match.
    *   
    *    @param groupNum the capture group number
    *    @param   status     A reference to a UErrorCode to receive any errors.  
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    *                        has been attempted or the last match failed and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    *    @return the captured text
    *    @draft ICU 2.4
    */
    virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 
   /**
    *   Returns the number of capturing groups in this matcher's pattern.
    *   @return the number of capture groups
    *   @draft ICU 2.4
    */
    virtual int32_t groupCount() const;
   /**
    *   Returns the index in the input string of the start of the text matched 
    *   during the previous match operation. 
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              The position in the input string of the start of the last match.
    *    @draft ICU 2.4
    */
    virtual int32_t start(UErrorCode &status) const;
   /**
    *   Returns the index in the input string of the start of the text matched by the
    *    specified capture group during the previous match operation.  Return -1 if
    *    the capture group exists in the pattern, but was not part of the last match.
    *
    *    @param  group       the capture group number
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed, and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    *    @return the start position of substring matched by the specified group.
    *    @draft ICU 2.4
    */
    virtual int32_t start(int group, UErrorCode &status) const;
   /**
    *    Returns the index in the input string of the character following the
    *    text matched during the previous match operation.  
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed.
    *    @return the index of the last character matched, plus one.
    *   @draft ICU 2.4
    */
    virtual int32_t end(UErrorCode &status) const;
   /**
    *    Returns the index in the input string of the character following the
    *    text matched by the specified capture group during the previous match operation.
    *    @param group  the capture group number
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    *    @return  the index of the last character, plus one, of the text 
    *              captured by the specified group during the previous match operation. 
    *              Return -1 if the capture group was not part of the match.
    *    @draft ICU 2.4
    */
    virtual int32_t end(int group, UErrorCode &status) const; 
   /**
    *   Resets this matcher.  The effect is to remove any memory of previous matches,
    *       and to cause subsequent find() operations to begin at the beginning of
    *       the input string.
    *
    *   @return this RegexMatcher.
    *   @draft ICU 2.4
    */
    virtual RegexMatcher &reset();
   /**
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
    *     to be reused, which is more efficient than creating a new RegexMatcher for
    *     each input string to be processed.
    *   @return this RegexMatcher.
    *   @draft ICU 2.4
    */
    virtual RegexMatcher &reset(const UnicodeString &input);  
   /**
    *   Returns the input string being matched.  The returned string is not a copy,
    *   but the live input string.  It should not be altered or deleted.
    *   @return the input string
    *   @draft ICU 2.4
    */
    virtual const UnicodeString &input() const; 
   /**
    *    Returns the pattern that is interpreted by this matcher.
    *    @return  the RegexPattern for this RegexMatcher
    *    @draft ICU 2.4
    */
    virtual const RegexPattern &pattern() const;
   /**
    *    Replaces every substring of the input that matches the pattern
    *    with the given replacement string.  This is a convenience function that
    *    provides a complete find-and-replace-all operation.
    *
    *    This method first resets this matcher. It then scans the input string
    *    looking for matches of the pattern. Input that is not part of any 
    *    match is left unchanged; each match is replaced in the result by the
    *    replacement string. The replacement string may contain references to
    *    capture groups. 
    *
    *    @param   replacement a string containing the replacement text.
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              a string containing the results of the find and replace.
    *    @draft ICU 2.4
    */
    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 
   /**
    * Replaces the first substring of the input that matches
    * the pattern with the replacement string.   This is a convenience
    * function that provides a complete find-and-replace operation.
    *
    * <p>This function first resets this RegexMatcher. It then scans the input string
    * looking for a match of the pattern. Input that is not part
    * of the match is appended directly to the result string; the match is replaced
    * in the result by the replacement string. The replacement string may contain
    * references to captured groups.</p>
    *
    * <p>The state of the matcher (the position at which a subsequent find()
    *    would begin) after completing a replaceFirst() is not specified.  The
    *    RegexMatcher should be reset before doing additional find() operations.</p>
    * 
    *    @param   replacement a string containing the replacement text.
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              a string containing the results of the find and replace.
    *    @draft ICU 2.4
    */
    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 
   /**
    *   Implements a replace operation intended to be used as part of an
    *   incremental find-and-replace.
@ -398,219 +628,6 @@ public:
    */
    virtual UnicodeString &appendTail(UnicodeString &dest); 
   /**
    *    Returns the index in the input string of the character following the
    *    text matched during the previous match operation.  
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed.
    *    @return the index of the last character matched, plus one.
    *   @draft ICU 2.4
    */
    virtual int32_t end(UErrorCode &status) const;
   /**
    *    Returns the index in the input string of the character following the
    *    text matched by the specified capture group during the previous match operation.
    *    @param group  the capture group number
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    *    @return  the index of the last character, plus one, of the text 
    *              captured by the specified group during the previous match operation. 
    *              Return -1 if the capture group was not part of the match.
    *    @draft ICU 2.4
    */
    virtual int32_t end(int group, UErrorCode &status) const; 
   /**
    *  Find the next pattern match in the input string.
    *  The find begins searching the input at the location following the end of
    *  the previous match, or at the start of the string if there is no previous match.
    *  If a match is found, <code>start(), end()</code> and <code>group()</code>
    *  will provide more information regarding the match.
    *  @return  TRUE if a match is found.
    *  @draft ICU 2.4
    */
    virtual UBool find();
   /**
    *   Resets this RegexMatcher and then attempts to find the next substring of the 
    *   input string that matches the pattern, starting at the specified index. 
    *
    *   @param   start       the position in the input string to begin the search
    *   @param   status      A reference to a UErrorCode to receive any errors.  
    *   @return  TRUE if a match is found.
    *   @draft ICU 2.4
    */
    virtual UBool find(int32_t start, UErrorCode &status); 
   /**
    *   Returns a string containing the text matched by the previous match. 
    *   If the pattern can match an empty string, an empty string may be returned.
    *   @param   status      A reference to a UErrorCode to receive any errors.  
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    *                        has been attempted or the last match failed. 
    *   @return  a string containing the matched input text.  
    *   @draft ICU 2.4
    */
    virtual UnicodeString group(UErrorCode &status) const;
   /**
    *    Returns a string containing the text captured by the given group
    *    during the previous match operation.  Group(0) is the entire match.
    *   
    *    @param group the capture group number
    *    @param   status     A reference to a UErrorCode to receive any errors.  
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
    *                        has been attempted or the last match failed and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
    *    @return the captured text
    *    @draft ICU 2.4
    */
    virtual UnicodeString group(int32_t group, UErrorCode &status) const; 
   /**
    *   Returns the number of capturing groups in this matcher's pattern.
    *   @return the number of capture groups
    *   @draft ICU 2.4
    */
    virtual int32_t groupCount() const;
   /**
    *   Returns the input string being matched.  The returned string is not a copy,
    *   but the live input string.  It should not be altered or deleted.
    *   @return the input string
    *   @draft ICU 2.4
    */
    virtual const UnicodeString &input() const; 
   /**
    *   Attempts to match the input string, starting from the beginning, against the pattern.
    *   Like the matches() method, this function always starts at the beginning of the input string;
    *   unlike that function, it does not require that the entire input string be matched.
    *
    *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
    *     <code>end()</code>, and <code>group()</code> functions.</p>
    *
    *    @param   status     A reference to a UErrorCode to receive any errors. 
    *    @return  TRUE if there is a match at the start of the input string.
    *    @draft ICU 2.4
    */
    virtual UBool lookingAt(UErrorCode &status);
   /**
    *   Attempts to match the entire input string against the pattern.
    *    @param   status     A reference to a UErrorCode to receive any errors. 
    *    @return TRUE if there is a match
    *    @draft ICU 2.4
    */
    virtual UBool matches(UErrorCode &status);
   /**
    *    Returns the pattern that is interpreted by this matcher.
    *    @return  the RegexPattern for this RegexMatcher
    *    @draft ICU 2.4
    */
    virtual const RegexPattern &pattern() const;
   /**
    *    Replaces every substring of the input that matches the pattern
    *    with the given replacement string.  This is a convenience function that
    *    provides a complete find-and-replace-all operation.
    *
    *    This method first resets this matcher. It then scans the input string
    *    looking for matches of the pattern. Input that is not part of any 
    *    match is left unchanged; each match is replaced in the result by the
    *    replacement string. The replacement string may contain references to
    *    capture groups. 
    *
    *    @param   replacement a string containing the replacement text.
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              a string containing the results of the find and replace.
    *    @draft ICU 2.4
    */
    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 
   /**
    * Replaces the first substring of the input that matches
    * the pattern with the replacement string.   This is a convenience
    * function that provides a complete find-and-replace operation.
    *
    * This function first resets this RegexMatcher. It then scans the input string
    * looking for a match of the pattern. Input that is not part
    * of the match is appended directly to the result string; the match is replaced
    * in the result by the replacement string. The replacement string may contain
    * references to captured groups.
    * 
    *    @param   replacement a string containing the replacement text.
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              a string containing the results of the find and replace.
    *    @draft ICU 2.4
    */
    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 
   /**
    *   Resets this matcher.  The effect is to remove any memory of previous matches,
    *       and to cause subsequent find() operations to begin at the beginning of
    *       the input string.
    *
    *   @return this RegexMatcher.
    *   @draft ICU 2.4
    */
    virtual RegexMatcher &reset();
   /**
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
    *     to be reused, which is more efficient than creating a new RegexMatcher for
    *     each input string to be processed.
    *   @param input  the new input string for this matcher.
    *   @return this RegexMatcher.
    *   @draft ICU 2.4
    */
    virtual RegexMatcher &reset(const UnicodeString &input);  
   /**
    *   Returns the index in the input string of the start of the text matched 
    *   during the previous match operation. 
    *    @param   status      a reference to a UErrorCode to receive any errors. 
    *    @return              The position in the input string of the start of the last match.
    *    @draft ICU 2.4
    */
    virtual int32_t start(UErrorCode &status) const;
   /**
    *   Returns the index in the input string of the start of the text matched by the
    *    specified capture group during the previous match operation.  Return -1 if
    *    the capture group exists in the pattern, but was not part of the last match.
    *
    *    @param  group       the capture group number
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible 
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
    *                        attempted or the last match failed, and
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
    *    @return the start position of substring matched by the specified group.
    *    @draft ICU 2.4
    */
    virtual int32_t start(int group, UErrorCode &status) const;
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -626,8 +643,6 @@ public:
    */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
    static const char fgClassID;
 private:
    // Constructors and other object boilerplate are private.
    // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
@ -658,6 +673,13 @@ private:
    UVector             *fCaptureStarts;
    UVector             *fCaptureEnds;
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char   fgClassID;
 };  
 U_NAMESPACE_END
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -368,7 +368,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
+    REGEX_FIND("[{ab}]", "a");
    }
    exit(1);
 #endif
@ -436,6 +436,9 @@ void RegexTest::Basic() {
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    // Set contains only a string, no individual chars.
    REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
    //
    //   OR operator in patterns
    //
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {
    delete pat1;
    //  split, with a pattern with (capture)
    pat1 = RegexPattern::compile("<(\\w*)>",  pe, status);
    REGEX_CHECK_STATUS;
    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");
    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==6);
    REGEX_ASSERT(fields[0]=="  ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="b");
    REGEX_ASSERT(fields[4]=="the time");
    REGEX_ASSERT(fields[5]=="c");
    REGEX_ASSERT(fields[6]=="");
    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==4);
    REGEX_ASSERT(fields[0]=="  ");
    REGEX_ASSERT(fields[1]=="a");
    REGEX_ASSERT(fields[2]=="Now is ");
    REGEX_ASSERT(fields[3]=="the time<c>");
    delete pat1;
    pat1 = RegexPattern::compile("([-,])",  pe, status);
    REGEX_CHECK_STATUS;
    n = pat1->split("1-10,20", fields, 10, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(n==5);
    REGEX_ASSERT(fields[0]=="1");
    REGEX_ASSERT(fields[1]=="-");
    REGEX_ASSERT(fields[2]=="10");
    REGEX_ASSERT(fields[3]==",");
    REGEX_ASSERT(fields[4]=="20");
    delete pat1;
 }