ICU-105 Regular Expressions, changes from code review

X-SVN-Rev: 10294
2002-11-19 19:31:03 +00:00 · 2002-11-19 19:31:03 +00:00 · 24bf088281
commit 24bf088281
parent bf1f6b1213
9 changed files with 556 additions and 353 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_PROPERTY_SYNTAX",
    "U_REGEX_UNIMPLEMENTED",
    "U_REGEX_MISMATCHED_PAREN",
-    "U_REGEX_MATCH_MODE_ERROR"
 };

 U_CAPI const char * U_EXPORT2
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -500,18 +500,17 @@ typedef enum UErrorCode {
    /*
     * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errrs
     */
-     U_REGEX_ERROR_START=0x10300,
-     U_REGEX_INTERNAL_ERROR,
-     U_REGEX_RULE_SYNTAX,
-     U_REGEX_INVALID_STATE,
-     U_REGEX_BAD_ESCAPE_SEQUENCE,
-     U_REGEX_PROPERTY_SYNTAX,
-     U_REGEX_UNIMPLEMENTED,
-     U_REGEX_MISMATCHED_PAREN,
-     U_REGEX_MATCH_MODE_ERROR,
-     U_REGEX_ERROR_LIMIT,
+     U_REGEX_ERROR_START=0x10300,          /**< Start of codes indicating Regexp failures */
+     U_REGEX_INTERNAL_ERROR,               /**< An internal error (bug) was detected.     */
+     U_REGEX_RULE_SYNTAX,                  /**< Syntax error in regexp pattern.           */
+     U_REGEX_INVALID_STATE,                /**< RegexMatcher in invalid state for requested operation */
+     U_REGEX_BAD_ESCAPE_SEQUENCE,          /**< Unrecognized backslash escape sequence in pattern */
+     U_REGEX_PROPERTY_SYNTAX,              /**< Incorrect Unicode property                */
+     U_REGEX_UNIMPLEMENTED,                /**< Use of regexp feature that is not yet implemented. */
+     U_REGEX_MISMATCHED_PAREN,             /**< Incorrectly nested parentheses in regexp pattern. */
+     U_REGEX_ERROR_LIMIT,                  /**< This must always be the last value to indicate the limit for regexp errors */

-    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
+    U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT      /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
 } UErrorCode;

 /* Use the following to determine if an UErrorCode represents */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -28,8 +28,6 @@
 #include "ucln_in.h"
 #include "mutex.h"

-#include "stdio.h"      // TODO:  Get rid of this
-  
 #include "regeximp.h"
 #include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
@ -40,7 +38,6 @@
 U_NAMESPACE_BEGIN

 const char       RegexCompile::fgClassID=0;
-static const int RESCAN_DEBUG = 0;

 //----------------------------------------------------------------------------------------
 //
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)

    //
    //  Set up the constant (static) Unicode Sets.
+    //    TODO:  something cleaner for that -128 constant.
    //
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128],   gRuleSet_rule_char_pattern,  status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern,      status);
@ -282,14 +280,12 @@ void    RegexCompile::compile(
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
-        if (RESCAN_DEBUG) {
-            printf( "char, line, col = (\'%c\', %d, %d)    state=%s ",
+        REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d)    state=%s ",
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
-        }

        for (;;) {    // loop through table rows belonging to this state, looking for one
                      //   that matches the current input char.
-            if (RESCAN_DEBUG) { printf( ".");}
+            REGEX_SCAN_DEBUG_PRINTF( ".");
            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not quoted, and
@ -323,7 +319,7 @@ void    RegexCompile::compile(
            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
-        if (RESCAN_DEBUG) { printf( "\n");}
+        REGEX_SCAN_DEBUG_PRINTF("\n");

        //
        // We've found the row of the state table that matches the current input
@ -340,7 +336,7 @@ void    RegexCompile::compile(
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_REGEX_INTERNAL_ERROR);
-                // printf( "RegexCompile::parse() - state stack overflow.\n");
+                REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
@ -369,6 +365,36 @@ void    RegexCompile::compile(

    }

+    //
+    // The pattern has now been read and processed, and the compiled code generated.
+    //
+
+    //
+    // Compute the number of digits required for the largest capture group number.
+    //
+    fRXPat->fMaxCaptureDigits = 1;
+    int32_t  n = 10;
+    for (;;) {
+        if (n > fRXPat->fNumCaptureGroups) {
+            break;
+        }
+        fRXPat->fMaxCaptureDigits++;
+        n *= 10;
+    }
+
+    //
+    // A stupid bit of non-sense to prevent code coverage testing from complaining
+    //   about the pattern.dump() debug function.  Go through the motions of dumping,
+    //   even though, without the #define set, it will do nothing.
+    //
+#ifndef REGEX_DUMP_DEBUG
+    static UBool phonyDumpDone = FALSE;
+    if (phonyDumpDone==FALSE) {
+        fRXPat->dump();
+        phonyDumpDone = TRUE;
+    }
+#endif
+
 }


@ -1094,27 +1120,39 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
    if (theSet == NULL) {
        return;
    }
-    if (theSet->size() > 1) {
-        //  The set contains two or more chars.
+    int32_t  setSize = theSet->size();
+    UChar32  firstSetChar = theSet->charAt(0);
+    if (firstSetChar == -1) {
+        // Sets that contain only strings, but no individual chars,
+        // will end up here.   TODO:  figure out what to do with sets containing strings.
+        setSize = 0;
+    }
+
+    switch (setSize) {
+    case 0:      // Set of no elements.   Always fails to match.  
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
+        break;
+        
+    case 1:
+        {
+            // The set contains only a single code point.  Put it into
+            //   the compiled pattern as a single char operation rather
+            //   than a set, and discard the set itself.
+            int32_t  charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
+            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
+            delete theSet;
+        }
+        break;
+        
+    default: 
+        {
+            //  The set contains two or more chars.  (the normal case)
            //  Put it into the compiled pattern as a set.
            int32_t setNumber = fRXPat->fSets->size();
            fRXPat->fSets->addElement(theSet, *fStatus);
            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
        }
-    else
-    {
-        // The set contains only a single code point.  Put it into
-        //   the compiled pattern as a single char operation rather
-        //   than a set, and discard the set itself.
-        UChar32  c = theSet->charAt(0);
-        if (c == -1) {
-            // Set contained no chars.  Stuff an invalid char that can't match.
-            c = 0x1fffff;
-        }
-        int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
-        fRXPat->fCompiledPat->addElement(charToken, *fStatus);
-        delete theSet;
    }
 }

@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
-        printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
+        REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        error(localStatus);
        delete uset;
        return NULL;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -28,8 +28,6 @@
 U_NAMESPACE_BEGIN


-static const UBool REGEX_DEBUG = TRUE;
-
 //--------------------------------------------------------------------------------
 //
 //  class RegexCompile    Contains the regular expression compiler.
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -13,13 +13,45 @@
 #define _REGEXIMP_H


+//
+//  debugging support.  Enable one or more of the #defines immediately following
+//
+//#define REGEX_SCAN_DEBUG
+#define REGEX_DUMP_DEBUG
+//#define REGEX_RUN_DEBUG
+//  End of #defines intended to be directly set.
+
+#ifdef REGEX_SCAN_DEBUG
+#define REGEX_SCAN_DEBUG_PRINTF printf
+#else
+#define REGEX_SCAN_DEBUG_PRINTF
+#endif
+
+#if defined(REGEX_DUMP_DEBUG) || defined(REGEX_RUN_DEBUG)
+#define REGEX_DUMP_DEBUG_PRINTF printf
+#else
+#define REGEX_DUMP_DEBUG_PRINTF
+#endif
+
+#ifdef REGEX_RUN_DEBUG
+#define REGEX_RUN_DEBUG_PRINTF printf
+//  (Run debugging calls dumpOp(), so REGEX_DUMP_DEBUG_PRINTF is enabled for it by the test above.)
+#else
+#define REGEX_RUN_DEBUG_PRINTF
+#endif
+
+#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
+#include <stdio.h>
+#endif
+
+
 //
 //  Opcode types     In the compiled form of the regexp, these are the type, or opcodes,
 //                   of the entries.
 //
 enum {
     URX_RESERVED_OP   = 0,
-     URX_UNUSED1       = 1,
+     URX_BACKTRACK     = 1,
     URX_END           = 2,
     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
     URX_STRING        = 4,    // Value field is index of string start
@ -52,7 +84,7 @@ enum {
 //   Used for debug printing only.
 #define URX_OPCODE_NAMES       \
        "URX_RESERVED_OP",     \
-        "URX_UNUSED1",         \
+        "URX_BACKTRACK",       \
        "END",                 \
        "ONECHAR",             \
        "STRING",              \
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {



-UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const {
-    int32_t  s = start(group, status);
-    int32_t  e = end(group, status);
+UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
+    int32_t  s = start(groupNum, status);
+    int32_t  e = end(groupNum, status);

    // Note:  calling start() and end() above will do all necessary checking that
    //        the group number is OK and that a match exists.  status will be set.
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
    int32_t     opType;                //    the opcode
    int32_t     opValue;               //    and the operand value.

+    #ifdef REGEX_RUN_DEBUG
+    {
+        printf("MatchAt(startIdx=%d)\n", startIdx);
+        printf("Original Pattern: ");
+        int i;
+        for (i=0; i<fPattern->fPattern.length(); i++) {
+            printf("%c", fPattern->fPattern.charAt(i));
+        }
+        printf("\n");
+        printf("Input String: ");
+        for (i=0; i<fInput->length(); i++) {
+            UChar c = fInput->charAt(i);
+            if (c<32 || c>256) {
+                c = '.';
+            }
+            printf("%c", c);
+        }
+        printf("\n");
+        printf("\n");
+        printf("PatLoc  inputIdx  char\n");
+    }
+    #endif

    if (U_FAILURE(status)) {
        return;
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
        op      = pat->elementAti(patIdx);
        opType  = URX_TYPE(op);
        opValue = URX_VAL(op);
-        // printf("%d   %d  \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx));
+        #ifdef REGEX_RUN_DEBUG
+            printf("inputIdx=%d   inputChar=%c    ", inputIdx, fInput->char32At(inputIdx));
+            fPattern->dumpOp(patIdx);
+        #endif
        patIdx++;

        switch (opType) {
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            break;


+        case URX_BACKTRACK:
+            // Force a backtrack.  In some circumstances, the pattern compiler
+            //   will notice that the pattern can't possibly match anything, and will
+            //   emit one of these at that point.
+            backTrack(inputIdx, patIdx);
+            break;
+
+
        case URX_ONECHAR:
            {
                UChar32 inputChar = fInput->char32At(inputIdx);
@ -909,6 +942,11 @@ breakFromLoop:
        fLastMatchEnd = fMatchEnd;
        fMatchStart   = startIdx;
        fMatchEnd     = inputIdx;
+        REGEX_RUN_DEBUG_PRINTF("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd);
+        }
+    else
+    {
+        REGEX_RUN_DEBUG_PRINTF("No match\n\n");
    }
    return;
 }
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -18,8 +18,6 @@
 #include "regexcmp.h"
 #include "regeximp.h"

-#include "stdio.h"    // TODO:  get rid of this...
-
 U_NAMESPACE_BEGIN

 //--------------------------------------------------------------------------
@ -197,7 +195,7 @@ UBool   RegexPattern::operator ==(const RegexPattern &other) const {
 //---------------------------------------------------------------------
 RegexPattern  *RegexPattern::compile(
                             const UnicodeString &regex,
-                             int32_t              flags,
+                             uint32_t             flags,
                             UParseError          &pe,
                             UErrorCode           &status)  {

@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
 //   flags
 //
 //---------------------------------------------------------------------
-int32_t RegexPattern::flags() const {
+uint32_t RegexPattern::flags() const {
    return fFlags;
 }

@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
 //---------------------------------------------------------------------
 //
 //   split
-//            TODO:  perl returns captured strings intermixed with the
-//                   fields.  Should we do this too?
 //
 //---------------------------------------------------------------------
 int32_t  RegexPattern::split(const UnicodeString &input,
@ -383,10 +379,28 @@ int32_t  RegexPattern::split(const UnicodeString &input,
            int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
            dest[i].setTo(input, nextOutputStringStart, fieldLen);
            nextOutputStringStart = fMatcher->fMatchEnd;
+
+            // If the delimiter pattern has capturing parentheses, the captured
+            //  text goes out into the next n destination strings.
+            int32_t groupNum;
+            for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
+                if (i==destCapacity-1) {
+                    break;
+                }
+                i++;
+                dest[i] = fMatcher->group(groupNum, status);
+            }
+
            if (nextOutputStringStart == inputLen) {
                // The delimiter was at the end of the string.  We're done.
                break;
            }
+
+            if (i==destCapacity-1) {
+                // We've filled up the last output string with capture group data.
+                //  Give back the last string, to be used for the remainder of the input.
+                i--;
+            }
        }
        else
        {
@ -410,35 +424,16 @@ int32_t  RegexPattern::split(const UnicodeString &input,
 //---------------------------------------------------------------------
 static const char *opNames[] = {URX_OPCODE_NAMES};

-void   RegexPattern::dump() {
-    int      index;
-    int      i;
-    UChar    c;
-    int32_t  op;
-    int32_t  pinnedType;
-    int32_t  type;
-    int32_t  val;
-    int32_t  stringStart;
-
-
-    printf("Original Pattern:  ");
-    for (i=0; i<fPattern.length(); i++) {
-        printf("%c", fPattern.charAt(i));
-    }
-    printf("\n");
-    printf("Pattern Valid?:     %s\n", fBadState? "no" : "yes");
-    printf("\nIndex   Binary     Type             Operand\n"
-           "-------------------------------------------\n");
-    for (index = 0; ; index++) {
-        op         = fCompiledPat->elementAti(index);
-        val        = URX_VAL(op);
-        type       = URX_TYPE(op);
-        pinnedType = type;
+void   RegexPattern::dumpOp(int32_t index) const {
+    int32_t op          = fCompiledPat->elementAti(index);
+    int32_t val         = URX_VAL(op);
+    int32_t type        = URX_TYPE(op);
+    int32_t pinnedType  = type;
    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }
    
-        printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
+    REGEX_DUMP_DEBUG_PRINTF("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
@ -446,12 +441,12 @@ void   RegexPattern::dump() {
    case URX_BACKSLASH_A:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
+    case URX_END:
        // Types with no operand field of interest.
        break;
        
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
-        case URX_SETREF:
    case URX_STATIC_SETREF:
    case URX_STATE_SAVE:
    case URX_JMP:
@ -461,37 +456,70 @@ void   RegexPattern::dump() {
    case URX_BACKSLASH_Z:
    case URX_CARET:
    case URX_DOLLAR:
+    case URX_STRING_LEN:
        // types with an integer operand field.
-            printf("%d", val);
+        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;
        
    case URX_ONECHAR:
-            printf("%c", val<256?val:'?');
+        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;
        
    case URX_STRING:
-            stringStart = val;
-            break;
-
-        case URX_STRING_LEN:
-            for (i=stringStart; i<stringStart+val; i++) {
-                c = fLiteralText[i];
-                if (c >= 256) {c = '?';};
-                printf("%c", c);
+        {
+            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
+            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
+            int32_t length = URX_VAL(lengthOp);
+            int32_t i;
+            for (i=val; i<val+length; i++) {
+                UChar c = fLiteralText[i];
+                if (c < 32 || c >= 256) {c = '.';}
+                REGEX_DUMP_DEBUG_PRINTF("%c", c);
+            }
        }
        break;

-        case URX_END:
-            goto breakFromLoop;
+    case URX_SETREF:    // Operand is an index into fSets: print the index, then the set's pattern.
+        {
+            REGEX_DUMP_DEBUG_PRINTF("%d ", val);
+            UnicodeString s;
+            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
+            set->toPattern(s, TRUE);
+            for (int32_t i=0; i<s.length(); i++) {
+                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
+            }
+        }
+        break;    // Without this the case falls into default: and appends "??????".
+
        
    default:
-            printf("??????");
+        REGEX_DUMP_DEBUG_PRINTF("??????");
        break;
    }
-        printf("\n");
+    REGEX_DUMP_DEBUG_PRINTF("\n");
 }
-breakFromLoop:
-    printf("\n\n");
+
+
+
+
+
+
+void   RegexPattern::dump() const {
+    int      index;
+    int      i;
+
+    REGEX_DUMP_DEBUG_PRINTF("Original Pattern:  ");
+    for (i=0; i<fPattern.length(); i++) {
+        REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
+    }
+    REGEX_DUMP_DEBUG_PRINTF("\n");
+    REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?:     %s\n", fBadState? "no" : "yes");
+    REGEX_DUMP_DEBUG_PRINTF("\nIndex   Binary     Type             Operand\n"
+           "-------------------------------------------\n");
+    for (index = 0; index<fCompiledPat->size(); index++) {
+        dumpOp(index);
+    }
+    REGEX_DUMP_DEBUG_PRINTF("\n\n");
 };

 const char RegexPattern::fgClassID = 0;
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -81,6 +81,8 @@ enum {
  * to be applied to input text, and a few convenience methods for simple common
  * uses of regular expressions.
  *
+  * <p>Class RegexPattern is not intended to be subclassed.</p>
+  *
  * @draft ICU 2.4
  */
 class U_I18N_API RegexPattern: public UObject {
@ -192,7 +194,7 @@ public:
    *    @draft ICU 2.4
    */
    static RegexPattern *compile( const UnicodeString &regex,
-        int32_t              flags,
+        uint32_t             flags,
        UParseError          &pe,
        UErrorCode           &status); 

@ -202,7 +204,7 @@ public:
    *     @return  the match mode flags
    *     @draft ICU 2.4
    */
-    virtual int32_t flags() const;
+    virtual uint32_t flags() const;
    
   /*
    *  Creates a RegexMatcher that will match the given input against this pattern.  The
@ -275,7 +277,7 @@ public:
    //
    //   dump   Debug function, displays the compiled form of a pattern.
    //
-    void dump();
+    void dump() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -291,14 +293,12 @@ public:
    */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
    
-    static const char fgClassID;
-
 private:
    //
    //  Implementation Data
    //
    UnicodeString   fPattern;      // The original pattern string.
-    int32_t         fFlags;        // The flags used when compiling the pattern.
+    uint32_t        fFlags;        // The flags used when compiling the pattern.
                                   //   
    UVector         *fCompiledPat; // The compiled pattern.
    UnicodeString   fLiteralText;  // Any literal string data from the pattern, 
@ -317,6 +317,12 @@ private:
    UnicodeSet    **fStaticSets;  // Ptr to static (shared) sets for predefined
                                    //   regex character classes, e.g. Word.

+    /**
+     * The address of this static class variable serves as this class's ID
+     * for ICU "poor man's RTTI".
+     */
+    static const char fgClassID;
+
    friend class RegexCompile;
    friend class RegexMatcher;

@ -325,6 +331,7 @@ private:
    //
    void        init();            // Common initialization, for use by constructors.
    void        zap();             // Common cleanup
+    void        dumpOp(int32_t index) const;



@ -343,6 +350,8 @@ private:
  *  input text to which the expression can be applied.  It includes methods
  *  for testing for matches, and for find and replace operations.
  *
+  * <p>Class RegexMatcher is not intended to be subclassed.</p>
+  *
  * @draft ICU 2.4
  */
  class U_I18N_API RegexMatcher: public UObject {
@ -355,6 +364,227 @@ public:
    */
    virtual ~RegexMatcher();

+    
+   /**
+    *   Attempts to match the entire input string against the pattern.
+    *    @param   status     A reference to a UErrorCode to receive any errors. 
+    *    @return TRUE if there is a match
+    *    @draft ICU 2.4
+    */
+    virtual UBool matches(UErrorCode &status);
+    
+    
+    
+   /**
+    *   Attempts to match the input string, starting from the beginning, against the pattern.
+    *   Like the matches() method, this function always starts at the beginning of the input string;
+    *   unlike that function, it does not require that the entire input string be matched.
+    *
+    *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
+    *     <code>end()</code>, and <code>group()</code> functions.</p>
+    *
+    *    @param   status     A reference to a UErrorCode to receive any errors. 
+    *    @return  TRUE if there is a match at the start of the input string.
+    *    @draft ICU 2.4
+    */
+    virtual UBool lookingAt(UErrorCode &status);
+    
+    
+   /**
+    *  Find the next pattern match in the input string.
+    *  The find begins searching the input at the location following the end of
+    *  the previous match, or at the start of the string if there is no previous match.
+    *  If a match is found, <code>start(), end()</code> and <code>group()</code>
+    *  will provide more information regarding the match.
+    *  <p>Note that if the input string is changed by the application, 
+    *     use find(startPos, status) instead of find(), because the saved starting
+    *     position may not be valid with the altered input string.</p>
+    *  @return  TRUE if a match is found.
+    *  @draft ICU 2.4
+    */
+    virtual UBool find();
+    
+    
+   /**
+    *   Resets this RegexMatcher and then attempts to find the next substring of the 
+    *   input string that matches the pattern, starting at the specified index. 
+    *
+    *   @param   start     the position in the input string to begin the search
+    *   @param   status    A reference to a UErrorCode to receive any errors.  
+    *   @return  TRUE if a match is found.
+    *   @draft ICU 2.4
+    */
+    virtual UBool find(int32_t start, UErrorCode &status); 
+    
+    
+   /*
+    *   Returns a string containing the text matched by the previous match. 
+    *   If the pattern can match an empty string, an empty string may be returned.
+    *   @param   status      A reference to a UErrorCode to receive any errors.  
+    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
+    *                        has been attempted or the last match failed. 
+    *   @return  a string containing the matched input text.  
+    *   @draft ICU 2.4
+    */
+    virtual UnicodeString group(UErrorCode &status) const;
+    
+    
+   /**
+    *    Returns a string containing the text captured by the given group
+    *    during the previous match operation.  Group(0) is the entire match.
+    *   
+    *    @param group the capture group number
+    *    @param   status     A reference to a UErrorCode to receive any errors.  
+    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
+    *                        has been attempted or the last match failed and
+    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
+    *    @return the captured text
+    *    @draft ICU 2.4
+    */
+    virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 
+    
+    
+   /**
+    *   Returns the number of capturing groups in this matcher's pattern.
+    *   @return the number of capture groups
+    *   @draft ICU 2.4
+    */
+    virtual int32_t groupCount() const;
+    
+    
+   /**
+    *   Returns the index in the input string of the start of the text matched 
+    *   during the previous match operation. 
+    *    @param   status      a reference to a UErrorCode to receive any errors. 
+    *    @return              The position in the input string of the start of the last match.
+    *    @draft ICU 2.4
+    */
+    virtual int32_t start(UErrorCode &status) const;
+    
+    
+   /**
+    *   Returns the index in the input string of the start of the text matched by the
+    *    specified capture group during the previous match operation.  Return -1 if
+    *    the capture group exists in the pattern, but was not part of the last match.
+    *
+    *    @param  group       the capture group number
+    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible 
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed, and
+    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
+    *    @return the start position of substring matched by the specified group.
+    *    @draft ICU 2.4
+    */
+    virtual int32_t start(int group, UErrorCode &status) const;
+    
+    
+   /**
+    *    Returns the index in the input string of the character following the
+    *    text matched during the previous match operation.  
+    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed.
+    *    @return the index of the last character matched, plus one.
+    *   @draft ICU 2.4
+    */
+    virtual int32_t end(UErrorCode &status) const;
+    
+    
+   /**
+    *    Returns the index in the input string of the character following the
+    *    text matched by the specified capture group during the previous match operation.
+    *    @param group  the capture group number
+    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible 
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed and
+    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
+    *    @return  the index of the last character, plus one, of the text 
+    *              captured by the specified group during the previous match operation. 
+    *              Return -1 if the capture group was not part of the match.
+    *    @draft ICU 2.4
+    */
+    virtual int32_t end(int group, UErrorCode &status) const; 
+    
+    
+   /**
+    *   Resets this matcher.  The effect is to remove any memory of previous matches,
+    *       and to cause subsequent find() operations to begin at the beginning of
+    *       the input string.
+    *
+    *   @return this RegexMatcher.
+    *   @draft ICU 2.4
+    */
+    virtual RegexMatcher &reset();
+    
+    
+   /**
+    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
+    *     to be reused, which is more efficient than creating a new RegexMatcher for
+    *     each input string to be processed.
+    *   @return this RegexMatcher.
+    *   @draft ICU 2.4
+    */
+    virtual RegexMatcher &reset(const UnicodeString &input);  
+    
+    
+   /**
+    *   Returns the input string being matched.  The returned string is not a copy,
+    *   but the live input string.  It should not be altered or deleted.
+    *   @return the input string
+    *   @draft ICU 2.4
+    */
+    virtual const UnicodeString &input() const; 
+    
+    
+   /**
+    *    Returns the pattern that is interpreted by this matcher.
+    *    @return  the RegexPattern for this RegexMatcher
+    *    @draft ICU 2.4
+    */
+    virtual const RegexPattern &pattern() const;
+    
+    
+   /**
+    *    Replaces every substring of the input that matches the pattern
+    *    with the given replacement string.  This is a convenience function that
+    *    provides a complete find-and-replace-all operation.
+    *
+    *    This method first resets this matcher. It then scans the input string
+    *    looking for matches of the pattern. Input that is not part of any 
+    *    match is left unchanged; each match is replaced in the result by the
+    *    replacement string. The replacement string may contain references to
+    *    capture groups. 
+    *
+    *    @param   replacement a string containing the replacement text.
+    *    @param   status      a reference to a UErrorCode to receive any errors. 
+    *    @return              a string containing the results of the find and replace.
+    *    @draft ICU 2.4
+    */
+    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 
+    
+    
+   /**
+    * Replaces the first substring of the input that matches
+    * the pattern with the replacement string.   This is a convenience
+    * function that provides a complete find-and-replace operation.
+    *
+    * <p>This function first resets this RegexMatcher. It then scans the input string
+    * looking for a match of the pattern. Input that is not part
+    * of the match is appended directly to the result string; the match is replaced
+    * in the result by the replacement string. The replacement string may contain
+    * references to captured groups.</p>
+    *
+    * <p>The state of the matcher (the position at which a subsequent find()
+    *    would begin) after completing a replaceFirst() is not specified.  The
+    *    RegexMatcher should be reset before doing additional find() operations.</p>
+    * 
+    *    @param   replacement a string containing the replacement text.
+    *    @param   status      a reference to a UErrorCode to receive any errors. 
+    *    @return              a string containing the results of the find and replace.
+    *    @draft ICU 2.4
+    */
+    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 
+    
   /**
    *   Implements a replace operation intended to be used as part of an
    *   incremental find-and-replace.
@ -399,219 +629,6 @@ public:
    virtual UnicodeString &appendTail(UnicodeString &dest); 
    

-   /**
-    *    Returns the index in the input string of the character following the
-    *    text matched during the previous match operation.  
-    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
-    *                        errors are  U_REGEX_INVALID_STATE if no match has been
-    *                        attempted or the last match failed.
-    *    @return the index of the last character matched, plus one.
-    *   @draft ICU 2.4
-    */
-    virtual int32_t end(UErrorCode &status) const;
-    
-    
-   /**
-    *    Returns the index in the input string of the character following the
-    *    text matched by the specified capture group during the previous match operation.
-    *    @param group  the capture group number
-    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible 
-    *                        errors are  U_REGEX_INVALID_STATE if no match has been
-    *                        attempted or the last match failed and
-    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
-    *    @return  the index of the last character, plus one, of the text 
-    *              captured by the specifed group during the previous match operation. 
-    *              Return -1 if the capture group was not part of the match.
-    *    @draft ICU 2.4
-    */
-    virtual int32_t end(int group, UErrorCode &status) const; 
-    
-    
-   /**
-    *  Find the next pattern match in the input string.
-    *  The find begins searching the input at the location following the end of
-    *  the previous match, or at the start of the string if there is no previous match.
-    *  If a match is found, <code>start(), end()</code> and <code>group()</code>
-    *  will provide more information regarding the match.
-    *  @return  TRUE if a match is found.
-    *  @draft ICU 2.4
-    */
-    virtual UBool find();
-    
-    
-   /**
-    *   Resets this RegexMatcher and then attempts to find the next substring of the 
-    *   input string that matches the pattern, starting at the specified index. 
-    *
-    *   @param status the position in the input string to begin the search
-    *   @param   status      A reference to a UErrorCode to receive any errors.  
-    *   @return  TRUE if a match is found.
-    *   @draft ICU 2.4
-    */
-    virtual UBool find(int32_t start, UErrorCode &status); 
-    
-    
-   /*
-    *   Returns a string containing the text matched by the previous match. 
-    *   If the pattern can match an empty string, an empty string may be returned.
-    *   @param   status      A reference to a UErrorCode to receive any errors.  
-    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
-    *                        has been attempted or the last match failed. 
-    *   @return  a string containing the matched input text.  
-    *   @draft ICU 2.4
-    */
-    virtual UnicodeString group(UErrorCode &status) const;
-    
-    
-   /**
-    *    Returns a string containing the text captured by the given group
-    *    during the previous match operation.  Group(0) is the entire match.
-    *   
-    *    @param group the capture group number
-    *    @param   status     A reference to a UErrorCode to receive any errors.  
-    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
-    *                        has been attempted or the last match failed and
-    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
-    *    @return the captured text
-    *    @draft ICU 2.4
-    */
-    virtual UnicodeString group(int32_t group, UErrorCode &status) const; 
-    
-    
-   /**
-    *   Returns the number of capturing groups in this matcher's pattern.
-    *   @return the number of capture groups
-    *   @draft ICU 2.4
-    */
-    virtual int32_t groupCount() const;
-    
-    
-   /**
-    *   Returns the input string being matched.  The returned string is not a copy,
-    *   but the live input string.  It should not be altered or deleted.
-    *   @return the input string
-    *   @draft ICU 2.4
-    */
-    virtual const UnicodeString &input() const; 
-    
-    
-   /**
-    *   Attempts to match the input string, starting from the beginning, against the pattern.
-    *   Like the matches() method, this function always starts at the beginning of the input string;
-    *   unlike that function, it does not require that the entire input string be matched.
-    *
-    *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
-    *     <code>end()</code>, and <code>group()</code> functions.</p>
-    *
-    *    @param   status     A reference to a UErrorCode to receive any errors. 
-    *    @return  TRUE if there is a match at the start of the input string.
-    *    @draft ICU 2.4
-    */
-    virtual UBool lookingAt(UErrorCode &status);
-    
-    
-   /**
-    *   Attempts to match the entire input string against the pattern.
-    *    @param   status     A reference to a UErrorCode to receive any errors. 
-    *    @return TRUE if there is a match
-    *    @draft ICU 2.4
-    */
-    virtual UBool matches(UErrorCode &status);
-    
-    
-   /**
-    *    Returns the pattern that is interpreted by this matcher.
-    *    @return  the RegexPattern for this RegexMatcher
-    *    @draft ICU 2.4
-    */
-    virtual const RegexPattern &pattern() const;
-    
-    
-   /**
-    *    Replaces every substring of the input that matches the pattern
-    *    with the given replacement string.  This is a convenience function that
-    *    provides a complete find-and-replace-all operation.
-    *
-    *    This method first resets this matcher. It then scans the input string
-    *    looking for matches of the pattern. Input that is not part of any 
-    *    match is left unchanged; each match is replaced in the result by the
-    *    replacement string. The replacement string may contain references to
-    *    capture groups. 
-    *
-    *    @param   replacement a string containing the replacement text.
-    *    @param   status      a reference to a UErrorCode to receive any errors. 
-    *    @return              a string containing the results of the find and replace.
-    *    @draft ICU 2.4
-    */
-    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 
-    
-    
-   /**
-    * Replaces the first substring of the input that matches
-    * the pattern with the replacement string.   This is a convenience
-    * function that provides a complete find-and-replace operation.
-    *
-    * This function first resets this RegexMatcher. It then scans the input string
-    * looking for a match of the pattern. Input that is not part
-    * of the match is appended directly to the result string; the match is replaced
-    * in the result by the replacement string. The replacement string may contain
-    * references to captured groups.
-    * 
-    *    @param   replacement a string containing the replacement text.
-    *    @param   status      a reference to a UErrorCode to receive any errors. 
-    *    @return              a string containing the results of the find and replace.
-    *    @draft ICU 2.4
-    */
-    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 
-    
-    
-   /**
-    *   Resets this matcher.  The effect is to remove any memory of previous matches,
-    *       and to cause subsequent find() operations to begin at the beginning of
-    *       the input string.
-    *
-    *   @return this RegexMatcher.
-    *   @draft ICU 2.4
-    */
-    virtual RegexMatcher &reset();
-    
-    
-   /**
-    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
-    *     to be reused, which is more efficient than creating a new RegexMatcher for
-    *     each input string to be processed.
-    *   @return this RegexMatcher.
-    *   @draft ICU 2.4
-    */
-    virtual RegexMatcher &reset(const UnicodeString &input);  
-    
-    
-   /**
-    *   Returns the index in the input string of the start of the text matched 
-    *   during the previous match operation. 
-    *    @param   status      a reference to a UErrorCode to receive any errors. 
-    *    @return              The position in the input string of the start of the last match.
-    *    @draft ICU 2.4
-    */
-    virtual int32_t start(UErrorCode &status) const;
-    
-    
-   /**
-    *   Returns the index in the input string of the start of the text matched by the
-    *    specified capture group during the previous match operation.  Return -1 if
-    *    the capture group exists in the pattern, but was not part of the last match.
-    *
-    *    @param  group       the capture group number
-    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible 
-    *                        errors are  U_REGEX_INVALID_STATE if no match has been
-    *                        attempted or the last match failed, and
-    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
-    *    @return the start position of substring matched by the specified group.
-    *    @draft ICU 2.4
-    */
-    virtual int32_t start(int group, UErrorCode &status) const;
-    
-
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
@ -626,8 +643,6 @@ public:
    */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
    
-    static const char fgClassID;
-
 private:
    // Constructors and other object boilerplate are private.
    // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
@ -658,6 +673,13 @@ private:
    UVector             *fCaptureStarts;
    UVector             *fCaptureEnds;

+    /**
+     * The address of this static class variable serves as this class's ID
+     * for ICU "poor man's RTTI".
+     */
+    static const char   fgClassID;
+
+
 };  

 U_NAMESPACE_END
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -368,7 +368,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
+    REGEX_FIND("[{ab}]", "a");
    }
    exit(1);
 #endif
@ -436,6 +436,9 @@ void RegexTest::Basic() {
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

+    // The set contains only the multi-character string {ab} and no individual chars, so "a" alone must not match.
+    REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
+
    //
    //   OR operator in patterns
    //
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {

    delete pat1;

+    //  split, using a pattern that contains a capture group; the captured text is returned as extra fields.
+    pat1 = RegexPattern::compile("<(\\w*)>",  pe, status);
+    REGEX_CHECK_STATUS;
+
+    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[6]=="");
+
+    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(fields[0]=="  ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[6]=="");
+
+    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==4);
+    REGEX_ASSERT(fields[0]=="  ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="the time<c>");
+    delete pat1;
+
+    pat1 = RegexPattern::compile("([-,])",  pe, status);
+    REGEX_CHECK_STATUS;
+    n = pat1->split("1-10,20", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]=="1");
+    REGEX_ASSERT(fields[1]=="-");
+    REGEX_ASSERT(fields[2]=="10");
+    REGEX_ASSERT(fields[3]==",");
+    REGEX_ASSERT(fields[4]=="20");
+    delete pat1;
 }