ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10063
2002-10-24 22:16:07 +00:00 · 2002-10-24 22:16:07 +00:00 · 425ac49187
commit 425ac49187
parent 08ca9c365b
10 changed files with 484 additions and 160 deletions
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -503,6 +503,7 @@ typedef enum UErrorCode {
     U_REGEX_ERROR_START=0x10300,
     U_REGEX_INTERNAL_ERROR,
     U_REGEX_INVALID_STATE,
+     U_REGEX_BAD_ESCAPE_SEQUENCE,
     U_REGEX_ERROR_LIMIT,

    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -59,22 +59,21 @@ static const UChar gRuleSet_rule_char_pattern[]       = {
    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};


-static const UChar gRuleSet_name_char_pattern[]       = {
-//    [    _      \    p     {     L      }     \     p     {    N      }     ]
-    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
-
 static const UChar gRuleSet_digit_char_pattern[] = {
 //    [    0      -    9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};

-static const UChar gRuleSet_name_start_char_pattern[] = {
-//    [    _      \    p     {     L      }     ]
-    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
-
-static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"

 static UnicodeSet  *gRuleSets[10];         // Array of ptrs to the actual UnicodeSet objects.
+static UnicodeSet  *gUnescapeCharSet;

+//
+//   These are the backslash escape characters that ICU's unescape will handle.
+//    NUL terminated, like the other set patterns, for the UnicodeSet constructor.
+//
+static const UChar gUnescapeCharPattern[] = {
+//    [     a     b     c     e     f     n     r     t     u     U     ]
+    0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


 //----------------------------------------------------------------------------------------
@ -88,7 +87,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)

    fScanIndex = 0;
    fNextIndex = 0;
-
+    fPeekChar  = -1;
    fLineNum    = 1;
    fCharNum    = 0;
    fQuoteMode  = FALSE;
@ -110,13 +109,16 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
        gRuleSets[kRuleSet_rule_char-128]       = new UnicodeSet(gRuleSet_rule_char_pattern,       status);
        gRuleSets[kRuleSet_white_space-128]     = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
        gRuleSets[kRuleSet_digit_char-128]      = new UnicodeSet(gRuleSet_digit_char_pattern,      status);
+        gUnescapeCharSet                        = new UnicodeSet(gUnescapeCharPattern,             status);
        if (U_FAILURE(status)) {
            delete gRuleSets[kRuleSet_rule_char-128];
            delete gRuleSets[kRuleSet_white_space-128];
            delete gRuleSets[kRuleSet_digit_char-128];
+            delete gUnescapeCharSet;
            gRuleSets[kRuleSet_rule_char-128]   = NULL;
            gRuleSets[kRuleSet_white_space-128] = NULL;
            gRuleSets[kRuleSet_digit_char-128]  = NULL;
+            gUnescapeCharSet = NULL;
            return;
        }
    }
@ -218,7 +220,7 @@ void    RegexCompile::compile(
                // Table row specified "quoted" and the char was quoted.
                break;
            }
-            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
+            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }
@ -605,14 +607,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;


-            
    case doDotAny:
        // scanned a ".",  match any single character.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
        break;


-    case doExprFinished:
+    case doBackslashA:
+        // Scanned a "\A".
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
        break;

    case doExit:
@ -816,6 +819,11 @@ UChar32  RegexCompile::nextCharLL() {
    UChar32       ch;
    UnicodeString &pattern = fRXPat->fPattern;

+    if (fPeekChar != -1) {
+        ch = fPeekChar;
+        fPeekChar = -1;
+        return ch;
+    }
    if (fPatternLength==0 || fNextIndex >= fPatternLength) {
        return (UChar32)-1;
    }
@ -846,12 +854,25 @@ UChar32  RegexCompile::nextCharLL() {
    return ch;
 }

+//---------------------------------------------------------------------------------
+//
+//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
+//                 character without actually getting it.  The char is parked in
+//                 fPeekChar, and the following nextCharLL() call returns it.
+//---------------------------------------------------------------------------------
+UChar32  RegexCompile::peekCharLL() {
+    if (fPeekChar == -1) {           // -1 means no character is currently buffered.
+        fPeekChar = nextCharLL();
+    }
+    return fPeekChar;
+}
+

 //---------------------------------------------------------------------------------
 //
-//   nextChar     for rules scanning.  At this level, we handle stripping
-//                out comments and processing backslash character escapes.
-//                The rest of the rules grammar is handled at the next level up.
+//   nextChar     for pattern scanning.  At this level, we handle stripping
+//                out comments and processing some backslash character escapes.
+//                The rest of the pattern grammar is handled at the next level up.
 //
 //---------------------------------------------------------------------------------
 void RegexCompile::nextChar(RegexPatternChar &c) {
@ -870,7 +891,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
    {
        // We are not in a 'quoted region' of the source.
        //
-        if (c.fChar == chPound) {
+        if (fFreeForm && c.fChar == chPound) {
            // Start of a comment.  Consume the rest of it.
            //  The new-line char that terminates the comment is always returned.
            //  It will be treated as white-space, and serves to break up anything
@ -891,16 +912,22 @@ void RegexCompile::nextChar(RegexPatternChar &c) {

        //
        //  check for backslash escaped characters.
-        //  Use UnicodeString::unescapeAt() to handle them.
+        //  Use UnicodeString::unescapeAt() to handle those that it can.
+        //  Otherwise just return the '\', and let the pattern parser deal with it.
        //
+        int32_t startX = fNextIndex;  // start and end positions of the 
+        int32_t endX   = fNextIndex;  //   sequence following the '\'
        if (c.fChar == chBackSlash) {
-            c.fQuoted = TRUE;
-            int32_t startX = fNextIndex;
-            c.fChar = fRXPat->fPattern.unescapeAt(fNextIndex);
-            if (fNextIndex == startX) {
-                error(U_BRK_HEX_DIGITS_EXPECTED);
+            if (gUnescapeCharSet->contains(peekCharLL())) {
+                nextCharLL();     // get & discard the peeked char.
+                c.fQuoted = TRUE;
+                c.fChar = fRXPat->fPattern.unescapeAt(endX);
+                if (startX == endX) {
+                    error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+                }
+                fCharNum += endX - startX;
+                fNextIndex = endX;
            }
-            fCharNum += fNextIndex-startX;
        }
    }
    // putc(c.fChar, stdout);
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -65,8 +65,6 @@ public:

    void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.

-    UBool       push(const RegexPatternChar &c);    // Push (unget) one character.
-                                                    //   Only a single character may be pushed.

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -88,6 +86,7 @@ private:
    void        error(UErrorCode e);                   // error reporting convenience function.

    UChar32     nextCharLL();
+    UChar32     peekCharLL();
    UnicodeSet  *scanSet();
    void        handleCloseParen();
    int32_t     blockTopLoc();                       // Locate a position in the compiled pattern
@ -99,6 +98,9 @@ private:
    RegexPattern                  *fRXPat;
    UParseError                   *fParseErr;

+    //
+    //  Data associated with low level character scanning
+    //
    int32_t                       fScanIndex;        // Index of current character being processed
                                                     //   in the rule input string.
    int32_t                       fNextIndex;        // Index of the next character, which
@ -109,6 +111,8 @@ private:
    int                           fCharNum;          // Char position within the line.
    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                     //   as a single line, not two.
+    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
+

    RegexPatternChar              fC;                // Current char for parse state machine
                                                     //   processing.
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -40,6 +40,7 @@ enum Regex_PatternParseAction {
    doOpenLookAheadNeg,
    doPlus,
    doOpenNonCaptureParen,
+    doBackslashA,
    doNGPlus,
    doPatFinish,
    doIntervalMinValue,
@ -51,7 +52,6 @@ enum Regex_PatternParseAction {
    doOpenLookAhead,
    doNumberExpectedError,
    doDotAny,
-    doExprFinished,
    doScanUnicodeSet,
    doNOP,
    doExit,
@ -80,71 +80,65 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doPatStart, 255, 3, 2, FALSE}     //  1      start
    , {doPatFinish, 255, 2,0,  FALSE}     //  2      finish
-    , {doStartString, 254, 10,0,  TRUE}     //  3      term
-    , {doStartString, 130, 10,0,  TRUE}     //  4 
-    , {doScanUnicodeSet, 91 /* [ */, 17,0,  TRUE}     //  5 
-    , {doNOP, 40 /* ( */, 29, 17, TRUE}     //  6 
-    , {doDotAny, 46 /* . */, 17,0,  TRUE}     //  7 
-    , {doNOP, 253, 255,0,  FALSE}     //  8 
-    , {doRuleError, 255, 67,0,  FALSE}     //  9 
-    , {doStringChar, 254, 10,0,  TRUE}     //  10      string
-    , {doStringChar, 130, 10,0,  TRUE}     //  11 
-    , {doSplitString, 63 /* ? */, 17,0,  FALSE}     //  12 
-    , {doSplitString, 43 /* + */, 17,0,  FALSE}     //  13 
-    , {doSplitString, 42 /* * */, 17,0,  FALSE}     //  14 
-    , {doSplitString, 123 /* { */, 17,0,  FALSE}     //  15 
-    , {doEndString, 255, 17,0,  FALSE}     //  16 
-    , {doNOP, 42 /* * */, 40,0,  TRUE}     //  17      expr-quant
-    , {doNOP, 43 /* + */, 43,0,  TRUE}     //  18 
-    , {doNOP, 63 /* ? */, 46,0,  TRUE}     //  19 
-    , {doNOP, 255, 21,0,  FALSE}     //  20 
-    , {doNOP, 254, 3,0,  FALSE}     //  21      expr-cont
-    , {doNOP, 130, 3,0,  FALSE}     //  22 
-    , {doNOP, 91 /* [ */, 3,0,  FALSE}     //  23 
-    , {doNOP, 40 /* ( */, 3,0,  FALSE}     //  24 
-    , {doNOP, 46 /* . */, 3,0,  FALSE}     //  25 
-    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  26 
-    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  27 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  28 
-    , {doNOP, 63 /* ? */, 31,0,  TRUE}     //  29      open-paren
-    , {doOpenCaptureParen, 255, 3, 17, FALSE}     //  30 
-    , {doOpenNonCaptureParen, 58 /* : */, 3, 17, TRUE}     //  31      open-paren-extended
-    , {doOpenAtomicParen, 62 /* > */, 3, 17, TRUE}     //  32 
-    , {doOpenLookAhead, 61 /* = */, 3, 21, TRUE}     //  33 
-    , {doOpenLookAheadNeg, 33 /* ! */, 3, 21, TRUE}     //  34 
-    , {doNOP, 60 /* < */, 37,0,  TRUE}     //  35 
-    , {doBadOpenParenType, 255, 67,0,  FALSE}     //  36 
-    , {doOpenLookBehind, 61 /* = */, 3, 21, TRUE}     //  37      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 3, 21, TRUE}     //  38 
-    , {doBadOpenParenType, 255, 67,0,  FALSE}     //  39 
-    , {doNGStar, 63 /* ? */, 21,0,  TRUE}     //  40      quant-star
-    , {doPossesiveStar, 43 /* + */, 21,0,  TRUE}     //  41 
-    , {doStar, 255, 21,0,  FALSE}     //  42 
-    , {doNGPlus, 63 /* ? */, 21,0,  TRUE}     //  43      quant-plus
-    , {doPossesivePlus, 43 /* + */, 21,0,  TRUE}     //  44 
-    , {doPlus, 255, 21,0,  FALSE}     //  45 
-    , {doNGOpt, 63 /* ? */, 21,0,  TRUE}     //  46      quant-opt
-    , {doPossesiveOpt, 43 /* + */, 21,0,  TRUE}     //  47 
-    , {doOpt, 255, 21,0,  FALSE}     //  48 
-    , {doNOP, 129, 49,0,  TRUE}     //  49      interval-open
-    , {doIntervalMinValue, 128, 52,0,  FALSE}     //  50 
-    , {doNumberExpectedError, 255, 67,0,  FALSE}     //  51 
-    , {doNOP, 129, 56,0,  TRUE}     //  52      interval-value
-    , {doNOP, 125 /* } */, 56,0,  FALSE}     //  53 
-    , {doIntervalDigit, 128, 52,0,  TRUE}     //  54 
-    , {doNumberExpectedError, 255, 67,0,  FALSE}     //  55 
-    , {doNOP, 129, 56,0,  TRUE}     //  56      interval-close
-    , {doTagValue, 125 /* } */, 59,0,  TRUE}     //  57 
-    , {doNumberExpectedError, 255, 67,0,  FALSE}     //  58 
-    , {doNOP, 254, 3,0,  FALSE}     //  59      expr-cont-no-interval
-    , {doNOP, 130, 3,0,  FALSE}     //  60 
-    , {doNOP, 91 /* [ */, 3,0,  FALSE}     //  61 
-    , {doNOP, 40 /* ( */, 3,0,  FALSE}     //  62 
-    , {doNOP, 46 /* . */, 3,0,  FALSE}     //  63 
-    , {doExprOrOperator, 124 /* | */, 3,0,  TRUE}     //  64 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  65 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  66 
-    , {doExit, 255, 67,0,  TRUE}     //  67      errorDeath
+    , {doStartString, 254, 11,0,  TRUE}     //  3      term
+    , {doStartString, 130, 11,0,  TRUE}     //  4 
+    , {doScanUnicodeSet, 91 /* [ */, 18,0,  TRUE}     //  5 
+    , {doNOP, 40 /* ( */, 25, 18, TRUE}     //  6 
+    , {doDotAny, 46 /* . */, 18,0,  TRUE}     //  7 
+    , {doNOP, 92 /* \ */, 59,0,  TRUE}     //  8 
+    , {doNOP, 253, 2,0,  FALSE}     //  9 
+    , {doRuleError, 255, 61,0,  FALSE}     //  10 
+    , {doStringChar, 254, 11,0,  TRUE}     //  11      string
+    , {doStringChar, 130, 11,0,  TRUE}     //  12 
+    , {doSplitString, 63 /* ? */, 18,0,  FALSE}     //  13 
+    , {doSplitString, 43 /* + */, 18,0,  FALSE}     //  14 
+    , {doSplitString, 42 /* * */, 18,0,  FALSE}     //  15 
+    , {doSplitString, 123 /* { */, 18,0,  FALSE}     //  16 
+    , {doEndString, 255, 18,0,  FALSE}     //  17 
+    , {doNOP, 42 /* * */, 36,0,  TRUE}     //  18      expr-quant
+    , {doNOP, 43 /* + */, 39,0,  TRUE}     //  19 
+    , {doNOP, 63 /* ? */, 42,0,  TRUE}     //  20 
+    , {doNOP, 255, 22,0,  FALSE}     //  21 
+    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  22      expr-cont
+    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  23 
+    , {doNOP, 255, 3,0,  FALSE}     //  24 
+    , {doNOP, 63 /* ? */, 27,0,  TRUE}     //  25      open-paren
+    , {doOpenCaptureParen, 255, 3, 18, FALSE}     //  26 
+    , {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE}     //  27      open-paren-extended
+    , {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE}     //  28 
+    , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE}     //  29 
+    , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE}     //  30 
+    , {doNOP, 60 /* < */, 33,0,  TRUE}     //  31 
+    , {doBadOpenParenType, 255, 61,0,  FALSE}     //  32 
+    , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE}     //  33      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE}     //  34 
+    , {doBadOpenParenType, 255, 61,0,  FALSE}     //  35 
+    , {doNGStar, 63 /* ? */, 22,0,  TRUE}     //  36      quant-star
+    , {doPossesiveStar, 43 /* + */, 22,0,  TRUE}     //  37 
+    , {doStar, 255, 22,0,  FALSE}     //  38 
+    , {doNGPlus, 63 /* ? */, 22,0,  TRUE}     //  39      quant-plus
+    , {doPossesivePlus, 43 /* + */, 22,0,  TRUE}     //  40 
+    , {doPlus, 255, 22,0,  FALSE}     //  41 
+    , {doNGOpt, 63 /* ? */, 22,0,  TRUE}     //  42      quant-opt
+    , {doPossesiveOpt, 43 /* + */, 22,0,  TRUE}     //  43 
+    , {doOpt, 255, 22,0,  FALSE}     //  44 
+    , {doNOP, 129, 45,0,  TRUE}     //  45      interval-open
+    , {doIntervalMinValue, 128, 48,0,  FALSE}     //  46 
+    , {doNumberExpectedError, 255, 61,0,  FALSE}     //  47 
+    , {doNOP, 129, 52,0,  TRUE}     //  48      interval-value
+    , {doNOP, 125 /* } */, 52,0,  FALSE}     //  49 
+    , {doIntervalDigit, 128, 48,0,  TRUE}     //  50 
+    , {doNumberExpectedError, 255, 61,0,  FALSE}     //  51 
+    , {doNOP, 129, 52,0,  TRUE}     //  52      interval-close
+    , {doTagValue, 125 /* } */, 55,0,  TRUE}     //  53 
+    , {doNumberExpectedError, 255, 61,0,  FALSE}     //  54 
+    , {doNOP, 254, 3,0,  FALSE}     //  55      expr-cont-no-interval
+    , {doExprOrOperator, 124 /* | */, 3,0,  TRUE}     //  56 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  57 
+    , {doNOP, 255, 3,0,  FALSE}     //  58 
+    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  59      backslash
+    , {doStartString, 255, 11,0,  TRUE}     //  60 
+    , {doExit, 255, 61,0,  TRUE}     //  61      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
@ -155,6 +149,7 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
    0,
     "string",
    0,
@ -169,11 +164,6 @@ static const char *RegexStateNames[] = {    0,
    0,
     "expr-cont",
    0,
-    0,
-    0,
-    0,
-    0,
-    0,
    0,
     "open-paren",
    0,
@ -209,9 +199,7 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
-    0,
-    0,
-    0,
+     "backslash",
    0,
     "errorDeath",
    0};
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -77,7 +77,8 @@ term:
    '['                  n expr-quant     		            doScanUnicodeSet
    '('                  n open-paren            ^expr-quant          
    '.'                  n expr-quant                               doDotAny
-    eof		           pop
+    '\'                  n backslash
+    eof		           finish
    default                errorDeath                               doRuleError
    

@ -110,17 +111,12 @@ expr-quant:
    
 #
 #  expr-cont      Expression, continuation.  At a point where additional terms are
-#                                            allowed, but not required.
+#                                            allowed, but not required.  No Quantifiers
 #
 expr-cont:
-    quoted                  term                                    
-    rule_char               term                                    
-    '['                     term                                    
-    '('                     term                                    
-    '.'                     term                                    
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
-    default                 pop                                     doExprFinished
+    default                 term                                    
    

 #
@ -205,16 +201,18 @@ interval-close:
 #
 expr-cont-no-interval:
    quoted                  term                                    
-    rule_char               term                                    
-    '['                     term                                    
-    '('                     term                                    
-    '.'                     term                                    
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
-    default                 pop                                     doExprFinished
+    default                 term                   
    
    
-
+#
+#  backslash        Backslash.  Figure out which of the \thingies we have encountered.
+#                                  The low level next-char function will have preprocessed
+#                                  some of them already; those won't come here.
+backslash:
+   'A'                   n  term                                    doBackslashA
+   default               n  string				    doStartString   

    
    
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -26,7 +26,7 @@ static const uint32_t     URX_STATE_SAVE    = 6;    // Value field is pattern po
 static const uint32_t     URX_NOP           = 7;
 static const uint32_t     URX_START_CAPTURE = 8;    // Value field is capture group number.
 static const uint32_t     URX_END_CAPTURE   = 9;    // Value field is capture group number
-static const uint32_t     URX_UNUSED10      = 10;   // Value field is index in pattern to
+static const uint32_t     URX_BACKSLASH_A   = 10;   // Value field is index in pattern to
                                                    //   loop back to.
 static const uint32_t     URX_SETREF        = 11;   // Value field is index of set in array of sets.
 static const uint32_t     URX_DOTANY        = 12; 
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -11,6 +11,7 @@
 #include "unicode/utypes.h"
 #include "unicode/regex.h"
 #include "unicode/uniset.h"
+#include "unicode/uchar.h"
 #include "uassert.h"
 #include "uvector.h"
 #include "regeximp.h"
@ -54,20 +55,126 @@ RegexMatcher::~RegexMatcher() {



-
+static const UChar BACKSLASH  = 0x5c;
+static const UChar DOLLARSIGN = 0x24;
+//--------------------------------------------------------------------------------
+//    appendReplacement   Append to dest the input text between the end of the
+//                        previous match and the start of the current match, then
+//                        the replacement text with $n and \ sequences expanded.
+//--------------------------------------------------------------------------------
 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
-                                              const UnicodeString &replacement) {
+                                              const UnicodeString &replacement,
+                                              UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (fMatch == FALSE) {
+        status = U_REGEX_INVALID_STATE;
+        return *this;
+    }
+
+    // Copy input string from the end of previous match to start of current match
+    int32_t  len = fMatchStart-fLastMatchEnd;
+    if (len > 0) {
+        dest.append(*fInput, fLastMatchEnd, len);
+    }
+    
+
+    // scan the replacement text, looking for substitutions ($n) and \escapes.
+    int32_t  replLen = replacement.length();
+    int32_t  replIdx;
+    for (replIdx = 0; replIdx<replLen; replIdx++) {
+        UChar  c = replacement.charAt(replIdx);
+        if (c == BACKSLASH) {
+            // Backslash Escape.  Copy the following char out without further checks.
+            replIdx++;
+            if (replIdx >= replLen) {
+                break;
+            }
+            c = replacement.charAt(replIdx);
+            dest.append(c);
+            continue;
+        }
+
+        if (c != DOLLARSIGN) {
+            // Normal char, not a $.  Copy it out without further checks.
+            dest.append(c);
+            continue;
+        }
+
+        // We've got a $.  Pick up a capture group number if one follows.
+        // Consume at most the number of digits necessary for the largest capture
+        // number that is valid for this pattern.
+        if (++replIdx >= replLen) {
+            // $ was at the end of the replacement string.  Dump it out and be done.
+            dest.append(c);
+            break;
+        }
+
+        int32_t numDigits = 0;
+        int32_t groupNum  = 0;
+        for (;;) {
+            c = replacement.charAt(replIdx);
+            if (u_isdigit(c) == FALSE) {
+                break;
+            }
+            groupNum=groupNum*10 + u_charDigitValue(c);
+            numDigits++;
+            if (++replIdx >= replLen) {
+                break;
+            }
+            if (numDigits >= fPattern->fMaxCaptureDigits) {
+                break;
+            }
+        }
+
+        // We've scanned one char ahead in the replacement text.  Back up so the
+        //  next iteration of the loop picks the char again.
+        --replIdx;
+
+        if (numDigits == 0) {
+            // The $ didn't introduce a group number at all.
+            // Treat it as just part of the substitution text.
+            dest.append(DOLLARSIGN);
+            continue;
+        }
+
+        // Finally, append the capture group data to the destination.
+        dest.append(group(groupNum, status));
+        if (U_FAILURE(status)) {
+            // Can fail if group number is out of range.
+            return *this;
+        }
+
+    }
+
    return *this;
 }



+//--------------------------------------------------------------------------------
+//
+//    appendTail     Intended to be used in conjunction with appendReplacement()
+//                   To the destination string, append everything following
+//                   the last match position from the input string.
+//
+//--------------------------------------------------------------------------------
 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
+    int32_t  len = fInputLength-fMatchEnd;
+    if (len > 0) {
+        dest.append(*fInput, fMatchEnd, len);
+    }
    return dest;
 }



+//--------------------------------------------------------------------------------
+//
+//   end
+//
+//--------------------------------------------------------------------------------
 int32_t RegexMatcher::end(UErrorCode &err) const {
    return end(0, err);
 }
@ -78,7 +185,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
    if (U_FAILURE(err)) {
        return 0;
    }
-    if (fLastMatch == FALSE) {
+    if (fMatch == FALSE) {
        err = U_REGEX_INVALID_STATE;
        return 0;
    }
@ -88,7 +195,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
    }
    int32_t e = 0;
    if (group == 0) {
-        e = fLastMatchEnd; 
+        e = fMatchEnd; 
    } else {
        int32_t s = fCaptureEnds->elementAti(group);
        // TODO:  what to do if no match on this specific group?
@ -101,11 +208,16 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {



+//--------------------------------------------------------------------------------
+//
+//   find()
+//
+//--------------------------------------------------------------------------------
 UBool RegexMatcher::find() {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.
    UErrorCode status = U_ZERO_ERROR;
-    return find(fLastMatchEnd, status);
+    return find(fMatchEnd, status);
 }


@ -128,16 +240,20 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
        if (U_FAILURE(status)) {
            return FALSE;
        }
-        if (fLastMatch) {
+        if (fMatch) {
            return TRUE;
        }
    }
-    fLastMatchStart = fLastMatchEnd = fInputLength;
    return FALSE;
 }



+//--------------------------------------------------------------------------------
+//
+//  group()
+//
+//--------------------------------------------------------------------------------
 UnicodeString RegexMatcher::group(UErrorCode &status) const {
    return group(0, status);
 }
@ -181,7 +297,7 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
    }
    reset();
    MatchAt(0, status);
-    return fLastMatch;
+    return fMatch;
 }


@ -192,7 +308,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
    }
    reset();
    MatchAt(0, status);
-    UBool   success  = (fLastMatch && fLastMatchEnd==fInputLength);
+    UBool   success  = (fMatch && fMatchEnd==fInputLength);
    return success;
 }

@ -205,23 +321,58 @@ const RegexPattern &RegexMatcher::pattern() const {



-UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &err) {
-    return UnicodeString();
+//--------------------------------------------------------------------------------
+//    replaceAll     Return a copy of the input in which every match has been
+//                   replaced by the expanded replacement text.  The input itself
+//                   is returned unchanged if status is already a failure on entry.
+//--------------------------------------------------------------------------------
+UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *fInput;
+    }
+    UnicodeString destString;
+    for (reset(); find(); ) {        // start over from the beginning, visit each match in turn.
+        appendReplacement(destString, replacement, status);  // NOTE(review): a failure here does not stop the loop.
+    }
+    appendTail(destString);          // whatever input follows the last match.
+    return destString;
 }




-UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &err) {
-    return UnicodeString();
+//--------------------------------------------------------------------------------
+//    replaceFirst   Return a copy of the input with only the first match replaced
+//                   by the expanded replacement text.  The input is returned
+//                   unchanged if there is no match or status is already a failure.
+//--------------------------------------------------------------------------------
+UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *fInput;
+    }
+    reset();
+    if (!find()) {                   // no match anywhere: nothing to replace.
+        return *fInput;
+    }
+
+    UnicodeString destString;
+    appendReplacement(destString, replacement, status);   // input before the match + replacement.
+    appendTail(destString);                               // input after the match.
+    return destString;
 }



+//--------------------------------------------------------------------------------
+//
+//     reset
+//
+//--------------------------------------------------------------------------------
 RegexMatcher &RegexMatcher::reset() {
-    fLastMatchStart = 0;
-    fLastMatchEnd   = 0;
-    fLastMatch      = FALSE;
+    fMatchStart   = 0;
+    fMatchEnd     = 0;
+    fLastMatchEnd = 0;
+    fMatch        = FALSE;
    int i;
    for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
        fCaptureStarts->setElementAt(i, -1);
@ -252,7 +403,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
    if (U_FAILURE(err)) {
        return 0;
    }
-    if (fLastMatch == FALSE) {
+    if (fMatch == FALSE) {
        err = U_REGEX_INVALID_STATE;
        return 0;
    }
@ -262,7 +413,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
    }
    int32_t s;
    if (group == 0) {
-        s = fLastMatchStart; 
+        s = fMatchStart; 
    } else {
        s = fCaptureStarts->elementAti(group);
        // TODO:  what to do if no match on this specific group?
@ -272,6 +423,26 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {



+//--------------------------------------------------------------------------------
+//
+//    getCaptureText    We have encountered a '\' that might precede a
+//                      capture group specification. 
+//                      If a valid capture group number follows the '\', 
+//                      return the indices to the start & end of the captured
+//                      text, and update repIdx to the position following the
+//                      \n sequence.
+//
+//                      This function is used during find and replace operations when
+//                      processing capture references in the replacement text.
+//
+//--------------------------------------------------------------------------------
+UBool  RegexMatcher::getCaptureText(const UnicodeString &rep,
+                                int32_t &repIdx,
+                                int32_t &textStart,
+                                int32_t &textEnd)
+{
+    return FALSE;     // Stub for now: always FALSE, and none of the arguments are touched.
+}

 //--------------------------------------------------------------------------------
 //
@ -408,6 +579,12 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            fCaptureEnds->setElementAt(inputIdx, opValue);
            break;

+        case URX_BACKSLASH_A:
+            if (inputIdx != 0) {
+                backTrack(inputIdx, patIdx);
+            }
+            break;
+

        case URX_SETREF:
            if (inputIdx < fInputLength) {
@ -449,7 +626,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
        default:
            // Trouble.  The compiled pattern contains an entry with an
            //           unrecognized type tag.
-            U_ASSERT(false);
+            U_ASSERT(FALSE);
        }

        if (U_FAILURE(status)) {
@ -458,10 +635,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
    }
    
 breakFromLoop:
-    fLastMatch = isMatch;
+    fMatch = isMatch;
    if (isMatch) {
-        fLastMatchStart  = startIdx;
-        fLastMatchEnd    = inputIdx;
+        fLastMatchEnd = fMatchEnd;
+        fMatchStart   = startIdx;
+        fMatchEnd     = inputIdx;
        }
    return;
 }
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -65,6 +65,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fLiteralText      = other.fLiteralText;
    fBadState         = other.fBadState;
    fNumCaptureGroups = other.fNumCaptureGroups;
+    fMaxCaptureDigits = other.fMaxCaptureDigits;
    if (fBadState) {
        return *this;
    }
@ -108,6 +109,7 @@ void RegexPattern::init() {
    fFlags            = 0;
    fBadState         = FALSE;
    fNumCaptureGroups = 0;
+    fMaxCaptureDigits = 1;     // TODO:  calculate for real.
    fMatcher          = NULL;
    
    UErrorCode status=U_ZERO_ERROR;
@ -301,6 +303,8 @@ UnicodeString RegexPattern::pattern() const {
 //---------------------------------------------------------------------
 //
 //   split
+//            TODO:  perl returns captured strings intermixed with the
+//                   fields.  Should we do this too?
 //
 //---------------------------------------------------------------------
 int32_t  RegexPattern::split(const UnicodeString &input,
@ -359,9 +363,9 @@ int32_t  RegexPattern::split(const UnicodeString &input,
        if (fMatcher->find()) {
            // We found another delimiter.  Move everything from where we started looking
            //  up until the start of the delimiter into the next output string.
-            int32_t fieldLen = fMatcher->fLastMatchStart - nextOutputStringStart;
+            int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
            dest[i].setTo(input, nextOutputStringStart, fieldLen);
-            nextOutputStringStart = fMatcher->fLastMatchEnd;
+            nextOutputStringStart = fMatcher->fMatchEnd;
            if (nextOutputStringStart == inputLen) {
                // The delimiter was at the end of the string.  We're done.
                break;
@ -407,7 +411,7 @@ static char *opNames[] = {
        "NOP",
        "START_CAPTURE",
        "END_CAPTURE",
-        "?10",
+        "URX_BACKSLASH_A",
        "SETREF",
        "DOTANY",
        "JMP",
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -178,6 +178,7 @@ private:
                                   //  make new ones on each call.

    int32_t         fNumCaptureGroups;
+    int32_t         fMaxCaptureDigits;

    friend class RegexCompile;
    friend class RegexMatcher;
@ -226,13 +227,16 @@ public:
    *   The append position is set to the position of the first
    *   character following the match in the input string.
    *
+    *   For complete, prepackaged, non-incremental find-and-replace
+    *   operations, see replaceFirst() or replaceAll().
+    *
    *   Returns:  This Matcher
    *
    *    error:  Illegal state - no match yet attemtped, or last match failed.
    *            IndexOutOfBounds - caputure string number from replacement string.
    */
    virtual RegexMatcher &appendReplacement(UnicodeString &dest,
-        const UnicodeString &replacement);
+        const UnicodeString &replacement, UErrorCode &status);
    
    
   /*
@ -329,7 +333,8 @@ public:
    
    /*
    *    Replaces every subsequence of the input sequence that matches the pattern
-    *    with the given replacement string.
+    *    with the given replacement string.  This is a convenience function that
+    *    provides a complete find-and-replace-all operation.
    *
    *    This method first resets this matcher. It then scans the input sequence
    *    looking for matches of the pattern. Characters that are not part of any 
@ -337,10 +342,7 @@ public:
    *    replacement string. The replacement string may contain references to
    *    captured subsequences as in the appendReplacement method. 
    *
-    *    @return   The target string.  Depending on how the RegexMatcher was
-    *              created, this may either be the original input string or a copy
-    *
-    *    Error:  Index out of bounds (replacement string capture group)
+    *    @return   A string containing the results of the find and replace.
    *
    */
    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &err); 
@ -348,16 +350,15 @@ public:
    
    /*
    * Replaces the first subsequence of the input sequence that matches
-    * the pattern with the given replacement string. 
+    * the pattern with the given replacement string.   This is a convenience
+    * function that provides a complete find-and-replace operation.
+    *
    * This method first resets this matcher. It then scans the input sequence
    * looking for a match of the pattern. Characters that are not part
    * of the match are appended directly to the result string; the match is replaced
    * in the result by the replacement string. The replacement string may contain
    * references to captured subsequences as in the appendReplacement method. 
    *
-    *    Error:  Index out of bounds (replacement string capture group)
-    *            Illegal state (no match)
-    *      Note:  Javadoc doesn't list exceptions, but they gotta be there for consistency
    */
    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &err); 
    
@ -409,27 +410,33 @@ public:

 private:
    // Constructors and other object boilerplate are private.
-    // Creation by users is through factory method in RegexPattern
+    // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
+    // Creation by users is only through the factory method in class RegexPattern
    RegexMatcher(const RegexPattern *pat); 
    RegexMatcher(const RegexMatcher &other);
    RegexMatcher &operator =(const RegexMatcher &rhs);
    friend class RegexPattern;

-    inline void backTrack(int32_t &inputIdx, int32_t &patIdx);

    //
    //  MatchAt   This is the internal interface to the match engine itself.
    //            Match status comes back in matcher member variables.
    //
-    virtual void MatchAt(int32_t startIdx, UErrorCode &status);   
+    void         MatchAt(int32_t startIdx, UErrorCode &status);   
+    inline  void backTrack(int32_t &inputIdx, int32_t &patIdx);
+    UBool        getCaptureText(const UnicodeString &rep,
+                                int32_t &repIdx,
+                                int32_t &textStart,
+                                int32_t &textEnd);


    const RegexPattern  *fPattern;
    const UnicodeString *fInput;
    int32_t              fInputLength;
-    UBool                fLastMatch;        // True if the last match was successful.
-    int32_t              fLastMatchStart;
-    int32_t              fLastMatchEnd;
+    UBool                fMatch;           // True if the last match was successful.
+    int32_t              fMatchStart;      // Position of the start of the most recent match
+    int32_t              fMatchEnd;        // First position after the end of the most recent match
+    int32_t              fLastMatchEnd;    // First position after the end of the previous match.
    UStack              *fBackTrackStack;
    UVector             *fCaptureStarts;
    UVector             *fCaptureEnds;
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -31,12 +31,12 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
    if (exec) logln("TestSuite RegexTest: ");
    switch (index) {

-        case 0: name = "API_Match";
-            if (exec) API_Match(); 
-            break;
-        case 1: name = "Basic";
+        case 0: name = "Basic";
            if (exec) Basic(); 
            break;
+        case 1: name = "API_Match";
+            if (exec) API_Match(); 
+            break;
        case 2: name = "API_Replace";
            if (exec) API_Replace(); 
            break;
@ -87,6 +87,7 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match
        errln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %d\n", line, status);
        return FALSE;
    }
+    // REPattern->dump();

    UnicodeString inputString(inputText);
    UnicodeString unEscapedInput = inputString.unescape();
@ -295,6 +296,101 @@ void RegexTest::API_Match() {
        delete matcher;
        delete pat;
    }
+
+    //
+    //  Replace
+    //
+    {
+        int32_t             flags=0;
+        UParseError         pe;
+        UErrorCode          status=U_ZERO_ERROR;
+
+        UnicodeString       re("abc");
+        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
+        REGEX_CHECK_STATUS;
+        UnicodeString data = ".abc..abc...abc..";
+        //                    012345678901234567
+        RegexMatcher *matcher = pat->matcher(data, status);
+
+        //
+        //  Plain vanilla matches.
+        //
+        UnicodeString  dest;
+        dest = matcher->replaceFirst("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == ".yz..abc...abc..");
+
+        dest = matcher->replaceAll("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == ".yz..yz...yz..");
+
+        //
+        //  Plain vanilla non-matches.
+        //
+        UnicodeString d2 = ".abx..abx...abx..";
+        matcher->reset(d2);
+        dest = matcher->replaceFirst("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == ".abx..abx...abx..");
+
+        dest = matcher->replaceAll("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == ".abx..abx...abx..");
+
+        //
+        // Empty source string
+        //
+        UnicodeString d3 = "";
+        matcher->reset(d3);
+        dest = matcher->replaceFirst("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "");
+
+        dest = matcher->replaceAll("yz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "");
+
+        //
+        // Empty substitution string
+        //
+        matcher->reset(data);              // ".abc..abc...abc.."
+        dest = matcher->replaceFirst("", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "...abc...abc..");
+
+        dest = matcher->replaceAll("", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "........");
+
+        //
+        // match whole string
+        //
+        UnicodeString d4 = "abc";
+        matcher->reset(d4);   
+        dest = matcher->replaceFirst("xyz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "xyz");
+
+        dest = matcher->replaceAll("xyz", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "xyz");
+
+        //
+        // Capture Group, simple case
+        //
+        UnicodeString       re2("a(..)");
+        RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
+        REGEX_CHECK_STATUS;
+        UnicodeString d5 = "abcdefg";
+        RegexMatcher *matcher2 = pat2->matcher(d5, status);
+        REGEX_CHECK_STATUS;
+        dest = matcher2->replaceFirst("$1$1", status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(dest == "bcbcdefg");
+      
+    }
+
+
        
 }

@ -314,6 +410,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
+            REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    }
    return;
 #endif
@ -419,6 +516,26 @@ void RegexTest::Basic() {
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

+    //
+    //  Escape sequences that become single literal chars, handled internally
+    //   by ICU's Unescape.
+    //
+    
+    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
+    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
+    REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE);        // BS
+    // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L (or whatever) TODO: bug in Unescape
+    // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape  TODO: bug in Unescape
+    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
+    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
+    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
+    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
+    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);       
+    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);       
+
+    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
+    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
+
 };