ICU-2422 regexp, match flag setting options added.

X-SVN-Rev: 11032
2003-02-12 01:28:01 +00:00 · 2003-02-12 01:28:01 +00:00 · 2397658197
commit 2397658197
parent 5e8f53a387
10 changed files with 218 additions and 71 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -185,7 +185,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
    fCharNum        = 0;
    fQuoteMode      = FALSE;
    fFreeForm       = FALSE;
-    fCaseI          = (fRXPat->fFlags & UREGEX_CASE_INSENSITIVE) != 0;
+    fModeFlags      = fRXPat->fFlags;

    fMatchOpenParen  = -1;
    fMatchCloseParen = -1;
@ -579,9 +579,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
-            fParenStack.push(-2, *fStatus);           // Begin a new frame.
-            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
-            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
+            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
+            fParenStack.push(capturing, *fStatus);                        // Frame type.
+            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc

            // Save the mapping from group number to stack frame variable position.
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
@ -601,9 +602,10 @@ UBool RegexCompile::doParseActions(EParseAction action)

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.
-            fParenStack.push(-1, *fStatus);                               // Begin a new frame.
-            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
-            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
+            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
+            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
+            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
        }
         break;

@ -628,7 +630,8 @@ UBool RegexCompile::doParseActions(EParseAction action)
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
-            fParenStack.push(-3, *fStatus);           // Begin a new frame.
+            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
+            fParenStack.push(atomic, *fStatus);                           // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
@ -659,9 +662,10 @@ UBool RegexCompile::doParseActions(EParseAction action)

            // On the Parentheses stack, start a new frame and add the postions
            //   of the NOPs.  
-            fParenStack.push(lookAhead, *fStatus);                        // Begin a new frame.
-            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
-            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
+            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
+            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
+            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
        }
        break;

@ -690,9 +694,10 @@ UBool RegexCompile::doParseActions(EParseAction action)

            // On the Parentheses stack, start a new frame and add the postions
            //   of the StateSave and NOP.  
-            fParenStack.push( negLookAhead, *fStatus);                    // Begin a new frame.
-            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE
-            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
+            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
+            fParenStack.push( negLookAhead, *fStatus);                    // Frame type
+            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
            
            // Instructions #5 and #6 will be added when the ')' is encountered.
        }
@ -957,16 +962,30 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doDotAny:
        // scanned a ".",  match any single character.
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
+        {
+            int32_t   op;
+            if (fModeFlags & UREGEX_DOTALL) {
+                op = URX_BUILD(URX_DOTANY_ALL, 0);
+            } else {
+                op = URX_BUILD(URX_DOTANY, 0);
+            }
+            fRXPat->fCompiledPat->addElement(op, *fStatus);
+        }
        break;

-    case doCaret:       // TODO:  multi-line mode flag.
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
+    case doCaret: 
+        {
+            int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
+            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
+        }
        break;


-    case doDollar:       // TODO:  multi-line mode flag.
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
+    case doDollar:  
+        {
+            int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
+            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
+        }
        break;

    case doBackslashA:
@ -1051,8 +1070,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
    case doScanUnicodeSet:
        {
            UnicodeSet *theSet = scanSet();
-            if (fCaseI && theSet != NULL) {
+            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && theSet != NULL) {
                caseClose(theSet);   // TODO:  replace with the real function.
+                // theSet->closeOver(USET_CASE);
            }
            compileSet(theSet);
        }
@ -1094,7 +1114,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
            //  of compilation, it will be changed to the variables location.
            U_ASSERT(groupNum > 0);
            int32_t  op;
-            if (fCaseI) {
+            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
                op = URX_BUILD(URX_BACKREF_I, groupNum);
            } else {
                op = URX_BUILD(URX_BACKREF, groupNum);
@ -1217,11 +1237,70 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;


-    case doMatchMode:   //  (?i)    and similar
-        // TODO:  implement
-        error(U_REGEX_UNIMPLEMENTED);
+    case doBeginMatchMode:
+        fNewModeFlags = fModeFlags;
+        fSetModeFlag  = TRUE;
        break;

+    case doMatchMode:   //  (?i)    and similar
+        {
+            int32_t  bit = 0;
+            switch (fC.fChar) {
+            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
+            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
+            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
+            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
+            case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
+            default:
+                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
+                                   // by the scanner.
+            }
+            if (fSetModeFlag) {
+                fNewModeFlags |= bit;
+            } else {
+                fNewModeFlags &= ~bit;
+            }
+        }
+        break;
+
+    case doSetMatchMode:
+        // We've got a (?i) or similar.  The match mode is being changed, but
+        //   the change is not scoped to a parenthesized block.
+        fModeFlags = fNewModeFlags;
+
+        // Prevent any string from spanning across the change of match mode.
+        //   Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" 
+        fixLiterals();     
+        break;
+
+
+    case doMatchModeParen:
+        // We've got a (?i: or similar.  Begin a parenthesized block, save old
+        //   mode flags so they can be restored at the close of the block.
+        //
+        //   Compile to a
+        //      - NOP, which later may be replaced by a save-state if the
+        //         parenthesized group gets a * quantifier, followed by
+        //      - NOP, which may later be replaced by a save-state if there
+        //             is an '|' alternation within the parens.
+        {
+            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
+            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
+
+            // On the Parentheses stack, start a new frame and add the positions
+            //   of the two NOPs (a normal non-capturing () frame, except for the
+            //   saving of the original mode flags.)
+            fParenStack.push(fModeFlags, *fStatus);
+            fParenStack.push(flags, *fStatus);                            // Frame Marker
+            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
+
+            // Set the current mode flags to the new values.
+            fModeFlags = fNewModeFlags;
+        }
+        break;
+
+

    default:
        error(U_REGEX_INTERNAL_ERROR);
@ -1278,7 +1357,7 @@ void RegexCompile::literalChar()  {
    opType = URX_TYPE(op);
    U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
    if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
-        if (fCaseI) {
+        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
            op     = URX_BUILD(URX_STRING_I, fStringOpStart);
        } else {
            op     = URX_BUILD(URX_STRING, fStringOpStart);
@ -1308,7 +1387,7 @@ void RegexCompile::literalChar()  {
 //------------------------------------------------------------------------------
 void RegexCompile::emitONE_CHAR(UChar32  c) {
    int32_t op;
-    if (fCaseI && (u_tolower(c) != u_toupper(c))) {
+    if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
        // We have a cased character, and are in case insensitive matching mode.
        // TODO: replace with a better test.  See Alan L.'s mail of 2/6
        c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
@ -1540,11 +1619,17 @@ void  RegexCompile::handleCloseParen() {
        fMatchOpenParen     = patIdx;
    }

+    //  At the close of any parenthesized block, restore the match mode flags  to
+    //  the value they had at the open paren.  Saved value is
+    //  at the top of the paren stack.  
+    fModeFlags = fParenStack.popi();
+    
    // DO any additional fixups, depending on the specific kind of
    // parentesized grouping this is

    switch (patIdx) {
    case plain:
+    case flags:
        // No additional fixups required.
        //   (Grouping-only parentheses)
        break;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -74,7 +74,8 @@ public:
        capturing    = -2, 
        atomic       = -3,
        lookAhead    = -4,
-        negLookAhead = -5
+        negLookAhead = -5,
+        flags        = -6
    };

 private:
@ -142,7 +143,11 @@ private:
    //
    //  Data associated with the generation of the pcode for the match engine
    //
-    UBool                         fCaseI;            // Case Insensitive Match Mode is on.
+    int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
+    int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
+                                                     //   until last flag is scanned.
+    UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
+

    int32_t                       fStringOpStart;    // While a literal string is being scanned
                                                     //   holds the start index within RegexPattern.
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
 enum Regex_PatternParseAction {
    doCloseParen,
    doProperty,
+    doBeginMatchMode,
    doOrOperator,
    doOpenCaptureParen,
    doBadOpenParenType,
@ -53,11 +54,13 @@ enum Regex_PatternParseAction {
    doBackslashA,
    doBackslashB,
    doNGPlus,
+    doSetMatchMode,
    doPatFinish,
    doBackslashD,
    doPossesiveOpt,
    doEscapeError,
    doBackslashG,
+    doMatchModeParen,
    doOpt,
    doInterval,
    doLiteralChar,
@ -136,11 +139,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE}     //  32 
    , {doNOP, 60 /* < */, 43,0,  TRUE}     //  33 
    , {doNOP, 35 /* # */, 46, 2, TRUE}     //  34 
-    , {doMatchMode, 105 /* i */, 49,0,  TRUE}     //  35 
-    , {doMatchMode, 120 /* x */, 49,0,  TRUE}     //  36 
-    , {doMatchMode, 115 /* s */, 49,0,  TRUE}     //  37 
-    , {doMatchMode, 109 /* m */, 49,0,  TRUE}     //  38 
-    , {doMatchMode, 45 /* - */, 49,0,  TRUE}     //  39 
+    , {doBeginMatchMode, 105 /* i */, 49,0,  FALSE}     //  35 
+    , {doBeginMatchMode, 109 /* m */, 49,0,  FALSE}     //  36 
+    , {doBeginMatchMode, 115 /* s */, 49,0,  FALSE}     //  37 
+    , {doBeginMatchMode, 120 /* x */, 49,0,  FALSE}     //  38 
+    , {doBeginMatchMode, 45 /* - */, 49,0,  FALSE}     //  39 
    , {doConditionalExpr, 40 /* ( */, 101,0,  TRUE}     //  40 
    , {doPerlInline, 123 /* { */, 101,0,  TRUE}     //  41 
    , {doBadOpenParenType, 255, 101,0,  FALSE}     //  42 
@ -151,12 +154,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doMismatchedParenErr, 253, 101,0,  FALSE}     //  47 
    , {doNOP, 255, 46,0,  TRUE}     //  48 
    , {doMatchMode, 105 /* i */, 49,0,  TRUE}     //  49      paren-flag
-    , {doMatchMode, 115 /* s */, 49,0,  TRUE}     //  50 
-    , {doMatchMode, 109 /* m */, 49,0,  TRUE}     //  51 
+    , {doMatchMode, 109 /* m */, 49,0,  TRUE}     //  50 
+    , {doMatchMode, 115 /* s */, 49,0,  TRUE}     //  51 
    , {doMatchMode, 120 /* x */, 49,0,  TRUE}     //  52 
    , {doMatchMode, 45 /* - */, 49,0,  TRUE}     //  53 
-    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  54 
-    , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE}     //  55 
+    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  54 
+    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  55 
    , {doNOP, 255, 101,0,  FALSE}     //  56 
    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  57      quant-star
    , {doPossesiveStar, 43 /* + */, 20,0,  TRUE}     //  58 
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -132,11 +132,11 @@ open-paren-extended:
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment   ^term
-    'i'                  n  paren-flag                              doMatchMode
-    'x'                  n  paren-flag                              doMatchMode
-    's'                  n  paren-flag                              doMatchMode
-    'm'                  n  paren-flag                              doMatchMode
-    '-'                  n  paren-flag                              doMatchMode
+    'i'                     paren-flag                              doBeginMatchMode
+    'm'                     paren-flag                              doBeginMatchMode
+    's'                     paren-flag                              doBeginMatchMode
+    'x'                     paren-flag                              doBeginMatchMode
+    '-'                     paren-flag                              doBeginMatchMode
    '('                  n  errorDeath                              doConditionalExpr
    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType
@ -157,16 +157,16 @@ paren-comment:
    default              n  paren-comment

 #
-#  paren-flag    Scanned a (?ismx-ismx  flag setting thing
-#                TODO:  this is not fully implemented yet.
+#  paren-flag    Scanned a (?ismx-ismx  flag setting 
+#                 
 paren-flag:
    'i'                  n  paren-flag                              doMatchMode
-    's'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
+    's'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
-    ')'                  n  term
-    ':'                  n  term              ^expr-quant           doOpenNonCaptureParen
+    ')'                  n  term                                    doSetMatchMode
+    ':'                  n  term              ^expr-quant           doMatchModeParen
    default                 errorDeath
    
    
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -117,9 +117,11 @@ enum {
                               //   First Operand:  Index of start of string in string literals
                               //   Second Operand (next word in compiled code):
                               //     the length of the string.
-     URX_BACKREF_I     = 41    // Case insensitive back reference.
+     URX_BACKREF_I     = 41,   // Case insensitive back reference.
                               //   Parameter is the index of the
                               //   capture group variables in the state stack frame.
+     URX_DOLLAR_M      = 42,   // $ in multi-line mode.
+     URX_CARET_M       = 43    // ^ in multi-line mode.
 };           

 // Keep this list of opcode names in sync with the above enum
@ -166,7 +168,9 @@ enum {
        "LA_END",              \
        "ONECHAR_I",           \
        "STRING_I",            \
-        "BACKREF_I"
+        "BACKREF_I",           \
+        "DOLLAR_M",            \
+        "CARET_M"

 //
 //  Convenience macros for assembling and disassembling a compiled operation.
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -810,17 +810,52 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {

            fp = (REStackFrame *)fStack->popFrame(frameSize);

-            // TODO:  support for multi-line mode.
            break;


-        case URX_CARET:                    //  ^, test for start of line
+         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
+             {
+                 if (fp->fInputIdx >= inputLen) {
+                     // We really are at the end of input.  Success.
+                     break;
+                 }
+                 // If we are positioned just before a new-line , succeed.
+                 // It makes no difference where the new-line is within the input.
+                 UChar32 c = inputBuf[fp->fInputIdx];
+                 if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                     break;                         // At a new-line.  Success
+                 }
+                 // not at a new line.  Fail.
+                 fp = (REStackFrame *)fStack->popFrame(frameSize);
+             }
+             break;
+
+
+       case URX_CARET:                    //  ^, test for start of line
            if (fp->fInputIdx != 0) {
                fp = (REStackFrame *)fStack->popFrame(frameSize);
-            }                              // TODO:  support for multi-line mode.
+            }           
            break;


+       case URX_CARET_M:                   //  ^, test for start of line in multi-line mode
+           {
+               if (fp->fInputIdx == 0) {
+                   // We are at the start of input.  Success.
+                   break;
+               }
+               // Check the character just before the current pos.
+               UChar  c = inputBuf[fp->fInputIdx - 1]; 
+               if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                   //  It's a new-line.  ^ is true.  Success.
+                   break;                        
+               }
+               // Not at the start of a line.  Fail.
+               fp = (REStackFrame *)fStack->popFrame(frameSize);
+           }             
+           break;
+
+
        case URX_BACKSLASH_A:          // Test for start of input
            if (fp->fInputIdx != 0) {
                fp = (REStackFrame *)fStack->popFrame(frameSize);
@ -966,10 +1001,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {

        case URX_DOTANY:
            {
-                // . matches anything
+                // . matches anything, but stops at end-of-line.
                if (fp->fInputIdx >= inputLen) {
                    // At end of input.  Match failed.  Backtrack out.
-                        fp = (REStackFrame *)fStack->popFrame(frameSize);
+                    fp = (REStackFrame *)fStack->popFrame(frameSize);
                    break;
                }
                // There is input left.  Advance over one char, unless we've hit end-of-line
@ -988,20 +1023,20 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
        case URX_DOTANY_ALL:
            {
                // ., in dot-matches-all (including new lines) mode
-                // . matches anything
                if (fp->fInputIdx >= inputLen) {
                    // At end of input.  Match failed.  Backtrack out.
                    fp = (REStackFrame *)fStack->popFrame(frameSize);
                    break;
                }
-                // There is input left.  Advance over one char, unless we've hit end-of-line
-                UChar32 c = fInput->char32At(fp->fInputIdx);
-                fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
-                if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                // There is input left.  Advance over one char, except if we are
+                //   at a cr/lf, advance over both of them.
+                UChar32 c; 
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                if (c==0x0d) {
                    // In the case of a CR/LF, we need to advance over both.
-                    UChar32 nextc = fInput->char32At(fp->fInputIdx);
-                    if (c == 0x0d && nextc == 0x0a) {
-                        fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
+                    UChar nextc = inputBuf[fp->fInputIdx];
+                    if (nextc == 0x0a) {
+                        fp->fInputIdx++;
                    }
                }
            }
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -452,10 +452,14 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_NOP:
    case URX_DOTANY:
    case URX_FAIL:
+    case URX_CARET:
+    case URX_DOLLAR:
    case URX_BACKSLASH_A:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
+    case URX_DOLLAR_M:
+    case URX_CARET_M:
        // Types with no operand field of interest.
        break;
        
@ -468,8 +472,6 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_W:
    case URX_BACKSLASH_Z:
-    case URX_CARET:
-    case URX_DOLLAR:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
@ -485,6 +487,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
+    case URX_BACKREF_I:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -64,13 +64,17 @@ struct REStackFrame;
 enum {
    /** Forces normalization of pattern and strings.  @draft ICU 2.4 */
    UREGEX_CANON_EQ         = 128,
+
    /**  Enable case insensitive matching.  @draft ICU 2.4 */
    UREGEX_CASE_INSENSITIVE = 2,
+
    /**  Allow white space and comments within patterns  @draft ICU 2.4 */
    UREGEX_COMMENTS         = 4,
+
    /**  If set, '.' matches line terminators,  otherwise '.' matching stops at line end.
      *  @draft ICU 2.4 */
    UREGEX_DOTALL           = 32,
+
    /**   Control behavior of "$" and "^"
      *    If set, recognize line terminators within string,
      *    otherwise, match only at start and end of input string.
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -1228,10 +1228,6 @@ void RegexTest::Errors() {
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

-    // Flag settings not yet implemented
-    REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED);
-    REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
-
    // Look-ahead, Look-behind
    REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED);   // look-behind
    REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED);   // negated look-behind
@ -1666,10 +1662,16 @@ void RegexTest::PerlTests() {
            }

            else if (perlExpr.startsWith("\\")) {    // \Escape.  Take following char as a literal.
+                                                     //           or as an escaped sequence (e.g. \n)
                if (perlExpr.length() > 1) {
                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
                }
-                resultString.append(perlExpr.charAt(0));
+                UChar c = perlExpr.charAt(0);
+                switch (c) {
+                case 'n':   c = '\n'; break;
+                // add any other escape sequences that show up in the test expected results.
+                }
+                resultString.append(c); 
                perlExpr.remove(0, 1);
            }

@ -1693,6 +1695,8 @@ void RegexTest::PerlTests() {
        UnicodeString expectedS(fields[4]);
        expectedS.findAndReplace(nulnulSrc, nulnul);
        expectedS.findAndReplace(ffffSrc,   ffff);
+        expectedS.findAndReplace("\\n", "\n");
+

        if (expectedS.compare(resultString) != 0) {
            errln("Line %d: Incorrect perl expression results.  Expected \"%s\"; got \"%s\"",
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -210,5 +210,9 @@

 # Case Insensitive
 "aBc"                    i      "<0>ABC</0>"      
-#"a[^bc]d"               i       "ABD"                      #  TODO:   case closure bug  
-'((((((((((a))))))))))\10' i     "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
+#"a[^bc]d"               i      "ABD"                      #  TODO:   case closure bug  
+'((((((((((a))))))))))\10' i    "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
+
+"(?:(?i)a)b"                    "<0>Ab</0>"
+"ab(?i)cd"	                 "<0>abCd</0>"
+"ab$cd"                         "abcd"