scuffed-code/icu4c/source/i18n/regexcmp.cpp


//
//  file:  regexcmp.cpp
//
//  Copyright (C) 2002, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the ICU regular expression compiler, which is responsible
//  for processing a regular expression pattern into the compiled form that
//  is used by the match finding engine.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "uprops.h"
#include "cmemory.h"
#include "cstring.h"
#include "uvectr32.h"
#include "uassert.h"
#include "ucln_in.h"
#include "mutex.h"

#include "regeximp.h"
#include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
#include "regexcmp.h"


U_NAMESPACE_BEGIN

//----------------------------------------------------------------------------------------
//
// Unicode Sets for each of the character classes needed for parsing a regex pattern.
//               (Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.)
//
//              The sets are referred to by name in the regexcst.txt, which is the
//              source form of the state transition table.  These names are converted
//              to indicies in regexcst.h by the perl state table building script regexcst.pl.
//              The indices are used to access the array gRuleSets.
//
//----------------------------------------------------------------------------------------

// "Rule Char" Characters are those with no special meaning, and therefore do not
//    need to be escaped to appear as literals in a regexp.  Expressed
//    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
static const UChar gRuleSet_rule_char_pattern[]       = {
 //   [    ^      \     *     \     ?     \     +     \     [     \     (     /     )
    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
 //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};


static const UChar gRuleSet_digit_char_pattern[] = {
//    [    0      -    9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
static const UnicodeSet *gRuleDigits = NULL;


static UnicodeSet  *gRuleSets[10];         // Array of ptrs to the actual UnicodeSet objects.
static UnicodeSet  *gUnescapeCharSet;

//
//   Here are the backslash escape characters that ICU's unescape() function
//    will handle.
//
static const UChar gUnescapeCharPattern[] = {
//    [     a     c     e     f     n     r     t     u     U     ]
    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


//
//  White space characters that may appear within a pattern in free-form mode
//
static const UChar gRuleWhiteSpacePattern[] = {
    /* "[[:Cf:][:WSpace:]]" */
    91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
        83, 112, 97, 99, 101, 58, 93, 93, 0 };


//
//  Unicode Set Definitions for Regular Expression  \w
//
static const UChar gIsWordPattern[] = {
//    [     \     p     {     L     l     }     \     p     {     L     u     }
    0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
//          \     p     {     L     t     }     \     p     {     L     o     }
          0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
//          \     p     {     N     d     }     ]
          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};


//
//  Unicode Set Definitions for Regular Expression  \s
//
    static const UChar gIsSpacePattern[] = {
//    [     \     t     \     n     \     f     \     r     \     p     {     Z     }     ]
    0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d,  0};


//
//  UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
    static const UChar gGC_ControlPattern[] = {
//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]
    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
//    [     :     C     c     :     ]     [     :     C     f     :     ]     ]
    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};

    static const UChar gGC_ExtendPattern[] = {
//    [     [     :     M     n     :     ]     [     :     M     e     :     ]
    0x5b, 0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d, 0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,
//    \     u     f     f     9     e     -     \     u     f     f     9     f     ]
    0x5c, 0x75, 0x66, 0x66, 0x39, 0x65, 0x2d, 0x5c, 0x75, 0x66, 0x66, 0x39, 0x66, 0x5d, 0};

    static const UChar gGC_LPattern[] = {
//    [     \     u     1     1     0     0     -     \     u     1     1     5     f     ]
    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x30, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x35, 0x66, 0x5d, 0};

    static const UChar gGC_VPattern[] = {
//    [     \     u     1     1     6     0     -     \     u     1     1     a     2     ]
    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x36, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x32, 0x5d, 0};

    static const UChar gGC_TPattern[] = {
//    [     \     u     1     1     a     8     -     \     u     1     1     f     9     ]
    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};


static UnicodeSet *gPropSets[URX_LAST_SET];


//----------------------------------------------------------------------------------------
//
//   ThreadSafeUnicodeSetInit   Thread safe creation of a shared UnicodeSet.
//
//----------------------------------------------------------------------------------------
static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UErrorCode &status) {
    if (*pSet == NULL) {
        UnicodeSet *t = new UnicodeSet(pattern, status);
        if (U_FAILURE(status)) {
            delete t;
            return;
        }
        if (t == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        Mutex  lock;
        if (*pSet == NULL) {
            *pSet = t;
        } else {
            delete t;
        }
    }
}


//----------------------------------------------------------------------------------------
//
//   InitGraphemeClusterSets   Initialize the constant UnicodeSets needed for the
//                             determination of Grapheme Cluster boundaries.
//
//----------------------------------------------------------------------------------------
static void InitGraphemeClusterSets() {
    UErrorCode status = U_ZERO_ERROR;     // TODO:  some sort of error handling needed.
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND],       gGC_ExtendPattern,           status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL],      gGC_ControlPattern,          status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L],            gGC_LPattern,                status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V],            gGC_VPattern,                status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T],            gGC_TPattern,                status);

    if (gPropSets[URX_GC_NORMAL] == NULL) {

        //
        // These sets  are dynamically constructed, because their
        //   intialization strings would be unreasonable.
        //
        UnicodeSet *LV     = new UnicodeSet;;
        UnicodeSet *LVT    = new UnicodeSet;
        UnicodeSet *Normal = new UnicodeSet;


        // The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
        // Categorize these as LV or LVT, using the decomposition algorithm from
        // the Unicode Standard 3.0, section 3.11
        const int32_t TCount = 28;
        UChar  c;
        for (c=0xac00; c<0xd7a4; c+=TCount) {
            LV->add(c);
        }
        LVT->add(0xac00, 0xd7a3);
        LVT->removeAll(*LV);


        //
        //  "Normal" is the set of characters that don't need special handling
        //            when finding grapheme cluster boundaries.
        //
        Normal->complement();
        Normal->remove(0xac00, 0xd7a4);
        Normal->removeAll(*gPropSets[URX_GC_CONTROL]);
        Normal->removeAll(*gPropSets[URX_GC_L]);
        Normal->removeAll(*gPropSets[URX_GC_V]);
        Normal->removeAll(*gPropSets[URX_GC_T]);

        //
        //  Thread Safe initialization of the global pointers to these sets.
        //
        Mutex  lock;
        if (gPropSets[URX_GC_NORMAL] == NULL) {
            gPropSets[URX_GC_NORMAL] = Normal;
            gPropSets[URX_GC_LV]     = LV;
            gPropSets[URX_GC_LVT]    = LVT;
        } else {
            delete Normal;
            delete LV;
            delete LVT;
        }
    }
}


//----------------------------------------------------------------------------------------
//
//   caseClose(UnicodeSet)     TODO: replace by the real UnicodeSet memboer function once
//                             Alan has it implemented
//
//----------------------------------------------------------------------------------------
static void caseClose(UnicodeSet *theSet) {
    int32_t   rn;
    int32_t   numRanges = theSet->getRangeCount();

    for (rn=0; rn<numRanges; rn++) {
        UChar32 rs = theSet->getRangeStart(rn);
        UChar32 re = theSet->getRangeEnd(rn);
        UChar32 c;
        for (c=rs; c<=re; c++) {
            UChar32 lowerC = u_tolower(c);
            UChar32 upperC = u_toupper(c);
            if (lowerC != c) {
                theSet->add(lowerC);
            }
            if (upperC != c) {
                theSet->add(upperC);
            }
        }
    }
}


//----------------------------------------------------------------------------------------
//
//  Constructor.
//
//----------------------------------------------------------------------------------------
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
{
    fStatus           = &status;

    fRXPat            = rxp;
    fScanIndex        = 0;
    fNextIndex        = 0;
    fPeekChar         = -1;
    fLineNum          = 1;
    fCharNum          = 0;
    fQuoteMode        = FALSE;
    fInBackslashQuote = FALSE;
    fModeFlags        = fRXPat->fFlags;
    fEOLComments      = TRUE;

    fMatchOpenParen   = -1;
    fMatchCloseParen  = -1;

    if (U_FAILURE(status)) {
        return;
    }

    //
    //  Register the I18n library for cleanup,
    //     but only if we haven't initialized our globals yet.
    if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
        ucln_i18n_registerCleanup();
    }

    //
    //  Set up the constant (static) Unicode Sets.
    //    TODO:  something cleaner for that -128 constant.
    //
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128],   gRuleSet_rule_char_pattern,  status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern,      status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128],  gRuleSet_digit_char_pattern, status);
    gRuleDigits = gRuleSets[kRuleSet_digit_char-128];
    ThreadSafeUnicodeSetInit(&gUnescapeCharSet,                    gUnescapeCharPattern,        status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET],           gIsWordPattern,              status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);

    InitGraphemeClusterSets();
}


//----------------------------------------------------------------------------------------
//
//  Destructor
//
//----------------------------------------------------------------------------------------
RegexCompile::~RegexCompile() {
}


//----------------------------------------------------------------------------------------
//
//   cleanup.    Called (indirectly) by u_cleanup to free all cached memory
//
//----------------------------------------------------------------------------------------
void RegexCompile::cleanup() {
    delete gRuleSets[kRuleSet_rule_char-128];
    delete gRuleSets[kRuleSet_white_space-128];
    delete gRuleSets[kRuleSet_digit_char-128];
    delete gUnescapeCharSet;
    gRuleSets[kRuleSet_rule_char-128]   = NULL;
    gRuleSets[kRuleSet_white_space-128] = NULL;
    gRuleSets[kRuleSet_digit_char-128]  = NULL;
    gUnescapeCharSet = NULL;
    int i;
    for (i=0; i<URX_LAST_SET; i++) {
        delete (UnicodeSet *)gPropSets[i];
        gPropSets[i] = NULL;
    }
    return;
}


//---------------------------------------------------------------------------------
//
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
//                           The state tables are hand-written in the file regexcst.txt,
//                           and converted to the form used here by a perl
//                           script regexcst.pl
//
//---------------------------------------------------------------------------------
void    RegexCompile::compile(
                         const UnicodeString &pat,   // Source pat to be compiled.
                         UParseError &pp,            // Error position info
                         UErrorCode &e)              // Error Code
{
    fStatus             = &e;
    fParseErr           = &pp;
    fStackPtr           = 0;
    fStack[fStackPtr]   = 0;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    U_ASSERT(fRXPat->fPattern.length() == 0);

    // Prepare the RegexPattern object to receive the compiled pattern.
    fRXPat->fPattern        = pat;
    fRXPat->fStaticSets     = gPropSets;


    // Initialize the pattern scanning state machine
    fPatternLength = pat.length();
    uint16_t                state = 1;
    const RegexTableEl      *tableEl;
    nextChar(fC);                        // Fetch the first char from the pattern string.

    //
    // Main loop for the regex pattern parsing state machine.
    //   Runs once per state transition.
    //   Each time through optionally performs, depending on the state table,
    //      - an advance to the the next pattern char
    //      - an action to be performed.
    //      - pushing or popping a state to/from the local state return stack.
    //   file regexcst.txt is the source for the state table.  The logic behind
    //     recongizing the pattern syntax is there, not here.
    //
    for (;;) {
        //  Bail out if anything has gone wrong.
        //  Regex pattern parsing stops on the first error encountered.
        if (U_FAILURE(*fStatus)) {
            break;
        }

        U_ASSERT(state != 0);

        // Find the state table element that matches the input char from the pattern, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
        REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d)    state=%s ",
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);

        for (;;) {    // loop through table rows belonging to this state, looking for one
                      //   that matches the current input char.
            REGEX_SCAN_DEBUG_PRINTF( ".");
            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not quoted, and
                //   the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
                // Table row specified "quoted" and the char was quoted.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                fC.fQuoted == FALSE &&                                       //   char is not escaped &&
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    //   and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
        REGEX_SCAN_DEBUG_PRINTF("\n");

        //
        // We've found the row of the state table that matches the current input
        //   character from the rules string.
        // Perform any action specified  by this row in the state table.
        if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            //   the action signalled some kind of error, or
            //   the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_REGEX_INTERNAL_ERROR);
                REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        //
        //  NextChar.  This is where characters are actually fetched from the pattern.
        //             Happens under control of the 'n' tag in the state table.
        //
        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        //   state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                // state stack underflow
                // This will occur if the user pattern has mis-matched parentheses,
                //   with extra close parens.
                //
                fStackPtr++;
                error(U_REGEX_MISMATCHED_PAREN);
            }
        }

    }

    //
    // The pattern has now been read and processed, and the compiled code generated.
    //

    // Back-reference fixup
    //
    int32_t loc;
    for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
        int32_t opType = URX_TYPE(op);
        if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
            int32_t where = URX_VAL(op);
            if (where > fRXPat->fGroupMap->size()) {
                error(U_REGEX_INVALID_BACK_REF);
                break;
            }
            where = fRXPat->fGroupMap->elementAti(where-1);
            op    = URX_BUILD(opType, where);
            fRXPat->fCompiledPat->setElementAt(op, loc);
        }
    }


    //
    // Compute the number of digits requried for the largest capture group number.
    //
    fRXPat->fMaxCaptureDigits = 1;
    int32_t  n = 10;
    for (;;) {
        if (n > fRXPat->fGroupMap->size()) {
            break;
        }
        fRXPat->fMaxCaptureDigits++;
        n *= 10;
    }

    //
    // The pattern's fFrameSize so far has accumulated the requirements for
    //   storage for capture parentheses, counters, etc. that are encountered
    //   in the pattern.  Add space for the two variables that are always
    //   present in the saved state:  the input string position and the
    //   position in the compiled pattern.
    //
    fRXPat->fFrameSize+=2;

    //
    // A stupid bit of non-sense to prevent code coverage testing from complaining
    //   about the pattern.dump() debug function.  Go through the motions of dumping,
    //   even though, without the #define set, it will do nothing.
    //
#ifndef REGEX_DUMP_DEBUG
    static UBool phonyDumpDone = FALSE;
    if (phonyDumpDone==FALSE) {
        fRXPat->dump();
        phonyDumpDone = TRUE;
    }
#endif

}


//----------------------------------------------------------------------------------------
//
//  doParseAction        Do some action during regex pattern parsing.
//                       Called by the parse state machine.
//
//
//----------------------------------------------------------------------------------------
UBool RegexCompile::doParseActions(EParseAction action)
{
    UBool   returnVal = TRUE;

    switch ((Regex_PatternParseAction)action) {

    case doPatStart:
        // Start of pattern compiles to:
        //0   SAVE   2        Fall back to position of FAIL
        //1   jmp    3
        //2   FAIL            Stop if we ever reach here.
        //3   NOP             Dummy, so start of pattern looks the same as
        //                    the start of an ( grouping.
        //4   NOP             Resreved, will be replaced by a save if there are
        //                    OR | operators at the top level
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);

        fParenStack.push(-1, *fStatus);     // Begin a Paren Stack Frame
        fParenStack.push( 3, *fStatus);     // Push location of first NOP
        break;

    case doPatFinish:
        // We've scanned to the end of the pattern
        //  The end of pattern compiles to:
        //        URX_END
        //    which will stop the runtime match engine.
        //  Encountering end of pattern also behaves like a close paren,
        //   and forces fixups of the State Save at the beginning of the compiled pattern
        //   and of any OR operations at the top level.
        //
        handleCloseParen();
        if (fParenStack.size() > 0) {
            // Missing close paren in pattern.
            error(U_REGEX_MISMATCHED_PAREN);
        }

        // add the END operation to the compiled pattern.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);

        // Terminate the pattern compilation state machine.
        returnVal = FALSE;
        break;


    case doOrOperator:
        // Scanning a '|', as in (A|B)
        {
            // Insert a SAVE operation at the start of the pattern section preceding
            //   this OR at this level.  This SAVE will branch the match forward
            //   to the right hand side of the OR in the event that the left hand
            //   side fails to match and backtracks.  Locate the position for the
            //   save from the location on the top of the parentheses stack.
            int32_t savePosition = fParenStack.popi();
            int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
            fRXPat->fCompiledPat->setElementAt(op, savePosition);

            // Append an JMP operation into the compiled pattern.  The operand for
            //  the JMP will eventually be the location following the ')' for the
            //  group.  This will be patched in later, when the ')' is encountered.
            op = URX_BUILD(URX_JMP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Push the position of the newly added JMP op onto the parentheses stack.
            // This registers if for fixup when this block's close paren is encountered.
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);

            // Append a NOP to the compiled pattern.  This is the slot reserved
            //   for a SAVE in the event that there is yet another '|' following
            //   this one.
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
        }
        break;


    case doOpenCaptureParen:
        // Open Paren.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        //
        //    Each capture group gets three slots in the save stack frame:
        //         0:   Capture Group start position (in input string being matched.)
        //         1:   Capture Group end   positino.
        //         2:   Start of Match-in-progress.
        //    The first two locations are for a completed capture group, and are
        //     referred to by back references and the like.
        //    The third location stores the capture start position when an START_CAPTURE is
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
        //      END_CAPure is encountered.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
            fRXPat->fFrameSize += 3;
            int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
            fRXPat->fCompiledPat->addElement(cop, *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(capturing, *fStatus);                        // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc

            // Save the mapping from group number to stack frame variable position.
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
        }
         break;

    case doOpenNonCaptureParen:
        // Open non-caputuring (grouping only) Paren.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
        }
         break;


    case doOpenAtomicParen:
        // Open Atomic Paren.  (?>
        //   Compile to a
        //      - NOP, which later may be replaced if the parenthesized group
        //         has a quantifier, followed by
        //      - STO_SP  save state stack position, so it can be restored at the ")"
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
            fRXPat->fDataSize += 1;                    //  state stack ptr.
            int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc);
            fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(atomic, *fStatus);                           // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
        break;


    case doOpenLookAhead:
        // Positive Look-ahead   (?=  stuff  )
        // Compiles to
        //    1    START_LA     dataLoc
        //    2.   NOP              reserved for use by quantifiers on the block.
        //                          Look-ahead can't have quantifiers, but paren stack
        //                             compile time conventions require the slot anyhow.
        //    3.   NOP              may be replaced if there is are '|' ops in the block.
        //    4.     code for parenthesized stuff.
        //    5.   ENDLA
        //
        //  Two data slots are reserved, for saving the stack ptr and the input position.
        {
            int32_t dataLoc = fRXPat->fDataSize;
            fRXPat->fDataSize += 2;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the NOPs.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
        }
        break;

    case doOpenLookAheadNeg:
        // Negated Lookahead.   (?! stuff )
        // Compiles to
        //    1.    START_LA    dataloc
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
        //                                //   which continues with the match.
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
        //    4.       code for parenthesized stuff.
        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
        //    6.    FAIL                  // code in block succeeded, so neg. lookahead fails.
        //    7.    ...
        {
            int32_t dataLoc = fRXPat->fDataSize;
            fRXPat->fDataSize += 2;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the StateSave and NOP.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push( negLookAhead, *fStatus);                    // Frame type
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location

            // Instructions #5 and #6 will be added when the ')' is encountered.
        }
        break;

    case doOpenLookBehind:
        // Open Paren.
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookBehindNeg:
        // Open Paren.
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doConditionalExpr:
        // Conditionals such as (?(1)a:b)
    case doPerlInline:
        // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doCloseParen:
        handleCloseParen();
        if (fParenStack.size() <= 0) {
            //  Extra close paren, or missing open paren.
            error(U_REGEX_MISMATCHED_PAREN);
        }
        break;

    case doNOP:
        break;


    case doBadOpenParenType:
    case doRuleError:
        error(U_REGEX_RULE_SYNTAX);
        returnVal = FALSE;
        break;


    case doMismatchedParenErr:
        error(U_REGEX_MISMATCHED_PAREN);
        returnVal = FALSE;
        break;

    case doPlus:
        //  Normal '+'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   state-save  4
        //     3.   jmp 1
        //     4.   ...
        {
            int32_t   topLoc = blockTopLoc(FALSE);        // location of item #1

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the +   (4 in the comment above)
            int32_t continueLoc = fRXPat->fCompiledPat->size()+2;

            // Emit the STATE_SAVE
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);

            // Emit the JMP
            int32_t jmpOp = URX_BUILD(URX_JMP, topLoc);
            fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
        }
        break;

    case doNGPlus:
        //  Non-greedy '+?'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   state-save  1
        //     3.   ...
        {
            int32_t topLoc      = blockTopLoc(FALSE);
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
        }
        break;


    case doOpt:
        // Normal (greedy) ? quantifier.
        //  Compiles to
        //     1. state save 3
        //     2.    body of optional block
        //     3. ...
        // Insert the state save into the compiled pattern, and we're done.
        {
            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
        }
        break;

    case doNGOpt:
        // Non-greedy ?? quantifier
        //   compiles to
        //    1.  jmp   4
        //    2.     body of optional block
        //    3   jmp   5
        //    4.  state save 2
        //    5    ...
        //  This code is less than ideal, with two jmps instead of one, because we can only
        //  insert one instruction at the top of the block being iterated.
        {
            int32_t  jmp1_loc = blockTopLoc(TRUE);
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();

            int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);

            int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
            fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);

            int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
            fRXPat->fCompiledPat->addElement(save_op, *fStatus);
        }
        break;


    case doStar:
        // Normal (greedy) * quantifier.
        // Compiles to
        //       1.   STATE_SAVE   4
        //       2.      body of stuff being iterated over
        //       3.   JMP  1
        //       4.   ...
        //
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
        //       1.   STATE_SAVE   6
        //       2.   POS_SAVE     data-loc
        //       3.      body of stuff
        //       4.   JMPX    1
        //       5            data-loc   (extra operand of JMPX)
        //       6.   ...
        {
            // location of item #1, the STATE_SAVE
            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   dataLoc = -1;

            if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) {
                insertOp(saveStateLoc);
                dataLoc =  fRXPat->fFrameSize;
                fRXPat->fFrameSize++;

                int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
            }

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the *.   (4 in the comment above)
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
            if (dataLoc != -1) {
                continueLoc++;
            }

            // Put together the save state op store it into the compiled code.
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);

            // Append the URX_JMP or URX_JMPX operation to the compiled pattern.  Its target
            // is the locaton of the state-save, above.
            if (dataLoc == -1) {
                int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
                fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
            } else {
                int32_t op = URX_BUILD(URX_JMPX, saveStateLoc);
                fRXPat->fCompiledPat->addElement(op, *fStatus);
                op = URX_BUILD(URX_RESERVED_OP, dataLoc);
                fRXPat->fCompiledPat->addElement(op, *fStatus);
            }

        }
        break;

    case doNGStar:
        // Non-greedy *? quantifier
        // compiles to
        //     1.   JMP    3
        //     2.      body of stuff being iterated over
        //     3.   STATE_SAVE  2
        //     4    ...
        {
            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
            int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
            int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
        }
        break;


    case doIntervalInit:
        // The '{' opening an interval quantifier was just scanned.
        // Init the counter varaiables that will accumulate the values as the digits
        //    are scanned.
        fIntervalLow = 0;
        fIntervalUpper = -1;
        break;

    case doIntevalLowerDigit:
        // Scanned a digit from the lower value of an {lower,upper} interval
        {
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            fIntervalLow = fIntervalLow*10 + digitValue;
            if (fIntervalLow < 0) {
                error(U_REGEX_NUMBER_TOO_BIG);
            }
        }
        break;

    case doIntervalUpperDigit:
        // Scanned a digit from the upper value of an {lower,upper} interval
        {
            if (fIntervalUpper < 0) {
                fIntervalUpper = 0;
            }
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            fIntervalUpper = fIntervalUpper*10 + digitValue;
            if (fIntervalLow < 0) {
                error(U_REGEX_NUMBER_TOO_BIG);
            }
        }
        break;

    case doIntervalSame:
        // Scanned a single value interval like {27}.  Upper = Lower.
        fIntervalUpper = fIntervalLow;
        break;

    case doInterval:
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
        break;

    case doPossesiveInterval:
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
        break;

    case doNGInterval:
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
        break;

    case doIntervalError:
        error(U_REGEX_BAD_INTERVAL);
        break;

    case doLiteralChar:
        // We've just scanned a "normal" character from the pattern,
        literalChar();
        break;


    case doDotAny:
        // scanned a ".",  match any single character.
        {
            int32_t   op;
            if (fModeFlags & UREGEX_DOTALL) {
                op = URX_BUILD(URX_DOTANY_ALL, 0);
            } else {
                op = URX_BUILD(URX_DOTANY, 0);
            }
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case doCaret:
        {
            int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
        }
        break;


    case doDollar:
        {
            int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
        }
        break;

    case doBackslashA:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
        break;

    case doBackslashB:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
        break;

    case doBackslashb:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
        break;

    case doBackslashD:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
        break;

    case doBackslashd:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
        break;

    case doBackslashG:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
        break;

    case doBackslashS:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
        break;

    case doBackslashs:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
        break;

    case doBackslashW:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
        break;

    case doBackslashw:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
        break;

    case doBackslashX:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
        break;

    case doBackslashx:              // \x{abcd}   alternate hex format
        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doBackslashZ:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
        break;

    case doBackslashz:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
        break;

    case doEscapeError:
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
        break;

    case doExit:
        returnVal = FALSE;
        break;

    case doProperty:
        {
            UnicodeSet *theSet = scanProp();
            compileSet(theSet);
        }
        break;


    case doScanUnicodeSet:
        {
            UnicodeSet *theSet = scanSet();
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && theSet != NULL) {
                caseClose(theSet);   // TODO:  replace with the real function.
                // theSet->closeOver(USET_CASE);
            }
            compileSet(theSet);
        }
        break;

    case doEnterQuoteMode:
        // Just scanned a \Q.  Put character scanner into quote mode.
        fQuoteMode = TRUE;
        break;

    case doBackRef:
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
        //                 the regular expression, because the number of digits to be consumed
        //                 depends on the number of capture groups that have been defined.  So
        //                 we have to do it here instead.
        {
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
            int32_t  groupNum = 0;
            UChar32  c        = fC.fChar;

            for (;;) {
                // Loop once per digit, for max allowed number of digits in a back reference.
                int32_t digit = u_charDigitValue(c);
                groupNum = groupNum * 10 + digit;
                if (groupNum >= numCaptureGroups) {
                    break;
                }
                c = peekCharLL();
                if (gRuleDigits->contains(c) == FALSE) {
                    break;
                }
                nextCharLL();
            }

            // Scan of the back reference in the source regexp is complete.  Now generate
            //  the compiled code for it.
            // Because capture groups can be forward-referenced by back-references,
            //  we fill the operand with the capture group number.  At the end
            //  of compilation, it will be changed to the variables location.
            U_ASSERT(groupNum > 0);
            int32_t  op;
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
                op = URX_BUILD(URX_BACKREF_I, groupNum);
            } else {
                op = URX_BUILD(URX_BACKREF, groupNum);
            }
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;


    case doOctal:
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doNamedChar:            // \N{NAMED_CHAR}
        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doPossesivePlus:
        // Possessive ++ quantifier.
        // Compiles to
        //       1.   STO_SP
        //       2.      body of stuff being iterated over
        //       3.   STATE_SAVE 5
        //       4.   JMP        2
        //       5.   LD_SP
        //       6.   ...
        //
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
        //                then unconditionally discarded.  Perhaps introduce a new opcode
        //
        {
            // Emit the STO_SP
            int32_t   topLoc = blockTopLoc(TRUE);
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the STATE_SAVE
            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the JMP
            op = URX_BUILD(URX_JMP, topLoc+1);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the LD_SP
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case doPossesiveStar:
        // Possessive *+ quantifier.
        // Compiles to
        //       1.   STO_SP       loc
        //       2.   STATE_SAVE   5
        //       3.      body of stuff being iterated over
        //       4.   JMP          2
        //       5.   LD_SP        loc
        //       6    ...
        // TODO:  do something to cut back the state stack each time through the loop.
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(TRUE);
            insertOp(topLoc);

            // emit   STO_SP     loc
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE   5
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
            op = URX_BUILD(URX_STATE_SAVE, L7);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Append the JMP operation.
            op = URX_BUILD(URX_JMP, topLoc+1);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the LD_SP       loc
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case doPossesiveOpt:
        // Possessive  ?+ quantifier.
        //  Compiles to
        //     1. STO_SP      loc
        //     2. SAVE_STATE  5
        //     3.    body of optional block
        //     4. LD_SP       loc
        //     5. ...
        //
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(TRUE);
            insertOp(topLoc);

            // Emit the STO_SP
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
            op = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Emit the LD_SP
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;


    case doBeginMatchMode:
        fNewModeFlags = fModeFlags;
        fSetModeFlag  = TRUE;
        break;

    case doMatchMode:   //  (?i)    and similar
        {
            int32_t  bit = 0;
            switch (fC.fChar) {
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
            case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
            default:
                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
                                   // by the scanner.
            }
            if (fSetModeFlag) {
                fNewModeFlags |= bit;
            } else {
                fNewModeFlags &= ~bit;
            }
        }
        break;

    case doSetMatchMode:
        // We've got a (?i) or similar.  The match mode is being changed, but
        //   the change is not scoped to a parenthesized block.
        fModeFlags = fNewModeFlags;

        // Prevent any string from spanning across the change of match mode.
        //   Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
        fixLiterals();
        break;


    case doMatchModeParen:
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
        //   mode flags so they can be restored at the close of the block.
        //
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs (a normal non-capturing () frame, except for the
            //   saving of the orignal mode flags.)
            fParenStack.push(fModeFlags, *fStatus);
            fParenStack.push(flags, *fStatus);                            // Frame Marker
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP

            // Set the current mode flags to the new values.
            fModeFlags = fNewModeFlags;
        }
        break;

    case doSuppressComments:
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
        // treating a '#' as a to-the-end-of-line comment.
        //   (This Perl compatibility just gets uglier and uglier to do...)
        fEOLComments = FALSE;
        break;


    default:
        U_ASSERT(FALSE);
        error(U_REGEX_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
    }
    return returnVal;
};


//------------------------------------------------------------------------------
//
//   literalChar           We've encountered a literal character from the pattern,
//                             or an escape sequence that reduces to a character.
//                         Add it to the string containing all literal chars/strings from
//                             the pattern.
//                         If we are in a pattern string already, add the new char to it.
//                         If we aren't in a pattern string, begin one now.
//
//------------------------------------------------------------------------------
void RegexCompile::literalChar()  {
    int32_t           op;            // An operation in the compiled pattern.
    int32_t           opType;
    int32_t           patternLoc;   // A position in the compiled pattern.
    int32_t           stringLen;


    // If the last thing compiled into the pattern was not a literal char,
    //   force this new literal char to begin a new string, and not append to the previous.
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) {
        fixLiterals();
    }

    if (fStringOpStart == -1) {
        // First char of a string in the pattern.
        // Emit a OneChar op into the compiled pattern.
        emitONE_CHAR(fC.fChar);

        // Also add it to the string pool, in case we get a second adjacent literal
        //   and want to change form ONE_CHAR to STRING
        fStringOpStart = fRXPat->fLiteralText.length();
        fRXPat->fLiteralText.append(fC.fChar);
        return;
    }

    // We are adding onto an existing string
    fRXPat->fLiteralText.append(fC.fChar);

    // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
    if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
            op     = URX_BUILD(URX_STRING_I, fStringOpStart);
        } else {
            op     = URX_BUILD(URX_STRING, fStringOpStart);
        }
        patternLoc = fRXPat->fCompiledPat->size() - 1;
        fRXPat->fCompiledPat->setElementAt(op, patternLoc);
        op         = URX_BUILD(URX_STRING_LEN, 0);
        fRXPat->fCompiledPat->addElement(op, *fStatus);
    }

    // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the
    //  string length to reflect the new char we just added to the string.
    stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
    op         = URX_BUILD(URX_STRING_LEN, stringLen);
    patternLoc = fRXPat->fCompiledPat->size() - 1;
    fRXPat->fCompiledPat->setElementAt(op, patternLoc);
}


//------------------------------------------------------------------------------
//
//    emitONE_CHAR         emit a ONE_CHAR op into the generated code.
//                         Choose cased or uncased version, depending on the
//                         match mode and whether the character itself is cased.
//
//------------------------------------------------------------------------------
void RegexCompile::emitONE_CHAR(UChar32  c) {
    int32_t op;
    if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
        // We have a cased character, and are in case insensitive matching mode.
        // TODO: replace with a better test.  See Alan L.'s mail of 2/6
        c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
        op = URX_BUILD(URX_ONECHAR_I, c);
    } else {
        // Uncased char, or case sensitive match mode.
        //  Either way, just generate a literal compare of the char.
        op = URX_BUILD(URX_ONECHAR, c);
    }
    fRXPat->fCompiledPat->addElement(op, *fStatus);
}


//------------------------------------------------------------------------------
//
//    fixLiterals           When compiling something that can follow a literal
//                          string in a pattern, we need to "fix" any preceding
//                          string, which will cause any subsequent literals to
//                          begin a new string, rather than appending to the
//                          old one.
//
//                          Optionally, split the last char of the string off into
//                          a single "ONE_CHAR" operation, so that quantifiers can
//                          apply to that char alone.  Example:   abc*
//                          The * must apply to the 'c' only.
//
//------------------------------------------------------------------------------
void    RegexCompile::fixLiterals(UBool split) {
    int32_t  stringStart = fStringOpStart;    // start index of the current literal string
    int32_t  op;                              // An op from/for the compiled pattern.
    int32_t  opType;                          // An opcode type from the compiled pattern.
    int32_t  stringLastCharIdx;
    UChar32  lastChar;
    int32_t  stringNextToLastCharIdx;
    UChar32  nextToLastChar;
    int32_t  stringLen;

    fStringOpStart = -1;
    if (!split) {
        return;
    }

    // Split:  We need to  ensure that the last item in the compiled pattern does
    //   not refer to a literal string of more than one char.  If it does,
    //   separate the last char from the rest of the string.

    // If the last operation from the compiled pattern is not a string,
    //   nothing needs to be done
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    if (opType != URX_STRING_LEN) {
        return;
    }
    stringLen = URX_VAL(op);

    //
    // Find the position of the last code point in the string  (might be a surrogate pair)
    //
    stringLastCharIdx = fRXPat->fLiteralText.length();
    stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
    lastChar          = fRXPat->fLiteralText.char32At(stringLastCharIdx);

    // The string should always be at least two code points long, meaning that there
    //   should be something before the last char position that we just found.
    U_ASSERT(stringLastCharIdx > stringStart);
    stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
    U_ASSERT(stringNextToLastCharIdx >= stringStart);
    nextToLastChar          = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);

    if (stringNextToLastCharIdx > stringStart) {
        // The length of string remaining after removing one char is two or more.
        // Leave the string in the compiled pattern, shorten it by one char,
        //   and append a URX_ONECHAR op for the last char.
        stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
        op = URX_BUILD(URX_STRING_LEN, stringLen);
        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
        emitONE_CHAR(lastChar);
    } else {
        // The original string consisted of exactly two characters.  Replace
        // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
        // of URX_ONECHARs.
        fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
        emitONE_CHAR(nextToLastChar);
        emitONE_CHAR(lastChar);
    }
}


//------------------------------------------------------------------------------
//
//   insertOp()             Insert a slot for a new opcode into the already
//                          compiled pattern code.
//
//                          Fill the slot with a NOP.  Our caller will replace it
//                          with what they really wanted.
//
//------------------------------------------------------------------------------
void   RegexCompile::insertOp(int32_t where) {
    UVector32 *code = fRXPat->fCompiledPat;
    U_ASSERT(where>0 && where < code->size());

    int32_t  nop = URX_BUILD(URX_NOP, 0);
    code->insertElementAt(nop, where, *fStatus);

    // Walk through the pattern, looking for any ops with targets that
    //  were moved down by the insert.  Fix them.
    int32_t loc;
    for (loc=0; loc<code->size(); loc++) {
        int32_t op = code->elementAti(loc);
        int32_t opType = URX_TYPE(op);
        int32_t opValue = URX_VAL(op);
        if ((opType == URX_JMP         ||
            opType == URX_JMPX         ||
            opType == URX_STATE_SAVE   ||
            opType == URX_CTR_LOOP     ||
            opType == URX_CTR_LOOP_NG  ||
            opType == URX_CTR_LOOP_P   ||
            opType == URX_RELOC_OPRND)    && opValue > where) {
            // Target location for this opcode is after the insertion point and
            //   needs to be incremented to adjust for the insertion.
            opValue++;
            op = URX_BUILD(opType, opValue);
            code->setElementAt(op, loc);
        }
    }

    // Now fix up the parentheses stack.  All positive values in it are locations in
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
    for (loc=0; loc<fParenStack.size(); loc++) {
        int32_t x = fParenStack.elementAti(loc);
        if (x>where) {
            x++;
            fParenStack.setElementAt(x, loc);
        }
    }
}


//------------------------------------------------------------------------------
//
//   blockTopLoc()          Find or create a location in the compiled pattern
//                          at the start of the operation or block that has
//                          just been compiled.  Needed when a quantifier (* or
//                          whatever) appears, and we need to add an operation
//                          at the start of the thing being quantified.
//
//                          (Parenthesized Blocks) have a slot with a NOP that
//                          is reserved for this purpose.  .* or similar don't
//                          and a slot needs to be added.
//
//       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
//                                         at the returned location.
//                                 FALSE - just return the address,
//                                         do not reserve a location there.
//
//------------------------------------------------------------------------------
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
    int32_t   theLoc;
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
    {
        // The item just processed is a parenthesized block.
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
        U_ASSERT(theLoc > 0);
        uint32_t  opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
        U_ASSERT(URX_TYPE(opAtTheLoc) == URX_NOP);
    }
    else {
        // Item just compiled is a single thing, a ".", or a single char, or a set reference.
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
        // We need to make space now.
        fixLiterals(TRUE);  // If last item was a string, separate the last char.
        theLoc = fRXPat->fCompiledPat->size()-1;
        if (reserveLoc) {
            int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
            int32_t prevType = URX_TYPE(opAtTheLoc);
            int32_t  nop = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
        }
    }
    return theLoc;
}


//------------------------------------------------------------------------------
//
//    handleCloseParen      When compiling a close paren, we need to go back
//                          and fix up any JMP or SAVE operations within the
//                          parenthesized block that need to target the end
//                          of the block.  The locations of these are kept on
//                          the paretheses stack.
//
//                          This function is called both when encountering a
//                          real ) and at the end of the pattern.
//
//-------------------------------------------------------------------------------
void  RegexCompile::handleCloseParen() {
    int32_t   patIdx;
    int32_t   patOp;
    if (fParenStack.size() <= 0) {
        error(U_REGEX_MISMATCHED_PAREN);
        return;
    }

    // Force any literal chars that may follow the close paren to start a new string,
    //   and not attach to any preceding it.
    fixLiterals(FALSE);

    // Fixup any operations within the just-closed parenthesized group
    //    that need to reference the end of the (block).
    //    (The first one popped from the stack is an unused slot for
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
    for (;;) {
        patIdx = fParenStack.popi();
        if (patIdx < 0) {
            // value < 0 flags the start of the frame on the paren stack.
            break;
        }
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
        patOp = fRXPat->fCompiledPat->elementAti(patIdx);
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
        fMatchOpenParen     = patIdx;
    }

    //  At the close of any parenthesized block, restore the match mode flags  to
    //  the value they had at the open paren.  Saved value is
    //  at the top of the paren stack.
    fModeFlags = fParenStack.popi();

    // DO any additional fixups, depending on the specific kind of
    // parentesized grouping this is

    switch (patIdx) {
    case plain:
    case flags:
        // No additional fixups required.
        //   (Grouping-only parentheses)
        break;
    case capturing:
        // Capturing Parentheses.
        //   Insert a End Capture op into the pattern.
        //   The frame offset of the variables for this cg is obtained from the
        //       start capture op and put it into the end-capture op.
        {
            int32_t   captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);

            int32_t   frameVarLocation = URX_VAL(captureOp);
            int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
            fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
        }
        break;
    case atomic:
        // Atomic Parenthesis.
        //   Insert a LD_SP operation to restore the state stack to the position
        //   it was when the atomic parens were entered.
        {
            int32_t   stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
            int32_t   stoLoc = URX_VAL(stoOp);
            int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
        }
        break;

    case lookAhead:
        {
            int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
            int32_t dataLoc  = URX_VAL(startOp);
            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case negLookAhead:
        {
            // See comment at doOpenLookAheadNeg
            int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
            int32_t dataLoc  = URX_VAL(startOp);
            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
             op              = URX_BUILD(URX_FAIL, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Patch the URX_SAVE near the top of the block.
            int32_t saveOp   = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
            int32_t dest     = fRXPat->fCompiledPat->size();
            saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
        }
        break;


    default:
        U_ASSERT(FALSE);
    }

    // remember the next location in the compiled pattern.
    // The compilation of Quantifiers will look at this to see whether its looping
    //   over a parenthesized block or a single item
    fMatchCloseParen = fRXPat->fCompiledPat->size();
}


//----------------------------------------------------------------------------------------
//
//   compileSet       Compile the pattern operations for a reference to a
//                    UnicodeSet.
//
//----------------------------------------------------------------------------------------
void        RegexCompile::compileSet(UnicodeSet *theSet)
{
    if (theSet == NULL) {
        return;
    }
    int32_t  setSize = theSet->size();
    UChar32  firstSetChar = theSet->charAt(0);
    if (firstSetChar == -1) {
        // Sets that contain only strings, but no individual chars,
        // will end up here.   TODO:  figure out what to with sets containing strings.
        setSize = 0;
    }

    switch (setSize) {
    case 0:
        {
            // Set of no elements.   Always fails to match.
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
            delete theSet;
        }
        break;

    case 1:
        {
            // The set contains only a single code point.  Put it into
            //   the compiled pattern as a single char operation rather
            //   than a set, and discard the set itself.
            int32_t  charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
            delete theSet;
        }
        break;

    default:
        {
            //  The set contains two or more chars.  (the normal case)
            //  Put it into the compiled pattern as a set.
            int32_t setNumber = fRXPat->fSets->size();
            fRXPat->fSets->addElement(theSet, *fStatus);
            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
        }
    }
}


//----------------------------------------------------------------------------------------
//
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
//                      Except for the specific opcodes used, the code is the same
//                      for all three types (greedy, non-greedy, possessive) of
//                      intervals.  The opcodes are supplied as parameters.
//
//----------------------------------------------------------------------------------------
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
{
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
    //   four slots in the compiled code.  Reserve them.
    int32_t   topOfBlock = blockTopLoc(TRUE);
    insertOp(topOfBlock);
    insertOp(topOfBlock);
    insertOp(topOfBlock);

    // The operands for the CTR_INIT opcode include the index in the matcher data
    //   of the counter.  Allocate it now.
    int32_t   counterLoc = fRXPat->fFrameSize;
    fRXPat->fFrameSize++;

    int32_t   op = URX_BUILD(InitOp, counterLoc);
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);

    // The second operand of CTR_INIT is the location following the end of the loop.
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
    //   compilation of something later on causes the code to grow and the target
    //   position to move.
    int32_t loopEnd = fRXPat->fCompiledPat->size();
    op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);

    // Followed by the min and max counts.
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);

    // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
    //   Goes at end of the block being looped over, so just append to the code so far.
    op = URX_BUILD(LoopOp, topOfBlock);
    fRXPat->fCompiledPat->addElement(op, *fStatus);

    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
        error(U_REGEX_MAX_LT_MIN);
    }

}

//----------------------------------------------------------------------------------------
//
//  possibleNullMatch    Test a range of compiled pattern for the possibility that it
//                       might match an empty string.  Used to control the generation
//                       of extra checking code to prevent infinite loops in the match
//                       engine on repeated empty matches, such as might happen with
//                            (x?)*
//                       when the input string is not at an x.
//
//----------------------------------------------------------------------------------------
UBool  RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
    // for now, just return true.  TODO:  make a real implementation
    return TRUE;
}


//----------------------------------------------------------------------------------------
//
//  Error         Report a rule parse error.
//                Only report it if no previous error has been recorded.
//
//----------------------------------------------------------------------------------------
void RegexCompile::error(UErrorCode e) {
    if (U_SUCCESS(*fStatus)) {
        *fStatus = e;
        fParseErr->line  = fLineNum;
        fParseErr->offset = fCharNum;
        fParseErr->preContext[0] = 0;    // TODO:  copy in some input pattern text
        fParseErr->preContext[0] = 0;
    }
}


//
//  Assorted Unicode character constants.
//     Numeric because there is no portable way to enter them as literals.
//     (Think EBCDIC).
//
static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
static const UChar      chLF        = 0x0a;
static const UChar      chNEL       = 0x85;      //    NEL newline variant
static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.
static const UChar      chPound     = 0x23;      // '#', introduces a comment.
static const UChar      chE         = 0x45;      // 'E'
static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
static const UChar      chLParen    = 0x28;
static const UChar      chRParen    = 0x29;
static const UChar      chLBracket  = 0x5b;
static const UChar      chRBracket  = 0x5d;
static const UChar      chRBrace    = 0x7d;
static const UChar      chLowerP    = 0x70;
static const UChar      chUpperP    = 0x50;


//----------------------------------------------------------------------------------------
//
//  nextCharLL    Low Level Next Char from the regex pattern.
//                Get a char from the string, keep track of input position
//                     for error reporting.
//
//----------------------------------------------------------------------------------------
UChar32  RegexCompile::nextCharLL() {
    UChar32       ch;
    UnicodeString &pattern = fRXPat->fPattern;

    if (fPeekChar != -1) {
        ch = fPeekChar;
        fPeekChar = -1;
        return ch;
    }
    if (fPatternLength==0 || fNextIndex >= fPatternLength) {
        return (UChar32)-1;
    }
    ch         = pattern.char32At(fNextIndex);
    fNextIndex = pattern.moveIndex32(fNextIndex, 1);

    if (ch == chCR ||
        ch == chNEL ||
        ch == chLS   ||
        ch == chLF && fLastChar != chCR) {
        // Character is starting a new line.  Bump up the line number, and
        //  reset the column to 0.
        fLineNum++;
        fCharNum=0;
        if (fQuoteMode) {
            error(U_REGEX_RULE_SYNTAX);
            fQuoteMode = FALSE;
        }
    }
    else {
        // Character is not starting a new line.  Except in the case of a
        //   LF following a CR, increment the column position.
        if (ch != chLF) {
            fCharNum++;
        }
    }
    fLastChar = ch;
    return ch;
}

//---------------------------------------------------------------------------------
//
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
//                 character without actually getting it.
//
//---------------------------------------------------------------------------------
UChar32  RegexCompile::peekCharLL() {
    if (fPeekChar == -1) {
        fPeekChar = nextCharLL();
    }
    return fPeekChar;
}


//---------------------------------------------------------------------------------
//
//   nextChar     for pattern scanning.  At this level, we handle stripping
//                out comments and processing some backslash character escapes.
//                The rest of the pattern grammar is handled at the next level up.
//
//---------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {

    fScanIndex = fNextIndex;
    c.fChar    = nextCharLL();
    c.fQuoted  = FALSE;

    if (fQuoteMode) {
        c.fQuoted = TRUE;
        if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
            fQuoteMode = FALSE;  //  Exit quote mode,
            nextCharLL();       // discard the E
            nextChar(c);        // recurse to get the real next char
        }
    }
    else if (fInBackslashQuote) {
        // The current character immediately follows a '\'
        // Don't check for any further escapes, just return it as-is.
        // Don't set c.fQuoted, because that would prevent the state machine from
        //    dispatching on the character.
        fInBackslashQuote = FALSE;
    }
    else
    {
        // We are not in a \Q quoted region \E of the source.
        //
        if (fModeFlags & UREGEX_COMMENTS) {
            //
            // We are in free-spacing and comments mode.
            //  Scan through any white space and comments, until we
            //  reach a significant character or the end of inut.
            for (;;) {
                if (c.fChar == (UChar32)-1) {
                    break;     // End of Input
                }
                if  (c.fChar == chPound && fEOLComments == TRUE) {
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
                    for (;;) {
                        c.fChar = nextCharLL();
                        if (c.fChar == (UChar32)-1 ||  // EOF
                            c.fChar == chCR        ||
                            c.fChar == chLF        ||
                            c.fChar == chNEL       ||
                            c.fChar == chLS)       {
                            break;
                        }
                    }
                }
                if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
                    //  TODO:  is RuleWhiteSpace the right thing to use here?
                    break;
                }
                c.fChar = nextCharLL();
            }
        }

        //
        //  check for backslash escaped characters.
        //
                int32_t startX = fNextIndex;  // start and end positions of the
                int32_t endX   = fNextIndex;  //   sequence following the '\'
        if (c.fChar == chBackSlash) {
            if (gUnescapeCharSet->contains(peekCharLL())) {
                //
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
                //   Includes \uxxxx, \n, \r, many others.
                //   Return the single equivalent character.
                //
                nextCharLL();                 // get & discard the peeked char.
                c.fQuoted = TRUE;
                c.fChar = fRXPat->fPattern.unescapeAt(endX);
                if (startX == endX) {
                    error(U_REGEX_BAD_ESCAPE_SEQUENCE);
                }
                fCharNum += endX - startX;
                fNextIndex = endX;
            }
            else
            {
                // We are in a '\' escape that will be handled by the state table scanner.
                // Just return the backslash, but remember that the following char is to
                //  be taken literally.  TODO:  this is awkward
                fInBackslashQuote = TRUE;
            }
        }
    }

    // re-enable # to end-of-line comments, in case they were disabled..
    // They are disabled by the parser upon seeing '(?', but this lasts for
    //  the fetching of the next character only.
    fEOLComments = TRUE;

    // putc(c.fChar, stdout);
}


//---------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             The scan position is normally under the control of the state machine
//             that controls pattern parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the Regex pattern parser.
//
//---------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanSet() {
    UnicodeSet    *uset = NULL;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet(fRXPat->fPattern, pos,
                         localStatus);
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
        REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        error(localStatus);
        delete uset;
        return NULL;
    }

    // Advance the current scan postion over the UnicodeSet.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    return uset;
};


//---------------------------------------------------------------------------------
//
//  scanProp   Construct a UnicodeSet from the text at the current scan
//             position, which will be of the form \p{whaterver}
//
//             The scan position will be at the 'p' or 'P'.  On return
//             the scan position should be just after the '}'
//
//             Return a UnicodeSet, constructed from the \P pattern,
//             or NULL if the pattern is invalid.
//
//---------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanProp() {
    UnicodeSet    *uset = NULL;

    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP);

    // enclose the \p{property} from the regex pattern source in  [brackets]
    UnicodeString setPattern;
    setPattern.append(chLBracket);
    setPattern.append(chBackSlash);
    for (;;) {
        setPattern.append(fC.fChar);
        if (fC.fChar == chRBrace) {
            break;
        }
        nextChar(fC);
        if (fC.fChar == -1) {
            // Hit the end of the input string without finding the closing '}'
            *fStatus = U_REGEX_PROPERTY_SYNTAX;
            return NULL;
        }
    }
    setPattern.append(chRBracket);

    // Build the UnicodeSet from the set pattern we just built up in a string.
    uset = new UnicodeSet(setPattern, *fStatus);
    if (U_FAILURE(*fStatus)) {
        delete uset;
        uset =  NULL;
    }

    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
    return uset;
};

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS