scuffed-code/icu4c/source/i18n/regexcmp.cpp


//
//  file:  regexcmp.cpp
//
//  Copyright (C) 2002, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the ICU regular expression compiler, which is responsible
//  for processing a regular expression pattern into the compiled form that
//  is used by the match finding engine.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "uprops.h"
#include "cmemory.h"
#include "cstring.h"
#include "uvectr32.h"
#include "uassert.h"
#include "ucln_in.h"
#include "mutex.h"

#include "regeximp.h"
#include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
#include "regexcmp.h"


U_NAMESPACE_BEGIN

//----------------------------------------------------------------------------------------
//
// Unicode Sets for each of the character classes needed for parsing a regex pattern.
//               (Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.)
//
//              The sets are referred to by name in the regexcst.txt, which is the
//              source form of the state transition table.  These names are converted
//              to indicies in regexcst.h by the perl state table building script regexcst.pl.
//              The indices are used to access the array gRuleSets.
//
//----------------------------------------------------------------------------------------

// "Rule Char" Characters are those with no special meaning, and therefore do not
//    need to be escaped to appear as literals in a regexp.  Expressed
//    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
static const UChar gRuleSet_rule_char_pattern[]       = {
 //   [    ^      \     *     \     ?     \     +     \     [     \     (     /     )
    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
 //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};


static const UChar gRuleSet_digit_char_pattern[] = {
//    [    0      -    9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
static const UnicodeSet *gRuleDigits = NULL;


static UnicodeSet  *gRuleSets[10];         // Array of ptrs to the actual UnicodeSet objects.
static UnicodeSet  *gUnescapeCharSet;

//
//   Here are the backslash escape characters that ICU's unescape() function
//    will handle.
//
static const UChar gUnescapeCharPattern[] = {
//    [     a     c     e     f     n     r     t     u     U     ]
    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


//
//  White space characters that may appear within a pattern in free-form mode
//
static const UChar gRuleWhiteSpacePattern[] = {
    /* "[[:Cf:][:WSpace:]]" */
    91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
        83, 112, 97, 99, 101, 58, 93, 93, 0 };


//
//  Unicode Set Definitions for Regular Expression  \w
//
static const UChar gIsWordPattern[] = {
//    [     \     p     {     L     l     }     \     p     {     L     u     }
    0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
//          \     p     {     L     t     }     \     p     {     L     o     }
          0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
//          \     p     {     N     d     }     ]
          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};


//
//  Unicode Set Definitions for Regular Expression  \s
//
    static const UChar gIsSpacePattern[] = {
//    [     \     t     \     n     \     f     \     r     \     p     {     Z     }     ]
    0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d,  0};

static UnicodeSet *gPropSets[URX_LAST_SET];


//----------------------------------------------------------------------------------------
//
//   ThreadSafeUnicodeSetInit   Thread safe creation of a shared UnicodeSet.
//
//----------------------------------------------------------------------------------------
static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UErrorCode &status) {
    if (*pSet == NULL) {
        UnicodeSet *t = new UnicodeSet(pattern, status);
        if (U_FAILURE(status)) {
            delete t;
            return;
        }
        if (t == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        Mutex  lock;
        if (*pSet == NULL) {
            *pSet = t;
        } else {
            delete t;
        }
    }
}


//----------------------------------------------------------------------------------------
//
//  Constructor.
//
//----------------------------------------------------------------------------------------
RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
{
    fStatus             = &status;

    fScanIndex      = 0;
    fNextIndex      = 0;
    fPeekChar       = -1;
    fLineNum        = 1;
    fCharNum        = 0;
    fQuoteMode      = FALSE;
    fFreeForm       = FALSE;

    fMatchOpenParen  = -1;
    fMatchCloseParen = -1;

    if (U_FAILURE(status)) {
        return;
    }

    //
    //  Register the I18n library for cleanup,
    //     but only if we haven't initialized our globals yet.
    if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
        ucln_i18n_registerCleanup();
    }

    //
    //  Set up the constant (static) Unicode Sets.
    //    TODO:  something cleaner for that -128 constant.
    //
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128],   gRuleSet_rule_char_pattern,  status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern,      status);
    ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128],  gRuleSet_digit_char_pattern, status);
    gRuleDigits = gRuleSets[kRuleSet_digit_char-128];
    ThreadSafeUnicodeSetInit(&gUnescapeCharSet,                    gUnescapeCharPattern,        status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET],           gIsWordPattern,              status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);
}


//----------------------------------------------------------------------------------------
//
//  Destructor
//
//----------------------------------------------------------------------------------------
RegexCompile::~RegexCompile() {
}


//----------------------------------------------------------------------------------------
//
//   cleanup.    Called (indirectly) by u_cleanup to free all cached memory
//
//----------------------------------------------------------------------------------------
void RegexCompile::cleanup() {
    delete gRuleSets[kRuleSet_rule_char-128];
    delete gRuleSets[kRuleSet_white_space-128];
    delete gRuleSets[kRuleSet_digit_char-128];
    delete gUnescapeCharSet;
    gRuleSets[kRuleSet_rule_char-128]   = NULL;
    gRuleSets[kRuleSet_white_space-128] = NULL;
    gRuleSets[kRuleSet_digit_char-128]  = NULL;
    gUnescapeCharSet = NULL;
    int i;
    for (i=0; i<URX_LAST_SET; i++) {
        delete (UnicodeSet *)gPropSets[i];
        gPropSets[i] = NULL;
    }
    return;
}


//---------------------------------------------------------------------------------
//
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
//                           The state tables are hand-written in the file regexcst.txt,
//                           and converted to the form used here by a perl
//                           script regexcst.pl
//
//---------------------------------------------------------------------------------
void    RegexCompile::compile(
                         RegexPattern &rxp,          // User level pattern object to receive
                                                     //    the compiled pattern.
                         const UnicodeString &pat,   // Source pat to be compiled.
                         UParseError &pp,            // Error position info
                         UErrorCode &e)              // Error Code
{
    fStatus             = &e;
    fRXPat              = &rxp;
    fParseErr           = &pp;
    fStackPtr           = 0;
    fStack[fStackPtr]   = 0;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    U_ASSERT(fRXPat->fPattern.length() == 0);

    // Prepare the RegexPattern object to receive the compiled pattern.
    fRXPat->fPattern        = pat;
    fRXPat->fStaticSets     = gPropSets;


    // Initialize the pattern scanning state machine
    fPatternLength = pat.length();
    uint16_t                state = 1;
    const RegexTableEl      *tableEl;
    nextChar(fC);                        // Fetch the first char from the pattern string.

    //
    // Main loop for the regex pattern parsing state machine.
    //   Runs once per state transition.
    //   Each time through optionally performs, depending on the state table,
    //      - an advance to the the next pattern char
    //      - an action to be performed.
    //      - pushing or popping a state to/from the local state return stack.
    //   file regexcst.txt is the source for the state table.  The logic behind
    //     recongizing the pattern syntax is there, not here.
    //
    for (;;) {
        //  Bail out if anything has gone wrong.
        //  Regex pattern parsing stops on the first error encountered.
        if (U_FAILURE(*fStatus)) {
            break;
        }

        U_ASSERT(state != 0);

        // Find the state table element that matches the input char from the rule, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
        REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d)    state=%s ",
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);

        for (;;) {    // loop through table rows belonging to this state, looking for one
                      //   that matches the current input char.
            REGEX_SCAN_DEBUG_PRINTF( ".");
            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not quoted, and
                //   the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
                // Table row specified "quoted" and the char was quoted.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                fC.fQuoted == FALSE &&                                      //   char is not escaped &&
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    //   and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
        REGEX_SCAN_DEBUG_PRINTF("\n");

        //
        // We've found the row of the state table that matches the current input
        //   character from the rules string.
        // Perform any action specified  by this row in the state table.
        if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            //   the action signalled some kind of error, or
            //   the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_REGEX_INTERNAL_ERROR);
                REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        //   state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                // state stack underflow
                // This will occur if the user pattern has mis-matched parentheses,
                //   with extra close parens.
                //
                fStackPtr++;
                error(U_REGEX_MISMATCHED_PAREN);
            }
        }

    }

    //
    // The pattern has now been read and processed, and the compiled code generated.
    //

    // Back-reference fixup
    //
    int32_t loc;
    for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
        if (URX_TYPE(op) == URX_BACKREF) {
            int32_t where = URX_VAL(op);
            if (where > fRXPat->fGroupMap->size()) {
                error(U_REGEX_INVALID_BACK_REF);
                break;
            }
            where = fRXPat->fGroupMap->elementAti(where-1);
            op    = URX_BUILD(URX_BACKREF, where);
            fRXPat->fCompiledPat->setElementAt(op, loc);
        }
    }


    //
    // Compute the number of digits requried for the largest capture group number.
    //
    fRXPat->fMaxCaptureDigits = 1;
    int32_t  n = 10;
    for (;;) {
        if (n > fRXPat->fGroupMap->size()) {
            break;
        }
        fRXPat->fMaxCaptureDigits++;
        n *= 10;
    }

    //
    // The pattern's fFrameSize so far has accumulated the requirements for
    //   storage for capture parentheses, counters, etc. that are encountered
    //   in the pattern.  Add space for the two variables that are always
    //   present in the saved state:  the input string position and the
    //   position in the compiled pattern.
    //
    fRXPat->fFrameSize+=2;

    //
    // A stupid bit of non-sense to prevent code coverage testing from complaining
    //   about the pattern.dump() debug function.  Go through the motions of dumping,
    //   even though, without the #define set, it will do nothing.
    //
#ifndef REGEX_DUMP_DEBUG
    static UBool phonyDumpDone = FALSE;
    if (phonyDumpDone==FALSE) {
        fRXPat->dump();
        phonyDumpDone = TRUE;
    }
#endif

}


//----------------------------------------------------------------------------------------
//
//  doParseAction        Do some action during regex pattern parsing.
//                       Called by the parse state machine.
//
//
//----------------------------------------------------------------------------------------
UBool RegexCompile::doParseActions(EParseAction action)
{
    UBool   returnVal = TRUE;

    switch ((Regex_PatternParseAction)action) {

    case doPatStart:
        // Start of pattern compiles to:
        //0   SAVE   2        Fall back to position of FAIL
        //1   jmp    3
        //2   FAIL            Stop if we ever reach here.
        //3   NOP             Dummy, so start of pattern looks the same as
        //                    the start of an ( grouping.
        //4   NOP             Resreved, will be replaced by a save if there are
        //                    OR | operators at the top level
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);

        fParenStack.push(-1, *fStatus);     // Begin a Paren Stack Frame
        fParenStack.push( 3, *fStatus);     // Push location of first NOP
        break;

    case doPatFinish:
        // We've scanned to the end of the pattern
        //  The end of pattern compiles to:
        //        URX_END
        //    which will stop the runtime match engine.
        //  Encountering end of pattern also behaves like a close paren,
        //   and forces fixups of the State Save at the beginning of the compiled pattern
        //   and of any OR operations at the top level.
        //
        handleCloseParen();
        if (fParenStack.size() > 0) {
            // Missing close paren in pattern.
            error(U_REGEX_MISMATCHED_PAREN);
        }

        // add the END operation to the compiled pattern.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);

        // Terminate the pattern compilation state machine.
        returnVal = FALSE;
        break;


    case doOrOperator:
        // Scanning a '|', as in (A|B)
        {
            // Insert a SAVE operation at the start of the pattern section preceding
            //   this OR at this level.  This SAVE will branch the match forward
            //   to the right hand side of the OR in the event that the left hand
            //   side fails to match and backtracks.  Locate the position for the
            //   save from the location on the top of the parentheses stack.
            int32_t savePosition = fParenStack.popi();
            int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
            fRXPat->fCompiledPat->setElementAt(op, savePosition);

            // Append an JMP operation into the compiled pattern.  The operand for
            //  the JMP will eventually be the location following the ')' for the
            //  group.  This will be patched in later, when the ')' is encountered.
            op = URX_BUILD(URX_JMP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Push the position of the newly added JMP op onto the parentheses stack.
            // This registers if for fixup when this block's close paren is encountered.
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);

            // Append a NOP to the compiled pattern.  This is the slot reserved
            //   for a SAVE in the event that there is yet another '|' following
            //   this one.
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
        }
        break;


    case doOpenCaptureParen:
        // Open Paren.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        //
        //    Each capture group gets three slots in the save stack frame:
        //         0:   Capture Group start position (in input string being matched.)
        //         1:   Capture Group end   positino.
        //         2:   Start of Match-in-progress.
        //    The first two locations are for a completed capture group, and are
        //     referred to by back references and the like.
        //    The third location stores the capture start position when an START_CAPTURE is
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
        //      END_CAPure is encountered.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
            fRXPat->fFrameSize += 3;
            int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
            fRXPat->fCompiledPat->addElement(cop, *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(-2, *fStatus);           // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP

            // Save the mapping from group number to stack frame variable position.
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
        }
         break;

    case doOpenNonCaptureParen:
        // Open non-caputuring (grouping only) Paren.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.
            fParenStack.push(-1, *fStatus);                               // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
         break;


    case doOpenAtomicParen:
        // Open Atomic Paren.  (?>
        //   Compile to a
        //      - NOP, which later may be replaced if the parenthesized group
        //         has a quantifier, followed by
        //      - STO_SP  save state stack position, so it can be restored at the ")"
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
            fRXPat->fDataSize += 1;                    //  state stack ptr.
            int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc);
            fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(-3, *fStatus);           // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
        break;


    case doOpenLookAhead:
        // Positive Look-ahead   (?=  stuff  )
        // Compiles to
        //    1    START_LA     dataLoc
        //    2.   NOP              reserved for use by quantifiers on the block.
        //                          Look-ahead can't have quantifiers, but paren stack
        //                             compile time conventions require the slot anyhow.
        //    3.   NOP              may be replaced if there is are '|' ops in the block.
        //    4.     code for parenthesized stuff.
        //    5.   ENDLA
        //
        //  Two data slots are reserved, for saving the stack ptr and the input position.
        {
            int32_t dataLoc = fRXPat->fDataSize;
            fRXPat->fDataSize += 2;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the NOPs.
            fParenStack.push(lookAhead, *fStatus);                        // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
        break;

    case doOpenLookAheadNeg:
        // Negated Lookahead.   (?! stuff )
        // Compiles to
        //    1.    START_LA    dataloc
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
        //                                //   which continues with the match.
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
        //    4.       code for parenthesized stuff.
        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
        //    6.    FAIL                  // code in block succeeded, so neg. lookahead fails.
        //    7.    ...
        {
            int32_t dataLoc = fRXPat->fDataSize;
            fRXPat->fDataSize += 2;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            op = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
            //   of the StateSave and NOP.
            fParenStack.push( negLookAhead, *fStatus);                    // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP

            // Instructions #5 and #6 will be added when the ')' is encountered.
        }
        break;

    case doOpenLookBehind:
        // Open Paren.
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookBehindNeg:
        // Open Paren.
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doConditionalExpr:
        // Conditionals such as (?(1)a:b)
    case doPerlInline:
        // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doCloseParen:
        handleCloseParen();
        if (fParenStack.size() <= 0) {
            //  Extra close paren, or missing open paren.
            error(U_REGEX_MISMATCHED_PAREN);
        }
        break;

    case doNOP:
        break;


    case doBadOpenParenType:
    case doRuleError:
        error(U_REGEX_RULE_SYNTAX);
        returnVal = FALSE;
        break;


    case doMismatchedParenErr:
        error(U_REGEX_MISMATCHED_PAREN);
        returnVal = FALSE;
        break;

    case doPlus:
        //  Normal '+'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   state-save  4
        //     3.   jmp 1
        //     4.   ...
        {
            int32_t   topLoc = blockTopLoc(FALSE);        // location of item #1

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the +   (4 in the comment above)
            int32_t continueLoc = fRXPat->fCompiledPat->size()+2;

            // Emit the STATE_SAVE
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);

            // Emit the JMP
            int32_t jmpOp = URX_BUILD(URX_JMP, topLoc);
            fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
        }
        break;

    case doNGPlus:
        //  Non-greedy '+?'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   state-save  1
        //     3.   ...
        {
            int32_t topLoc      = blockTopLoc(FALSE);
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
        }
        break;


    case doOpt:
        // Normal (greedy) ? quantifier.
        //  Compiles to
        //     1. state save 3
        //     2.    body of optional block
        //     3. ...
        // Insert the state save into the compiled pattern, and we're done.
        {
            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
        }
        break;

    case doNGOpt:
        // Non-greedy ?? quantifier
        //   compiles to
        //    1.  jmp   4
        //    2.     body of optional block
        //    3   jmp   5
        //    4.  state save 2
        //    5    ...
        //  This code is less than ideal, with two jmps instead of one, because we can only
        //  insert one instruction at the top of the block being iterated.
        {
            int32_t  jmp1_loc = blockTopLoc(TRUE);
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();

            int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);

            int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
            fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);

            int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
            fRXPat->fCompiledPat->addElement(save_op, *fStatus);
        }
        break;


    case doStar:
        // Normal (greedy) * quantifier.
        // Compiles to
        //       1.   STATE_SAVE   4
        //       2.      body of stuff being iterated over
        //       3.   JMP  1
        //       4.   ...
        //
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
        //       1.   STATE_SAVE   6
        //       2.   POS_SAVE     data-loc
        //       3.      body of stuff
        //       4.   JMPX    1
        //       5            data-loc   (extra operand of JMPX)
        //       6.   ...
        {
            // location of item #1, the STATE_SAVE
            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   dataLoc = -1;

            if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) {
                insertOp(saveStateLoc);
                dataLoc =  fRXPat->fFrameSize;
                fRXPat->fFrameSize++;

                int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
            }

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the *.   (4 in the comment above)
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
            if (dataLoc != -1) {
                continueLoc++;
            }

            // Put together the save state op store it into the compiled code.
            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);

            // Append the URX_JMP or URX_JMPX operation to the compiled pattern.  Its target
            // is the locaton of the state-save, above.
            if (dataLoc == -1) {
                int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
                fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
            } else {
                int32_t op = URX_BUILD(URX_JMPX, saveStateLoc);
                fRXPat->fCompiledPat->addElement(op, *fStatus);
                op = URX_BUILD(URX_RESERVED_OP, dataLoc);
                fRXPat->fCompiledPat->addElement(op, *fStatus);
            }

        }
        break;

    case doNGStar:
        // Non-greedy *? quantifier
        // compiles to
        //     1.   JMP    3
        //     2.      body of stuff being iterated over
        //     3.   STATE_SAVE  2
        //     4    ...
        {
            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
            int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
            int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
        }
        break;


    case doIntervalInit:
        // The '{' opening an interval quantifier was just scanned.
        // Init the counter varaiables that will accumulate the values as the digits
        //    are scanned.
        fIntervalLow = 0;
        fIntervalUpper = -1;
        break;

    case doIntevalLowerDigit:
        // Scanned a digit from the lower value of an {lower,upper} interval
        {
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            fIntervalLow = fIntervalLow*10 + digitValue;
            if (fIntervalLow < 0) {
                error(U_REGEX_NUMBER_TOO_BIG);
            }
        }
        break;

    case doIntervalUpperDigit:
        // Scanned a digit from the upper value of an {lower,upper} interval
        {
            if (fIntervalUpper < 0) {
                fIntervalUpper = 0;
            }
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            fIntervalUpper = fIntervalUpper*10 + digitValue;
            if (fIntervalLow < 0) {
                error(U_REGEX_NUMBER_TOO_BIG);
            }
        }
        break;

    case doIntervalSame:
        // Scanned a single value interval like {27}.  Upper = Lower.
        fIntervalUpper = fIntervalLow;
        break;

    case doInterval:
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
        break;

    case doPossesiveInterval:
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
        break;

    case doNGInterval:
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
        break;

    case doIntervalError:
        error(U_REGEX_BAD_INTERVAL);
        break;

    case doLiteralChar:
        // We've just scanned a "normal" character from the pattern,
        literalChar();
        break;


    case doDotAny:
        // scanned a ".",  match any single character.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
        break;

    case doCaret:       // TODO:  multi-line mode flag.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
        break;


    case doDollar:       // TODO:  multi-line mode flag.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
        break;

    case doBackslashA:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
        break;

    case doBackslashB:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
        break;

    case doBackslashb:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
        break;

    case doBackslashD:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
        break;

    case doBackslashd:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
        break;

    case doBackslashG:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
        break;

    case doBackslashS:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
        break;

    case doBackslashs:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
        break;

    case doBackslashW:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
        break;

    case doBackslashw:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
        break;

    case doBackslashX:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
        break;

    case doBackslashx:              // \x{abcd}   alternate hex format
        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doBackslashZ:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
        break;

    case doBackslashz:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
        break;

    case doEscapeError:
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
        break;

    case doExit:
        returnVal = FALSE;
        break;

    case doProperty:
        {
            UnicodeSet *theSet = scanProp();
            compileSet(theSet);
        }
        break;


    case doScanUnicodeSet:
        {
            UnicodeSet *theSet = scanSet();
            compileSet(theSet);
        }
        break;

    case doEnterQuoteMode:
        // Just scanned a \Q.  Put character scanner into quote mode.
        fQuoteMode = TRUE;
        break;

    case doBackRef:
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
        //                 the regular expression, because the number of digits to be consumed
        //                 depends on the number of capture groups that have been defined.  So
        //                 we have to do it here instead.
        {
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
            int32_t  groupNum = 0;
            UChar32  c        = fC.fChar;

            for (;;) {
                // Loop once per digit, for max allowed number of digits in a back reference.
                int32_t digit = u_charDigitValue(c);
                groupNum = groupNum * 10 + digit;
                if (groupNum >= numCaptureGroups) {
                    break;
                }
                c = peekCharLL();
                if (gRuleDigits->contains(c) == FALSE) {
                    break;
                }
                nextCharLL();
            }

            // Scan of the back reference in the source regexp is complete.  Now generate
            //  the compiled code for it.
            // Because capture groups can be forward-referenced by back-references,
            //  we fill the operand with the capture group number.  At the end
            //  of compilation, it will be changed to the variables location.
            U_ASSERT(groupNum > 0);
            // int32_t  varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
            int32_t  op = URX_BUILD(URX_BACKREF, groupNum);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;


    case doOctal:
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doNamedChar:            // \N{NAMED_CHAR}
        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doPossesivePlus:
        // Possessive ++ quantifier.
        // Compiles to
        //       1.   STO_SP
        //       2.      body of stuff being iterated over
        //       3.   STATE_SAVE 5
        //       4.   JMP        2
        //       5.   LD_SP
        //       6.   ...
        //
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
        //                then unconditionally discarded.  Perhaps introduce a new opcode
        //
        {
            // Emit the STO_SP
            int32_t   topLoc = blockTopLoc(TRUE);
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the STATE_SAVE
            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the JMP
            op = URX_BUILD(URX_JMP, topLoc+1);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the LD_SP
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case doPossesiveStar:
        // Possessive *+ quantifier.
        // Compiles to
        //       1.   STO_SP       loc
        //       2.   STATE_SAVE   5
        //       3.      body of stuff being iterated over
        //       4.   JMP          2
        //       5.   LD_SP        loc
        //       6    ...
        // TODO:  do something to cut back the state stack each time through the loop.
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(TRUE);
            insertOp(topLoc);

            // emit   STO_SP     loc
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE   5
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
            op = URX_BUILD(URX_STATE_SAVE, L7);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Append the JMP operation.
            op = URX_BUILD(URX_JMP, topLoc+1);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Emit the LD_SP       loc
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case doPossesiveOpt:
        // Possessive  ?+ quantifier.
        //  Compiles to
        //     1. STO_SP      loc
        //     2. SAVE_STATE  5
        //     3.    body of optional block
        //     4. LD_SP       loc
        //     5. ...
        //
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(TRUE);
            insertOp(topLoc);

            // Emit the STO_SP
            int32_t   stoLoc = fRXPat->fDataSize;
            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
            op = URX_BUILD(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Emit the LD_SP
            op = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;


    case doMatchMode:   //  (?i)    and similar
        // TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;


    default:
        error(U_REGEX_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
    }
    return returnVal;
};


//------------------------------------------------------------------------------
//
//   literalChar           We've encountered a literal character from the pattern,
//                             or an escape sequence that reduces to a character.
//                         Add it to the string containing all literal chars/strings from
//                             the pattern.
//                         If we are in a pattern string already, add the new char to it.
//                         If we aren't in a pattern string, begin one now.
//
//------------------------------------------------------------------------------
void RegexCompile::literalChar()  {
    int32_t           op;            // An operation in the compiled pattern.
    int32_t           opType;
    int32_t           patternLoc;   // A position in the compiled pattern.
    int32_t           stringLen;


    // If the last thing compiled into the pattern was not a literal char,
    //   force this new literal char to begin a new string, and not append to the previous.
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) {
        fixLiterals();
    }

    if (fStringOpStart == -1) {
        // First char of a string in the pattern.
        // Emit a OneChar op into the compiled pattern.
        op = URX_BUILD(URX_ONECHAR, fC.fChar);
        fRXPat->fCompiledPat->addElement(op, *fStatus);

        // Also add it to the string pool, in case we get a second adjacent literal
        //   and want to change form ONE_CHAR to STRING
        fStringOpStart = fRXPat->fLiteralText.length();
        fRXPat->fLiteralText.append(fC.fChar);
        return;
    }

    // We are adding onto an existing string
    fRXPat->fLiteralText.append(fC.fChar);

    // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
    if (opType == URX_ONECHAR) {
        op         = URX_BUILD(URX_STRING, fStringOpStart);
        patternLoc = fRXPat->fCompiledPat->size() - 1;
        fRXPat->fCompiledPat->setElementAt(op, patternLoc);
        op         = URX_BUILD(URX_STRING_LEN, 0);
        fRXPat->fCompiledPat->addElement(op, *fStatus);
    }

    // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the
    //  string length to reflect the new char we just added to the string.
    stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
    op         = URX_BUILD(URX_STRING_LEN, stringLen);
    patternLoc = fRXPat->fCompiledPat->size() - 1;
    fRXPat->fCompiledPat->setElementAt(op, patternLoc);
}


//------------------------------------------------------------------------------
//
//    fixLiterals           When compiling something that can follow a literal
//                          string in a pattern, we need to "fix" any preceding
//                          string, which will cause any subsequent literals to
//                          begin a new string, rather than appending to the
//                          old one.
//
//                          Optionally, split the last char of the string off into
//                          a single "ONE_CHAR" operation, so that quantifiers can
//                          apply to that char alone.  Example:   abc*
//                          The * needs to apply to the 'c' only.
//
//------------------------------------------------------------------------------
void    RegexCompile::fixLiterals(UBool split) {
    int32_t  stringStart = fStringOpStart;    // start index of the current literal string
    int32_t  op;                              // An op from/for the compiled pattern.
    int32_t  opType;                          // An opcode type from the compiled pattern.
    int32_t  stringLastCharIdx;
    UChar32  lastChar;
    int32_t  stringNextToLastCharIdx;
    UChar32  nextToLastChar;
    int32_t  stringLen;

    fStringOpStart = -1;
    if (!split) {
        return;
    }

    // Split:  We need to  ensure that the last item in the compiled pattern does
    //   not refer to a literal string of more than one char.  If it does,
    //   separate the last char from the rest of the string.

    // If the last operation from the compiled pattern is not a string,
    //   nothing needs to be done
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    if (opType != URX_STRING_LEN) {
        return;
    }
    stringLen = URX_VAL(op);

    //
    // Find the position of the last code point in the string  (might be a surrogate pair)
    //
    stringLastCharIdx = fRXPat->fLiteralText.length();
    stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
    lastChar          = fRXPat->fLiteralText.char32At(stringLastCharIdx);

    // The string should always be at least two code points long, meaning that there
    //   should be something before the last char position that we just found.
    U_ASSERT(stringLastCharIdx > stringStart);
    stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
    U_ASSERT(stringNextToLastCharIdx >= stringStart);
    nextToLastChar          = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);

    if (stringNextToLastCharIdx > stringStart) {
        // The length of string remaining after removing one char is two or more.
        // Leave the string in the compiled pattern, shorten it by one char,
        //   and append a URX_ONECHAR op for the last char.
        stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
        op = URX_BUILD(URX_STRING_LEN, stringLen);
        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
        op = URX_BUILD(URX_ONECHAR, lastChar);
        fRXPat->fCompiledPat->addElement(op, *fStatus);
    } else {
        // The original string consisted of exactly two characters.  Replace
        // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
        // of URX_ONECHARs.
        op = URX_BUILD(URX_ONECHAR, nextToLastChar);
        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
        op = URX_BUILD(URX_ONECHAR, lastChar);
        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
    }
}


//------------------------------------------------------------------------------
//
//   insertOp()             Insert a slot for a new opcode into the already
//                          compiled pattern code.
//
//                          Fill the slot with a NOP.  Our caller will replace it
//                          with what they really wanted.
//
//------------------------------------------------------------------------------
void   RegexCompile::insertOp(int32_t where) {
    UVector32 *code = fRXPat->fCompiledPat;
    U_ASSERT(where>0 && where < code->size());

    int32_t  nop = URX_BUILD(URX_NOP, 0);
    code->insertElementAt(nop, where, *fStatus);

    // Walk through the pattern, looking for any ops with targets that
    //  were moved down by the insert.  Fix them.
    int32_t loc;
    for (loc=0; loc<code->size(); loc++) {
        int32_t op = code->elementAti(loc);
        int32_t opType = URX_TYPE(op);
        int32_t opValue = URX_VAL(op);
        if ((opType == URX_JMP         ||
            opType == URX_JMPX         ||
            opType == URX_STATE_SAVE   ||
            opType == URX_CTR_LOOP     ||
            opType == URX_CTR_LOOP_NG  ||
            opType == URX_CTR_LOOP_P   ||
            opType == URX_RELOC_OPRND)    && opValue > where) {
            // Target location for this opcode is after the insertion point and
            //   needs to be incremented to adjust for the insertion.
            opValue++;
            op = URX_BUILD(opType, opValue);
            code->setElementAt(op, loc);
        }
    }

    // Now fix up the parentheses stack.  All positive values in it are locations in
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
    for (loc=0; loc<fParenStack.size(); loc++) {
        int32_t x = fParenStack.elementAti(loc);
        if (x>where) {
            x++;
            fParenStack.setElementAt(x, loc);
        }
    }
}


//------------------------------------------------------------------------------
//
//   blockTopLoc()          Find or create a location in the compiled pattern
//                          at the start of the operation or block that has
//                          just been compiled.  Needed when a quantifier (* or
//                          whatever) appears, and we need to add an operation
//                          at the start of the thing being quantified.
//
//                          (Parenthesized Blocks) have a slot with a NOP that
//                          is reserved for this purpose.  .* or similar don't
//                          and a slot needs to be added.
//
//       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
//                                         at the returned location.
//                                 FALSE - just return the address,
//                                         do not reserve a location there.
//
//------------------------------------------------------------------------------
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
    int32_t   theLoc;
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
    {
        // The item just processed is a parenthesized block.
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
        U_ASSERT(theLoc > 0);
        uint32_t  opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
        U_ASSERT(URX_TYPE(opAtTheLoc) == URX_NOP);
    }
    else {
        // Item just compiled is a single thing, a ".", or a single char, or a set reference.
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
        // We need to make space now.
        fixLiterals(TRUE);  // If last item was a string, separate the last char.
        theLoc = fRXPat->fCompiledPat->size()-1;
        if (reserveLoc) {
            int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
            int32_t prevType = URX_TYPE(opAtTheLoc);
            int32_t  nop = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
        }
    }
    return theLoc;
}


//------------------------------------------------------------------------------
//
//    handleCloseParen      When compiling a close paren, we need to go back
//                          and fix up any JMP or SAVE operations within the
//                          parenthesized block that need to target the end
//                          of the block.  The locations of these are kept on
//                          the paretheses stack.
//
//                          This function is called both when encountering a
//                          real ) and at the end of the pattern.
//
//-------------------------------------------------------------------------------
void  RegexCompile::handleCloseParen() {
    int32_t   patIdx;
    int32_t   patOp;
    if (fParenStack.size() <= 0) {
        error(U_REGEX_MISMATCHED_PAREN);
        return;
    }

    // Force any literal chars that may follow the close paren to start a new string,
    //   and not attach to any preceding it.
    fixLiterals(FALSE);

    // Fixup any operations within the just-closed parenthesized group
    //    that need to reference the end of the (block).
    //    (The first one popped from the stack is an unused slot for
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
    for (;;) {
        patIdx = fParenStack.popi();
        if (patIdx < 0) {
            // value < 0 flags the start of the frame on the paren stack.
            break;
        }
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
        patOp = fRXPat->fCompiledPat->elementAti(patIdx);
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
        fMatchOpenParen     = patIdx;
    }

    // DO any additional fixups, depending on the specific kind of
    // parentesized grouping this is

    switch (patIdx) {
    case plain:
        // No additional fixups required.
        //   (Grouping-only parentheses)
        break;
    case capturing:
        // Capturing Parentheses.
        //   Insert a End Capture op into the pattern.
        //   The frame offset of the variables for this cg is obtained from the
        //       start capture op and put it into the end-capture op.
        {
            int32_t   captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);

            int32_t   frameVarLocation = URX_VAL(captureOp);
            int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
            fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
        }
        break;
    case atomic:
        // Atomic Parenthesis.
        //   Insert a LD_SP operation to restore the state stack to the position
        //   it was when the atomic parens were entered.
        {
            int32_t   stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
            int32_t   stoLoc = URX_VAL(stoOp);
            int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
            fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
        }
        break;

    case lookAhead:
        {
            int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
            int32_t dataLoc  = URX_VAL(startOp);
            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;

    case negLookAhead:
        {
            // See comment at doOpenLookAheadNeg
            int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
            int32_t dataLoc  = URX_VAL(startOp);
            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
             op              = URX_BUILD(URX_FAIL, 0);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

            // Patch the URX_SAVE near the top of the block.
            int32_t saveOp   = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
            int32_t dest     = fRXPat->fCompiledPat->size();
            saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
        }
        break;


    default:
        U_ASSERT(FALSE);
    }

    // remember the next location in the compiled pattern.
    // The compilation of Quantifiers will look at this to see whether its looping
    //   over a parenthesized block or a single item
    fMatchCloseParen = fRXPat->fCompiledPat->size();
}


//----------------------------------------------------------------------------------------
//
//   compileSet       Compile the pattern operations for a reference to a
//                    UnicodeSet.
//
//----------------------------------------------------------------------------------------
void        RegexCompile::compileSet(UnicodeSet *theSet)
{
    if (theSet == NULL) {
        return;
    }
    int32_t  setSize = theSet->size();
    UChar32  firstSetChar = theSet->charAt(0);
    if (firstSetChar == -1) {
        // Sets that contain only strings, but no individual chars,
        // will end up here.   TODO:  figure out what to with sets containing strings.
        setSize = 0;
    }

    switch (setSize) {
    case 0:
        {
            // Set of no elements.   Always fails to match.
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
            delete theSet;
        }
        break;

    case 1:
        {
            // The set contains only a single code point.  Put it into
            //   the compiled pattern as a single char operation rather
            //   than a set, and discard the set itself.
            int32_t  charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
            delete theSet;
        }
        break;

    default:
        {
            //  The set contains two or more chars.  (the normal case)
            //  Put it into the compiled pattern as a set.
            int32_t setNumber = fRXPat->fSets->size();
            fRXPat->fSets->addElement(theSet, *fStatus);
            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
        }
    }
}


//----------------------------------------------------------------------------------------
//
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
//                      Except for the specific opcodes used, the code is the same
//                      for all three types (greedy, non-greedy, possessive) of
//                      intervals.  The opcodes are supplied as parameters.
//
//----------------------------------------------------------------------------------------
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
{
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
    //   four slots in the compiled code.  Reserve them.
    int32_t   topOfBlock = blockTopLoc(TRUE);
    insertOp(topOfBlock);
    insertOp(topOfBlock);
    insertOp(topOfBlock);

    // The operands for the CTR_INIT opcode include the index in the matcher data
    //   of the counter.  Allocate it now.
    int32_t   counterLoc = fRXPat->fFrameSize;
    fRXPat->fFrameSize++;

    int32_t   op = URX_BUILD(InitOp, counterLoc);
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);

    // The second operand of CTR_INIT is the location following the end of the loop.
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
    //   compilation of something later on causes the code to grow and the target
    //   position to move.
    int32_t loopEnd = fRXPat->fCompiledPat->size();
    op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);

    // Followed by the min and max counts.
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);

    // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
    //   Goes at end of the block being looped over, so just append to the code so far.
    op = URX_BUILD(LoopOp, topOfBlock);
    fRXPat->fCompiledPat->addElement(op, *fStatus);

    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
        error(U_REGEX_MAX_LT_MIN);
    }

}

//----------------------------------------------------------------------------------------
//
//  possibleNullMatch    Test a range of compiled pattern for the possibility that it
//                       might match an empty string.  Used to control the generation
//                       of extra checking code to prevent infinite loops in the match
//                       engine on repeated empty matches, such as might happen with
//                            (x?)*
//                       when the input string is not at an x.
//
//----------------------------------------------------------------------------------------
UBool  RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
    // for now, just return true.  TODO:  make a real implementation
    return TRUE;
}


//----------------------------------------------------------------------------------------
//
//  Error         Report a rule parse error.
//                Only report it if no previous error has been recorded.
//
//----------------------------------------------------------------------------------------
void RegexCompile::error(UErrorCode e) {
    if (U_SUCCESS(*fStatus)) {
        *fStatus = e;
        fParseErr->line  = fLineNum;
        fParseErr->offset = fCharNum;
        fParseErr->preContext[0] = 0;    // TODO:  copy in some input pattern text
        fParseErr->preContext[0] = 0;
    }
}


//
//  Assorted Unicode character constants.
//     Numeric because there is no portable way to enter them as literals.
//     (Think EBCDIC).
//
static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
static const UChar      chLF        = 0x0a;
static const UChar      chNEL       = 0x85;      //    NEL newline variant
static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.
static const UChar      chPound     = 0x23;      // '#', introduces a comment.
static const UChar      chE         = 0x45;      // 'E'
static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
static const UChar      chLParen    = 0x28;
static const UChar      chRParen    = 0x29;
static const UChar      chLBracket  = 0x5b;
static const UChar      chRBracket  = 0x5d;
static const UChar      chRBrace    = 0x7d;
static const UChar      chLowerP    = 0x70;
static const UChar      chUpperP    = 0x50;


//----------------------------------------------------------------------------------------
//
//  nextCharLL    Low Level Next Char from the regex pattern.
//                Get a char from the string,
//                keep track of input position for error reporting.
//
//----------------------------------------------------------------------------------------
UChar32  RegexCompile::nextCharLL() {
    UChar32       ch;
    UnicodeString &pattern = fRXPat->fPattern;

    if (fPeekChar != -1) {
        ch = fPeekChar;
        fPeekChar = -1;
        return ch;
    }
    if (fPatternLength==0 || fNextIndex >= fPatternLength) {
        return (UChar32)-1;
    }
    ch         = pattern.char32At(fNextIndex);
    fNextIndex = pattern.moveIndex32(fNextIndex, 1);

    if (ch == chCR ||
        ch == chNEL ||
        ch == chLS   ||
        ch == chLF && fLastChar != chCR) {
        // Character is starting a new line.  Bump up the line number, and
        //  reset the column to 0.
        fLineNum++;
        fCharNum=0;
        if (fQuoteMode) {
            error(U_REGEX_RULE_SYNTAX);
            fQuoteMode = FALSE;
        }
    }
    else {
        // Character is not starting a new line.  Except in the case of a
        //   LF following a CR, increment the column position.
        if (ch != chLF) {
            fCharNum++;
        }
    }
    fLastChar = ch;
    return ch;
}

//---------------------------------------------------------------------------------
//
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
//                 character without actually getting it.
//
//---------------------------------------------------------------------------------
UChar32  RegexCompile::peekCharLL() {
    if (fPeekChar == -1) {
        fPeekChar = nextCharLL();
    }
    return fPeekChar;
}


//---------------------------------------------------------------------------------
//
//   nextChar     for pattern scanning.  At this level, we handle stripping
//                out comments and processing some backslash character escapes.
//                The rest of the pattern grammar is handled at the next level up.
//
//---------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {

    // Unicode Character constants needed for the processing done by nextChar(),
    //   in hex because literals wont work on EBCDIC machines.

    fScanIndex = fNextIndex;
    c.fChar    = nextCharLL();
    c.fQuoted  = FALSE;

    if (fQuoteMode) {
        c.fQuoted = TRUE;
        if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
            fQuoteMode = FALSE;  //  Exit quote mode,
            nextCharLL();       // discard the E
            nextChar(c);        // recurse to get the real next char
        }
    }
    else
    {
        // We are not in a 'quoted region' of the source.
        //
        if (fFreeForm && c.fChar == chPound) {
            // Start of a comment.  Consume the rest of it.
            //  The new-line char that terminates the comment is always returned.
            //  It will be treated as white-space, and serves to break up anything
            //    that might otherwise incorrectly clump together with a comment in
            //    the middle (a variable name, for example.)
            for (;;) {
                c.fChar = nextCharLL();
                if (c.fChar == (UChar32)-1 ||  // EOF
                    c.fChar == chCR     ||
                    c.fChar == chLF     ||
                    c.fChar == chNEL    ||
                    c.fChar == chLS)       {break;}
            }
        }
        if (c.fChar == (UChar32)-1) {
            return;
        }

        //
        //  check for backslash escaped characters.
        //  Use UnicodeString::unescapeAt() to handle those that it can.
        //  Otherwise just return the '\', and let the pattern parser deal with it.
        //
        int32_t startX = fNextIndex;  // start and end positions of the
        int32_t endX   = fNextIndex;  //   sequence following the '\'
        if (c.fChar == chBackSlash) {
            if (gUnescapeCharSet->contains(peekCharLL())) {
                nextCharLL();     // get & discard the peeked char.
                c.fQuoted = TRUE;
                c.fChar = fRXPat->fPattern.unescapeAt(endX);
                if (startX == endX) {
                    error(U_REGEX_BAD_ESCAPE_SEQUENCE);
                }
                fCharNum += endX - startX;
                fNextIndex = endX;
            }
        }
    }
    // putc(c.fChar, stdout);
}


//---------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             The scan position is normally under the control of the state machine
//             that controls pattern parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the Regex pattern parser.
//
//---------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanSet() {
    UnicodeSet    *uset = NULL;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet(fRXPat->fPattern, pos,
                         localStatus);
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
        REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        error(localStatus);
        delete uset;
        return NULL;
    }

    // Advance the current scan postion over the UnicodeSet.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    return uset;
};


//---------------------------------------------------------------------------------
//
//  scanProp   Construct a UnicodeSet from the text at the current scan
//             position, which will be of the form \p{whaterver}
//
//             The scan position will be at the 'p' or 'P'.  On return
//             the scan position should be just after the '}'
//
//             Return a UnicodeSet, constructed from the \P pattern,
//             or NULL if the pattern is invalid.
//
//---------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanProp() {
    UnicodeSet    *uset = NULL;

    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP);

    // enclose the \p{property} from the regex pattern source in  [brackets]
    UnicodeString setPattern;
    setPattern.append(chLBracket);
    setPattern.append(chBackSlash);
    for (;;) {
        setPattern.append(fC.fChar);
        if (fC.fChar == chRBrace) {
            break;
        }
        nextChar(fC);
        if (fC.fChar == -1) {
            // Hit the end of the input string without finding the closing '}'
            *fStatus = U_REGEX_PROPERTY_SYNTAX;
            return NULL;
        }
    }
    setPattern.append(chRBracket);

    // Build the UnicodeSet from the set pattern we just built up in a string.
    uset = new UnicodeSet(setPattern, *fStatus);
    if (U_FAILURE(*fStatus)) {
        delete uset;
        uset =  NULL;
    }

    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
    return uset;
};

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS