scuffed-code/icu4c/source/i18n/rbbi_bld.h

/*
* Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   12/15/99    rgillam     Port from Java.
**********************************************************************
*/

#ifndef RBBI_BLD_H
#define RBBI_BLD_H

#include "rbbi.h"
#include "rbbi_tbl.h"
#include "unicode/uniset.h"
#include "uvector.h"

class ExpressionList;

//=======================================================================
// RuleBasedBreakIterator.Builder
//=======================================================================
/**
 * The Builder class has the job of constructing a RuleBasedBreakIterator from a
 * textual description.  A Builder is constructed by RuleBasedBreakIterator's
 * constructor, which uses it to construct the iterator itself and then throws it
 * away.
 * <p>The construction logic is separated out into its own class for two primary
 * reasons:
 * <ul><li>The construction logic is quite complicated and large.  Separating it
 * out into its own class means the code must only be loaded into memory while a
 * RuleBasedBreakIterator is being constructed, and can be purged after that.
 * <li>There is a fair amount of state that must be maintained throughout the
 * construction process that is not needed by the iterator after construction.
 * Separating this state out into another class prevents all of the functions that
 * construct the iterator from having to have really long parameter lists,
 * (hopefully) contributing to readability and maintainability.</ul>
 * <p>It'd be really nice if this could be an independent class rather than an
 * inner class, because that would shorten the source file considerably, but
 * making Builder an inner class of RuleBasedBreakIterator allows it direct access
 * to RuleBasedBreakIterator's private members, which saves us from having to
 * provide some kind of "back door" to the Builder class that could then also be
 * used by other classes.
 */
class RuleBasedBreakIteratorBuilder {

protected:
    /**
     * The iterator we're constructing.
     */
    RuleBasedBreakIterator& iterator;

    /**
     * The tables object for the iterator we're constructing.
     */
    RuleBasedBreakIteratorTables* tables;

    /**
     * A temporary place to hold the rules as they're being processed.
     */
    UVector tempRuleList;

    /**
     * A temporary holding place used for calculating the character categories.
     * This object contains UnicodeSet objects.
     */
    UVector categories;

    /**
     * The number of categories (and thus the number of columns in the finished state tables)
     */
    int32_t numCategories;

    /**
     * A table used to map parts of regexp text to lists of character categories,
     * rather than having to figure them out from scratch each time
     */
    ExpressionList* expressions;

    /**
     * A temporary holding place for the list of ignore characters
     */
    UnicodeSet ignoreChars;

    /**
     * A temporary holding place where the forward state table is built
     */
    UVector tempStateTable;

    /**
     * A list of all the states that have to be filled in with transitions to the
     * next state that is created.  Used when building the state table from the
     * regular expressions.
     */
    UVector decisionPointList;

    /**
     * A UStack for holding decision point lists.  This is used to handle nested
     * parentheses and braces in regexps.
     */
    UStack decisionPointStack;

    /**
     * A list of states that loop back on themselves.  Used to handle .*?
     */
    UVector loopingStates;

    /**
     * Looping states actually have to be backfilled later in the process
     * than everything else.  This is where a the list of states to backfill
     * is accumulated.  This is also used to handle .*?
     */
    UVector statesToBackfill;

    /**
     * A list mapping pairs of state numbers for states that are to be combined
     * to the state number of the state representing their combination.  Used
     * in the process of making the state table deterministic to prevent
     * infinite recursion.
     */
    UVector mergeList;

    /**
     * A flag that is used to indicate when the list of looping states can
     * be reset.
     */
    UBool clearLoopingStates;

    /**
     * A place where an error message can be stored if we get a parse error.
     * The error message is never displayed anywhere, so this is useful pretty
     * much only in conjunction with a debugger.
     */
    UnicodeString errorMessage;

    /**
     * A bit mask used to indicate a bit in the table's flags column that marks a
     * state as an accepting state.
     */
    static const int32_t END_STATE_FLAG /*= 0x8000*/;

    /**
     * A bit mask used to indicate a bit in the table's flags column that marks a
     * state as one the builder shouldn't loop to any looping states
     */
    static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;

    /**
     * A bit mask used to indicate a bit in the table's flags column that marks a
     * state as a lookahead state.
     */
    static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;

    /**
     * A bit mask representing the union of the mask values listed above.
     * Used for clearing or masking off the flag bits.
     */
    static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
            | DONT_LOOP_FLAG*/;

public:

    /**
     * The Builder class contains a reference to the iterator it's supposed to build.
     */
    RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);

    /**
     * Destructor.
     */
    ~RuleBasedBreakIteratorBuilder();

    /**
     * This is the main function for setting up the BreakIterator's tables.  It
     * just vectors different parts of the job off to other functions.
     */
    virtual void buildBreakIterator(const UnicodeString&    description,
                                    UErrorCode& err);

private:

    /**
     * Thus function has three main purposes:
     * <ul><li>Perform general syntax checking on the description, so the rest of the
     * build code can assume that it's parsing a legal description.
     * <li>Split the description into separate rules
     * <li>Perform variable-name substitutions (so that no one else sees variable names)
     * </ul>
     */
    virtual void buildRuleList(UnicodeString& description,
                               UErrorCode& err);

protected:

    /**
     * This function performs variable-name substitutions.  First it does syntax
     * checking on the variable-name definition.  If it's syntactically valid, it
     * then goes through the remainder of the description and does a simple
     * find-and-replace of the variable name with its text.  (The variable text
     * must be enclosed in either [] or () for this to work.)
     */
    virtual void processSubstitution(UnicodeString& description,
                                     UTextOffset ruleStart,
                                     UTextOffset ruleEnd,
                                     UTextOffset startPos,
                                     UErrorCode& err);

    /**
     * This function defines a protocol for handling substitution names that
     * are "special," i.e., that have some property beyond just being
     * substitutions.  At the RuleBasedBreakIterator level, we have one
     * special substitution name, "<ignore>".  Subclasses can override this
     * function to add more.  Any special processing that has to go on beyond
     * that which is done by the normal substitution-processing code is done
     * here.
     */
    virtual void handleSpecialSubstitution(const UnicodeString& replace,
                                           const UnicodeString& replaceWith,
                                           int32_t startPos,
                                           const UnicodeString& description,
                                           UErrorCode& err);

    /**
     * This function provides a hook for subclasses to mess with the character
     * category table.
     */
    virtual void mungeExpressionList();

    /**
     * This function builds the character category table.  On entry,
     * tempRuleList is a UVector of break rules that has had variable names substituted.
     * On exit, the charCategoryTable data member has been initialized to hold the
     * character category table, and tempRuleList's rules have been munged to contain
     * character category numbers everywhere a literal character or a [] expression
     * originally occurred.
     */
    virtual void buildCharCategories(UErrorCode& err);

private:

    /**
     * This is the function that builds the forward state table.  Most of the real
     * work is done in parseRule(), which is called once for each rule in the
     * description.
     */
    virtual void buildStateTable(UErrorCode& err);

    /**
     * This is where most of the work really happens.  This routine parses a single
     * rule in the rule description, adding and modifying states in the state
     * table according to the new expression.  The state table is kept deterministic
     * throughout the whole operation, although some ugly postprocessing is needed
     * to handle the *? token.
     */
    virtual void parseRule(const UnicodeString& rule,
                           UBool               forward);

    /**
     * Update entries in the state table, and merge states when necessary to keep
     * the table deterministic.
     * @param rows The list of rows that need updating (the decision point list)
     * @param pendingChars A character category list, encoded in a String.  This is the
     * list of the columns that need updating.
     * @param newValue Update the cells specfied above to contain this value
     */
    virtual void updateStateTable(const UVector&       rows,
                                  const UnicodeString& pendingChars,
                                  int16_t              newValue);

    /**
     * The real work of making the state table deterministic happens here.  This function
     * merges a state in the state table (specified by rowNum) with a state that is
     * passed in (newValues).  The basic process is to copy the nonzero cells in newStates
     * into the state in the state table (we'll call that oldValues).  If there's a
     * collision (i.e., if the same cell has a nonzero value in both states, and it's
     * not the SAME value), then we have to reconcile the collision.  We do this by
     * creating a new state, adding it to the end of the state table, and using this
     * function recursively to merge the original two states into a single, combined
     * state.  This process may happen recursively (i.e., each successive level may
     * involve collisions).  To prevent infinite recursion, we keep a log of merge
     * operations.  Any time we're merging two states we've merged before, we can just
     * supply the row number for the result of that merge operation rather than creating
     * a new state just like it.
     * @param rowNum The row number in the state table of the state to be updated
     * @param newValues The state to merge it with.
     * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
     * (itself a copy of the decision point list from parseRule()).  Newly-created
     * states get added to the decision point list if their "parents" were on it.
     */
    virtual void mergeStates(int32_t  rowNum,
                             int16_t* newValues,
                             const UVector& rowsBeingUpdated);

    /**
     * The merge list is a list of pairs of rows that have been merged somewhere in
     * the process of building this state table, along with the row number of the
     * row containing the merged state.  This function looks up a pair of row numbers
     * and returns the row number of the row they combine into.  (It returns 0 if
     * this pair of rows isn't in the merge list.)
     */
    virtual int32_t searchMergeList(int32_t a, int32_t b);

    /**
     * This function is used to update the list of current loooping states (i.e.,
     * states that are controlled by a *? construct).  It backfills values from
     * the looping states into unpopulated cells of the states that are currently
     * marked for backfilling, and then updates the list of looping states to be
     * the new list
     * @param newLoopingStates The list of new looping states
     * @param endStates The list of states to treat as end states (states that
     * can exit the loop).
     */
    virtual void setLoopingStates(const UVector* newLoopingStates,
                                  const UVector& endStates);

    /**
     * This removes "ending states" and states reachable from them from the
     * list of states to backfill.
     * @param The row number of the state to remove from the backfill list
     */
    virtual void eliminateBackfillStates(int32_t baseState);

    /**
     * This function completes the backfilling process by actually doing the
     * backfilling on the states that are marked for it
     */
    virtual void backfillLoopingStates(void);

    /**
     * This function completes the state-table-building process by doing several
     * postprocessing steps and copying everything into its final resting place
     * in the iterator itself
     * @param forward True if we're working on the forward state table
     */
    virtual void finishBuildingStateTable(UBool forward);

    /**
     * This function builds the backward state table from the forward state
     * table and any additional rules (identified by the ! on the front)
     * supplied in the description
     */
    virtual void buildBackwardsStateTable(UErrorCode& err);

protected:

    /**
     * Throws an IllegalArgumentException representing a syntax error in the rule
     * description.  The exception's message contains some debugging information.
     * @param message A message describing the problem
     * @param position The position in the description where the problem was
     * discovered
     * @param context The string containing the error
     */
    virtual void setUpErrorMessage(const UnicodeString& message,
                                   int32_t position,
                                   const UnicodeString& context);
};

#endif