/* * Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 12/15/99 rgillam Port from Java. ********************************************************************** */ #ifndef RBBI_BLD_H #define RBBI_BLD_H #include "rbbi.h" #include "rbbi_tbl.h" #include "unicode/uniset.h" #include "uvector.h" class ExpressionList; //======================================================================= // RuleBasedBreakIterator.Builder //======================================================================= /** * The Builder class has the job of constructing a RuleBasedBreakIterator from a * textual description. A Builder is constructed by RuleBasedBreakIterator's * constructor, which uses it to construct the iterator itself and then throws it * away. *

The construction logic is separated out into its own class for two primary * reasons: *

*

It'd be really nice if this could be an independent class rather than an * inner class, because that would shorten the source file considerably, but * making Builder an inner class of RuleBasedBreakIterator allows it direct access * to RuleBasedBreakIterator's private members, which saves us from having to * provide some kind of "back door" to the Builder class that could then also be * used by other classes. */ class RuleBasedBreakIteratorBuilder { protected: /** * The iterator we're constructing. */ RuleBasedBreakIterator& iterator; /** * The tables object for the iterator we're constructing. */ RuleBasedBreakIteratorTables* tables; /** * A temporary place to hold the rules as they're being processed. */ UVector tempRuleList; /** * A temporary holding place used for calculating the character categories. * This object contains UnicodeSet objects. */ UVector categories; /** * The number of categories (and thus the number of columns in the finished state tables) */ int32_t numCategories; /** * A table used to map parts of regexp text to lists of character categories, * rather than having to figure them out from scratch each time */ ExpressionList* expressions; /** * A temporary holding place for the list of ignore characters */ UnicodeSet ignoreChars; /** * A temporary holding place where the forward state table is built */ UVector tempStateTable; /** * A list of all the states that have to be filled in with transitions to the * next state that is created. Used when building the state table from the * regular expressions. */ UVector decisionPointList; /** * A UStack for holding decision point lists. This is used to handle nested * parentheses and braces in regexps. */ UStack decisionPointStack; /** * A list of states that loop back on themselves. Used to handle .*? */ UVector loopingStates; /** * Looping states actually have to be backfilled later in the process * than everything else. This is where a the list of states to backfill * is accumulated. This is also used to handle .*? */ UVector statesToBackfill; /** * A list mapping pairs of state numbers for states that are to be combined * to the state number of the state representing their combination. Used * in the process of making the state table deterministic to prevent * infinite recursion. */ UVector mergeList; /** * A flag that is used to indicate when the list of looping states can * be reset. */ bool_t clearLoopingStates; /** * A place where an error message can be stored if we get a parse error. * The error message is never displayed anywhere, so this is useful pretty * much only in conjunction with a debugger. */ UnicodeString errorMessage; /** * A bit mask used to indicate a bit in the table's flags column that marks a * state as an accepting state. */ static const int32_t END_STATE_FLAG /*= 0x8000*/; /** * A bit mask used to indicate a bit in the table's flags column that marks a * state as one the builder shouldn't loop to any looping states */ static const int32_t DONT_LOOP_FLAG /*= 0x4000*/; /** * A bit mask used to indicate a bit in the table's flags column that marks a * state as a lookahead state. */ static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/; /** * A bit mask representing the union of the mask values listed above. * Used for clearing or masking off the flag bits. */ static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG | DONT_LOOP_FLAG*/; public: /** * The Builder class contains a reference to the iterator it's supposed to build. */ RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild); /** * Destructor. */ ~RuleBasedBreakIteratorBuilder(); /** * This is the main function for setting up the BreakIterator's tables. It * just vectors different parts of the job off to other functions. */ virtual void buildBreakIterator(const UnicodeString& description, UErrorCode& err); private: /** * Thus function has three main purposes: *

*/ virtual void buildRuleList(UnicodeString& description, UErrorCode& err); protected: /** * This function performs variable-name substitutions. First it does syntax * checking on the variable-name definition. If it's syntactically valid, it * then goes through the remainder of the description and does a simple * find-and-replace of the variable name with its text. (The variable text * must be enclosed in either [] or () for this to work.) */ virtual void processSubstitution(UnicodeString& description, UTextOffset ruleStart, UTextOffset ruleEnd, UTextOffset startPos, UErrorCode& err); /** * This function defines a protocol for handling substitution names that * are "special," i.e., that have some property beyond just being * substitutions. At the RuleBasedBreakIterator level, we have one * special substitution name, "". Subclasses can override this * function to add more. Any special processing that has to go on beyond * that which is done by the normal substitution-processing code is done * here. */ virtual void handleSpecialSubstitution(const UnicodeString& replace, const UnicodeString& replaceWith, int32_t startPos, const UnicodeString& description, UErrorCode& err); /** * This function provides a hook for subclasses to mess with the character * category table. */ virtual void mungeExpressionList(); /** * This function builds the character category table. On entry, * tempRuleList is a UVector of break rules that has had variable names substituted. * On exit, the charCategoryTable data member has been initialized to hold the * character category table, and tempRuleList's rules have been munged to contain * character category numbers everywhere a literal character or a [] expression * originally occurred. */ virtual void buildCharCategories(UErrorCode& err); private: /** * This is the function that builds the forward state table. Most of the real * work is done in parseRule(), which is called once for each rule in the * description. */ virtual void buildStateTable(UErrorCode& err); /** * This is where most of the work really happens. This routine parses a single * rule in the rule description, adding and modifying states in the state * table according to the new expression. The state table is kept deterministic * throughout the whole operation, although some ugly postprocessing is needed * to handle the *? token. */ virtual void parseRule(const UnicodeString& rule, bool_t forward); /** * Update entries in the state table, and merge states when necessary to keep * the table deterministic. * @param rows The list of rows that need updating (the decision point list) * @param pendingChars A character category list, encoded in a String. This is the * list of the columns that need updating. * @param newValue Update the cells specfied above to contain this value */ virtual void updateStateTable(const UVector& rows, const UnicodeString& pendingChars, int16_t newValue); /** * The real work of making the state table deterministic happens here. This function * merges a state in the state table (specified by rowNum) with a state that is * passed in (newValues). The basic process is to copy the nonzero cells in newStates * into the state in the state table (we'll call that oldValues). If there's a * collision (i.e., if the same cell has a nonzero value in both states, and it's * not the SAME value), then we have to reconcile the collision. We do this by * creating a new state, adding it to the end of the state table, and using this * function recursively to merge the original two states into a single, combined * state. This process may happen recursively (i.e., each successive level may * involve collisions). To prevent infinite recursion, we keep a log of merge * operations. Any time we're merging two states we've merged before, we can just * supply the row number for the result of that merge operation rather than creating * a new state just like it. * @param rowNum The row number in the state table of the state to be updated * @param newValues The state to merge it with. * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() * (itself a copy of the decision point list from parseRule()). Newly-created * states get added to the decision point list if their "parents" were on it. */ virtual void mergeStates(int32_t rowNum, int16_t* newValues, const UVector& rowsBeingUpdated); /** * The merge list is a list of pairs of rows that have been merged somewhere in * the process of building this state table, along with the row number of the * row containing the merged state. This function looks up a pair of row numbers * and returns the row number of the row they combine into. (It returns 0 if * this pair of rows isn't in the merge list.) */ virtual int32_t searchMergeList(int32_t a, int32_t b); /** * This function is used to update the list of current loooping states (i.e., * states that are controlled by a *? construct). It backfills values from * the looping states into unpopulated cells of the states that are currently * marked for backfilling, and then updates the list of looping states to be * the new list * @param newLoopingStates The list of new looping states * @param endStates The list of states to treat as end states (states that * can exit the loop). */ virtual void setLoopingStates(const UVector* newLoopingStates, const UVector& endStates); /** * This removes "ending states" and states reachable from them from the * list of states to backfill. * @param The row number of the state to remove from the backfill list */ virtual void eliminateBackfillStates(int32_t baseState); /** * This function completes the backfilling process by actually doing the * backfilling on the states that are marked for it */ virtual void backfillLoopingStates(void); /** * This function completes the state-table-building process by doing several * postprocessing steps and copying everything into its final resting place * in the iterator itself * @param forward True if we're working on the forward state table */ virtual void finishBuildingStateTable(bool_t forward); /** * This function builds the backward state table from the forward state * table and any additional rules (identified by the ! on the front) * supplied in the description */ virtual void buildBackwardsStateTable(UErrorCode& err); protected: /** * Throws an IllegalArgumentException representing a syntax error in the rule * description. The exception's message contains some debugging information. * @param message A message describing the problem * @param position The position in the description where the problem was * discovered * @param context The string containing the error */ virtual void setUpErrorMessage(const UnicodeString& message, int32_t position, const UnicodeString& context); }; #endif