8c78422db2
X-SVN-Rev: 7942
359 lines
14 KiB
C++
359 lines
14 KiB
C++
/*
|
|
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* Date Name Description
|
|
* 12/15/99 rgillam Port from Java.
|
|
**********************************************************************
|
|
*/
|
|
|
|
#ifndef RBBI_BLD_H
|
|
#define RBBI_BLD_H
|
|
|
|
#include "rbbi.h"
|
|
#include "rbbi_tbl.h"
|
|
#include "unicode/uniset.h"
|
|
#include "uvector.h"
|
|
|
|
class ExpressionList;
|
|
|
|
//=======================================================================
|
|
// RuleBasedBreakIterator.Builder
|
|
//=======================================================================
|
|
/**
|
|
* The Builder class has the job of constructing a RuleBasedBreakIterator from a
|
|
* textual description. A Builder is constructed by RuleBasedBreakIterator's
|
|
* constructor, which uses it to construct the iterator itself and then throws it
|
|
* away.
|
|
* <p>The construction logic is separated out into its own class for two primary
|
|
* reasons:
|
|
* <ul><li>The construction logic is quite complicated and large. Separating it
|
|
* out into its own class means the code must only be loaded into memory while a
|
|
* RuleBasedBreakIterator is being constructed, and can be purged after that.
|
|
* <li>There is a fair amount of state that must be maintained throughout the
|
|
* construction process that is not needed by the iterator after construction.
|
|
* Separating this state out into another class prevents all of the functions that
|
|
* construct the iterator from having to have really long parameter lists,
|
|
* (hopefully) contributing to readability and maintainability.</ul>
|
|
* <p>It'd be really nice if this could be an independent class rather than an
|
|
* inner class, because that would shorten the source file considerably, but
|
|
* making Builder an inner class of RuleBasedBreakIterator allows it direct access
|
|
* to RuleBasedBreakIterator's private members, which saves us from having to
|
|
* provide some kind of "back door" to the Builder class that could then also be
|
|
* used by other classes.
|
|
*/
|
|
class RuleBasedBreakIteratorBuilder {
|
|
|
|
protected:
|
|
/**
|
|
* The iterator we're constructing.
|
|
*/
|
|
RuleBasedBreakIterator& iterator;
|
|
|
|
/**
|
|
* The tables object for the iterator we're constructing.
|
|
*/
|
|
RuleBasedBreakIteratorTables* tables;
|
|
|
|
/**
|
|
* A temporary place to hold the rules as they're being processed.
|
|
*/
|
|
UVector tempRuleList;
|
|
|
|
/**
|
|
* A temporary holding place used for calculating the character categories.
|
|
* This object contains UnicodeSet objects.
|
|
*/
|
|
UVector categories;
|
|
|
|
/**
|
|
* The number of categories (and thus the number of columns in the finished state tables)
|
|
*/
|
|
int32_t numCategories;
|
|
|
|
/**
|
|
* A table used to map parts of regexp text to lists of character categories,
|
|
* rather than having to figure them out from scratch each time
|
|
*/
|
|
ExpressionList* expressions;
|
|
|
|
/**
|
|
* A temporary holding place for the list of ignore characters
|
|
*/
|
|
UnicodeSet ignoreChars;
|
|
|
|
/**
|
|
* A temporary holding place where the forward state table is built
|
|
*/
|
|
UVector tempStateTable;
|
|
|
|
/**
|
|
* A list of all the states that have to be filled in with transitions to the
|
|
* next state that is created. Used when building the state table from the
|
|
* regular expressions.
|
|
*/
|
|
UVector decisionPointList;
|
|
|
|
/**
|
|
* A UStack for holding decision point lists. This is used to handle nested
|
|
* parentheses and braces in regexps.
|
|
*/
|
|
UStack decisionPointStack;
|
|
|
|
/**
|
|
* A list of states that loop back on themselves. Used to handle .*?
|
|
*/
|
|
UVector loopingStates;
|
|
|
|
/**
|
|
* Looping states actually have to be backfilled later in the process
|
|
* than everything else. This is where a the list of states to backfill
|
|
* is accumulated. This is also used to handle .*?
|
|
*/
|
|
UVector statesToBackfill;
|
|
|
|
/**
|
|
* A list mapping pairs of state numbers for states that are to be combined
|
|
* to the state number of the state representing their combination. Used
|
|
* in the process of making the state table deterministic to prevent
|
|
* infinite recursion.
|
|
*/
|
|
UVector mergeList;
|
|
|
|
/**
|
|
* A flag that is used to indicate when the list of looping states can
|
|
* be reset.
|
|
*/
|
|
UBool clearLoopingStates;
|
|
|
|
/**
|
|
* A place where an error message can be stored if we get a parse error.
|
|
* The error message is never displayed anywhere, so this is useful pretty
|
|
* much only in conjunction with a debugger.
|
|
*/
|
|
UnicodeString errorMessage;
|
|
|
|
/**
|
|
* A bit mask used to indicate a bit in the table's flags column that marks a
|
|
* state as an accepting state.
|
|
*/
|
|
static const int32_t END_STATE_FLAG /*= 0x8000*/;
|
|
|
|
/**
|
|
* A bit mask used to indicate a bit in the table's flags column that marks a
|
|
* state as one the builder shouldn't loop to any looping states
|
|
*/
|
|
static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
|
|
|
|
/**
|
|
* A bit mask used to indicate a bit in the table's flags column that marks a
|
|
* state as a lookahead state.
|
|
*/
|
|
static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
|
|
|
|
/**
|
|
* A bit mask representing the union of the mask values listed above.
|
|
* Used for clearing or masking off the flag bits.
|
|
*/
|
|
static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
|
|
| DONT_LOOP_FLAG*/;
|
|
|
|
public:
|
|
|
|
/**
|
|
* The Builder class contains a reference to the iterator it's supposed to build.
|
|
*/
|
|
RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
|
|
|
|
/**
|
|
* Destructor.
|
|
*/
|
|
~RuleBasedBreakIteratorBuilder();
|
|
|
|
/**
|
|
* This is the main function for setting up the BreakIterator's tables. It
|
|
* just vectors different parts of the job off to other functions.
|
|
*/
|
|
virtual void buildBreakIterator(const UnicodeString& description,
|
|
UErrorCode& err);
|
|
|
|
private:
|
|
|
|
/**
|
|
* Thus function has three main purposes:
|
|
* <ul><li>Perform general syntax checking on the description, so the rest of the
|
|
* build code can assume that it's parsing a legal description.
|
|
* <li>Split the description into separate rules
|
|
* <li>Perform variable-name substitutions (so that no one else sees variable names)
|
|
* </ul>
|
|
*/
|
|
virtual void buildRuleList(UnicodeString& description,
|
|
UErrorCode& err);
|
|
|
|
protected:
|
|
|
|
/**
|
|
* This function performs variable-name substitutions. First it does syntax
|
|
* checking on the variable-name definition. If it's syntactically valid, it
|
|
* then goes through the remainder of the description and does a simple
|
|
* find-and-replace of the variable name with its text. (The variable text
|
|
* must be enclosed in either [] or () for this to work.)
|
|
*/
|
|
virtual void processSubstitution(UnicodeString& description,
|
|
int32_t ruleStart,
|
|
int32_t ruleEnd,
|
|
int32_t startPos,
|
|
UErrorCode& err);
|
|
|
|
/**
|
|
* This function defines a protocol for handling substitution names that
|
|
* are "special," i.e., that have some property beyond just being
|
|
* substitutions. At the RuleBasedBreakIterator level, we have one
|
|
* special substitution name, "<ignore>". Subclasses can override this
|
|
* function to add more. Any special processing that has to go on beyond
|
|
* that which is done by the normal substitution-processing code is done
|
|
* here.
|
|
*/
|
|
virtual void handleSpecialSubstitution(const UnicodeString& replace,
|
|
const UnicodeString& replaceWith,
|
|
int32_t startPos,
|
|
const UnicodeString& description,
|
|
UErrorCode& err);
|
|
|
|
/**
|
|
* This function provides a hook for subclasses to mess with the character
|
|
* category table.
|
|
*/
|
|
virtual void mungeExpressionList();
|
|
|
|
/**
|
|
* This function builds the character category table. On entry,
|
|
* tempRuleList is a UVector of break rules that has had variable names substituted.
|
|
* On exit, the charCategoryTable data member has been initialized to hold the
|
|
* character category table, and tempRuleList's rules have been munged to contain
|
|
* character category numbers everywhere a literal character or a [] expression
|
|
* originally occurred.
|
|
*/
|
|
virtual void buildCharCategories(UErrorCode& err);
|
|
|
|
private:
|
|
|
|
/**
|
|
* This is the function that builds the forward state table. Most of the real
|
|
* work is done in parseRule(), which is called once for each rule in the
|
|
* description.
|
|
*/
|
|
virtual void buildStateTable(UErrorCode& err);
|
|
|
|
/**
|
|
* This is where most of the work really happens. This routine parses a single
|
|
* rule in the rule description, adding and modifying states in the state
|
|
* table according to the new expression. The state table is kept deterministic
|
|
* throughout the whole operation, although some ugly postprocessing is needed
|
|
* to handle the *? token.
|
|
*/
|
|
virtual void parseRule(const UnicodeString& rule,
|
|
UBool forward);
|
|
|
|
/**
|
|
* Update entries in the state table, and merge states when necessary to keep
|
|
* the table deterministic.
|
|
* @param rows The list of rows that need updating (the decision point list)
|
|
* @param pendingChars A character category list, encoded in a String. This is the
|
|
* list of the columns that need updating.
|
|
* @param newValue Update the cells specfied above to contain this value
|
|
*/
|
|
virtual void updateStateTable(const UVector& rows,
|
|
const UnicodeString& pendingChars,
|
|
int16_t newValue);
|
|
|
|
/**
|
|
* The real work of making the state table deterministic happens here. This function
|
|
* merges a state in the state table (specified by rowNum) with a state that is
|
|
* passed in (newValues). The basic process is to copy the nonzero cells in newStates
|
|
* into the state in the state table (we'll call that oldValues). If there's a
|
|
* collision (i.e., if the same cell has a nonzero value in both states, and it's
|
|
* not the SAME value), then we have to reconcile the collision. We do this by
|
|
* creating a new state, adding it to the end of the state table, and using this
|
|
* function recursively to merge the original two states into a single, combined
|
|
* state. This process may happen recursively (i.e., each successive level may
|
|
* involve collisions). To prevent infinite recursion, we keep a log of merge
|
|
* operations. Any time we're merging two states we've merged before, we can just
|
|
* supply the row number for the result of that merge operation rather than creating
|
|
* a new state just like it.
|
|
* @param rowNum The row number in the state table of the state to be updated
|
|
* @param newValues The state to merge it with.
|
|
* @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
|
|
* (itself a copy of the decision point list from parseRule()). Newly-created
|
|
* states get added to the decision point list if their "parents" were on it.
|
|
*/
|
|
virtual void mergeStates(int32_t rowNum,
|
|
int16_t* newValues,
|
|
const UVector& rowsBeingUpdated);
|
|
|
|
/**
|
|
* The merge list is a list of pairs of rows that have been merged somewhere in
|
|
* the process of building this state table, along with the row number of the
|
|
* row containing the merged state. This function looks up a pair of row numbers
|
|
* and returns the row number of the row they combine into. (It returns 0 if
|
|
* this pair of rows isn't in the merge list.)
|
|
*/
|
|
virtual int32_t searchMergeList(int32_t a, int32_t b);
|
|
|
|
/**
|
|
* This function is used to update the list of current loooping states (i.e.,
|
|
* states that are controlled by a *? construct). It backfills values from
|
|
* the looping states into unpopulated cells of the states that are currently
|
|
* marked for backfilling, and then updates the list of looping states to be
|
|
* the new list
|
|
* @param newLoopingStates The list of new looping states
|
|
* @param endStates The list of states to treat as end states (states that
|
|
* can exit the loop).
|
|
*/
|
|
virtual void setLoopingStates(const UVector* newLoopingStates,
|
|
const UVector& endStates);
|
|
|
|
/**
|
|
* This removes "ending states" and states reachable from them from the
|
|
* list of states to backfill.
|
|
* @param The row number of the state to remove from the backfill list
|
|
*/
|
|
virtual void eliminateBackfillStates(int32_t baseState);
|
|
|
|
/**
|
|
* This function completes the backfilling process by actually doing the
|
|
* backfilling on the states that are marked for it
|
|
*/
|
|
virtual void backfillLoopingStates(void);
|
|
|
|
/**
|
|
* This function completes the state-table-building process by doing several
|
|
* postprocessing steps and copying everything into its final resting place
|
|
* in the iterator itself
|
|
* @param forward True if we're working on the forward state table
|
|
*/
|
|
virtual void finishBuildingStateTable(UBool forward);
|
|
|
|
/**
|
|
* This function builds the backward state table from the forward state
|
|
* table and any additional rules (identified by the ! on the front)
|
|
* supplied in the description
|
|
*/
|
|
virtual void buildBackwardsStateTable(UErrorCode& err);
|
|
|
|
protected:
|
|
|
|
/**
|
|
* Throws an IllegalArgumentException representing a syntax error in the rule
|
|
* description. The exception's message contains some debugging information.
|
|
* @param message A message describing the problem
|
|
* @param position The position in the description where the problem was
|
|
* discovered
|
|
* @param context The string containing the error
|
|
*/
|
|
virtual void setUpErrorMessage(const UnicodeString& message,
|
|
int32_t position,
|
|
const UnicodeString& context);
|
|
};
|
|
|
|
#endif
|