/* * Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #ifndef RBT_PARS_H #define RBT_PARS_H #include "unicode/rbt.h" #include "uvector.h" class TransliterationRuleData; class UnicodeSet; class ParseData; class TransliterationRuleParser { /** * This is a reference to external data we don't own. This works because * we only hold this for the duration of the call to parse(). */ const UnicodeString& rules; RuleBasedTransliterator::Direction direction; TransliterationRuleData* data; /** * We use a single error code during parsing. Rather than pass it * through each API, we keep it here. */ UErrorCode status; /** * Temporary symbol table used during parsing. */ ParseData* parseData; /** * Temporary vector of set variables. When parsing is complete, this * is copied into the array data.setVariables. As with data.setVariables, * element 0 corresponds to character data.setVariablesBase. */ UVector setVariablesVector; /** * The next available stand-in for variables. This starts at some point in * the private use area (discovered dynamically) and increments up toward * variableLimit. At any point during parsing, available * variables are variableNext..variableLimit-1. */ UChar variableNext; /** * The last available stand-in for variables. This is discovered * dynamically. At any point during parsing, available variables are * variableNext..variableLimit-1. */ UChar variableLimit; // Operators static const UChar VARIABLE_DEF_OP; static const UChar FORWARD_RULE_OP; static const UChar REVERSE_RULE_OP; static const UChar FWDREV_RULE_OP; // internal rep of <> op static const UnicodeString OPERATORS; // Other special characters static const UChar QUOTE; static const UChar ESCAPE; static const UChar END_OF_RULE; static const UChar RULE_COMMENT_CHAR; static const UChar VARIABLE_REF_OPEN; static const UChar VARIABLE_REF_CLOSE; static const UChar CONTEXT_OPEN; static const UChar CONTEXT_CLOSE; static const UChar SET_OPEN; static const UChar SET_CLOSE; static const UChar CURSOR_POS; public: static TransliterationRuleData* parse(const UnicodeString& rules, RuleBasedTransliterator::Direction direction); private: /** * @param rules list of rules, separated by newline characters * @exception IllegalArgumentException if there is a syntax error in the * rules */ TransliterationRuleParser(const UnicodeString& rules, RuleBasedTransliterator::Direction direction); /** * Destructor. */ ~TransliterationRuleParser(); /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once, during construction. * @exception IllegalArgumentException if there is a syntax error in the * rules */ void parseRules(void); /** * MAIN PARSER. Parse the next rule in the given rule string, starting * at pos. Return the index after the last character parsed. Do not * parse characters at or after limit. * * Important: The character at pos must be a non-whitespace character * that is not the comment character. * * This method handles quoting, escaping, and whitespace removal. It * parses the end-of-rule character. It recognizes context and cursor * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. */ int32_t parseRule(int32_t pos, int32_t limit); /** * Called by main parser upon syntax error. Search the rule string * for the probable end of the rule. Of course, if the error is that * the end of rule marker is missing, then the rule end will not be found. * In any case the rule start will be correctly reported. * @param msg error description * @param rule pattern string * @param start position of first character of current rule */ int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start); /** * Allocate a private-use substitution character for the given set, * register it in the setVariables hash, and return the substitution * character. */ UChar registerSet(UnicodeSet* adoptedSet); /** * Determines what part of the private use region of Unicode we can use for * variable stand-ins. The correct way to do this is as follows: Parse each * rule, and for forward and reverse rules, take the FROM expression, and * make a hash of all characters used. The TO expression should be ignored. * When done, everything not in the hash is available for use. In practice, * this method may employ some other algorithm for improved speed. */ void determineVariableRange(void); /** * Returns the index of the first character in a set, ignoring quoted text. * For example, in the string "abc'hide'h", the 'h' in "hide" will not be * found by a search for "h". Unlike String.indexOf(), this method searches * not for a single character, but for any character of the string * setOfChars. * @param text text to be searched * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param setOfChars string with one or more distinct characters * @return Offset of the first character in setOfChars * found, or -1 if not found. * @see #indexOf */ static int32_t quotedIndexOf(const UnicodeString& text, int32_t start, int32_t limit, const UnicodeString& setOfChars); }; #endif