/* * Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #ifndef RBT_PARS_H #define RBT_PARS_H #include "rbt.h" class TransliterationRuleData; class TransliterationRuleParser { /** * This is a reference to external data we don't own. This works because * we only hold this for the duration of the call to parse(). */ const UnicodeString& rules; RuleBasedTransliterator::Direction direction; TransliterationRuleData* data; /** * We use a single error code during parsing. Rather than pass it * through each API, we keep it here. */ UErrorCode status; /** * The next available stand-in for variables. This starts at some point in * the private use area (discovered dynamically) and increments up toward * variableLimit. At any point during parsing, available * variables are variableNext..variableLimit-1. */ UChar variableNext; /** * The last available stand-in for variables. This is discovered * dynamically. At any point during parsing, available variables are * variableNext..variableLimit-1. */ UChar variableLimit; // Operators static const UChar VARIABLE_DEF_OP; static const UChar FORWARD_RULE_OP; static const UChar REVERSE_RULE_OP; static const char* OPERATORS; // Other special characters static const UChar QUOTE; static const UChar VARIABLE_REF_OPEN; static const UChar VARIABLE_REF_CLOSE; static const UChar CONTEXT_OPEN; static const UChar CONTEXT_CLOSE; static const UChar CURSOR_POS; static const UChar RULE_COMMENT_CHAR; /** * Specials must be quoted in rules to be used as literals. * Specials may not occur in variable names. */ static const char* SPECIALS; /** * Specials that must be quoted in variable definitions. */ static const char* DEF_SPECIALS; public: static TransliterationRuleData* parse(const UnicodeString& rules, RuleBasedTransliterator::Direction direction); private: /** * @param rules list of rules, separated by newline characters * @exception IllegalArgumentException if there is a syntax error in the * rules */ TransliterationRuleParser(const UnicodeString& rules, RuleBasedTransliterator::Direction direction); /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once, during construction. * @exception IllegalArgumentException if there is a syntax error in the * rules */ void parseRules(); /** * Parse the given substring as a rule, and append it to the rules currently * represented in this object. * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= rules.length(). * @exception IllegalArgumentException if there is a syntax error in the * rules */ void applyRule(int32_t start, int32_t limit); /** * Add a variable definition. * @param name the name of the variable. It must not already be defined. * @param pattern the value of the variable. It may be a single character * or a pattern describing a character set. * @exception IllegalArgumentException if there is a syntax error */ void applyVariableDef(const UnicodeString& name, const UnicodeString& pattern); /** * Given a rule, parses it into three pieces: The left side, the right side, * and the operator. Returns the operator. Quotes and variable references * are resolved; the otuput text in all StringBuffer parameters * is literal text. This method delegates to other parsing methods to * handle the match pattern, output pattern, and other sub-patterns in the * rule. * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= rules.length(). * @param left left side of rule is appended to this buffer * with the quotes removed and variables resolved * @param right right side of rule is appended to this buffer * with the quotes removed and variables resolved * @param anteContext the preceding context of the match pattern, * if there is one, is appended to this buffer * @param postContext the following context of the match pattern, * if there is one, is appended to this buffer * @param cursorPos if there is a cursor in the output pattern, its * offset is stored in cursorPos[0] * @return The operator character, one of the characters in OPERATORS. */ UChar parseRule(int32_t start, int32_t limit, UnicodeString& left, UnicodeString& right, UnicodeString& anteContext, UnicodeString& postContext, int32_t& cursorPos); /** * Parses the match pattern of a forward or reverse rule. Given the raw * match pattern, return the match text and the context on both sides, if * any. Resolves all quotes and variables. * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= rules.length(). * @param text the key to be matched will be appended to this buffer * @param anteContext the preceding context, if any, will be appended * to this buffer. * @param postContext the following context, if any, will be appended * to this buffer. */ void parseMatchPattern(int32_t start, int32_t limit, UnicodeString& text, UnicodeString& anteContext, UnicodeString& postContext); void parseSubPattern(int32_t start, int32_t limit, UnicodeString& text); /** * Parse a variable definition sub pattern. This kind of sub * pattern differs in the set of characters that are considered * special. In particular, the '[' and ']' characters are not * special, since these are used in UnicodeSet patterns. */ void parseDefPattern(int32_t start, int32_t limit, UnicodeString& text); /** * Parses the output pattern of a forward or reverse rule. Given the * output pattern, return the output text and the position of the cursor, * if any. Resolves all quotes and variables. * @param rules the string to be parsed * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= rules.length(). * @param text the output text will be appended to this buffer * @param cursorPos if this parameter is not null, then cursorPos[0] * will be set to the cursor position, or -1 if there is none. If this * parameter is null, then cursors will be disallowed. */ void parseOutputPattern(int32_t start, int32_t limit, UnicodeString& text, int32_t& cursorPos); /** * Parses a sub-pattern of a rule. Return the text and the position of the cursor, * if any. Resolves all quotes and variables. * @param rules the string to be parsed * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= rules.length(). * @param text the output text will be appended to this buffer * @param cursorPos if this parameter is not null, then cursorPos[0] * will be set to the cursor position, or -1 if there is none. If this * parameter is null, then cursors will be disallowed. * @param specials characters that must be quoted; typically either * SPECIALS or DEF_SPECIALS. */ void parseSubPattern(int32_t start, int32_t limit, UnicodeString& text, int32_t* cursorPos, const UnicodeString& specials); void validateVariableName(const UnicodeString& name); /** * Returns the single character value of the given variable name. Defined * names are recognized. * * NO LONGER SUPPORTED: * If a Unicode category name is given, a standard character variable * in the range firstCategoryVariable to lastCategoryVariable is returned, * with value firstCategoryVariable + n, where n is the category * number. * @exception IllegalArgumentException if the name is unknown. */ //$ Character getVariableDef(const UnicodeString& name); /** * Determines what part of the private use region of Unicode we can use for * variable stand-ins. The correct way to do this is as follows: Parse each * rule, and for forward and reverse rules, take the FROM expression, and * make a hash of all characters used. The TO expression should be ignored. * When done, everything not in the hash is available for use. In practice, * this method may employ some other algorithm for improved speed. */ void determineVariableRange(); /** * Returns the index of the first character in a set, ignoring quoted text. * For example, in the string "abc'hide'h", the 'h' in "hide" will not be * found by a search for "h". Unlike String.indexOf(), this method searches * not for a single character, but for any character of the string * setOfChars. * @param text text to be searched * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param setOfChars string with one or more distinct characters * @return Offset of the first character in setOfChars * found, or -1 if not found. * @see #indexOf */ static int32_t quotedIndexOf(const UnicodeString& text, int32_t start, int32_t limit, const UnicodeString& setOfChars); /** * Returns the index of the first character in a set. Unlike * String.indexOf(), this method searches not for a single character, but * for any character of the string setOfChars. * @param text text to be searched * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param setOfChars string with one or more distinct characters * @return Offset of the first character in setOfChars * found, or -1 if not found. * @see #quotedIndexOf */ static int32_t indexOf(const UnicodeString& text, int32_t start, int32_t limit, const UnicodeString& setOfChars); /** * Returns the index of the first character in a set. Unlike * String.indexOf(), this method searches not for a single character, but * for any character of the string setOfChars. * @param text text to be searched * @param setOfChars string with one or more distinct characters * @return Offset of the first character in setOfChars * found, or -1 if not found. * @see #quotedIndexOf */ static int32_t indexOf(const UnicodeString& text, const UnicodeString& setOfChars); }; #endif