/*
**********************************************************************
*   Copyright (C) 1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/

#include "rbt_pars.h"
#include "unicode/rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"

// All special characters below are spelled as numeric code points, with
// the corresponding glyph in a trailing comment.
// NOTE(review): presumably this keeps the values independent of the
// compiler's execution character set -- confirm before replacing them
// with character literals.

// Operators
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/;
const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/;
// FWDREV_RULE_OP never appears in rule text; the parser substitutes it
// when it sees REVERSE_RULE_OP immediately followed by FORWARD_RULE_OP.
const UChar TransliterationRuleParser::FWDREV_RULE_OP  = 0x007E/*~*/; // internal rep of <> op

// The characters that can begin an operator in rule text.
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);

// Other special characters
const UChar TransliterationRuleParser::QUOTE              = 0x0027/*'*/;
const UChar TransliterationRuleParser::ESCAPE             = 0x005C/*\*/;
const UChar TransliterationRuleParser::END_OF_RULE        = 0x003B/*;*/;
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR  = 0x0023/*#*/;

const UChar TransliterationRuleParser::VARIABLE_REF_OPEN  = 0x007B/*{*/;
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/;
const UChar TransliterationRuleParser::CONTEXT_OPEN       = 0x0028/*(*/;
const UChar TransliterationRuleParser::CONTEXT_CLOSE      = 0x0029/*)*/;
const UChar TransliterationRuleParser::SET_OPEN           = 0x005B/*[*/;
const UChar TransliterationRuleParser::SET_CLOSE          = 0x005D/*]*/;
const UChar TransliterationRuleParser::CURSOR_POS         = 0x007C/*|*/;

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface.
It is used * during parsing to give UnicodeSet access to variables that * have been defined so far. Note that it uses setVariablesVector, * _not_ data.setVariables. */ class ParseData : public SymbolTable { public: const TransliterationRuleData* data; // alias const UVector* setVariablesVector; // alias ParseData(const TransliterationRuleData* data = 0, const UVector* setVariablesVector = 0); /** * Lookup the object associated with this string and return it. * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not * exist. Return a non-NULL set if the name is mapped to a set; * otherwise return a NULL set. */ virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, UErrorCode& status) const; }; ParseData::ParseData(const TransliterationRuleData* d, const UVector* sets) : data(d), setVariablesVector(sets) {} /** * Implement SymbolTable API. Lookup a variable, returning * either a Character, a UnicodeSet, or null. */ void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, UErrorCode& status) const { c = data->lookupVariable(name, status); if (U_SUCCESS(status)) { int32_t i = c - data->setVariablesBase; set = (i < setVariablesVector->size()) ? 
(UnicodeSet*) setVariablesVector->elementAt(i) : 0; } } //---------------------------------------------------------------------- // END ParseData //---------------------------------------------------------------------- TransliterationRuleData* TransliterationRuleParser::parse(const UnicodeString& rules, RuleBasedTransliterator::Direction direction) { TransliterationRuleParser parser(rules, direction); parser.parseRules(); if (U_FAILURE(parser.status)) { delete parser.data; parser.data = 0; } return parser.data; } /** * @param rules list of rules, separated by newline characters * @exception IllegalArgumentException if there is a syntax error in the * rules */ TransliterationRuleParser::TransliterationRuleParser( const UnicodeString& theRules, RuleBasedTransliterator::Direction theDirection) : rules(theRules), direction(theDirection), data(0) { parseData = new ParseData(0, &setVariablesVector); } /** * Destructor. */ TransliterationRuleParser::~TransliterationRuleParser() { delete parseData; } /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once, during construction. * @exception IllegalArgumentException if there is a syntax error in the * rules */ void TransliterationRuleParser::parseRules(void) { status = U_ZERO_ERROR; delete data; data = new TransliterationRuleData(status); if (U_FAILURE(status)) { return; } parseData->data = data; setVariablesVector.removeAllElements(); determineVariableRange(); int32_t pos = 0; int32_t limit = rules.length(); while (pos < limit && U_SUCCESS(status)) { UChar c = rules.charAt(pos++); if (Unicode::isWhitespace(c)) { // Ignore leading whitespace. Note that this is not // Unicode spaces, but Java spaces -- a subset, // representing whitespace likely to be seen in code. 
continue; } // Skip lines starting with the comment character if (c == RULE_COMMENT_CHAR) { pos = rules.indexOf("\n", pos) + 1; if (pos == 0) { break; // No "\n" found; rest of rule is a commnet } continue; // Either fall out or restart with next line } // We've found the start of a rule. c is its first // character, and pos points past c. Lexically parse the // rule into component pieces. pos = parseRule(--pos, limit); } // Convert the set vector to an array data->setVariablesLength = setVariablesVector.size(); data->setVariables = new UnicodeSet*[data->setVariablesLength]; // orphanElement removes the given element and shifts all other // elements down. For performance (and code clarity) we work from // the end back to index 0. for (int32_t i=data->setVariablesLength; i>0; ) { --i; data->setVariables[i] = (UnicodeSet*) setVariablesVector.orphanElementAt(i); } // Index the rules if (U_SUCCESS(status)) { data->ruleSet.freeze(*data, status); } } /** * MAIN PARSER. Parse the next rule in the given rule string, starting * at pos. Return the index after the last character parsed. Do not * parse characters at or after limit. * * Important: The character at pos must be a non-whitespace character * that is not the comment character. * * This method handles quoting, escaping, and whitespace removal. It * parses the end-of-rule character. It recognizes context and cursor * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. 
*/ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { // Locate the left side, operator, and right side int32_t start = pos; UChar op = 0; UnicodeString buf; int32_t cursor = -1; // position of cursor in buf int32_t ante = -1; // position of ante context marker ')' in buf int32_t post = -1; // position of post context marker '(' in buf int32_t postClose = -1; // position of post context close ')' in buf // Assigned to buf and its adjuncts after the LHS has been // parsed. Thereafter, buf etc. refer to the RHS. UnicodeString left; int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1; UnicodeString scratch; while (pos < limit) { UChar c = rules.charAt(pos++); if (Unicode::isWhitespace(c)) { // Ignore whitespace. Note that this is not Unicode // spaces, but Java spaces -- a subset, representing // whitespace likely to be seen in code. continue; } // Handle escapes if (c == ESCAPE) { if (pos == limit) { return syntaxError("Trailing backslash", rules, start); } // Parse \uXXXX escapes c = rules.charAt(pos++); if (c == 0x0075/*u*/) { if ((pos+4) > limit) { return syntaxError("Malformed Unicode escape", rules, start); } c = (UChar)0x0000; for (int32_t plim=pos+4; pos= 0) { if (op != 0) { return syntaxError("Unquoted special", rules, start); } // Found an operator char. Check for forward-reverse operator. 
if (c == REVERSE_RULE_OP && (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) { ++pos; op = FWDREV_RULE_OP; } else { op = c; } left = buf; // lhs leftCursor = cursor; leftAnte = ante; leftPost = post; leftPostClose = postClose; buf.truncate(0); cursor = ante = post = postClose = -1; continue; } if (c == END_OF_RULE) { break; } switch (c) { case VARIABLE_REF_OPEN: { int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos); if (pos == j || j < 0) { // empty or unterminated return syntaxError("Malformed variable reference", rules, start); } scratch.truncate(0); rules.extractBetween(pos, j, scratch); pos = j+1; UChar v = data->lookupVariable(scratch, status); if (U_FAILURE(status)) { return syntaxError("Undefined variable", rules, start); } buf.append(v); } break; case CONTEXT_OPEN: if (post >= 0) { return syntaxError("Multiple post contexts", rules, start); } // Ignore CONTEXT_OPEN if buffer length is zero -- that means // this is the optional opening delimiter for the ante context. if (buf.length() > 0) { post = buf.length(); } break; case CONTEXT_CLOSE: if (postClose >= 0) { return syntaxError("Unexpected ')'", rules, start); } if (post >= 0) { // This is probably the optional closing delimiter // for the post context; save the pos and check later. 
postClose = buf.length(); } else if (ante >= 0) { return syntaxError("Multiple ante contexts", rules, start); } else { ante = buf.length(); } break; case SET_OPEN: { ParsePosition pp(pos-1); // Backup to opening '[' buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status))); if (U_FAILURE(status)) { return syntaxError("Invalid set", rules, start); } pos = pp.getIndex(); } break; case VARIABLE_REF_CLOSE: case SET_CLOSE: return syntaxError("Unquoted special", rules, start); case CURSOR_POS: if (cursor >= 0) { return syntaxError("Multiple cursors", rules, start); } cursor = buf.length(); break; default: buf.append(c); break; } } if (op == 0) { return syntaxError("No operator", rules, start); } // Check context close parameters if ((leftPostClose >= 0 && leftPostClose != left.length()) || (postClose >= 0 && postClose != buf.length())) { return syntaxError("Extra text after ]", rules, start); } // Context is only allowed on the input side; that is, the left side // for forward rules. Cursors are only allowed on the output side; // that is, the right side for forward rules. Bidirectional rules // ignore elements that do not apply. switch (op) { case VARIABLE_DEF_OP: // LHS is the name. RHS is a single character, either a literal // or a set (already parsed). If RHS is longer than one // character, it is either a multi-character string, or multiple // sets, or a mixture of chars and sets -- syntax error. 
if (buf.length() != 1) { return syntaxError("Malformed RHS", rules, start); } if (data->isVariableDefined(left)) { return syntaxError("Duplicate definition", rules, start); } data->defineVariable(left, buf.charAt(0), status); break; case FORWARD_RULE_OP: if (direction == RuleBasedTransliterator::FORWARD) { if (ante >= 0 || post >= 0 || leftCursor >= 0) { return syntaxError("Malformed rule", rules, start); } data->ruleSet.addRule(new TransliterationRule( left, leftAnte, leftPost, buf, cursor, status), status); } // otherwise ignore the rule; it's not the direction we want break; case REVERSE_RULE_OP: if (direction == RuleBasedTransliterator::REVERSE) { if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { return syntaxError("Malformed rule", rules, start); } data->ruleSet.addRule(new TransliterationRule( buf, ante, post, left, leftCursor, status), status); } // otherwise ignore the rule; it's not the direction we want break; case FWDREV_RULE_OP: if (direction == RuleBasedTransliterator::FORWARD) { // The output side is the right; trim off any context if (post >= 0) { buf.remove(post); } if (ante >= 0) { buf.removeBetween(0, ante); } data->ruleSet.addRule(new TransliterationRule( left, leftAnte, leftPost, buf, cursor, status), status); } else { // The output side is the left; trim off any context if (leftPost >= 0) { left.remove(leftPost); } if (leftAnte >= 0) { left.removeBetween(0, leftAnte); } data->ruleSet.addRule(new TransliterationRule( buf, ante, post, left, leftCursor, status), status); } break; } return pos; } /** * Called by main parser upon syntax error. Search the rule string * for the probable end of the rule. Of course, if the error is that * the end of rule marker is missing, then the rule end will not be found. * In any case the rule start will be correctly reported. 
* @param msg error description * @param rule pattern string * @param start position of first character of current rule */ int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/, const UnicodeString& /*rule*/, int32_t start) { //| int end = quotedIndexOf(rule, start, rule.length(), ";"); //| if (end < 0) { //| end = rule.length(); //| } //| throw new IllegalArgumentException(msg + " in " + //| rule.substring(start, end)); status = U_ILLEGAL_ARGUMENT_ERROR; return start; } /** * Allocate a private-use substitution character for the given set, * register it in the setVariables hash, and return the substitution * character. */ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) { if (variableNext >= variableLimit) { // throw new RuntimeException("Private use variables exhausted"); status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } setVariablesVector.addElement(adoptedSet); return variableNext++; } /** * Determines what part of the private use region of Unicode we can use for * variable stand-ins. The correct way to do this is as follows: Parse each * rule, and for forward and reverse rules, take the FROM expression, and * make a hash of all characters used. The TO expression should be ignored. * When done, everything not in the hash is available for use. In practice, * this method may employ some other algorithm for improved speed. */ void TransliterationRuleParser::determineVariableRange(void) { UnicodeRange privateUse(0xE000, 0x1900); // Private use area UnicodeRange* r = privateUse.largestUnusedSubrange(rules); data->setVariablesBase = variableNext = variableLimit = (UChar) 0; if (r != 0) { data->setVariablesBase = variableNext = r->start; variableLimit = (UChar) (r->start + r->length); delete r; } if (variableNext >= variableLimit) { status = U_ILLEGAL_ARGUMENT_ERROR; } } /** * Returns the index of the first character in a set, ignoring quoted text. 
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be * found by a search for "h". Unlike String.indexOf(), this method searches * not for a single character, but for any character of the string * setOfChars. * @param text text to be searched * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param setOfChars string with one or more distinct characters * @return Offset of the first character in setOfChars * found, or -1 if not found. * @see #indexOf */ int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text, int32_t start, int32_t limit, const UnicodeString& setOfChars) { for (int32_t i=start; i= 0) { return i; } } return -1; }