scuffed-code/icu4c/source/i18n/rbt_pars.cpp

/*
**********************************************************************
*   Copyright (C) 1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#include "rbt_pars.h"
#include "unicode/rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"

// Operators.  parseRule() requires exactly one operator per rule: a
// second operator character is reported as a syntax error, and so is a
// rule with none.  All values are spelled as UChar code points rather
// than char literals.
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/;
const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/;
// The two-character "<>" operator is recognized by looking ahead after
// '<'; it is then stored as this single internal code.  '~' itself is
// not listed in OPERATORS.
const UChar TransliterationRuleParser::FWDREV_RULE_OP  = 0x007E/*~*/; // internal rep of <> op
// The characters that parseRule() treats as the start of an operator.
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);

// Other special characters
const UChar TransliterationRuleParser::QUOTE = 0x0027/*'*/;
const UChar TransliterationRuleParser::ESCAPE = 0x005C/*\*/;
const UChar TransliterationRuleParser::END_OF_RULE = 0x003B/*;*/;
// Only recognized by parseRules(), where a rule would otherwise start.
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = 0x0023/*#*/;

// Delimiters: {name} is a variable reference, ( ) delimit context,
// [...] is a set pattern, and | marks the cursor position.
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = 0x007B/*{*/;
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/;
const UChar TransliterationRuleParser::CONTEXT_OPEN = 0x0028/*(*/;
const UChar TransliterationRuleParser::CONTEXT_CLOSE = 0x0029/*)*/;
const UChar TransliterationRuleParser::SET_OPEN = 0x005B/*[*/;
const UChar TransliterationRuleParser::SET_CLOSE = 0x005D/*]*/;
const UChar TransliterationRuleParser::CURSOR_POS = 0x007C/*|*/;

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface.  It is used
 * during parsing to give UnicodeSet access to variables that
 * have been defined so far.  Note that it uses setVariablesVector,
 * _not_ data.setVariables.
 *
 * Both members are aliases: this class neither allocates nor deletes
 * the objects they point to.
 */
class ParseData : public SymbolTable {
public:
    // Rule data used to resolve variable names.  The parser assigns
    // this field directly once it has created its rule data.
    const TransliterationRuleData* data; // alias

    // Sets registered so far by the parser; indexed by
    // (variable character - data->setVariablesBase) in lookup().
    const UVector* setVariablesVector; // alias

    // Both arguments are stored as-is; both default to null.
    ParseData(const TransliterationRuleData* data = 0,
              const UVector* setVariablesVector = 0);

    /**
     * Lookup the object associated with this string and return it.
     * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
     * exist.  Return a non-NULL set if the name is mapped to a set;
     * otherwise return a NULL set.
     */
    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
                        UErrorCode& status) const;
};

/**
 * Store the two aliases.  Neither object is copied or adopted, so both
 * must outlive this ParseData.
 */
ParseData::ParseData(const TransliterationRuleData* ruleData,
                     const UVector* registeredSets)
    : data(ruleData),
      setVariablesVector(registeredSets)
{
}

/**
 * Implement SymbolTable API.  Look up a variable by name and report
 * the character it stands for and, if that character is a stand-in
 * for a registered set, the set itself.
 *
 * @param name   variable name to resolve
 * @param c      receives the result of data->lookupVariable()
 * @param set    receives the set registered for c, or NULL when the
 *               lookup fails or c does not stand for a registered set
 * @param status error code, passed through to data->lookupVariable()
 */
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
                       UErrorCode& status) const {
    // Give the out-parameter a defined value on every path, including
    // a failed lookup.
    set = 0;

    c = data->lookupVariable(name, status);
    if (U_SUCCESS(status)) {
        // Set stand-ins are numbered upward from setVariablesBase.  A
        // variable bound to an ordinary character can lie below the
        // base, which makes the offset negative; that must never be
        // used as a vector index.
        int32_t i = c - data->setVariablesBase;
        if (i >= 0 && i < setVariablesVector->size()) {
            set = (UnicodeSet*) setVariablesVector->elementAt(i);
        }
    }
}

//----------------------------------------------------------------------
// END ParseData
//----------------------------------------------------------------------

/**
 * Parse a rule string for the given direction and return the resulting
 * rule data.
 *
 * @param rules     the rule source text
 * @param direction which direction's rules to build
 * @return the parsed data, or 0 if parsing failed.  The temporary
 * parser does not delete the returned object.
 */
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction) {
    TransliterationRuleParser ruleParser(rules, direction);
    ruleParser.parseRules();

    if (U_SUCCESS(ruleParser.status)) {
        return ruleParser.data;
    }

    // Parsing failed: discard whatever was built.
    delete ruleParser.data;
    ruleParser.data = 0;
    return 0;
}

/**
 * Construct a parser for the given rules.  No parsing happens here;
 * parseRules() does the work and is what initializes status.
 *
 * @param theRules     the rule source text
 * @param theDirection which direction's rules to build
 */
TransliterationRuleParser::TransliterationRuleParser(
                                     const UnicodeString& theRules,
                                     RuleBasedTransliterator::Direction theDirection) :
    rules(theRules), direction(theDirection), data(0) {
    // The symbol table aliases our set vector.  Its rule-data alias
    // stays null until parseRules() has created the rule data.
    const UVector* registeredSets = &setVariablesVector;
    parseData = new ParseData(0, registeredSets);
}

/**
 * Destructor.  Releases the symbol table allocated by the constructor.
 * The rule data in <code>data</code> is deliberately not deleted here:
 * parse() either returns it to its caller or deletes it on failure.
 */
TransliterationRuleParser::~TransliterationRuleParser() {
    delete parseData;
}

/**
 * Parse the given string as a sequence of rules, separated by newline
 * characters ('\n'), and cause this object to implement those rules.  Any
 * previous rules are discarded.  Typically this method is called exactly
 * once, during construction.
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
void TransliterationRuleParser::parseRules(void) {
    status = U_ZERO_ERROR;

    delete data;
    data = new TransliterationRuleData(status);
    if (U_FAILURE(status)) {
        return;
    }

    parseData->data = data;
    setVariablesVector.removeAllElements();
    determineVariableRange();

    int32_t pos = 0;
    int32_t limit = rules.length();
    while (pos < limit && U_SUCCESS(status)) {
        UChar c = rules.charAt(pos++);
        if (Unicode::isWhitespace(c)) {
            // Ignore leading whitespace.  Note that this is not
            // Unicode spaces, but Java spaces -- a subset,
            // representing whitespace likely to be seen in code.
            continue;
        }
        // Skip lines starting with the comment character
        if (c == RULE_COMMENT_CHAR) {
            pos = rules.indexOf("\n", pos) + 1;
            if (pos == 0) {
                break; // No "\n" found; rest of rule is a commnet
            }
            continue; // Either fall out or restart with next line
        }
        // We've found the start of a rule.  c is its first
        // character, and pos points past c.  Lexically parse the
        // rule into component pieces.
        pos = parseRule(--pos, limit);                    
    }
    
    // Convert the set vector to an array
    data->setVariablesLength = setVariablesVector.size();
    data->setVariables = new UnicodeSet*[data->setVariablesLength];
    // orphanElement removes the given element and shifts all other
    // elements down.  For performance (and code clarity) we work from
    // the end back to index 0.
    for (int32_t i=data->setVariablesLength; i>0; ) {
        --i;
        data->setVariables[i] =
            (UnicodeSet*) setVariablesVector.orphanElementAt(i);
    }

    // Index the rules
    if (U_SUCCESS(status)) {
        data->ruleSet.freeze(*data, status);
    }
}

/**
 * Return the index of the first occurrence of c in text that lies at
 * or after start and strictly before limit, or -1 if there is none.
 */
static int32_t indexOfBefore(const UnicodeString& text, UChar c,
                             int32_t start, int32_t limit) {
    int32_t i = text.indexOf(c, start);
    return (i >= 0 && i < limit) ? i : -1;
}

/**
 * MAIN PARSER.  Parse the next rule in the given rule string, starting
 * at pos.  Return the index after the last character parsed.  Do not
 * parse characters at or after limit.
 *
 * Important:  The character at pos must be a non-whitespace character
 * that is not the comment character.
 *
 * This method handles quoting, escaping, and whitespace removal.  It
 * parses the end-of-rule character.  It recognizes context and cursor
 * indicators.  Once it does a lexical breakdown of the rule at pos, it
 * creates a rule object and adds it to our rule list, or, for a
 * variable definition, records the variable in <code>data</code>.
 *
 * @param pos   index of the first character of the rule
 * @param limit index one past the last character that may be parsed
 * @return the index after the last character parsed.  On a syntax
 * error, <code>status</code> is set to a failure code and the value
 * returned by syntaxError() (the rule's start index) is returned.
 */
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
    // Locate the left side, operator, and right side
    int32_t start = pos;
    UChar op = 0;

    UnicodeString buf;
    int32_t cursor = -1; // position of cursor in buf
    int32_t ante = -1;   // position of ante context marker ')' in buf
    int32_t post = -1;   // position of post context marker '(' in buf
    int32_t postClose = -1; // position of post context close ')' in buf

    // Assigned to buf and its adjuncts after the LHS has been
    // parsed.  Thereafter, buf etc. refer to the RHS.
    UnicodeString left;
    int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;

    UnicodeString scratch;

    while (pos < limit) {
        UChar c = rules.charAt(pos++);
        if (Unicode::isWhitespace(c)) {
            // Ignore whitespace.  Note that this is not Unicode
            // spaces, but Java spaces -- a subset, representing
            // whitespace likely to be seen in code.
            continue;
        }
        // Handle escapes
        if (c == ESCAPE) {
            if (pos == limit) {
                return syntaxError("Trailing backslash", rules, start);
            }
            // Parse \uXXXX escapes
            c = rules.charAt(pos++);
            if (c == 0x0075/*u*/) {
                if ((pos+4) > limit) {
                    return syntaxError("Malformed Unicode escape", rules, start);
                }
                c = (UChar)0x0000;
                // pos itself is the loop variable, so it ends up just
                // past the four hex digits.
                for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
                    int32_t digit = Unicode::digit(rules.charAt(pos), 16);
                    if (digit<0) {
                        return syntaxError("Malformed Unicode escape", rules, start);
                    }
                    c = (UChar) ((c << 4) | digit);
                }
            }

            // Any other escaped character is taken literally.
            buf.append(c);
            continue;
        }
        // Handle quoted matter
        if (c == QUOTE) {
            // Only a closing quote before limit counts; characters at
            // or after limit are not part of the text being parsed.
            int32_t iq = indexOfBefore(rules, QUOTE, pos, limit);
            if (iq == pos) {
                buf.append(c); // Parse [''] outside quotes as [']
                ++pos;
            } else {
                /* This loop picks up a segment of quoted text of the
                 * form 'aaaa' each time through.  If this segment
                 * hasn't really ended ('aaaa''bbbb') then it keeps
                 * looping, each time adding on a new segment.  When it
                 * reaches the final quote it breaks.
                 */
                for (;;) {
                    if (iq < 0) {
                        return syntaxError("Unterminated quote", rules, start);
                    }
                    scratch.truncate(0);
                    rules.extractBetween(pos, iq, scratch);
                    buf.append(scratch);
                    pos = iq+1;
                    if (pos < limit && rules.charAt(pos) == QUOTE) {
                        // Parse [''] inside quotes as [']
                        iq = indexOfBefore(rules, QUOTE, pos+1, limit);
                        // Continue looping
                    } else {
                        break;
                    }
                }
            }
            continue;
        }
        if (OPERATORS.indexOf(c) >= 0) {
            if (op != 0) {
                return syntaxError("Unquoted special", rules, start);
            }
            // Found an operator char.  Check for forward-reverse operator.
            if (c == REVERSE_RULE_OP &&
                (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
                ++pos;
                op = FWDREV_RULE_OP;
            } else {
                op = c;
            }
            left = buf; // lhs
            leftCursor = cursor;
            leftAnte = ante;
            leftPost = post;
            leftPostClose = postClose;

            // Start collecting the right-hand side from scratch.
            buf.truncate(0);
            cursor = ante = post = postClose = -1;
            continue;
        }
        if (c == END_OF_RULE) {
            break;
        }
        switch (c) {
        case VARIABLE_REF_OPEN:
            {
                // As with quotes, the closing brace must lie before limit.
                int32_t j = indexOfBefore(rules, VARIABLE_REF_CLOSE, pos, limit);
                if (pos == j || j < 0) { // empty or unterminated
                    return syntaxError("Malformed variable reference", rules, start);
                }
                scratch.truncate(0);
                rules.extractBetween(pos, j, scratch);
                pos = j+1;
                UChar v = data->lookupVariable(scratch, status);
                if (U_FAILURE(status)) {
                    return syntaxError("Undefined variable", rules, start);
                }
                buf.append(v);
            }
            break;
        case CONTEXT_OPEN:
            if (post >= 0) {
                return syntaxError("Multiple post contexts", rules, start);
            }
            // Ignore CONTEXT_OPEN if buffer length is zero -- that means
            // this is the optional opening delimiter for the ante context.
            if (buf.length() > 0) {
                post = buf.length();
            }
            break;
        case CONTEXT_CLOSE:
            if (postClose >= 0) {
                return syntaxError("Unexpected ')'", rules, start);
            }
            if (post >= 0) {
                // This is probably the optional closing delimiter
                // for the post context; save the pos and check later.
                postClose = buf.length();
            } else if (ante >= 0) {
                return syntaxError("Multiple ante contexts", rules, start);
            } else {
                ante = buf.length();
            }
            break;
        case SET_OPEN: {
            // The set parser is given the whole rule string and reports
            // where it stopped through pp.  The new set is handed to
            // registerSet(), and the stand-in character it returns is
            // what goes into buf.
            // NOTE(review): nothing here constrains the set parser to
            // stop before limit -- confirm that is acceptable.
            ParsePosition pp(pos-1); // Backup to opening '['
            buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
            if (U_FAILURE(status)) {
                return syntaxError("Invalid set", rules, start);
            }
            pos = pp.getIndex(); }
            break;
        case VARIABLE_REF_CLOSE:
        case SET_CLOSE:
            return syntaxError("Unquoted special", rules, start);
        case CURSOR_POS:
            if (cursor >= 0) {
                return syntaxError("Multiple cursors", rules, start);
            }
            cursor = buf.length();
            break;
        default:
            buf.append(c);
            break;
        }
    }
    if (op == 0) {
        return syntaxError("No operator", rules, start);
    }

    // Check context close parameters: the optional ')' that closes a
    // post context must be the last thing on its side of the rule.
    if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
        (postClose >= 0 && postClose != buf.length())) {
        return syntaxError("Extra text after )", rules, start);
    }

    // Context is only allowed on the input side; that is, the left side
    // for forward rules.  Cursors are only allowed on the output side;
    // that is, the right side for forward rules.  Bidirectional rules
    // ignore elements that do not apply.

    switch (op) {
    case VARIABLE_DEF_OP:
        // LHS is the name.  RHS is a single character, either a literal
        // or a set (already parsed).  If RHS is longer than one
        // character, it is either a multi-character string, or multiple
        // sets, or a mixture of chars and sets -- syntax error.
        if (buf.length() != 1) {
            return syntaxError("Malformed RHS", rules, start);
        }
        if (data->isVariableDefined(left)) {
            return syntaxError("Duplicate definition", rules, start);
        }
        data->defineVariable(left, buf.charAt(0), status);
        break;

    case FORWARD_RULE_OP:
        if (direction == RuleBasedTransliterator::FORWARD) {
            if (ante >= 0 || post >= 0 || leftCursor >= 0) {
                return syntaxError("Malformed rule", rules, start);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     left, leftAnte, leftPost,
                                     buf, cursor, status), status);
        } // otherwise ignore the rule; it's not the direction we want
        break;

    case REVERSE_RULE_OP:
        if (direction == RuleBasedTransliterator::REVERSE) {
            if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
                return syntaxError("Malformed rule", rules, start);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     buf, ante, post,
                                     left, leftCursor, status), status);
        } // otherwise ignore the rule; it's not the direction we want
        break;

    case FWDREV_RULE_OP:
        // NOTE(review): in both branches the cursor offset was recorded
        // against the untrimmed text and is passed through unchanged
        // after leading context is removed -- confirm whether it should
        // be reduced by the length of the removed ante context.
        if (direction == RuleBasedTransliterator::FORWARD) {
            // The output side is the right; trim off any context
            if (post >= 0) {
                buf.remove(post);
            }
            if (ante >= 0) {
                buf.removeBetween(0, ante);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     left, leftAnte, leftPost,
                                     buf, cursor, status), status);
        } else {
            // The output side is the left; trim off any context
            if (leftPost >= 0) {
                left.remove(leftPost);
            }
            if (leftAnte >= 0) {
                left.removeBetween(0, leftAnte);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     buf, ante, post,
                                     left, leftCursor, status), status);
        }
        break;
    }

    return pos;
}

/**
 * Called by main parser upon syntax error.  Records the failure by
 * setting <code>status</code> to U_ILLEGAL_ARGUMENT_ERROR.  The message
 * and rule text are currently unused (their parameter names are
 * commented out); the code that would have built a detailed message is
 * kept below, commented out, for reference.
 * @param msg error description (unused)
 * @param rule pattern string (unused)
 * @param start position of first character of current rule
 * @return <code>start</code>, unchanged
 */
int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
                                               const UnicodeString& /*rule*/,
                                               int32_t start) {
    // Reference only: search for the probable end of the rule and
    // raise an error that quotes the offending rule text.
//|    int end = quotedIndexOf(rule, start, rule.length(), ";");
//|    if (end < 0) {
//|        end = rule.length();
//|    }
//|    throw new IllegalArgumentException(msg + " in " +
//|                                       rule.substring(start, end));
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return start;
}

/**
 * Allocate a private-use substitution character for the given set,
 * append the set to setVariablesVector, and return the substitution
 * character.
 *
 * @param adoptedSet the set to register.  This method takes
 * responsibility for it on every path: on success the pointer is
 * stored in setVariablesVector; on failure it is deleted here.
 * @return the stand-in character, or 0 with <code>status</code> set to
 * U_ILLEGAL_ARGUMENT_ERROR if no stand-in characters are left.
 */
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
    if (variableNext >= variableLimit) {
        // throw new RuntimeException("Private use variables exhausted");
        // The caller passes a freshly allocated set and keeps no
        // pointer to it, so it must be released here or it is leaked.
        delete adoptedSet;
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    setVariablesVector.addElement(adoptedSet);
    return variableNext++;
}

/**
 * Determines what part of the private use region of Unicode we can use for
 * variable stand-ins.  The correct way to do this is as follows: Parse each
 * rule, and for forward and reverse rules, take the FROM expression, and
 * make a hash of all characters used.  The TO expression should be ignored.
 * When done, everything not in the hash is available for use.  In practice,
 * this method may employ some other algorithm for improved speed.
 */
void TransliterationRuleParser::determineVariableRange(void) {
    UnicodeRange privateUse(0xE000, 0x1900); // Private use area

    UnicodeRange* r = privateUse.largestUnusedSubrange(rules);

    data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
    
    if (r != 0) {
        data->setVariablesBase = variableNext = r->start;
        variableLimit = (UChar) (r->start + r->length);
        delete r;
    }

    if (variableNext >= variableLimit) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
}

/**
 * Find the first character of <code>text</code>, outside quoted text,
 * that is a member of <code>setOfChars</code>.  For example, in the
 * string "abc'hide'h", the 'h' in "hide" is skipped and the final 'h'
 * is the one found by a search for "h".  Unlike String.indexOf(), the
 * search is for any one of the characters in <code>setOfChars</code>,
 * not for a single character.
 *
 * A quote character starts a quoted run that extends to the next quote
 * character, or to <code>limit</code> if there is none.
 *
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 */
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
                                                 int32_t start, int32_t limit,
                                                 const UnicodeString& setOfChars) {
    int32_t index = start;
    while (index < limit) {
        const UChar ch = text.charAt(index);
        if (ch == QUOTE) {
            // Advance to the closing quote (or to limit); the step at
            // the bottom of the loop then moves past it.
            do {
                ++index;
            } while (index < limit && text.charAt(index) != QUOTE);
        } else if (setOfChars.indexOf(ch) >= 0) {
            return index;
        }
        ++index;
    }
    return -1;
}