scuffed-code/icu4c/source/i18n/rbt_pars.cpp

642 lines
25 KiB
C++

/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_pars.h"
#include "unicode/rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
// Operators
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
const char* TransliterationRuleParser::OPERATORS = "=><";
// Other special characters
const UChar TransliterationRuleParser::QUOTE = '\'';
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
const UChar TransliterationRuleParser::CURSOR_POS = '|';
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
/**
* Specials must be quoted in rules to be used as literals.
* Specials may not occur in variable names.
*
* This string is a superset of OPERATORS.
*/
const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
/**
* Specials that must be quoted in variable definitions.
*/
const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction) {
TransliterationRuleParser parser(rules, direction);
parser.parseRules();
if (U_FAILURE(parser.status)) {
delete parser.data;
parser.data = 0;
}
return parser.data;
}
/**
* @param rules list of rules, separated by newline characters
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
TransliterationRuleParser::TransliterationRuleParser(
const UnicodeString& theRules,
RuleBasedTransliterator::Direction theDirection) :
rules(theRules), direction(theDirection), data(0) {}
/**
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any
* previous rules are discarded. Typically this method is called exactly
* once, during construction.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliterationRuleParser::parseRules(void) {
status = U_ZERO_ERROR;
delete data;
data = new TransliterationRuleData(status);
if (U_FAILURE(status)) {
return;
}
determineVariableRange();
int32_t n = rules.length();
int32_t i = 0;
while (i<n && U_SUCCESS(status)) {
int32_t limit = rules.indexOf('\n', i);
// Recognize "\\\n" as an escaped "\n"
while (limit>0 && rules.charAt(limit-1) == '\\') {
limit = rules.indexOf('\n', limit+1);
}
if (limit == -1) {
limit = n;
}
// Skip over empty lines and line starting with #
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
applyRule(i, limit);
}
i = limit + 1;
}
data->ruleSet.freeze();
}
/**
* Parse the given substring as a rule, and append it to the rules currently
* represented in this object.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
/* General description of parsing: Initially, rules contain two types of
* quoted characters. First, there are variable references, such as
* "{alpha}". Second, there are quotes, such as "'<'" or "''". One of
* the first steps in parsing a rule is to resolve such quoted matter.
* Quotes are removed early, leaving unquoted literal matter. Variable
* references are resolved and replaced by single characters. In some
* instances these characters represent themselves; in others, they
* stand for categories of characters. Character categories are either
* predefined (e.g., "{Lu}"), or are defined by the user using a
* statement (e.g., "vowels:aeiouAEIOU").
*
* Another early step in parsing is to split each rule into component
* pieces. These pieces are, for every rule, a left-hand side, a right-
* hand side, and an operator. The left- and right-hand sides may not
* be empty, except for the output patterns of forward and reverse
* rules. In addition to this partitioning, the match patterns of
* forward and reverse rules must be partitioned into antecontext,
* postcontext, and literal pattern, where the context portions may or
* may not be present. Finally, output patterns must have the cursor
* indicator '|' detected and removed, with its position recorded.
*
* Quote removal, variable resolution, and sub-pattern splitting must
* all happen at once. This is due chiefly to the quoting mechanism,
* which allows special characters to appear at arbitrary positions in
* the final unquoted text. (For this reason, alteration of the rule
* language is somewhat clumsy; it entails reassessment and revision of
* the parsing methods as a whole.)
*
* After this processing of rules is complete, the final end products
* are unquoted pieces of text of various types, and an integer cursor
* position, if one is specified. These processed raw materials are now
* easy to deal with; other classes such as UnicodeSet and
* TransliterationRule need know nothing of quoting or variables.
*/
UnicodeString left;
UnicodeString right;
UnicodeString anteContext;
UnicodeString postContext;
int32_t cursorPos;
UChar op = parseRule(start, limit, left, right,
anteContext, postContext, cursorPos);
if (U_FAILURE(status)) {
return;
}
switch (op) {
case VARIABLE_DEF_OP:
applyVariableDef(left, right);
break;
case FORWARD_RULE_OP:
if (direction == RuleBasedTransliterator::FORWARD) {
data->ruleSet.addRule(new TransliterationRule(
left, right,
anteContext, postContext,
cursorPos, status),
status);
} // otherwise ignore the rule; it's not the direction we want
break;
case REVERSE_RULE_OP:
if (direction == RuleBasedTransliterator::REVERSE) {
data->ruleSet.addRule(new TransliterationRule(
right, left,
anteContext, postContext,
cursorPos, status),
status);
} // otherwise ignore the rule; it's not the direction we want
break;
}
}
/**
* Add a variable definition.
* @param name the name of the variable. It must not already be defined.
* @param pattern the value of the variable. It may be a single character
* or a pattern describing a character set.
* @exception IllegalArgumentException if there is a syntax error
*/
void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
const UnicodeString& pattern) {
validateVariableName(name);
if (U_FAILURE(status)) {
return;
}
if (data->isVariableDefined(name)) {
// throw new IllegalArgumentException("Duplicate variable definition: "
// + name + '=' + pattern);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//! if (UnicodeSet.getCategoryID(name) >= 0) {
//! throw new IllegalArgumentException("Reserved variable name: "
//! + name);
//! }
if (pattern.length() < 1) {
// throw new IllegalArgumentException("Variable definition missing: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (pattern.length() == 1) {
// Got a single character variable definition
//$ data->variableNames.put(name, new Character(pattern.charAt(0)));
data->defineVariable(name, pattern.charAt(0), status);
} else {
// Got more than one character; parse it as a category
if (variableNext >= variableLimit) {
//$ throw new RuntimeException("Private use variables exhausted");
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//$ Character c = new Character(variableNext++);
//$ data->variableNames.put(name, c);
//$ data->setVariables.put(c, new UnicodeSet(pattern));
data->defineVariable(name, variableNext++,
new UnicodeSet(pattern, status),
status);
}
}
/**
* Given a rule, parses it into three pieces: The left side, the right side,
* and the operator. Returns the operator. Quotes and variable references
* are resolved; the otuput text in all <code>StringBuffer</code> parameters
* is literal text. This method delegates to other parsing methods to
* handle the match pattern, output pattern, and other sub-patterns in the
* rule.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param left left side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param right right side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param anteContext the preceding context of the match pattern,
* if there is one, is appended to this buffer
* @param postContext the following context of the match pattern,
* if there is one, is appended to this buffer
* @param cursorPos if there is a cursor in the output pattern, its
* offset is stored in <code>cursorPos</code>, otherwise set to -1.
* @return The operator character, one of the characters in OPERATORS.
*/
UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
UnicodeString& left,
UnicodeString& right,
UnicodeString& anteContext,
UnicodeString& postContext,
int32_t& cursorPos) {
/* Parse the rule into three pieces -- left, operator, and right,
* parsing out quotes. The result is that left and right will have
* unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
* operators throw an exception. Two quotes inside or outside
* quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
*/
int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
if (i < 0) {
//$ throw new IllegalArgumentException(
//$ "Syntax error: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
cursorPos = -1;
UChar c = rules.charAt(i);
switch (c) {
case FORWARD_RULE_OP:
if (i == start) {
//$ throw new IllegalArgumentException(
//$ "Empty left side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
parseMatchPattern(start, i, left, anteContext, postContext);
if (i != (limit-1)) {
parseOutputPattern(i+1, limit, right, cursorPos);
}
break;
case REVERSE_RULE_OP:
if (i == (limit-1)) {
//$ throw new IllegalArgumentException(
//$ "Empty right side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if (i != start) {
parseOutputPattern(start, i, left, cursorPos);
}
parseMatchPattern(i+1, limit, right, anteContext, postContext);
break;
default:
if (i == start || i == (limit-1)) {
//$ throw new IllegalArgumentException(
//$ "Empty left or right side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
parseSubPattern(start, i, left);
parseDefPattern(i+1, limit, right);
break;
}
return c;
}
/**
* Parses the match pattern of a forward or reverse rule. Given the raw
* match pattern, return the match text and the context on both sides, if
* any. Resolves all quotes and variables.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the key to be matched will be appended to this buffer
* @param anteContext the preceding context, if any, will be appended
* to this buffer.
* @param postContext the following context, if any, will be appended
* to this buffer.
*/
void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
UnicodeString& text,
UnicodeString& anteContext,
UnicodeString& postContext) {
if (start >= limit) {
//$ throw new IllegalArgumentException(
//$ "Empty expression in rule: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//$ if (anteContext != 0) {
// Ignore optional opening and closing context characters
if (rules.charAt(start) == CONTEXT_OPEN) {
++start;
}
if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
--limit;
}
// The four possibilities are:
// key
// anteContext]key
// anteContext]key[postContext
// key[postContext
int32_t ante = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE);
int32_t post = quotedIndexOf(rules, start, limit, CONTEXT_OPEN);
if (ante >= 0 && post >= 0 && ante > post) {
//$ throw new IllegalArgumentException(
//$ "Syntax error in context specifier: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (ante >= 0) {
parseSubPattern(start, ante, anteContext);
start = ante+1;
}
if (post >= 0) {
parseSubPattern(post+1, limit, postContext);
limit = post;
}
//$ }
parseSubPattern(start, limit, text);
}
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text) {
parseSubPattern(start, limit, text, 0, SPECIALS);
}
/**
* Parse a variable definition sub pattern. This kind of sub
* pattern differs in the set of characters that are considered
* special. In particular, the '[' and ']' characters are not
* special, since these are used in UnicodeSet patterns.
*/
void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
UnicodeString& text) {
parseSubPattern(start, limit, text, 0, DEF_SPECIALS);
}
/**
* Parses the output pattern of a forward or reverse rule. Given the
* output pattern, return the output text and the position of the cursor,
* if any. Resolves all quotes and variables.
* @param rules the string to be parsed
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the output text will be appended to this buffer
* @param cursorPos if this parameter is not null, then cursorPos
* will be set to the cursor position, or -1 if there is none. If this
* parameter is null, then cursors will be disallowed.
*/
void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t& cursorPos) {
parseSubPattern(start, limit, text, &cursorPos, SPECIALS);
}
/**
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
* if any. Resolves all quotes and variables.
* @param rules the string to be parsed
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the output text will be appended to this buffer
* @param cursorPos if this parameter is not null, then cursorPos
* will be set to the cursor position, or -1 if there is none. If this
* parameter is null, then cursors will be disallowed.
* @param specials characters that must be quoted; typically either
* SPECIALS or DEF_SPECIALS.
*/
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t* cursorPos,
const UnicodeString& specials) {
bool_t inQuote = FALSE;
if (start >= limit) {
//$ throw new IllegalArgumentException("Empty expression in rule");
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (cursorPos != 0) {
*cursorPos = -1;
}
for (int32_t i=start; i<limit; ++i) {
UChar c = rules.charAt(i);
if (c == QUOTE) {
// Check for double quote
if ((i+1) < limit
&& rules.charAt(i+1) == QUOTE) {
text.append(QUOTE);
++i; // Skip over both quotes
} else {
inQuote = !inQuote;
}
} else if (inQuote) {
text.append(c);
} else if (c == VARIABLE_REF_OPEN) {
++i;
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
if (i == j || j < 0) { // empty or unterminated
//$ throw new IllegalArgumentException("Illegal variable reference: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
UnicodeString name;
rules.extractBetween(i, j, name);
validateVariableName(name);
if (U_FAILURE(status)) {
return;
}
UChar ch = data->lookupVariable(name, status);
if (U_FAILURE(status)) {
return;
}
text.append(ch);
i = j;
} else if (c == CURSOR_POS && cursorPos != 0) {
if (*cursorPos >= 0) {
//$ throw new IllegalArgumentException("Multiple cursors: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
*cursorPos = text.length();
} else if (specials.indexOf(c) >= 0) {
//$ throw new IllegalArgumentException("Unquoted special character: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
} else {
text.append(c);
}
}
}
void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
if (indexOf(name, SPECIALS) >= 0) {
//throw new IllegalArgumentException(
// "Special character in variable name: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
* Returns the single character value of the given variable name. Defined
* names are recognized.
*
* NO LONGER SUPPORTED:
* If a Unicode category name is given, a standard character variable
* in the range firstCategoryVariable to lastCategoryVariable is returned,
* with value firstCategoryVariable + n, where n is the category
* number.
* @exception IllegalArgumentException if the name is unknown.
*/
//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
//$ UChar ch = data->lookupVariable(name, status);
//$ //! if (ch == null) {
//$ //! int id = UnicodeSet.getCategoryID(name);
//$ //! if (id >= 0) {
//$ //! ch = new Character((char) (firstCategoryVariable + id));
//$ //! data->variableNames.put(name, ch);
//$ //! data->setVariables.put(ch, new UnicodeSet(id));
//$ //! }
//$ //! }
//$ if (ch == 0) {
//$ throw new IllegalArgumentException("Undefined variable: "
//$ + name);
//$ }
//$ return ch;
//$ }
/**
* Determines what part of the private use region of Unicode we can use for
* variable stand-ins. The correct way to do this is as follows: Parse each
* rule, and for forward and reverse rules, take the FROM expression, and
* make a hash of all characters used. The TO expression should be ignored.
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void TransliterationRuleParser::determineVariableRange(void) {
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
variableNext = variableLimit = (UChar) 0;
if (r != 0) {
variableNext = r->start;
variableLimit = (UChar) (r->start + r->length);
delete r;
}
if (variableNext >= variableLimit) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
* Returns the index of the first character in a set, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for "h". Unlike String.indexOf(), this method searches
* not for a single character, but for any character of the string
* <code>setOfChars</code>.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #indexOf
*/
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars) {
for (int32_t i=start; i<limit; ++i) {
UChar c = text.charAt(i);
if (c == QUOTE) {
while (++i < limit
&& text.charAt(i) != QUOTE) {}
} else if (setOfChars.indexOf(c) >= 0) {
return i;
}
}
return -1;
}
/**
* Returns the index of the first character in a set. Unlike
* String.indexOf(), this method searches not for a single character, but
* for any character of the string <code>setOfChars</code>.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #quotedIndexOf
*/
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars) {
for (int32_t i=start; i<limit; ++i) {
if (setOfChars.indexOf(text.charAt(i)) >= 0) {
return i;
}
}
return -1;
}
/**
* Returns the index of the first character in a set. Unlike
* String.indexOf(), this method searches not for a single character, but
* for any character of the string <code>setOfChars</code>.
* @param text text to be searched
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #quotedIndexOf
*/
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
const UnicodeString& setOfChars) {
return indexOf(text, 0, text.length(), setOfChars);
}