7ce42e2f31
X-SVN-Rev: 728
544 lines
20 KiB
C++
544 lines
20 KiB
C++
/*
**********************************************************************
*   Copyright (C) 1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
|
|
#include "rbt_pars.h"
|
|
#include "unicode/rbt.h"
|
|
#include "rbt_rule.h"
|
|
#include "unirange.h"
|
|
#include "rbt_data.h"
|
|
#include "unicode/uniset.h"
|
|
#include "cstring.h"
|
|
#include "unicode/parsepos.h"
|
|
#include "symtable.h"
|
|
|
|
// Operators
|
|
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
|
|
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
|
|
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
|
|
const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op
|
|
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
|
|
|
|
// Other special characters
|
|
const UChar TransliterationRuleParser::QUOTE = '\'';
|
|
const UChar TransliterationRuleParser::ESCAPE = '\\';
|
|
const UChar TransliterationRuleParser::END_OF_RULE = ';';
|
|
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
|
|
|
|
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
|
|
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
|
|
const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
|
|
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
|
|
const UChar TransliterationRuleParser::SET_OPEN = '[';
|
|
const UChar TransliterationRuleParser::SET_CLOSE = ']';
|
|
const UChar TransliterationRuleParser::CURSOR_POS = '|';
|
|
|
|
//----------------------------------------------------------------------
|
|
// BEGIN ParseData
|
|
//----------------------------------------------------------------------
|
|
|
|
/**
|
|
* This class implements the SymbolTable interface. It is used
|
|
* during parsing to give UnicodeSet access to variables that
|
|
* have been defined so far. Note that it uses setVariablesVector,
|
|
* _not_ data.setVariables.
|
|
*/
|
|
class ParseData : public SymbolTable {
|
|
public:
|
|
const TransliterationRuleData* data; // alias
|
|
|
|
const UVector* setVariablesVector; // alias
|
|
|
|
ParseData(const TransliterationRuleData* data = 0,
|
|
const UVector* setVariablesVector = 0);
|
|
|
|
/**
|
|
* Lookup the object associated with this string and return it.
|
|
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
|
* exist. Return a non-NULL set if the name is mapped to a set;
|
|
* otherwise return a NULL set.
|
|
*/
|
|
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
|
UErrorCode& status) const;
|
|
};
|
|
|
|
// Record the two aliases; nothing is copied or allocated here.
ParseData::ParseData(const TransliterationRuleData* ruleData,
                     const UVector* setVector)
    : data(ruleData),
      setVariablesVector(setVector)
{
}
|
|
|
|
/**
|
|
* Implement SymbolTable API. Lookup a variable, returning
|
|
* either a Character, a UnicodeSet, or null.
|
|
*/
|
|
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
|
UErrorCode& status) const {
|
|
c = data->lookupVariable(name, status);
|
|
if (U_SUCCESS(status)) {
|
|
int32_t i = c - data->setVariablesBase;
|
|
set = (i < setVariablesVector->size()) ?
|
|
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
|
|
}
|
|
}
|
|
|
|
//----------------------------------------------------------------------
|
|
// END ParseData
|
|
//----------------------------------------------------------------------
|
|
|
|
// Parse the given rules for the given direction with a temporary parser.
// Returns the resulting rule data on success, or 0 if parsing left the
// parser in a failure status (the partially built data is deleted).
// The parser's destructor does not delete the data it built.
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction) {
    TransliterationRuleParser ruleParser(rules, direction);
    ruleParser.parseRules();

    if (U_SUCCESS(ruleParser.status)) {
        return ruleParser.data;
    }

    // Failure: discard whatever was built.
    delete ruleParser.data;
    ruleParser.data = 0;
    return 0;
}
|
|
|
|
/**
|
|
* @param rules list of rules, separated by newline characters
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
TransliterationRuleParser::TransliterationRuleParser(
|
|
const UnicodeString& theRules,
|
|
RuleBasedTransliterator::Direction theDirection) :
|
|
rules(theRules), direction(theDirection), data(0) {
|
|
parseData = new ParseData(0, &setVariablesVector);
|
|
}
|
|
|
|
/**
|
|
* Destructor.
|
|
*/
|
|
TransliterationRuleParser::~TransliterationRuleParser() {
|
|
delete parseData;
|
|
}
|
|
|
|
/**
|
|
* Parse the given string as a sequence of rules, separated by newline
|
|
* characters ('\n'), and cause this object to implement those rules. Any
|
|
* previous rules are discarded. Typically this method is called exactly
|
|
* once, during construction.
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
void TransliterationRuleParser::parseRules(void) {
|
|
status = U_ZERO_ERROR;
|
|
|
|
delete data;
|
|
data = new TransliterationRuleData(status);
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
parseData->data = data;
|
|
setVariablesVector.removeAllElements();
|
|
determineVariableRange();
|
|
|
|
int32_t pos = 0;
|
|
int32_t limit = rules.length();
|
|
while (pos < limit && U_SUCCESS(status)) {
|
|
UChar c = rules.charAt(pos++);
|
|
if (Unicode::isWhitespace(c)) {
|
|
// Ignore leading whitespace. Note that this is not
|
|
// Unicode spaces, but Java spaces -- a subset,
|
|
// representing whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Skip lines starting with the comment character
|
|
if (c == RULE_COMMENT_CHAR) {
|
|
pos = rules.indexOf("\n", pos) + 1;
|
|
if (pos == 0) {
|
|
break; // No "\n" found; rest of rule is a commnet
|
|
}
|
|
continue; // Either fall out or restart with next line
|
|
}
|
|
// We've found the start of a rule. c is its first
|
|
// character, and pos points past c. Lexically parse the
|
|
// rule into component pieces.
|
|
pos = parseRule(--pos, limit);
|
|
}
|
|
|
|
// Convert the set vector to an array
|
|
data->setVariablesLength = setVariablesVector.size();
|
|
data->setVariables = new UnicodeSet*[data->setVariablesLength];
|
|
// orphanElement removes the given element and shifts all other
|
|
// elements down. For performance (and code clarity) we work from
|
|
// the end back to index 0.
|
|
for (int32_t i=data->setVariablesLength; i>0; ) {
|
|
--i;
|
|
data->setVariables[i] =
|
|
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
|
|
}
|
|
|
|
// Index the rules
|
|
if (U_SUCCESS(status)) {
|
|
data->ruleSet.freeze(*data, status);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
|
* at pos. Return the index after the last character parsed. Do not
|
|
* parse characters at or after limit.
|
|
*
|
|
* Important: The character at pos must be a non-whitespace character
|
|
* that is not the comment character.
|
|
*
|
|
* This method handles quoting, escaping, and whitespace removal. It
|
|
* parses the end-of-rule character. It recognizes context and cursor
|
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
* creates a rule object and adds it to our rule list.
|
|
*/
|
|
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
|
// Locate the left side, operator, and right side
|
|
int32_t start = pos;
|
|
UChar op = 0;
|
|
|
|
UnicodeString buf;
|
|
int32_t cursor = -1; // position of cursor in buf
|
|
int32_t ante = -1; // position of ante context marker ')' in buf
|
|
int32_t post = -1; // position of post context marker '(' in buf
|
|
int32_t postClose = -1; // position of post context close ')' in buf
|
|
|
|
// Assigned to buf and its adjuncts after the LHS has been
|
|
// parsed. Thereafter, buf etc. refer to the RHS.
|
|
UnicodeString left;
|
|
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
|
|
|
UnicodeString scratch;
|
|
|
|
while (pos < limit) {
|
|
UChar c = rules.charAt(pos++);
|
|
if (Unicode::isWhitespace(c)) {
|
|
// Ignore whitespace. Note that this is not Unicode
|
|
// spaces, but Java spaces -- a subset, representing
|
|
// whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Handle escapes
|
|
if (c == ESCAPE) {
|
|
if (pos == limit) {
|
|
return syntaxError("Trailing backslash", rules, start);
|
|
}
|
|
// Parse \uXXXX escapes
|
|
c = rules.charAt(pos++);
|
|
if (c == 'u') {
|
|
if ((pos+4) > limit) {
|
|
return syntaxError("Malformed Unicode escape", rules, start);
|
|
}
|
|
c = (UChar)0x0000;
|
|
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
|
|
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
|
|
if (digit<0) {
|
|
return syntaxError("Malformed Unicode escape", rules, start);
|
|
}
|
|
c = (UChar) ((c << 4) | digit);
|
|
}
|
|
}
|
|
|
|
buf.append(c);
|
|
continue;
|
|
}
|
|
// Handle quoted matter
|
|
if (c == QUOTE) {
|
|
int32_t iq = rules.indexOf(QUOTE, pos);
|
|
if (iq == pos) {
|
|
buf.append(c); // Parse [''] outside quotes as [']
|
|
++pos;
|
|
} else {
|
|
/* This loop picks up a segment of quoted text of the
|
|
* form 'aaaa' each time through. If this segment
|
|
* hasn't really ended ('aaaa''bbbb') then it keeps
|
|
* looping, each time adding on a new segment. When it
|
|
* reaches the final quote it breaks.
|
|
*/
|
|
for (;;) {
|
|
if (iq < 0) {
|
|
return syntaxError("Unterminated quote", rules, start);
|
|
}
|
|
scratch.truncate(0);
|
|
rules.extractBetween(pos, iq, scratch);
|
|
buf.append(scratch);
|
|
pos = iq+1;
|
|
if (pos < limit && rules.charAt(pos) == QUOTE) {
|
|
// Parse [''] inside quotes as [']
|
|
iq = rules.indexOf(QUOTE, pos+1);
|
|
// Continue looping
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
if (OPERATORS.indexOf(c) >= 0) {
|
|
if (op != 0) {
|
|
return syntaxError("Unquoted special", rules, start);
|
|
}
|
|
// Found an operator char. Check for forward-reverse operator.
|
|
if (c == REVERSE_RULE_OP &&
|
|
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
|
|
++pos;
|
|
op = FWDREV_RULE_OP;
|
|
} else {
|
|
op = c;
|
|
}
|
|
left = buf; // lhs
|
|
leftCursor = cursor;
|
|
leftAnte = ante;
|
|
leftPost = post;
|
|
leftPostClose = postClose;
|
|
|
|
buf.truncate(0);
|
|
cursor = ante = post = postClose = -1;
|
|
continue;
|
|
}
|
|
if (c == END_OF_RULE) {
|
|
break;
|
|
}
|
|
switch (c) {
|
|
case VARIABLE_REF_OPEN:
|
|
{
|
|
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
|
|
if (pos == j || j < 0) { // empty or unterminated
|
|
return syntaxError("Malformed variable reference", rules, start);
|
|
}
|
|
scratch.truncate(0);
|
|
rules.extractBetween(pos, j, scratch);
|
|
pos = j+1;
|
|
UChar v = data->lookupVariable(scratch, status);
|
|
if (U_FAILURE(status)) {
|
|
return syntaxError("Undefined variable", rules, start);
|
|
}
|
|
buf.append(v);
|
|
}
|
|
break;
|
|
case CONTEXT_OPEN:
|
|
if (post >= 0) {
|
|
return syntaxError("Multiple post contexts", rules, start);
|
|
}
|
|
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
|
|
// this is the optional opening delimiter for the ante context.
|
|
if (buf.length() > 0) {
|
|
post = buf.length();
|
|
}
|
|
break;
|
|
case CONTEXT_CLOSE:
|
|
if (postClose >= 0) {
|
|
return syntaxError("Unexpected ')'", rules, start);
|
|
}
|
|
if (post >= 0) {
|
|
// This is probably the optional closing delimiter
|
|
// for the post context; save the pos and check later.
|
|
postClose = buf.length();
|
|
} else if (ante >= 0) {
|
|
return syntaxError("Multiple ante contexts", rules, start);
|
|
} else {
|
|
ante = buf.length();
|
|
}
|
|
break;
|
|
case SET_OPEN: {
|
|
ParsePosition pp(pos-1); // Backup to opening '['
|
|
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
|
|
if (U_FAILURE(status)) {
|
|
return syntaxError("Invalid set", rules, start);
|
|
}
|
|
pos = pp.getIndex(); }
|
|
break;
|
|
case VARIABLE_REF_CLOSE:
|
|
case SET_CLOSE:
|
|
return syntaxError("Unquoted special", rules, start);
|
|
case CURSOR_POS:
|
|
if (cursor >= 0) {
|
|
return syntaxError("Multiple cursors", rules, start);
|
|
}
|
|
cursor = buf.length();
|
|
break;
|
|
default:
|
|
buf.append(c);
|
|
break;
|
|
}
|
|
}
|
|
if (op == 0) {
|
|
return syntaxError("No operator", rules, start);
|
|
}
|
|
|
|
// Check context close parameters
|
|
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
|
(postClose >= 0 && postClose != buf.length())) {
|
|
return syntaxError("Extra text after ]", rules, start);
|
|
}
|
|
|
|
// Context is only allowed on the input side; that is, the left side
|
|
// for forward rules. Cursors are only allowed on the output side;
|
|
// that is, the right side for forward rules. Bidirectional rules
|
|
// ignore elements that do not apply.
|
|
|
|
switch (op) {
|
|
case VARIABLE_DEF_OP:
|
|
// LHS is the name. RHS is a single character, either a literal
|
|
// or a set (already parsed). If RHS is longer than one
|
|
// character, it is either a multi-character string, or multiple
|
|
// sets, or a mixture of chars and sets -- syntax error.
|
|
if (buf.length() != 1) {
|
|
return syntaxError("Malformed RHS", rules, start);
|
|
}
|
|
if (data->isVariableDefined(left)) {
|
|
return syntaxError("Duplicate definition", rules, start);
|
|
}
|
|
data->defineVariable(left, buf.charAt(0), status);
|
|
break;
|
|
|
|
case FORWARD_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::FORWARD) {
|
|
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
|
return syntaxError("Malformed rule", rules, start);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
buf, cursor, status), status);
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case REVERSE_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::REVERSE) {
|
|
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
|
return syntaxError("Malformed rule", rules, start);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
buf, ante, post,
|
|
left, leftCursor, status), status);
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case FWDREV_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::FORWARD) {
|
|
// The output side is the right; trim off any context
|
|
if (post >= 0) {
|
|
buf.remove(post);
|
|
}
|
|
if (ante >= 0) {
|
|
buf.removeBetween(0, ante);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
buf, cursor, status), status);
|
|
} else {
|
|
// The output side is the left; trim off any context
|
|
if (leftPost >= 0) {
|
|
left.remove(leftPost);
|
|
}
|
|
if (leftAnte >= 0) {
|
|
left.removeBetween(0, leftAnte);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
buf, ante, post,
|
|
left, leftCursor, status), status);
|
|
}
|
|
break;
|
|
}
|
|
|
|
return pos;
|
|
}
|
|
|
|
/**
|
|
* Called by main parser upon syntax error. Search the rule string
|
|
* for the probable end of the rule. Of course, if the error is that
|
|
* the end of rule marker is missing, then the rule end will not be found.
|
|
* In any case the rule start will be correctly reported.
|
|
* @param msg error description
|
|
* @param rule pattern string
|
|
* @param start position of first character of current rule
|
|
*/
|
|
int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
|
|
const UnicodeString& /*rule*/,
|
|
int32_t start) {
|
|
//| int end = quotedIndexOf(rule, start, rule.length(), ";");
|
|
//| if (end < 0) {
|
|
//| end = rule.length();
|
|
//| }
|
|
//| throw new IllegalArgumentException(msg + " in " +
|
|
//| rule.substring(start, end));
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Allocate a private-use substitution character for the given set,
|
|
* register it in the setVariables hash, and return the substitution
|
|
* character.
|
|
*/
|
|
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
|
|
if (variableNext >= variableLimit) {
|
|
// throw new RuntimeException("Private use variables exhausted");
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return 0;
|
|
}
|
|
setVariablesVector.addElement(adoptedSet);
|
|
return variableNext++;
|
|
}
|
|
|
|
/**
|
|
* Determines what part of the private use region of Unicode we can use for
|
|
* variable stand-ins. The correct way to do this is as follows: Parse each
|
|
* rule, and for forward and reverse rules, take the FROM expression, and
|
|
* make a hash of all characters used. The TO expression should be ignored.
|
|
* When done, everything not in the hash is available for use. In practice,
|
|
* this method may employ some other algorithm for improved speed.
|
|
*/
|
|
void TransliterationRuleParser::determineVariableRange(void) {
|
|
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
|
|
|
|
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
|
|
|
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
|
|
|
|
if (r != 0) {
|
|
data->setVariablesBase = variableNext = r->start;
|
|
variableLimit = (UChar) (r->start + r->length);
|
|
delete r;
|
|
}
|
|
|
|
if (variableNext >= variableLimit) {
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the index of the first character in a set, ignoring quoted text.
|
|
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
|
* found by a search for "h". Unlike String.indexOf(), this method searches
|
|
* not for a single character, but for any character of the string
|
|
* <code>setOfChars</code>.
|
|
* @param text text to be searched
|
|
* @param start the beginning index, inclusive; <code>0 <= start
|
|
* <= limit</code>.
|
|
* @param limit the ending index, exclusive; <code>start <= limit
|
|
* <= text.length()</code>.
|
|
* @param setOfChars string with one or more distinct characters
|
|
* @return Offset of the first character in <code>setOfChars</code>
|
|
* found, or -1 if not found.
|
|
* @see #indexOf
|
|
*/
|
|
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
|
|
int32_t start, int32_t limit,
|
|
const UnicodeString& setOfChars) {
|
|
for (int32_t i=start; i<limit; ++i) {
|
|
UChar c = text.charAt(i);
|
|
if (c == QUOTE) {
|
|
while (++i < limit
|
|
&& text.charAt(i) != QUOTE) {}
|
|
} else if (setOfChars.indexOf(c) >= 0) {
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|