af7124308c
X-SVN-Rev: 958
545 lines
20 KiB
C++
545 lines
20 KiB
C++
/*
|
|
**********************************************************************
|
|
* Copyright (C) 1999, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* Date Name Description
|
|
* 11/17/99 aliu Creation.
|
|
**********************************************************************
|
|
*/
|
|
#include "rbt_pars.h"
|
|
#include "unicode/rbt.h"
|
|
#include "rbt_rule.h"
|
|
#include "unirange.h"
|
|
#include "rbt_data.h"
|
|
#include "unicode/uniset.h"
|
|
#include "cstring.h"
|
|
#include "unicode/parsepos.h"
|
|
#include "symtable.h"
|
|
#include "unicode/parseerr.h"
|
|
|
|
// Operators
|
|
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
|
|
const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/;
|
|
const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/;
|
|
const UChar TransliterationRuleParser::FWDREV_RULE_OP = 0x007E/*~*/; // internal rep of <> op
|
|
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
|
|
|
|
// Other special characters
|
|
const UChar TransliterationRuleParser::QUOTE = 0x0027/*'*/;
|
|
const UChar TransliterationRuleParser::ESCAPE = 0x005C/*\*/;
|
|
const UChar TransliterationRuleParser::END_OF_RULE = 0x003B/*;*/;
|
|
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = 0x0023/*#*/;
|
|
|
|
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = 0x007B/*{*/;
|
|
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/;
|
|
const UChar TransliterationRuleParser::CONTEXT_OPEN = 0x0028/*(*/;
|
|
const UChar TransliterationRuleParser::CONTEXT_CLOSE = 0x0029/*)*/;
|
|
const UChar TransliterationRuleParser::SET_OPEN = 0x005B/*[*/;
|
|
const UChar TransliterationRuleParser::SET_CLOSE = 0x005D/*]*/;
|
|
const UChar TransliterationRuleParser::CURSOR_POS = 0x007C/*|*/;
|
|
|
|
//----------------------------------------------------------------------
|
|
// BEGIN ParseData
|
|
//----------------------------------------------------------------------
|
|
|
|
/**
|
|
* This class implements the SymbolTable interface. It is used
|
|
* during parsing to give UnicodeSet access to variables that
|
|
* have been defined so far. Note that it uses setVariablesVector,
|
|
* _not_ data.setVariables.
|
|
*/
|
|
class ParseData : public SymbolTable {
|
|
public:
|
|
const TransliterationRuleData* data; // alias
|
|
|
|
const UVector* setVariablesVector; // alias
|
|
|
|
ParseData(const TransliterationRuleData* data = 0,
|
|
const UVector* setVariablesVector = 0);
|
|
|
|
/**
|
|
* Lookup the object associated with this string and return it.
|
|
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
|
* exist. Return a non-NULL set if the name is mapped to a set;
|
|
* otherwise return a NULL set.
|
|
*/
|
|
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
|
UErrorCode& status) const;
|
|
};
|
|
|
|
/**
 * Record the rule data and the set vector.  The pointers are stored
 * as given; nothing is copied.
 */
ParseData::ParseData(const TransliterationRuleData* ruleData,
                     const UVector* setVector)
    : data(ruleData),
      setVariablesVector(setVector) {
}
|
|
|
|
/**
|
|
* Implement SymbolTable API. Lookup a variable, returning
|
|
* either a Character, a UnicodeSet, or null.
|
|
*/
|
|
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
|
UErrorCode& status) const {
|
|
c = data->lookupVariable(name, status);
|
|
if (U_SUCCESS(status)) {
|
|
int32_t i = c - data->setVariablesBase;
|
|
set = (i < setVariablesVector->size()) ?
|
|
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
|
|
}
|
|
}
|
|
|
|
//----------------------------------------------------------------------
|
|
// END ParseData
|
|
//----------------------------------------------------------------------
|
|
|
|
/**
 * Parse a rule string in one step using a temporary parser object.
 *
 * @param rules      the rule text
 * @param direction  which direction of rules to build
 * @param parseError optional; handed to the parser for error reporting
 * @return the rule data produced by the parser, or 0 if the parser's
 *         status indicates failure (the partial data is deleted)
 */
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction,
                                 ParseError* parseError) {
    TransliterationRuleParser parser(rules, direction, parseError);
    parser.parseRules();

    if (U_SUCCESS(parser.status)) {
        return parser.data;
    }

    // Parsing failed: discard whatever was built.
    delete parser.data;
    parser.data = 0;
    return 0;
}
|
|
|
|
/**
|
|
* @param rules list of rules, separated by newline characters
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
TransliterationRuleParser::TransliterationRuleParser(
|
|
const UnicodeString& theRules,
|
|
RuleBasedTransliterator::Direction theDirection,
|
|
ParseError* theParseError) :
|
|
rules(theRules), direction(theDirection), data(0), parseError(theParseError) {
|
|
parseData = new ParseData(0, &setVariablesVector);
|
|
}
|
|
|
|
/**
|
|
* Destructor.
|
|
*/
|
|
TransliterationRuleParser::~TransliterationRuleParser() {
|
|
delete parseData;
|
|
}
|
|
|
|
/**
|
|
* Parse the given string as a sequence of rules, separated by newline
|
|
* characters ('\n'), and cause this object to implement those rules. Any
|
|
* previous rules are discarded. Typically this method is called exactly
|
|
* once, during construction.
|
|
* @exception IllegalArgumentException if there is a syntax error in the
|
|
* rules
|
|
*/
|
|
void TransliterationRuleParser::parseRules(void) {
|
|
status = U_ZERO_ERROR;
|
|
|
|
delete data;
|
|
data = new TransliterationRuleData(status);
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
parseData->data = data;
|
|
setVariablesVector.removeAllElements();
|
|
if (parseError != 0) {
|
|
parseError->code = 0;
|
|
}
|
|
determineVariableRange();
|
|
|
|
int32_t pos = 0;
|
|
int32_t limit = rules.length();
|
|
while (pos < limit && U_SUCCESS(status)) {
|
|
UChar c = rules.charAt(pos++);
|
|
if (Unicode::isWhitespace(c)) {
|
|
// Ignore leading whitespace. Note that this is not
|
|
// Unicode spaces, but Java spaces -- a subset,
|
|
// representing whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Skip lines starting with the comment character
|
|
if (c == RULE_COMMENT_CHAR) {
|
|
pos = rules.indexOf((UChar)0x000A /*\n*/, pos) + 1;
|
|
if (pos == 0) {
|
|
break; // No "\n" found; rest of rule is a commnet
|
|
}
|
|
continue; // Either fall out or restart with next line
|
|
}
|
|
// We've found the start of a rule. c is its first
|
|
// character, and pos points past c. Lexically parse the
|
|
// rule into component pieces.
|
|
pos = parseRule(--pos, limit);
|
|
}
|
|
|
|
// Convert the set vector to an array
|
|
data->setVariablesLength = setVariablesVector.size();
|
|
data->setVariables = new UnicodeSet*[data->setVariablesLength];
|
|
// orphanElement removes the given element and shifts all other
|
|
// elements down. For performance (and code clarity) we work from
|
|
// the end back to index 0.
|
|
for (int32_t i=data->setVariablesLength; i>0; ) {
|
|
--i;
|
|
data->setVariables[i] =
|
|
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
|
|
}
|
|
|
|
// Index the rules
|
|
if (U_SUCCESS(status)) {
|
|
data->ruleSet.freeze(*data, status);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
|
* at pos. Return the index after the last character parsed. Do not
|
|
* parse characters at or after limit.
|
|
*
|
|
* Important: The character at pos must be a non-whitespace character
|
|
* that is not the comment character.
|
|
*
|
|
* This method handles quoting, escaping, and whitespace removal. It
|
|
* parses the end-of-rule character. It recognizes context and cursor
|
|
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
|
* creates a rule object and adds it to our rule list.
|
|
*/
|
|
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
|
// Locate the left side, operator, and right side
|
|
int32_t start = pos;
|
|
UChar op = 0;
|
|
|
|
UnicodeString buf;
|
|
int32_t cursor = -1; // position of cursor in buf
|
|
int32_t ante = -1; // position of ante context marker ')' in buf
|
|
int32_t post = -1; // position of post context marker '(' in buf
|
|
int32_t postClose = -1; // position of post context close ')' in buf
|
|
|
|
// Assigned to buf and its adjuncts after the LHS has been
|
|
// parsed. Thereafter, buf etc. refer to the RHS.
|
|
UnicodeString left;
|
|
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
|
|
|
UnicodeString scratch;
|
|
|
|
while (pos < limit) {
|
|
UChar c = rules.charAt(pos++);
|
|
if (Unicode::isWhitespace(c)) {
|
|
// Ignore whitespace. Note that this is not Unicode
|
|
// spaces, but Java spaces -- a subset, representing
|
|
// whitespace likely to be seen in code.
|
|
continue;
|
|
}
|
|
// Handle escapes
|
|
if (c == ESCAPE) {
|
|
if (pos == limit) {
|
|
return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start);
|
|
}
|
|
// Parse \uXXXX escapes
|
|
c = rules.charAt(pos++);
|
|
if (c == 0x0075/*u*/) {
|
|
if ((pos+4) > limit) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
|
|
}
|
|
c = (UChar)0x0000;
|
|
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
|
|
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
|
|
if (digit<0) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
|
|
}
|
|
c = (UChar) ((c << 4) | digit);
|
|
}
|
|
}
|
|
|
|
buf.append(c);
|
|
continue;
|
|
}
|
|
// Handle quoted matter
|
|
if (c == QUOTE) {
|
|
int32_t iq = rules.indexOf(QUOTE, pos);
|
|
if (iq == pos) {
|
|
buf.append(c); // Parse [''] outside quotes as [']
|
|
++pos;
|
|
} else {
|
|
/* This loop picks up a segment of quoted text of the
|
|
* form 'aaaa' each time through. If this segment
|
|
* hasn't really ended ('aaaa''bbbb') then it keeps
|
|
* looping, each time adding on a new segment. When it
|
|
* reaches the final quote it breaks.
|
|
*/
|
|
for (;;) {
|
|
if (iq < 0) {
|
|
return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start);
|
|
}
|
|
scratch.truncate(0);
|
|
rules.extractBetween(pos, iq, scratch);
|
|
buf.append(scratch);
|
|
pos = iq+1;
|
|
if (pos < limit && rules.charAt(pos) == QUOTE) {
|
|
// Parse [''] inside quotes as [']
|
|
iq = rules.indexOf(QUOTE, pos+1);
|
|
// Continue looping
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
if (OPERATORS.indexOf(c) >= 0) {
|
|
if (op != 0) {
|
|
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
|
|
}
|
|
// Found an operator char. Check for forward-reverse operator.
|
|
if (c == REVERSE_RULE_OP &&
|
|
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
|
|
++pos;
|
|
op = FWDREV_RULE_OP;
|
|
} else {
|
|
op = c;
|
|
}
|
|
left = buf; // lhs
|
|
leftCursor = cursor;
|
|
leftAnte = ante;
|
|
leftPost = post;
|
|
leftPostClose = postClose;
|
|
|
|
buf.truncate(0);
|
|
cursor = ante = post = postClose = -1;
|
|
continue;
|
|
}
|
|
if (c == END_OF_RULE) {
|
|
break;
|
|
}
|
|
switch (c) {
|
|
case VARIABLE_REF_OPEN:
|
|
{
|
|
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
|
|
if (pos == j || j < 0) { // empty or unterminated
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start);
|
|
}
|
|
scratch.truncate(0);
|
|
rules.extractBetween(pos, j, scratch);
|
|
pos = j+1;
|
|
UChar v = data->lookupVariable(scratch, status);
|
|
if (U_FAILURE(status)) {
|
|
return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start);
|
|
}
|
|
buf.append(v);
|
|
}
|
|
break;
|
|
case CONTEXT_OPEN:
|
|
if (post >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start);
|
|
}
|
|
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
|
|
// this is the optional opening delimiter for the ante context.
|
|
if (buf.length() > 0) {
|
|
post = buf.length();
|
|
}
|
|
break;
|
|
case CONTEXT_CLOSE:
|
|
if (postClose >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start);
|
|
}
|
|
if (post >= 0) {
|
|
// This is probably the optional closing delimiter
|
|
// for the post context; save the pos and check later.
|
|
postClose = buf.length();
|
|
} else if (ante >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start);
|
|
} else {
|
|
ante = buf.length();
|
|
}
|
|
break;
|
|
case SET_OPEN: {
|
|
ParsePosition pp(pos-1); // Backup to opening '['
|
|
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
|
|
if (U_FAILURE(status)) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start);
|
|
}
|
|
pos = pp.getIndex(); }
|
|
break;
|
|
case VARIABLE_REF_CLOSE:
|
|
case SET_CLOSE:
|
|
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
|
|
case CURSOR_POS:
|
|
if (cursor >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start);
|
|
}
|
|
cursor = buf.length();
|
|
break;
|
|
default:
|
|
buf.append(c);
|
|
break;
|
|
}
|
|
}
|
|
if (op == 0) {
|
|
return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start);
|
|
}
|
|
|
|
// Check context close parameters
|
|
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
|
(postClose >= 0 && postClose != buf.length())) {
|
|
return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start);
|
|
}
|
|
|
|
// Context is only allowed on the input side; that is, the left side
|
|
// for forward rules. Cursors are only allowed on the output side;
|
|
// that is, the right side for forward rules. Bidirectional rules
|
|
// ignore elements that do not apply.
|
|
|
|
switch (op) {
|
|
case VARIABLE_DEF_OP:
|
|
// LHS is the name. RHS is a single character, either a literal
|
|
// or a set (already parsed). If RHS is longer than one
|
|
// character, it is either a multi-character string, or multiple
|
|
// sets, or a mixture of chars and sets -- syntax error.
|
|
if (buf.length() != 1) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start);
|
|
}
|
|
if (data->isVariableDefined(left)) {
|
|
return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start);
|
|
}
|
|
data->defineVariable(left, buf.charAt(0), status);
|
|
break;
|
|
|
|
case FORWARD_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::FORWARD) {
|
|
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
buf, cursor, status), status);
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case REVERSE_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::REVERSE) {
|
|
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
|
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
buf, ante, post,
|
|
left, leftCursor, status), status);
|
|
} // otherwise ignore the rule; it's not the direction we want
|
|
break;
|
|
|
|
case FWDREV_RULE_OP:
|
|
if (direction == RuleBasedTransliterator::FORWARD) {
|
|
// The output side is the right; trim off any context
|
|
if (post >= 0) {
|
|
buf.remove(post);
|
|
}
|
|
if (ante >= 0) {
|
|
buf.removeBetween(0, ante);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
left, leftAnte, leftPost,
|
|
buf, cursor, status), status);
|
|
} else {
|
|
// The output side is the left; trim off any context
|
|
if (leftPost >= 0) {
|
|
left.remove(leftPost);
|
|
}
|
|
if (leftAnte >= 0) {
|
|
left.removeBetween(0, leftAnte);
|
|
}
|
|
data->ruleSet.addRule(new TransliterationRule(
|
|
buf, ante, post,
|
|
left, leftCursor, status), status);
|
|
}
|
|
break;
|
|
}
|
|
|
|
return pos;
|
|
}
|
|
|
|
/**
|
|
* Called by main parser upon syntax error. Search the rule string
|
|
* for the probable end of the rule. Of course, if the error is that
|
|
* the end of rule marker is missing, then the rule end will not be found.
|
|
* In any case the rule start will be correctly reported.
|
|
* @param msg error description
|
|
* @param rule pattern string
|
|
* @param start position of first character of current rule
|
|
*/
|
|
int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
|
|
const UnicodeString& rule,
|
|
int32_t start) {
|
|
if (parseError != 0) {
|
|
parseError->code = parseErrorCode;
|
|
parseError->line = 0; // We don't return a line #
|
|
parseError->offset = start; // Character offset from rule start
|
|
int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE);
|
|
if (end < 0) {
|
|
end = rule.length();
|
|
}
|
|
rule.extractBetween(start, end, parseError->context); // Current rule
|
|
}
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Allocate a private-use substitution character for the given set,
|
|
* register it in the setVariables hash, and return the substitution
|
|
* character.
|
|
*/
|
|
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
|
|
if (variableNext >= variableLimit) {
|
|
// throw new RuntimeException("Private use variables exhausted");
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return 0;
|
|
}
|
|
setVariablesVector.addElement(adoptedSet);
|
|
return variableNext++;
|
|
}
|
|
|
|
/**
|
|
* Determines what part of the private use region of Unicode we can use for
|
|
* variable stand-ins. The correct way to do this is as follows: Parse each
|
|
* rule, and for forward and reverse rules, take the FROM expression, and
|
|
* make a hash of all characters used. The TO expression should be ignored.
|
|
* When done, everything not in the hash is available for use. In practice,
|
|
* this method may employ some other algorithm for improved speed.
|
|
*/
|
|
void TransliterationRuleParser::determineVariableRange(void) {
|
|
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
|
|
|
|
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
|
|
|
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
|
|
|
|
if (r != 0) {
|
|
data->setVariablesBase = variableNext = r->start;
|
|
variableLimit = (UChar) (r->start + r->length);
|
|
delete r;
|
|
}
|
|
|
|
if (variableNext >= variableLimit) {
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the index of a character, ignoring quoted text.
|
|
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
|
* found by a search for 'h'.
|
|
*/
|
|
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
|
|
int32_t start, int32_t limit,
|
|
UChar charToFind) {
|
|
for (int32_t i=start; i<limit; ++i) {
|
|
UChar c = text.charAt(i);
|
|
if (c == ESCAPE) {
|
|
++i;
|
|
} else if (c == QUOTE) {
|
|
while (++i < limit
|
|
&& text.charAt(i) != QUOTE) {}
|
|
} else if (c == charToFind) {
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|