ICU-199 new rule syntax; performance improvement; update rules
X-SVN-Rev: 559
This commit is contained in:
parent
cd8a516d90
commit
1a6cfef879
@ -43,6 +43,14 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status) {
|
||||
defineVariable(name, standIn, status);
|
||||
defineSet(standIn, adoptedSet, status);
|
||||
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineSet(UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
@ -50,9 +58,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
|
||||
(void*) standIn,
|
||||
&status);
|
||||
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
|
||||
adoptedSet,
|
||||
&status);
|
||||
|
@ -72,6 +72,10 @@ public:
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status);
|
||||
|
||||
void defineSet(UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status);
|
||||
|
||||
UChar lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const;
|
||||
|
||||
|
@ -13,35 +13,30 @@
|
||||
#include "unirange.h"
|
||||
#include "rbt_data.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/parsepos.h"
|
||||
|
||||
// Operators
|
||||
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
|
||||
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
|
||||
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
|
||||
const char* TransliterationRuleParser::OPERATORS = "=><";
|
||||
const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
|
||||
|
||||
// Other special characters
|
||||
const UChar TransliterationRuleParser::QUOTE = '\'';
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
|
||||
const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
|
||||
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
|
||||
const UChar TransliterationRuleParser::CURSOR_POS = '|';
|
||||
const UChar TransliterationRuleParser::ESCAPE = '\\';
|
||||
const UChar TransliterationRuleParser::END_OF_RULE = ';';
|
||||
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
|
||||
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
|
||||
const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
|
||||
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
|
||||
const UChar TransliterationRuleParser::SET_OPEN = '[';
|
||||
const UChar TransliterationRuleParser::SET_CLOSE = ']';
|
||||
const UChar TransliterationRuleParser::CURSOR_POS = '|';
|
||||
|
||||
/**
|
||||
* Specials must be quoted in rules to be used as literals.
|
||||
* Specials may not occur in variable names.
|
||||
*
|
||||
* This string is a superset of OPERATORS.
|
||||
*/
|
||||
const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
|
||||
|
||||
/**
|
||||
* Specials that must be quoted in variable definitions.
|
||||
*/
|
||||
const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";
|
||||
|
||||
TransliterationRuleData*
|
||||
TransliterationRuleParser::parse(const UnicodeString& rules,
|
||||
@ -84,465 +79,339 @@ void TransliterationRuleParser::parseRules(void) {
|
||||
|
||||
determineVariableRange();
|
||||
|
||||
int32_t n = rules.length();
|
||||
int32_t i = 0;
|
||||
while (i<n && U_SUCCESS(status)) {
|
||||
int32_t limit = rules.indexOf('\n', i);
|
||||
|
||||
// Recognize "\\\n" as an escaped "\n"
|
||||
while (limit>0 && rules.charAt(limit-1) == '\\') {
|
||||
limit = rules.indexOf('\n', limit+1);
|
||||
int32_t pos = 0;
|
||||
int32_t limit = rules.length();
|
||||
while (pos < limit && U_SUCCESS(status)) {
|
||||
UChar c = rules.charAt(pos++);
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore leading whitespace. Note that this is not
|
||||
// Unicode spaces, but Java spaces -- a subset,
|
||||
// representing whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (limit == -1) {
|
||||
limit = n;
|
||||
// Skip lines starting with the comment character
|
||||
if (c == RULE_COMMENT_CHAR) {
|
||||
pos = rules.indexOf("\n", pos) + 1;
|
||||
if (pos == 0) {
|
||||
break; // No "\n" found; rest of rule is a comment
|
||||
}
|
||||
continue; // Either fall out or restart with next line
|
||||
}
|
||||
// Skip over empty lines and line starting with #
|
||||
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
|
||||
applyRule(i, limit);
|
||||
}
|
||||
i = limit + 1;
|
||||
// We've found the start of a rule. c is its first
|
||||
// character, and pos points past c. Lexically parse the
|
||||
// rule into component pieces.
|
||||
pos = parseRule(--pos, limit);
|
||||
}
|
||||
|
||||
// Index the rules
|
||||
if (U_SUCCESS(status)) {
|
||||
data->ruleSet.freeze(*data, status);
|
||||
}
|
||||
|
||||
data->ruleSet.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the given substring as a rule, and append it to the rules currently
|
||||
* represented in this object.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||
* at pos. Return the index after the last character parsed. Do not
|
||||
* parse characters at or after limit.
|
||||
*
|
||||
* Important: The character at pos must be a non-whitespace character
|
||||
* that is not the comment character.
|
||||
*
|
||||
* This method handles quoting, escaping, and whitespace removal. It
|
||||
* parses the end-of-rule character. It recognizes context and cursor
|
||||
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
||||
* creates a rule object and adds it to our rule list.
|
||||
*/
|
||||
void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
|
||||
/* General description of parsing: Initially, rules contain two types of
|
||||
* quoted characters. First, there are variable references, such as
|
||||
* "{alpha}". Second, there are quotes, such as "'<'" or "''". One of
|
||||
* the first steps in parsing a rule is to resolve such quoted matter.
|
||||
* Quotes are removed early, leaving unquoted literal matter. Variable
|
||||
* references are resolved and replaced by single characters. In some
|
||||
* instances these characters represent themselves; in others, they
|
||||
* stand for categories of characters. Character categories are either
|
||||
* predefined (e.g., "{Lu}"), or are defined by the user using a
|
||||
* statement (e.g., "vowels:aeiouAEIOU").
|
||||
*
|
||||
* Another early step in parsing is to split each rule into component
|
||||
* pieces. These pieces are, for every rule, a left-hand side, a right-
|
||||
* hand side, and an operator. The left- and right-hand sides may not
|
||||
* be empty, except for the output patterns of forward and reverse
|
||||
* rules. In addition to this partitioning, the match patterns of
|
||||
* forward and reverse rules must be partitioned into antecontext,
|
||||
* postcontext, and literal pattern, where the context portions may or
|
||||
* may not be present. Finally, output patterns must have the cursor
|
||||
* indicator '|' detected and removed, with its position recorded.
|
||||
*
|
||||
* Quote removal, variable resolution, and sub-pattern splitting must
|
||||
* all happen at once. This is due chiefly to the quoting mechanism,
|
||||
* which allows special characters to appear at arbitrary positions in
|
||||
* the final unquoted text. (For this reason, alteration of the rule
|
||||
* language is somewhat clumsy; it entails reassessment and revision of
|
||||
* the parsing methods as a whole.)
|
||||
*
|
||||
* After this processing of rules is complete, the final end products
|
||||
* are unquoted pieces of text of various types, and an integer cursor
|
||||
* position, if one is specified. These processed raw materials are now
|
||||
* easy to deal with; other classes such as UnicodeSet and
|
||||
* TransliterationRule need know nothing of quoting or variables.
|
||||
*/
|
||||
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
||||
// Locate the left side, operator, and right side
|
||||
int32_t start = pos;
|
||||
UChar op = 0;
|
||||
|
||||
UnicodeString buf;
|
||||
int32_t cursor = -1; // position of cursor in buf
|
||||
int32_t ante = -1; // position of ante context marker ')' in buf
|
||||
int32_t post = -1; // position of post context marker '(' in buf
|
||||
int32_t postClose = -1; // position of post context close ')' in buf
|
||||
|
||||
// Assigned to buf and its adjuncts after the LHS has been
|
||||
// parsed. Thereafter, buf etc. refer to the RHS.
|
||||
UnicodeString left;
|
||||
UnicodeString right;
|
||||
UnicodeString anteContext;
|
||||
UnicodeString postContext;
|
||||
int32_t cursorPos;
|
||||
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
||||
|
||||
UChar op = parseRule(start, limit, left, right,
|
||||
anteContext, postContext, cursorPos);
|
||||
UnicodeString scratch;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
while (pos < limit) {
|
||||
UChar c = rules.charAt(pos++);
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore whitespace. Note that this is not Unicode
|
||||
// spaces, but Java spaces -- a subset, representing
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
return syntaxError("Trailing backslash", rules, start);
|
||||
}
|
||||
// Parse \uXXXX escapes
|
||||
c = rules.charAt(pos++);
|
||||
if (c == 'u') {
|
||||
if ((pos+4) > limit) {
|
||||
return syntaxError("Malformed Unicode escape", rules, start);
|
||||
}
|
||||
c = (UChar)0x0000;
|
||||
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
|
||||
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
|
||||
if (digit<0) {
|
||||
return syntaxError("Malformed Unicode escape", rules, start);
|
||||
}
|
||||
c = (UChar) ((c << 4) | digit);
|
||||
}
|
||||
}
|
||||
|
||||
buf.append(c);
|
||||
continue;
|
||||
}
|
||||
// Handle quoted matter
|
||||
if (c == QUOTE) {
|
||||
int32_t iq = rules.indexOf(QUOTE, pos);
|
||||
if (iq == pos) {
|
||||
buf.append(c); // Parse [''] outside quotes as [']
|
||||
++pos;
|
||||
} else {
|
||||
/* This loop picks up a segment of quoted text of the
|
||||
* form 'aaaa' each time through. If this segment
|
||||
* hasn't really ended ('aaaa''bbbb') then it keeps
|
||||
* looping, each time adding on a new segment. When it
|
||||
* reaches the final quote it breaks.
|
||||
*/
|
||||
for (;;) {
|
||||
if (iq < 0) {
|
||||
return syntaxError("Unterminated quote", rules, start);
|
||||
}
|
||||
scratch.truncate(0);
|
||||
rules.extractBetween(pos, iq, scratch);
|
||||
buf.append(scratch);
|
||||
pos = iq+1;
|
||||
if (pos < limit && rules.charAt(pos) == QUOTE) {
|
||||
// Parse [''] inside quotes as [']
|
||||
iq = rules.indexOf(QUOTE, pos+1);
|
||||
// Continue looping
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (OPERATORS.indexOf(c) >= 0) {
|
||||
if (op != 0) {
|
||||
return syntaxError("Unquoted special", rules, start);
|
||||
}
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (c == REVERSE_RULE_OP &&
|
||||
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
|
||||
++pos;
|
||||
op = FWDREV_RULE_OP;
|
||||
} else {
|
||||
op = c;
|
||||
}
|
||||
left = buf; // lhs
|
||||
leftCursor = cursor;
|
||||
leftAnte = ante;
|
||||
leftPost = post;
|
||||
leftPostClose = postClose;
|
||||
|
||||
buf.truncate(0);
|
||||
cursor = ante = post = postClose = -1;
|
||||
continue;
|
||||
}
|
||||
if (c == END_OF_RULE) {
|
||||
break;
|
||||
}
|
||||
switch (c) {
|
||||
case VARIABLE_REF_OPEN:
|
||||
{
|
||||
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
|
||||
if (pos == j || j < 0) { // empty or unterminated
|
||||
return syntaxError("Malformed variable reference", rules, start);
|
||||
}
|
||||
scratch.truncate(0);
|
||||
rules.extractBetween(pos, j, scratch);
|
||||
pos = j+1;
|
||||
UChar v = data->lookupVariable(scratch, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return syntaxError("Undefined variable", rules, start);
|
||||
}
|
||||
buf.append(v);
|
||||
}
|
||||
break;
|
||||
case CONTEXT_OPEN:
|
||||
if (post >= 0) {
|
||||
return syntaxError("Multiple post contexts", rules, start);
|
||||
}
|
||||
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
|
||||
// this is the optional opening delimiter for the ante context.
|
||||
if (buf.length() > 0) {
|
||||
post = buf.length();
|
||||
}
|
||||
break;
|
||||
case CONTEXT_CLOSE:
|
||||
if (postClose >= 0) {
|
||||
return syntaxError("Unexpected ')'", rules, start);
|
||||
}
|
||||
if (post >= 0) {
|
||||
// This is probably the optional closing delimiter
|
||||
// for the post context; save the pos and check later.
|
||||
postClose = buf.length();
|
||||
} else if (ante >= 0) {
|
||||
return syntaxError("Multiple ante contexts", rules, start);
|
||||
} else {
|
||||
ante = buf.length();
|
||||
}
|
||||
break;
|
||||
case SET_OPEN: {
|
||||
ParsePosition pp(pos-1); // Backup to opening '['
|
||||
buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
|
||||
if (U_FAILURE(status)) {
|
||||
return syntaxError("Invalid set", rules, start);
|
||||
}
|
||||
pos = pp.getIndex(); }
|
||||
break;
|
||||
case VARIABLE_REF_CLOSE:
|
||||
case SET_CLOSE:
|
||||
return syntaxError("Unquoted special", rules, start);
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
return syntaxError("Multiple cursors", rules, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
default:
|
||||
buf.append(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op == 0) {
|
||||
return syntaxError("No operator", rules, start);
|
||||
}
|
||||
|
||||
// Check context close parameters
|
||||
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
||||
(postClose >= 0 && postClose != buf.length())) {
|
||||
return syntaxError("Extra text after ]", rules, start);
|
||||
}
|
||||
|
||||
// Context is only allowed on the input side; that is, the left side
|
||||
// for forward rules. Cursors are only allowed on the output side;
|
||||
// that is, the right side for forward rules. Bidirectional rules
|
||||
// ignore elements that do not apply.
|
||||
|
||||
switch (op) {
|
||||
case VARIABLE_DEF_OP:
|
||||
applyVariableDef(left, right);
|
||||
// LHS is the name. RHS is a single character, either a literal
|
||||
// or a set (already parsed). If RHS is longer than one
|
||||
// character, it is either a multi-character string, or multiple
|
||||
// sets, or a mixture of chars and sets -- syntax error.
|
||||
if (buf.length() != 1) {
|
||||
return syntaxError("Malformed RHS", rules, start);
|
||||
}
|
||||
if (data->isVariableDefined(left)) {
|
||||
return syntaxError("Duplicate definition", rules, start);
|
||||
}
|
||||
data->defineVariable(left, buf.charAt(0), status);
|
||||
break;
|
||||
|
||||
case FORWARD_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::FORWARD) {
|
||||
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
||||
return syntaxError("Malformed rule", rules, start);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left, right,
|
||||
anteContext, postContext,
|
||||
cursorPos, status),
|
||||
status);
|
||||
left, leftAnte, leftPost,
|
||||
buf, cursor, status), status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
|
||||
case REVERSE_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::REVERSE) {
|
||||
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
||||
return syntaxError("Malformed rule", rules, start);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
right, left,
|
||||
anteContext, postContext,
|
||||
cursorPos, status),
|
||||
status);
|
||||
buf, ante, post,
|
||||
left, leftCursor, status), status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a variable definition.
|
||||
* @param name the name of the variable. It must not already be defined.
|
||||
* @param pattern the value of the variable. It may be a single character
|
||||
* or a pattern describing a character set.
|
||||
* @exception IllegalArgumentException if there is a syntax error
|
||||
*/
|
||||
void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
|
||||
const UnicodeString& pattern) {
|
||||
validateVariableName(name);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data->isVariableDefined(name)) {
|
||||
// throw new IllegalArgumentException("Duplicate variable definition: "
|
||||
// + name + '=' + pattern);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
//! if (UnicodeSet.getCategoryID(name) >= 0) {
|
||||
//! throw new IllegalArgumentException("Reserved variable name: "
|
||||
//! + name);
|
||||
//! }
|
||||
if (pattern.length() < 1) {
|
||||
// throw new IllegalArgumentException("Variable definition missing: "
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (pattern.length() == 1) {
|
||||
// Got a single character variable definition
|
||||
//$ data->variableNames.put(name, new Character(pattern.charAt(0)));
|
||||
data->defineVariable(name, pattern.charAt(0), status);
|
||||
} else {
|
||||
// Got more than one character; parse it as a category
|
||||
if (variableNext >= variableLimit) {
|
||||
//$ throw new RuntimeException("Private use variables exhausted");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
case FWDREV_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::FORWARD) {
|
||||
// The output side is the right; trim off any context
|
||||
if (post >= 0) {
|
||||
buf.remove(post);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
buf.removeBetween(0, ante);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left, leftAnte, leftPost,
|
||||
buf, cursor, status), status);
|
||||
} else {
|
||||
// The output side is the left; trim off any context
|
||||
if (leftPost >= 0) {
|
||||
left.remove(leftPost);
|
||||
}
|
||||
if (leftAnte >= 0) {
|
||||
left.removeBetween(0, leftAnte);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
buf, ante, post,
|
||||
left, leftCursor, status), status);
|
||||
}
|
||||
//$ Character c = new Character(variableNext++);
|
||||
//$ data->variableNames.put(name, c);
|
||||
//$ data->setVariables.put(c, new UnicodeSet(pattern));
|
||||
data->defineVariable(name, variableNext++,
|
||||
new UnicodeSet(pattern, status),
|
||||
status);
|
||||
break;
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a rule, parses it into three pieces: The left side, the right side,
|
||||
* and the operator. Returns the operator. Quotes and variable references
|
||||
* are resolved; the output text in all <code>StringBuffer</code> parameters
|
||||
* is literal text. This method delegates to other parsing methods to
|
||||
* handle the match pattern, output pattern, and other sub-patterns in the
|
||||
* rule.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param left left side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param right right side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param anteContext the preceding context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param postContext the following context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param cursorPos if there is a cursor in the output pattern, its
|
||||
* offset is stored in <code>cursorPos</code>, otherwise set to -1.
|
||||
* @return The operator character, one of the characters in OPERATORS.
|
||||
* Called by main parser upon syntax error. Search the rule string
|
||||
* for the probable end of the rule. Of course, if the error is that
|
||||
* the end of rule marker is missing, then the rule end will not be found.
|
||||
* In any case the rule start will be correctly reported.
|
||||
* @param msg error description
|
||||
* @param rule pattern string
|
||||
* @param start position of first character of current rule
|
||||
*/
|
||||
UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
|
||||
UnicodeString& left,
|
||||
UnicodeString& right,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext,
|
||||
int32_t& cursorPos) {
|
||||
/* Parse the rule into three pieces -- left, operator, and right,
|
||||
* parsing out quotes. The result is that left and right will have
|
||||
* unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
|
||||
* operators throw an exception. Two quotes inside or outside
|
||||
* quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
|
||||
*/
|
||||
int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
|
||||
if (i < 0) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Syntax error: "
|
||||
//$ + rules.substring(start, limit));
|
||||
int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
                                               const UnicodeString& /*rule*/,
                                               int32_t start) {
    // The message and the rule text are accepted but not used here: the
    // only thing recorded is a generic failure code in the parser status.
    //
    // The code this stands in for (kept as a comment in earlier revisions)
    // located the end of the rule with
    //     quotedIndexOf(rule, start, rule.length(), ";")
    // falling back to rule.length(), and then threw an
    // IllegalArgumentException built from msg and rule.substring(start, end).
    const int32_t ruleStart = start;
    status = U_ILLEGAL_ARGUMENT_ERROR;

    // Hand the rule start back unchanged so callers can write
    // "return syntaxError(...)".
    return ruleStart;
}
|
||||
|
||||
/**
|
||||
* Allocate a private-use substitution character for the given set,
|
||||
* register it in the setVariables hash, and return the substitution
|
||||
* character.
|
||||
*/
|
||||
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
|
||||
if (variableNext >= variableLimit) {
|
||||
// throw new RuntimeException("Private use variables exhausted");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
cursorPos = -1;
|
||||
UChar c = rules.charAt(i);
|
||||
switch (c) {
|
||||
case FORWARD_RULE_OP:
|
||||
if (i == start) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty left side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
parseMatchPattern(start, i, left, anteContext, postContext);
|
||||
if (i != (limit-1)) {
|
||||
parseOutputPattern(i+1, limit, right, cursorPos);
|
||||
}
|
||||
break;
|
||||
case REVERSE_RULE_OP:
|
||||
if (i == (limit-1)) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty right side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (i != start) {
|
||||
parseOutputPattern(start, i, left, cursorPos);
|
||||
}
|
||||
parseMatchPattern(i+1, limit, right, anteContext, postContext);
|
||||
break;
|
||||
default:
|
||||
if (i == start || i == (limit-1)) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty left or right side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
parseSubPattern(start, i, left);
|
||||
parseDefPattern(i+1, limit, right);
|
||||
break;
|
||||
}
|
||||
UChar c = variableNext++;
|
||||
data->defineSet(c, adoptedSet, status);
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the match pattern of a forward or reverse rule. Given the raw
|
||||
* match pattern, return the match text and the context on both sides, if
|
||||
* any. Resolves all quotes and variables.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the key to be matched will be appended to this buffer
|
||||
* @param anteContext the preceding context, if any, will be appended
|
||||
* to this buffer.
|
||||
* @param postContext the following context, if any, will be appended
|
||||
* to this buffer.
|
||||
*/
|
||||
void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext) {
|
||||
if (start >= limit) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty expression in rule: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
//$ if (anteContext != 0) {
|
||||
// Ignore optional opening and closing context characters
|
||||
if (rules.charAt(start) == CONTEXT_OPEN) {
|
||||
++start;
|
||||
}
|
||||
if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
|
||||
--limit;
|
||||
}
|
||||
// The four possibilities are:
|
||||
// key
|
||||
// anteContext]key
|
||||
// anteContext]key[postContext
|
||||
// key[postContext
|
||||
int32_t ante = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE);
|
||||
int32_t post = quotedIndexOf(rules, start, limit, CONTEXT_OPEN);
|
||||
if (ante >= 0 && post >= 0 && ante > post) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Syntax error in context specifier: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (ante >= 0) {
|
||||
parseSubPattern(start, ante, anteContext);
|
||||
start = ante+1;
|
||||
}
|
||||
if (post >= 0) {
|
||||
parseSubPattern(post+1, limit, postContext);
|
||||
limit = post;
|
||||
}
|
||||
//$ }
|
||||
parseSubPattern(start, limit, text);
|
||||
}
|
||||
|
||||
/**
 * Convenience overload: parses rules[start, limit) with no cursor slot
 * and with SPECIALS as the set of characters that must be quoted.
 * Because no cursor slot is supplied, an unquoted CURSOR_POS is checked
 * against SPECIALS like any other character.
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    int32_t* const noCursor = 0;
    parseSubPattern(start, limit, text, noCursor, SPECIALS);
}
|
||||
|
||||
/**
|
||||
* Parse a variable definition sub pattern. This kind of sub
|
||||
* pattern differs in the set of characters that are considered
|
||||
* special. In particular, the '[' and ']' characters are not
|
||||
* special, since these are used in UnicodeSet patterns.
|
||||
*/
|
||||
void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text) {
|
||||
parseSubPattern(start, limit, text, 0, DEF_SPECIALS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the output pattern of a forward or reverse rule. Given the
|
||||
* output pattern, return the output text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
*/
|
||||
void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t& cursorPos) {
|
||||
parseSubPattern(start, limit, text, &cursorPos, SPECIALS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
* @param specials characters that must be quoted; typically either
|
||||
* SPECIALS or DEF_SPECIALS.
|
||||
*/
|
||||
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t* cursorPos,
|
||||
const UnicodeString& specials) {
|
||||
bool_t inQuote = FALSE;
|
||||
|
||||
if (start >= limit) {
|
||||
//$ throw new IllegalArgumentException("Empty expression in rule");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (cursorPos != 0) {
|
||||
*cursorPos = -1;
|
||||
}
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
UChar c = rules.charAt(i);
|
||||
if (c == QUOTE) {
|
||||
// Check for double quote
|
||||
if ((i+1) < limit
|
||||
&& rules.charAt(i+1) == QUOTE) {
|
||||
text.append(QUOTE);
|
||||
++i; // Skip over both quotes
|
||||
} else {
|
||||
inQuote = !inQuote;
|
||||
}
|
||||
} else if (inQuote) {
|
||||
text.append(c);
|
||||
} else if (c == VARIABLE_REF_OPEN) {
|
||||
++i;
|
||||
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
|
||||
if (i == j || j < 0) { // empty or unterminated
|
||||
//$ throw new IllegalArgumentException("Illegal variable reference: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
UnicodeString name;
|
||||
rules.extractBetween(i, j, name);
|
||||
validateVariableName(name);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
UChar ch = data->lookupVariable(name, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
text.append(ch);
|
||||
i = j;
|
||||
} else if (c == CURSOR_POS && cursorPos != 0) {
|
||||
if (*cursorPos >= 0) {
|
||||
//$ throw new IllegalArgumentException("Multiple cursors: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
*cursorPos = text.length();
|
||||
} else if (specials.indexOf(c) >= 0) {
|
||||
//$ throw new IllegalArgumentException("Unquoted special character: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
} else {
|
||||
text.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
|
||||
if (indexOf(name, SPECIALS) >= 0) {
|
||||
//throw new IllegalArgumentException(
|
||||
// "Special character in variable name: "
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
*
|
||||
* NO LONGER SUPPORTED:
|
||||
* If a Unicode category name is given, a standard character variable
|
||||
* in the range firstCategoryVariable to lastCategoryVariable is returned,
|
||||
* with value firstCategoryVariable + n, where n is the category
|
||||
* number.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
*/
|
||||
//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
|
||||
//$ UChar ch = data->lookupVariable(name, status);
|
||||
//$ //! if (ch == null) {
|
||||
//$ //! int id = UnicodeSet.getCategoryID(name);
|
||||
//$ //! if (id >= 0) {
|
||||
//$ //! ch = new Character((char) (firstCategoryVariable + id));
|
||||
//$ //! data->variableNames.put(name, ch);
|
||||
//$ //! data->setVariables.put(ch, new UnicodeSet(id));
|
||||
//$ //! }
|
||||
//$ //! }
|
||||
//$ if (ch == 0) {
|
||||
//$ throw new IllegalArgumentException("Undefined variable: "
|
||||
//$ + name);
|
||||
//$ }
|
||||
//$ return ch;
|
||||
//$ }
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
@ -599,43 +468,3 @@ int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars) {
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
if (setOfChars.indexOf(text.charAt(i)) >= 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
|
||||
const UnicodeString& setOfChars) {
|
||||
return indexOf(text, 0, text.length(), setOfChars);
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "unicode/rbt.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
class UnicodeSet;
|
||||
|
||||
class TransliterationRuleParser {
|
||||
|
||||
@ -49,29 +50,21 @@ class TransliterationRuleParser {
|
||||
static const UChar VARIABLE_DEF_OP;
|
||||
static const UChar FORWARD_RULE_OP;
|
||||
static const UChar REVERSE_RULE_OP;
|
||||
static const char* OPERATORS;
|
||||
|
||||
static const UChar FWDREV_RULE_OP; // internal rep of <> op
|
||||
static const UnicodeString OPERATORS;
|
||||
|
||||
// Other special characters
|
||||
static const UChar QUOTE;
|
||||
static const UChar ESCAPE;
|
||||
static const UChar END_OF_RULE;
|
||||
static const UChar RULE_COMMENT_CHAR;
|
||||
static const UChar VARIABLE_REF_OPEN;
|
||||
static const UChar VARIABLE_REF_CLOSE;
|
||||
static const UChar CONTEXT_OPEN;
|
||||
static const UChar CONTEXT_CLOSE;
|
||||
static const UChar SET_OPEN;
|
||||
static const UChar SET_CLOSE;
|
||||
static const UChar CURSOR_POS;
|
||||
static const UChar RULE_COMMENT_CHAR;
|
||||
|
||||
|
||||
/**
|
||||
* Specials must be quoted in rules to be used as literals.
|
||||
* Specials may not occur in variable names.
|
||||
*/
|
||||
static const char* SPECIALS;
|
||||
|
||||
/**
|
||||
* Specials that must be quoted in variable definitions.
|
||||
*/
|
||||
static const char* DEF_SPECIALS;
|
||||
|
||||
public:
|
||||
|
||||
@ -100,140 +93,38 @@ private:
|
||||
void parseRules(void);
|
||||
|
||||
/**
|
||||
* Parse the given substring as a rule, and append it to the rules currently
|
||||
* represented in this object.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void applyRule(int32_t start, int32_t limit);
|
||||
|
||||
/**
|
||||
* Add a variable definition.
|
||||
* @param name the name of the variable. It must not already be defined.
|
||||
* @param pattern the value of the variable. It may be a single character
|
||||
* or a pattern describing a character set.
|
||||
* @exception IllegalArgumentException if there is a syntax error
|
||||
*/
|
||||
void applyVariableDef(const UnicodeString& name,
|
||||
const UnicodeString& pattern);
|
||||
|
||||
/**
|
||||
* Given a rule, parses it into three pieces: The left side, the right side,
|
||||
* and the operator. Returns the operator. Quotes and variable references
|
||||
* are resolved; the output text in all <code>StringBuffer</code> parameters
|
||||
* is literal text. This method delegates to other parsing methods to
|
||||
* handle the match pattern, output pattern, and other sub-patterns in the
|
||||
* rule.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param left left side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param right right side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param anteContext the preceding context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param postContext the following context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param cursorPos if there is a cursor in the output pattern, its
|
||||
* offset is stored in <code>cursorPos[0]</code>
|
||||
* @return The operator character, one of the characters in OPERATORS.
|
||||
*/
|
||||
UChar parseRule(int32_t start, int32_t limit,
|
||||
UnicodeString& left, UnicodeString& right,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext,
|
||||
int32_t& cursorPos);
|
||||
|
||||
/**
|
||||
* Parses the match pattern of a forward or reverse rule. Given the raw
|
||||
* match pattern, return the match text and the context on both sides, if
|
||||
* any. Resolves all quotes and variables.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the key to be matched will be appended to this buffer
|
||||
* @param anteContext the preceding context, if any, will be appended
|
||||
* to this buffer.
|
||||
* @param postContext the following context, if any, will be appended
|
||||
* to this buffer.
|
||||
*/
|
||||
void parseMatchPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext);
|
||||
|
||||
void parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text);
|
||||
|
||||
/**
|
||||
* Parse a variable definition sub pattern. This kind of sub
|
||||
* pattern differs in the set of characters that are considered
|
||||
* special. In particular, the '[' and ']' characters are not
|
||||
* special, since these are used in UnicodeSet patterns.
|
||||
*/
|
||||
void parseDefPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text);
|
||||
|
||||
/**
|
||||
* Parses the output pattern of a forward or reverse rule. Given the
|
||||
* output pattern, return the output text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos[0]
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
*/
|
||||
void parseOutputPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t& cursorPos);
|
||||
|
||||
/**
|
||||
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos[0]
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
* @param specials characters that must be quoted; typically either
|
||||
* SPECIALS or DEF_SPECIALS.
|
||||
*/
|
||||
void parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t* cursorPos,
|
||||
const UnicodeString& specials);
|
||||
|
||||
void validateVariableName(const UnicodeString& name);
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
* MAIN PARSER. Parse the next rule in the given rule string, starting
|
||||
* at pos. Return the index after the last character parsed. Do not
|
||||
* parse characters at or after limit.
|
||||
*
|
||||
* NO LONGER SUPPORTED:
|
||||
* If a Unicode category name is given, a standard character variable
|
||||
* in the range firstCategoryVariable to lastCategoryVariable is returned,
|
||||
* with value firstCategoryVariable + n, where n is the category
|
||||
* number.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
* Important: The character at pos must be a non-whitespace character
|
||||
* that is not the comment character.
|
||||
*
|
||||
* This method handles quoting, escaping, and whitespace removal. It
|
||||
* parses the end-of-rule character. It recognizes context and cursor
|
||||
* indicators. Once it does a lexical breakdown of the rule at pos, it
|
||||
* creates a rule object and adds it to our rule list.
|
||||
*/
|
||||
//$ Character getVariableDef(const UnicodeString& name);
|
||||
int32_t parseRule(int32_t pos, int32_t limit);
|
||||
|
||||
/**
|
||||
* Called by main parser upon syntax error. Search the rule string
|
||||
* for the probable end of the rule. Of course, if the error is that
|
||||
* the end of rule marker is missing, then the rule end will not be found.
|
||||
* In any case the rule start will be correctly reported.
|
||||
* @param msg error description
|
||||
* @param rule pattern string
|
||||
* @param start position of first character of current rule
|
||||
*/
|
||||
int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
|
||||
|
||||
/**
|
||||
* Allocate a private-use substitution character for the given set,
|
||||
* register it in the setVariables hash, and return the substitution
|
||||
* character.
|
||||
*/
|
||||
UChar registerSet(UnicodeSet* adoptedSet);
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
@ -263,38 +154,6 @@ private:
|
||||
static int32_t quotedIndexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
static int32_t indexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
static int32_t indexOf(const UnicodeString& text,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -25,6 +25,7 @@
|
||||
* after the <code>key</code>
|
||||
* @param cursorPos a position for the cursor after the <code>output</code>
|
||||
* is emitted. If less than zero, then the cursor is placed after the
|
||||
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
@ -37,55 +38,93 @@ TransliterationRule::TransliterationRule(const UnicodeString& theKey,
|
||||
const UnicodeString& thePostContext,
|
||||
int32_t theCursorPos,
|
||||
UErrorCode &status) :
|
||||
key(theKey), output(theOutput),
|
||||
anteContext(theAnteContext),
|
||||
postContext(thePostContext),
|
||||
cursorPos(theCursorPos),
|
||||
maskKey(0) {
|
||||
|
||||
output(theOutput),
|
||||
cursorPos(theCursorPos)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
anteContextLength = theAnteContext.length();
|
||||
keyLength = theKey.length();
|
||||
pattern = theAnteContext;
|
||||
pattern.append(theKey).append(thePostContext);
|
||||
if (cursorPos < 0) {
|
||||
cursorPos = output.length();
|
||||
}
|
||||
if (cursorPos > output.length()) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
/* The mask key is needed when we are adding individual rules to a rule
|
||||
* set, for performance. Here are the numbers: Without mask key, 13.0
|
||||
* seconds. With mask key, 6.2 seconds. However, once the rules have
|
||||
* been added to the set, then they can be discarded to free up space.
|
||||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = new UnicodeString(key);
|
||||
if (maskKey == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
maskKey->append(postContext);
|
||||
}
|
||||
}
|
||||
|
||||
TransliterationRule::~TransliterationRule() {
|
||||
delete maskKey;
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
* @param input input string, including key and optional ante and
|
||||
* post context
|
||||
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||
* none. Must be <= input.length() if not -1.
|
||||
* @param postContextPos offset into input to start of post context, or -1
|
||||
* if none. Must be <= input.length() if not -1, and must be >=
|
||||
* anteContextPos.
|
||||
* @param output output string
|
||||
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||
* none. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
*/
|
||||
TransliterationRule::TransliterationRule(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
// Do range checks only when warranted to save time
|
||||
if (anteContextPos < 0) {
|
||||
anteContextLength = 0;
|
||||
} else {
|
||||
if (anteContextPos > input.length()) {
|
||||
// throw new IllegalArgumentException("Invalid ante context");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
anteContextLength = anteContextPos;
|
||||
}
|
||||
if (postContextPos < 0) {
|
||||
keyLength = input.length() - anteContextLength;
|
||||
} else {
|
||||
if (postContextPos < anteContextLength ||
|
||||
postContextPos > input.length()) {
|
||||
// throw new IllegalArgumentException("Invalid post context");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
keyLength = postContextPos - anteContextLength;
|
||||
}
|
||||
if (cursorPos < 0) {
|
||||
this->cursorPos = output.length();
|
||||
} else {
|
||||
if (cursorPos > output.length()) {
|
||||
// throw new IllegalArgumentException("Invalid cursor position");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
this->cursorPos = cursorPos;
|
||||
}
|
||||
pattern = input;
|
||||
this->output = output;
|
||||
}
|
||||
|
||||
TransliterationRule::~TransliterationRule() {}
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
int32_t TransliterationRule::getKeyLength(void) const {
|
||||
return key.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
const UnicodeString& TransliterationRule::getKey(void) const {
|
||||
return key;
|
||||
return keyLength;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -110,7 +149,45 @@ int32_t TransliterationRule::getCursorPos(void) const {
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
*/
|
||||
int32_t TransliterationRule::getAnteContextLength(void) const {
|
||||
return anteContext.length();
|
||||
return anteContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns 8-bit index value for this rule.
|
||||
* This is the low byte of the first character of the key,
|
||||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
return -1;
|
||||
}
|
||||
UChar c = pattern.charAt(anteContextLength);
|
||||
return data.lookupSet(c) == NULL ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
* representing the low byte of the first character of the key.
|
||||
* It matches this rule if it matches the first character of the
|
||||
* key, or if the first character of the key is a set, and the set
|
||||
* contains any character with a low byte equal to the index
|
||||
* value. If the rule contains only ante context, as in foo)>bar,
|
||||
* then it will match any key.
|
||||
*/
|
||||
bool_t TransliterationRule::matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data) {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
return TRUE;
|
||||
}
|
||||
UChar c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet* set = data.lookupSet(c);
|
||||
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -118,43 +195,37 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
bool_t TransliterationRule::masks(const TransliterationRule& r2) const {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
/* Rule r1 masks rule r2 if the string formed of the
|
||||
* antecontext, key, and postcontext overlaps in the following
|
||||
* way:
|
||||
*
|
||||
* 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
* r1: aakkkpppp
|
||||
* r2: aaakkkkkpppp
|
||||
* ^
|
||||
*
|
||||
* The strings must be aligned at the first character of the
|
||||
* key. The length of r1 to the left of the alignment point
|
||||
* must be <= the length of r2 to the left; ditto for the
|
||||
* right. The characters of r1 must equal (or be a superset
|
||||
* of) the corresponding characters of r2. The superset
|
||||
* operation should be performed to check for UnicodeSet
|
||||
* masking.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
* maskings are currently not detected. For example,
|
||||
* "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
|
||||
* we need a subset operator on UnicodeSet objects, which we
|
||||
* currently do not have. This can be added later.
|
||||
* "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
|
||||
*/
|
||||
return ((maskKey->length() < r2.maskKey->length() &&
|
||||
r2.maskKey->startsWith(*maskKey)) ||
|
||||
(r2.anteContext.length() != 0 && *maskKey == *r2.maskKey &&
|
||||
((anteContext.length() == 0) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
void TransliterationRule::freeze(void) {
|
||||
delete maskKey;
|
||||
maskKey = 0;
|
||||
int32_t len = pattern.length();
|
||||
int32_t left = anteContextLength;
|
||||
int32_t left2 = r2.anteContextLength;
|
||||
int32_t right = len - left;
|
||||
int32_t right2 = r2.pattern.length() - left2;
|
||||
return left <= left2 && right <= right2 &&
|
||||
0 == r2.pattern.compare(left2 - left, len, pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -186,17 +257,10 @@ bool_t TransliterationRule::matches(const UnicodeString& text,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
return
|
||||
(anteContext.length() == 0
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor - anteContext.length(),
|
||||
anteContext, data, filter)) &&
|
||||
regionMatches(text, start, limit, result, cursor,
|
||||
key, data, filter) &&
|
||||
(postContext.length() == 0
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor + key.length(),
|
||||
postContext, data, filter));
|
||||
// Match anteContext, key, and postContext
|
||||
return regionMatches(text, start, limit, result,
|
||||
cursor - anteContextLength,
|
||||
pattern, data, filter);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -219,15 +283,10 @@ bool_t TransliterationRule::matches(const Replaceable& text,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
return
|
||||
(anteContext.length() == 0
|
||||
|| regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, data, filter)) &&
|
||||
regionMatches(text, start, limit, cursor,
|
||||
key, data, filter) &&
|
||||
(postContext.length() == 0
|
||||
|| regionMatches(text, start, limit, cursor + key.length(),
|
||||
postContext, data, filter));
|
||||
// Match anteContext, key, and postContext
|
||||
return regionMatches(text, start, limit,
|
||||
cursor - anteContextLength,
|
||||
pattern, data, filter);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -260,28 +319,10 @@ int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
if (anteContext.length() != 0
|
||||
&& !regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, data, filter)) {
|
||||
return MISMATCH;
|
||||
}
|
||||
int32_t len = getRegionMatchLength(text, start, limit, cursor,
|
||||
key, data, filter);
|
||||
if (len < 0) {
|
||||
return MISMATCH;
|
||||
}
|
||||
if (len < key.length()) {
|
||||
return PARTIAL_MATCH;
|
||||
}
|
||||
if (postContext.length() == 0) {
|
||||
return FULL_MATCH;
|
||||
}
|
||||
len = getRegionMatchLength(text, start, limit,
|
||||
cursor + key.length(),
|
||||
postContext, data, filter);
|
||||
return (len < 0) ? MISMATCH
|
||||
: ((len == postContext.length()) ? FULL_MATCH
|
||||
: PARTIAL_MATCH);
|
||||
int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength,
|
||||
pattern, data, filter);
|
||||
return len < anteContextLength ? MISMATCH :
|
||||
(len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -72,9 +72,13 @@ public:
|
||||
private:
|
||||
|
||||
/**
|
||||
* The string that must be matched.
|
||||
* The string that must be matched, consisting of the anteContext, key,
|
||||
* and postContext, concatenated together, in that order. Some components
|
||||
* may be empty (zero length).
|
||||
* @see anteContextLength
|
||||
* @see keyLength
|
||||
*/
|
||||
UnicodeString key;
|
||||
UnicodeString pattern;
|
||||
|
||||
/**
|
||||
* The string that is emitted if the key, anteContext, and postContext
|
||||
@ -83,16 +87,18 @@ private:
|
||||
UnicodeString output;
|
||||
|
||||
/**
|
||||
* The string that must match before the key. If empty, then
|
||||
* there is no matching requirement before the key.
|
||||
* The length of the string that must match before the key. If
|
||||
* zero, then there is no matching requirement before the key.
|
||||
* Substring [0,anteContextLength) of pattern is the anteContext.
|
||||
*/
|
||||
UnicodeString anteContext;
|
||||
int32_t anteContextLength;
|
||||
|
||||
/**
|
||||
* The string that must match after the key. If empty, then there
|
||||
* is no matching requirement after the key.
|
||||
* The length of the key. Substring [anteContextLength,
|
||||
* anteContextLength + keyLength) is the key.
|
||||
|
||||
*/
|
||||
UnicodeString postContext;
|
||||
int32_t keyLength;
|
||||
|
||||
/**
|
||||
* The position of the cursor after emitting the output string, from 0 to
|
||||
@ -101,12 +107,6 @@ private:
|
||||
*/
|
||||
int32_t cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
* @see #freeze
|
||||
*/
|
||||
UnicodeString* maskKey;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
@ -134,6 +134,29 @@ public:
|
||||
int32_t theCursorPos,
|
||||
UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
* @param input input string, including key and optional ante and
|
||||
* post context
|
||||
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||
* none. Must be <= input.length() if not -1.
|
||||
* @param postContextPos offset into input to start of post context, or -1
|
||||
* if none. Must be <= input.length() if not -1, and must be >=
|
||||
* anteContextPos.
|
||||
* @param output output string
|
||||
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||
* none. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
*/
|
||||
TransliterationRule(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
@ -145,12 +168,6 @@ public:
|
||||
*/
|
||||
virtual int32_t getKeyLength(void) const;
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
virtual const UnicodeString& getKey(void) const;
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
@ -170,22 +187,39 @@ public:
|
||||
*/
|
||||
virtual int32_t getAnteContextLength(void) const;
|
||||
|
||||
private:
|
||||
friend class TransliterationRuleSet;
|
||||
|
||||
/**
|
||||
* Internal method. Returns 8-bit index value for this rule.
|
||||
* This is the low byte of the first character of the key,
|
||||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t getIndexValue(const TransliterationRuleData& data);
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
* representing the low byte of the first character of the key.
|
||||
* It matches this rule if it matches the first character of the
|
||||
* key, or if the first character of the key is a set, and the set
|
||||
* contains any character with a low byte equal to the index
|
||||
* value. If the rule contains only ante context, as in foo)>bar,
|
||||
* then it will match any key.
|
||||
*/
|
||||
bool_t matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data);
|
||||
|
||||
public:
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
virtual bool_t masks(const TransliterationRule& r2) const;
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
virtual void freeze(void);
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text. The text being matched
|
||||
* occupies a virtual buffer consisting of the contents of
|
||||
|
@ -30,6 +30,16 @@
|
||||
*/
|
||||
TransliterationRuleSet::TransliterationRuleSet() {
    // No rule array exists yet; the destructor's delete[] of NULL is a
    // no-op.
    rules = NULL;
    // Vector that addRule() appends adopted rules to.
    ruleVector = new UVector();
    // No rules yet, so no ante context has been seen.
    maxContextLength = 0;
}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
TransliterationRuleSet::~TransliterationRuleSet() {
|
||||
delete ruleVector;
|
||||
delete[] rules;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -45,31 +55,22 @@ int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
* @param adoptedRule the rule to add
|
||||
*/
|
||||
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
||||
UErrorCode& status) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete adoptedRule;
|
||||
return;
|
||||
}
|
||||
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* r = (TransliterationRule*) rules.elementAt(i);
|
||||
if (r->masks(*adoptedRule)) {
|
||||
//throw new IllegalArgumentException("Rule " + rule +
|
||||
// " must precede " + r);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
delete adoptedRule;
|
||||
return;
|
||||
}
|
||||
if (ruleVector == NULL) {
|
||||
// throw new IllegalArgumentException("Cannot add rules after freezing");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
delete adoptedRule;
|
||||
return;
|
||||
}
|
||||
ruleVector->addElement(adoptedRule);
|
||||
|
||||
rules.addElement(adoptedRule);
|
||||
int32_t len;
|
||||
if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
@ -77,13 +78,109 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
* Close this rule set to further additions, check it for masked rules,
|
||||
* and index it to optimize performance. Once this method is called,
|
||||
* addRule() can no longer be called.
|
||||
* @param status set to U_ILLEGAL_ARGUMENT_ERROR if some rules are masked (the Java original throws IllegalArgumentException)
|
||||
*/
|
||||
void TransliterationRuleSet::freeze(void) {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
((TransliterationRule*) rules.elementAt(i))->freeze();
|
||||
void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Construct the rule array and index table. We reorder the
|
||||
* rules by sorting them into 256 bins. Each bin contains all
|
||||
* rules matching the index value for that bin. A rule
|
||||
* matches an index value if string whose first key character
|
||||
* has a low byte equal to the index value can match the rule.
|
||||
*
|
||||
* Each bin contains zero or more rules, in the same order
|
||||
* they were found originally. However, the total rules in
|
||||
* the bins may exceed the number in the original vector,
|
||||
* since rules that have a variable as their first key
|
||||
* character will generally fall into more than one bin.
|
||||
*
|
||||
* That is, each bin contains all rules that either have that
|
||||
* first index value as their first key character, or have
|
||||
* a set containing the index value as their first character.
|
||||
*/
|
||||
int32_t n = ruleVector->size();
|
||||
int32_t j;
|
||||
int16_t x;
|
||||
UVector v(2*n); // heuristic; adjust as needed
|
||||
|
||||
/* Precompute the index values. This saves a LOT of time.
|
||||
*/
|
||||
int16_t* indexValue = new int16_t[n];
|
||||
for (j=0; j<n; ++j) {
|
||||
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
||||
indexValue[j] = r->getIndexValue(data);
|
||||
}
|
||||
for (x=0; x<256; ++x) {
|
||||
index[x] = v.size();
|
||||
for (j=0; j<n; ++j) {
|
||||
if (indexValue[j] >= 0) {
|
||||
if (indexValue[j] == x) {
|
||||
v.addElement(ruleVector->elementAt(j));
|
||||
}
|
||||
} else {
|
||||
// If the indexValue is < 0, then the first key character is
|
||||
// a set, and we must use the more time-consuming
|
||||
// matchesIndexValue check. In practice this happens
|
||||
// rarely, so we seldom tread this code path.
|
||||
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
||||
if (r->matchesIndexValue((uint8_t)x, data)) {
|
||||
v.addElement(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] indexValue;
|
||||
index[256] = v.size();
|
||||
|
||||
/* Freeze things into an array.
|
||||
*/
|
||||
rules = new TransliterationRule*[v.size()];
|
||||
for (j=0; j<v.size(); ++j) {
|
||||
rules[j] = (TransliterationRule*) v.elementAt(j);
|
||||
}
|
||||
delete ruleVector;
|
||||
ruleVector = NULL;
|
||||
|
||||
// TODO Add error reporting that indicates the rules that
|
||||
// are being masked.
|
||||
//UnicodeString errors;
|
||||
|
||||
/* Check for masking. This is MUCH faster than our old check,
|
||||
* which was each rule against each following rule, since we
|
||||
* only have to check for masking within each bin now. It's
|
||||
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
|
||||
* count, and n2 is the per-bin rule count. But n2<<n1, so
|
||||
* it's a big win.
|
||||
*/
|
||||
for (x=0; x<256; ++x) {
|
||||
for (j=index[x]; j<index[x+1]-1; ++j) {
|
||||
TransliterationRule* r1 = rules[j];
|
||||
for (int32_t k=j+1; k<index[x+1]; ++k) {
|
||||
TransliterationRule* r2 = rules[k];
|
||||
if (r1->masks(*r2)) {
|
||||
//| if (errors == null) {
|
||||
//| errors = new StringBuffer();
|
||||
//| } else {
|
||||
//| errors.append("\n");
|
||||
//| }
|
||||
//| errors.append("Rule " + r1 + " masks " + r2);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//if (errors != null) {
|
||||
// throw new IllegalArgumentException(errors.toString());
|
||||
//}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -119,15 +216,18 @@ TransliterationRuleSet::findMatch(const UnicodeString& text,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
if (rule->matches(text, start, limit, result,
|
||||
cursor, data, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int32_t rlen = result.length();
|
||||
int16_t x = 0xFF & (cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start));
|
||||
for (int32_t i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i]->matches(text, start, limit, result, cursor, data, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -154,15 +254,16 @@ TransliterationRuleSet::findMatch(const Replaceable& text,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
if (rule->matches(text, start, limit, cursor,
|
||||
data, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int16_t x = text.charAt(cursor) & 0xFF;
|
||||
for (int32_t i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i]->matches(text, start, limit, cursor, data, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -199,19 +300,22 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
|
||||
const TransliterationRuleData& data,
|
||||
bool_t& isPartial,
|
||||
const UnicodeFilter* filter) const {
|
||||
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
isPartial = FALSE;
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
int32_t match = rule->getMatchDegree(text, start, limit, cursor,
|
||||
data, filter);
|
||||
int16_t x = text.charAt(cursor) & 0xFF;
|
||||
for (int32_t i=index[x]; i<index[x+1]; ++i) {
|
||||
int32_t match = rules[i]->getMatchDegree(text, start, limit, cursor,
|
||||
data, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule::FULL_MATCH:
|
||||
return rule;
|
||||
return rules[i];
|
||||
case TransliterationRule::PARTIAL_MATCH:
|
||||
isPartial = TRUE;
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
@ -30,15 +30,30 @@ class UnicodeString;
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
* Vector of rules, in the order added. This is only used while the rule
|
||||
* set is getting built. After that, freeze() reorders and indexes the
|
||||
* rules, and this Vector is freed.
|
||||
*/
|
||||
UVector rules;
|
||||
UVector* ruleVector;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
int32_t maxContextLength;
|
||||
|
||||
/**
|
||||
* Sorted and indexed table of rules. This is created by freeze() from
|
||||
* the rules in ruleVector.
|
||||
*/
|
||||
TransliterationRule** rules;
|
||||
|
||||
/**
|
||||
* Index table. For text having a first character c, compute x = c&0xFF.
|
||||
* Now use rules[index[x]..index[x+1]-1]. This index table is created by
|
||||
* freeze().
|
||||
*/
|
||||
int32_t index[257];
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
@ -46,6 +61,11 @@ public:
|
||||
*/
|
||||
TransliterationRuleSet();
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~TransliterationRuleSet();
|
||||
|
||||
/**
|
||||
* Return the maximum context length.
|
||||
* @return the length of the longest preceding context.
|
||||
@ -57,16 +77,19 @@ public:
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
* @param adoptedRule the rule to add
|
||||
*/
|
||||
virtual void addRule(TransliterationRule* adoptedRule,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
* Close this rule set to further additions, check it for masked rules,
|
||||
* and index it to optimize performance. Once this method is called,
|
||||
* addRule() can no longer be called.
|
||||
* @param status set to U_ILLEGAL_ARGUMENT_ERROR if some rules are masked (the Java original throws IllegalArgumentException)
|
||||
*/
|
||||
virtual void freeze(void);
|
||||
virtual void freeze(const TransliterationRuleData& data,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text. The
|
||||
|
@ -14,7 +14,7 @@
|
||||
|
||||
// N.B.: This mapping is different in ICU and Java
|
||||
const UnicodeString UnicodeSet::CATEGORY_NAMES(
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf");
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
@ -28,7 +28,7 @@ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
|
||||
* Delimiter string used in patterns to close a category reference:
|
||||
* ":]". Example: "[:Lu:]".
|
||||
*/
|
||||
const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
|
||||
const UnicodeString UnicodeSet::CATEGORY_CLOSE = UNICODE_STRING(":]", 2);
|
||||
|
||||
/**
|
||||
* Delimiter char beginning a variable reference:
|
||||
@ -69,23 +69,20 @@ UnicodeSet::UnicodeSet() : pairs() {}
|
||||
* white space. See the class description for the syntax of the
|
||||
* pattern language.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param ignoreSpaces if <code>true</code>, all spaces in the
|
||||
* pattern are ignored, except those preceded by '\\'. Spaces are
|
||||
* those characters for which <code>Character.isSpaceChar()</code>
|
||||
* is <code>true</code>.
|
||||
* @exception <code>IllegalArgumentException</code> if the pattern
|
||||
* contains a syntax error.
|
||||
*/
|
||||
UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces,
|
||||
UErrorCode& status) : pairs() {
|
||||
applyPattern(pattern, ignoreSpaces, status);
|
||||
}
|
||||
|
||||
UnicodeSet::UnicodeSet(const UnicodeString& pattern, UErrorCode& status)
    : pairs()
{
    // Start from an empty pair list and let applyPattern() do the work;
    // any problem with the pattern is reported through status.
    applyPattern(pattern, status);
}
|
||||
|
||||
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
                       ParsePosition& pos,
                       const TransliterationRuleData* data,
                       UErrorCode& status)
    : pairs()
{
    // Hand the pattern, the caller's ParsePosition and the optional rule
    // data straight to parse(), which fills in `pairs`.
    parse(pairs, pattern, pos, data, status);
}
|
||||
|
||||
/**
|
||||
* Constructs a set from the given Unicode character category.
|
||||
* @param category an integer indicating the character category as
|
||||
@ -164,50 +161,24 @@ int32_t UnicodeSet::hashCode(void) const {
|
||||
* contains a syntax error.
|
||||
*/
|
||||
void UnicodeSet::applyPattern(const UnicodeString& pattern,
|
||||
bool_t ignoreSpaces,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
ParsePosition pos(0);
|
||||
UnicodeString* pat = (UnicodeString*) &pattern;
|
||||
parse(pairs, pattern, pos, NULL, status);
|
||||
|
||||
// To ignore spaces, create a new pattern without spaces. We
|
||||
// have to process all '\' escapes. If '\' is encountered,
|
||||
// insert it and the following character (if any -- let parse
|
||||
// deal with any syntax errors) in the pattern. This allows
|
||||
// escaped spaces.
|
||||
if (ignoreSpaces) {
|
||||
pat = new UnicodeString();
|
||||
for (int32_t i=0; i<pattern.length(); ++i) {
|
||||
UChar c = pattern.charAt(i);
|
||||
if (Unicode::isSpaceChar(c)) {
|
||||
continue;
|
||||
}
|
||||
if (c == '\\' && (i+1) < pattern.length()) {
|
||||
pat->append(c);
|
||||
c = pattern.charAt(++i);
|
||||
// Fall through and append the following char
|
||||
}
|
||||
pat->append(c);
|
||||
}
|
||||
// Skip over trailing whitespace
|
||||
int32_t i = pos.getIndex();
|
||||
int32_t n = pattern.length();
|
||||
while (i<n && Unicode::isWhitespace(pattern.charAt(i))) {
|
||||
++i;
|
||||
}
|
||||
|
||||
parse(pairs, *pat, pos, NULL, status);
|
||||
|
||||
// Skip over trailing whitespace -- clean up later
|
||||
while (pos.getIndex() < pat->length() &&
|
||||
Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
|
||||
pos.setIndex(pos.getIndex() + 1);
|
||||
}
|
||||
|
||||
if (pos.getIndex() != pat->length()) {
|
||||
if (i != n) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
if (pat != &pattern) {
|
||||
delete pat;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -279,6 +250,34 @@ bool_t UnicodeSet::contains(UChar c) const {
|
||||
return contains(c, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <tt>true</tt> if this set contains any character whose low byte
|
||||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
bool_t UnicodeSet::containsIndexValue(uint8_t v) const {
    /* The index value v, in the range [0,255], is contained in this set if
     * it is the low byte of some character in any pair (inclusive range)
     * of this set.  Writing a pair as aaxx..bbyy, where aa and bb are the
     * high bytes (bb >= aa), there are three cases:
     *
     *  - bb == aa:    the range stays inside one 256-character block, so
     *                 v is contained if xx <= v <= yy.
     *  - bb == aa+1:  the range covers the tail of block aa and the head
     *                 of block bb, so v is contained if xx <= v || v <= yy.
     *  - bb >  aa+1:  at least one whole block lies strictly between the
     *                 endpoints, so every low byte is contained.
     */
    for (int32_t i=0; i<pairs.length(); i+=2) {
        UChar low = pairs.charAt(i);
        UChar high = pairs.charAt(i+1);

        // Distance between the two endpoint blocks, in code units: 0 for
        // the same block, 0x100 for adjacent blocks, more otherwise.
        int32_t blockDistance = (int32_t) (high & 0xFF00) - (int32_t) (low & 0xFF00);

        if (blockDistance == 0) {
            if (uint8_t(low) <= v && v <= uint8_t(high)) {
                return TRUE;
            }
        } else if (blockDistance > 0x100 ||
                   uint8_t(low) <= v || v <= uint8_t(high)) {
            return TRUE;
        }
    }
    return FALSE;
}
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
|
Loading…
Reference in New Issue
Block a user