ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
This commit is contained in:
Alan Liu 2000-01-13 07:28:08 +00:00
parent cd8a516d90
commit 1a6cfef879
9 changed files with 781 additions and 883 deletions

View File

@ -43,6 +43,14 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status) {
defineVariable(name, standIn, status);
defineSet(standIn, adoptedSet, status);
}
void
TransliterationRuleData::defineSet(UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@ -50,9 +58,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
(void*) standIn,
&status);
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
adoptedSet,
&status);

View File

@ -72,6 +72,10 @@ public:
UnicodeSet* adoptedSet,
UErrorCode& status);
void defineSet(UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status);
UChar lookupVariable(const UnicodeString& name,
UErrorCode& status) const;

View File

@ -13,35 +13,30 @@
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
// Operators
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
const char* TransliterationRuleParser::OPERATORS = "=><";
const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
// Other special characters
const UChar TransliterationRuleParser::QUOTE = '\'';
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
const UChar TransliterationRuleParser::CURSOR_POS = '|';
const UChar TransliterationRuleParser::ESCAPE = '\\';
const UChar TransliterationRuleParser::END_OF_RULE = ';';
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
const UChar TransliterationRuleParser::SET_OPEN = '[';
const UChar TransliterationRuleParser::SET_CLOSE = ']';
const UChar TransliterationRuleParser::CURSOR_POS = '|';
/**
* Specials must be quoted in rules to be used as literals.
* Specials may not occur in variable names.
*
* This string is a superset of OPERATORS.
*/
const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
/**
* Specials that must be quoted in variable definitions.
*/
const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
@ -84,465 +79,339 @@ void TransliterationRuleParser::parseRules(void) {
determineVariableRange();
int32_t n = rules.length();
int32_t i = 0;
while (i<n && U_SUCCESS(status)) {
int32_t limit = rules.indexOf('\n', i);
// Recognize "\\\n" as an escaped "\n"
while (limit>0 && rules.charAt(limit-1) == '\\') {
limit = rules.indexOf('\n', limit+1);
int32_t pos = 0;
int32_t limit = rules.length();
while (pos < limit && U_SUCCESS(status)) {
UChar c = rules.charAt(pos++);
if (Unicode::isWhitespace(c)) {
// Ignore leading whitespace. Note that this is not
// Unicode spaces, but Java spaces -- a subset,
// representing whitespace likely to be seen in code.
continue;
}
if (limit == -1) {
limit = n;
// Skip lines starting with the comment character
if (c == RULE_COMMENT_CHAR) {
pos = rules.indexOf("\n", pos) + 1;
if (pos == 0) {
break; // No "\n" found; rest of rule is a comment
}
continue; // Either fall out or restart with next line
}
// Skip over empty lines and line starting with #
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
applyRule(i, limit);
}
i = limit + 1;
// We've found the start of a rule. c is its first
// character, and pos points past c. Lexically parse the
// rule into component pieces.
pos = parseRule(--pos, limit);
}
// Index the rules
if (U_SUCCESS(status)) {
data->ruleSet.freeze(*data, status);
}
data->ruleSet.freeze();
}
/**
* Parse the given substring as a rule, and append it to the rules currently
* represented in this object.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not
* parse characters at or after limit.
*
* Important: The character at pos must be a non-whitespace character
* that is not the comment character.
*
* This method handles quoting, escaping, and whitespace removal. It
* parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*/
void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
/* General description of parsing: Initially, rules contain two types of
* quoted characters. First, there are variable references, such as
* "{alpha}". Second, there are quotes, such as "'<'" or "''". One of
* the first steps in parsing a rule is to resolve such quoted matter.
* Quotes are removed early, leaving unquoted literal matter. Variable
* references are resolved and replaced by single characters. In some
* instances these characters represent themselves; in others, they
* stand for categories of characters. Character categories are either
* predefined (e.g., "{Lu}"), or are defined by the user using a
* statement (e.g., "vowels:aeiouAEIOU").
*
* Another early step in parsing is to split each rule into component
* pieces. These pieces are, for every rule, a left-hand side, a right-
* hand side, and an operator. The left- and right-hand sides may not
* be empty, except for the output patterns of forward and reverse
* rules. In addition to this partitioning, the match patterns of
* forward and reverse rules must be partitioned into antecontext,
* postcontext, and literal pattern, where the context portions may or
* may not be present. Finally, output patterns must have the cursor
* indicator '|' detected and removed, with its position recorded.
*
* Quote removal, variable resolution, and sub-pattern splitting must
* all happen at once. This is due chiefly to the quoting mechanism,
* which allows special characters to appear at arbitrary positions in
* the final unquoted text. (For this reason, alteration of the rule
* language is somewhat clumsy; it entails reassessment and revision of
* the parsing methods as a whole.)
*
* After this processing of rules is complete, the final end products
* are unquoted pieces of text of various types, and an integer cursor
* position, if one is specified. These processed raw materials are now
* easy to deal with; other classes such as UnicodeSet and
* TransliterationRule need know nothing of quoting or variables.
*/
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
// Locate the left side, operator, and right side
int32_t start = pos;
UChar op = 0;
UnicodeString buf;
int32_t cursor = -1; // position of cursor in buf
int32_t ante = -1; // position of ante context marker ')' in buf
int32_t post = -1; // position of post context marker '(' in buf
int32_t postClose = -1; // position of post context close ')' in buf
// Assigned to buf and its adjuncts after the LHS has been
// parsed. Thereafter, buf etc. refer to the RHS.
UnicodeString left;
UnicodeString right;
UnicodeString anteContext;
UnicodeString postContext;
int32_t cursorPos;
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
UChar op = parseRule(start, limit, left, right,
anteContext, postContext, cursorPos);
UnicodeString scratch;
if (U_FAILURE(status)) {
return;
while (pos < limit) {
UChar c = rules.charAt(pos++);
if (Unicode::isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode
// spaces, but Java spaces -- a subset, representing
// whitespace likely to be seen in code.
continue;
}
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
return syntaxError("Trailing backslash", rules, start);
}
// Parse \uXXXX escapes
c = rules.charAt(pos++);
if (c == 'u') {
if ((pos+4) > limit) {
return syntaxError("Malformed Unicode escape", rules, start);
}
c = (UChar)0x0000;
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
if (digit<0) {
return syntaxError("Malformed Unicode escape", rules, start);
}
c = (UChar) ((c << 4) | digit);
}
}
buf.append(c);
continue;
}
// Handle quoted matter
if (c == QUOTE) {
int32_t iq = rules.indexOf(QUOTE, pos);
if (iq == pos) {
buf.append(c); // Parse [''] outside quotes as [']
++pos;
} else {
/* This loop picks up a segment of quoted text of the
* form 'aaaa' each time through. If this segment
* hasn't really ended ('aaaa''bbbb') then it keeps
* looping, each time adding on a new segment. When it
* reaches the final quote it breaks.
*/
for (;;) {
if (iq < 0) {
return syntaxError("Unterminated quote", rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, iq, scratch);
buf.append(scratch);
pos = iq+1;
if (pos < limit && rules.charAt(pos) == QUOTE) {
// Parse [''] inside quotes as [']
iq = rules.indexOf(QUOTE, pos+1);
// Continue looping
} else {
break;
}
}
}
continue;
}
if (OPERATORS.indexOf(c) >= 0) {
if (op != 0) {
return syntaxError("Unquoted special", rules, start);
}
// Found an operator char. Check for forward-reverse operator.
if (c == REVERSE_RULE_OP &&
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
++pos;
op = FWDREV_RULE_OP;
} else {
op = c;
}
left = buf; // lhs
leftCursor = cursor;
leftAnte = ante;
leftPost = post;
leftPostClose = postClose;
buf.truncate(0);
cursor = ante = post = postClose = -1;
continue;
}
if (c == END_OF_RULE) {
break;
}
switch (c) {
case VARIABLE_REF_OPEN:
{
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
return syntaxError("Malformed variable reference", rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, j, scratch);
pos = j+1;
UChar v = data->lookupVariable(scratch, status);
if (U_FAILURE(status)) {
return syntaxError("Undefined variable", rules, start);
}
buf.append(v);
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
return syntaxError("Multiple post contexts", rules, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
if (buf.length() > 0) {
post = buf.length();
}
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
return syntaxError("Unexpected ')'", rules, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
return syntaxError("Multiple ante contexts", rules, start);
} else {
ante = buf.length();
}
break;
case SET_OPEN: {
ParsePosition pp(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
if (U_FAILURE(status)) {
return syntaxError("Invalid set", rules, start);
}
pos = pp.getIndex(); }
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
return syntaxError("Unquoted special", rules, start);
case CURSOR_POS:
if (cursor >= 0) {
return syntaxError("Multiple cursors", rules, start);
}
cursor = buf.length();
break;
default:
buf.append(c);
break;
}
}
if (op == 0) {
return syntaxError("No operator", rules, start);
}
// Check context close parameters
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
(postClose >= 0 && postClose != buf.length())) {
return syntaxError("Extra text after ]", rules, start);
}
// Context is only allowed on the input side; that is, the left side
// for forward rules. Cursors are only allowed on the output side;
// that is, the right side for forward rules. Bidirectional rules
// ignore elements that do not apply.
switch (op) {
case VARIABLE_DEF_OP:
applyVariableDef(left, right);
// LHS is the name. RHS is a single character, either a literal
// or a set (already parsed). If RHS is longer than one
// character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error.
if (buf.length() != 1) {
return syntaxError("Malformed RHS", rules, start);
}
if (data->isVariableDefined(left)) {
return syntaxError("Duplicate definition", rules, start);
}
data->defineVariable(left, buf.charAt(0), status);
break;
case FORWARD_RULE_OP:
if (direction == RuleBasedTransliterator::FORWARD) {
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
return syntaxError("Malformed rule", rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
left, right,
anteContext, postContext,
cursorPos, status),
status);
left, leftAnte, leftPost,
buf, cursor, status), status);
} // otherwise ignore the rule; it's not the direction we want
break;
case REVERSE_RULE_OP:
if (direction == RuleBasedTransliterator::REVERSE) {
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
return syntaxError("Malformed rule", rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
right, left,
anteContext, postContext,
cursorPos, status),
status);
buf, ante, post,
left, leftCursor, status), status);
} // otherwise ignore the rule; it's not the direction we want
break;
}
}
/**
* Add a variable definition.
* @param name the name of the variable. It must not already be defined.
* @param pattern the value of the variable. It may be a single character
* or a pattern describing a character set.
* @exception IllegalArgumentException if there is a syntax error
*/
void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
const UnicodeString& pattern) {
validateVariableName(name);
if (U_FAILURE(status)) {
return;
}
if (data->isVariableDefined(name)) {
// throw new IllegalArgumentException("Duplicate variable definition: "
// + name + '=' + pattern);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//! if (UnicodeSet.getCategoryID(name) >= 0) {
//! throw new IllegalArgumentException("Reserved variable name: "
//! + name);
//! }
if (pattern.length() < 1) {
// throw new IllegalArgumentException("Variable definition missing: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (pattern.length() == 1) {
// Got a single character variable definition
//$ data->variableNames.put(name, new Character(pattern.charAt(0)));
data->defineVariable(name, pattern.charAt(0), status);
} else {
// Got more than one character; parse it as a category
if (variableNext >= variableLimit) {
//$ throw new RuntimeException("Private use variables exhausted");
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
case FWDREV_RULE_OP:
if (direction == RuleBasedTransliterator::FORWARD) {
// The output side is the right; trim off any context
if (post >= 0) {
buf.remove(post);
}
if (ante >= 0) {
buf.removeBetween(0, ante);
}
data->ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
buf, cursor, status), status);
} else {
// The output side is the left; trim off any context
if (leftPost >= 0) {
left.remove(leftPost);
}
if (leftAnte >= 0) {
left.removeBetween(0, leftAnte);
}
data->ruleSet.addRule(new TransliterationRule(
buf, ante, post,
left, leftCursor, status), status);
}
//$ Character c = new Character(variableNext++);
//$ data->variableNames.put(name, c);
//$ data->setVariables.put(c, new UnicodeSet(pattern));
data->defineVariable(name, variableNext++,
new UnicodeSet(pattern, status),
status);
break;
}
return pos;
}
/**
* Given a rule, parses it into three pieces: The left side, the right side,
* and the operator. Returns the operator. Quotes and variable references
* are resolved; the output text in all <code>UnicodeString</code> parameters
* is literal text. This method delegates to other parsing methods to
* handle the match pattern, output pattern, and other sub-patterns in the
* rule.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param left left side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param right right side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param anteContext the preceding context of the match pattern,
* if there is one, is appended to this buffer
* @param postContext the following context of the match pattern,
* if there is one, is appended to this buffer
* @param cursorPos if there is a cursor in the output pattern, its
* offset is stored in <code>cursorPos</code>, otherwise set to -1.
* @return The operator character, one of the characters in OPERATORS.
* Called by main parser upon syntax error. Search the rule string
* for the probable end of the rule. Of course, if the error is that
* the end of rule marker is missing, then the rule end will not be found.
* In any case the rule start will be correctly reported.
* @param msg error description
* @param rule pattern string
* @param start position of first character of current rule
*/
UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
UnicodeString& left,
UnicodeString& right,
UnicodeString& anteContext,
UnicodeString& postContext,
int32_t& cursorPos) {
/* Parse the rule into three pieces -- left, operator, and right,
* parsing out quotes. The result is that left and right will have
* unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
* operators throw an exception. Two quotes inside or outside
* quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
*/
int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
if (i < 0) {
//$ throw new IllegalArgumentException(
//$ "Syntax error: "
//$ + rules.substring(start, limit));
int32_t TransliterationRuleParser::syntaxError(
        const char* /*msg*/,
        const UnicodeString& /*rule*/,
        int32_t start) {
    // The Java original searched for the end of the offending rule with
    //     quotedIndexOf(rule, start, rule.length(), ";")
    // (falling back to rule.length() when not found) and then threw
    //     IllegalArgumentException(msg + " in " + rule.substring(start, end)).
    // This port has no exception to throw, so msg and rule are unused and
    // only the error code is recorded.
    status = U_ILLEGAL_ARGUMENT_ERROR;

    // Hand the rule's start position back unchanged.
    return start;
}
/**
* Allocate a private-use substitution character for the given set,
* register it in the setVariables hash, and return the substitution
* character.
*/
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
if (variableNext >= variableLimit) {
// throw new RuntimeException("Private use variables exhausted");
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
cursorPos = -1;
UChar c = rules.charAt(i);
switch (c) {
case FORWARD_RULE_OP:
if (i == start) {
//$ throw new IllegalArgumentException(
//$ "Empty left side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
parseMatchPattern(start, i, left, anteContext, postContext);
if (i != (limit-1)) {
parseOutputPattern(i+1, limit, right, cursorPos);
}
break;
case REVERSE_RULE_OP:
if (i == (limit-1)) {
//$ throw new IllegalArgumentException(
//$ "Empty right side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if (i != start) {
parseOutputPattern(start, i, left, cursorPos);
}
parseMatchPattern(i+1, limit, right, anteContext, postContext);
break;
default:
if (i == start || i == (limit-1)) {
//$ throw new IllegalArgumentException(
//$ "Empty left or right side: "
//$ + rules.substring(start, limit));
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
parseSubPattern(start, i, left);
parseDefPattern(i+1, limit, right);
break;
}
UChar c = variableNext++;
data->defineSet(c, adoptedSet, status);
return c;
}
/**
 * Splits the match pattern of a forward or reverse rule into its key and
 * its optional surrounding contexts.  Each piece is handed to
 * parseSubPattern(), which resolves quotes and variable references.
 *
 * After an optional leading CONTEXT_OPEN and an optional trailing
 * CONTEXT_CLOSE have been dropped, the accepted layouts are:
 *     key
 *     anteContext CONTEXT_CLOSE key
 *     anteContext CONTEXT_CLOSE key CONTEXT_OPEN postContext
 *     key CONTEXT_OPEN postContext
 *
 * An empty range, or a CONTEXT_CLOSE located after a CONTEXT_OPEN, sets
 * status to U_ILLEGAL_ARGUMENT_ERROR and returns without parsing.
 *
 * @param start the beginning index into rules, inclusive
 * @param limit the ending index into rules, exclusive
 * @param text the key to be matched is appended to this buffer
 * @param anteContext the preceding context, if any, is appended to this
 * buffer
 * @param postContext the following context, if any, is appended to this
 * buffer
 */
void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
                                                  UnicodeString& text,
                                                  UnicodeString& anteContext,
                                                  UnicodeString& postContext) {
    if (limit <= start) {
        // Java original: IllegalArgumentException("Empty expression in rule: ...")
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // The outermost context delimiters are optional; step over them.
    if (rules.charAt(start) == CONTEXT_OPEN) {
        start += 1;
    }
    if (rules.charAt(limit - 1) == CONTEXT_CLOSE) {
        limit -= 1;
    }

    // Locate the context separators inside what remains.
    const int32_t anteEnd = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE);
    const int32_t postBegin = quotedIndexOf(rules, start, limit, CONTEXT_OPEN);

    // When both are present, the ante context must end before the post
    // context begins.
    if (anteEnd >= 0 && postBegin >= 0 && postBegin < anteEnd) {
        // Java original: IllegalArgumentException("Syntax error in context specifier: ...")
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Peel the ante context off the front ...
    if (anteEnd >= 0) {
        parseSubPattern(start, anteEnd, anteContext);
        start = anteEnd + 1;
    }

    // ... and the post context off the back ...
    if (postBegin >= 0) {
        parseSubPattern(postBegin + 1, limit, postContext);
        limit = postBegin;
    }

    // ... leaving the key itself.
    parseSubPattern(start, limit, text);
}
/**
 * Convenience form of parseSubPattern(): parses rules[start, limit) into
 * text with no cursor tracking (a null cursorPos) and with SPECIALS as the
 * set of characters that must be quoted.
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    int32_t* const noCursor = 0;
    parseSubPattern(start, limit, text, noCursor, SPECIALS);
}
/**
 * Parses the right-hand side of a variable definition.  It behaves like
 * the three-argument parseSubPattern() except that only the characters in
 * DEF_SPECIALS must be quoted.  '[' and ']' are deliberately absent from
 * that list because they are used in UnicodeSet patterns.
 *
 * @param start the beginning index into rules, inclusive
 * @param limit the ending index into rules, exclusive
 * @param text the parsed text is appended to this buffer
 */
void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    int32_t* const noCursor = 0;
    parseSubPattern(start, limit, text, noCursor, DEF_SPECIALS);
}
/**
 * Parses the output pattern of a forward or reverse rule, producing the
 * output text and the cursor position.  Quotes and variable references are
 * resolved by parseSubPattern(), with SPECIALS as the characters that must
 * be quoted.
 *
 * @param start the beginning index into rules, inclusive
 * @param limit the ending index into rules, exclusive
 * @param text the output text is appended to this buffer
 * @param cursorPos receives the cursor position within the output text,
 * or -1 if the pattern contains no cursor
 */
void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
                                                   UnicodeString& text,
                                                   int32_t& cursorPos) {
    int32_t* const cursorOut = &cursorPos;
    parseSubPattern(start, limit, text, cursorOut, SPECIALS);
}
/**
 * Parses the sub-pattern rules[start, limit) of a rule, appending the
 * resulting literal text to <code>text</code>.  Quotes are removed and
 * variable references are replaced by the character returned from
 * data->lookupVariable().
 *
 * On any syntax error status is set to U_ILLEGAL_ARGUMENT_ERROR; text may
 * already hold the characters parsed before the error was found.  The
 * errors detected are: an empty range, an empty variable reference or one
 * that is not closed within the range, an invalid or undefined variable
 * name, more than one cursor, an unquoted special character, and a quote
 * that is still open at the end of the range.
 *
 * @param start the beginning index into rules, inclusive
 * @param limit the ending index into rules, exclusive
 * @param text the output text is appended to this buffer
 * @param cursorPos if not null, <code>*cursorPos</code> is set to the
 * cursor position within text, or -1 if there is none.  If null, the
 * cursor character gets no special handling: it is rejected when it
 * appears in <code>specials</code> and copied literally otherwise.
 * @param specials characters that must be quoted; typically either
 * SPECIALS or DEF_SPECIALS.
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text,
                                                int32_t* cursorPos,
                                                const UnicodeString& specials) {
    bool_t inQuote = FALSE;

    if (start >= limit) {
        // Java original: IllegalArgumentException("Empty expression in rule")
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (cursorPos != 0) {
        *cursorPos = -1;
    }
    for (int32_t i=start; i<limit; ++i) {
        UChar c = rules.charAt(i);
        if (c == QUOTE) {
            // Two adjacent quotes, inside or outside quoted text, stand
            // for one literal quote; a lone quote toggles quoting.
            if ((i+1) < limit
                && rules.charAt(i+1) == QUOTE) {
                text.append(QUOTE);
                ++i; // Skip over both quotes
            } else {
                inQuote = !inQuote;
            }
        } else if (inQuote) {
            // Everything inside quotes is literal.
            text.append(c);
        } else if (c == VARIABLE_REF_OPEN) {
            ++i;
            int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
            // indexOf() searches to the end of rules, so a close found at
            // or beyond limit belongs to later text, not this sub-pattern.
            if (i == j || j < 0 || j >= limit) { // empty or unterminated
                // Java original: IllegalArgumentException("Illegal variable reference: ...")
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            UnicodeString name;
            rules.extractBetween(i, j, name);
            validateVariableName(name);
            if (U_FAILURE(status)) {
                return;
            }
            UChar ch = data->lookupVariable(name, status);
            if (U_FAILURE(status)) {
                return;
            }
            text.append(ch);
            i = j; // Loop increment steps past the closing delimiter
        } else if (c == CURSOR_POS && cursorPos != 0) {
            if (*cursorPos >= 0) {
                // Java original: IllegalArgumentException("Multiple cursors: ...")
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            *cursorPos = text.length();
        } else if (specials.indexOf(c) >= 0) {
            // Java original: IllegalArgumentException("Unquoted special character: ...")
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        } else {
            text.append(c);
        }
    }
    if (inQuote) {
        // The range ended inside quoted text: unterminated quote.
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
if (indexOf(name, SPECIALS) >= 0) {
//throw new IllegalArgumentException(
// "Special character in variable name: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
* Returns the single character value of the given variable name. Defined
* names are recognized.
*
* NO LONGER SUPPORTED:
* If a Unicode category name is given, a standard character variable
* in the range firstCategoryVariable to lastCategoryVariable is returned,
* with value firstCategoryVariable + n, where n is the category
* number.
* @exception IllegalArgumentException if the name is unknown.
*/
//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
//$ UChar ch = data->lookupVariable(name, status);
//$ //! if (ch == null) {
//$ //! int id = UnicodeSet.getCategoryID(name);
//$ //! if (id >= 0) {
//$ //! ch = new Character((char) (firstCategoryVariable + id));
//$ //! data->variableNames.put(name, ch);
//$ //! data->setVariables.put(ch, new UnicodeSet(id));
//$ //! }
//$ //! }
//$ if (ch == 0) {
//$ throw new IllegalArgumentException("Undefined variable: "
//$ + name);
//$ }
//$ return ch;
//$ }
/**
* Determines what part of the private use region of Unicode we can use for
* variable stand-ins. The correct way to do this is as follows: Parse each
@ -599,43 +468,3 @@ int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
}
return -1;
}
/**
 * Finds the first position in text[start, limit) holding any one of the
 * characters in <code>setOfChars</code>.  Unlike String.indexOf(), the
 * search is for membership in a set of characters rather than for a single
 * character.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
                                           int32_t start, int32_t limit,
                                           const UnicodeString& setOfChars) {
    int32_t pos = start;
    while (pos < limit) {
        const UChar candidate = text.charAt(pos);
        if (setOfChars.indexOf(candidate) >= 0) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
/**
 * Whole-string form of indexOf(): searches all of <code>text</code> for
 * the first character that also occurs in <code>setOfChars</code>.
 * @param text text to be searched
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
                                           const UnicodeString& setOfChars) {
    const int32_t length = text.length();
    return indexOf(text, 0, length, setOfChars);
}

View File

@ -11,6 +11,7 @@
#include "unicode/rbt.h"
class TransliterationRuleData;
class UnicodeSet;
class TransliterationRuleParser {
@ -49,29 +50,21 @@ class TransliterationRuleParser {
static const UChar VARIABLE_DEF_OP;
static const UChar FORWARD_RULE_OP;
static const UChar REVERSE_RULE_OP;
static const char* OPERATORS;
static const UChar FWDREV_RULE_OP; // internal rep of <> op
static const UnicodeString OPERATORS;
// Other special characters
static const UChar QUOTE;
static const UChar ESCAPE;
static const UChar END_OF_RULE;
static const UChar RULE_COMMENT_CHAR;
static const UChar VARIABLE_REF_OPEN;
static const UChar VARIABLE_REF_CLOSE;
static const UChar CONTEXT_OPEN;
static const UChar CONTEXT_CLOSE;
static const UChar SET_OPEN;
static const UChar SET_CLOSE;
static const UChar CURSOR_POS;
static const UChar RULE_COMMENT_CHAR;
/**
* Specials must be quoted in rules to be used as literals.
* Specials may not occur in variable names.
*/
static const char* SPECIALS;
/**
* Specials that must be quoted in variable definitions.
*/
static const char* DEF_SPECIALS;
public:
@ -100,140 +93,38 @@ private:
void parseRules(void);
/**
* Parse the given substring as a rule, and append it to the rules currently
* represented in this object.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void applyRule(int32_t start, int32_t limit);
/**
* Add a variable definition.
* @param name the name of the variable. It must not already be defined.
* @param pattern the value of the variable. It may be a single character
* or a pattern describing a character set.
* @exception IllegalArgumentException if there is a syntax error
*/
void applyVariableDef(const UnicodeString& name,
const UnicodeString& pattern);
/**
* Given a rule, parses it into three pieces: The left side, the right side,
* and the operator. Returns the operator. Quotes and variable references
* are resolved; the output text in all <code>UnicodeString</code> parameters
* is literal text. This method delegates to other parsing methods to
* handle the match pattern, output pattern, and other sub-patterns in the
* rule.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param left left side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param right right side of rule is appended to this buffer
* with the quotes removed and variables resolved
* @param anteContext the preceding context of the match pattern,
* if there is one, is appended to this buffer
* @param postContext the following context of the match pattern,
* if there is one, is appended to this buffer
* @param cursorPos if there is a cursor in the output pattern, its
* offset is stored in <code>cursorPos</code>, otherwise set to -1.
* @return The operator character, one of the characters in OPERATORS.
*/
UChar parseRule(int32_t start, int32_t limit,
UnicodeString& left, UnicodeString& right,
UnicodeString& anteContext,
UnicodeString& postContext,
int32_t& cursorPos);
/**
* Parses the match pattern of a forward or reverse rule. Given the raw
* match pattern, return the match text and the context on both sides, if
* any. Resolves all quotes and variables.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the key to be matched will be appended to this buffer
* @param anteContext the preceding context, if any, will be appended
* to this buffer.
* @param postContext the following context, if any, will be appended
* to this buffer.
*/
void parseMatchPattern(int32_t start, int32_t limit,
UnicodeString& text,
UnicodeString& anteContext,
UnicodeString& postContext);
void parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text);
/**
* Parse a variable definition sub pattern. This kind of sub
* pattern differs in the set of characters that are considered
* special. In particular, the '[' and ']' characters are not
* special, since these are used in UnicodeSet patterns.
*/
void parseDefPattern(int32_t start, int32_t limit,
UnicodeString& text);
/**
* Parses the output pattern of a forward or reverse rule. Given the
* output pattern, return the output text and the position of the cursor,
* if any. Resolves all quotes and variables.
* @param rules the string to be parsed
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the output text will be appended to this buffer
* @param cursorPos if this parameter is not null, then cursorPos[0]
* will be set to the cursor position, or -1 if there is none. If this
* parameter is null, then cursors will be disallowed.
*/
void parseOutputPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t& cursorPos);
/**
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
* if any. Resolves all quotes and variables.
* @param rules the string to be parsed
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= rules.length()</code>.
* @param text the output text will be appended to this buffer
* @param cursorPos if this parameter is not null, then cursorPos[0]
* will be set to the cursor position, or -1 if there is none. If this
* parameter is null, then cursors will be disallowed.
* @param specials characters that must be quoted; typically either
* SPECIALS or DEF_SPECIALS.
*/
void parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t* cursorPos,
const UnicodeString& specials);
void validateVariableName(const UnicodeString& name);
/**
* Returns the single character value of the given variable name. Defined
* names are recognized.
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not
* parse characters at or after limit.
*
* NO LONGER SUPPORTED:
* If a Unicode category name is given, a standard character variable
* in the range firstCategoryVariable to lastCategoryVariable is returned,
* with value firstCategoryVariable + n, where n is the category
* number.
* @exception IllegalArgumentException if the name is unknown.
* Important: The character at pos must be a non-whitespace character
* that is not the comment character.
*
* This method handles quoting, escaping, and whitespace removal. It
* parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*/
//$ Character getVariableDef(const UnicodeString& name);
int32_t parseRule(int32_t pos, int32_t limit);
/**
* Called by main parser upon syntax error. Search the rule string
* for the probable end of the rule. Of course, if the error is that
* the end of rule marker is missing, then the rule end will not be found.
* In any case the rule start will be correctly reported.
* @param msg error description
* @param rule pattern string
* @param start position of first character of current rule
*/
int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
/**
* Allocate a private-use substitution character for the given set,
* register it in the setVariables hash, and return the substitution
* character.
*/
UChar registerSet(UnicodeSet* adoptedSet);
/**
* Determines what part of the private use region of Unicode we can use for
* variable stand-ins. The correct way to do this is as follows: Parse each
@ -263,38 +154,6 @@ private:
static int32_t quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars);
/**
* Returns the index of the first character in a set. Unlike
* String.indexOf(), this method searches not for a single character, but
* for any character of the string <code>setOfChars</code>.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #quotedIndexOf
*/
static int32_t indexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars);
/**
* Returns the index of the first character in a set. Unlike
* String.indexOf(), this method searches not for a single character, but
* for any character of the string <code>setOfChars</code>.
* @param text text to be searched
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #quotedIndexOf
*/
static int32_t indexOf(const UnicodeString& text,
const UnicodeString& setOfChars);
};
#endif

View File

@ -25,6 +25,7 @@
* after the <code>key</code>
* @param cursorPos a position for the cursor after the <code>output</code>
* is emitted. If less than zero, then the cursor is placed after the
* <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown.
@ -37,55 +38,93 @@ TransliterationRule::TransliterationRule(const UnicodeString& theKey,
const UnicodeString& thePostContext,
int32_t theCursorPos,
UErrorCode &status) :
key(theKey), output(theOutput),
anteContext(theAnteContext),
postContext(thePostContext),
cursorPos(theCursorPos),
maskKey(0) {
output(theOutput),
cursorPos(theCursorPos)
{
if (U_FAILURE(status)) {
return;
}
anteContextLength = theAnteContext.length();
keyLength = theKey.length();
pattern = theAnteContext;
pattern.append(theKey).append(thePostContext);
if (cursorPos < 0) {
cursorPos = output.length();
}
if (cursorPos > output.length()) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
/* The mask key is needed when we are adding individual rules to a rule
* set, for performance. Here are the numbers: Without mask key, 13.0
* seconds. With mask key, 6.2 seconds. However, once the rules have
* been added to the set, then they can be discarded to free up space.
* This is what the freeze() method does. After freeze() has been
* called, the method masks() must NOT be called.
*/
maskKey = new UnicodeString(key);
if (maskKey == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
} else {
maskKey->append(postContext);
}
}
TransliterationRule::~TransliterationRule() {
delete maskKey;
/**
* Construct a new rule with the given input, output text, and other
* attributes. A cursor position may be specified for the output text.
* @param input input string, including key and optional ante and
* post context
* @param anteContextPos offset into input to end of ante context, or -1 if
* none. Must be <= input.length() if not -1.
* @param postContextPos offset into input to start of post context, or -1
* if none. Must be <= input.length() if not -1, and must be >=
* anteContextPos.
* @param output output string
* @param cursorPos offset into output at which cursor is located, or -1 if
* none. If less than zero, then the cursor is placed after the
* <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown.
*/
TransliterationRule::TransliterationRule(const UnicodeString& input,
                                         int32_t anteContextPos, int32_t postContextPos,
                                         const UnicodeString& output,
                                         int32_t cursorPos,
                                         UErrorCode& status) :
    // Give the scalar members defined values up front: every error path
    // below returns early, and without this they would be left
    // indeterminate on a rule whose construction failed.
    anteContextLength(0),
    keyLength(0),
    cursorPos(0)
{
    // Honor a failure reported by an earlier call; build nothing.
    if (U_FAILURE(status)) {
        return;
    }

    // Do range checks only when warranted to save time

    // Ante context: a negative position means "no ante context".
    if (anteContextPos >= 0) {
        if (anteContextPos > input.length()) {
            // Invalid ante context
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        anteContextLength = anteContextPos;
    }

    // Key: everything after the ante context, up to the post context if
    // one was given, otherwise up to the end of the input.
    if (postContextPos < 0) {
        keyLength = input.length() - anteContextLength;
    } else {
        if (postContextPos < anteContextLength ||
            postContextPos > input.length()) {
            // Invalid post context
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        keyLength = postContextPos - anteContextLength;
    }

    // Cursor: a negative position places the cursor after the output.
    // The parameter shadows the member, hence the explicit this->.
    if (cursorPos < 0) {
        this->cursorPos = output.length();
    } else {
        if (cursorPos > output.length()) {
            // Invalid cursor position
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        this->cursorPos = cursorPos;
    }

    // Only a fully validated rule receives its strings.
    pattern = input;
    this->output = output;
}
// Destructor. The body is empty: no explicit cleanup is performed here.
TransliterationRule::~TransliterationRule() {}
/**
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
* @return the length of the match key.
*/
int32_t TransliterationRule::getKeyLength(void) const {
return key.length();
}
/**
* Return the key.
* @return the match key.
*/
const UnicodeString& TransliterationRule::getKey(void) const {
return key;
return keyLength;
}
/**
@ -110,7 +149,45 @@ int32_t TransliterationRule::getCursorPos(void) const {
* <code>getMaximumContextLength()</code>.
*/
int32_t TransliterationRule::getAnteContextLength(void) const {
return anteContext.length();
return anteContextLength;
}
/**
* Internal method. Returns 8-bit index value for this rule.
* This is the low byte of the first character of the key,
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) {
    // When the ante context spans the whole pattern there is no key
    // character to index on, so the rule can match any key.
    if (pattern.length() == anteContextLength) {
        return -1;
    }

    // First character of the key sits right after the ante context.
    const UChar firstKeyChar = pattern.charAt(anteContextLength);

    // A stand-in that resolves to a set can match several characters,
    // so no single index value applies.
    if (data.lookupSet(firstKeyChar) != NULL) {
        return -1;
    }

    // Plain character: index on its low byte.
    return firstKeyChar & 0xFF;
}
/**
* Internal method. Returns true if this rule matches the given
* index value. The index value is an 8-bit integer, 0..255,
* representing the low byte of the first character of the key.
* It matches this rule if it matches the first character of the
* key, or if the first character of the key is a set, and the set
* contains any character with a low byte equal to the index
* value. If the rule contains only ante context, as in foo)>bar,
* then it will match any key.
*/
bool_t TransliterationRule::matchesIndexValue(uint8_t v,
                                              const TransliterationRuleData& data) {
    // A pattern made up solely of ante context matches any key, and
    // therefore any index value.
    if (pattern.length() == anteContextLength) {
        return TRUE;
    }

    const UChar firstKeyChar = pattern.charAt(anteContextLength);
    UnicodeSet* keySet = data.lookupSet(firstKeyChar);

    // Set stand-in: defer to the set's own low-byte containment test.
    if (keySet != NULL) {
        return keySet->containsIndexValue(v);
    }

    // Literal character: compare its low byte against the index value.
    return uint8_t(firstKeyChar) == v;
}
/**
@ -118,43 +195,37 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
* "[c]a>x" masks "[dc]a>y".
*
* <p>This method must not be called after freeze() is called.
*/
bool_t TransliterationRule::masks(const TransliterationRule& r2) const {
/* There are three cases of masking. In each instance, rule1
* masks rule2.
/* Rule r1 masks rule r2 if the string formed of the
* antecontext, key, and postcontext overlaps in the following
* way:
*
* 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
*
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
* prefix2 ends with prefix1, suffix2 starts with suffix1.
*
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
* prefix2 ends with prefix1, suffix2 starts with suffix1.
* r1: aakkkpppp
* r2: aaakkkkkpppp
* ^
*
* The strings must be aligned at the first character of the
* key. The length of r1 to the left of the alignment point
* must be <= the length of r2 to the left; ditto for the
* right. The characters of r1 must equal (or be a superset
* of) the corresponding characters of r2. The superset
* operation should be performed to check for UnicodeSet
* masking.
*/
/* LIMITATION of the current mask algorithm: Some rule
* maskings are currently not detected. For example,
* "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
* we need a subset operator on UnicodeSet objects, which we
* currently do not have. This can be added later.
* "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
*/
return ((maskKey->length() < r2.maskKey->length() &&
r2.maskKey->startsWith(*maskKey)) ||
(r2.anteContext.length() != 0 && *maskKey == *r2.maskKey &&
((anteContext.length() == 0) ||
(anteContext.length() < r2.anteContext.length() &&
r2.anteContext.endsWith(anteContext)))));
}
/**
* Free up space. Once this method is called, masks() must NOT be called.
* If it is called, an exception will be thrown.
*/
void TransliterationRule::freeze(void) {
delete maskKey;
maskKey = 0;
int32_t len = pattern.length();
int32_t left = anteContextLength;
int32_t left2 = r2.anteContextLength;
int32_t right = len - left;
int32_t right2 = r2.pattern.length() - left2;
return left <= left2 && right <= right2 &&
0 == r2.pattern.compare(left2 - left, len, pattern);
}
/**
@ -186,17 +257,10 @@ bool_t TransliterationRule::matches(const UnicodeString& text,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
return
(anteContext.length() == 0
|| regionMatches(text, start, limit, result,
cursor - anteContext.length(),
anteContext, data, filter)) &&
regionMatches(text, start, limit, result, cursor,
key, data, filter) &&
(postContext.length() == 0
|| regionMatches(text, start, limit, result,
cursor + key.length(),
postContext, data, filter));
// Match anteContext, key, and postContext
return regionMatches(text, start, limit, result,
cursor - anteContextLength,
pattern, data, filter);
}
/**
@ -219,15 +283,10 @@ bool_t TransliterationRule::matches(const Replaceable& text,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
return
(anteContext.length() == 0
|| regionMatches(text, start, limit, cursor - anteContext.length(),
anteContext, data, filter)) &&
regionMatches(text, start, limit, cursor,
key, data, filter) &&
(postContext.length() == 0
|| regionMatches(text, start, limit, cursor + key.length(),
postContext, data, filter));
// Match anteContext, key, and postContext
return regionMatches(text, start, limit,
cursor - anteContextLength,
pattern, data, filter);
}
/**
@ -260,28 +319,10 @@ int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
if (anteContext.length() != 0
&& !regionMatches(text, start, limit, cursor - anteContext.length(),
anteContext, data, filter)) {
return MISMATCH;
}
int32_t len = getRegionMatchLength(text, start, limit, cursor,
key, data, filter);
if (len < 0) {
return MISMATCH;
}
if (len < key.length()) {
return PARTIAL_MATCH;
}
if (postContext.length() == 0) {
return FULL_MATCH;
}
len = getRegionMatchLength(text, start, limit,
cursor + key.length(),
postContext, data, filter);
return (len < 0) ? MISMATCH
: ((len == postContext.length()) ? FULL_MATCH
: PARTIAL_MATCH);
int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength,
pattern, data, filter);
return len < anteContextLength ? MISMATCH :
(len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
}
/**

View File

@ -72,9 +72,13 @@ public:
private:
/**
* The string that must be matched.
* The string that must be matched, consisting of the anteContext, key,
* and postContext, concatenated together, in that order. Some components
* may be empty (zero length).
* @see anteContextLength
* @see keyLength
*/
UnicodeString key;
UnicodeString pattern;
/**
* The string that is emitted if the key, anteContext, and postContext
@ -83,16 +87,18 @@ private:
UnicodeString output;
/**
* The string that must match before the key. If empty, then
* there is no matching requirement before the key.
* The length of the string that must match before the key. If
* zero, then there is no matching requirement before the key.
* Substring [0,anteContextLength) of pattern is the anteContext.
*/
UnicodeString anteContext;
int32_t anteContextLength;
/**
* The string that must match after the key. If empty, then there
* is no matching requirement after the key.
* The length of the key. Substring [anteContextLength,
* anteContextLength + keyLength) is the key.
*/
UnicodeString postContext;
int32_t keyLength;
/**
* The position of the cursor after emitting the output string, from 0 to
@ -101,12 +107,6 @@ private:
*/
int32_t cursorPos;
/**
* A string used to implement masks().
* @see #freeze
*/
UnicodeString* maskKey;
public:
/**
@ -134,6 +134,29 @@ public:
int32_t theCursorPos,
UErrorCode &status);
/**
* Construct a new rule with the given input, output text, and other
* attributes. A cursor position may be specified for the output text.
* @param input input string, including key and optional ante and
* post context
* @param anteContextPos offset into input to end of ante context, or -1 if
* none. Must be <= input.length() if not -1.
* @param postContextPos offset into input to start of post context, or -1
* if none. Must be <= input.length() if not -1, and must be >=
* anteContextPos.
* @param output output string
* @param cursorPos offset into output at which cursor is located, or -1 if
* none. If less than zero, then the cursor is placed after the
* <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown.
*/
TransliterationRule(const UnicodeString& input,
int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& output,
int32_t cursorPos,
UErrorCode& status);
/**
* Destructor.
*/
@ -145,12 +168,6 @@ public:
*/
virtual int32_t getKeyLength(void) const;
/**
* Return the key.
* @return the match key.
*/
virtual const UnicodeString& getKey(void) const;
/**
* Return the output string.
* @return the output string.
@ -170,22 +187,39 @@ public:
*/
virtual int32_t getAnteContextLength(void) const;
private:
friend class TransliterationRuleSet;
/**
* Internal method. Returns 8-bit index value for this rule.
* This is the low byte of the first character of the key,
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
int16_t getIndexValue(const TransliterationRuleData& data);
/**
* Internal method. Returns true if this rule matches the given
* index value. The index value is an 8-bit integer, 0..255,
* representing the low byte of the first character of the key.
* It matches this rule if it matches the first character of the
* key, or if the first character of the key is a set, and the set
* contains any character with a low byte equal to the index
* value. If the rule contains only ante context, as in foo)>bar,
* then it will match any key.
*/
bool_t matchesIndexValue(uint8_t v,
const TransliterationRuleData& data);
public:
/**
* Return true if this rule masks another rule. If r1 masks r2 then
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
* "[c]a>x" masks "[dc]a>y".
*
* <p>This method must not be called after freeze() is called.
*/
virtual bool_t masks(const TransliterationRule& r2) const;
/**
* Free up space. Once this method is called, masks() must NOT be called.
* If it is called, an exception will be thrown.
*/
virtual void freeze(void);
/**
* Return true if this rule matches the given text. The text being matched
* occupies a virtual buffer consisting of the contents of

View File

@ -30,6 +30,16 @@
*/
TransliterationRuleSet::TransliterationRuleSet() {
    maxContextLength = 0;
    // Rules are collected here until freeze() builds the indexed table.
    ruleVector = new UVector();
    rules = NULL;
    // Start with every bin empty (index[x] == index[x+1] == 0). The
    // lookup loops read index[] unconditionally, so leaving it
    // uninitialized would make them walk garbage offsets into a NULL
    // rules table if they ran before freeze() filled the index.
    for (int32_t i = 0; i <= 256; ++i) {
        index[i] = 0;
    }
}
/**
* Destructor.
*/
TransliterationRuleSet::~TransliterationRuleSet() {
    // Release the frozen pointer table (NULL until freeze() has run)
    // and the build-time vector (NULL once freeze() has run). Deleting
    // NULL is a no-op, so neither needs a guard.
    // NOTE(review): only the containers are released here; whether the
    // adopted TransliterationRule objects are freed elsewhere (e.g. by
    // UVector) is not visible from this code -- confirm.
    delete[] rules;
    delete ruleVector;
}
/**
@ -45,31 +55,22 @@ int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
* significant.
*
* <p>Once freeze() is called, this method must not be called.
* @param rule the rule to add
* @param adoptedRule the rule to add
*/
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
UErrorCode& status) {
// Build time, no checking : 3562 ms
// Build time, with checking: 6234 ms
if (U_FAILURE(status)) {
delete adoptedRule;
return;
}
for (int32_t i=0; i<rules.size(); ++i) {
TransliterationRule* r = (TransliterationRule*) rules.elementAt(i);
if (r->masks(*adoptedRule)) {
//throw new IllegalArgumentException("Rule " + rule +
// " must precede " + r);
status = U_ILLEGAL_ARGUMENT_ERROR;
delete adoptedRule;
return;
}
if (ruleVector == NULL) {
// throw new IllegalArgumentException("Cannot add rules after freezing");
status = U_ILLEGAL_ARGUMENT_ERROR;
delete adoptedRule;
return;
}
ruleVector->addElement(adoptedRule);
rules.addElement(adoptedRule);
int32_t len;
if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
maxContextLength = len;
@ -77,13 +78,109 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
}
/**
* Free up space. Once this method is called, addRule() must NOT
* be called again.
* Close this rule set to further additions, check it for masked rules,
* and index it to optimize performance. Once this method is called,
* addRule() can no longer be called.
* @exception IllegalArgumentException if some rules are masked
*/
void TransliterationRuleSet::freeze(void) {
for (int32_t i=0; i<rules.size(); ++i) {
((TransliterationRule*) rules.elementAt(i))->freeze();
void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
/* Construct the rule array and index table. We reorder the
* rules by sorting them into 256 bins. Each bin contains all
* rules matching the index value for that bin. A rule
* matches an index value if string whose first key character
* has a low byte equal to the index value can match the rule.
*
* Each bin contains zero or more rules, in the same order
* they were found originally. However, the total rules in
* the bins may exceed the number in the original vector,
* since rules that have a variable as their first key
* character will generally fall into more than one bin.
*
* That is, each bin contains all rules that either have that
* first index value as their first key character, or have
* a set containing the index value as their first character.
*/
int32_t n = ruleVector->size();
int32_t j;
int16_t x;
UVector v(2*n); // heuristic; adjust as needed
/* Precompute the index values. This saves a LOT of time.
*/
int16_t* indexValue = new int16_t[n];
for (j=0; j<n; ++j) {
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
indexValue[j] = r->getIndexValue(data);
}
for (x=0; x<256; ++x) {
index[x] = v.size();
for (j=0; j<n; ++j) {
if (indexValue[j] >= 0) {
if (indexValue[j] == x) {
v.addElement(ruleVector->elementAt(j));
}
} else {
// If the indexValue is < 0, then the first key character is
// a set, and we must use the more time-consuming
// matchesIndexValue check. In practice this happens
// rarely, so we seldom tread this code path.
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
if (r->matchesIndexValue((uint8_t)x, data)) {
v.addElement(r);
}
}
}
}
delete[] indexValue;
index[256] = v.size();
/* Freeze things into an array.
*/
rules = new TransliterationRule*[v.size()];
for (j=0; j<v.size(); ++j) {
rules[j] = (TransliterationRule*) v.elementAt(j);
}
delete ruleVector;
ruleVector = NULL;
// TODO Add error reporting that indicates the rules that
// are being masked.
//UnicodeString errors;
/* Check for masking. This is MUCH faster than our old check,
* which was each rule against each following rule, since we
* only have to check for masking within each bin now. It's
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
* count, and n2 is the per-bin rule count. But n2<<n1, so
* it's a big win.
*/
for (x=0; x<256; ++x) {
for (j=index[x]; j<index[x+1]-1; ++j) {
TransliterationRule* r1 = rules[j];
for (int32_t k=j+1; k<index[x+1]; ++k) {
TransliterationRule* r2 = rules[k];
if (r1->masks(*r2)) {
//| if (errors == null) {
//| errors = new StringBuffer();
//| } else {
//| errors.append("\n");
//| }
//| errors.append("Rule " + r1 + " masks " + r2);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
}
}
}
//if (errors != null) {
// throw new IllegalArgumentException(errors.toString());
//}
}
/**
@ -119,15 +216,18 @@ TransliterationRuleSet::findMatch(const UnicodeString& text,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
for (int32_t i=0; i<rules.size(); ++i) {
TransliterationRule* rule =
(TransliterationRule*) rules.elementAt(i);
if (rule->matches(text, start, limit, result,
cursor, data, filter)) {
return rule;
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
int32_t rlen = result.length();
int16_t x = 0xFF & (cursor < rlen ? result.charAt(cursor)
: text.charAt(cursor - rlen + start));
for (int32_t i=index[x]; i<index[x+1]; ++i) {
if (rules[i]->matches(text, start, limit, result, cursor, data, filter)) {
return rules[i];
}
}
return 0;
return NULL;
}
/**
@ -154,15 +254,16 @@ TransliterationRuleSet::findMatch(const Replaceable& text,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
for (int32_t i=0; i<rules.size(); ++i) {
TransliterationRule* rule =
(TransliterationRule*) rules.elementAt(i);
if (rule->matches(text, start, limit, cursor,
data, filter)) {
return rule;
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
int16_t x = text.charAt(cursor) & 0xFF;
for (int32_t i=index[x]; i<index[x+1]; ++i) {
if (rules[i]->matches(text, start, limit, cursor, data, filter)) {
return rules[i];
}
}
return 0;
return NULL;
}
/**
@ -199,19 +300,22 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
const TransliterationRuleData& data,
bool_t& isPartial,
const UnicodeFilter* filter) const {
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
isPartial = FALSE;
for (int32_t i=0; i<rules.size(); ++i) {
TransliterationRule* rule =
(TransliterationRule*) rules.elementAt(i);
int32_t match = rule->getMatchDegree(text, start, limit, cursor,
data, filter);
int16_t x = text.charAt(cursor) & 0xFF;
for (int32_t i=index[x]; i<index[x+1]; ++i) {
int32_t match = rules[i]->getMatchDegree(text, start, limit, cursor,
data, filter);
switch (match) {
case TransliterationRule::FULL_MATCH:
return rule;
return rules[i];
case TransliterationRule::PARTIAL_MATCH:
isPartial = TRUE;
return 0;
return NULL;
}
}
return 0;
return NULL;
}

View File

@ -30,15 +30,30 @@ class UnicodeString;
*/
class TransliterationRuleSet {
/**
* Vector of rules, in the order added.
* Vector of rules, in the order added. This is only used while the rule
* set is getting built. After that, freeze() reorders and indexes the
* rules, and this Vector is freed.
*/
UVector rules;
UVector* ruleVector;
/**
* Length of the longest preceding context
*/
int32_t maxContextLength;
/**
* Sorted and indexed table of rules. This is created by freeze() from
* the rules in ruleVector.
*/
TransliterationRule** rules;
/**
* Index table. For text having a first character c, compute x = c&0xFF.
* Now use rules[index[x]..index[x+1]-1]. This index table is created by
* freeze().
*/
int32_t index[257];
public:
/**
@ -46,6 +61,11 @@ public:
*/
TransliterationRuleSet();
/**
* Destructor.
*/
virtual ~TransliterationRuleSet();
/**
* Return the maximum context length.
* @return the length of the longest preceding context.
@ -57,16 +77,19 @@ public:
* significant.
*
* <p>Once freeze() is called, this method must not be called.
* @param rule the rule to add
* @param adoptedRule the rule to add
*/
virtual void addRule(TransliterationRule* adoptedRule,
UErrorCode& status);
/**
* Free up space. Once this method is called, addRule() must NOT
* be called again.
* Close this rule set to further additions, check it for masked rules,
* and index it to optimize performance. Once this method is called,
* addRule() can no longer be called.
* @exception IllegalArgumentException if some rules are masked
*/
virtual void freeze(void);
virtual void freeze(const TransliterationRuleData& data,
UErrorCode& status);
/**
* Attempt to find a matching rule at the specified point in the text. The

View File

@ -14,7 +14,7 @@
// N.B.: This mapping is different in ICU and Java
const UnicodeString UnicodeSet::CATEGORY_NAMES(
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf");
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");
/**
* A cache mapping character category integers, as returned by
@ -28,7 +28,7 @@ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
* Delimiter string used in patterns to close a category reference:
* ":]". Example: "[:Lu:]".
*/
const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
const UnicodeString UnicodeSet::CATEGORY_CLOSE = UNICODE_STRING(":]", 2);
/**
* Delimiter char beginning a variable reference:
@ -69,23 +69,20 @@ UnicodeSet::UnicodeSet() : pairs() {}
* white space. See the class description for the syntax of the
* pattern language.
* @param pattern a string specifying what characters are in the set
* @param ignoreSpaces if <code>true</code>, all spaces in the
* pattern are ignored, except those preceded by '\\'. Spaces are
* those characters for which <code>Character.isSpaceChar()</code>
* is <code>true</code>.
* @exception <code>IllegalArgumentException</code> if the pattern
* contains a syntax error.
*/
// Starts with empty pairs and forwards the pattern, the ignoreSpaces flag
// and the status to the three-argument applyPattern().
UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces,
UErrorCode& status) : pairs() {
applyPattern(pattern, ignoreSpaces, status);
}
UnicodeSet::UnicodeSet(const UnicodeString& pattern, UErrorCode& status)
    : pairs()
{
    // Begin with an empty set, then let applyPattern() populate it and
    // report any problem through status.
    applyPattern(pattern, status);
}
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
                       ParsePosition& pos,
                       const TransliterationRuleData* data,
                       UErrorCode& status)
    : pairs()
{
    // Hand the pattern, parse position and rule data straight to
    // parse(), which fills in pairs.
    parse(pairs, pattern, pos, data, status);
}
/**
* Constructs a set from the given Unicode character category.
* @param category an integer indicating the character category as
@ -164,50 +161,24 @@ int32_t UnicodeSet::hashCode(void) const {
* contains a syntax error.
*/
void UnicodeSet::applyPattern(const UnicodeString& pattern,
bool_t ignoreSpaces,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
ParsePosition pos(0);
UnicodeString* pat = (UnicodeString*) &pattern;
parse(pairs, pattern, pos, NULL, status);
// To ignore spaces, create a new pattern without spaces. We
// have to process all '\' escapes. If '\' is encountered,
// insert it and the following character (if any -- let parse
// deal with any syntax errors) in the pattern. This allows
// escaped spaces.
if (ignoreSpaces) {
pat = new UnicodeString();
for (int32_t i=0; i<pattern.length(); ++i) {
UChar c = pattern.charAt(i);
if (Unicode::isSpaceChar(c)) {
continue;
}
if (c == '\\' && (i+1) < pattern.length()) {
pat->append(c);
c = pattern.charAt(++i);
// Fall through and append the following char
}
pat->append(c);
}
// Skip over trailing whitespace
int32_t i = pos.getIndex();
int32_t n = pattern.length();
while (i<n && Unicode::isWhitespace(pattern.charAt(i))) {
++i;
}
parse(pairs, *pat, pos, NULL, status);
// Skip over trailing whitespace -- clean up later
while (pos.getIndex() < pat->length() &&
Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
pos.setIndex(pos.getIndex() + 1);
}
if (pos.getIndex() != pat->length()) {
if (i != n) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (pat != &pattern) {
delete pat;
}
}
/**
@ -279,6 +250,34 @@ bool_t UnicodeSet::contains(UChar c) const {
return contains(c, c);
}
/**
* Returns <tt>true</tt> if this set contains any character whose low byte
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
*/
bool_t UnicodeSet::containsIndexValue(uint8_t v) const {
    /* The index value v, in the range [0,255], is contained in this set if
     * some pair low..high of this set holds a character whose low byte is
     * v.  There are three cases:
     *
     * 1. The pair spans 256 or more code units.  Then every possible low
     *    byte occurs somewhere in it, so v is always contained.  (Without
     *    this case a pair such as 01F0..0305 would wrongly reject v = 80,
     *    even though 0280 lies inside the pair.)
     * 2. The high bytes are equal: aaxx..aayy.  Then v is contained if
     *    xx <= v <= yy.
     * 3. The high bytes differ and the pair is shorter than 256 code
     *    units, so it wraps exactly once: aaxx..bbyy with bb == aa+1.
     *    Then v is contained if xx <= v || v <= yy.
     */
    for (int32_t i=0; i<pairs.length(); i+=2) {
        UChar low = pairs.charAt(i);
        UChar high = pairs.charAt(i+1);
        if (high - low >= 0xFF) {
            // Case 1: the pair covers a full run of 256 low bytes
            return TRUE;
        }
        if ((low & 0xFF00) == (high & 0xFF00)) {
            // Case 2: same high byte, simple interval test
            if (uint8_t(low) <= v && v <= uint8_t(high)) {
                return TRUE;
            }
        } else if (uint8_t(low) <= v || v <= uint8_t(high)) {
            // Case 3: the pair wraps across one high-byte boundary
            return TRUE;
        }
    }
    return FALSE;
}
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,