ICU-199 new rule syntax; performance improvement; update rules

X-SVN-Rev: 559
2000-01-13 07:28:08 +00:00 · 2000-01-13 07:28:08 +00:00 · 1a6cfef879
commit 1a6cfef879
parent cd8a516d90
9 changed files with 781 additions and 883 deletions
--- a/icu4c/source/i18n/rbt_data.cpp
+++ b/icu4c/source/i18n/rbt_data.cpp
@ -43,6 +43,14 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
                                        UChar standIn,
                                        UnicodeSet* adoptedSet,
                                        UErrorCode& status) {
+    defineVariable(name, standIn, status);
+    defineSet(standIn, adoptedSet, status);
+}
+
+void
+TransliterationRuleData::defineSet(UChar standIn,
+                                   UnicodeSet* adoptedSet,
+                                   UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
@ -50,9 +58,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
-    uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
-                 (void*) standIn,
-                 &status);
    uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
                 adoptedSet,
                 &status);
--- a/icu4c/source/i18n/rbt_data.h
+++ b/icu4c/source/i18n/rbt_data.h
@ -72,6 +72,10 @@ public:
                        UnicodeSet* adoptedSet,
                        UErrorCode& status);

+    void defineSet(UChar standIn,
+                   UnicodeSet* adoptedSet,
+                   UErrorCode& status);
+
    UChar lookupVariable(const UnicodeString& name,
                         UErrorCode& status) const;
    
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -13,35 +13,30 @@
 #include "unirange.h"
 #include "rbt_data.h"
 #include "unicode/uniset.h"
+#include "cstring.h"
+#include "unicode/parsepos.h"

 // Operators
 const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
 const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
 const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
-const char* TransliterationRuleParser::OPERATORS = "=><";
+const UChar TransliterationRuleParser::FWDREV_RULE_OP  = '~'; // internal rep of <> op
+const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);

 // Other special characters
 const UChar TransliterationRuleParser::QUOTE = '\'';
-const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
-const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
-const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
-const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
-const UChar TransliterationRuleParser::CURSOR_POS = '|';
+const UChar TransliterationRuleParser::ESCAPE = '\\';
+const UChar TransliterationRuleParser::END_OF_RULE = ';';
 const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';

+const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
+const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
+const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
+const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
+const UChar TransliterationRuleParser::SET_OPEN = '[';
+const UChar TransliterationRuleParser::SET_CLOSE = ']';
+const UChar TransliterationRuleParser::CURSOR_POS = '|';

-/**
- * Specials must be quoted in rules to be used as literals.
- * Specials may not occur in variable names.
- *
- * This string is a superset of OPERATORS.
- */
-const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
-
-/**
- * Specials that must be quoted in variable definitions.
- */
-const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";

 TransliterationRuleData*
 TransliterationRuleParser::parse(const UnicodeString& rules,
@ -84,465 +79,339 @@ void TransliterationRuleParser::parseRules(void) {
    
    determineVariableRange();

-    int32_t n = rules.length();
-    int32_t i = 0;
-    while (i<n && U_SUCCESS(status)) {
-        int32_t limit = rules.indexOf('\n', i);
-
-        // Recognize "\\\n" as an escaped "\n"
-        while (limit>0 && rules.charAt(limit-1) == '\\') {
-            limit = rules.indexOf('\n', limit+1);
+    int32_t pos = 0;
+    int32_t limit = rules.length();
+    while (pos < limit && U_SUCCESS(status)) {
+        UChar c = rules.charAt(pos++);
+        if (Unicode::isWhitespace(c)) {
+            // Ignore leading whitespace.  Note that these are not
+            // Unicode spaces, but Java spaces -- a subset,
+            // representing whitespace likely to be seen in code.
+            continue;
        }
-
-        if (limit == -1) {
-            limit = n;
+        // Skip lines starting with the comment character
+        if (c == RULE_COMMENT_CHAR) {
+            pos = rules.indexOf("\n", pos) + 1;
+            if (pos == 0) {
+                break; // No "\n" found; rest of rule is a comment
+            }
+            continue; // Either fall out or restart with next line
        }
-        // Skip over empty lines and line starting with #
-        if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
-            applyRule(i, limit);
-        }
-        i = limit + 1;
+        // We've found the start of a rule.  c is its first
+        // character, and pos points past c.  Lexically parse the
+        // rule into component pieces.
+        pos = parseRule(--pos, limit);                    
+    }
+    
+    // Index the rules
+    if (U_SUCCESS(status)) {
+        data->ruleSet.freeze(*data, status);
    }
-
-    data->ruleSet.freeze();
 }

 /**
- * Parse the given substring as a rule, and append it to the rules currently
- * represented in this object.
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= rules.length()</code>.
- * @exception IllegalArgumentException if there is a syntax error in the
- * rules
+ * MAIN PARSER.  Parse the next rule in the given rule string, starting
+ * at pos.  Return the index after the last character parsed.  Do not
+ * parse characters at or after limit.
+ *
+ * Important:  The character at pos must be a non-whitespace character
+ * that is not the comment character.
+ *
+ * This method handles quoting, escaping, and whitespace removal.  It
+ * parses the end-of-rule character.  It recognizes context and cursor
+ * indicators.  Once it does a lexical breakdown of the rule at pos, it
+ * creates a rule object and adds it to our rule list.
 */
-void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
-    /* General description of parsing: Initially, rules contain two types of
-     * quoted characters.  First, there are variable references, such as
-     * "{alpha}".  Second, there are quotes, such as "'<'" or "''".  One of
-     * the first steps in parsing a rule is to resolve such quoted matter.
-     * Quotes are removed early, leaving unquoted literal matter.  Variable
-     * references are resolved and replaced by single characters.  In some
-     * instances these characters represent themselves; in others, they
-     * stand for categories of characters.  Character categories are either
-     * predefined (e.g., "{Lu}"), or are defined by the user using a
-     * statement (e.g., "vowels:aeiouAEIOU").
-     *
-     * Another early step in parsing is to split each rule into component
-     * pieces.  These pieces are, for every rule, a left-hand side, a right-
-     * hand side, and an operator.  The left- and right-hand sides may not
-     * be empty, except for the output patterns of forward and reverse
-     * rules.  In addition to this partitioning, the match patterns of
-     * forward and reverse rules must be partitioned into antecontext,
-     * postcontext, and literal pattern, where the context portions may or
-     * may not be present.  Finally, output patterns must have the cursor
-     * indicator '|' detected and removed, with its position recorded.
-     *
-     * Quote removal, variable resolution, and sub-pattern splitting must
-     * all happen at once.  This is due chiefly to the quoting mechanism,
-     * which allows special characters to appear at arbitrary positions in
-     * the final unquoted text.  (For this reason, alteration of the rule
-     * language is somewhat clumsy; it entails reassessment and revision of
-     * the parsing methods as a whole.)
-     *
-     * After this processing of rules is complete, the final end products
-     * are unquoted pieces of text of various types, and an integer cursor
-     * position, if one is specified.  These processed raw materials are now
-     * easy to deal with; other classes such as UnicodeSet and
-     * TransliterationRule need know nothing of quoting or variables.
-     */
+int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
+    // Locate the left side, operator, and right side
+    int32_t start = pos;
+    UChar op = 0;
+
+    UnicodeString buf;
+    int32_t cursor = -1; // position of cursor in buf
+    int32_t ante = -1;   // position of ante context marker ')' in buf
+    int32_t post = -1;   // position of post context marker '(' in buf
+    int32_t postClose = -1; // position of post context close ')' in buf
+
+    // Assigned to buf and its adjuncts after the LHS has been
+    // parsed.  Thereafter, buf etc. refer to the RHS.
    UnicodeString left;
-    UnicodeString right;
-    UnicodeString anteContext;
-    UnicodeString postContext;
-    int32_t cursorPos;
+    int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;

-    UChar op = parseRule(start, limit, left, right,
-                         anteContext, postContext, cursorPos);
+    UnicodeString scratch;

-    if (U_FAILURE(status)) {
-        return;
+    while (pos < limit) {
+        UChar c = rules.charAt(pos++);
+        if (Unicode::isWhitespace(c)) {
+            // Ignore whitespace.  Note that these are not Unicode
+            // spaces, but Java spaces -- a subset, representing
+            // whitespace likely to be seen in code.
+            continue;
+        }
+        // Handle escapes
+        if (c == ESCAPE) {
+            if (pos == limit) {
+                return syntaxError("Trailing backslash", rules, start);
+            }
+            // Parse \uXXXX escapes
+            c = rules.charAt(pos++);
+            if (c == 'u') {
+                if ((pos+4) > limit) {
+                    return syntaxError("Malformed Unicode escape", rules, start);
+                }
+                c = (UChar)0x0000;
+                for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
+                    int32_t digit = Unicode::digit(rules.charAt(pos), 16);
+                    if (digit<0) {
+                        return syntaxError("Malformed Unicode escape", rules, start);
+                    }
+                    c = (UChar) ((c << 4) | digit);
+                }
+            }
+
+            buf.append(c);
+            continue;
+        }
+        // Handle quoted matter
+        if (c == QUOTE) {
+            int32_t iq = rules.indexOf(QUOTE, pos);
+            if (iq == pos) {
+                buf.append(c); // Parse [''] outside quotes as [']
+                ++pos;
+            } else {
+                /* This loop picks up a segment of quoted text of the
+                 * form 'aaaa' each time through.  If this segment
+                 * hasn't really ended ('aaaa''bbbb') then it keeps
+                 * looping, each time adding on a new segment.  When it
+                 * reaches the final quote it breaks.
+                 */
+                for (;;) {
+                    if (iq < 0) {
+                        return syntaxError("Unterminated quote", rules, start);
+                    }
+                    scratch.truncate(0);
+                    rules.extractBetween(pos, iq, scratch);
+                    buf.append(scratch);
+                    pos = iq+1;
+                    if (pos < limit && rules.charAt(pos) == QUOTE) {
+                        // Parse [''] inside quotes as [']
+                        iq = rules.indexOf(QUOTE, pos+1);
+                        // Continue looping
+                    } else {
+                        break;
+                    }
+                }
+            }
+            continue;
+        }
+        if (OPERATORS.indexOf(c) >= 0) {
+            if (op != 0) {
+                return syntaxError("Unquoted special", rules, start);
+            }
+            // Found an operator char.  Check for forward-reverse operator.
+            if (c == REVERSE_RULE_OP &&
+                (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
+                ++pos;
+                op = FWDREV_RULE_OP;
+            } else {
+                op = c;
+            }
+            left = buf; // lhs
+            leftCursor = cursor;
+            leftAnte = ante;
+            leftPost = post;
+            leftPostClose = postClose;
+
+            buf.truncate(0);
+            cursor = ante = post = postClose = -1;
+            continue;
+        }
+        if (c == END_OF_RULE) {
+            break;
+        }
+        switch (c) {
+        case VARIABLE_REF_OPEN:
+            {
+                int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
+                if (pos == j || j < 0) { // empty or unterminated
+                    return syntaxError("Malformed variable reference", rules, start);
+                }
+                scratch.truncate(0);
+                rules.extractBetween(pos, j, scratch);
+                pos = j+1;
+                UChar v = data->lookupVariable(scratch, status);
+                if (U_FAILURE(status)) {
+                    return syntaxError("Undefined variable", rules, start);
+                }
+                buf.append(v);
+            }
+            break;
+        case CONTEXT_OPEN:
+            if (post >= 0) {
+                return syntaxError("Multiple post contexts", rules, start);
+            }
+            // Ignore CONTEXT_OPEN if buffer length is zero -- that means
+            // this is the optional opening delimiter for the ante context.
+            if (buf.length() > 0) {
+                post = buf.length();
+            }
+            break;
+        case CONTEXT_CLOSE:
+            if (postClose >= 0) {
+                return syntaxError("Unexpected ')'", rules, start);
+            }
+            if (post >= 0) {
+                // This is probably the optional closing delimiter
+                // for the post context; save the pos and check later.
+                postClose = buf.length();
+            } else if (ante >= 0) {
+                return syntaxError("Multiple ante contexts", rules, start);
+            } else {
+                ante = buf.length();
+            }
+            break;
+        case SET_OPEN: {
+            ParsePosition pp(pos-1); // Backup to opening '['
+            buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
+            if (U_FAILURE(status)) {
+                return syntaxError("Invalid set", rules, start);
+            }
+            pos = pp.getIndex(); }
+            break;
+        case VARIABLE_REF_CLOSE:
+        case SET_CLOSE:
+            return syntaxError("Unquoted special", rules, start);
+        case CURSOR_POS:
+            if (cursor >= 0) {
+                return syntaxError("Multiple cursors", rules, start);
+            }
+            cursor = buf.length();
+            break;
+        default:
+            buf.append(c);
+            break;
+        }
    }
+    if (op == 0) {
+        return syntaxError("No operator", rules, start);
+    }
+
+    // Check context close parameters
+    if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
+        (postClose >= 0 && postClose != buf.length())) {
+        return syntaxError("Extra text after ]", rules, start);
+    }
+
+    // Context is only allowed on the input side; that is, the left side
+    // for forward rules.  Cursors are only allowed on the output side;
+    // that is, the right side for forward rules.  Bidirectional rules
+    // ignore elements that do not apply.

    switch (op) {
    case VARIABLE_DEF_OP:
-        applyVariableDef(left, right);
+        // LHS is the name.  RHS is a single character, either a literal
+        // or a set (already parsed).  If RHS is longer than one
+        // character, it is either a multi-character string, or multiple
+        // sets, or a mixture of chars and sets -- syntax error.
+        if (buf.length() != 1) {
+            return syntaxError("Malformed RHS", rules, start);
+        }
+        if (data->isVariableDefined(left)) {
+            return syntaxError("Duplicate definition", rules, start);
+        }
+        data->defineVariable(left, buf.charAt(0), status);
        break;
+
    case FORWARD_RULE_OP:
        if (direction == RuleBasedTransliterator::FORWARD) {
+            if (ante >= 0 || post >= 0 || leftCursor >= 0) {
+                return syntaxError("Malformed rule", rules, start);
+            }
            data->ruleSet.addRule(new TransliterationRule(
-                                     left, right,
-                                     anteContext, postContext,
-                                     cursorPos, status),
-                                  status);
+                                     left, leftAnte, leftPost,
+                                     buf, cursor, status), status);
        } // otherwise ignore the rule; it's not the direction we want
        break;
+
    case REVERSE_RULE_OP:
        if (direction == RuleBasedTransliterator::REVERSE) {
+            if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
+                return syntaxError("Malformed rule", rules, start);
+            }
            data->ruleSet.addRule(new TransliterationRule(
-                                     right, left,
-                                     anteContext, postContext,
-                                     cursorPos, status),
-                                  status);
+                                     buf, ante, post,
+                                     left, leftCursor, status), status);
        } // otherwise ignore the rule; it's not the direction we want
        break;
-    }
-}

-/**
- * Add a variable definition.
- * @param name the name of the variable.  It must not already be defined.
- * @param pattern the value of the variable.  It may be a single character
- * or a pattern describing a character set.
- * @exception IllegalArgumentException if there is a syntax error
- */
-void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
-                                                 const UnicodeString& pattern) {
-    validateVariableName(name);
-
-    if (U_FAILURE(status)) {
-        return;
-    }
-
-    if (data->isVariableDefined(name)) {
-        // throw new IllegalArgumentException("Duplicate variable definition: "
-        //                                   + name + '=' + pattern);
-        status = U_ILLEGAL_ARGUMENT_ERROR; 
-        return;
-    }
-//!         if (UnicodeSet.getCategoryID(name) >= 0) {
-//!             throw new IllegalArgumentException("Reserved variable name: "
-//!                                                + name);
-//!         }
-    if (pattern.length() < 1) {
-        // throw new IllegalArgumentException("Variable definition missing: "
-        //                                   + name);
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
-    if (pattern.length() == 1) {
-        // Got a single character variable definition
-        //$ data->variableNames.put(name, new Character(pattern.charAt(0)));
-        data->defineVariable(name, pattern.charAt(0), status);
-    } else {
-        // Got more than one character; parse it as a category
-        if (variableNext >= variableLimit) {
-            //$ throw new RuntimeException("Private use variables exhausted");
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return;
+    case FWDREV_RULE_OP:
+        if (direction == RuleBasedTransliterator::FORWARD) {
+            // The output side is the right; trim off any context
+            if (post >= 0) {
+                buf.remove(post);
+            }
+            if (ante >= 0) {
+                buf.removeBetween(0, ante);
+            }
+            data->ruleSet.addRule(new TransliterationRule(
+                                     left, leftAnte, leftPost,
+                                     buf, cursor, status), status);
+        } else {
+            // The output side is the left; trim off any context
+            if (leftPost >= 0) {
+                left.remove(leftPost);
+            }
+            if (leftAnte >= 0) {
+                left.removeBetween(0, leftAnte);
+            }
+            data->ruleSet.addRule(new TransliterationRule(
+                                     buf, ante, post,
+                                     left, leftCursor, status), status);
        }
-        //$ Character c = new Character(variableNext++);
-        //$ data->variableNames.put(name, c);
-        //$ data->setVariables.put(c, new UnicodeSet(pattern));
-        data->defineVariable(name, variableNext++,
-                             new UnicodeSet(pattern, status),
-                             status);
+        break;
    }
+
+    return pos;
 }

 /**
- * Given a rule, parses it into three pieces: The left side, the right side,
- * and the operator.  Returns the operator.  Quotes and variable references
- * are resolved; the otuput text in all <code>StringBuffer</code> parameters
- * is literal text.  This method delegates to other parsing methods to
- * handle the match pattern, output pattern, and other sub-patterns in the
- * rule.
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= rules.length()</code>.
- * @param left left side of rule is appended to this buffer
- * with the quotes removed and variables resolved
- * @param right right side of rule is appended to this buffer
- * with the quotes removed and variables resolved
- * @param anteContext the preceding context of the match pattern,
- * if there is one, is appended to this buffer
- * @param postContext the following context of the match pattern,
- * if there is one, is appended to this buffer
- * @param cursorPos if there is a cursor in the output pattern, its
- * offset is stored in <code>cursorPos</code>, otherwise set to -1.
- * @return The operator character, one of the characters in OPERATORS.
+ * Called by main parser upon syntax error.  Search the rule string
+ * for the probable end of the rule.  Of course, if the error is that
+ * the end of rule marker is missing, then the rule end will not be found.
+ * In any case the rule start will be correctly reported.
+ * @param msg error description
+ * @param rule pattern string
+ * @param start position of first character of current rule
 */
-UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
-                                           UnicodeString& left,
-                                           UnicodeString& right,
-                                           UnicodeString& anteContext,
-                                           UnicodeString& postContext,
-                                           int32_t& cursorPos) {
-    /* Parse the rule into three pieces -- left, operator, and right,
-     * parsing out quotes.  The result is that left and right will have
-     * unquoted text.  E.g., "gt<'>'" will have right = ">".  Unquoted
-     * operators throw an exception.  Two quotes inside or outside
-     * quotes indicates a quote literal.  E.g., "o''clock" -> "o'clock".
-     */
-    int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
-    if (i < 0) {
-        //$ throw new IllegalArgumentException(
-        //$              "Syntax error: "
-        //$              + rules.substring(start, limit));
+int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
+                                               const UnicodeString& /*rule*/,
+                                               int32_t start) {
+//|    int end = quotedIndexOf(rule, start, rule.length(), ";");
+//|    if (end < 0) {
+//|        end = rule.length();
+//|    }
+//|    throw new IllegalArgumentException(msg + " in " +
+//|                                       rule.substring(start, end));
+    status = U_ILLEGAL_ARGUMENT_ERROR;
+    return start;
+}
+
+/**
+ * Allocate a private-use substitution character for the given set,
+ * register it in the setVariables hash, and return the substitution
+ * character.
+ */
+UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
+    if (variableNext >= variableLimit) {
+        // throw new RuntimeException("Private use variables exhausted");
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
-    cursorPos = -1;
-    UChar c = rules.charAt(i);
-    switch (c) {
-    case FORWARD_RULE_OP:
-        if (i == start) {
-            //$ throw new IllegalArgumentException(
-            //$               "Empty left side: "
-            //$               + rules.substring(start, limit));
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return 0;
-        }
-        parseMatchPattern(start, i, left, anteContext, postContext);
-        if (i != (limit-1)) {
-            parseOutputPattern(i+1, limit, right, cursorPos);
-        }
-        break;
-    case REVERSE_RULE_OP:
-        if (i == (limit-1)) {
-            //$ throw new IllegalArgumentException(
-            //$               "Empty right side: "
-            //$               + rules.substring(start, limit));
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return 0;
-        }
-        if (i != start) {
-            parseOutputPattern(start, i, left, cursorPos);
-        }
-        parseMatchPattern(i+1, limit, right, anteContext, postContext);
-        break;
-    default:
-        if (i == start || i == (limit-1)) {
-            //$ throw new IllegalArgumentException(
-            //$               "Empty left or right side: "
-            //$               + rules.substring(start, limit));
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return 0;
-        }
-        parseSubPattern(start, i, left);
-        parseDefPattern(i+1, limit, right);
-        break;
-    }
+    UChar c = variableNext++;
+    data->defineSet(c, adoptedSet, status);
    return c;
 }

-/**
- * Parses the match pattern of a forward or reverse rule.  Given the raw
- * match pattern, return the match text and the context on both sides, if
- * any.  Resolves all quotes and variables.
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= rules.length()</code>.
- * @param text the key to be matched will be appended to this buffer
- * @param anteContext the preceding context, if any, will be appended
- * to this buffer.
- * @param postContext the following context, if any, will be appended
- * to this buffer.
- */
-void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
-                                                  UnicodeString& text,
-                                                  UnicodeString& anteContext,
-                                                  UnicodeString& postContext) {
-    if (start >= limit) {
-        //$ throw new IllegalArgumentException(
-        //$               "Empty expression in rule: "
-        //$               + rules.substring(start, limit));
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-    //$ if (anteContext != 0) {
-        // Ignore optional opening and closing context characters
-        if (rules.charAt(start) == CONTEXT_OPEN) {
-            ++start;
-        }
-        if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
-            --limit;
-        }
-        // The four possibilities are:
-        //             key
-        // anteContext]key
-        // anteContext]key[postContext
-        //             key[postContext
-        int32_t ante = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE);
-        int32_t post = quotedIndexOf(rules, start, limit, CONTEXT_OPEN);
-        if (ante >= 0 && post >= 0 && ante > post) {
-            //$ throw new IllegalArgumentException(
-            //$               "Syntax error in context specifier: "
-            //$               + rules.substring(start, limit));
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return;
-        }
-        if (ante >= 0) {
-            parseSubPattern(start, ante, anteContext);
-            start = ante+1;
-        }
-        if (post >= 0) {
-            parseSubPattern(post+1, limit, postContext);
-            limit = post;
-        }
-    //$ }
-    parseSubPattern(start, limit, text);
-}
-
-void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
-                                                UnicodeString& text) {
-    parseSubPattern(start, limit, text, 0, SPECIALS);
-}
-
-/**
- * Parse a variable definition sub pattern.  This kind of sub
- * pattern differs in the set of characters that are considered
- * special.  In particular, the '[' and ']' characters are not
- * special, since these are used in UnicodeSet patterns.
- */
-void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
-                                                UnicodeString& text) {
-    parseSubPattern(start, limit, text, 0, DEF_SPECIALS);
-}
-
-/**
- * Parses the output pattern of a forward or reverse rule.  Given the
- * output pattern, return the output text and the position of the cursor,
- * if any.  Resolves all quotes and variables.
- * @param rules the string to be parsed
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= rules.length()</code>.
- * @param text the output text will be appended to this buffer
- * @param cursorPos if this parameter is not null, then cursorPos
- * will be set to the cursor position, or -1 if there is none.  If this
- * parameter is null, then cursors will be disallowed.
- */
-void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
-                                                   UnicodeString& text,
-                                                   int32_t& cursorPos) {
-    parseSubPattern(start, limit, text, &cursorPos, SPECIALS);
-}
-
-/**
- * Parses a sub-pattern of a rule.  Return the text and the position of the cursor,
- * if any.  Resolves all quotes and variables.
- * @param rules the string to be parsed
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= rules.length()</code>.
- * @param text the output text will be appended to this buffer
- * @param cursorPos if this parameter is not null, then cursorPos
- * will be set to the cursor position, or -1 if there is none.  If this
- * parameter is null, then cursors will be disallowed.
- * @param specials characters that must be quoted; typically either
- * SPECIALS or DEF_SPECIALS.
- */
-void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
-                                                UnicodeString& text,
-                                                int32_t* cursorPos,
-                                                const UnicodeString& specials) {
-    bool_t inQuote = FALSE;
-
-    if (start >= limit) {
-        //$ throw new IllegalArgumentException("Empty expression in rule");
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-    if (cursorPos != 0) {
-        *cursorPos = -1;
-    }
-    for (int32_t i=start; i<limit; ++i) {
-        UChar c = rules.charAt(i);
-        if (c == QUOTE) {
-            // Check for double quote
-            if ((i+1) < limit
-                && rules.charAt(i+1) == QUOTE) {
-                text.append(QUOTE);
-                ++i; // Skip over both quotes
-            } else {
-                inQuote = !inQuote;
-            }
-        } else if (inQuote) {
-            text.append(c);
-        } else if (c == VARIABLE_REF_OPEN) {
-            ++i;
-            int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
-            if (i == j || j < 0) { // empty or unterminated
-                //$ throw new IllegalArgumentException("Illegal variable reference: "
-                //$                                    + rules.substring(start, limit));
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return;
-            }
-            UnicodeString name;
-            rules.extractBetween(i, j, name);
-            validateVariableName(name);
-            if (U_FAILURE(status)) {
-                return;
-            }
-            UChar ch = data->lookupVariable(name, status);
-            if (U_FAILURE(status)) {
-                return;
-            }
-            text.append(ch);
-            i = j;
-        } else if (c == CURSOR_POS && cursorPos != 0) {
-            if (*cursorPos >= 0) {
-                //$ throw new IllegalArgumentException("Multiple cursors: "
-                //$                                    + rules.substring(start, limit));
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return;
-            }
-            *cursorPos = text.length();
-        } else if (specials.indexOf(c) >= 0) {
-            //$ throw new IllegalArgumentException("Unquoted special character: "
-            //$                                    + rules.substring(start, limit));
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return;
-        } else {
-            text.append(c);
-        }
-    }
-}
-
-void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
-    if (indexOf(name, SPECIALS) >= 0) {
-        //throw new IllegalArgumentException(
-        //              "Special character in variable name: "
-        //              + name);
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-    }
-}
-
-/**
- * Returns the single character value of the given variable name.  Defined
- * names are recognized.
- *
- * NO LONGER SUPPORTED:
- * If a Unicode category name is given, a standard character variable
- * in the range firstCategoryVariable to lastCategoryVariable is returned,
- * with value firstCategoryVariable + n, where n is the category
- * number.
- * @exception IllegalArgumentException if the name is unknown.
- */
-//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
-//$     UChar ch = data->lookupVariable(name, status);
-//$ //!         if (ch == null) {
-//$ //!             int id = UnicodeSet.getCategoryID(name);
-//$ //!             if (id >= 0) {
-//$ //!                 ch = new Character((char) (firstCategoryVariable + id));
-//$ //!                 data->variableNames.put(name, ch);
-//$ //!                 data->setVariables.put(ch, new UnicodeSet(id));
-//$ //!             }
-//$ //!         }
-//$     if (ch == 0) {
-//$         throw new IllegalArgumentException("Undefined variable: "
-//$                                            + name);
-//$     }
-//$     return ch;
-//$ }
-
 /**
 * Determines what part of the private use region of Unicode we can use for
 * variable stand-ins.  The correct way to do this is as follows: Parse each
@ -599,43 +468,3 @@ int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
    }
    return -1;
 }
-
-/**
- * Returns the index of the first character in a set.  Unlike
- * String.indexOf(), this method searches not for a single character, but
- * for any character of the string <code>setOfChars</code>.
- * @param text text to be searched
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= text.length()</code>.
- * @param setOfChars string with one or more distinct characters
- * @return Offset of the first character in <code>setOfChars</code>
- * found, or -1 if not found.
- * @see #quotedIndexOf
- */
-int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
-                                           int32_t start, int32_t limit,
-                                           const UnicodeString& setOfChars) {
-    for (int32_t i=start; i<limit; ++i) {
-        if (setOfChars.indexOf(text.charAt(i)) >= 0) {
-            return i;
-        }
-    }
-    return -1;
-}
-
-/**
- * Returns the index of the first character in a set.  Unlike
- * String.indexOf(), this method searches not for a single character, but
- * for any character of the string <code>setOfChars</code>.
- * @param text text to be searched
- * @param setOfChars string with one or more distinct characters
- * @return Offset of the first character in <code>setOfChars</code>
- * found, or -1 if not found.
- * @see #quotedIndexOf
- */
-int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
-                                           const UnicodeString& setOfChars) {
-    return indexOf(text, 0, text.length(), setOfChars);
-}
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -11,6 +11,7 @@
 #include "unicode/rbt.h"

 class TransliterationRuleData;
+class UnicodeSet;

 class TransliterationRuleParser {

@ -49,29 +50,21 @@ class TransliterationRuleParser {
    static const UChar VARIABLE_DEF_OP;
    static const UChar FORWARD_RULE_OP;
    static const UChar REVERSE_RULE_OP;
-    static const char* OPERATORS;
-
+    static const UChar FWDREV_RULE_OP; // internal rep of <> op
+    static const UnicodeString OPERATORS;

    // Other special characters
    static const UChar QUOTE;
+    static const UChar ESCAPE;
+    static const UChar END_OF_RULE;
+    static const UChar RULE_COMMENT_CHAR;
    static const UChar VARIABLE_REF_OPEN;
    static const UChar VARIABLE_REF_CLOSE;
    static const UChar CONTEXT_OPEN;
    static const UChar CONTEXT_CLOSE;
+    static const UChar SET_OPEN;
+    static const UChar SET_CLOSE;
    static const UChar CURSOR_POS;
-    static const UChar RULE_COMMENT_CHAR;
-
-
-    /**
-     * Specials must be quoted in rules to be used as literals.
-     * Specials may not occur in variable names.
-     */
-    static const char* SPECIALS;
-
-    /**
-     * Specials that must be quoted in variable definitions.
-     */
-    static const char* DEF_SPECIALS;

 public:

@ -100,140 +93,38 @@ private:
    void parseRules(void);

    /**
-     * Parse the given substring as a rule, and append it to the rules currently
-     * represented in this object.
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= rules.length()</code>.
-     * @exception IllegalArgumentException if there is a syntax error in the
-     * rules
-     */
-    void applyRule(int32_t start, int32_t limit);
-
-    /**
-     * Add a variable definition.
-     * @param name the name of the variable.  It must not already be defined.
-     * @param pattern the value of the variable.  It may be a single character
-     * or a pattern describing a character set.
-     * @exception IllegalArgumentException if there is a syntax error
-     */
-    void applyVariableDef(const UnicodeString& name,
-                          const UnicodeString& pattern);
-
-    /**
-     * Given a rule, parses it into three pieces: The left side, the right side,
-     * and the operator.  Returns the operator.  Quotes and variable references
-     * are resolved; the otuput text in all <code>StringBuffer</code> parameters
-     * is literal text.  This method delegates to other parsing methods to
-     * handle the match pattern, output pattern, and other sub-patterns in the
-     * rule.
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= rules.length()</code>.
-     * @param left left side of rule is appended to this buffer
-     * with the quotes removed and variables resolved
-     * @param right right side of rule is appended to this buffer
-     * with the quotes removed and variables resolved
-     * @param anteContext the preceding context of the match pattern,
-     * if there is one, is appended to this buffer
-     * @param postContext the following context of the match pattern,
-     * if there is one, is appended to this buffer
-     * @param cursorPos if there is a cursor in the output pattern, its
-     * offset is stored in <code>cursorPos[0]</code>
-     * @return The operator character, one of the characters in OPERATORS.
-     */
-    UChar parseRule(int32_t start, int32_t limit,
-                    UnicodeString& left, UnicodeString& right,
-                    UnicodeString& anteContext,
-                    UnicodeString& postContext,
-                    int32_t& cursorPos);
-
-    /**
-     * Parses the match pattern of a forward or reverse rule.  Given the raw
-     * match pattern, return the match text and the context on both sides, if
-     * any.  Resolves all quotes and variables.
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= rules.length()</code>.
-     * @param text the key to be matched will be appended to this buffer
-     * @param anteContext the preceding context, if any, will be appended
-     * to this buffer.
-     * @param postContext the following context, if any, will be appended
-     * to this buffer.
-     */
-    void parseMatchPattern(int32_t start, int32_t limit,
-                           UnicodeString& text,
-                           UnicodeString& anteContext,
-                           UnicodeString& postContext);
-
-    void parseSubPattern(int32_t start, int32_t limit,
-                         UnicodeString& text);
-    
-    /**
-     * Parse a variable definition sub pattern.  This kind of sub
-     * pattern differs in the set of characters that are considered
-     * special.  In particular, the '[' and ']' characters are not
-     * special, since these are used in UnicodeSet patterns.
-     */
-    void parseDefPattern(int32_t start, int32_t limit,
-                         UnicodeString& text);
-    
-    /**
-     * Parses the output pattern of a forward or reverse rule.  Given the
-     * output pattern, return the output text and the position of the cursor,
-     * if any.  Resolves all quotes and variables.
-     * @param rules the string to be parsed
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= rules.length()</code>.
-     * @param text the output text will be appended to this buffer
-     * @param cursorPos if this parameter is not null, then cursorPos[0]
-     * will be set to the cursor position, or -1 if there is none.  If this
-     * parameter is null, then cursors will be disallowed.
-     */
-    void parseOutputPattern(int32_t start, int32_t limit,
-                            UnicodeString& text,
-                            int32_t& cursorPos);
-
-    /**
-     * Parses a sub-pattern of a rule.  Return the text and the position of the cursor,
-     * if any.  Resolves all quotes and variables.
-     * @param rules the string to be parsed
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= rules.length()</code>.
-     * @param text the output text will be appended to this buffer
-     * @param cursorPos if this parameter is not null, then cursorPos[0]
-     * will be set to the cursor position, or -1 if there is none.  If this
-     * parameter is null, then cursors will be disallowed.
-     * @param specials characters that must be quoted; typically either
-     * SPECIALS or DEF_SPECIALS.
-     */
-    void parseSubPattern(int32_t start, int32_t limit,
-                         UnicodeString& text,
-                         int32_t* cursorPos,
-                         const UnicodeString& specials);
-
-    void validateVariableName(const UnicodeString& name);
-
-    /**
-     * Returns the single character value of the given variable name.  Defined
-     * names are recognized.
+     * MAIN PARSER.  Parse the next rule in the given rule string, starting
+     * at pos.  Return the index after the last character parsed.  Do not
+     * parse characters at or after limit.
     *
-     * NO LONGER SUPPORTED:
-     * If a Unicode category name is given, a standard character variable
-     * in the range firstCategoryVariable to lastCategoryVariable is returned,
-     * with value firstCategoryVariable + n, where n is the category
-     * number.
-     * @exception IllegalArgumentException if the name is unknown.
+     * Important:  The character at pos must be a non-whitespace character
+     * that is not the comment character.
+     *
+     * This method handles quoting, escaping, and whitespace removal.  It
+     * parses the end-of-rule character.  It recognizes context and cursor
+     * indicators.  Once it does a lexical breakdown of the rule at pos, it
+     * creates a rule object and adds it to our rule list.
     */
-    //$ Character getVariableDef(const UnicodeString& name);
+    int32_t parseRule(int32_t pos, int32_t limit);

+    /**
+     * Called by the main parser upon a syntax error.  Searches the rule
+     * string for the probable end of the rule; if the error is that the
+     * end-of-rule marker is missing, the rule end will not be found, but
+     * the rule start is still reported correctly.
+     * @param msg error description
+     * @param rule pattern string (second parameter; unnamed in this declaration)
+     * @param start position of first character of current rule
+     * @return not documented here -- TODO confirm against the definition
+     */
+    int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
+
+    /**
+     * Allocates a private-use substitution character for the given set and
+     * registers it in the setVariables hash.
+     * @return the substitution character allocated for adoptedSet
+     */
+    UChar registerSet(UnicodeSet* adoptedSet);
+ 
    /**
     * Determines what part of the private use region of Unicode we can use for
     * variable stand-ins.  The correct way to do this is as follows: Parse each
@ -263,38 +154,6 @@ private:
    static int32_t quotedIndexOf(const UnicodeString& text,
                                 int32_t start, int32_t limit,
                                 const UnicodeString& setOfChars);
-
-    /**
-     * Returns the index of the first character in a set.  Unlike
-     * String.indexOf(), this method searches not for a single character, but
-     * for any character of the string <code>setOfChars</code>.
-     * @param text text to be searched
-     * @param start the beginning index, inclusive; <code>0 <= start
-     * <= limit</code>.
-     * @param limit the ending index, exclusive; <code>start <= limit
-     * <= text.length()</code>.
-     * @param setOfChars string with one or more distinct characters
-     * @return Offset of the first character in <code>setOfChars</code>
-     * found, or -1 if not found.
-     * @see #quotedIndexOf
-     */
-    static int32_t indexOf(const UnicodeString& text,
-                           int32_t start, int32_t limit,
-                           const UnicodeString& setOfChars);
-    
-    /**
-     * Returns the index of the first character in a set.  Unlike
-     * String.indexOf(), this method searches not for a single character, but
-     * for any character of the string <code>setOfChars</code>.
-     * @param text text to be searched
-     * @param setOfChars string with one or more distinct characters
-     * @return Offset of the first character in <code>setOfChars</code>
-     * found, or -1 if not found.
-     * @see #quotedIndexOf
-     */
-    static int32_t indexOf(const UnicodeString& text,
-                           const UnicodeString& setOfChars);
-    
 };

 #endif
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -25,6 +25,7 @@
 * after the <code>key</code>
 * @param cursorPos a position for the cursor after the <code>output</code>
 * is emitted.  If less than zero, then the cursor is placed after the
+
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>.  If greater than
 * <code>output.length()</code> then an exception is thrown.
@ -37,55 +38,93 @@ TransliterationRule::TransliterationRule(const UnicodeString& theKey,
                                         const UnicodeString& thePostContext,
                                         int32_t theCursorPos,
                                         UErrorCode &status) :
-    key(theKey), output(theOutput),
-    anteContext(theAnteContext),
-    postContext(thePostContext),
-    cursorPos(theCursorPos),
-    maskKey(0) {
-
+    output(theOutput),
+    cursorPos(theCursorPos)
+{
    if (U_FAILURE(status)) {
        return;
    }
-
+    anteContextLength = theAnteContext.length();
+    keyLength = theKey.length();
+    pattern = theAnteContext;
+    pattern.append(theKey).append(thePostContext);
    if (cursorPos < 0) {
        cursorPos = output.length();
    }
    if (cursorPos > output.length()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
-    /* The mask key is needed when we are adding individual rules to a rule
-     * set, for performance.  Here are the numbers: Without mask key, 13.0
-     * seconds.  With mask key, 6.2 seconds.  However, once the rules have
-     * been added to the set, then they can be discarded to free up space.
-     * This is what the freeze() method does.  After freeze() has been
-     * called, the method masks() must NOT be called.
-     */
-    maskKey = new UnicodeString(key);
-    if (maskKey == 0) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-    } else {
-        maskKey->append(postContext);
-    }
 }

-TransliterationRule::~TransliterationRule() {
-    delete maskKey;
+/**
+ * Construct a new rule from a combined input string (ante context, key,
+ * post context), an output string, and an optional output cursor position.
+ * @param input input string, including key and optional ante and
+ * post context
+ * @param anteContextPos offset into input to end of ante context; any
+ * negative value means none.  Otherwise must be <= input.length().
+ * @param postContextPos offset into input to start of post context; any
+ * negative value means none.  Otherwise must be >= the ante context
+ * length and <= input.length().
+ * @param output output string
+ * @param cursorPos offset into output at which cursor is located; any
+ * negative value places the cursor at <code>output.length()</code>.
+ * @param status set to U_ILLEGAL_ARGUMENT_ERROR if an offset is out of
+ * range; the constructor then returns before pattern and output are
+ * assigned.  If status is already a failure on entry, no field is assigned.
+ */
+TransliterationRule::TransliterationRule(const UnicodeString& input,
+                                         int32_t anteContextPos, int32_t postContextPos,
+                                         const UnicodeString& output,
+                                         int32_t cursorPos,
+                                         UErrorCode& status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // Do range checks only when warranted to save time
+    if (anteContextPos < 0) {
+        anteContextLength = 0;
+    } else {
+        if (anteContextPos > input.length()) {
+            // throw new IllegalArgumentException("Invalid ante context");
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        anteContextLength = anteContextPos;
+    }
+    if (postContextPos < 0) {
+        keyLength = input.length() - anteContextLength;
+    } else {
+        if (postContextPos < anteContextLength ||
+            postContextPos > input.length()) {
+            // throw new IllegalArgumentException("Invalid post context");
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        keyLength = postContextPos - anteContextLength;
+    }
+    if (cursorPos < 0) {
+        this->cursorPos = output.length();
+    } else {
+        if (cursorPos > output.length()) {
+            // throw new IllegalArgumentException("Invalid cursor position");
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        this->cursorPos = cursorPos;
+    }
+    pattern = input;
+    this->output = output;
 }

+TransliterationRule::~TransliterationRule() {}
+
 /**
- * Return the length of the key.  Equivalent to <code>getKey().length()</code>.
+ * Return the length of the key.
 * @return the length of the match key.
 */
 int32_t TransliterationRule::getKeyLength(void) const {
-    return key.length();
-}
-
-/**
- * Return the key.
- * @return the match key.
- */
-const UnicodeString& TransliterationRule::getKey(void) const {
-    return key;
+    return keyLength;
 }

 /**
@ -110,7 +149,45 @@ int32_t TransliterationRule::getCursorPos(void) const {
 * <code>getMaximumContextLength()</code>.
 */
 int32_t TransliterationRule::getAnteContextLength(void) const {
-    return anteContext.length();
+    return anteContextLength;
+}
+
+/**
+ * Internal method.  Returns the index value for this rule: the low
+ * byte (0..255) of the first character of the key.  Returns -1 instead
+ * if data.lookupSet() finds a set for that character, or if nothing
+ * follows the ante context, since the rule can then match multiple keys.
+ */
+int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) {
+    if (anteContextLength == pattern.length()) {
+        // Nothing follows the ante context {such as foo)>bar}, so the
+        // rule can match any key.
+        return -1;
+    }
+    UChar c = pattern.charAt(anteContextLength);
+    return data.lookupSet(c) == NULL ? (c & 0xFF) : -1;
+}
+
+/**
+ * Internal method.  Returns true if this rule matches the given
+ * index value.  The index value is an 8-bit integer, 0..255,
+ * representing the low byte of the first character of the key.
+ * @param v the index value to test
+ * @param data rule data, used to look up the set (if any) that the
+ * first character of the key stands for
+ * @return TRUE if nothing follows the ante context, as in foo)>bar;
+ * otherwise set->containsIndexValue(v), or uint8_t(c) == v if no set
+ */
+bool_t TransliterationRule::matchesIndexValue(uint8_t v,
+                                   const TransliterationRuleData& data) {
+    if (anteContextLength == pattern.length()) {
+        // Nothing follows the ante context {such as foo)>bar}, so the
+        // rule can match any key.
+        return TRUE;
+    }
+    UChar c = pattern.charAt(anteContextLength);
+    UnicodeSet* set = data.lookupSet(c);
+    return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
 }

 /**
@ -118,43 +195,37 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
 * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
 * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
 * "[c]a>x" masks "[dc]a>y".
- *
- * <p>This method must not be called after freeze() is called.
 */
 bool_t TransliterationRule::masks(const TransliterationRule& r2) const {
-    /* There are three cases of masking.  In each instance, rule1
-     * masks rule2.
+    /* Rule r1 masks rule r2 if the string formed of the
+     * antecontext, key, and postcontext overlaps in the following
+     * way:
     *
-     * 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
-     *
-     * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
-     * prefix2 ends with prefix1, suffix2 starts with suffix1.
-     *
-     * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
-     * prefix2 ends with prefix1, suffix2 starts with suffix1.
+     * r1:      aakkkpppp
+     * r2:     aaakkkkkpppp
+     *            ^
+     * 
+     * The strings must be aligned at the first character of the
+     * key.  The length of r1 to the left of the alignment point
+     * must be <= the length of r2 to the left; ditto for the
+     * right.  The characters of r1 must equal (or be a superset
+     * of) the corresponding characters of r2.  The superset
+     * operation should be performed to check for UnicodeSet
+     * masking.
     */

    /* LIMITATION of the current mask algorithm: Some rule
     * maskings are currently not detected.  For example,
-     * "{Lu}]a>x" masks "A]a>y".  To detect these sorts of masking,
-     * we need a subset operator on UnicodeSet objects, which we
-     * currently do not have.  This can be added later.
+     * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO
     */
-    return ((maskKey->length() < r2.maskKey->length() &&
-             r2.maskKey->startsWith(*maskKey)) ||
-            (r2.anteContext.length() != 0 && *maskKey == *r2.maskKey &&
-             ((anteContext.length() == 0) ||
-              (anteContext.length() < r2.anteContext.length() &&
-               r2.anteContext.endsWith(anteContext)))));
-}

-/**
- * Free up space.  Once this method is called, masks() must NOT be called.
- * If it is called, an exception will be thrown.
- */
-void TransliterationRule::freeze(void) {
-    delete maskKey;
-    maskKey = 0;
+    int32_t len = pattern.length();
+    int32_t left = anteContextLength;
+    int32_t left2 = r2.anteContextLength;
+    int32_t right = len - left;
+    int32_t right2 = r2.pattern.length() - left2;
+    return left <= left2 && right <= right2 &&
+        0 == r2.pattern.compare(left2 - left, len, pattern);
 }

 /**
@ -186,17 +257,10 @@ bool_t TransliterationRule::matches(const UnicodeString& text,
                                    int32_t cursor,
                                    const TransliterationRuleData& data,
                                    const UnicodeFilter* filter) const {
-    return
-        (anteContext.length() == 0
-         || regionMatches(text, start, limit, result,
-                          cursor - anteContext.length(),
-                          anteContext, data, filter)) &&
-        regionMatches(text, start, limit, result, cursor,
-                      key, data, filter) &&
-        (postContext.length() == 0
-         || regionMatches(text, start, limit, result,
-                          cursor + key.length(),
-                          postContext, data, filter));
+    // Match anteContext, key, and postContext
+    return regionMatches(text, start, limit, result,
+                         cursor - anteContextLength,
+                         pattern, data, filter);
 }

 /**
@ -219,15 +283,10 @@ bool_t TransliterationRule::matches(const Replaceable& text,
                                    int32_t cursor,
                                    const TransliterationRuleData& data,
                                    const UnicodeFilter* filter) const {
-    return
-        (anteContext.length() == 0
-         || regionMatches(text, start, limit, cursor - anteContext.length(),
-                          anteContext, data, filter)) &&
-        regionMatches(text, start, limit, cursor,
-                      key, data, filter) &&
-        (postContext.length() == 0
-         || regionMatches(text, start, limit, cursor + key.length(),
-                          postContext, data, filter));
+    // Match anteContext, key, and postContext
+    return regionMatches(text, start, limit,
+                         cursor - anteContextLength,
+                         pattern, data, filter);
 }

 /**
@ -260,28 +319,10 @@ int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
                                            int32_t cursor,
                                            const TransliterationRuleData& data,
                                            const UnicodeFilter* filter) const {
-    if (anteContext.length() != 0
-        && !regionMatches(text, start, limit, cursor - anteContext.length(),
-                          anteContext, data, filter)) {
-        return MISMATCH;
-    }
-    int32_t len = getRegionMatchLength(text, start, limit, cursor,
-                                       key, data, filter);
-    if (len < 0) {
-        return MISMATCH;
-    }
-    if (len < key.length()) {
-        return PARTIAL_MATCH;
-    }
-    if (postContext.length() == 0) {
-        return FULL_MATCH;
-    }
-    len = getRegionMatchLength(text, start, limit,
-                               cursor + key.length(),
-                               postContext, data, filter);
-    return (len < 0) ? MISMATCH
-                     : ((len == postContext.length()) ? FULL_MATCH
-                                                      : PARTIAL_MATCH);
+    int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength,
+                                   pattern, data, filter);
+    return len < anteContextLength ? MISMATCH :
+        (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
 }

 /**
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -72,9 +72,13 @@ public:
 private:

    /**
-     * The string that must be matched.
+     * The string that must be matched, consisting of the anteContext, key,
+     * and postContext, concatenated together, in that order.  Some components
+     * may be empty (zero length).
+     * @see anteContextLength
+     * @see keyLength
     */
-    UnicodeString key;
+    UnicodeString pattern;

    /**
     * The string that is emitted if the key, anteContext, and postContext
@ -83,16 +87,18 @@ private:
    UnicodeString output;

    /**
-     * The string that must match before the key.  If empty, then
-     * there is no matching requirement before the key.
+     * The length of the string that must match before the key.  If
+     * zero, then there is no matching requirement before the key.
+     * Substring [0,anteContextLength) of pattern is the anteContext.
     */
-    UnicodeString anteContext;
+    int32_t anteContextLength;

    /**
-     * The string that must match after the key.  If empty, then there
-     * is no matching requirement after the key.
+     * The length of the key.  Substring [anteContextLength,
+     * anteContextLength + keyLength) of pattern is the key; the
+     * remainder of pattern after that is the postContext.
     */
-    UnicodeString postContext;
+    int32_t keyLength;

    /**
     * The position of the cursor after emitting the output string, from 0 to
@ -101,12 +107,6 @@ private:
     */
    int32_t cursorPos;

-    /**
-     * A string used to implement masks().
-     * @see #freeze
-     */
-    UnicodeString* maskKey;
-
 public:

    /**
@ -134,6 +134,29 @@ public:
                        int32_t theCursorPos,
                        UErrorCode &status);

+    /**
+     * Construct a new rule from a combined input string (ante context, key,
+     * post context), an output string, and an optional output cursor position.
+     * @param input input string, including key and optional ante and
+     * post context
+     * @param anteContextPos offset into input to end of ante context; any
+     * negative value means none.  Otherwise must be <= input.length().
+     * @param postContextPos offset into input to start of post context; any
+     * negative value means none.  Otherwise must be >= the ante context
+     * length and <= input.length().
+     * @param output output string
+     * @param cursorPos offset into output at which cursor is located; any
+     * negative value places the cursor at <code>output.length()</code>.
+     * @param status set to U_ILLEGAL_ARGUMENT_ERROR if an offset is out of
+     * range; the constructor then returns before pattern and output are
+     * assigned.  If status is already a failure on entry, no field is assigned.
+     */
+    TransliterationRule(const UnicodeString& input,
+                        int32_t anteContextPos, int32_t postContextPos,
+                        const UnicodeString& output,
+                        int32_t cursorPos,
+                        UErrorCode& status);
+
    /**
     * Destructor.
     */
@ -145,12 +168,6 @@ public:
     */
    virtual int32_t getKeyLength(void) const;

-    /**
-     * Return the key.
-     * @return the match key.
-     */
-    virtual const UnicodeString& getKey(void) const;
-
    /**
     * Return the output string.
     * @return the output string.
@ -170,22 +187,39 @@ public:
     */
    virtual int32_t getAnteContextLength(void) const;

+private:
+    friend class TransliterationRuleSet;
+
+    /**
+     * Internal method.  Returns the index value for this rule: the low
+     * byte (0..255) of the first character of the key.  Returns -1 instead
+     * if data.lookupSet() finds a set for that character, or if nothing
+     * follows the ante context, since the rule can then match multiple keys.
+     */
+    int16_t getIndexValue(const TransliterationRuleData& data);
+
+    /**
+     * Internal method.  Returns true if this rule matches the given
+     * index value.  The index value is an 8-bit integer, 0..255,
+     * representing the low byte of the first character of the key.
+     * @param v the index value to test
+     * @param data rule data, used to look up the set (if any) that the
+     * first character of the key stands for
+     * @return TRUE if nothing follows the ante context, as in foo)>bar;
+     * otherwise set->containsIndexValue(v), or uint8_t(c) == v if no set
+     */
+    bool_t matchesIndexValue(uint8_t v,
+                             const TransliterationRuleData& data);
+
+public:
    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
     * "[c]a>x" masks "[dc]a>y".
-     *
-     * <p>This method must not be called after freeze() is called.
     */
    virtual bool_t masks(const TransliterationRule& r2) const;

-    /**
-     * Free up space.  Once this method is called, masks() must NOT be called.
-     * If it is called, an exception will be thrown.
-     */
-    virtual void freeze(void);
-
    /**
     * Return true if this rule matches the given text.  The text being matched
     * occupies a virtual buffer consisting of the contents of
--- a/icu4c/source/i18n/rbt_set.cpp
+++ b/icu4c/source/i18n/rbt_set.cpp
@ -30,6 +30,16 @@
 */
 TransliterationRuleSet::TransliterationRuleSet() {
    maxContextLength = 0;
+    ruleVector = new UVector();
+    rules = NULL;
+}
+
+/**
+ * Destructor.  Deletes ruleVector and the rules array.
+ */
+TransliterationRuleSet::~TransliterationRuleSet() {
+    delete ruleVector;
+    delete[] rules;
 }

 /**
@ -45,31 +55,22 @@ int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
 * significant.
 *
 * <p>Once freeze() is called, this method must not be called.
- * @param rule the rule to add
+ * @param adoptedRule the rule to add
 */
 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
                                     UErrorCode& status) {
-    
-    // Build time, no checking  : 3562 ms
-    // Build time, with checking: 6234 ms
-
    if (U_FAILURE(status)) {
        delete adoptedRule;
        return;
    }
-
-    for (int32_t i=0; i<rules.size(); ++i) {
-        TransliterationRule* r = (TransliterationRule*) rules.elementAt(i);
-        if (r->masks(*adoptedRule)) {
-            //throw new IllegalArgumentException("Rule " + rule +
-            //                                   " must precede " + r);
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            delete adoptedRule;
-            return;
-        }
+    if (ruleVector == NULL) {
+        // throw new IllegalArgumentException("Cannot add rules after freezing");
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        delete adoptedRule;
+        return;
    }
+    ruleVector->addElement(adoptedRule);

-    rules.addElement(adoptedRule);
    int32_t len;
    if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
        maxContextLength = len;
@ -77,13 +78,120 @ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
 }

 /**
- * Free up space.  Once this method is called, addRule() must NOT
- * be called again.
+ * Close this rule set to further additions, check it for masked rules,
+ * and index it to optimize performance.  Once this method is called,
+ * addRule() can no longer be called, and calling freeze() again is
+ * reported as an error.
+ * @param data rule data handed to getIndexValue() and matchesIndexValue()
+ * while the rules are sorted into bins
+ * @param status set to U_ILLEGAL_ARGUMENT_ERROR if the set is already
+ * frozen or if a rule masks a later rule in the same bin; in the masking
+ * case the set has already been indexed and stays frozen
  */
-void TransliterationRuleSet::freeze(void) {
-    for (int32_t i=0; i<rules.size(); ++i) {
-        ((TransliterationRule*) rules.elementAt(i))->freeze();
+void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
+                                    UErrorCode& status) {
+    if (U_FAILURE(status)) {
+        return;
    }
+    if (ruleVector == NULL) {
+        // Already frozen: the vector is deleted below once the rules have
+        // been copied into the array, and addRule() treats NULL the same way.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    /* Construct the rule array and index table.  We reorder the
+     * rules by sorting them into 256 bins.  Each bin contains all
+     * rules matching the index value for that bin.  A rule
+     * matches an index value if a string whose first key character
+     * has a low byte equal to the index value can match the rule.
+     *
+     * Each bin contains zero or more rules, in the same order
+     * they were found originally.  However, the total rules in
+     * the bins may exceed the number in the original vector,
+     * since rules that have a variable as their first key
+     * character will generally fall into more than one bin.
+     *
+     * That is, each bin contains all rules that either have that
+     * first index value as their first key character, or have
+     * a set containing the index value as their first character.
+     */
+    int32_t n = ruleVector->size();
+    int32_t j;
+    int16_t x;
+    UVector v(2*n); // heuristic; adjust as needed
+
+    /* Precompute the index values.  This saves a LOT of time.
+     */
+    int16_t* indexValue = new int16_t[n];
+    for (j=0; j<n; ++j) {
+        TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
+        indexValue[j] = r->getIndexValue(data);
+    }
+    for (x=0; x<256; ++x) {
+        index[x] = v.size();
+        for (j=0; j<n; ++j) {
+            if (indexValue[j] >= 0) {
+                if (indexValue[j] == x) {
+                    v.addElement(ruleVector->elementAt(j));
+                }
+            } else {
+                // If the indexValue is < 0, then the first key character is
+                // a set, and we must use the more time-consuming
+                // matchesIndexValue check.  In practice this happens
+                // rarely, so we seldom tread this code path.
+                TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
+                if (r->matchesIndexValue((uint8_t)x, data)) {
+                    v.addElement(r);
+                }
+            }
+        }
+    }
+    delete[] indexValue;
+    index[256] = v.size();
+
+    /* Freeze things into an array.
+     */
+    rules = new TransliterationRule*[v.size()];
+    for (j=0; j<v.size(); ++j) {
+        rules[j] = (TransliterationRule*) v.elementAt(j);
+    }
+    delete ruleVector;
+    ruleVector = NULL;
+
+    // TODO Add error reporting that indicates the rules that
+    //      are being masked.
+    //UnicodeString errors;
+
+    /* Check for masking.  This is MUCH faster than our old check,
+     * which was each rule against each following rule, since we
+     * only have to check for masking within each bin now.  It's
+     * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
+     * count, and n2 is the per-bin rule count.  But n2<<n1, so
+     * it's a big win.
+     */
+    for (x=0; x<256; ++x) {
+        for (j=index[x]; j<index[x+1]-1; ++j) {
+            TransliterationRule* r1 = rules[j];
+            for (int32_t k=j+1; k<index[x+1]; ++k) {
+                TransliterationRule* r2 = rules[k];
+                if (r1->masks(*r2)) {
+//|                 if (errors == null) {
+//|                     errors = new StringBuffer();
+//|                 } else {
+//|                     errors.append("\n");
+//|                 }
+//|                 errors.append("Rule " + r1 + " masks " + r2);
+                    status = U_ILLEGAL_ARGUMENT_ERROR;
+                    return;
+                }
+            }
+        }
+    }
+
+    //if (errors != null) {
+    //    throw new IllegalArgumentException(errors.toString());
+    //}
 }

 /**
@ -119,15 +216,18 @@ TransliterationRuleSet::findMatch(const UnicodeString& text,
                                  int32_t cursor,
                                  const TransliterationRuleData& data,
                                  const UnicodeFilter* filter) const {
-    for (int32_t i=0; i<rules.size(); ++i) {
-        TransliterationRule* rule =
-            (TransliterationRule*) rules.elementAt(i);
-        if (rule->matches(text, start, limit, result,
-                          cursor, data, filter)) {
-            return rule;
+    /* We only need to check our indexed bin of the rule table,
+     * based on the low byte of the first key character.
+     */
+    int32_t rlen = result.length();
+    int16_t x = 0xFF & (cursor < rlen ? result.charAt(cursor)
+                        : text.charAt(cursor - rlen + start));
+    for (int32_t i=index[x]; i<index[x+1]; ++i) {
+        if (rules[i]->matches(text, start, limit, result, cursor, data, filter)) {
+            return rules[i];
        }
    }
-    return 0;
+    return NULL;
 }

 /**
@ -154,15 +254,16 @@ TransliterationRuleSet::findMatch(const Replaceable& text,
                                  int32_t cursor,
                                  const TransliterationRuleData& data,
                                  const UnicodeFilter* filter) const {
-    for (int32_t i=0; i<rules.size(); ++i) {
-        TransliterationRule* rule =
-            (TransliterationRule*) rules.elementAt(i);
-        if (rule->matches(text, start, limit, cursor,
-                          data, filter)) {
-            return rule;
+    /* We only need to check our indexed bin of the rule table,
+     * based on the low byte of the first key character.
+     */
+    int16_t x = text.charAt(cursor) & 0xFF;
+    for (int32_t i=index[x]; i<index[x+1]; ++i) {
+        if (rules[i]->matches(text, start, limit, cursor, data, filter)) {
+            return rules[i];
        }
    }
-    return 0;
+    return NULL;
 }

 /**
@ -199,19 +300,22 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
                                             const TransliterationRuleData& data,
                                             bool_t& isPartial,
                                             const UnicodeFilter* filter) const {
+
+    /* We only need to check our indexed bin of the rule table,
+     * based on the low byte of the first key character.
+     */
    isPartial = FALSE;
-    for (int32_t i=0; i<rules.size(); ++i) {
-        TransliterationRule* rule =
-            (TransliterationRule*) rules.elementAt(i);
-        int32_t match = rule->getMatchDegree(text, start, limit, cursor,
-                                             data, filter);
+    int16_t x = text.charAt(cursor) & 0xFF;
+    for (int32_t i=index[x]; i<index[x+1]; ++i) {
+        int32_t match = rules[i]->getMatchDegree(text, start, limit, cursor,
+                                                 data, filter);
        switch (match) {
        case TransliterationRule::FULL_MATCH:
-            return rule;
+            return rules[i];
        case TransliterationRule::PARTIAL_MATCH:
            isPartial = TRUE;
-            return 0;
+            return NULL;
        }
    }
-    return 0;
+    return NULL;
 }
--- a/icu4c/source/i18n/rbt_set.h
+++ b/icu4c/source/i18n/rbt_set.h
@ -30,15 +30,30 @@ class UnicodeString;
 */
 class TransliterationRuleSet {
    /**
-     * Vector of rules, in the order added.
+     * Vector of rules, in the order added.  This is only used while the rule
+     * set is getting built.  After that, freeze() reorders and indexes the
+     * rules, and this Vector is freed.
     */
-    UVector rules;
+    UVector* ruleVector;

    /**
     * Length of the longest preceding context
     */
    int32_t maxContextLength;

+    /**
+     * Sorted and indexed table of rules.  This is created by freeze() from
+     * the rules in ruleVector.
+     */
+    TransliterationRule** rules;
+
+    /**
+     * Index table.  For text having a first character c, compute x = c&0xFF.
+     * Now use rules[index[x]..index[x+1]-1].  This index table is created by
+     * freeze().
+     */
+    int32_t index[257];
+
 public:

    /**
@ -46,6 +61,11 @@ public:
     */
    TransliterationRuleSet();

+    /**
+     * Destructor.
+     */
+    virtual ~TransliterationRuleSet();
+
    /**
     * Return the maximum context length.
     * @return the length of the longest preceding context.
@ -57,16 +77,19 @@ public:
     * significant.
     *
     * <p>Once freeze() is called, this method must not be called.
-     * @param rule the rule to add
+     * @param adoptedRule the rule to add
     */
    virtual void addRule(TransliterationRule* adoptedRule,
                         UErrorCode& status);

    /**
-     * Free up space.  Once this method is called, addRule() must NOT
-     * be called again.
+     * Close this rule set to further additions, check it for masked rules,
+     * and index it to optimize performance.  Once this method is called,
+     * addRule() can no longer be called.
+     * @exception IllegalArgumentException if some rules are masked
     */
-    virtual void freeze(void);
+    virtual void freeze(const TransliterationRuleData& data,
+                        UErrorCode& status);

    /**
     * Attempt to find a matching rule at the specified point in the text.  The
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -14,7 +14,7 @@

 // N.B.: This mapping is different in ICU and Java
 const UnicodeString UnicodeSet::CATEGORY_NAMES(
-    "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf");
+    "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");

 /**
 * A cache mapping character category integers, as returned by
@ -28,7 +28,7 @@ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
 * Delimiter string used in patterns to close a category reference:
 * ":]".  Example: "[:Lu:]".
 */
-const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
+const UnicodeString UnicodeSet::CATEGORY_CLOSE = UNICODE_STRING(":]", 2);

 /**
 * Delimiter char beginning a variable reference:
@ -69,23 +69,31 @ UnicodeSet::UnicodeSet() : pairs() {}
 * white space.  See the class description for the syntax of the
 * pattern language.
 * @param pattern a string specifying what characters are in the set
- * @param ignoreSpaces if <code>true</code>, all spaces in the
- * pattern are ignored, except those preceded by '\\'.  Spaces are
- * those characters for which <code>Character.isSpaceChar()</code>
- * is <code>true</code>.
 * @exception <code>IllegalArgumentException</code> if the pattern
 * contains a syntax error.
 */
-UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces,
-                       UErrorCode& status) : pairs() {
-    applyPattern(pattern, ignoreSpaces, status);
-}
-
 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
                       UErrorCode& status) : pairs() {
    applyPattern(pattern, status);
 }

+/**
+ * Constructs a set by handing <code>pattern</code>, the parse position
+ * <code>pos</code>, and <code>data</code> straight to parse().  Unlike
+ * applyPattern(), this constructor does not itself check what follows the
+ * position where parsing stopped.
+ * @param pattern the text containing the set pattern
+ * @param pos passed through to parse(); presumably the start offset on
+ * input and the offset after the set on output -- confirm against parse()
+ * @param data passed through to parse(); applyPattern() passes NULL here
+ * @param status passed through to parse()
+ */
+UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
+                       const TransliterationRuleData* data,
+                       UErrorCode& status) : pairs() {
+    parse(pairs, pattern, pos, data, status);
+}
+
 /**
 * Constructs a set from the given Unicode character category.
 * @param category an integer indicating the character category as
@ -164,50 +161,24 @@ int32_t UnicodeSet::hashCode(void) const {
 * contains a syntax error.
 */
 void UnicodeSet::applyPattern(const UnicodeString& pattern,
-                              bool_t ignoreSpaces,
                              UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }

    ParsePosition pos(0);
-	UnicodeString* pat = (UnicodeString*) &pattern;
+    parse(pairs, pattern, pos, NULL, status);

-    // To ignore spaces, create a new pattern without spaces.  We
-    // have to process all '\' escapes.  If '\' is encountered,
-    // insert it and the following character (if any -- let parse
-    // deal with any syntax errors) in the pattern.  This allows
-    // escaped spaces.
-    if (ignoreSpaces) {
-		pat = new UnicodeString();
-        for (int32_t i=0; i<pattern.length(); ++i) {
-            UChar c = pattern.charAt(i);
-            if (Unicode::isSpaceChar(c)) {
-                continue;
-            }
-            if (c == '\\' && (i+1) < pattern.length()) {
-                pat->append(c);
-                c = pattern.charAt(++i);
-                // Fall through and append the following char
-            }
-            pat->append(c);
-        }
+    // Skip over trailing whitespace
+    int32_t i = pos.getIndex();
+    int32_t n = pattern.length();
+    while (i<n && Unicode::isWhitespace(pattern.charAt(i))) {
+        ++i;
    }

-    parse(pairs, *pat, pos, NULL, status);
-
-    // Skip over trailing whitespace -- clean up later
-    while (pos.getIndex() < pat->length() &&
-           Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
-        pos.setIndex(pos.getIndex() + 1);
-    }
-
-    if (pos.getIndex() != pat->length()) {
+    if (i != n) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
-	if (pat != &pattern) {
-		delete pat;
-	}
 }

 /**
@ -279,6 +250,34 @@ bool_t UnicodeSet::contains(UChar c) const {
    return contains(c, c);
 }

+/**
+ * Returns <tt>true</tt> if this set contains any character whose low byte
+ * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
+ * indexing.
+ */
+bool_t UnicodeSet::containsIndexValue(uint8_t v) const {
+    /* The index value v, in the range [0,255], is contained in this set if
+     * it is contained in any pair of this set.  For a pair aaxx..aayy with
+     * equal high bytes, v is contained if xx <= v <= yy.  For aaxx..bbyy,
+     * bb>aa, a pair whose ends are 0x100 or more apart covers every low
+     * byte; a shorter one wraps once, so v is contained if xx <= v ||
+     * v <= yy.  (The wrap test is identical to the time zone month
+     * containment logic.)
+     */
+    for (int32_t i=0; i<pairs.length(); i+=2) {
+        UChar low = pairs.charAt(i);
+        UChar high = pairs.charAt(i+1);
+        if ((low & 0xFF00) == (high & 0xFF00)) {
+            if (uint8_t(low) <= v && v <= uint8_t(high)) {
+                return TRUE;
+            }
+        } else if (high - low >= 0x100 || uint8_t(low) <= v || v <= uint8_t(high)) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
 /**
 * Adds the specified range to this set if it is not already
 * present.  If this set already contains the specified range,