From 572e9063c00af4004de0e528ffe6b2b53362962a Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Tue, 11 Jan 2000 02:25:03 +0000 Subject: [PATCH] Rewrite UnicodeSet and RBT parsers for better performance and new syntax X-SVN-Rev: 519 --- .../ibm/icu/text/RuleBasedTransliterator.java | 1078 ++++++++++------- .../com/ibm/icu/text/TransliterationRule.java | 78 +- icu4j/src/com/ibm/icu/text/UnicodeSet.java | 522 ++++---- .../com/ibm/text/RuleBasedTransliterator.java | 1078 ++++++++++------- .../src/com/ibm/text/TransliterationRule.java | 78 +- icu4j/src/com/ibm/text/UnicodeSet.java | 522 ++++---- 6 files changed, 1960 insertions(+), 1396 deletions(-) diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java index 572a959963..7337a05292 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java @@ -2,6 +2,7 @@ package com.ibm.text; import java.util.Hashtable; import java.util.Vector; +import java.text.ParsePosition; /** * A transliterator that reads a set of rules in order to determine how to @@ -181,9 +182,12 @@ import java.util.Vector; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.7 $ $Date: 2000/01/06 01:36:36 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.8 $ $Date: 2000/01/11 02:25:03 $ * * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.8 2000/01/11 02:25:03 Alan + * Rewrite UnicodeSet and RBT parsers for better performance and new syntax + * * Revision 1.7 2000/01/06 01:36:36 Alan * Allow string arrays in rule resource bundles * @@ -195,7 +199,6 @@ import java.util.Vector; * * Revision 1.4 1999/12/22 01:05:54 Alan * Improve masking checking; turn it off by default, for better performance - * */ public class RuleBasedTransliterator extends Transliterator { /** @@ -214,8 +217,6 @@ public class RuleBasedTransliterator extends Transliterator { static final boolean DEBUG = false; - static final boolean CHECK_MASKING = true; - private static final String COPYRIGHT = "\u00A9 IBM Corporation 1999. 
All rights reserved."; @@ -561,33 +562,34 @@ public class RuleBasedTransliterator extends Transliterator { private static final char VARIABLE_DEF_OP = '='; private static final char FORWARD_RULE_OP = '>'; private static final char REVERSE_RULE_OP = '<'; - private static final char FWDREV_RULE_OP = '~'; // internal rep of FWDREF_OP_STRING + private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op private static final String OPERATORS = "=><"; - // Forward-Reverse operator - // a<>b is equivalent to ab - private static final String FWDREV_OP_STRING = "<>"; // must have length 2 - // Other special characters private static final char QUOTE = '\''; + private static final char ESCAPE = '\\'; + private static final char END_OF_RULE = ';'; + private static final char RULE_COMMENT_CHAR = '#'; + private static final char VARIABLE_REF_OPEN = '{'; private static final char VARIABLE_REF_CLOSE = '}'; - private static final char CONTEXT_OPEN = '['; - private static final char CONTEXT_CLOSE = ']'; + private static final char CONTEXT_OPEN = '('; + private static final char CONTEXT_CLOSE = ')'; + private static final char SET_OPEN = '['; + private static final char SET_CLOSE = ']'; private static final char CURSOR_POS = '|'; - private static final char RULE_COMMENT_CHAR = '#'; /** * Specials must be quoted in rules to be used as literals. * Specials may not occur in variable names. */ - private static final String SPECIALS = "'{}[]|#" + OPERATORS; +//! private static final String SPECIALS = "{}[]|" + OPERATORS; /** * Specials that must be quoted in variable definitions. */ - private static final String DEF_SPECIALS = "'{}"; +//! 
private static final String DEF_SPECIALS = "{}"; /** * @param rules list of rules, separated by semicolon characters @@ -616,37 +618,12 @@ public class RuleBasedTransliterator extends Transliterator { determineVariableRange(ruleArray); StringBuffer errors = null; - for (int irule=0; irule0 && rules.charAt(limit-1) == '\\') { - limit = rules.indexOf(';', limit+1); - } - - if (limit == -1) { - limit = n; - } - // Skip over empty lines and line starting with # - if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) { - try { - applyRule(i, limit); - } catch (IllegalArgumentException e) { - if (errors == null) { - errors = new StringBuffer(e.getMessage()); - } else { - errors.append("\n").append(e.getMessage()); - } - } - } - i = limit + 1; - } + try { + parseRuleArray(ruleArray); + } catch (IllegalArgumentException e) { + errors = new StringBuffer(e.getMessage()); } - + // Index the rules try { data.ruleSet.freeze(data.setVariables); @@ -663,411 +640,684 @@ public class RuleBasedTransliterator extends Transliterator { } } - /** - * Parse the given substring as a rule, and append it to the rules currently - * represented in this object. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @exception IllegalArgumentException if there is a syntax error in the - * rules - */ - private void applyRule(int start, int limit) { - /* General description of parsing: Initially, rules contain two types of - * quoted characters. First, there are variable references, such as - * "{alpha}". Second, there are quotes, such as "'<'" or "''". One of - * the first steps in parsing a rule is to resolve such quoted matter. - * Quotes are removed early, leaving unquoted literal matter. Variable - * references are resolved and replaced by single characters. In some - * instances these characters represent themselves; in others, they - * stand for categories of characters. 
Character categories are either - * predefined (e.g., "{Lu}"), or are defined by the user using a - * statement (e.g., "vowels:aeiouAEIOU"). - * - * Another early step in parsing is to split each rule into component - * pieces. These pieces are, for every rule, a left-hand side, a right- - * hand side, and an operator. The left- and right-hand sides may not - * be empty, except for the output patterns of forward and reverse - * rules. In addition to this partitioning, the match patterns of - * forward and reverse rules must be partitioned into antecontext, - * postcontext, and literal pattern, where the context portions may or - * may not be present. Finally, output patterns must have the cursor - * indicator '|' detected and removed, with its position recorded. - * - * Quote removal, variable resolution, and sub-pattern splitting must - * all happen at once. This is due chiefly to the quoting mechanism, - * which allows special characters to appear at arbitrary positions in - * the final unquoted text. (For this reason, alteration of the rule - * language is somewhat clumsy; it entails reassessment and revision of - * the parsing methods as a whole.) - * - * After this processing of rules is complete, the final end products - * are unquoted pieces of text of various types, and an integer cursor - * position, if one is specified. These processed raw materials are now - * easy to deal with; other classes such as UnicodeSet and - * TransliterationRule need know nothing of quoting or variables. 
- */ - StringBuffer left = new StringBuffer(); - StringBuffer right = new StringBuffer(); - StringBuffer anteContext = new StringBuffer(); - StringBuffer postContext = new StringBuffer(); - int cursorPos[] = new int[1]; - char operator = parseRule(start, limit, left, right, - anteContext, postContext, cursorPos); + + + + + + + private void parseRuleArray(String[] ruleArray) { + String[] leftRight = new String[2]; + char[] op = new char[1]; + for (int i=0; i= 0) { + if (operator != 0) { + syntaxError("Unquoted " + c, rule, start); + } + // Found an operator char. Check for forward-reverse operator. + if (c == REVERSE_RULE_OP && + (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { + ++pos; + operator = FWDREV_RULE_OP; + } else { + operator = c; + } + left = buf.toString(); // lhs + leftCursor = cursor; + leftAnte = ante; + leftPost = post; + leftPostClose = postClose; + + buf.setLength(0); + cursor = ante = post = postClose = -1; + continue; + } + switch (c) { + case END_OF_RULE: + break main; + case VARIABLE_REF_OPEN: + { + int j = rule.indexOf(VARIABLE_REF_CLOSE, pos); + if (pos == j || j < 0) { // empty or unterminated + syntaxError("Malformed variable reference", rule, start); + } + String name = rule.substring(pos, j); + pos = j+1; + buf.append(getVariableDef(name).charValue()); + } + break; + case CONTEXT_OPEN: + if (post >= 0) { + syntaxError("Multiple post contexts", rule, start); + } + // Ignore CONTEXT_OPEN if buffer length is zero -- that means + // this is the optional opening delimiter for the ante context. + if (buf.length() > 0) { + post = buf.length(); + } + break; + case CONTEXT_CLOSE: + if (postClose >= 0) { + syntaxError("Unexpected " + c, rule, start); + } + if (post >= 0) { + // This is probably the optional closing delimiter + // for the post context; save the pos and check later. 
+ postClose = buf.length(); + } else if (ante >= 0) { + syntaxError("Multiple ante contexts", rule, start); + } else { + ante = buf.length(); + } + break; + case SET_OPEN: + ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '[' + buf.append(registerSet(new UnicodeSet(rule, pp, + data.variableNames, data.setVariables)).charValue()); + pos = pp.getIndex(); + break; + case VARIABLE_REF_CLOSE: + case SET_CLOSE: + syntaxError("Unquoted " + c, rule, start); + case CURSOR_POS: + if (cursor >= 0) { + syntaxError("Multiple cursors", rule, start); + } + cursor = buf.length(); + break; + default: + buf.append(c); + break; + } + } + if (operator == 0) { + syntaxError("No operator", rule, start); + } + + // Check context close parameters + if ((leftPostClose >= 0 && leftPostClose != left.length()) || + (postClose >= 0 && postClose != buf.length())) { + syntaxError("Extra text after ]", rule, start); + } + + // Context is only allowed on the input side; that is, the left side + // for forward rules. Cursors are only allowed on the output side; + // that is, the right side for forward rules. Bidirectional rules + // ignore elements that do not apply. switch (operator) { case VARIABLE_DEF_OP: - applyVariableDef(left.toString(), right.toString()); + // LHS is the name. RHS is a single character, either a literal + // or a set (already parsed). If RHS is longer than one + // character, it is either a multi-character string, or multiple + // sets, or a mixture of chars and sets -- syntax error. 
+ if (buf.length() != 1) { + syntaxError("Malformed RHS", rule, start); + } + if (data.variableNames.get(left) != null) { + syntaxError("Duplicate definition of {" + + left + "}", rule, start); + } + data.variableNames.put(left, new Character(buf.charAt(0))); break; + case FORWARD_RULE_OP: if (direction == FORWARD) { + if (ante >= 0 || post >= 0 || leftCursor >= 0) { + syntaxError("Malformed rule", rule, start); + } data.ruleSet.addRule(new TransliterationRule( - left.toString(), right.toString(), - anteContext.toString(), postContext.toString(), - cursorPos[0])); + left, leftAnte, leftPost, + buf.toString(), cursor)); } // otherwise ignore the rule; it's not the direction we want break; + case REVERSE_RULE_OP: if (direction == REVERSE) { + if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { + syntaxError("Malformed rule", rule, start); + } data.ruleSet.addRule(new TransliterationRule( - right.toString(), left.toString(), - anteContext.toString(), postContext.toString(), - cursorPos[0])); + buf.toString(), ante, post, + left, leftCursor)); } // otherwise ignore the rule; it's not the direction we want break; + case FWDREV_RULE_OP: - data.ruleSet.addRule(new TransliterationRule( - direction == FORWARD ? left.toString() : right.toString(), - direction == FORWARD ? right.toString() : left.toString(), - // Context & cursor disallowed - "", "", -1)); + if (direction == FORWARD) { + // The output side is the right; trim off any context + String output = buf.toString().substring(ante < 0 ? 0 : ante, + post < 0 ? buf.length() : post); + data.ruleSet.addRule(new TransliterationRule( + left, leftAnte, leftPost, + output, cursor)); + } else { + // The output side is the left; trim off any context + String output = left.substring(leftAnte < 0 ? 0 : leftAnte, + leftPost < 0 ? left.length() : leftPost); + data.ruleSet.addRule(new TransliterationRule( + buf.toString(), ante, post, + output, leftCursor)); + } break; } + + return pos; } - /** - * Add a variable definition. 
- * @param name the name of the variable. It must not already be defined. - * @param pattern the value of the variable. It may be a single character - * or a pattern describing a character set. - * @exception IllegalArgumentException if there is a syntax error - */ - private final void applyVariableDef(String name, String pattern) { - validateVariableName(name); - if (data.variableNames.get(name) != null) { - throw new IllegalArgumentException("Duplicate variable definition: " - + name + '=' + pattern); - } -//! if (UnicodeSet.getCategoryID(name) >= 0) { -//! throw new IllegalArgumentException("Reserved variable name: " -//! + name); -//! } - if (pattern.length() < 1) { - throw new IllegalArgumentException("Variable definition missing: " - + name); - } - if (pattern.length() == 1) { - // Got a single character variable definition - data.variableNames.put(name, new Character(pattern.charAt(0))); - } else { - // Got more than one character; parse it as a category - if (variableNext >= variableLimit) { - throw new RuntimeException("Private use variables exhausted"); - } - Character c = new Character(variableNext++); - data.variableNames.put(name, c); - data.setVariables.put(c, new UnicodeSet(pattern)); + + + private static final void syntaxError(String msg, String rule, int start) { + int end = quotedIndexOf(rule, start, rule.length(), ";"); + if (end < 0) { + end = rule.length(); } + throw new IllegalArgumentException(msg + " in " + + rule.substring(start, end)); } - /** - * Given a rule, parses it into three pieces: The left side, the right side, - * and the operator. Returns the operator. Quotes and variable references - * are resolved; the otuput text in all StringBuffer parameters - * is literal text. This method delegates to other parsing methods to - * handle the match pattern, output pattern, and other sub-patterns in the - * rule. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. 
- * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param left left side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param right right side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param anteContext the preceding context of the match pattern, - * if there is one, is appended to this buffer - * @param postContext the following context of the match pattern, - * if there is one, is appended to this buffer - * @param cursorPos if there is a cursor in the output pattern, its - * offset is stored in cursorPos[0] - * @return The operator character, one of the characters in OPERATORS. - */ - private char parseRule(int start, int limit, - StringBuffer left, StringBuffer right, - StringBuffer anteContext, - StringBuffer postContext, - int[] cursorPos) { - if (false) { - System.err.println("Parsing " + rules.substring(start, limit)); - } - /* Parse the rule into three pieces -- left, operator, and right, - * parsing out quotes. The result is that left and right will have - * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted - * operators throw an exception. Two quotes inside or outside - * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". - */ - int i = quotedIndexOf(rules, start, limit, OPERATORS); - if (i < 0) { - throw new IllegalArgumentException( - "Syntax error: " - + rules.substring(start, limit)); - } - char c = rules.charAt(i); - - // Look for "<>" double rules. 
- if ((i+1) < limit && rules.substring(i, i+2).equals(FWDREV_OP_STRING)) { - if (i == start) { - throw new IllegalArgumentException( - "Empty left side: " - + rules.substring(start, limit)); - } - if (i+2 == limit) { - throw new IllegalArgumentException( - "Empty right side: " - + rules.substring(start, limit)); - } - parseSubPattern(start, i, left, null, SPECIALS); - parseSubPattern(i+2, limit, right, null, SPECIALS); - return FWDREV_RULE_OP; - } - switch (c) { - case FORWARD_RULE_OP: - if (i == start) { - throw new IllegalArgumentException( - "Empty left side: " - + rules.substring(start, limit)); - } - parseMatchPattern(start, i, left, anteContext, postContext); - if (i != (limit-1)) { - parseOutputPattern(i+1, limit, right, cursorPos); - } - break; - case REVERSE_RULE_OP: - if (i == (limit-1)) { - throw new IllegalArgumentException( - "Empty right side: " - + rules.substring(start, limit)); - } - if (i != start) { - parseOutputPattern(start, i, left, cursorPos); - } - parseMatchPattern(i+1, limit, right, anteContext, postContext); - break; - case VARIABLE_DEF_OP: - if (i == start || i == (limit-1)) { - throw new IllegalArgumentException( - "Empty left or right side: " - + rules.substring(start, limit)); - } - parseSubPattern(start, i, left); - parseDefPattern(i+1, limit, right); - break; - default: - throw new RuntimeException(); + +//| /** +//| * Parse the given substring as a rule, and append it to the rules currently +//| * represented in this object. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @exception IllegalArgumentException if there is a syntax error in the +//| * rules +//| */ +//| private void applyRule(int start, int limit) { +//| /* General description of parsing: Initially, rules contain two types of +//| * quoted characters. First, there are variable references, such as +//| * "{alpha}". 
Second, there are quotes, such as "'<'" or "''". One of +//| * the first steps in parsing a rule is to resolve such quoted matter. +//| * Quotes are removed early, leaving unquoted literal matter. Variable +//| * references are resolved and replaced by single characters. In some +//| * instances these characters represent themselves; in others, they +//| * stand for categories of characters. Character categories are either +//| * predefined (e.g., "{Lu}"), or are defined by the user using a +//| * statement (e.g., "vowels:aeiouAEIOU"). +//| * +//| * Another early step in parsing is to split each rule into component +//| * pieces. These pieces are, for every rule, a left-hand side, a right- +//| * hand side, and an operator. The left- and right-hand sides may not +//| * be empty, except for the output patterns of forward and reverse +//| * rules. In addition to this partitioning, the match patterns of +//| * forward and reverse rules must be partitioned into antecontext, +//| * postcontext, and literal pattern, where the context portions may or +//| * may not be present. Finally, output patterns must have the cursor +//| * indicator '|' detected and removed, with its position recorded. +//| * +//| * Quote removal, variable resolution, and sub-pattern splitting must +//| * all happen at once. This is due chiefly to the quoting mechanism, +//| * which allows special characters to appear at arbitrary positions in +//| * the final unquoted text. (For this reason, alteration of the rule +//| * language is somewhat clumsy; it entails reassessment and revision of +//| * the parsing methods as a whole.) +//| * +//| * After this processing of rules is complete, the final end products +//| * are unquoted pieces of text of various types, and an integer cursor +//| * position, if one is specified. These processed raw materials are now +//| * easy to deal with; other classes such as UnicodeSet and +//| * TransliterationRule need know nothing of quoting or variables. 
+//| */ +//| StringBuffer left = new StringBuffer(); +//| StringBuffer right = new StringBuffer(); +//| StringBuffer anteContext = new StringBuffer(); +//| StringBuffer postContext = new StringBuffer(); +//| int cursorPos[] = new int[1]; +//| +//| char operator = parseRule(start, limit, left, right, +//| anteContext, postContext, cursorPos); +//| +//| switch (operator) { +//| case VARIABLE_DEF_OP: +//| applyVariableDef(left.toString(), right.toString()); +//| break; +//| case FORWARD_RULE_OP: +//| if (direction == FORWARD) { +//| data.ruleSet.addRule(new TransliterationRule( +//| left.toString(), right.toString(), +//| anteContext.toString(), postContext.toString(), +//| cursorPos[0])); +//| } // otherwise ignore the rule; it's not the direction we want +//| break; +//| case REVERSE_RULE_OP: +//| if (direction == REVERSE) { +//| data.ruleSet.addRule(new TransliterationRule( +//| right.toString(), left.toString(), +//| anteContext.toString(), postContext.toString(), +//| cursorPos[0])); +//| } // otherwise ignore the rule; it's not the direction we want +//| break; +//| case FWDREV_RULE_OP: +//| data.ruleSet.addRule(new TransliterationRule( +//| direction == FORWARD ? left.toString() : right.toString(), +//| direction == FORWARD ? right.toString() : left.toString(), +//| // Context & cursor disallowed +//| "", "", -1)); +//| break; +//| } +//| } + +//| /** +//| * Add a variable definition. +//| * @param name the name of the variable. It must not already be defined. +//| * @param pattern the value of the variable. It may be a single character +//| * or a pattern describing a character set. 
+//| * @exception IllegalArgumentException if there is a syntax error +//| */ +//| private final void applyVariableDef(String name, String pattern) { +//| validateVariableName(name); +//| if (data.variableNames.get(name) != null) { +//| throw new IllegalArgumentException("Duplicate variable definition: " +//| + name + '=' + pattern); +//| } +//| if (pattern.length() < 1) { +//| throw new IllegalArgumentException("Variable definition missing: " +//| + name); +//| } +//| if (pattern.length() == 1) { +//| // Got a single character variable definition +//| data.variableNames.put(name, new Character(pattern.charAt(0))); +//| } else { +//| // Got more than one character; parse it as a category +//| UnicodeSet set = new UnicodeSet(pattern); +//| data.variableNames.put(name, registerSet(set)); +//| } +//| } + + + + + private final Character registerSet(UnicodeSet set) { + if (variableNext >= variableLimit) { + throw new RuntimeException("Private use variables exhausted"); } + Character c = new Character(variableNext++); + data.setVariables.put(c, set); return c; } - /** - * Parses the match pattern of a forward or reverse rule. Given the raw - * match pattern, return the match text and the context on both sides, if - * any. Resolves all quotes and variables. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the key to be matched will be appended to this buffer - * @param anteContext the preceding context, if any, will be appended - * to this buffer. - * @param postContext the following context, if any, will be appended - * to this buffer. 
- */ - private void parseMatchPattern(int start, int limit, - StringBuffer text, - StringBuffer anteContext, - StringBuffer postContext) { - if (start >= limit) { - throw new IllegalArgumentException( - "Empty expression in rule: " - + rules.substring(start, limit)); - } - if (anteContext != null) { - // Ignore optional opening and closing context characters - if (rules.charAt(start) == CONTEXT_OPEN) { - ++start; - } - if (rules.charAt(limit-1) == CONTEXT_CLOSE) { - --limit; - } - // The four possibilities are: - // key - // anteContext]key - // anteContext]key[postContext - // key[postContext - int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); - int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); - if (ante >= 0 && post >= 0 && ante > post) { - throw new IllegalArgumentException( - "Syntax error in context specifier: " - + rules.substring(start, limit)); - } - if (ante >= 0) { - parseSubPattern(start, ante, anteContext); - start = ante+1; - } - if (post >= 0) { - parseSubPattern(post+1, limit, postContext); - limit = post; - } - } - parseSubPattern(start, limit, text); - } - private final void parseSubPattern(int start, int limit, - StringBuffer text) { - parseSubPattern(start, limit, text, null, SPECIALS); - } - /** - * Parse a variable definition sub pattern. This kind of sub - * pattern differs in the set of characters that are considered - * special. In particular, the '[' and ']' characters are not - * special, since these are used in UnicodeSet patterns. - */ - private final void parseDefPattern(int start, int limit, - StringBuffer text) { - parseSubPattern(start, limit, text, null, DEF_SPECIALS); - } - /** - * Parses the output pattern of a forward or reverse rule. Given the - * output pattern, return the output text and the position of the cursor, - * if any. Resolves all quotes and variables. 
- * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - */ - private final void parseOutputPattern(int start, int limit, - StringBuffer text, - int[] cursorPos) { - parseSubPattern(start, limit, text, cursorPos, SPECIALS); - } - - /** - * Parses a sub-pattern of a rule. Return the text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - * @param specials characters that must be quoted; typically either - * SPECIALS or DEF_SPECIALS. 
- */ - private void parseSubPattern(int start, int limit, - StringBuffer text, - int[] cursorPos, - String specials) { - boolean inQuote = false; - - if (start >= limit) { - throw new IllegalArgumentException("Empty expression in rule"); - } - if (cursorPos != null) { - cursorPos[0] = -1; - } - for (int i=start; i= 0) { - throw new IllegalArgumentException("Multiple cursors: " - + rules.substring(start, limit)); - } - cursorPos[0] = text.length(); - } else if (specials.indexOf(c) >= 0) { - throw new IllegalArgumentException("Unquoted special character: " - + rules.substring(start, limit)); - } else { - text.append(c); - } - } - } - - private static void validateVariableName(String name) { - if (indexOf(name, SPECIALS) >= 0) { - throw new IllegalArgumentException( - "Special character in variable name: " - + name); - } - } +//| /** +//| * Given a rule, parses it into three pieces: The left side, the right side, +//| * and the operator. Returns the operator. Quotes and variable references +//| * are resolved; the otuput text in all StringBuffer parameters +//| * is literal text. This method delegates to other parsing methods to +//| * handle the match pattern, output pattern, and other sub-patterns in the +//| * rule. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). 
+//| * @param left left side of rule is appended to this buffer +//| * with the quotes removed and variables resolved +//| * @param right right side of rule is appended to this buffer +//| * with the quotes removed and variables resolved +//| * @param anteContext the preceding context of the match pattern, +//| * if there is one, is appended to this buffer +//| * @param postContext the following context of the match pattern, +//| * if there is one, is appended to this buffer +//| * @param cursorPos if there is a cursor in the output pattern, its +//| * offset is stored in cursorPos[0] +//| * @return The operator character, one of the characters in OPERATORS. +//| */ +//| private char parseRule(int start, int limit, +//| StringBuffer left, StringBuffer right, +//| StringBuffer anteContext, +//| StringBuffer postContext, +//| int[] cursorPos) { +//| if (false) { +//| System.err.println("Parsing " + rules.substring(start, limit)); +//| } +//| /* Parse the rule into three pieces -- left, operator, and right, +//| * parsing out quotes. The result is that left and right will have +//| * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted +//| * operators throw an exception. Two quotes inside or outside +//| * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". +//| */ +//| int i = quotedIndexOf(rules, start, limit, OPERATORS); +//| if (i < 0) { +//| throw new IllegalArgumentException( +//| "Syntax error: " +//| + rules.substring(start, limit)); +//| } +//| char c = rules.charAt(i); +//| +//| // Look for "<>" double rules. 
+//| if ((i+1) < limit && rules.substring(i, i+2).equals(FWDREV_OP_STRING)) { +//| if (i == start) { +//| throw new IllegalArgumentException( +//| "Empty left side: " +//| + rules.substring(start, limit)); +//| } +//| if (i+2 == limit) { +//| throw new IllegalArgumentException( +//| "Empty right side: " +//| + rules.substring(start, limit)); +//| } +//| parseSubPattern(start, i, left, null, SPECIALS); +//| parseSubPattern(i+2, limit, right, null, SPECIALS); +//| return FWDREV_RULE_OP; +//| } +//| +//| switch (c) { +//| case FORWARD_RULE_OP: +//| if (i == start) { +//| throw new IllegalArgumentException( +//| "Empty left side: " +//| + rules.substring(start, limit)); +//| } +//| parseMatchPattern(start, i, left, anteContext, postContext); +//| if (i != (limit-1)) { +//| parseOutputPattern(i+1, limit, right, cursorPos); +//| } +//| break; +//| case REVERSE_RULE_OP: +//| if (i == (limit-1)) { +//| throw new IllegalArgumentException( +//| "Empty right side: " +//| + rules.substring(start, limit)); +//| } +//| if (i != start) { +//| parseOutputPattern(start, i, left, cursorPos); +//| } +//| parseMatchPattern(i+1, limit, right, anteContext, postContext); +//| break; +//| case VARIABLE_DEF_OP: +//| if (i == start || i == (limit-1)) { +//| throw new IllegalArgumentException( +//| "Empty left or right side: " +//| + rules.substring(start, limit)); +//| } +//| parseSubPattern(start, i, left); +//| parseDefPattern(i+1, limit, right); +//| break; +//| default: +//| throw new RuntimeException(); +//| } +//| return c; +//| } +//| +//| /** +//| * Parses the match pattern of a forward or reverse rule. Given the raw +//| * match pattern, return the match text and the context on both sides, if +//| * any. Resolves all quotes and variables. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). 
+//| * @param text the key to be matched will be appended to this buffer +//| * @param anteContext the preceding context, if any, will be appended +//| * to this buffer. +//| * @param postContext the following context, if any, will be appended +//| * to this buffer. +//| */ +//| private void parseMatchPattern(int start, int limit, +//| StringBuffer text, +//| StringBuffer anteContext, +//| StringBuffer postContext) { +//| if (start >= limit) { +//| throw new IllegalArgumentException( +//| "Empty expression in rule: " +//| + rules.substring(start, limit)); +//| } +//| if (anteContext != null) { +//| // Ignore optional opening and closing context characters +//| if (rules.charAt(start) == CONTEXT_OPEN) { +//| ++start; +//| } +//| if (rules.charAt(limit-1) == CONTEXT_CLOSE) { +//| --limit; +//| } +//| // The four possibilities are: +//| // key +//| // anteContext]key +//| // anteContext]key[postContext +//| // key[postContext +//| int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); +//| int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); +//| if (ante >= 0 && post >= 0 && ante > post) { +//| throw new IllegalArgumentException( +//| "Syntax error in context specifier: " +//| + rules.substring(start, limit)); +//| } +//| if (ante >= 0) { +//| parseSubPattern(start, ante, anteContext); +//| start = ante+1; +//| } +//| if (post >= 0) { +//| parseSubPattern(post+1, limit, postContext); +//| limit = post; +//| } +//| } +//| parseSubPattern(start, limit, text); +//| } +//| +//| private final void parseSubPattern(int start, int limit, +//| StringBuffer text) { +//| parseSubPattern(start, limit, text, null, SPECIALS); +//| } +//| +//| /** +//| * Parse a variable definition sub pattern. This kind of sub +//| * pattern differs in the set of characters that are considered +//| * special. In particular, the '[' and ']' characters are not +//| * special, since these are used in UnicodeSet patterns. 
+//| */ +//| private final void parseDefPattern(int start, int limit, +//| StringBuffer text) { +//| parseSubPattern(start, limit, text, null, DEF_SPECIALS); +//| } +//| +//| /** +//| * Parses the output pattern of a forward or reverse rule. Given the +//| * output pattern, return the output text and the position of the cursor, +//| * if any. Resolves all quotes and variables. +//| * @param rules the string to be parsed +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @param text the output text will be appended to this buffer +//| * @param cursorPos if this parameter is not null, then cursorPos[0] +//| * will be set to the cursor position, or -1 if there is none. If this +//| * parameter is null, then cursors will be disallowed. +//| */ +//| private final void parseOutputPattern(int start, int limit, +//| StringBuffer text, +//| int[] cursorPos) { +//| parseSubPattern(start, limit, text, cursorPos, SPECIALS); +//| } +//| +//| /** +//| * Parses a sub-pattern of a rule. Return the text and the position of the cursor, +//| * if any. Resolves all quotes and variables. +//| * @param rules the string to be parsed +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @param text the output text will be appended to this buffer +//| * @param cursorPos if this parameter is not null, then cursorPos[0] +//| * will be set to the cursor position, or -1 if there is none. If this +//| * parameter is null, then cursors will be disallowed. +//| * @param specials characters that must be quoted; typically either +//| * SPECIALS or DEF_SPECIALS. 
+//| */ +//| private void parseSubPattern(int start, int limit, +//| StringBuffer text, +//| int[] cursorPos, +//| String specials) { +//| boolean inQuote = false; +//| +//| if (start >= limit) { +//| throw new IllegalArgumentException("Empty expression in rule"); +//| } +//| if (cursorPos != null) { +//| cursorPos[0] = -1; +//| } +//| for (int i=start; i= 0) { +//| throw new IllegalArgumentException("Multiple cursors: " +//| + rules.substring(start, limit)); +//| } +//| cursorPos[0] = text.length(); +//| } else if (specials.indexOf(c) >= 0) { +//| throw new IllegalArgumentException("Unquoted special character: " +//| + rules.substring(start, limit)); +//| } else { +//| text.append(c); +//| } +//| } +//| } +//| +//| private static void validateVariableName(String name) { +//| if (indexOf(name, SPECIALS) >= 0) { +//| throw new IllegalArgumentException( +//| "Special character in variable name: " +//| + name); +//| } +//| } /** * Returns the single character value of the given variable name. Defined * names are recognized. - * - * NO LONGER SUPPORTED: - * If a Unicode category name is given, a standard character variable - * in the range firstCategoryVariable to lastCategoryVariable is returned, - * with value firstCategoryVariable + n, where n is the category - * number. * @exception IllegalArgumentException if the name is unknown. */ private Character getVariableDef(String name) { Character ch = (Character) data.variableNames.get(name); -//! if (ch == null) { -//! int id = UnicodeSet.getCategoryID(name); -//! if (id >= 0) { -//! ch = new Character((char) (firstCategoryVariable + id)); -//! data.variableNames.put(name, ch); -//! data.setVariables.put(ch, new UnicodeSet(id)); -//! } -//! } if (ch == null) { throw new IllegalArgumentException("Undefined variable: " + name); @@ -1084,6 +1334,10 @@ public class RuleBasedTransliterator extends Transliterator { * this method may employ some other algorithm for improved speed. 
*/ private final void determineVariableRange(String[] ruleArray) { + // As an initial implementation, we just run through all the + // characters, ignoring any quoting. This works since the quote + // mechanisms are outside the private use area. + Range r = new Range('\uE000', 0x1900); // Private use area r = r.largestUnusedSubrange(ruleArray); @@ -1121,7 +1375,9 @@ public class RuleBasedTransliterator extends Transliterator { String setOfChars) { for (int i=start; i= 0) { diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/src/com/ibm/icu/text/TransliterationRule.java index a06801f3fd..55104c8610 100755 --- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java +++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java @@ -21,9 +21,12 @@ import java.util.Dictionary; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $ + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.6 $ $Date: 2000/01/11 02:25:03 $ * * $Log: TransliterationRule.java,v $ + * Revision 1.6 2000/01/11 02:25:03 Alan + * Rewrite UnicodeSet and RBT parsers for better performance and new syntax + * * Revision 1.5 2000/01/04 21:43:57 Alan * Add rule indexing, and move masking check to TransliterationRuleSet. * @@ -134,6 +137,46 @@ class TransliterationRule { } } + + + + + + + /** + * @param input input string, including key and optional ante and + * post context + * @param anteContextPos offset into input to end of ante context, or + * -1 if none + * @param postContextPos offset into input to start of post context, + * or -1 if none + * @param output output string + * @param cursorPos offset into output at which cursor is located, + * or -1 if none. + */ + public TransliterationRule(String input, + int anteContextPos, int postContextPos, + String output, + int cursorPos) { + anteContextLength = (anteContextPos < 0) ? 0 : anteContextPos; + keyLength = (postContextPos < 0) ? input.length() - anteContextLength : + postContextPos - anteContextLength; + pattern = input; + this.output = output; + this.cursorPos = cursorPos < 0 ? output.length() : cursorPos; + if (anteContextPos > input.length() || postContextPos > input.length() || + cursorPos > output.length()) { + throw new IllegalArgumentException(); + } + } + + + + + + + + /** * Return the length of the key. Equivalent to getKey().length(). * @return the length of the match key. @@ -171,9 +214,14 @@ class TransliterationRule { * Internal method. Returns 8-bit index value for this rule. * This is the low byte of the first character of the key, * unless the first character of the key is a set. If it's a - * set, the index value is -1. 
+ * set, or otherwise can match multiple keys, the index value is -1. */ final int getIndexValue(Dictionary variables) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return -1; + } char c = pattern.charAt(anteContextLength); return variables.get(new Character(c)) == null ? (c & 0xFF) : -1; } @@ -185,9 +233,15 @@ class TransliterationRule { * It matches this rule if it matches the first character of the * key, or if the first character of the key is a set, and the set * contains any character with a low byte equal to the index - * value. + * value. If the rule contains only ante context, as in foo)>bar, + * then it will match any key. */ final boolean matchesIndexValue(int v, Dictionary variables) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return true; + } char c = pattern.charAt(anteContextLength); UnicodeSet set = (UnicodeSet) variables.get(new Character(c)); return set == null ? (c & 0xFF) == v : set.containsIndexValue(v); @@ -238,15 +292,15 @@ class TransliterationRule { */ public String toString() { return getClass().getName() + '{' - + escape(anteContextLength > 0 ? ("[" + pattern.substring(0, anteContextLength) + - ']') : "") - + pattern.substring(anteContextLength, anteContextLength + keyLength) - + (anteContextLength + keyLength < pattern.length() ? - ("[" + pattern.substring(anteContextLength + keyLength) + ']') : "") - + " -> " - + (cursorPos < output.length() - ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) - : output) + + escape((anteContextLength > 0 ? ("(" + pattern.substring(0, anteContextLength) + + ") ") : "") + + pattern.substring(anteContextLength, anteContextLength + keyLength) + + (anteContextLength + keyLength < pattern.length() ? + (" (" + pattern.substring(anteContextLength + keyLength) + ")") : "") + + " > " + + (cursorPos < output.length() + ? 
(output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) + : output)) + '}'; } diff --git a/icu4j/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/src/com/ibm/icu/text/UnicodeSet.java index c63c0de07c..975f2856fd 100755 --- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java @@ -1,6 +1,7 @@ package com.ibm.text; import java.text.*; +import java.util.Dictionary; /** * A mutable set of Unicode characters. Objects of this class @@ -225,7 +226,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.3 $ $Date: 2000/01/11 02:25:03 $ */ public class UnicodeSet { /** * The internal representation is a StringBuffer of even length. @@ -251,6 +252,9 @@ public class UnicodeSet { private static final int UNSUPPORTED_CATEGORY = 17; + private static final char VARIABLE_REF_OPEN = '{'; + private static final char VARIABLE_REF_CLOSE = '}'; + private static final int CATEGORY_COUNT = 29; /** @@ -293,25 +297,21 @@ public class UnicodeSet { * a syntax error. */ public UnicodeSet(String pattern) { - applyPattern(pattern, false); + applyPattern(pattern); } - /** - * Constructs a set from the given pattern, optionally ignoring - * white space. See the class description for the syntax of the - * pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored, except those preceded by '\u005C'. Spaces are - * those characters for which Character.isSpaceChar() - * is true. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. 
- */ - public UnicodeSet(String pattern, boolean ignoreSpaces) { - applyPattern(pattern, ignoreSpaces); + + + + + public UnicodeSet(String pattern, ParsePosition pos, + Dictionary varNameToChar, Dictionary varCharToSet) { + applyPattern(pattern, pos, varNameToChar, varCharToSet); } + + + /** * Constructs a set from the given Unicode character category. * @param category an integer indicating the character category as @@ -328,57 +328,15 @@ public class UnicodeSet { } /** - * Modifies this set to represent the set specified by the given - * pattern. See the class description for the syntax of the - * pattern language. + * Modifies this set to represent the set specified by the given pattern. + * See the class description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @exception IllegalArgumentException if the pattern * contains a syntax error. */ - public final void applyPattern(String pattern) { - applyPattern(pattern, false); - } - - /** - * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored. Spaces are those characters for which - * Character.isSpaceChar() is true. - * Characters preceded by '\\' are escaped, losing any special - * meaning they otherwise have. Spaces may be included by - * escaping them. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. - */ - public void applyPattern(String pattern, boolean ignoreSpaces) { + public void applyPattern(String pattern) { ParsePosition pos = new ParsePosition(0); - - // To ignore spaces, create a new pattern without spaces. We - // have to process all '\' escapes. 
If '\' is encountered, - // insert it and the following character (if any -- let parse - // deal with any syntax errors) in the pattern. This allows - // escaped spaces. - if (ignoreSpaces) { - StringBuffer pat = new StringBuffer(); - for (int i=0; ipattern + * @param pattern the string containing the pattern to be parsed. The + * portion of the string from pos.getIndex(), which must be a '[', to the + * corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to begin parsing. The + * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return + * from a successful parse, pos.getIndex() is either the character after the + * closing ']' of the parsed pattern, or pattern.length() if the closing ']' + * is the last character of the pattern string. + * @return a StringBuffer containing a pairs list for the parsed substring + * of pattern * @exception IllegalArgumentException if the parse fails. */ - private static StringBuffer parse(String pattern, ParsePosition pos) { + private static StringBuffer parse(String pattern, ParsePosition pos, + Dictionary varNameToChar, Dictionary varCharToSet) { - boolean invert = false; StringBuffer pairsBuf = new StringBuffer(); + boolean invert = false; - /** - * Nodes: 0 - idle, waiting for '[' - * 10 - like 11, but immediately after "[" or "[^" - * 11 - awaiting x, "]", "[...]", or "[:...:]" - * 21 - after x - * 23 - after x- - * - * The parsing state machine moves from node 0 through zero or more - * other nodes back to node 0, in a successful parse. + int lastChar = -1; // This is either a char (0..FFFF) or -1 + char lastOp = 0; + + /* This loop iterates over the characters in the pattern. We start at + * the position specified by pos. We exit the loop when either a + * matching closing ']' is seen, or we read all characters of the + * pattern. In the latter case an error will be thrown. 
*/ - int node = 0; - char first = 0; - int i; - /** - * This loop iterates over the characters in the pattern. We - * start at the position specified by pos. We exit the loop - * when either a matching closing ']' is seen, or we read all - * characters of the pattern. + /* Pattern syntax: + * pat := '[' '^'? elem* ']' + * elem := a | a '-' a | set | set op set + * set := pat | (a set variable) + * op := '&' | '-' + * a := (a character, possibly defined by a var) */ - for (i=pos.getIndex(); i= pattern.length()) { + if ((i+4) >= limit) { throw new IllegalArgumentException("Invalid \\u escape"); } c = '\u0000'; @@ -731,201 +762,143 @@ public class UnicodeSet { } } - /** - * Within this loop, we handle each of the four - * conditions: '[', ']', '-', other. The first three - * characters must not be escaped. + /* Parse variable references. These are treated as literals. If a + * variable refers to a UnicodeSet, nestedPairs is assigned here. + * Variable names are only parsed if varNameToChar is not null. + * Set variables are only looked up if varCharToSet is not null. */ + else if (varNameToChar != null && !isLiteral && c == VARIABLE_REF_OPEN) { + ++i; + int j = pattern.indexOf(VARIABLE_REF_CLOSE, i); + if (i == j || j < 0) { // empty or unterminated + throw new IllegalArgumentException("Illegal variable reference"); + } + String name = pattern.substring(i, j); + ++j; + Character ch = (Character) varNameToChar.get(name); + if (ch == null) { + throw new IllegalArgumentException("Undefined variable: " + + name); + } + c = ch.charValue(); + isLiteral = true; - /** - * An opening bracket indicates either the first bracket - * of the entire subpattern we are parsing, in which case - * we are in node 0 and move into node 10. We also check - * for an immediately following '^', indicating the - * complement of the following pattern. ('^' is any other - * position has no special meaning.) 
If we are not in - * node 0, '[' represents a nested subpattern that must be - * recursively parsed and checked for following operators - * ('&' or '|'). If two nested subpatterns follow one - * another with no operator, their union is formed, just - * as with any other elements that follow one another - * without intervening operator. The other thing we - * handle here is the syntax "[:Xx:]" or "[:X:]" that - * indicates a Unicode category or supercategory. + if (varCharToSet != null) { + UnicodeSet set = (UnicodeSet) varCharToSet.get(ch); + if (set != null) { + nestedPairs = set.pairs.toString(); + } + } + } + + /* An opening bracket indicates the first bracket of a nested + * subpattern, either a normal pattern or a category pattern. We + * recognize these here and set nestedPairs accordingly. */ - if (!isLiteral && c == '[') { - boolean parseOp = false; + else if (!isLiteral && c == '[') { + // Handle "[:...:]", representing a character category char d = charAfter(pattern, i); - // "[:...:]" represents a character category if (d == ':') { - if (node == 23) { - throw new IllegalArgumentException("Unexpected \"[:\""); - } - if (node == 21) { - addPair(pairsBuf, first, first); - node = 11; - } i += 2; int j = pattern.indexOf(":]", i); if (j < 0) { throw new IllegalArgumentException("Missing \":]\""); } - doUnion(pairsBuf, - getCategoryPairs(pattern.substring(i, j))); - i = j+1; - if (node == 10) { - node = 11; - parseOp = true; - } else if (node == 0) { + nestedPairs = getCategoryPairs(pattern.substring(i, j)); + i = j+1; // Make i point to ']' + if (mode == 3) { + // Entire pattern is a category; leave parse loop + pairsBuf.append(nestedPairs); break; } } else { - if (node == 0) { - node = 10; - if (d == '^') { - invert = true; - ++i; - } - } else { - // Nested '[' - pos.setIndex(i); - doUnion(pairsBuf, parse(pattern, pos) - .toString()); - i = pos.getIndex() - 1; // Subtract 1 to point at ']' - parseOp = true; - } + // Recurse to get the pairs for this nested 
set. + pos.setIndex(i); // Point at the nested '[' + nestedPairs = parse(pattern, pos, varNameToChar, varCharToSet).toString(); + i = pos.getIndex() - 1; // - 1 to point at ']' } - /** - * parseOp is true after "[:...:]" or a nested - * "[...]". It is false only after the final closing - * ']'. If parseOp is true, we look past the closing - * ']' to see if we have an operator character. If - * so, we parse the subsequent "[...]" recursively, - * then perform the operation. We do this in a loop - * until there are no more operators. Note that this - * means the operators have equal precedence and are - * bound left-to-right. - */ - if (parseOp) { - for (;;) { - // Is the next character an operator? - char op = charAfter(pattern, i); - if (op == '-' || op == '&') { - pos.setIndex(i+2); // Add 2 to point AFTER op - String rhs = parse(pattern, pos).toString(); - if (op == '-') { - doDifference(pairsBuf, rhs); - } else if (op == '&') { - doIntersection(pairsBuf, rhs); - } - i = pos.getIndex() - 1; // - 1 to point at ']' - } else { - break; - } - } - } } - /** - * A closing bracket can only be a closing bracket for - * "[...]", since the closing bracket for "[:...:]" is - * taken care of when the initial "[:" is seen. When we - * see a closing bracket, we then know, if we were in node - * 21 (after x) or 23 (after x-) that nothing more is - * coming, and we add the last character(s) we saw to the - * set. Note that a trailing '-' assumes its literal - * meaning, just as a leading '-' after "[" or "[^". + /* At this point we have either a character c, or a nested set. If + * we have encountered a nested set, either embedded in the pattern, + * or as a variable, we have a non-null nestedPairs, and c should be + * ignored. Otherwise c is the current character, and isLiteral + * indicates whether it is an escaped literal (or variable) or a + * normal unescaped character. Unescaped characters '-', '&', and + * ']' have special meanings. 
*/ - else if (!isLiteral && c == ']') { - if (node == 0) { - throw new IllegalArgumentException("Unexpected ']'"); - } - if (node == 21 || node == 23) { - addPair(pairsBuf, first, first); - if (node == 23) { - addPair(pairsBuf, '-', '-'); + if (nestedPairs != null) { + if (lastChar >= 0) { + if (lastOp != 0) { + throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp); } + addPair(pairsBuf, (char)lastChar, (char)lastChar); + lastChar = -1; } - node = 0; + switch (lastOp) { + case '-': + doDifference(pairsBuf, nestedPairs); + break; + case '&': + doIntersection(pairsBuf, nestedPairs); + break; + case 0: + doUnion(pairsBuf, nestedPairs); + break; + } + lastOp = 0; + } else if (!isLiteral && c == ']') { + // Final closing delimiter. This is the only way we leave this + // loop if the pattern is well-formed. break; - } - - /** - * '-' has the following interpretations: 1. Within - * "[...]", between two letters, it indicates a range. - * 2. Between two nested bracket patterns, "[[...]-[...]", - * it indicates asymmetric difference. 3. At the start of - * a bracket pattern, "[-...]", "[^-...]", it indicates - * the literal character '-'. 4. At the end of a bracket - * pattern, "[...-]", it indicates the literal character - * '-'. - * - * We handle cases 1 and 3 here. Cases 2 and 4 are - * handled in the ']' parsing code. - */ - else if (!isLiteral && c == '-') { - if (node == 10) { - addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]" - } else if (node == 21) { - node = 23; - } else { - throw new IllegalArgumentException("Unexpected '-'"); - } - } - - /** - * If we fall through to this point, we have a literal - * character, either one that has been escaped with a - * backslash, escaped with a backslash u, or that isn't - * a special '[', ']', or '-'. - * - * Literals can either start a range "x-...", end a range, - * "...-x", or indicate a single character "x". 
- */ - else { - if (node == 10 || node == 11) { - first = c; - node = 21; - } else if (node == 21) { - addPair(pairsBuf, first, first); - first = c; - node = 21; - } else if (node == 23) { - if (c < first) { - throw new IllegalArgumentException("Bad range"); - } - addPair(pairsBuf, first, c); - node = 11; - } else { - throw new IllegalArgumentException("Expected '[', got '" + c + '\''); + } else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) { + lastOp = c; + } else if (lastOp == '-') { + addPair(pairsBuf, (char)lastChar, c); + lastOp = 0; + lastChar = -1; + } else if (lastOp != 0) { + // We have & or & + throw new IllegalArgumentException("Unquoted " + lastOp); + } else { + if (lastChar >= 0) { + // We have + addPair(pairsBuf, (char)lastChar, (char)lastChar); } + lastChar = c; } } - if (node != 0) { - throw new IllegalArgumentException("Missing ']'"); + // Handle unprocessed stuff preceding the closing ']' + if (lastOp == '-') { + // Trailing '-' is treated as literal + addPair(pairsBuf, lastOp, lastOp); + } else if (lastOp == '&') { + throw new IllegalArgumentException("Unquoted trailing " + lastOp); + } + if (lastChar >= 0) { + addPair(pairsBuf, (char)lastChar, (char)lastChar); } /** - * i indexes the last character we parsed or is - * pattern.length(). In the latter case, the node will not be - * zero, since we have run off the end without finding a - * closing ']'. Therefore, the above statement will have - * thrown an exception, and we'll never get here. If we get - * here, we know i < pattern.length(), and we set the - * ParsePosition to the next character to be parsed. - */ - pos.setIndex(i+1); - - /** - * If we saw a '^' after the initial '[' of this pattern, then - * perform the complement. (Inversion after '[:' is handled - * elsewhere.) + * If we saw a '^' after the initial '[' of this pattern, then perform + * the complement. (Inversion after '[:' is handled elsewhere.) 
*/ if (invert) { doComplement(pairsBuf); } + /** + * i indexes the last character we parsed or is pattern.length(). In + * the latter case, we have run off the end without finding a closing + * ']'. Otherwise, we know i < pattern.length(), and we set the + * ParsePosition to the next character to be parsed. + */ + if (i == limit) { + throw new IllegalArgumentException("Missing ']'"); + } + pos.setIndex(i+1); + return pairsBuf; } @@ -1352,7 +1325,6 @@ public class UnicodeSet { /** * Returns the character after the given position, or '\uFFFF' if * there is none. - */ private static final char charAfter(String str, int i) { return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF'; diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java index 572a959963..7337a05292 100755 --- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java @@ -2,6 +2,7 @@ package com.ibm.text; import java.util.Hashtable; import java.util.Vector; +import java.text.ParsePosition; /** * A transliterator that reads a set of rules in order to determine how to @@ -181,9 +182,12 @@ import java.util.Vector; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.7 $ $Date: 2000/01/06 01:36:36 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.8 $ $Date: 2000/01/11 02:25:03 $ * * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.8 2000/01/11 02:25:03 Alan + * Rewrite UnicodeSet and RBT parsers for better performance and new syntax + * * Revision 1.7 2000/01/06 01:36:36 Alan * Allow string arrays in rule resource bundles * @@ -195,7 +199,6 @@ import java.util.Vector; * * Revision 1.4 1999/12/22 01:05:54 Alan * Improve masking checking; turn it off by default, for better performance - * */ public class RuleBasedTransliterator extends Transliterator { /** @@ -214,8 +217,6 @@ public class RuleBasedTransliterator extends Transliterator { static final boolean DEBUG = false; - static final boolean CHECK_MASKING = true; - private static final String COPYRIGHT = "\u00A9 IBM Corporation 1999. 
All rights reserved."; @@ -561,33 +562,34 @@ public class RuleBasedTransliterator extends Transliterator { private static final char VARIABLE_DEF_OP = '='; private static final char FORWARD_RULE_OP = '>'; private static final char REVERSE_RULE_OP = '<'; - private static final char FWDREV_RULE_OP = '~'; // internal rep of FWDREF_OP_STRING + private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op private static final String OPERATORS = "=><"; - // Forward-Reverse operator - // a<>b is equivalent to ab - private static final String FWDREV_OP_STRING = "<>"; // must have length 2 - // Other special characters private static final char QUOTE = '\''; + private static final char ESCAPE = '\\'; + private static final char END_OF_RULE = ';'; + private static final char RULE_COMMENT_CHAR = '#'; + private static final char VARIABLE_REF_OPEN = '{'; private static final char VARIABLE_REF_CLOSE = '}'; - private static final char CONTEXT_OPEN = '['; - private static final char CONTEXT_CLOSE = ']'; + private static final char CONTEXT_OPEN = '('; + private static final char CONTEXT_CLOSE = ')'; + private static final char SET_OPEN = '['; + private static final char SET_CLOSE = ']'; private static final char CURSOR_POS = '|'; - private static final char RULE_COMMENT_CHAR = '#'; /** * Specials must be quoted in rules to be used as literals. * Specials may not occur in variable names. */ - private static final String SPECIALS = "'{}[]|#" + OPERATORS; +//! private static final String SPECIALS = "{}[]|" + OPERATORS; /** * Specials that must be quoted in variable definitions. */ - private static final String DEF_SPECIALS = "'{}"; +//! 
private static final String DEF_SPECIALS = "{}"; /** * @param rules list of rules, separated by semicolon characters @@ -616,37 +618,12 @@ public class RuleBasedTransliterator extends Transliterator { determineVariableRange(ruleArray); StringBuffer errors = null; - for (int irule=0; irule0 && rules.charAt(limit-1) == '\\') { - limit = rules.indexOf(';', limit+1); - } - - if (limit == -1) { - limit = n; - } - // Skip over empty lines and line starting with # - if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) { - try { - applyRule(i, limit); - } catch (IllegalArgumentException e) { - if (errors == null) { - errors = new StringBuffer(e.getMessage()); - } else { - errors.append("\n").append(e.getMessage()); - } - } - } - i = limit + 1; - } + try { + parseRuleArray(ruleArray); + } catch (IllegalArgumentException e) { + errors = new StringBuffer(e.getMessage()); } - + // Index the rules try { data.ruleSet.freeze(data.setVariables); @@ -663,411 +640,684 @@ public class RuleBasedTransliterator extends Transliterator { } } - /** - * Parse the given substring as a rule, and append it to the rules currently - * represented in this object. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @exception IllegalArgumentException if there is a syntax error in the - * rules - */ - private void applyRule(int start, int limit) { - /* General description of parsing: Initially, rules contain two types of - * quoted characters. First, there are variable references, such as - * "{alpha}". Second, there are quotes, such as "'<'" or "''". One of - * the first steps in parsing a rule is to resolve such quoted matter. - * Quotes are removed early, leaving unquoted literal matter. Variable - * references are resolved and replaced by single characters. In some - * instances these characters represent themselves; in others, they - * stand for categories of characters. 
Character categories are either - * predefined (e.g., "{Lu}"), or are defined by the user using a - * statement (e.g., "vowels:aeiouAEIOU"). - * - * Another early step in parsing is to split each rule into component - * pieces. These pieces are, for every rule, a left-hand side, a right- - * hand side, and an operator. The left- and right-hand sides may not - * be empty, except for the output patterns of forward and reverse - * rules. In addition to this partitioning, the match patterns of - * forward and reverse rules must be partitioned into antecontext, - * postcontext, and literal pattern, where the context portions may or - * may not be present. Finally, output patterns must have the cursor - * indicator '|' detected and removed, with its position recorded. - * - * Quote removal, variable resolution, and sub-pattern splitting must - * all happen at once. This is due chiefly to the quoting mechanism, - * which allows special characters to appear at arbitrary positions in - * the final unquoted text. (For this reason, alteration of the rule - * language is somewhat clumsy; it entails reassessment and revision of - * the parsing methods as a whole.) - * - * After this processing of rules is complete, the final end products - * are unquoted pieces of text of various types, and an integer cursor - * position, if one is specified. These processed raw materials are now - * easy to deal with; other classes such as UnicodeSet and - * TransliterationRule need know nothing of quoting or variables. 
- */ - StringBuffer left = new StringBuffer(); - StringBuffer right = new StringBuffer(); - StringBuffer anteContext = new StringBuffer(); - StringBuffer postContext = new StringBuffer(); - int cursorPos[] = new int[1]; - char operator = parseRule(start, limit, left, right, - anteContext, postContext, cursorPos); + + + + + + + private void parseRuleArray(String[] ruleArray) { + String[] leftRight = new String[2]; + char[] op = new char[1]; + for (int i=0; i= 0) { + if (operator != 0) { + syntaxError("Unquoted " + c, rule, start); + } + // Found an operator char. Check for forward-reverse operator. + if (c == REVERSE_RULE_OP && + (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { + ++pos; + operator = FWDREV_RULE_OP; + } else { + operator = c; + } + left = buf.toString(); // lhs + leftCursor = cursor; + leftAnte = ante; + leftPost = post; + leftPostClose = postClose; + + buf.setLength(0); + cursor = ante = post = postClose = -1; + continue; + } + switch (c) { + case END_OF_RULE: + break main; + case VARIABLE_REF_OPEN: + { + int j = rule.indexOf(VARIABLE_REF_CLOSE, pos); + if (pos == j || j < 0) { // empty or unterminated + syntaxError("Malformed variable reference", rule, start); + } + String name = rule.substring(pos, j); + pos = j+1; + buf.append(getVariableDef(name).charValue()); + } + break; + case CONTEXT_OPEN: + if (post >= 0) { + syntaxError("Multiple post contexts", rule, start); + } + // Ignore CONTEXT_OPEN if buffer length is zero -- that means + // this is the optional opening delimiter for the ante context. + if (buf.length() > 0) { + post = buf.length(); + } + break; + case CONTEXT_CLOSE: + if (postClose >= 0) { + syntaxError("Unexpected " + c, rule, start); + } + if (post >= 0) { + // This is probably the optional closing delimiter + // for the post context; save the pos and check later. 
+ postClose = buf.length(); + } else if (ante >= 0) { + syntaxError("Multiple ante contexts", rule, start); + } else { + ante = buf.length(); + } + break; + case SET_OPEN: + ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '[' + buf.append(registerSet(new UnicodeSet(rule, pp, + data.variableNames, data.setVariables)).charValue()); + pos = pp.getIndex(); + break; + case VARIABLE_REF_CLOSE: + case SET_CLOSE: + syntaxError("Unquoted " + c, rule, start); + case CURSOR_POS: + if (cursor >= 0) { + syntaxError("Multiple cursors", rule, start); + } + cursor = buf.length(); + break; + default: + buf.append(c); + break; + } + } + if (operator == 0) { + syntaxError("No operator", rule, start); + } + + // Check context close parameters + if ((leftPostClose >= 0 && leftPostClose != left.length()) || + (postClose >= 0 && postClose != buf.length())) { + syntaxError("Extra text after ]", rule, start); + } + + // Context is only allowed on the input side; that is, the left side + // for forward rules. Cursors are only allowed on the output side; + // that is, the right side for forward rules. Bidirectional rules + // ignore elements that do not apply. switch (operator) { case VARIABLE_DEF_OP: - applyVariableDef(left.toString(), right.toString()); + // LHS is the name. RHS is a single character, either a literal + // or a set (already parsed). If RHS is longer than one + // character, it is either a multi-character string, or multiple + // sets, or a mixture of chars and sets -- syntax error. 
+ if (buf.length() != 1) { + syntaxError("Malformed RHS", rule, start); + } + if (data.variableNames.get(left) != null) { + syntaxError("Duplicate definition of {" + + left + "}", rule, start); + } + data.variableNames.put(left, new Character(buf.charAt(0))); break; + case FORWARD_RULE_OP: if (direction == FORWARD) { + if (ante >= 0 || post >= 0 || leftCursor >= 0) { + syntaxError("Malformed rule", rule, start); + } data.ruleSet.addRule(new TransliterationRule( - left.toString(), right.toString(), - anteContext.toString(), postContext.toString(), - cursorPos[0])); + left, leftAnte, leftPost, + buf.toString(), cursor)); } // otherwise ignore the rule; it's not the direction we want break; + case REVERSE_RULE_OP: if (direction == REVERSE) { + if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { + syntaxError("Malformed rule", rule, start); + } data.ruleSet.addRule(new TransliterationRule( - right.toString(), left.toString(), - anteContext.toString(), postContext.toString(), - cursorPos[0])); + buf.toString(), ante, post, + left, leftCursor)); } // otherwise ignore the rule; it's not the direction we want break; + case FWDREV_RULE_OP: - data.ruleSet.addRule(new TransliterationRule( - direction == FORWARD ? left.toString() : right.toString(), - direction == FORWARD ? right.toString() : left.toString(), - // Context & cursor disallowed - "", "", -1)); + if (direction == FORWARD) { + // The output side is the right; trim off any context + String output = buf.toString().substring(ante < 0 ? 0 : ante, + post < 0 ? buf.length() : post); + data.ruleSet.addRule(new TransliterationRule( + left, leftAnte, leftPost, + output, cursor)); + } else { + // The output side is the left; trim off any context + String output = left.substring(leftAnte < 0 ? 0 : leftAnte, + leftPost < 0 ? left.length() : leftPost); + data.ruleSet.addRule(new TransliterationRule( + buf.toString(), ante, post, + output, leftCursor)); + } break; } + + return pos; } - /** - * Add a variable definition. 
- * @param name the name of the variable. It must not already be defined. - * @param pattern the value of the variable. It may be a single character - * or a pattern describing a character set. - * @exception IllegalArgumentException if there is a syntax error - */ - private final void applyVariableDef(String name, String pattern) { - validateVariableName(name); - if (data.variableNames.get(name) != null) { - throw new IllegalArgumentException("Duplicate variable definition: " - + name + '=' + pattern); - } -//! if (UnicodeSet.getCategoryID(name) >= 0) { -//! throw new IllegalArgumentException("Reserved variable name: " -//! + name); -//! } - if (pattern.length() < 1) { - throw new IllegalArgumentException("Variable definition missing: " - + name); - } - if (pattern.length() == 1) { - // Got a single character variable definition - data.variableNames.put(name, new Character(pattern.charAt(0))); - } else { - // Got more than one character; parse it as a category - if (variableNext >= variableLimit) { - throw new RuntimeException("Private use variables exhausted"); - } - Character c = new Character(variableNext++); - data.variableNames.put(name, c); - data.setVariables.put(c, new UnicodeSet(pattern)); + + + private static final void syntaxError(String msg, String rule, int start) { + int end = quotedIndexOf(rule, start, rule.length(), ";"); + if (end < 0) { + end = rule.length(); } + throw new IllegalArgumentException(msg + " in " + + rule.substring(start, end)); } - /** - * Given a rule, parses it into three pieces: The left side, the right side, - * and the operator. Returns the operator. Quotes and variable references - * are resolved; the otuput text in all StringBuffer parameters - * is literal text. This method delegates to other parsing methods to - * handle the match pattern, output pattern, and other sub-patterns in the - * rule. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. 
- * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param left left side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param right right side of rule is appended to this buffer - * with the quotes removed and variables resolved - * @param anteContext the preceding context of the match pattern, - * if there is one, is appended to this buffer - * @param postContext the following context of the match pattern, - * if there is one, is appended to this buffer - * @param cursorPos if there is a cursor in the output pattern, its - * offset is stored in cursorPos[0] - * @return The operator character, one of the characters in OPERATORS. - */ - private char parseRule(int start, int limit, - StringBuffer left, StringBuffer right, - StringBuffer anteContext, - StringBuffer postContext, - int[] cursorPos) { - if (false) { - System.err.println("Parsing " + rules.substring(start, limit)); - } - /* Parse the rule into three pieces -- left, operator, and right, - * parsing out quotes. The result is that left and right will have - * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted - * operators throw an exception. Two quotes inside or outside - * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". - */ - int i = quotedIndexOf(rules, start, limit, OPERATORS); - if (i < 0) { - throw new IllegalArgumentException( - "Syntax error: " - + rules.substring(start, limit)); - } - char c = rules.charAt(i); - - // Look for "<>" double rules. 
- if ((i+1) < limit && rules.substring(i, i+2).equals(FWDREV_OP_STRING)) { - if (i == start) { - throw new IllegalArgumentException( - "Empty left side: " - + rules.substring(start, limit)); - } - if (i+2 == limit) { - throw new IllegalArgumentException( - "Empty right side: " - + rules.substring(start, limit)); - } - parseSubPattern(start, i, left, null, SPECIALS); - parseSubPattern(i+2, limit, right, null, SPECIALS); - return FWDREV_RULE_OP; - } - switch (c) { - case FORWARD_RULE_OP: - if (i == start) { - throw new IllegalArgumentException( - "Empty left side: " - + rules.substring(start, limit)); - } - parseMatchPattern(start, i, left, anteContext, postContext); - if (i != (limit-1)) { - parseOutputPattern(i+1, limit, right, cursorPos); - } - break; - case REVERSE_RULE_OP: - if (i == (limit-1)) { - throw new IllegalArgumentException( - "Empty right side: " - + rules.substring(start, limit)); - } - if (i != start) { - parseOutputPattern(start, i, left, cursorPos); - } - parseMatchPattern(i+1, limit, right, anteContext, postContext); - break; - case VARIABLE_DEF_OP: - if (i == start || i == (limit-1)) { - throw new IllegalArgumentException( - "Empty left or right side: " - + rules.substring(start, limit)); - } - parseSubPattern(start, i, left); - parseDefPattern(i+1, limit, right); - break; - default: - throw new RuntimeException(); + +//| /** +//| * Parse the given substring as a rule, and append it to the rules currently +//| * represented in this object. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @exception IllegalArgumentException if there is a syntax error in the +//| * rules +//| */ +//| private void applyRule(int start, int limit) { +//| /* General description of parsing: Initially, rules contain two types of +//| * quoted characters. First, there are variable references, such as +//| * "{alpha}". 
Second, there are quotes, such as "'<'" or "''". One of +//| * the first steps in parsing a rule is to resolve such quoted matter. +//| * Quotes are removed early, leaving unquoted literal matter. Variable +//| * references are resolved and replaced by single characters. In some +//| * instances these characters represent themselves; in others, they +//| * stand for categories of characters. Character categories are either +//| * predefined (e.g., "{Lu}"), or are defined by the user using a +//| * statement (e.g., "vowels:aeiouAEIOU"). +//| * +//| * Another early step in parsing is to split each rule into component +//| * pieces. These pieces are, for every rule, a left-hand side, a right- +//| * hand side, and an operator. The left- and right-hand sides may not +//| * be empty, except for the output patterns of forward and reverse +//| * rules. In addition to this partitioning, the match patterns of +//| * forward and reverse rules must be partitioned into antecontext, +//| * postcontext, and literal pattern, where the context portions may or +//| * may not be present. Finally, output patterns must have the cursor +//| * indicator '|' detected and removed, with its position recorded. +//| * +//| * Quote removal, variable resolution, and sub-pattern splitting must +//| * all happen at once. This is due chiefly to the quoting mechanism, +//| * which allows special characters to appear at arbitrary positions in +//| * the final unquoted text. (For this reason, alteration of the rule +//| * language is somewhat clumsy; it entails reassessment and revision of +//| * the parsing methods as a whole.) +//| * +//| * After this processing of rules is complete, the final end products +//| * are unquoted pieces of text of various types, and an integer cursor +//| * position, if one is specified. These processed raw materials are now +//| * easy to deal with; other classes such as UnicodeSet and +//| * TransliterationRule need know nothing of quoting or variables. 
+//| */ +//| StringBuffer left = new StringBuffer(); +//| StringBuffer right = new StringBuffer(); +//| StringBuffer anteContext = new StringBuffer(); +//| StringBuffer postContext = new StringBuffer(); +//| int cursorPos[] = new int[1]; +//| +//| char operator = parseRule(start, limit, left, right, +//| anteContext, postContext, cursorPos); +//| +//| switch (operator) { +//| case VARIABLE_DEF_OP: +//| applyVariableDef(left.toString(), right.toString()); +//| break; +//| case FORWARD_RULE_OP: +//| if (direction == FORWARD) { +//| data.ruleSet.addRule(new TransliterationRule( +//| left.toString(), right.toString(), +//| anteContext.toString(), postContext.toString(), +//| cursorPos[0])); +//| } // otherwise ignore the rule; it's not the direction we want +//| break; +//| case REVERSE_RULE_OP: +//| if (direction == REVERSE) { +//| data.ruleSet.addRule(new TransliterationRule( +//| right.toString(), left.toString(), +//| anteContext.toString(), postContext.toString(), +//| cursorPos[0])); +//| } // otherwise ignore the rule; it's not the direction we want +//| break; +//| case FWDREV_RULE_OP: +//| data.ruleSet.addRule(new TransliterationRule( +//| direction == FORWARD ? left.toString() : right.toString(), +//| direction == FORWARD ? right.toString() : left.toString(), +//| // Context & cursor disallowed +//| "", "", -1)); +//| break; +//| } +//| } + +//| /** +//| * Add a variable definition. +//| * @param name the name of the variable. It must not already be defined. +//| * @param pattern the value of the variable. It may be a single character +//| * or a pattern describing a character set. 
+//| * @exception IllegalArgumentException if there is a syntax error +//| */ +//| private final void applyVariableDef(String name, String pattern) { +//| validateVariableName(name); +//| if (data.variableNames.get(name) != null) { +//| throw new IllegalArgumentException("Duplicate variable definition: " +//| + name + '=' + pattern); +//| } +//| if (pattern.length() < 1) { +//| throw new IllegalArgumentException("Variable definition missing: " +//| + name); +//| } +//| if (pattern.length() == 1) { +//| // Got a single character variable definition +//| data.variableNames.put(name, new Character(pattern.charAt(0))); +//| } else { +//| // Got more than one character; parse it as a category +//| UnicodeSet set = new UnicodeSet(pattern); +//| data.variableNames.put(name, registerSet(set)); +//| } +//| } + + + + + private final Character registerSet(UnicodeSet set) { + if (variableNext >= variableLimit) { + throw new RuntimeException("Private use variables exhausted"); } + Character c = new Character(variableNext++); + data.setVariables.put(c, set); return c; } - /** - * Parses the match pattern of a forward or reverse rule. Given the raw - * match pattern, return the match text and the context on both sides, if - * any. Resolves all quotes and variables. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the key to be matched will be appended to this buffer - * @param anteContext the preceding context, if any, will be appended - * to this buffer. - * @param postContext the following context, if any, will be appended - * to this buffer. 
- */ - private void parseMatchPattern(int start, int limit, - StringBuffer text, - StringBuffer anteContext, - StringBuffer postContext) { - if (start >= limit) { - throw new IllegalArgumentException( - "Empty expression in rule: " - + rules.substring(start, limit)); - } - if (anteContext != null) { - // Ignore optional opening and closing context characters - if (rules.charAt(start) == CONTEXT_OPEN) { - ++start; - } - if (rules.charAt(limit-1) == CONTEXT_CLOSE) { - --limit; - } - // The four possibilities are: - // key - // anteContext]key - // anteContext]key[postContext - // key[postContext - int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); - int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); - if (ante >= 0 && post >= 0 && ante > post) { - throw new IllegalArgumentException( - "Syntax error in context specifier: " - + rules.substring(start, limit)); - } - if (ante >= 0) { - parseSubPattern(start, ante, anteContext); - start = ante+1; - } - if (post >= 0) { - parseSubPattern(post+1, limit, postContext); - limit = post; - } - } - parseSubPattern(start, limit, text); - } - private final void parseSubPattern(int start, int limit, - StringBuffer text) { - parseSubPattern(start, limit, text, null, SPECIALS); - } - /** - * Parse a variable definition sub pattern. This kind of sub - * pattern differs in the set of characters that are considered - * special. In particular, the '[' and ']' characters are not - * special, since these are used in UnicodeSet patterns. - */ - private final void parseDefPattern(int start, int limit, - StringBuffer text) { - parseSubPattern(start, limit, text, null, DEF_SPECIALS); - } - /** - * Parses the output pattern of a forward or reverse rule. Given the - * output pattern, return the output text and the position of the cursor, - * if any. Resolves all quotes and variables. 
- * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - */ - private final void parseOutputPattern(int start, int limit, - StringBuffer text, - int[] cursorPos) { - parseSubPattern(start, limit, text, cursorPos, SPECIALS); - } - - /** - * Parses a sub-pattern of a rule. Return the text and the position of the cursor, - * if any. Resolves all quotes and variables. - * @param rules the string to be parsed - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= rules.length(). - * @param text the output text will be appended to this buffer - * @param cursorPos if this parameter is not null, then cursorPos[0] - * will be set to the cursor position, or -1 if there is none. If this - * parameter is null, then cursors will be disallowed. - * @param specials characters that must be quoted; typically either - * SPECIALS or DEF_SPECIALS. 
- */ - private void parseSubPattern(int start, int limit, - StringBuffer text, - int[] cursorPos, - String specials) { - boolean inQuote = false; - - if (start >= limit) { - throw new IllegalArgumentException("Empty expression in rule"); - } - if (cursorPos != null) { - cursorPos[0] = -1; - } - for (int i=start; i= 0) { - throw new IllegalArgumentException("Multiple cursors: " - + rules.substring(start, limit)); - } - cursorPos[0] = text.length(); - } else if (specials.indexOf(c) >= 0) { - throw new IllegalArgumentException("Unquoted special character: " - + rules.substring(start, limit)); - } else { - text.append(c); - } - } - } - - private static void validateVariableName(String name) { - if (indexOf(name, SPECIALS) >= 0) { - throw new IllegalArgumentException( - "Special character in variable name: " - + name); - } - } +//| /** +//| * Given a rule, parses it into three pieces: The left side, the right side, +//| * and the operator. Returns the operator. Quotes and variable references +//| * are resolved; the otuput text in all StringBuffer parameters +//| * is literal text. This method delegates to other parsing methods to +//| * handle the match pattern, output pattern, and other sub-patterns in the +//| * rule. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). 
+//| * @param left left side of rule is appended to this buffer +//| * with the quotes removed and variables resolved +//| * @param right right side of rule is appended to this buffer +//| * with the quotes removed and variables resolved +//| * @param anteContext the preceding context of the match pattern, +//| * if there is one, is appended to this buffer +//| * @param postContext the following context of the match pattern, +//| * if there is one, is appended to this buffer +//| * @param cursorPos if there is a cursor in the output pattern, its +//| * offset is stored in cursorPos[0] +//| * @return The operator character, one of the characters in OPERATORS. +//| */ +//| private char parseRule(int start, int limit, +//| StringBuffer left, StringBuffer right, +//| StringBuffer anteContext, +//| StringBuffer postContext, +//| int[] cursorPos) { +//| if (false) { +//| System.err.println("Parsing " + rules.substring(start, limit)); +//| } +//| /* Parse the rule into three pieces -- left, operator, and right, +//| * parsing out quotes. The result is that left and right will have +//| * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted +//| * operators throw an exception. Two quotes inside or outside +//| * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". +//| */ +//| int i = quotedIndexOf(rules, start, limit, OPERATORS); +//| if (i < 0) { +//| throw new IllegalArgumentException( +//| "Syntax error: " +//| + rules.substring(start, limit)); +//| } +//| char c = rules.charAt(i); +//| +//| // Look for "<>" double rules. 
+//| if ((i+1) < limit && rules.substring(i, i+2).equals(FWDREV_OP_STRING)) { +//| if (i == start) { +//| throw new IllegalArgumentException( +//| "Empty left side: " +//| + rules.substring(start, limit)); +//| } +//| if (i+2 == limit) { +//| throw new IllegalArgumentException( +//| "Empty right side: " +//| + rules.substring(start, limit)); +//| } +//| parseSubPattern(start, i, left, null, SPECIALS); +//| parseSubPattern(i+2, limit, right, null, SPECIALS); +//| return FWDREV_RULE_OP; +//| } +//| +//| switch (c) { +//| case FORWARD_RULE_OP: +//| if (i == start) { +//| throw new IllegalArgumentException( +//| "Empty left side: " +//| + rules.substring(start, limit)); +//| } +//| parseMatchPattern(start, i, left, anteContext, postContext); +//| if (i != (limit-1)) { +//| parseOutputPattern(i+1, limit, right, cursorPos); +//| } +//| break; +//| case REVERSE_RULE_OP: +//| if (i == (limit-1)) { +//| throw new IllegalArgumentException( +//| "Empty right side: " +//| + rules.substring(start, limit)); +//| } +//| if (i != start) { +//| parseOutputPattern(start, i, left, cursorPos); +//| } +//| parseMatchPattern(i+1, limit, right, anteContext, postContext); +//| break; +//| case VARIABLE_DEF_OP: +//| if (i == start || i == (limit-1)) { +//| throw new IllegalArgumentException( +//| "Empty left or right side: " +//| + rules.substring(start, limit)); +//| } +//| parseSubPattern(start, i, left); +//| parseDefPattern(i+1, limit, right); +//| break; +//| default: +//| throw new RuntimeException(); +//| } +//| return c; +//| } +//| +//| /** +//| * Parses the match pattern of a forward or reverse rule. Given the raw +//| * match pattern, return the match text and the context on both sides, if +//| * any. Resolves all quotes and variables. +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). 
+//| * @param text the key to be matched will be appended to this buffer +//| * @param anteContext the preceding context, if any, will be appended +//| * to this buffer. +//| * @param postContext the following context, if any, will be appended +//| * to this buffer. +//| */ +//| private void parseMatchPattern(int start, int limit, +//| StringBuffer text, +//| StringBuffer anteContext, +//| StringBuffer postContext) { +//| if (start >= limit) { +//| throw new IllegalArgumentException( +//| "Empty expression in rule: " +//| + rules.substring(start, limit)); +//| } +//| if (anteContext != null) { +//| // Ignore optional opening and closing context characters +//| if (rules.charAt(start) == CONTEXT_OPEN) { +//| ++start; +//| } +//| if (rules.charAt(limit-1) == CONTEXT_CLOSE) { +//| --limit; +//| } +//| // The four possibilities are: +//| // key +//| // anteContext]key +//| // anteContext]key[postContext +//| // key[postContext +//| int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); +//| int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); +//| if (ante >= 0 && post >= 0 && ante > post) { +//| throw new IllegalArgumentException( +//| "Syntax error in context specifier: " +//| + rules.substring(start, limit)); +//| } +//| if (ante >= 0) { +//| parseSubPattern(start, ante, anteContext); +//| start = ante+1; +//| } +//| if (post >= 0) { +//| parseSubPattern(post+1, limit, postContext); +//| limit = post; +//| } +//| } +//| parseSubPattern(start, limit, text); +//| } +//| +//| private final void parseSubPattern(int start, int limit, +//| StringBuffer text) { +//| parseSubPattern(start, limit, text, null, SPECIALS); +//| } +//| +//| /** +//| * Parse a variable definition sub pattern. This kind of sub +//| * pattern differs in the set of characters that are considered +//| * special. In particular, the '[' and ']' characters are not +//| * special, since these are used in UnicodeSet patterns. 
+//| */ +//| private final void parseDefPattern(int start, int limit, +//| StringBuffer text) { +//| parseSubPattern(start, limit, text, null, DEF_SPECIALS); +//| } +//| +//| /** +//| * Parses the output pattern of a forward or reverse rule. Given the +//| * output pattern, return the output text and the position of the cursor, +//| * if any. Resolves all quotes and variables. +//| * @param rules the string to be parsed +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @param text the output text will be appended to this buffer +//| * @param cursorPos if this parameter is not null, then cursorPos[0] +//| * will be set to the cursor position, or -1 if there is none. If this +//| * parameter is null, then cursors will be disallowed. +//| */ +//| private final void parseOutputPattern(int start, int limit, +//| StringBuffer text, +//| int[] cursorPos) { +//| parseSubPattern(start, limit, text, cursorPos, SPECIALS); +//| } +//| +//| /** +//| * Parses a sub-pattern of a rule. Return the text and the position of the cursor, +//| * if any. Resolves all quotes and variables. +//| * @param rules the string to be parsed +//| * @param start the beginning index, inclusive; 0 <= start +//| * <= limit. +//| * @param limit the ending index, exclusive; start <= limit +//| * <= rules.length(). +//| * @param text the output text will be appended to this buffer +//| * @param cursorPos if this parameter is not null, then cursorPos[0] +//| * will be set to the cursor position, or -1 if there is none. If this +//| * parameter is null, then cursors will be disallowed. +//| * @param specials characters that must be quoted; typically either +//| * SPECIALS or DEF_SPECIALS. 
+//| */ +//| private void parseSubPattern(int start, int limit, +//| StringBuffer text, +//| int[] cursorPos, +//| String specials) { +//| boolean inQuote = false; +//| +//| if (start >= limit) { +//| throw new IllegalArgumentException("Empty expression in rule"); +//| } +//| if (cursorPos != null) { +//| cursorPos[0] = -1; +//| } +//| for (int i=start; i= 0) { +//| throw new IllegalArgumentException("Multiple cursors: " +//| + rules.substring(start, limit)); +//| } +//| cursorPos[0] = text.length(); +//| } else if (specials.indexOf(c) >= 0) { +//| throw new IllegalArgumentException("Unquoted special character: " +//| + rules.substring(start, limit)); +//| } else { +//| text.append(c); +//| } +//| } +//| } +//| +//| private static void validateVariableName(String name) { +//| if (indexOf(name, SPECIALS) >= 0) { +//| throw new IllegalArgumentException( +//| "Special character in variable name: " +//| + name); +//| } +//| } /** * Returns the single character value of the given variable name. Defined * names are recognized. - * - * NO LONGER SUPPORTED: - * If a Unicode category name is given, a standard character variable - * in the range firstCategoryVariable to lastCategoryVariable is returned, - * with value firstCategoryVariable + n, where n is the category - * number. * @exception IllegalArgumentException if the name is unknown. */ private Character getVariableDef(String name) { Character ch = (Character) data.variableNames.get(name); -//! if (ch == null) { -//! int id = UnicodeSet.getCategoryID(name); -//! if (id >= 0) { -//! ch = new Character((char) (firstCategoryVariable + id)); -//! data.variableNames.put(name, ch); -//! data.setVariables.put(ch, new UnicodeSet(id)); -//! } -//! } if (ch == null) { throw new IllegalArgumentException("Undefined variable: " + name); @@ -1084,6 +1334,10 @@ public class RuleBasedTransliterator extends Transliterator { * this method may employ some other algorithm for improved speed. 
*/ private final void determineVariableRange(String[] ruleArray) { + // As an initial implementation, we just run through all the + // characters, ignoring any quoting. This works since the quote + // mechanisms are outside the private use area. + Range r = new Range('\uE000', 0x1900); // Private use area r = r.largestUnusedSubrange(ruleArray); @@ -1121,7 +1375,9 @@ public class RuleBasedTransliterator extends Transliterator { String setOfChars) { for (int i=start; i= 0) { diff --git a/icu4j/src/com/ibm/text/TransliterationRule.java b/icu4j/src/com/ibm/text/TransliterationRule.java index a06801f3fd..55104c8610 100755 --- a/icu4j/src/com/ibm/text/TransliterationRule.java +++ b/icu4j/src/com/ibm/text/TransliterationRule.java @@ -21,9 +21,12 @@ import java.util.Dictionary; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $ + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.6 $ $Date: 2000/01/11 02:25:03 $ * * $Log: TransliterationRule.java,v $ + * Revision 1.6 2000/01/11 02:25:03 Alan + * Rewrite UnicodeSet and RBT parsers for better performance and new syntax + * * Revision 1.5 2000/01/04 21:43:57 Alan * Add rule indexing, and move masking check to TransliterationRuleSet. * @@ -134,6 +137,46 @@ class TransliterationRule { } } + + + + + + + /** + * @param input input string, including key and optional ante and + * post context + * @param anteContextPos offset into input to end of ante context, or + * -1 if none + * @param postContextPos offset into input to start of post context, + * or -1 if none + * @param output output string + * @param cursorPos offset into output at which cursor is located, + * or -1 if none. + */ + public TransliterationRule(String input, + int anteContextPos, int postContextPos, + String output, + int cursorPos) { + anteContextLength = (anteContextPos < 0) ? 0 : anteContextPos; + keyLength = (postContextPos < 0) ? input.length() - anteContextLength : + postContextPos - anteContextLength; + pattern = input; + this.output = output; + this.cursorPos = cursorPos < 0 ? output.length() : cursorPos; + if (anteContextPos > input.length() || postContextPos > input.length() || + cursorPos > output.length()) { + throw new IllegalArgumentException(); + } + } + + + + + + + + /** * Return the length of the key. Equivalent to getKey().length(). * @return the length of the match key. @@ -171,9 +214,14 @@ class TransliterationRule { * Internal method. Returns 8-bit index value for this rule. * This is the low byte of the first character of the key, * unless the first character of the key is a set. If it's a - * set, the index value is -1. 
+ * set, or otherwise can match multiple keys, the index value is -1. */ final int getIndexValue(Dictionary variables) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return -1; + } char c = pattern.charAt(anteContextLength); return variables.get(new Character(c)) == null ? (c & 0xFF) : -1; } @@ -185,9 +233,15 @@ class TransliterationRule { * It matches this rule if it matches the first character of the * key, or if the first character of the key is a set, and the set * contains any character with a low byte equal to the index - * value. + * value. If the rule contains only ante context, as in foo)>bar, + * then it will match any key. */ final boolean matchesIndexValue(int v, Dictionary variables) { + if (anteContextLength == pattern.length()) { + // A pattern with just ante context {such as foo)>bar} can + // match any key. + return true; + } char c = pattern.charAt(anteContextLength); UnicodeSet set = (UnicodeSet) variables.get(new Character(c)); return set == null ? (c & 0xFF) == v : set.containsIndexValue(v); @@ -238,15 +292,15 @@ class TransliterationRule { */ public String toString() { return getClass().getName() + '{' - + escape(anteContextLength > 0 ? ("[" + pattern.substring(0, anteContextLength) + - ']') : "") - + pattern.substring(anteContextLength, anteContextLength + keyLength) - + (anteContextLength + keyLength < pattern.length() ? - ("[" + pattern.substring(anteContextLength + keyLength) + ']') : "") - + " -> " - + (cursorPos < output.length() - ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) - : output) + + escape((anteContextLength > 0 ? ("(" + pattern.substring(0, anteContextLength) + + ") ") : "") + + pattern.substring(anteContextLength, anteContextLength + keyLength) + + (anteContextLength + keyLength < pattern.length() ? + (" (" + pattern.substring(anteContextLength + keyLength) + ")") : "") + + " > " + + (cursorPos < output.length() + ? 
(output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) + : output)) + '}'; } diff --git a/icu4j/src/com/ibm/text/UnicodeSet.java b/icu4j/src/com/ibm/text/UnicodeSet.java index c63c0de07c..975f2856fd 100755 --- a/icu4j/src/com/ibm/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/text/UnicodeSet.java @@ -1,6 +1,7 @@ package com.ibm.text; import java.text.*; +import java.util.Dictionary; /** * A mutable set of Unicode characters. Objects of this class @@ -225,7 +226,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.3 $ $Date: 2000/01/11 02:25:03 $ */ public class UnicodeSet { /** * The internal representation is a StringBuffer of even length. @@ -251,6 +252,9 @@ public class UnicodeSet { private static final int UNSUPPORTED_CATEGORY = 17; + private static final char VARIABLE_REF_OPEN = '{'; + private static final char VARIABLE_REF_CLOSE = '}'; + private static final int CATEGORY_COUNT = 29; /** @@ -293,25 +297,21 @@ public class UnicodeSet { * a syntax error. */ public UnicodeSet(String pattern) { - applyPattern(pattern, false); + applyPattern(pattern); } - /** - * Constructs a set from the given pattern, optionally ignoring - * white space. See the class description for the syntax of the - * pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored, except those preceded by '\u005C'. Spaces are - * those characters for which Character.isSpaceChar() - * is true. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. 
- */ - public UnicodeSet(String pattern, boolean ignoreSpaces) { - applyPattern(pattern, ignoreSpaces); + + + + + public UnicodeSet(String pattern, ParsePosition pos, + Dictionary varNameToChar, Dictionary varCharToSet) { + applyPattern(pattern, pos, varNameToChar, varCharToSet); } + + + /** * Constructs a set from the given Unicode character category. * @param category an integer indicating the character category as @@ -328,57 +328,15 @@ public class UnicodeSet { } /** - * Modifies this set to represent the set specified by the given - * pattern. See the class description for the syntax of the - * pattern language. + * Modifies this set to represent the set specified by the given pattern. + * See the class description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @exception IllegalArgumentException if the pattern * contains a syntax error. */ - public final void applyPattern(String pattern) { - applyPattern(pattern, false); - } - - /** - * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored. Spaces are those characters for which - * Character.isSpaceChar() is true. - * Characters preceded by '\\' are escaped, losing any special - * meaning they otherwise have. Spaces may be included by - * escaping them. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. - */ - public void applyPattern(String pattern, boolean ignoreSpaces) { + public void applyPattern(String pattern) { ParsePosition pos = new ParsePosition(0); - - // To ignore spaces, create a new pattern without spaces. We - // have to process all '\' escapes. 
If '\' is encountered, - // insert it and the following character (if any -- let parse - // deal with any syntax errors) in the pattern. This allows - // escaped spaces. - if (ignoreSpaces) { - StringBuffer pat = new StringBuffer(); - for (int i=0; ipattern + * @param pattern the string containing the pattern to be parsed. The + * portion of the string from pos.getIndex(), which must be a '[', to the + * corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. The + * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return + * from a successful parse, pos.getIndex() is either the character after the + * closing ']' of the parsed pattern, or pattern.length() if the closing ']' + * is the last character of the pattern string. + * @return a StringBuffer containing a pairs list for the parsed substring + * of pattern * @exception IllegalArgumentException if the parse fails. */ - private static StringBuffer parse(String pattern, ParsePosition pos) { + private static StringBuffer parse(String pattern, ParsePosition pos, + Dictionary varNameToChar, Dictionary varCharToSet) { - boolean invert = false; StringBuffer pairsBuf = new StringBuffer(); + boolean invert = false; - /** - * Nodes: 0 - idle, waiting for '[' - * 10 - like 11, but immediately after "[" or "[^" - * 11 - awaiting x, "]", "[...]", or "[:...:]" - * 21 - after x - * 23 - after x- - * - * The parsing state machine moves from node 0 through zero or more - * other nodes back to node 0, in a successful parse. + int lastChar = -1; // This is either a char (0..FFFF) or -1 + char lastOp = 0; + + /* This loop iterates over the characters in the pattern. We start at + * the position specified by pos. We exit the loop when either a + * matching closing ']' is seen, or we read all characters of the + * pattern. In the latter case an error will be thrown. 
*/ - int node = 0; - char first = 0; - int i; - /** - * This loop iterates over the characters in the pattern. We - * start at the position specified by pos. We exit the loop - * when either a matching closing ']' is seen, or we read all - * characters of the pattern. + /* Pattern syntax: + * pat := '[' '^'? elem* ']' + * elem := a | a '-' a | set | set op set + * set := pat | (a set variable) + * op := '&' | '-' + * a := (a character, possibly defined by a var) */ - for (i=pos.getIndex(); i= pattern.length()) { + if ((i+4) >= limit) { throw new IllegalArgumentException("Invalid \\u escape"); } c = '\u0000'; @@ -731,201 +762,143 @@ public class UnicodeSet { } } - /** - * Within this loop, we handle each of the four - * conditions: '[', ']', '-', other. The first three - * characters must not be escaped. + /* Parse variable references. These are treated as literals. If a + * variable refers to a UnicodeSet, nestedPairs is assigned here. + * Variable names are only parsed if varNameToChar is not null. + * Set variables are only looked up if varCharToSet is not null. */ + else if (varNameToChar != null && !isLiteral && c == VARIABLE_REF_OPEN) { + ++i; + int j = pattern.indexOf(VARIABLE_REF_CLOSE, i); + if (i == j || j < 0) { // empty or unterminated + throw new IllegalArgumentException("Illegal variable reference"); + } + String name = pattern.substring(i, j); + ++j; + Character ch = (Character) varNameToChar.get(name); + if (ch == null) { + throw new IllegalArgumentException("Undefined variable: " + + name); + } + c = ch.charValue(); + isLiteral = true; - /** - * An opening bracket indicates either the first bracket - * of the entire subpattern we are parsing, in which case - * we are in node 0 and move into node 10. We also check - * for an immediately following '^', indicating the - * complement of the following pattern. ('^' is any other - * position has no special meaning.) 
If we are not in - * node 0, '[' represents a nested subpattern that must be - * recursively parsed and checked for following operators - * ('&' or '|'). If two nested subpatterns follow one - * another with no operator, their union is formed, just - * as with any other elements that follow one another - * without intervening operator. The other thing we - * handle here is the syntax "[:Xx:]" or "[:X:]" that - * indicates a Unicode category or supercategory. + if (varCharToSet != null) { + UnicodeSet set = (UnicodeSet) varCharToSet.get(ch); + if (set != null) { + nestedPairs = set.pairs.toString(); + } + } + } + + /* An opening bracket indicates the first bracket of a nested + * subpattern, either a normal pattern or a category pattern. We + * recognize these here and set nestedPairs accordingly. */ - if (!isLiteral && c == '[') { - boolean parseOp = false; + else if (!isLiteral && c == '[') { + // Handle "[:...:]", representing a character category char d = charAfter(pattern, i); - // "[:...:]" represents a character category if (d == ':') { - if (node == 23) { - throw new IllegalArgumentException("Unexpected \"[:\""); - } - if (node == 21) { - addPair(pairsBuf, first, first); - node = 11; - } i += 2; int j = pattern.indexOf(":]", i); if (j < 0) { throw new IllegalArgumentException("Missing \":]\""); } - doUnion(pairsBuf, - getCategoryPairs(pattern.substring(i, j))); - i = j+1; - if (node == 10) { - node = 11; - parseOp = true; - } else if (node == 0) { + nestedPairs = getCategoryPairs(pattern.substring(i, j)); + i = j+1; // Make i point to ']' + if (mode == 3) { + // Entire pattern is a category; leave parse loop + pairsBuf.append(nestedPairs); break; } } else { - if (node == 0) { - node = 10; - if (d == '^') { - invert = true; - ++i; - } - } else { - // Nested '[' - pos.setIndex(i); - doUnion(pairsBuf, parse(pattern, pos) - .toString()); - i = pos.getIndex() - 1; // Subtract 1 to point at ']' - parseOp = true; - } + // Recurse to get the pairs for this nested 
set. + pos.setIndex(i); // Add 2 to point AFTER op + nestedPairs = parse(pattern, pos, varNameToChar, varCharToSet).toString(); + i = pos.getIndex() - 1; // - 1 to point at ']' } - /** - * parseOp is true after "[:...:]" or a nested - * "[...]". It is false only after the final closing - * ']'. If parseOp is true, we look past the closing - * ']' to see if we have an operator character. If - * so, we parse the subsequent "[...]" recursively, - * then perform the operation. We do this in a loop - * until there are no more operators. Note that this - * means the operators have equal precedence and are - * bound left-to-right. - */ - if (parseOp) { - for (;;) { - // Is the next character an operator? - char op = charAfter(pattern, i); - if (op == '-' || op == '&') { - pos.setIndex(i+2); // Add 2 to point AFTER op - String rhs = parse(pattern, pos).toString(); - if (op == '-') { - doDifference(pairsBuf, rhs); - } else if (op == '&') { - doIntersection(pairsBuf, rhs); - } - i = pos.getIndex() - 1; // - 1 to point at ']' - } else { - break; - } - } - } } - /** - * A closing bracket can only be a closing bracket for - * "[...]", since the closing bracket for "[:...:]" is - * taken care of when the initial "[:" is seen. When we - * see a closing bracket, we then know, if we were in node - * 21 (after x) or 23 (after x-) that nothing more is - * coming, and we add the last character(s) we saw to the - * set. Note that a trailing '-' assumes its literal - * meaning, just as a leading '-' after "[" or "[^". + /* At this point we have either a character c, or a nested set. If + * we have encountered a nested set, either embedded in the pattern, + * or as a variable, we have a non-null nestedPairs, and c should be + * ignored. Otherwise c is the current character, and isLiteral + * indicates whether it is an escaped literal (or variable) or a + * normal unescaped character. Unescaped characters '-', '&', and + * ']' have special meanings. 
*/ - else if (!isLiteral && c == ']') { - if (node == 0) { - throw new IllegalArgumentException("Unexpected ']'"); - } - if (node == 21 || node == 23) { - addPair(pairsBuf, first, first); - if (node == 23) { - addPair(pairsBuf, '-', '-'); + if (nestedPairs != null) { + if (lastChar >= 0) { + if (lastOp != 0) { + throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp); } + addPair(pairsBuf, (char)lastChar, (char)lastChar); + lastChar = -1; } - node = 0; + switch (lastOp) { + case '-': + doDifference(pairsBuf, nestedPairs); + break; + case '&': + doIntersection(pairsBuf, nestedPairs); + break; + case 0: + doUnion(pairsBuf, nestedPairs); + break; + } + lastOp = 0; + } else if (!isLiteral && c == ']') { + // Final closing delimiter. This is the only way we leave this + // loop if the pattern is well-formed. break; - } - - /** - * '-' has the following interpretations: 1. Within - * "[...]", between two letters, it indicates a range. - * 2. Between two nested bracket patterns, "[[...]-[...]", - * it indicates asymmetric difference. 3. At the start of - * a bracket pattern, "[-...]", "[^-...]", it indicates - * the literal character '-'. 4. At the end of a bracket - * pattern, "[...-]", it indicates the literal character - * '-'. - * - * We handle cases 1 and 3 here. Cases 2 and 4 are - * handled in the ']' parsing code. - */ - else if (!isLiteral && c == '-') { - if (node == 10) { - addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]" - } else if (node == 21) { - node = 23; - } else { - throw new IllegalArgumentException("Unexpected '-'"); - } - } - - /** - * If we fall through to this point, we have a literal - * character, either one that has been escaped with a - * backslash, escaped with a backslash u, or that isn't - * a special '[', ']', or '-'. - * - * Literals can either start a range "x-...", end a range, - * "...-x", or indicate a single character "x". 
- */ - else { - if (node == 10 || node == 11) { - first = c; - node = 21; - } else if (node == 21) { - addPair(pairsBuf, first, first); - first = c; - node = 21; - } else if (node == 23) { - if (c < first) { - throw new IllegalArgumentException("Bad range"); - } - addPair(pairsBuf, first, c); - node = 11; - } else { - throw new IllegalArgumentException("Expected '[', got '" + c + '\''); + } else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) { + lastOp = c; + } else if (lastOp == '-') { + addPair(pairsBuf, (char)lastChar, c); + lastOp = 0; + lastChar = -1; + } else if (lastOp != 0) { + // We have & or & + throw new IllegalArgumentException("Unquoted " + lastOp); + } else { + if (lastChar >= 0) { + // We have + addPair(pairsBuf, (char)lastChar, (char)lastChar); } + lastChar = c; } } - if (node != 0) { - throw new IllegalArgumentException("Missing ']'"); + // Handle unprocessed stuff preceding the closing ']' + if (lastOp == '-') { + // Trailing '-' is treated as literal + addPair(pairsBuf, lastOp, lastOp); + } else if (lastOp == '&') { + throw new IllegalArgumentException("Unquoted trailing " + lastOp); + } + if (lastChar >= 0) { + addPair(pairsBuf, (char)lastChar, (char)lastChar); } /** - * i indexes the last character we parsed or is - * pattern.length(). In the latter case, the node will not be - * zero, since we have run off the end without finding a - * closing ']'. Therefore, the above statement will have - * thrown an exception, and we'll never get here. If we get - * here, we know i < pattern.length(), and we set the - * ParsePosition to the next character to be parsed. - */ - pos.setIndex(i+1); - - /** - * If we saw a '^' after the initial '[' of this pattern, then - * perform the complement. (Inversion after '[:' is handled - * elsewhere.) + * If we saw a '^' after the initial '[' of this pattern, then perform + * the complement. (Inversion after '[:' is handled elsewhere.) 
*/ if (invert) { doComplement(pairsBuf); } + /** + * i indexes the last character we parsed or is pattern.length(). In + * the latter case, we have run off the end without finding a closing + * ']'. Otherwise, we know i < pattern.length(), and we set the + * ParsePosition to the next character to be parsed. + */ + if (i == limit) { + throw new IllegalArgumentException("Missing ']'"); + } + pos.setIndex(i+1); + return pairsBuf; } @@ -1352,7 +1325,6 @@ public class UnicodeSet { /** * Returns the character after the given position, or '\uFFFF' if * there is none. - */ private static final char charAfter(String str, int i) { return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF';