Add segment support.

X-SVN-Rev: 1165
2000-04-19 16:37:38 +00:00 · 2000-04-19 16:37:38 +00:00 · 2947282e42
commit 2947282e42
parent 9a19714271
6 changed files with 882 additions and 420 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ 
- * $Date: 2000/03/22 02:00:08 $ 
+ * $Date: 2000/04/19 16:37:38 $ 
- * $Revision: 1.14 $
+ * $Revision: 1.15 $
 *
 *****************************************************************************************
 */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
        expect(hex3, "012", "&#x30;&#x31;&#x32;");
    }
    /**
     * Test segments and segment references.
     */
    public void TestSegments() {
        // Flat table of test cases.  Every case occupies three consecutive
        // slots: the rule text, the input string, and the expected result.
        String[] cases = {
            "$([a-z]$) . $([0-9]$) > $2-$1",
            "abc.123.xyz.456",
            "ab1-c23.xy4-z56",
        };

        for (int base = 0; base < cases.length; base += 3) {
            String rules = cases[base];
            logln("Pattern: " + Utility.escape(rules));

            // Build a transliterator from the rule text and run it over the
            // input, comparing against the expected output.
            Transliterator translit = new RuleBasedTransliterator("<ID>", rules);
            String input = cases[base + 1];
            String expected = cases[base + 2];
            expect(translit, input, expected);
        }
    }
    //======================================================================
    // Support methods
    //======================================================================
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
+ * $Date: 2000/04/19 16:34:18 $ 
- * $Revision: 1.18 $
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: RuleBasedTransliterator.java,v $
 * Revision 1.19  2000/04/19 16:34:18  alan
 * Add segment support.
 *
 * Revision 1.18  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
                }
            } else {
                // Delegate replacement to TransliterationRule object
-                limit += r.replace(text, cursor);
+                limit += r.replace(text, cursor, data);
                // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
                // limit += r.getOutput().length() - r.getKeyLength();
                cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
        public UnicodeSet[] setVariables;
        /**
-         * The character represented by setVariables[0].
+         * The character that represents setVariables[0].  Characters
         * setVariablesBase through setVariablesBase +
         * setVariables.length - 1 represent UnicodeSet objects.
         */
        public char setVariablesBase;
        /**
-         * Return the UnicodeSet associated with the given character, or
+         * Return the UnicodeSet represented by the given character, or
         * null if none.
         */
        public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
            return (i >= 0 && i < setVariables.length)
                ? setVariables[i] : null;
        }
        /**
         * The character that represents segment 1.  Characters segmentBase
         * through segmentBase + 8 represent segments 1 through 9.
         */
        public char segmentBase;
        /**
         * Return the zero-based index of the segment represented by the given
         * character, or -1 if none.  Repeat: This is a zero-based return value,
         * 0..8, even though these are notated "$1".."$9".
         */
        public int lookupSegmentReference(char c) {
            // Distance of c from the character that stands for segment 1.
            int offset = c - segmentBase;
            if (offset < 0 || offset >= 9) {
                // Outside the nine characters reserved for segment references.
                return -1;
            }
            return offset;
        }
    }
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
        private static final char SET_CLOSE           = ']';
        private static final char CURSOR_POS          = '|';
        // Segments of the input string are delimited by "$(" and "$)".  In the
        // output string these segments are referenced as "$1" through "$9".
        private static final char SEGMENT_REF         = '$';
        private static final char SEGMENT_OPEN        = '(';
        private static final char SEGMENT_CLOSE       = ')';
        /**
         * @param rules list of rules, separated by semicolon characters
         * @exception IllegalArgumentException if there is a syntax error in the
@ -633,33 +660,34 @@ public class RuleBasedTransliterator extends Transliterator {
        }
        /**
-         * MAIN PARSER.  Parse the next rule in the given rule string, starting
+         * A class representing one side of a rule.  This class knows how to
-         * at pos.  Return the index after the last character parsed.  Do not
+         * parse half of a rule.  It is tightly coupled to the method
-         * parse characters at or after limit.
+         * RuleBasedTransliterator.Parser.parseRule().
         *
         * Important:  The character at pos must be a non-whitespace character
         * that is not the comment character.
         *
         * This method handles quoting, escaping, and whitespace removal.  It
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
         */
-        private int parseRule(String rule, int pos, int limit) {
+        static class RuleHalf {
-            // Locate the left side, operator, and right side
+
            public String text;
            public int cursor = -1; // position of cursor in text
            public int ante = -1;   // position of ante context marker ')' in text
            public int post = -1;   // position of post context marker '(' in text
            // Record the position of the segment substrings and references.  A
            // given side should have segments or segment references, but not
            // both.
            public Vector segments = null; // ref substring start,limits
            public int maxRef = -1; // index of largest ref (1..9)
            /**
             * Parse one side of a rule, stopping at either the limit,
             * the END_OF_RULE character, or an operator.  Return
             * the pos of the terminating character (or limit).
             */
            public int parse(String rule, int pos, int limit,
                             RuleBasedTransliterator.Parser parser) {
                int start = pos;
            char operator = 0;
                StringBuffer buf = new StringBuffer();
-            int cursor = -1; // position of cursor in buf
+                int postClose = -1; // position of post context close ')' in text
            int ante = -1;   // position of ante context marker ')' in buf
            int post = -1;   // position of post context marker '(' in buf
            int postClose = -1; // position of post context close ')' in buf
            // Assigned to buf and its adjuncts after the LHS has been
            // parsed.  Thereafter, buf etc. refer to the RHS.
            String left = null;
            int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
            main:
                while (pos < limit) {
@ -709,29 +737,47 @@ public class RuleBasedTransliterator extends Transliterator {
                        continue;
                    }
                    if (OPERATORS.indexOf(c) >= 0) {
-                    if (operator != 0) {
+                        --pos; // Backup to point to operator
-                        syntaxError("Unquoted " + c, rule, start);
+                        break main;
                    }
-                    // Found an operator char.  Check for forward-reverse operator.
+                    // Handle segment definitions "$(" "$)" and references "$1"
-                    if (c == REVERSE_RULE_OP &&
+                    // .. "$9".
-                        (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
+                    if (c == SEGMENT_REF) {
-                        ++pos;
+                        // After a SEGMENT_REF, must see SEGMENT_OPEN,
-                        operator = FWDREV_RULE_OP;
+                        // SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
                        // whitespace
                        if (pos == limit) {
                            syntaxError("Trailing " + c, rule, start);
                        }
                        c = rule.charAt(pos++);
                        if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
                            // Parse "$(", "$)"
                            if (segments == null) {
                                segments = new Vector();
                            }
                            if ((c == SEGMENT_OPEN) !=
                                (segments.size() % 2 == 0)) {
                                syntaxError("Mismatched segment delimiters",
                                            rule, start);
                            }
                            segments.addElement(new Integer(buf.length()));
                        } else {
-                        operator = c;
+                            // Parse "$1" "$2" .. "$9"
                            int r = Character.digit(c, 10);
                            if (r < 1 || r > 9) {
                                syntaxError("Illegal char after " + SEGMENT_REF,
                                            rule, start);
                            }
                            if (r > maxRef) {
                                maxRef = r;
                            }
                            buf.append((char) (parser.data.segmentBase + r - 1));
                        }
                    left = buf.toString(); // lhs
                    leftCursor = cursor;
                    leftAnte = ante;
                    leftPost = post;
                    leftPostClose = postClose;
                    buf.setLength(0);
                    cursor = ante = post = postClose = -1;
                        continue;
                    }
                    switch (c) {
                    case END_OF_RULE:
                        --pos; // Backup to point to END_OF_RULE
                        break main;
                    case VARIABLE_REF_OPEN:
                        {
@ -741,7 +787,7 @@ public class RuleBasedTransliterator extends Transliterator {
                            }
                            String name = rule.substring(pos, j);
                            pos = j+1;
-                        buf.append(getVariableDef(name));
+                            buf.append(parser.getVariableDef(name));
                        }
                        break;
                    case CONTEXT_OPEN:
@ -770,7 +816,7 @@ public class RuleBasedTransliterator extends Transliterator {
                        break;
                    case SET_OPEN:
                        ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
-                    buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
+                        buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
                        pos = pp.getIndex();
                        break;
                    case VARIABLE_REF_CLOSE:
@ -787,77 +833,155 @@ public class RuleBasedTransliterator extends Transliterator {
                        break;
                    }
                }
            if (operator == 0) {
                syntaxError("No operator", rule, start);
            }
                // Check context close parameters
-            if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
+                if (postClose >= 0 && postClose != buf.length()) {
                (postClose >= 0 && postClose != buf.length())) {
                    syntaxError("Extra text after ]", rule, start);
                }
-            // Context is only allowed on the input side; that is, the left side
+                text = buf.toString();
-            // for forward rules.  Cursors are only allowed on the output side;
+                return pos;
-            // that is, the right side for forward rules.  Bidirectional rules
+            }
            // ignore elements that do not apply.
-            switch (operator) {
+            /**
-            case VARIABLE_DEF_OP:
+             * Remove context.
             */
            void removeContext() {
                // A negative marker means that context is absent, in which
                // case the corresponding end of the text is kept as-is.
                int from = (ante >= 0) ? ante : 0;
                int to = (post >= 0) ? post : text.length();
                text = text.substring(from, to);

                // Both context markers are now gone.
                ante = -1;
                post = -1;
            }
            /**
             * Create and return an int[] array of segments.
             */
            int[] getSegments() {
                // No segment delimiters were recorded for this half.
                if (segments == null) {
                    return null;
                }

                // Unbox each recorded position into a primitive array, in the
                // order the positions were added.
                int count = segments.size();
                int[] positions = new int[count];
                for (int k = 0; k < count; ++k) {
                    Number boxed = (Number) segments.elementAt(k);
                    positions[k] = boxed.intValue();
                }
                return positions;
            }
        }
        /**
         * MAIN PARSER.  Parse the next rule in the given rule string, starting
         * at pos.  Return the index after the last character parsed.  Do not
         * parse characters at or after limit.
         *
         * Important:  The character at pos must be a non-whitespace character
         * that is not the comment character.
         *
         * This method handles quoting, escaping, and whitespace removal.  It
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
         *
         * This method is tightly coupled to the inner class RuleHalf.
         */
        private int parseRule(String rule, int pos, int limit) {
            // Locate the left side, operator, and right side
            int start = pos;
            char operator = 0;
            RuleHalf left  = new RuleHalf();
            RuleHalf right = new RuleHalf();
            pos = left.parse(rule, pos, limit, this);
            if (pos == limit ||
                OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
                syntaxError("No operator", rule, start);
            }
            // Found an operator char.  Check for forward-reverse operator.
            if (operator == REVERSE_RULE_OP &&
                (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
                ++pos;
                operator = FWDREV_RULE_OP;
            }
            pos = right.parse(rule, pos, limit, this);
            if (pos < limit) {
                if (rule.charAt(pos) == END_OF_RULE) {
                    ++pos;
                } else {
                    // RuleHalf parser must have terminated at an operator
                    syntaxError("Unquoted operator", rule, start);
                }
            }
            if (operator == VARIABLE_DEF_OP) {
                // LHS is the name.  RHS is a single character, either a literal
                // or a set (already parsed).  If RHS is longer than one
                // character, it is either a multi-character string, or multiple
                // sets, or a mixture of chars and sets -- syntax error.
-                if (buf.length() != 1) {
+                if (right.text.length() != 1) {
                    syntaxError("Malformed RHS", rule, start);
                }
-                if (data.variableNames.get(left) != null) {
+                if (data.variableNames.get(left.text) != null) {
                    syntaxError("Duplicate definition of {" +
-                                left + "}", rule, start);
+                                left.text + "}", rule, start);
                }
-                data.variableNames.put(left, new Character(buf.charAt(0)));
+                data.variableNames.put(left.text, new Character(right.text.charAt(0)));
-                break;
+                return pos;
            case FORWARD_RULE_OP:
                if (direction == FORWARD) {
                    if (ante >= 0 || post >= 0 || leftCursor >= 0) {
                        syntaxError("Malformed rule", rule, start);
            }
                    data.ruleSet.addRule(new TransliterationRule(
                                             left, leftAnte, leftPost,
                                             buf.toString(), cursor));
                } // otherwise ignore the rule; it's not the direction we want
                break;
-            case REVERSE_RULE_OP:
+            // If the direction we want doesn't match the rule
            // direction, do nothing.
            if (operator != FWDREV_RULE_OP &&
                ((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
                return pos;
            }
            // Transform the rule into a forward rule by swapping the
            // sides if necessary.
            if (direction == REVERSE) {
-                    if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
+                RuleHalf temp = left;
                left = right;
                right = temp;
            }
            // Remove non-applicable elements in forward-reverse
            // rules.  Bidirectional rules ignore elements that do not
            // apply.
            if (operator == FWDREV_RULE_OP) {
                right.removeContext();
                right.segments = null;
                left.cursor = left.maxRef = -1;
            }
            // Context is only allowed on the input side.  Cursors are only
            // allowed on the output side.  Segment delimiters can only appear
            // on the left, and references on the right.
            if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
                right.segments != null || left.maxRef >= 0) {
                syntaxError("Malformed rule", rule, start);
            }
                    data.ruleSet.addRule(new TransliterationRule(
                                             buf.toString(), ante, post,
                                             left, leftCursor));
                } // otherwise ignore the rule; it's not the direction we want
                break;
-            case FWDREV_RULE_OP:
+            // Check integrity of segments and segment references.  Each
-                if (direction == FORWARD) {
+            // segment's start must have a corresponding limit, and the
-                    // The output side is the right; trim off any context
+            // references must not refer to segments that do not exist.
-                    String output = buf.toString().substring(ante < 0 ? 0 : ante,
+            int[] segmentsArray = null;
-                                                             post < 0 ? buf.length() : post);
+            if (left.segments != null) {
-                    data.ruleSet.addRule(new TransliterationRule(
+                int n = left.segments.size();
-                                             left, leftAnte, leftPost,
+                if (n % 2 != 0) {
-                                             output, cursor));
+                    syntaxError("Odd length segments", rule, start);
                } else {
                    // The output side is the left; trim off any context
                    String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
                                                   leftPost < 0 ? left.length() : leftPost);
                    data.ruleSet.addRule(new TransliterationRule(
                                             buf.toString(), ante, post,
                                             output, leftCursor));
                }
-                break;
+                n /= 2;
                if (right.maxRef > n) {
                    syntaxError("Undefined segment reference " + right.maxRef, rule, start);
                }
            }
            data.ruleSet.addRule(new TransliterationRule(
                                         left.text, left.ante, left.post,
                                         right.text, right.cursor,
                                         left.getSegments(), data));
            return pos;
        }
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
         * @param rule pattern string
         * @param start position of first character of current rule
         */
-        private static final void syntaxError(String msg, String rule, int start) {
+        static final void syntaxError(String msg, String rule, int start) {
            int end = quotedIndexOf(rule, start, rule.length(), ";");
            if (end < 0) {
                end = rule.length();
            }
-            throw new IllegalArgumentException(msg + " in " +
+            throw new IllegalArgumentException(msg + " in \"" +
-                                               rule.substring(start, end));
+                                               Utility.escape(rule.substring(start, end)) + '"');
        }
        /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
                    "No private use characters available for variables");
            }
-            data.setVariablesBase = variableNext = r.start;
+            // Allocate 9 characters for segment references 1 through 9
            data.segmentBase = r.start;
            data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
            variableLimit = (char) (r.start + r.length);
            if (variableNext >= variableLimit) {
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
+ * $Date: 2000/04/19 16:34:18 $ 
- * $Revision: 1.15 $
+ * $Revision: 1.16 $
 *
 *****************************************************************************************
 */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined. 
 *
 * <p>A rule may contain segments in its input string and segment references in
 * its output string.  A segment is a substring of the input pattern, indicated
 * by an offset and limit.  The segment may span the preceding or following
 * context.  A segment reference is a special character in the output string
 * that causes a segment of the input string (not the input pattern) to be
 * copied to the output string.  The range of special characters that represent
 * segment references is defined by RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
 *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: TransliterationRule.java,v $
 * Revision 1.16  2000/04/19 16:34:18  alan
 * Add segment support.
 *
 * Revision 1.15  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -121,6 +135,21 @@ class TransliterationRule {
     */
    private String output;
    /**
     * Array of segments.  These are segments of the input string that may be
     * referenced and appear in the output string.  Each segment is stored as an
     * offset, limit pair.  Segments are referenced by a 1-based index;
     * reference i thus includes characters at offset segments[2*i-2] to
     * segments[2*i-1]-1 in the pattern string.
     *
     * In the output string, a segment reference is indicated by a character in
     * a special range, as defined by RuleBasedTransliterator.Data.
     *
     * Most rules have no segments, in which case segments is null, and the
     * output string need not be checked for segment reference characters.
     */
    private int[] segments;
    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
     * @param segs array of 2n integers.  Each of n pairs consists of offset,
     * limit for a segment of the input string.  Characters in the output string
     * refer to these segments if they are in a special range determined by the
     * associated RuleBasedTransliterator.Data object.  May be null if there are
     * no segments.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
-                               int cursorPos) {
+                               int cursorPos,
                               int[] segs) {
        // Do range checks only when warranted to save time
        if (anteContextPos < 0) {
            anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
        }
        pattern = input;
        this.output = output;
        // We don't validate the segments array.  The caller must
        // guarantee that the segments are well-formed.
        this.segments = segs;
    }
    /**
     * Construct a new rule with the given input, output text, and other
     * attributes.  A cursor position may be specified for the output text.
     * @param input input string, including key and optional ante and
     * post context
     * @param anteContextPos offset into input to end of ante context, or -1 if
     * none.  Must be <= input.length() if not -1.
     * @param postContextPos offset into input to start of post context, or -1
     * if none.  Must be <= input.length() if not -1, and must be >=
     * anteContextPos.
     * @param output output string
     * @param cursorPos offset into output at which cursor is located, or -1 if
     * none.  If less than zero, then the cursor is placed after the
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
     */
    public TransliterationRule(String input,
                               int anteContextPos,
                               int postContextPos,
                               String output,
                               int cursorPos) {
        // Convenience form for rules without segments: delegate to the
        // six-argument constructor with a null segment array.
        this(input,
             anteContextPos,
             postContextPos,
             output,
             cursorPos,
             null);
    }
    /**
@ -238,11 +301,34 @@ class TransliterationRule {
     * matches.  This is the offset to the point after the ante
     * context, if any, and before the match string and any post
     * context.
     * @param data the RuleBasedTransliterator.Data object specifying
     * context for this transliterator.
     * @return the change in the length of the text
     */
-    int replace(Replaceable text, int offset) {
+    public int replace(Replaceable text, int offset,
-        text.replace(offset, offset + keyLength, output);
+                       RuleBasedTransliterator.Data data) {
-        return output.length() - keyLength;
+        String out;
        if (segments == null) {
            out = output;
        } else {
            int textStart = offset - anteContextLength;
            StringBuffer buf = new StringBuffer();
            for (int i=0; i<output.length(); ++i) {
                char c = output.charAt(i);
                int b = data.lookupSegmentReference(c);
                if (b < 0) {
                    buf.append(c);
                } else {
                    for (int j=textStart + segments[2*b];
                         j<textStart + segments[2*b+1]; ++j) {
                        buf.append(text.charAt(j));
                    }
                }
            }
            out = buf.toString();
        }
        text.replace(offset, offset + keyLength, out);
        return out.length() - keyLength;
    }
    /**
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ 
- * $Date: 2000/03/22 02:00:08 $ 
+ * $Date: 2000/04/19 16:37:38 $ 
- * $Revision: 1.14 $
+ * $Revision: 1.15 $
 *
 *****************************************************************************************
 */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
        expect(hex3, "012", "&#x30;&#x31;&#x32;");
    }
    /**
     * Test segments and segment references.
     */
    public void TestSegments() {
        // Flat table of test cases.  Every case occupies three consecutive
        // slots: the rule text, the input string, and the expected result.
        String[] cases = {
            "$([a-z]$) . $([0-9]$) > $2-$1",
            "abc.123.xyz.456",
            "ab1-c23.xy4-z56",
        };

        for (int base = 0; base < cases.length; base += 3) {
            String rules = cases[base];
            logln("Pattern: " + Utility.escape(rules));

            // Build a transliterator from the rule text and run it over the
            // input, comparing against the expected output.
            Transliterator translit = new RuleBasedTransliterator("<ID>", rules);
            String input = cases[base + 1];
            String expected = cases[base + 2];
            expect(translit, input, expected);
        }
    }
    //======================================================================
    // Support methods
    //======================================================================
--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
+ * $Date: 2000/04/19 16:34:18 $ 
- * $Revision: 1.18 $
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: RuleBasedTransliterator.java,v $
 * Revision 1.19  2000/04/19 16:34:18  alan
 * Add segment support.
 *
 * Revision 1.18  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
                }
            } else {
                // Delegate replacement to TransliterationRule object
-                limit += r.replace(text, cursor);
+                limit += r.replace(text, cursor, data);
                // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
                // limit += r.getOutput().length() - r.getKeyLength();
                cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
        public UnicodeSet[] setVariables;
        /**
-         * The character represented by setVariables[0].
+         * The character that represents setVariables[0].  Characters
         * setVariablesBase through setVariablesBase +
         * setVariables.length - 1 represent UnicodeSet objects.
         */
        public char setVariablesBase;
        /**
-         * Return the UnicodeSet associated with the given character, or
+         * Return the UnicodeSet represented by the given character, or
         * null if none.
         */
        public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
            return (i >= 0 && i < setVariables.length)
                ? setVariables[i] : null;
        }
        /**
         * The character that represents segment 1.  Characters segmentBase
         * through segmentBase + 8 represent segments 1 through 9.
         */
        public char segmentBase;
        /**
         * Map a character to the segment it refers to.  The nine characters
         * starting at segmentBase stand for the references written "$1"
         * through "$9" in rule text.
         *
         * @param c the character to test
         * @return the ZERO-based segment index, 0..8 (so "$1" yields 0 and
         * "$9" yields 8), or -1 if c is not a segment reference character
         */
        public int lookupSegmentReference(char c) {
            int index = c - segmentBase;
            if (index < 0 || index >= 9) {
                return -1;
            }
            return index;
        }
    }
@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
        private static final char SET_CLOSE           = ']';
        private static final char CURSOR_POS          = '|';
        // Segments of the input string are delimited by "$(" and "$)".  In the
        // output string these segments are referenced as "$1" through "$9".
        private static final char SEGMENT_REF         = '$';
        private static final char SEGMENT_OPEN        = '(';
        private static final char SEGMENT_CLOSE       = ')';
        /**
         * @param rules list of rules, separated by semicolon characters
         * @exception IllegalArgumentException if there is a syntax error in the
@ -633,33 +660,34 @@ public class RuleBasedTransliterator extends Transliterator {
        }
        /**
-         * MAIN PARSER.  Parse the next rule in the given rule string, starting
+         * A class representing one side of a rule.  This class knows how to
-         * at pos.  Return the index after the last character parsed.  Do not
+         * parse half of a rule.  It is tightly coupled to the method
-         * parse characters at or after limit.
+         * RuleBasedTransliterator.Parser.parseRule().
         *
         * Important:  The character at pos must be a non-whitespace character
         * that is not the comment character.
         *
         * This method handles quoting, escaping, and whitespace removal.  It
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
         */
-        private int parseRule(String rule, int pos, int limit) {
+        static class RuleHalf {
-            // Locate the left side, operator, and right side
+
            public String text;
            public int cursor = -1; // position of cursor in text
            public int ante = -1;   // position of ante context marker ')' in text
            public int post = -1;   // position of post context marker '(' in text
            // Record the position of the segment substrings and references.  A
            // given side should have segments or segment references, but not
            // both.
            public Vector segments = null; // ref substring start,limits
            public int maxRef = -1; // index of largest ref (1..9)
            /**
             * Parse one side of a rule, stopping at either the limit,
             * the END_OF_RULE character, or an operator.  Return
             * the pos of the terminating character (or limit).
             */
            public int parse(String rule, int pos, int limit,
                             RuleBasedTransliterator.Parser parser) {
                int start = pos;
            char operator = 0;
                StringBuffer buf = new StringBuffer();
-            int cursor = -1; // position of cursor in buf
+                int postClose = -1; // position of post context close ')' in text
            int ante = -1;   // position of ante context marker ')' in buf
            int post = -1;   // position of post context marker '(' in buf
            int postClose = -1; // position of post context close ')' in buf
            // Assigned to buf and its adjuncts after the LHS has been
            // parsed.  Thereafter, buf etc. refer to the RHS.
            String left = null;
            int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
            main:
                while (pos < limit) {
@ -709,29 +737,47 @@ public class RuleBasedTransliterator extends Transliterator {
                        continue;
                    }
                    if (OPERATORS.indexOf(c) >= 0) {
-                    if (operator != 0) {
+                        --pos; // Backup to point to operator
-                        syntaxError("Unquoted " + c, rule, start);
+                        break main;
                    }
-                    // Found an operator char.  Check for forward-reverse operator.
+                    // Handle segment definitions "$(" "$)" and references "$1"
-                    if (c == REVERSE_RULE_OP &&
+                    // .. "$9".
-                        (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
+                    if (c == SEGMENT_REF) {
-                        ++pos;
+                        // After a SEGMENT_REF, must see SEGMENT_OPEN,
-                        operator = FWDREV_RULE_OP;
+                        // SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
                        // whitespace
                        if (pos == limit) {
                            syntaxError("Trailing " + c, rule, start);
                        }
                        c = rule.charAt(pos++);
                        if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
                            // Parse "$(", "$)"
                            if (segments == null) {
                                segments = new Vector();
                            }
                            if ((c == SEGMENT_OPEN) !=
                                (segments.size() % 2 == 0)) {
                                syntaxError("Mismatched segment delimiters",
                                            rule, start);
                            }
                            segments.addElement(new Integer(buf.length()));
                        } else {
-                        operator = c;
+                            // Parse "$1" "$2" .. "$9"
                            int r = Character.digit(c, 10);
                            if (r < 1 || r > 9) {
                                syntaxError("Illegal char after " + SEGMENT_REF,
                                            rule, start);
                            }
                            if (r > maxRef) {
                                maxRef = r;
                            }
                            buf.append((char) (parser.data.segmentBase + r - 1));
                        }
                    left = buf.toString(); // lhs
                    leftCursor = cursor;
                    leftAnte = ante;
                    leftPost = post;
                    leftPostClose = postClose;
                    buf.setLength(0);
                    cursor = ante = post = postClose = -1;
                        continue;
                    }
                    switch (c) {
                    case END_OF_RULE:
                        --pos; // Backup to point to END_OF_RULE
                        break main;
                    case VARIABLE_REF_OPEN:
                        {
@ -741,7 +787,7 @@ public class RuleBasedTransliterator extends Transliterator {
                            }
                            String name = rule.substring(pos, j);
                            pos = j+1;
-                        buf.append(getVariableDef(name));
+                            buf.append(parser.getVariableDef(name));
                        }
                        break;
                    case CONTEXT_OPEN:
@ -770,7 +816,7 @@ public class RuleBasedTransliterator extends Transliterator {
                        break;
                    case SET_OPEN:
                        ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
-                    buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
+                        buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
                        pos = pp.getIndex();
                        break;
                    case VARIABLE_REF_CLOSE:
@ -787,77 +833,155 @@ public class RuleBasedTransliterator extends Transliterator {
                        break;
                    }
                }
            if (operator == 0) {
                syntaxError("No operator", rule, start);
            }
                // Check context close parameters
-            if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
+                if (postClose >= 0 && postClose != buf.length()) {
                (postClose >= 0 && postClose != buf.length())) {
                    syntaxError("Extra text after ]", rule, start);
                }
-            // Context is only allowed on the input side; that is, the left side
+                text = buf.toString();
-            // for forward rules.  Cursors are only allowed on the output side;
+                return pos;
-            // that is, the right side for forward rules.  Bidirectional rules
+            }
            // ignore elements that do not apply.
-            switch (operator) {
+            /**
-            case VARIABLE_DEF_OP:
+             * Remove context.
             */
            void removeContext() {
                // Keep only the text between the ante context marker and the
                // post context marker.  A negative marker means that context
                // is absent, so the kept range extends to that end of text.
                int keepStart = (ante >= 0) ? ante : 0;
                int keepLimit = (post >= 0) ? post : text.length();
                text = text.substring(keepStart, keepLimit);
                ante = -1;
                post = -1;
                // NOTE(review): cursor and segments are not re-based here even
                // though text may have lost a leading ante context -- confirm
                // that callers never combine a cursor with ante context.
            }
            /**
             * Copy the recorded segment offsets into a new int[] array.
             *
             * @return a new array holding the values stored in the segments
             * Vector, in order, or null if segments is null
             */
            int[] getSegments() {
                if (segments == null) {
                    return null;
                }
                final int count = segments.size();
                int[] offsets = new int[count];
                int k = 0;
                while (k < count) {
                    // Elements are stored boxed; unwrap to a primitive int
                    Number boxed = (Number) segments.elementAt(k);
                    offsets[k] = boxed.intValue();
                    ++k;
                }
                return offsets;
            }
        }
        /**
         * MAIN PARSER.  Parse the next rule in the given rule string, starting
         * at pos.  Return the index after the last character parsed.  Do not
         * parse characters at or after limit.
         *
         * Important:  The character at pos must be a non-whitespace character
         * that is not the comment character.
         *
         * This method handles quoting, escaping, and whitespace removal.  It
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
         *
         * This method is tightly coupled to the inner class RuleHalf.
         */
        private int parseRule(String rule, int pos, int limit) {
            // Locate the left side, operator, and right side
            int start = pos;
            char operator = 0;
            RuleHalf left  = new RuleHalf();
            RuleHalf right = new RuleHalf();
            pos = left.parse(rule, pos, limit, this);
            if (pos == limit ||
                OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
                syntaxError("No operator", rule, start);
            }
            // Found an operator char.  Check for forward-reverse operator.
            if (operator == REVERSE_RULE_OP &&
                (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
                ++pos;
                operator = FWDREV_RULE_OP;
            }
            pos = right.parse(rule, pos, limit, this);
            if (pos < limit) {
                if (rule.charAt(pos) == END_OF_RULE) {
                    ++pos;
                } else {
                    // RuleHalf parser must have terminated at an operator
                    syntaxError("Unquoted operator", rule, start);
                }
            }
            if (operator == VARIABLE_DEF_OP) {
                // LHS is the name.  RHS is a single character, either a literal
                // or a set (already parsed).  If RHS is longer than one
                // character, it is either a multi-character string, or multiple
                // sets, or a mixture of chars and sets -- syntax error.
-                if (buf.length() != 1) {
+                if (right.text.length() != 1) {
                    syntaxError("Malformed RHS", rule, start);
                }
-                if (data.variableNames.get(left) != null) {
+                if (data.variableNames.get(left.text) != null) {
                    syntaxError("Duplicate definition of {" +
-                                left + "}", rule, start);
+                                left.text + "}", rule, start);
                }
-                data.variableNames.put(left, new Character(buf.charAt(0)));
+                data.variableNames.put(left.text, new Character(right.text.charAt(0)));
-                break;
+                return pos;
            case FORWARD_RULE_OP:
                if (direction == FORWARD) {
                    if (ante >= 0 || post >= 0 || leftCursor >= 0) {
                        syntaxError("Malformed rule", rule, start);
            }
                    data.ruleSet.addRule(new TransliterationRule(
                                             left, leftAnte, leftPost,
                                             buf.toString(), cursor));
                } // otherwise ignore the rule; it's not the direction we want
                break;
-            case REVERSE_RULE_OP:
+            // If the direction we want doesn't match the rule
            // direction, do nothing.
            if (operator != FWDREV_RULE_OP &&
                ((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
                return pos;
            }
            // Transform the rule into a forward rule by swapping the
            // sides if necessary.
            if (direction == REVERSE) {
-                    if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
+                RuleHalf temp = left;
                left = right;
                right = temp;
            }
            // Remove non-applicable elements in forward-reverse
            // rules.  Bidirectional rules ignore elements that do not
            // apply.
            if (operator == FWDREV_RULE_OP) {
                right.removeContext();
                right.segments = null;
                left.cursor = left.maxRef = -1;
            }
            // Context is only allowed on the input side.  Cursors are only
            // allowed on the output side.  Segment delimiters can only appear
            // on the left, and references on the right.
            if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
                right.segments != null || left.maxRef >= 0) {
                syntaxError("Malformed rule", rule, start);
            }
                    data.ruleSet.addRule(new TransliterationRule(
                                             buf.toString(), ante, post,
                                             left, leftCursor));
                } // otherwise ignore the rule; it's not the direction we want
                break;
-            case FWDREV_RULE_OP:
+            // Check integrity of segments and segment references.  Each
-                if (direction == FORWARD) {
+            // segment's start must have a corresponding limit, and the
-                    // The output side is the right; trim off any context
+            // references must not refer to segments that do not exist.
-                    String output = buf.toString().substring(ante < 0 ? 0 : ante,
+            int[] segmentsArray = null;
-                                                             post < 0 ? buf.length() : post);
+            if (left.segments != null) {
-                    data.ruleSet.addRule(new TransliterationRule(
+                int n = left.segments.size();
-                                             left, leftAnte, leftPost,
+                if (n % 2 != 0) {
-                                             output, cursor));
+                    syntaxError("Odd length segments", rule, start);
                } else {
                    // The output side is the left; trim off any context
                    String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
                                                   leftPost < 0 ? left.length() : leftPost);
                    data.ruleSet.addRule(new TransliterationRule(
                                             buf.toString(), ante, post,
                                             output, leftCursor));
                }
-                break;
+                n /= 2;
                if (right.maxRef > n) {
                    syntaxError("Undefined segment reference " + right.maxRef, rule, start);
                }
            }
            data.ruleSet.addRule(new TransliterationRule(
                                         left.text, left.ante, left.post,
                                         right.text, right.cursor,
                                         left.getSegments(), data));
            return pos;
        }
@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
         * @param rule pattern string
         * @param start position of first character of current rule
         */
-        private static final void syntaxError(String msg, String rule, int start) {
+        static final void syntaxError(String msg, String rule, int start) {
            int end = quotedIndexOf(rule, start, rule.length(), ";");
            if (end < 0) {
                end = rule.length();
            }
-            throw new IllegalArgumentException(msg + " in " +
+            throw new IllegalArgumentException(msg + " in \"" +
-                                               rule.substring(start, end));
+                                               Utility.escape(rule.substring(start, end)) + '"');
        }
        /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
                    "No private use characters available for variables");
            }
-            data.setVariablesBase = variableNext = r.start;
+            // Allocate 9 characters for segment references 1 through 9
            data.segmentBase = r.start;
            data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
            variableLimit = (char) (r.start + r.length);
            if (variableNext >= variableLimit) {
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
+ * $Date: 2000/04/19 16:34:18 $ 
- * $Revision: 1.15 $
+ * $Revision: 1.16 $
 *
 *****************************************************************************************
 */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined. 
 *
 * <p>A rule may contain segments in its input string and segment references in
 * its output string.  A segment is a substring of the input pattern, indicated
 * by an offset and limit.  The segment may span the preceding or following
 * context.  A segment reference is a special character in the output string
 * that causes a segment of the input string (not the input pattern) to be
 * copied to the output string.  The range of special characters that represent
 * segment references is defined by RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
 *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: TransliterationRule.java,v $
 * Revision 1.16  2000/04/19 16:34:18  alan
 * Add segment support.
 *
 * Revision 1.15  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -121,6 +135,21 @@ class TransliterationRule {
     */
    private String output;
    /**
     * Array of segments.  These are segments of the input string that may be
     * referenced and appear in the output string.  Each segment is stored as an
     * offset, limit pair.  Segments are referenced by a 1-based index;
     * reference i thus includes characters at offset segments[2*i-2] to
     * segments[2*i-1]-1 in the pattern string.
     *
     * In the output string, a segment reference is indicated by a character in
     * a special range, as defined by RuleBasedTransliterator.Data.
     *
     * Most rules have no segments, in which case segments is null, and the
     * output string need not be checked for segment reference characters.
     */
    private int[] segments;
    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
     * @param segs array of 2n integers.  Each of n pairs consists of offset,
     * limit for a segment of the input string.  Characters in the output string
     * refer to these segments if they are in a special range determined by the
     * associated RuleBasedTransliterator.Data object.  May be null if there are
     * no segments.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
-                               int cursorPos) {
+                               int cursorPos,
                               int[] segs) {
        // Do range checks only when warranted to save time
        if (anteContextPos < 0) {
            anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
        }
        pattern = input;
        this.output = output;
        // We don't validate the segments array.  The caller must
        // guarantee that the segments are well-formed.
        this.segments = segs;
    }
    /**
     * Construct a new rule with the given input, output text, and other
     * attributes.  A cursor position may be specified for the output text.
     * This is the convenience form for rules that have no segments: it
     * delegates to the six-argument constructor, passing null for the
     * segments array.
     * @param input input string, including key and optional ante and
     * post context
     * @param anteContextPos offset into input to end of ante context, or -1 if
     * none.  Must be &lt;= input.length() if not -1.
     * @param postContextPos offset into input to start of post context, or -1
     * if none.  Must be &lt;= input.length() if not -1, and must be &gt;=
     * anteContextPos.
     * @param output output string
     * @param cursorPos offset into output at which cursor is located.  If
     * less than zero (for example -1, meaning none was given), then the
     * cursor is placed after the <code>output</code>; that is, -1 is
     * equivalent to <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos) {
        // No segments for this form: pass null as the segments array
        this(input, anteContextPos, postContextPos,
             output, cursorPos, null);
    }
    /**
@ -238,11 +301,34 @@ class TransliterationRule {
     * matches.  This is the offset to the point after the ante
     * context, if any, and before the match string and any post
     * context.
     * @param data the RuleBasedTransliterator.Data object specifying
     * context for this transliterator.
     * @return the change in the length of the text
     */
-    int replace(Replaceable text, int offset) {
+    public int replace(Replaceable text, int offset,
-        text.replace(offset, offset + keyLength, output);
+                       RuleBasedTransliterator.Data data) {
-        return output.length() - keyLength;
+        String out;
        if (segments == null) {
            out = output;
        } else {
            int textStart = offset - anteContextLength;
            StringBuffer buf = new StringBuffer();
            for (int i=0; i<output.length(); ++i) {
                char c = output.charAt(i);
                int b = data.lookupSegmentReference(c);
                if (b < 0) {
                    buf.append(c);
                } else {
                    for (int j=textStart + segments[2*b];
                         j<textStart + segments[2*b+1]; ++j) {
                        buf.append(text.charAt(j));
                    }
                }
            }
            out = buf.toString();
        }
        text.replace(offset, offset + keyLength, out);
        return out.length() - keyLength;
    }
    /**