Add segment support.

X-SVN-Rev: 1165
2000-04-19 16:37:38 +00:00 · 2000-04-19 16:37:38 +00:00 · 2947282e42
commit 2947282e42
parent 9a19714271
6 changed files with 882 additions and 420 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ 
- * $Date: 2000/03/22 02:00:08 $ 
- * $Revision: 1.14 $
+ * $Date: 2000/04/19 16:37:38 $ 
+ * $Revision: 1.15 $
 *
 *****************************************************************************************
 */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
        expect(hex3, "012", "&#x30;&#x31;&#x32;");
    }

+    /**
+     * Test segments and segment references.
+     */
+    public void TestSegments() {
+        // Array of 3n strings, grouped in consecutive triples.
+        // Each triple is <rules>, <input>, <expected output>
+        String[] DATA = {
+            "$([a-z]$) . $([0-9]$) > $2-$1",
+            "abc.123.xyz.456",
+            "ab1-c23.xy4-z56",
+        };
+
+        for (int i=0; i<DATA.length; i+=3) {
+            logln("Pattern: " + Utility.escape(DATA[i]));
+            Transliterator t = new RuleBasedTransliterator("<ID>", DATA[i]);
+            expect(t, DATA[i+1], DATA[i+2]);
+        }
+    }
+
    //======================================================================
    // Support methods
    //======================================================================
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/19 16:34:18 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.19  2000/04/19 16:34:18  alan
+ * Add segment support.
+ *
 * Revision 1.18  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
                }
            } else {
                // Delegate replacement to TransliterationRule object
-                limit += r.replace(text, cursor);
+                limit += r.replace(text, cursor, data);
                // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
                // limit += r.getOutput().length() - r.getKeyLength();
                cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
        public UnicodeSet[] setVariables;

        /**
-         * The character represented by setVariables[0].
+         * The character that represents setVariables[0].  Characters
+         * setVariablesBase through setVariablesBase +
+         * setVariables.length - 1 represent UnicodeSet objects.
         */
        public char setVariablesBase;

        /**
-         * Return the UnicodeSet associated with the given character, or
+         * Return the UnicodeSet represented by the given character, or
         * null if none.
         */
        public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
            return (i >= 0 && i < setVariables.length)
                ? setVariables[i] : null;
        }
+
+        /**
+         * The character that represents segment 1.  Characters segmentBase
+         * through segmentBase + 8 represent segments 1 through 9.
+         */
+        public char segmentBase;
+
+        /**
+         * Return the zero-based index of the segment represented by the given
+         * character, or -1 if none.  Repeat: This is a zero-based return value,
+         * 0..8, even though these are notated "$1".."$9".
+         */
+        public int lookupSegmentReference(char c) {
+            int i = c - segmentBase;
+            return (i >= 0 && i < 9) ? i : -1;
+        }
    }


@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
        private static final char SET_CLOSE           = ']';
        private static final char CURSOR_POS          = '|';

+        // Segments of the input string are delimited by "$(" and "$)".  In the
+        // output string these segments are referenced as "$1" through "$9".
+        private static final char SEGMENT_REF         = '$';
+        private static final char SEGMENT_OPEN        = '(';
+        private static final char SEGMENT_CLOSE       = ')';
+
        /**
         * @param rules list of rules, separated by semicolon characters
         * @exception IllegalArgumentException if there is a syntax error in the
@ -632,6 +659,214 @@ public class RuleBasedTransliterator extends Transliterator {
            }
        }

+        /**
+         * A class representing one side of a rule.  This class knows how to
+         * parse half of a rule.  It is tightly coupled to the method
+         * RuleBasedTransliterator.Parser.parseRule().
+         */
+        static class RuleHalf {
+
+            public String text;
+
+            public int cursor = -1; // position of cursor in text
+            public int ante = -1;   // position of ante context marker ')' in text
+            public int post = -1;   // position of post context marker '(' in text
+
+            // Record the position of the segment substrings and references.  A
+            // given side should have segments or segment references, but not
+            // both.
+            public Vector segments = null; // ref substring start,limits
+            public int maxRef = -1; // index of largest ref (1..9)
+
+            /**
+             * Parse one side of a rule, stopping at either the limit,
+             * the END_OF_RULE character, or an operator.  Return
+             * the pos of the terminating character (or limit).
+             */
+            public int parse(String rule, int pos, int limit,
+                             RuleBasedTransliterator.Parser parser) {
+                int start = pos;
+                StringBuffer buf = new StringBuffer();
+                int postClose = -1; // position of post context close ')' in text
+
+            main:
+                while (pos < limit) {
+                    char c = rule.charAt(pos++);
+                    if (Character.isWhitespace(c)) {
+                        // Ignore whitespace.  Note that this is not Unicode
+                        // spaces, but Java spaces -- a subset, representing
+                        // whitespace likely to be seen in code.
+                        continue;
+                    }
+                    // Handle escapes
+                    if (c == ESCAPE) {
+                        if (pos == limit) {
+                            syntaxError("Trailing backslash", rule, start);
+                        }
+                        buf.append(rule.charAt(pos++));
+                        continue;
+                    }
+                    // Handle quoted matter
+                    if (c == QUOTE) {
+                        int iq = rule.indexOf(QUOTE, pos);
+                        if (iq == pos) {
+                            buf.append(c); // Parse [''] outside quotes as [']
+                            ++pos;
+                        } else {
+                            /* This loop picks up a segment of quoted text of the
+                             * form 'aaaa' each time through.  If this segment
+                             * hasn't really ended ('aaaa''bbbb') then it keeps
+                             * looping, each time adding on a new segment.  When it
+                             * reaches the final quote it breaks.
+                             */
+                            for (;;) {
+                                if (iq < 0) {
+                                    syntaxError("Unterminated quote", rule, start);
+                                }
+                                buf.append(rule.substring(pos, iq));
+                                pos = iq+1;
+                                if (pos < limit && rule.charAt(pos) == QUOTE) {
+                                // Parse [''] inside quotes as [']
+                                    iq = rule.indexOf(QUOTE, pos+1);
+                                // Continue looping
+                                } else {
+                                    break;
+                                }
+                            }
+                        }
+                        continue;
+                    }
+                    if (OPERATORS.indexOf(c) >= 0) {
+                        --pos; // Backup to point to operator
+                        break main;
+                    }
+                    // Handle segment definitions "$(" "$)" and references "$1"
+                    // .. "$9".
+                    if (c == SEGMENT_REF) {
+                        // After a SEGMENT_REF, must see SEGMENT_OPEN,
+                        // SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
+                        // whitespace
+                        if (pos == limit) {
+                            syntaxError("Trailing " + c, rule, start);
+                        }
+                        c = rule.charAt(pos++);
+                        if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
+                            // Parse "$(", "$)"
+                            if (segments == null) {
+                                segments = new Vector();
+                            }
+                            if ((c == SEGMENT_OPEN) !=
+                                (segments.size() % 2 == 0)) {
+                                syntaxError("Mismatched segment delimiters",
+                                            rule, start);
+                            }
+                            segments.addElement(new Integer(buf.length()));
+                        } else {
+                            // Parse "$1" "$2" .. "$9"
+                            int r = Character.digit(c, 10);
+                            if (r < 1 || r > 9) {
+                                syntaxError("Illegal char after " + SEGMENT_REF,
+                                            rule, start);
+                            }
+                            if (r > maxRef) {
+                                maxRef = r;
+                            }
+                            buf.append((char) (parser.data.segmentBase + r - 1));
+                        }
+                        continue;
+                    }
+                    switch (c) {
+                    case END_OF_RULE:
+                        --pos; // Backup to point to END_OF_RULE
+                        break main;
+                    case VARIABLE_REF_OPEN:
+                        {
+                            int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
+                            if (pos == j || j < 0) { // empty or unterminated
+                                syntaxError("Malformed variable reference", rule, start);
+                            }
+                            String name = rule.substring(pos, j);
+                            pos = j+1;
+                            buf.append(parser.getVariableDef(name));
+                        }
+                        break;
+                    case CONTEXT_OPEN:
+                        if (post >= 0) {
+                            syntaxError("Multiple post contexts", rule, start);
+                        }
+                        // Ignore CONTEXT_OPEN if buffer length is zero -- that means
+                        // this is the optional opening delimiter for the ante context.
+                        if (buf.length() > 0) {
+                            post = buf.length();
+                        }
+                        break;
+                    case CONTEXT_CLOSE:
+                        if (postClose >= 0) {
+                            syntaxError("Unexpected " + c, rule, start);
+                        }
+                        if (post >= 0) {
+                            // This is probably the optional closing delimiter
+                            // for the post context; save the pos and check later.
+                            postClose = buf.length();
+                        } else if (ante >= 0) {
+                            syntaxError("Multiple ante contexts", rule, start);
+                        } else {
+                            ante = buf.length();
+                        }
+                        break;
+                    case SET_OPEN:
+                        ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
+                        buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
+                        pos = pp.getIndex();
+                        break;
+                    case VARIABLE_REF_CLOSE:
+                    case SET_CLOSE:
+                        syntaxError("Unquoted " + c, rule, start);
+                    case CURSOR_POS:
+                        if (cursor >= 0) {
+                            syntaxError("Multiple cursors", rule, start);
+                        }
+                        cursor = buf.length();
+                        break;
+                    default:
+                        buf.append(c);
+                        break;
+                    }
+                }
+
+                // Check context close parameters
+                if (postClose >= 0 && postClose != buf.length()) {
+                    syntaxError("Extra text after ]", rule, start);
+                }
+
+                text = buf.toString();
+                return pos;
+            }
+
+            /**
+             * Remove context.
+             */
+            void removeContext() {
+                text = text.substring(ante < 0 ? 0 : ante,
+                                      post < 0 ? text.length() : post);
+                ante = post = -1;
+            }
+
+            /**
+             * Create and return an int[] array of segments.
+             */
+            int[] getSegments() {
+                if (segments == null) {
+                    return null;
+                }
+                int[] result = new int[segments.size()];
+                for (int i=0; i<segments.size(); ++i) {
+                    result[i] = ((Number)segments.elementAt(i)).intValue();
+                }
+                return result;
+            }
+        }
+
        /**
         * MAIN PARSER.  Parse the next rule in the given rule string, starting
         * at pos.  Return the index after the last character parsed.  Do not
@ -644,221 +879,110 @@ public class RuleBasedTransliterator extends Transliterator {
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
+         *
+         * This method is tightly coupled to the inner class RuleHalf.
         */
        private int parseRule(String rule, int pos, int limit) {
            // Locate the left side, operator, and right side
            int start = pos;
            char operator = 0;

-            StringBuffer buf = new StringBuffer();
-            int cursor = -1; // position of cursor in buf
-            int ante = -1;   // position of ante context marker ')' in buf
-            int post = -1;   // position of post context marker '(' in buf
-            int postClose = -1; // position of post context close ')' in buf
+            RuleHalf left  = new RuleHalf();
+            RuleHalf right = new RuleHalf();

-            // Assigned to buf and its adjuncts after the LHS has been
-            // parsed.  Thereafter, buf etc. refer to the RHS.
-            String left = null;
-            int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
+            pos = left.parse(rule, pos, limit, this);

-        main:
-            while (pos < limit) {
-                char c = rule.charAt(pos++);
-                if (Character.isWhitespace(c)) {
-                    // Ignore whitespace.  Note that this is not Unicode
-                    // spaces, but Java spaces -- a subset, representing
-                    // whitespace likely to be seen in code.
-                    continue;
-                }
-                // Handle escapes
-                if (c == ESCAPE) {
-                    if (pos == limit) {
-                        syntaxError("Trailing backslash", rule, start);
-                    }
-                    buf.append(rule.charAt(pos++));
-                    continue;
-                }
-                // Handle quoted matter
-                if (c == QUOTE) {
-                    int iq = rule.indexOf(QUOTE, pos);
-                    if (iq == pos) {
-                        buf.append(c); // Parse [''] outside quotes as [']
-                        ++pos;
-                    } else {
-                        /* This loop picks up a segment of quoted text of the
-                         * form 'aaaa' each time through.  If this segment
-                         * hasn't really ended ('aaaa''bbbb') then it keeps
-                         * looping, each time adding on a new segment.  When it
-                         * reaches the final quote it breaks.
-                         */
-                        for (;;) {
-                            if (iq < 0) {
-                                syntaxError("Unterminated quote", rule, start);
-                            }
-                            buf.append(rule.substring(pos, iq));
-                            pos = iq+1;
-                            if (pos < limit && rule.charAt(pos) == QUOTE) {
-                                // Parse [''] inside quotes as [']
-                                iq = rule.indexOf(QUOTE, pos+1);
-                                // Continue looping
-                            } else {
-                                break;
-                            }
-                        }
-                    }
-                    continue;
-                }
-                if (OPERATORS.indexOf(c) >= 0) {
-                    if (operator != 0) {
-                        syntaxError("Unquoted " + c, rule, start);
-                    }
-                    // Found an operator char.  Check for forward-reverse operator.
-                    if (c == REVERSE_RULE_OP &&
-                        (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
-                        ++pos;
-                        operator = FWDREV_RULE_OP;
-                    } else {
-                        operator = c;
-                    }
-                    left = buf.toString(); // lhs
-                    leftCursor = cursor;
-                    leftAnte = ante;
-                    leftPost = post;
-                    leftPostClose = postClose;
-
-                    buf.setLength(0);
-                    cursor = ante = post = postClose = -1;
-                    continue;
-                }
-                switch (c) {
-                case END_OF_RULE:
-                    break main;
-                case VARIABLE_REF_OPEN:
-                    {
-                        int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
-                        if (pos == j || j < 0) { // empty or unterminated
-                            syntaxError("Malformed variable reference", rule, start);
-                        }
-                        String name = rule.substring(pos, j);
-                        pos = j+1;
-                        buf.append(getVariableDef(name));
-                    }
-                    break;
-                case CONTEXT_OPEN:
-                    if (post >= 0) {
-                        syntaxError("Multiple post contexts", rule, start);
-                    }
-                    // Ignore CONTEXT_OPEN if buffer length is zero -- that means
-                    // this is the optional opening delimiter for the ante context.
-                    if (buf.length() > 0) {
-                        post = buf.length();
-                    }
-                    break;
-                case CONTEXT_CLOSE:
-                    if (postClose >= 0) {
-                        syntaxError("Unexpected " + c, rule, start);
-                    }
-                    if (post >= 0) {
-                        // This is probably the optional closing delimiter
-                        // for the post context; save the pos and check later.
-                        postClose = buf.length();
-                    } else if (ante >= 0) {
-                        syntaxError("Multiple ante contexts", rule, start);
-                    } else {
-                        ante = buf.length();
-                    }
-                    break;
-                case SET_OPEN:
-                    ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
-                    buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
-                    pos = pp.getIndex();
-                    break;
-                case VARIABLE_REF_CLOSE:
-                case SET_CLOSE:
-                    syntaxError("Unquoted " + c, rule, start);
-                case CURSOR_POS:
-                    if (cursor >= 0) {
-                        syntaxError("Multiple cursors", rule, start);
-                    }
-                    cursor = buf.length();
-                    break;
-                default:
-                    buf.append(c);
-                    break;
-                }
-            }
-            if (operator == 0) {
+            if (pos == limit ||
+                OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
                syntaxError("No operator", rule, start);
            }

-            // Check context close parameters
-            if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
-                (postClose >= 0 && postClose != buf.length())) {
-                syntaxError("Extra text after ]", rule, start);
+            // Found an operator char.  Check for forward-reverse operator.
+            if (operator == REVERSE_RULE_OP &&
+                (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
+                ++pos;
+                operator = FWDREV_RULE_OP;
            }

-            // Context is only allowed on the input side; that is, the left side
-            // for forward rules.  Cursors are only allowed on the output side;
-            // that is, the right side for forward rules.  Bidirectional rules
-            // ignore elements that do not apply.
+            pos = right.parse(rule, pos, limit, this);

-            switch (operator) {
-            case VARIABLE_DEF_OP:
+            if (pos < limit) {
+                if (rule.charAt(pos) == END_OF_RULE) {
+                    ++pos;
+                } else {
+                    // RuleHalf parser must have terminated at an operator
+                    syntaxError("Unquoted operator", rule, start);
+                }
+            }
+
+            if (operator == VARIABLE_DEF_OP) {
                // LHS is the name.  RHS is a single character, either a literal
                // or a set (already parsed).  If RHS is longer than one
                // character, it is either a multi-character string, or multiple
                // sets, or a mixture of chars and sets -- syntax error.
-                if (buf.length() != 1) {
+                if (right.text.length() != 1) {
                    syntaxError("Malformed RHS", rule, start);
                }
-                if (data.variableNames.get(left) != null) {
+                if (data.variableNames.get(left.text) != null) {
                    syntaxError("Duplicate definition of {" +
-                                left + "}", rule, start);
+                                left.text + "}", rule, start);
                }
-                data.variableNames.put(left, new Character(buf.charAt(0)));
-                break;
-
-            case FORWARD_RULE_OP:
-                if (direction == FORWARD) {
-                    if (ante >= 0 || post >= 0 || leftCursor >= 0) {
-                        syntaxError("Malformed rule", rule, start);
-                    }
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             left, leftAnte, leftPost,
-                                             buf.toString(), cursor));
-                } // otherwise ignore the rule; it's not the direction we want
-                break;
-
-            case REVERSE_RULE_OP:
-                if (direction == REVERSE) {
-                    if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
-                        syntaxError("Malformed rule", rule, start);
-                    }
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             buf.toString(), ante, post,
-                                             left, leftCursor));
-                } // otherwise ignore the rule; it's not the direction we want
-                break;
-
-            case FWDREV_RULE_OP:
-                if (direction == FORWARD) {
-                    // The output side is the right; trim off any context
-                    String output = buf.toString().substring(ante < 0 ? 0 : ante,
-                                                             post < 0 ? buf.length() : post);
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             left, leftAnte, leftPost,
-                                             output, cursor));
-                } else {
-                    // The output side is the left; trim off any context
-                    String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
-                                                   leftPost < 0 ? left.length() : leftPost);
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             buf.toString(), ante, post,
-                                             output, leftCursor));
-                }
-                break;
+                data.variableNames.put(left.text, new Character(right.text.charAt(0)));
+                return pos;
            }

+            // If the direction we want doesn't match the rule
+            // direction, do nothing.
+            if (operator != FWDREV_RULE_OP &&
+                ((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
+                return pos;
+            }
+
+            // Transform the rule into a forward rule by swapping the
+            // sides if necessary.
+            if (direction == REVERSE) {
+                RuleHalf temp = left;
+                left = right;
+                right = temp;
+            }
+
+            // Remove non-applicable elements in forward-reverse
+            // rules.  Bidirectional rules ignore elements that do not
+            // apply.
+            if (operator == FWDREV_RULE_OP) {
+                right.removeContext();
+                right.segments = null;
+                left.cursor = left.maxRef = -1;
+            }
+
+            // Context is only allowed on the input side.  Cursors are only
+            // allowed on the output side.  Segment delimiters can only appear
+            // on the left, and references on the right.
+            if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
+                right.segments != null || left.maxRef >= 0) {
+                syntaxError("Malformed rule", rule, start);
+            }
+
+            // Check segment integrity: each segment start needs a matching
+            // limit, and no reference may exceed the segment count.  NOTE:
+            // references are only checked when the left side has segments.
+            int[] segmentsArray = null;
+            if (left.segments != null) {
+                int n = left.segments.size();
+                if (n % 2 != 0) {
+                    syntaxError("Odd length segments", rule, start);
+                }
+                n /= 2;
+                if (right.maxRef > n) {
+                    syntaxError("Undefined segment reference " + right.maxRef, rule, start);
+                }
+            }
+
+            data.ruleSet.addRule(new TransliterationRule(
+                                         left.text, left.ante, left.post,
+                                         right.text, right.cursor,
+                                         left.getSegments(), data));
+            
            return pos;
        }

@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
         * @param rule pattern string
         * @param start position of first character of current rule
         */
-        private static final void syntaxError(String msg, String rule, int start) {
+        static final void syntaxError(String msg, String rule, int start) {
            int end = quotedIndexOf(rule, start, rule.length(), ";");
            if (end < 0) {
                end = rule.length();
            }
-            throw new IllegalArgumentException(msg + " in " +
-                                               rule.substring(start, end));
+            throw new IllegalArgumentException(msg + " in \"" +
+                                               Utility.escape(rule.substring(start, end)) + '"');
        }
        
        /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
                    "No private use characters available for variables");
            }

-            data.setVariablesBase = variableNext = r.start;
+            // Allocate 9 characters for segment references 1 through 9
+            data.segmentBase = r.start;
+            data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
            variableLimit = (char) (r.start + r.length);

            if (variableNext >= variableLimit) {
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
- * $Revision: 1.15 $
+ * $Date: 2000/04/19 16:34:18 $ 
+ * $Revision: 1.16 $
 *
 *****************************************************************************************
 */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined. 
 *
+ * <p>A rule may contain segments in its input string and segment references in
+ * its output string.  A segment is a substring of the input pattern, indicated
+ * by an offset and limit.  The segment may span the preceding or following
+ * context.  A segment reference is a special character in the output string
+ * that causes a segment of the input string (not the input pattern) to be
+ * copied to the output string.  The range of special characters that represent
+ * segment references is defined by RuleBasedTransliterator.Data.
+ *
+ * <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
+ * string "abc.123" to "ab1.c23".
+ *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: TransliterationRule.java,v $
+ * Revision 1.16  2000/04/19 16:34:18  alan
+ * Add segment support.
+ *
 * Revision 1.15  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -121,6 +135,21 @@ class TransliterationRule {
     */
    private String output;

+    /**
+     * Array of segments.  These are segments of the input string that may be
+     * referenced and appear in the output string.  Each segment is stored as an
+     * offset, limit pair.  Segments are referenced by a 1-based index;
+     * reference i thus includes characters at offset segments[2*i-2] to
+     * segments[2*i-1]-1 in the pattern string.
+     *
+     * In the output string, a segment reference is indicated by a character in
+     * a special range, as defined by RuleBasedTransliterator.Data.
+     *
+     * Most rules have no segments, in which case segments is null, and the
+     * output string need not be checked for segment reference characters.
+     */
+    private int[] segments;
+
    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
+     * @param segs array of 2n integers.  Each of n pairs consists of offset,
+     * limit for a segment of the input string.  Characters in the output string
+     * refer to these segments if they are in a special range determined by the
+     * associated RuleBasedTransliterator.Data object.  May be null if there are
+     * no segments.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
-                               int cursorPos) {
+                               int cursorPos,
+                               int[] segs) {
        // Do range checks only when warranted to save time
        if (anteContextPos < 0) {
            anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
        }
        pattern = input;
        this.output = output;
+        // We don't validate the segments array.  The caller must
+        // guarantee that the segments are well-formed.
+        this.segments = segs;
+    }
+
+    /**
+     * Construct a new rule with the given input, output text, and other
+     * attributes.  A cursor position may be specified for the output text.
+     * @param input input string, including key and optional ante and
+     * post context
+     * @param anteContextPos offset into input to end of ante context, or -1 if
+     * none.  Must be <= input.length() if not -1.
+     * @param postContextPos offset into input to start of post context, or -1
+     * if none.  Must be <= input.length() if not -1, and must be >=
+     * anteContextPos.
+     * @param output output string
+     * @param cursorPos offset into output at which cursor is located, or -1 if
+     * none.  If less than zero, then the cursor is placed after the
+     * <code>output</code>; that is, -1 is equivalent to
+     * <code>output.length()</code>.  If greater than
+     * <code>output.length()</code> then an exception is thrown.
+     */
+    public TransliterationRule(String input,
+                               int anteContextPos, int postContextPos,
+                               String output,
+                               int cursorPos) {
+        this(input, anteContextPos, postContextPos,
+             output, cursorPos, null);
    }

    /**
@ -238,11 +301,34 @@ class TransliterationRule {
     * matches.  This is the offset to the point after the ante
     * context, if any, and before the match string and any post
     * context.
+     * @param data the RuleBasedTransliterator.Data object specifying
+     * context for this transliterator.
     * @return the change in the length of the text
     */
-    int replace(Replaceable text, int offset) {
-        text.replace(offset, offset + keyLength, output);
-        return output.length() - keyLength;
+    public int replace(Replaceable text, int offset,
+                       RuleBasedTransliterator.Data data) {
+        String out;
+        if (segments == null) {
+            out = output;
+        } else {
+            int textStart = offset - anteContextLength;
+            StringBuffer buf = new StringBuffer();
+            for (int i=0; i<output.length(); ++i) {
+                char c = output.charAt(i);
+                int b = data.lookupSegmentReference(c);
+                if (b < 0) {
+                    buf.append(c);
+                } else {
+                    for (int j=textStart + segments[2*b];
+                         j<textStart + segments[2*b+1]; ++j) {
+                        buf.append(text.charAt(j));
+                    }
+                }
+            }
+            out = buf.toString();
+        }
+        text.replace(offset, offset + keyLength, out);
+        return out.length() - keyLength;
    }

    /**
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ 
- * $Date: 2000/03/22 02:00:08 $ 
- * $Revision: 1.14 $
+ * $Date: 2000/04/19 16:37:38 $ 
+ * $Revision: 1.15 $
 *
 *****************************************************************************************
 */
@ -412,6 +412,25 @@ public class TransliteratorTest extends TestFmwk {
        expect(hex3, "012", "&#x30;&#x31;&#x32;");
    }

+    /**
+     * Test segments and segment references.
+     */
+    public void TestSegments() {
+        // Array of 3n items
+        // Each group of three items is <rules>, <input>, <expected output>
+        String[] DATA = {
+            "$([a-z]$) . $([0-9]$) > $2-$1",
+            "abc.123.xyz.456",
+            "ab1-c23.xy4-z56",
+        };
+
+        for (int i=0; i<DATA.length; i+=3) {
+            logln("Pattern: " + Utility.escape(DATA[i]));
+            Transliterator t = new RuleBasedTransliterator("<ID>", DATA[i]);
+            expect(t, DATA[i+1], DATA[i+2]);
+        }
+    }
+
    //======================================================================
    // Support methods
    //======================================================================
--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/19 16:34:18 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -209,9 +209,12 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.18 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.19 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.19  2000/04/19 16:34:18  alan
+ * Add segment support.
+ *
 * Revision 1.18  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -379,7 +382,7 @@ public class RuleBasedTransliterator extends Transliterator {
                }
            } else {
                // Delegate replacement to TransliterationRule object
-                limit += r.replace(text, cursor);
+                limit += r.replace(text, cursor, data);
                // text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
                // limit += r.getOutput().length() - r.getKeyLength();
                cursor += r.getCursorPos();
@ -448,12 +451,14 @@ public class RuleBasedTransliterator extends Transliterator {
        public UnicodeSet[] setVariables;

        /**
-         * The character represented by setVariables[0].
+         * The character that represents setVariables[0].  Characters
+         * setVariablesBase through setVariablesBase +
+         * setVariables.length - 1 represent UnicodeSet objects.
         */
        public char setVariablesBase;

        /**
-         * Return the UnicodeSet associated with the given character, or
+         * Return the UnicodeSet represented by the given character, or
         * null if none.
         */
        public UnicodeSet lookup(char c) {
@ -461,6 +466,22 @@ public class RuleBasedTransliterator extends Transliterator {
            return (i >= 0 && i < setVariables.length)
                ? setVariables[i] : null;
        }
+
+        /**
+         * The character that represents segment 1.  Characters segmentBase
+         * through segmentBase + 8 represent segments 1 through 9.
+         */
+        public char segmentBase;
+
+        /**
+         * Return the zero-based index of the segment represented by the given
+         * character, or -1 if none.  Repeat: This is a zero-based return value,
+         * 0..8, even though these are notated "$1".."$9".
+         */
+        public int lookupSegmentReference(char c) {
+            int i = c - segmentBase;
+            return (i >= 0 && i < 9) ? i : -1;
+        }
    }


@ -548,6 +569,12 @@ public class RuleBasedTransliterator extends Transliterator {
        private static final char SET_CLOSE           = ']';
        private static final char CURSOR_POS          = '|';

+        // Segments of the input string are delimited by "$(" and "$)".  In the
+        // output string these segments are referenced as "$1" through "$9".
+        private static final char SEGMENT_REF         = '$';
+        private static final char SEGMENT_OPEN        = '(';
+        private static final char SEGMENT_CLOSE       = ')';
+
        /**
         * @param rules list of rules, separated by semicolon characters
         * @exception IllegalArgumentException if there is a syntax error in the
@ -632,6 +659,214 @@ public class RuleBasedTransliterator extends Transliterator {
            }
        }

+        /**
+         * A class representing one side of a rule.  This class knows how to
+         * parse half of a rule.  It is tightly coupled to the method
+         * RuleBasedTransliterator.Parser.parseRule().
+         */
+        static class RuleHalf {
+
+            public String text;
+
+            public int cursor = -1; // position of cursor in text
+            public int ante = -1;   // position of ante context marker ')' in text
+            public int post = -1;   // position of post context marker '(' in text
+
+            // Record the position of the segment substrings and references.  A
+            // given side should have segments or segment references, but not
+            // both.
+            public Vector segments = null; // segment start, limit offsets in text, stored as Integer pairs
+            public int maxRef = -1; // index of largest ref (1..9)
+
+            /**
+             * Parse one side of a rule, stopping at either the limit,
+             * the END_OF_RULE character, or an operator.  Return
+             * the pos of the terminating character (or limit).
+             */
+            public int parse(String rule, int pos, int limit,
+                             RuleBasedTransliterator.Parser parser) {
+                int start = pos;
+                StringBuffer buf = new StringBuffer();
+                int postClose = -1; // position of post context close ')' in text
+
+            main:
+                while (pos < limit) {
+                    char c = rule.charAt(pos++);
+                    if (Character.isWhitespace(c)) {
+                        // Ignore whitespace.  Note that this is not Unicode
+                        // spaces, but Java spaces -- a subset, representing
+                        // whitespace likely to be seen in code.
+                        continue;
+                    }
+                    // Handle escapes
+                    if (c == ESCAPE) {
+                        if (pos == limit) {
+                            syntaxError("Trailing backslash", rule, start);
+                        }
+                        buf.append(rule.charAt(pos++));
+                        continue;
+                    }
+                    // Handle quoted matter
+                    if (c == QUOTE) {
+                        int iq = rule.indexOf(QUOTE, pos);
+                        if (iq == pos) {
+                            buf.append(c); // Parse [''] outside quotes as [']
+                            ++pos;
+                        } else {
+                            /* This loop picks up a segment of quoted text of the
+                             * form 'aaaa' each time through.  If this segment
+                             * hasn't really ended ('aaaa''bbbb') then it keeps
+                             * looping, each time adding on a new segment.  When it
+                             * reaches the final quote it breaks.
+                             */
+                            for (;;) {
+                                if (iq < 0) {
+                                    syntaxError("Unterminated quote", rule, start);
+                                }
+                                buf.append(rule.substring(pos, iq));
+                                pos = iq+1;
+                                if (pos < limit && rule.charAt(pos) == QUOTE) {
+                                // Parse [''] inside quotes as [']
+                                    iq = rule.indexOf(QUOTE, pos+1);
+                                // Continue looping
+                                } else {
+                                    break;
+                                }
+                            }
+                        }
+                        continue;
+                    }
+                    if (OPERATORS.indexOf(c) >= 0) {
+                        --pos; // Backup to point to operator
+                        break main;
+                    }
+                    // Handle segment definitions "$(" "$)" and references "$1"
+                    // .. "$9".
+                    if (c == SEGMENT_REF) {
+                        // After a SEGMENT_REF, must see SEGMENT_OPEN,
+                        // SEGMENT_CLOSE, or a digit 1 to 9, with no intervening
+                        // whitespace
+                        if (pos == limit) {
+                            syntaxError("Trailing " + c, rule, start);
+                        }
+                        c = rule.charAt(pos++);
+                        if (c == SEGMENT_OPEN || c == SEGMENT_CLOSE) {
+                            // Parse "$(", "$)"
+                            if (segments == null) {
+                                segments = new Vector();
+                            }
+                            if ((c == SEGMENT_OPEN) !=
+                                (segments.size() % 2 == 0)) {
+                                syntaxError("Mismatched segment delimiters",
+                                            rule, start);
+                            }
+                            segments.addElement(new Integer(buf.length()));
+                        } else {
+                            // Parse "$1" "$2" .. "$9"
+                            int r = Character.digit(c, 10);
+                            if (r < 1 || r > 9) {
+                                syntaxError("Illegal char after " + SEGMENT_REF,
+                                            rule, start);
+                            }
+                            if (r > maxRef) {
+                                maxRef = r;
+                            }
+                            buf.append((char) (parser.data.segmentBase + r - 1));
+                        }
+                        continue;
+                    }
+                    switch (c) {
+                    case END_OF_RULE:
+                        --pos; // Backup to point to END_OF_RULE
+                        break main;
+                    case VARIABLE_REF_OPEN:
+                        {
+                            int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
+                            if (pos == j || j < 0) { // empty or unterminated
+                                syntaxError("Malformed variable reference", rule, start);
+                            }
+                            String name = rule.substring(pos, j);
+                            pos = j+1;
+                            buf.append(parser.getVariableDef(name));
+                        }
+                        break;
+                    case CONTEXT_OPEN:
+                        if (post >= 0) {
+                            syntaxError("Multiple post contexts", rule, start);
+                        }
+                        // Ignore CONTEXT_OPEN if buffer length is zero -- that means
+                        // this is the optional opening delimiter for the ante context.
+                        if (buf.length() > 0) {
+                            post = buf.length();
+                        }
+                        break;
+                    case CONTEXT_CLOSE:
+                        if (postClose >= 0) {
+                            syntaxError("Unexpected " + c, rule, start);
+                        }
+                        if (post >= 0) {
+                            // This is probably the optional closing delimiter
+                            // for the post context; save the pos and check later.
+                            postClose = buf.length();
+                        } else if (ante >= 0) {
+                            syntaxError("Multiple ante contexts", rule, start);
+                        } else {
+                            ante = buf.length();
+                        }
+                        break;
+                    case SET_OPEN:
+                        ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
+                        buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData)));
+                        pos = pp.getIndex();
+                        break;
+                    case VARIABLE_REF_CLOSE:
+                    case SET_CLOSE:
+                        syntaxError("Unquoted " + c, rule, start);
+                    case CURSOR_POS:
+                        if (cursor >= 0) {
+                            syntaxError("Multiple cursors", rule, start);
+                        }
+                        cursor = buf.length();
+                        break;
+                    default:
+                        buf.append(c);
+                        break;
+                    }
+                }
+
+                // A post context close delimiter, if present, must be the last thing parsed
+                if (postClose >= 0 && postClose != buf.length()) {
+                    syntaxError("Extra text after ]", rule, start);
+                }
+
+                text = buf.toString();
+                return pos;
+            }
+
+            /**
+             * Remove context.
+             */
+            void removeContext() {
+                text = text.substring(ante < 0 ? 0 : ante,
+                                      post < 0 ? text.length() : post);
+                ante = post = -1;
+            }
+
+            /**
+             * Create and return an int[] array of segments.
+             */
+            int[] getSegments() {
+                if (segments == null) {
+                    return null;
+                }
+                int[] result = new int[segments.size()];
+                for (int i=0; i<segments.size(); ++i) {
+                    result[i] = ((Number)segments.elementAt(i)).intValue();
+                }
+                return result;
+            }
+        }
+
        /**
         * MAIN PARSER.  Parse the next rule in the given rule string, starting
         * at pos.  Return the index after the last character parsed.  Do not
@ -644,221 +879,110 @@ public class RuleBasedTransliterator extends Transliterator {
         * parses the end-of-rule character.  It recognizes context and cursor
         * indicators.  Once it does a lexical breakdown of the rule at pos, it
         * creates a rule object and adds it to our rule list.
+         *
+         * This method is tightly coupled to the inner class RuleHalf.
         */
        private int parseRule(String rule, int pos, int limit) {
            // Locate the left side, operator, and right side
            int start = pos;
            char operator = 0;

-            StringBuffer buf = new StringBuffer();
-            int cursor = -1; // position of cursor in buf
-            int ante = -1;   // position of ante context marker ')' in buf
-            int post = -1;   // position of post context marker '(' in buf
-            int postClose = -1; // position of post context close ')' in buf
+            RuleHalf left  = new RuleHalf();
+            RuleHalf right = new RuleHalf();

-            // Assigned to buf and its adjuncts after the LHS has been
-            // parsed.  Thereafter, buf etc. refer to the RHS.
-            String left = null;
-            int leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
+            pos = left.parse(rule, pos, limit, this);

-        main:
-            while (pos < limit) {
-                char c = rule.charAt(pos++);
-                if (Character.isWhitespace(c)) {
-                    // Ignore whitespace.  Note that this is not Unicode
-                    // spaces, but Java spaces -- a subset, representing
-                    // whitespace likely to be seen in code.
-                    continue;
-                }
-                // Handle escapes
-                if (c == ESCAPE) {
-                    if (pos == limit) {
-                        syntaxError("Trailing backslash", rule, start);
-                    }
-                    buf.append(rule.charAt(pos++));
-                    continue;
-                }
-                // Handle quoted matter
-                if (c == QUOTE) {
-                    int iq = rule.indexOf(QUOTE, pos);
-                    if (iq == pos) {
-                        buf.append(c); // Parse [''] outside quotes as [']
-                        ++pos;
-                    } else {
-                        /* This loop picks up a segment of quoted text of the
-                         * form 'aaaa' each time through.  If this segment
-                         * hasn't really ended ('aaaa''bbbb') then it keeps
-                         * looping, each time adding on a new segment.  When it
-                         * reaches the final quote it breaks.
-                         */
-                        for (;;) {
-                            if (iq < 0) {
-                                syntaxError("Unterminated quote", rule, start);
-                            }
-                            buf.append(rule.substring(pos, iq));
-                            pos = iq+1;
-                            if (pos < limit && rule.charAt(pos) == QUOTE) {
-                                // Parse [''] inside quotes as [']
-                                iq = rule.indexOf(QUOTE, pos+1);
-                                // Continue looping
-                            } else {
-                                break;
-                            }
-                        }
-                    }
-                    continue;
-                }
-                if (OPERATORS.indexOf(c) >= 0) {
-                    if (operator != 0) {
-                        syntaxError("Unquoted " + c, rule, start);
-                    }
-                    // Found an operator char.  Check for forward-reverse operator.
-                    if (c == REVERSE_RULE_OP &&
-                        (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
-                        ++pos;
-                        operator = FWDREV_RULE_OP;
-                    } else {
-                        operator = c;
-                    }
-                    left = buf.toString(); // lhs
-                    leftCursor = cursor;
-                    leftAnte = ante;
-                    leftPost = post;
-                    leftPostClose = postClose;
-
-                    buf.setLength(0);
-                    cursor = ante = post = postClose = -1;
-                    continue;
-                }
-                switch (c) {
-                case END_OF_RULE:
-                    break main;
-                case VARIABLE_REF_OPEN:
-                    {
-                        int j = rule.indexOf(VARIABLE_REF_CLOSE, pos);
-                        if (pos == j || j < 0) { // empty or unterminated
-                            syntaxError("Malformed variable reference", rule, start);
-                        }
-                        String name = rule.substring(pos, j);
-                        pos = j+1;
-                        buf.append(getVariableDef(name));
-                    }
-                    break;
-                case CONTEXT_OPEN:
-                    if (post >= 0) {
-                        syntaxError("Multiple post contexts", rule, start);
-                    }
-                    // Ignore CONTEXT_OPEN if buffer length is zero -- that means
-                    // this is the optional opening delimiter for the ante context.
-                    if (buf.length() > 0) {
-                        post = buf.length();
-                    }
-                    break;
-                case CONTEXT_CLOSE:
-                    if (postClose >= 0) {
-                        syntaxError("Unexpected " + c, rule, start);
-                    }
-                    if (post >= 0) {
-                        // This is probably the optional closing delimiter
-                        // for the post context; save the pos and check later.
-                        postClose = buf.length();
-                    } else if (ante >= 0) {
-                        syntaxError("Multiple ante contexts", rule, start);
-                    } else {
-                        ante = buf.length();
-                    }
-                    break;
-                case SET_OPEN:
-                    ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '['
-                    buf.append(registerSet(new UnicodeSet(rule, pp, parseData)));
-                    pos = pp.getIndex();
-                    break;
-                case VARIABLE_REF_CLOSE:
-                case SET_CLOSE:
-                    syntaxError("Unquoted " + c, rule, start);
-                case CURSOR_POS:
-                    if (cursor >= 0) {
-                        syntaxError("Multiple cursors", rule, start);
-                    }
-                    cursor = buf.length();
-                    break;
-                default:
-                    buf.append(c);
-                    break;
-                }
-            }
-            if (operator == 0) {
+            if (pos == limit ||
+                OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
                syntaxError("No operator", rule, start);
            }

-            // Check context close parameters
-            if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
-                (postClose >= 0 && postClose != buf.length())) {
-                syntaxError("Extra text after ]", rule, start);
+            // Found an operator char.  Check for forward-reverse operator.
+            if (operator == REVERSE_RULE_OP &&
+                (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
+                ++pos;
+                operator = FWDREV_RULE_OP;
            }

-            // Context is only allowed on the input side; that is, the left side
-            // for forward rules.  Cursors are only allowed on the output side;
-            // that is, the right side for forward rules.  Bidirectional rules
-            // ignore elements that do not apply.
+            pos = right.parse(rule, pos, limit, this);

-            switch (operator) {
-            case VARIABLE_DEF_OP:
+            if (pos < limit) {
+                if (rule.charAt(pos) == END_OF_RULE) {
+                    ++pos;
+                } else {
+                    // RuleHalf parser must have terminated at an operator
+                    syntaxError("Unquoted operator", rule, start);
+                }
+            }
+
+            if (operator == VARIABLE_DEF_OP) {
                // LHS is the name.  RHS is a single character, either a literal
                // or a set (already parsed).  If RHS is longer than one
                // character, it is either a multi-character string, or multiple
                // sets, or a mixture of chars and sets -- syntax error.
-                if (buf.length() != 1) {
+                if (right.text.length() != 1) {
                    syntaxError("Malformed RHS", rule, start);
                }
-                if (data.variableNames.get(left) != null) {
+                if (data.variableNames.get(left.text) != null) {
                    syntaxError("Duplicate definition of {" +
-                                left + "}", rule, start);
+                                left.text + "}", rule, start);
                }
-                data.variableNames.put(left, new Character(buf.charAt(0)));
-                break;
-
-            case FORWARD_RULE_OP:
-                if (direction == FORWARD) {
-                    if (ante >= 0 || post >= 0 || leftCursor >= 0) {
-                        syntaxError("Malformed rule", rule, start);
-                    }
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             left, leftAnte, leftPost,
-                                             buf.toString(), cursor));
-                } // otherwise ignore the rule; it's not the direction we want
-                break;
-
-            case REVERSE_RULE_OP:
-                if (direction == REVERSE) {
-                    if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
-                        syntaxError("Malformed rule", rule, start);
-                    }
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             buf.toString(), ante, post,
-                                             left, leftCursor));
-                } // otherwise ignore the rule; it's not the direction we want
-                break;
-
-            case FWDREV_RULE_OP:
-                if (direction == FORWARD) {
-                    // The output side is the right; trim off any context
-                    String output = buf.toString().substring(ante < 0 ? 0 : ante,
-                                                             post < 0 ? buf.length() : post);
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             left, leftAnte, leftPost,
-                                             output, cursor));
-                } else {
-                    // The output side is the left; trim off any context
-                    String output = left.substring(leftAnte < 0 ? 0 : leftAnte,
-                                                   leftPost < 0 ? left.length() : leftPost);
-                    data.ruleSet.addRule(new TransliterationRule(
-                                             buf.toString(), ante, post,
-                                             output, leftCursor));
-                }
-                break;
+                data.variableNames.put(left.text, new Character(right.text.charAt(0)));
+                return pos;
            }

+            // If the direction we want doesn't match the rule
+            // direction, do nothing.
+            if (operator != FWDREV_RULE_OP &&
+                ((direction == FORWARD) != (operator == FORWARD_RULE_OP))) {
+                return pos;
+            }
+
+            // Transform the rule into a forward rule by swapping the
+            // sides if necessary.
+            if (direction == REVERSE) {
+                RuleHalf temp = left;
+                left = right;
+                right = temp;
+            }
+
+            // Remove non-applicable elements in forward-reverse
+            // rules.  Bidirectional rules ignore elements that do not
+            // apply.
+            if (operator == FWDREV_RULE_OP) {
+                right.removeContext();
+                right.segments = null;
+                left.cursor = left.maxRef = -1;
+            }
+
+            // Context is only allowed on the input side.  Cursors are only
+            // allowed on the output side.  Segment delimiters can only appear
+            // on the left, and references on the right.
+            if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
+                right.segments != null || left.maxRef >= 0) {
+                syntaxError("Malformed rule", rule, start);
+            }
+
+            // Check integrity of segments and segment references.  Each
+            // segment's start must have a corresponding limit, and the
+            // references must not refer to segments that do not exist.
+            int[] segmentsArray = null;
+            if (left.segments != null) {
+                int n = left.segments.size();
+                if (n % 2 != 0) {
+                    syntaxError("Odd length segments", rule, start);
+                }
+                n /= 2;
+                if (right.maxRef > n) {
+                    syntaxError("Undefined segment reference " + right.maxRef, rule, start);
+                }
+            }
+
+            data.ruleSet.addRule(new TransliterationRule(
+                                         left.text, left.ante, left.post,
+                                         right.text, right.cursor,
+                                         left.getSegments(), data));
+            
            return pos;
        }

@ -871,13 +995,13 @@ public class RuleBasedTransliterator extends Transliterator {
         * @param rule pattern string
         * @param start position of first character of current rule
         */
-        private static final void syntaxError(String msg, String rule, int start) {
+        static final void syntaxError(String msg, String rule, int start) {
+            // Find where the offending rule ends: the position quotedIndexOf
+            // reports for ";" searching from start, or the end of the rule
+            // string when it reports none (a negative result).
             int end = quotedIndexOf(rule, start, rule.length(), ";");
             if (end < 0) {
                 end = rule.length();
             }
-            throw new IllegalArgumentException(msg + " in " +
-                                               rule.substring(start, end));
+            // This method always throws.  The excerpt rule[start, end) is
+            // passed through Utility.escape and wrapped in double quotes
+            // before being appended to the message.
+            throw new IllegalArgumentException(msg + " in \"" +
+                                               Utility.escape(rule.substring(start, end)) + '"');
         }
        
        /**
@ -928,7 +1052,9 @@ public class RuleBasedTransliterator extends Transliterator {
                    "No private use characters available for variables");
            }

-            data.setVariablesBase = variableNext = r.start;
+            // Allocate 9 characters for segment references 1 through 9
+            data.segmentBase = r.start;
+            data.setVariablesBase = variableNext = (char) (data.segmentBase + 9);
            variableLimit = (char) (r.start + r.length);

            if (variableNext >= variableLimit) {
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ 
- * $Date: 2000/04/12 20:17:45 $ 
- * $Revision: 1.15 $
+ * $Date: 2000/04/19 16:34:18 $ 
+ * $Revision: 1.16 $
 *
 *****************************************************************************************
 */
@ -30,12 +30,26 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined. 
 *
+ * <p>A rule may contain segments in its input string and segment references in
+ * its output string.  A segment is a substring of the input pattern, indicated
+ * by an offset and limit.  The segment may span the preceding or following
+ * context.  A segment reference is a special character in the output string
+ * that causes a segment of the input string (not the input pattern) to be
+ * copied to the output string.  The range of special characters that represent
+ * segment references is defined by RuleBasedTransliterator.Data.
+ *
+ * <p>Example: The rule "$([a-z]$) . $([0-9]$) > $2 . $1" will change the input
+ * string "abc.123" to "ab1.c23".
+ *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.15 $ $Date: 2000/04/12 20:17:45 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.16 $ $Date: 2000/04/19 16:34:18 $
 *
 * $Log: TransliterationRule.java,v $
+ * Revision 1.16  2000/04/19 16:34:18  alan
+ * Add segment support.
+ *
 * Revision 1.15  2000/04/12 20:17:45  alan
 * Delegate replace operation to rule object
 *
@ -121,6 +135,21 @@ class TransliterationRule {
     */
    private String output;

+    /**
+     * Array of segments.  These are segments of the input string that may be
+     * referenced and appear in the output string.  Each segment is stored as an
+     * offset, limit pair.  Segments are referenced by a 1-based index;
+     * reference i thus includes characters at offset segments[2*i-2] to
+     * segments[2*i-1]-1 in the pattern string.
+     *
+     * In the output string, a segment reference is indicated by a character in
+     * a special range, as defined by RuleBasedTransliterator.Data.
+     *
+     * Most rules have no segments, in which case segments is null, and the
+     * output string need not be checked for segment reference characters.
+     */
+    private int[] segments;
+
    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
@ -160,11 +189,17 @@ class TransliterationRule {
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
+     * @param segs array of 2n integers.  Each of n pairs consists of offset,
+     * limit for a segment of the input string.  Characters in the output string
+     * refer to these segments if they are in a special range determined by the
+     * associated RuleBasedTransliterator.Data object.  May be null if there are
+     * no segments.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
-                               int cursorPos) {
+                               int cursorPos,
+                               int[] segs) {
        // Do range checks only when warranted to save time
        if (anteContextPos < 0) {
            anteContextLength = 0;
@ -193,6 +228,34 @@ class TransliterationRule {
        }
        pattern = input;
        this.output = output;
+        // We don't validate the segments array.  The caller must
+        // guarantee that the segments are well-formed.
+        this.segments = segs;
+    }
+
+    /**
+     * Convenience constructor for a rule that has no segments.  It behaves
+     * exactly like the six-argument constructor invoked with a null segment
+     * array.
+     * @param input the input pattern: the key together with any ante and
+     * post context
+     * @param anteContextPos offset into input marking the end of the ante
+     * context, or -1 if there is none.  When not -1 it must be <=
+     * input.length().
+     * @param postContextPos offset into input marking the start of the post
+     * context, or -1 if there is none.  When not -1 it must be <=
+     * input.length() and >= anteContextPos.
+     * @param output the output string
+     * @param cursorPos offset into output at which the cursor is located.
+     * Any negative value places the cursor after <code>output</code>; that
+     * is, -1 is equivalent to <code>output.length()</code>.  A value greater
+     * than <code>output.length()</code> causes an exception to be thrown.
+     */
+    public TransliterationRule(String input,
+                               int anteContextPos,
+                               int postContextPos,
+                               String output,
+                               int cursorPos) {
+        this(input, anteContextPos, postContextPos, output, cursorPos, null);
+    }

    /**
@ -238,11 +301,34 @@ class TransliterationRule {
     * matches.  This is the offset to the point after the ante
     * context, if any, and before the match string and any post
     * context.
+     * @param data the RuleBasedTransliterator.Data object specifying
+     * context for this transliterator.
     * @return the change in the length of the text
     */
-    int replace(Replaceable text, int offset) {
-        text.replace(offset, offset + keyLength, output);
-        return output.length() - keyLength;
+    public int replace(Replaceable text, int offset,
+                       RuleBasedTransliterator.Data data) {
+        // A rule without segments substitutes its output string verbatim;
+        // otherwise each segment reference in the output is expanded from
+        // the text being transliterated.
+        String replacement = output;
+        if (segments != null) {
+            // Segment offsets are applied relative to the position
+            // anteContextLength characters before the key.
+            final int base = offset - anteContextLength;
+            StringBuffer expanded = new StringBuffer();
+            int pos = 0;
+            while (pos < output.length()) {
+                char ch = output.charAt(pos++);
+                // A negative result means ch is an ordinary character.
+                int ref = data.lookupSegmentReference(ch);
+                if (ref >= 0) {
+                    // Copy the referenced segment out of the text, which
+                    // has not been modified yet at this point.
+                    int k = base + segments[2*ref];
+                    while (k < base + segments[2*ref + 1]) {
+                        expanded.append(text.charAt(k++));
+                    }
+                } else {
+                    expanded.append(ch);
+                }
+            }
+            replacement = expanded.toString();
+        }
+        text.replace(offset, offset + keyLength, replacement);
+        return replacement.length() - keyLength;
     }

    /**