ICU-1406 make quantified segments behave like perl counterparts

X-SVN-Rev: 6493
2001-10-30 18:08:53 +00:00 · 2001-10-30 18:08:53 +00:00 · 2c2b11dfe8
commit 2c2b11dfe8
parent 0d08aaadcc
13 changed files with 1073 additions and 1463 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -63,6 +63,10 @@ static const UChar gOPERATORS[] = {
    0x3D, 0x3E, 0x3C, 0     // "=><"
 };

+static const UChar HALF_ENDERS[] = {
+    0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
+};
+
 // These are also used in Transliterator::toRules()
 static const int32_t ID_TOKEN_LEN = 2;
 static const UChar   ID_TOKEN[]   = { 0x3A, 0x3A }; // ':', ':'
@ -147,256 +151,6 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
    return result;
 }

-//----------------------------------------------------------------------
-// Segments
-//----------------------------------------------------------------------
-
-/**
- * Segments are parentheses-enclosed regions of the input string.
- * These are referenced in the output string using the notation $1,
- * $2, etc.  Numbering is in order of appearance of the left
- * parenthesis.  Number is one-based.  Segments are defined as start,
- * limit pairs.  Segments may nest.
- *
- * During parsing, segment data is encoded in an object of class
- * Segments.  At runtime, the same data is encoded in compact form as
- * an array of integers in a TransliterationRule.  The runtime encoding
- * must satisfy three goals:
- *
- * 1. Iterate over the offsets in a pattern, from left to right,
- *    and indicate all segment boundaries, in order.  This is done
- *    during matching.
- *
- * 2. Given a reference $n, produce the start and limit offsets
- *    for that segment.  This is done during replacement.
- *
- * 3. Similar to goal 1, but in addition, indicate whether each
- *    segment boundary is a start or a limit, in other words, whether
- *    each is an open paren or a close paren.  This is required by
- *    the toRule() method.
- *
- * Goal 1 must be satisfied at high speed since this is done during
- * matching.  Goal 2 is next most important.  Goal 3 is not performance
- * critical since it is only needed by toRule().
- *
- * The array of integers is actually two arrays concatenated.  The
- * first gives the index values of the open and close parentheses in
- * the order they appear.  The second maps segment numbers to the
- * indices of the first array.  The two arrays have the same length.
- * Iterating over the first array satisfies goal 1.  Indexing into the
- * second array satisfies goal 2.  Goal 3 is satisfied by iterating
- * over the second array and constructing the required data when
- * needed.  This is what toRule() does.
- *
- * Example:  (a b(c d)e f)
- *            0 1 2 3 4 5 6
- *
- * First array: Indices are 0, 2, 4, and 6.
- 
- * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
- * second array is 0, 3, 1 2 -- these give the indices in the
- * first array at which $1:open, $1:close, $2:open, and $2:close
- * occur.
- *
- * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
- *
- * Each subarray is terminated with a -1, and two leading entries
- * give the number of segments and the offset to the first entry
- * of the second array.  In addition, the second array value are
- * all offset by 2 so they index directly into the final array.
- * The total array size is 4*segments[0] + 4.  The second index is
- * 2*segments[0] + 3.
- *
- * In the output string, a segment reference is indicated by a
- * character in a special range, as defined by
- * RuleBasedTransliterator.Data.
- *
- * Most rules have no segments, in which case segments is null, and the
- * output string need not be checked for segment reference characters.
- *
- * See also rbt_rule.h/cpp.
- */
-class Segments {
-    UVector offsets;
-    UVector isOpenParen;
-public:
-    Segments(UErrorCode &status);
-    ~Segments();
-    void addParenthesisAt(int32_t offset, UBool isOpenParen, UErrorCode &status);
-    int32_t getLastParenOffset(UBool& isOpenParen) const;
-    UBool extractLastParenSubstring(int32_t& start, int32_t& limit);
-    int32_t* createArray(UErrorCode &status) const;
-    UBool validate() const;
-    int32_t count() const; // number of segments
-private:
-    int32_t offset(int32_t i) const;
-    UBool isOpen(int32_t i) const;
-    int32_t size() const; // size of the UVectors
-};
-
-int32_t Segments::offset(int32_t i) const {
-    return offsets.elementAti(i);
-}
-
-UBool Segments::isOpen(int32_t i) const {
-    return isOpenParen.elementAti(i) != 0;
-}
-
-int32_t Segments::size() const {
-    // assert(offset.size() == isOpenParen.size());
-    return offsets.size();
-}
-
-Segments::Segments(UErrorCode &status)
- : offsets(status),
-   isOpenParen(status)
-{}
-Segments::~Segments() {}
-
-void Segments::addParenthesisAt(int32_t offset, UBool isOpen, UErrorCode &status) {
-    offsets.addElement(offset, status);
-    isOpenParen.addElement(isOpen ? 1 : 0, status);
-}
-
-int32_t Segments::getLastParenOffset(UBool& isOpenParenReturn) const {
-    if (size() == 0) {
-        return -1;
-    }
-    isOpenParenReturn = isOpen(size()-1);
-    return offset(size()-1);
-}
-
-// Remove the last (rightmost) segment.  Store its offsets in start
-// and limit, and then convert all offsets at or after start to be
-// equal to start.  Upon failure, return FALSE.  Assume that the
-// caller has already called getLastParenOffset() and validated that
-// there is at least one parenthesis and that the last one is a close
-// paren.
-UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
-    // assert(offsets.size() > 0);
-    // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
-    int32_t i = size() - 1;
-    int32_t n = 1; // count of close parens we need to match
-    // Record position of the last close paren
-    limit = offset(i);
-    --i; // back up to the one before the last one
-    while (i >= 0 && n != 0) {
-        n += isOpen(i) ? -1 : 1;
-    }
-    if (n != 0) {
-        return FALSE;
-    }
-    // assert(i>=0);
-    start = offset(i);
-    // Reset all segment pairs from i to size() - 1 to [start, start+1).
-    while (i<size()) {
-        int32_t o = isOpen(i) ? start : (start+1);
-        offsets.setElementAt(o, i);
-        ++i;
-    }
-    return TRUE;
-}
-
-// Assume caller has already gotten a TRUE validate().
-int32_t* Segments::createArray(UErrorCode &status) const {
-    int32_t c = count(); // number of segments
-    int32_t arrayLen = 4*c + 4;
-    int32_t *array = new int32_t[arrayLen];
-    int32_t a2offset = 2*c + 3; // offset to array 2
-
-    if (array == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
-    }
-    array[0] = c;
-    array[1] = a2offset;
-    int32_t i;
-    for (i=0; i<2*c; ++i) {
-        array[2+i] = offset(i);
-    }
-    array[a2offset-1] = -1;
-    array[arrayLen-1] = -1;
-    // Now walk through and match up segment numbers with parentheses.
-    // Number segments from 0.  We're going to offset all entries by 2
-    // to skip the first two elements, array[0] and array[1].
-    UStack stack(status);
-    int32_t nextOpen = 0; // seg # of next open, 0-based
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    for (i=0; i<2*c; ++i) {
-        UBool open = isOpen(i);
-        // Let seg be the zero-based segment number.
-        // Open parens are at 2*seg in array 2.
-        // Close parens are at 2*seg+1 in array 2.
-        if (open) {
-            array[a2offset + 2*nextOpen] = 2+i;
-            stack.push(nextOpen, status);
-            ++nextOpen;
-        } else {
-            int32_t nextClose = stack.popi();
-            array[a2offset + 2*nextClose+1] = 2+i;
-        }
-    }
-    // assert(stack.empty());
-
-    // Perform a series of checks on the array.  DO NOT COMPILE INTO
-    // PRODUCTION CODE.  Use to debug array building problems.
-    //
-    //::if (!stack.empty()) {
-    //::    __asm int 03;
-    //::}
-    //::// check the array
-    //::if (array[0] < 1) {
-    //::    __asm int 03;
-    //::}
-    //::if (array[1] < 5) {
-    //::    __asm int 03;
-    //::}
-    //::for (i=2; i<2+array[0]*2; ++i) {
-    //::    if (array[i] < 0) { // array[i] is an offset into the rule
-    //::        __asm int 03;
-    //::    }
-    //::}
-    //::if (array[2+array[0]*2] != -1) {
-    //::    __asm int 03;
-    //::}
-    //::for (i=array[1]; i<array[1]+array[0]*2; ++i) {
-    //::    if (array[i] < 2 || array[i] >= (2+2*array[0])) {
-    //::        __asm int 03;
-    //::    }
-    //::}
-    //::if (array[array[1]+array[0]*2] != -1) {
-    //::    __asm int 03;
-    //::}
-
-    return array;
-}
-
-UBool Segments::validate() const {
-    // want number of parens >= 2
-    // want number of parens to be even
-    // want first paren '('
-    // want parens to match up in the end
-    if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
-        return FALSE;
-    }
-    int32_t n = 0;
-    for (int32_t i=0; i<size(); ++i) {
-        n += isOpen(i) ? 1 : -1;
-        if (n < 0) {
-            return FALSE;
-        }
-    }
-    return n == 0;
-}
-
-// Assume caller has already gotten a TRUE validate().
-int32_t Segments::count() const {
-    // assert(validate());
-    return size() / 2;
-}
-
 //----------------------------------------------------------------------
 // BEGIN RuleHalf
 //----------------------------------------------------------------------
@ -416,11 +170,7 @@ public:
    int32_t ante;   // position of ante context marker '{' in text
    int32_t post;   // position of post context marker '}' in text

-    // Record the position of the segment substrings and references.  A
-    // given side should have segments or segment references, but not
-    // both.
-    Segments* segments;
-    int32_t maxRef;       // index of largest ref ($n) on the right
+    int32_t maxRef; // n where maximum segment ref is $n; 1-based

    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
@ -432,9 +182,26 @@ public:
    // output text.
    int32_t cursorOffset; // only nonzero on output side

+    // Position of first CURSOR_OFFSET on _right_.  This will be -1
+    // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
+    int32_t cursorOffsetPos;
+
    UBool anchorStart;
    UBool anchorEnd;
    
+    UErrorCode ec;
+
+    /**
+     * UnicodeMatcher objects corresponding to each segment.
+     */
+    UVector segments;
+        
+    /**
+     * The segment number from 0..n-1 of the next '(' we see
+     * during parsing; 0-based.
+     */
+    int32_t nextSegmentNumber;
+
    TransliteratorParser& parser;

    //--------------------------------------------------
@ -443,22 +210,22 @@ public:
    RuleHalf(TransliteratorParser& parser);
    ~RuleHalf();

-    /**
-     * Parse one side of a rule, stopping at either the limit,
-     * the END_OF_RULE character, or an operator.  Return
-     * the pos of the terminating character (or limit).
-     */
    int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);

+    int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
+                         UnicodeString& buf,
+                         UBool isSegment);
+
    /**
     * Remove context.
     */
    void removeContext();

    /**
-     * Create and return an int[] array of segments.
+     * Create and return a UnicodeMatcher*[] array of segments,
+     * or NULL if there are no segments.
     */
-    int32_t* createSegments(UErrorCode& status) const;
+    UnicodeMatcher** createSegments(UErrorCode& status) const;

    int syntaxError(UErrorCode code,
                    const UnicodeString& rule,
@ -472,30 +239,69 @@ private:
    RuleHalf& operator=(const RuleHalf&);
 };

-RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
+RuleHalf::RuleHalf(TransliteratorParser& p) :
+    ec(U_ZERO_ERROR),
+    segments(ec),
+    parser(p)
+{
    cursor = -1;
    ante = -1;
    post = -1;
-    segments = NULL;
    maxRef = -1;
    cursorOffset = 0;
+    cursorOffsetPos = 0;
    anchorStart = anchorEnd = FALSE;
+    segments.removeAllElements();
+    nextSegmentNumber = 0;
 }

 RuleHalf::~RuleHalf() {
-    delete segments;
 }

 /**
 * Parse one side of a rule, stopping at either the limit,
- * the END_OF_RULE character, or an operator.  Return
- * the pos of the terminating character (or limit).
+ * the END_OF_RULE character, or an operator.
+ * @return the index after the terminating character, or
+ * if limit was reached, limit
 */
 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
    int32_t start = pos;
-    UnicodeString& buf = text;
+    text.truncate(0);
+    pos = parseSection(rule, pos, limit, text, FALSE);
+
+    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
+        return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
+    }
+    
+    return pos;
+}
+ 
+/**
+ * Parse a section of one side of a rule, stopping at either
+ * the limit, the END_OF_RULE character, an operator, or a
+ * segment close character.  This method parses both a
+ * top-level rule half and a segment within such a rule half.
+ * It calls itself recursively to parse segments and nested
+ * segments.
+ * @param buf buffer into which to accumulate the rule pattern
+ * characters, either literal characters from the rule or
+ * standins for UnicodeMatcher objects including segments.
+ * @param isSegment if true, then we've already seen a '(' and
+ * pos on entry points right after it.  Accumulate everything
+ * up to the closing ')', put it in a segment matcher object,
+ * generate a standin for it, and add the standin to buf.  As
+ * a side effect, update the segments vector with a reference
+ * to the segment matcher.  This works recursively for nested
+ * segments.  If isSegment is false, just accumulate
+ * characters into buf.
+ * @return the index after the terminating character, or
+ * if limit was reached, limit
+ */
+int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
+                               UnicodeString& buf,
+                               UBool isSegment) {
+    int32_t start = pos;
    ParsePosition pp;
-    int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
    UnicodeString scratch;
    UBool done = FALSE;
    int32_t quoteStart = -1; // Most recent 'single quoted string'
@ -503,6 +309,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
    int32_t varStart = -1; // Most recent $variableReference
    int32_t varLimit = -1;

+    // If isSegment, then bufSegStart is the offset in buf to
+    // the first character of the segment we are parsing.
+    int32_t bufSegStart = 0;
+    int32_t segmentNumber = 0;
+    if (isSegment) {
+        bufSegStart = buf.length();
+        segmentNumber = nextSegmentNumber++;
+    }
+    
    while (pos < limit && !done) {
        UChar c = rule.charAt(pos++);
        if (u_isWhitespace(c)) {
@ -511,8 +326,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            // whitespace likely to be seen in code.
            continue;
        }
-        if (u_strchr(gOPERATORS, c) != NULL) {
-            --pos; // Backup to point to operator
+        if (u_strchr(HALF_ENDERS, c) != NULL) {
+            if (isSegment) {
+                // Unclosed segment
+                return syntaxError(U_UNCLOSED_SEGMENT, rule, start);
+            }
            break;
        }
        if (anchorEnd) {
@ -575,6 +393,10 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            continue;
        }
        switch (c) {
+                    
+        //------------------------------------------------------
+        // Elements allowed both inside and outside segments
+        //------------------------------------------------------
        case ANCHOR_START:
            if (buf.length() == 0 && !anchorStart) {
                anchorStart = TRUE;
@ -584,17 +406,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            }
          break;
        case SEGMENT_OPEN:
-        case SEGMENT_CLOSE:
-            // Handle segment definitions "(" and ")"
-            // Parse "(", ")"
-            if (segments == NULL) {
-                segments = new Segments(parser.status);
-            }
-            segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN, parser.status);
-            break;
-        case END_OF_RULE:
-            --pos; // Backup to point to END_OF_RULE
-            done = TRUE;
+            pos = parseSection(rule, pos, limit, buf, TRUE);
            break;
        case SymbolTable::SYMBOL_REF:
            // Handle variable references and segment references "$1" .. "$9"
@ -655,25 +467,128 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
+        case DOT:
+            buf.append(parser.getDotStandIn());
+            break;
+        case KLEENE_STAR:
+        case ONE_OR_MORE:
+        case ZERO_OR_ONE:
+            // Quantifiers.  We handle single characters, quoted strings,
+            // variable references, and segments.
+            //  a+      matches  aaa
+            //  'foo'+  matches  foofoofoo
+            //  $v+     matches  xyxyxy if $v == xy
+            //  (seg)+  matches  segsegseg
+            {
+                if (isSegment && buf.length() == bufSegStart) {
+                    // The quantifier immediately follows '('
+                    return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
+                }
+
+                int32_t qstart, qlimit;
+                // The quantifier follows an isolated character, a
+                // quoted string, or a variable reference
+                if (buf.length() == quoteLimit) {
+                    // The quantifier follows a 'quoted string'
+                    qstart = quoteStart;
+                    qlimit = quoteLimit;
+                } else if (buf.length() == varLimit) {
+                    // The quantifier follows a $variableReference
+                    qstart = varStart;
+                    qlimit = varLimit;
+                } else {
+                    // The quantifier follows a single character, possibly
+                    // a segment standin
+                    qstart = buf.length() - 1;
+                    qlimit = qstart + 1;
+                }
+
+                UnicodeMatcher *m =
+                    new StringMatcher(buf, qstart, qlimit, FALSE, *parser.data);
+                int32_t min = 0;
+                int32_t max = Quantifier::MAX;
+                switch (c) {
+                case ONE_OR_MORE:
+                    min = 1;
+                    break;
+                case ZERO_OR_ONE:
+                    min = 0;
+                    max = 1;
+                    break;
+                // case KLEENE_STAR:
+                //    do nothing -- min, max already set
+                }
+                m = new Quantifier(m, min, max);
+                buf.truncate(qstart);
+                buf.append(parser.generateStandInFor(m));
+            }
+            break;
+
+        //------------------------------------------------------
+        // Elements allowed ONLY WITHIN segments
+        //------------------------------------------------------
+        case SEGMENT_CLOSE:
+            if (isSegment) {
+                // We're done parsing a segment.  The relevant
+                // characters are in buf, starting at offset
+                // bufSegStart.  Extract them into a string
+                // matcher, and replace them with a standin
+                // for that matcher.
+                StringMatcher *m =
+                    new StringMatcher(buf, bufSegStart, buf.length(),
+                                      TRUE, *parser.data);
+                // Since we call parseSection() recursively,
+                // nested segments will result in segment i+1
+                // getting parsed and stored before segment i;
+                // be careful with the vector handling here.
+                if ((segmentNumber+1) > segments.size()) {
+                    segments.setSize(segmentNumber+1);
+                }
+                segments.setElementAt(m, segmentNumber);
+                buf.truncate(bufSegStart);
+                buf.append(parser.generateStandInFor(m));
+                done = TRUE;
+                break;
+            }
+
+            // If we aren't in a segment, then a segment close
+            // character is a syntax error.
+            return syntaxError(U_UNQUOTED_SPECIAL, rule, start);
+
+        //------------------------------------------------------
+        // Elements allowed ONLY OUTSIDE segments
+        //------------------------------------------------------
        case CONTEXT_ANTE:
+            if (isSegment) {
+                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
+            }
            if (ante >= 0) {
                return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
            }
            ante = buf.length();
            break;
        case CONTEXT_POST:
+            if (isSegment) {
+                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
+            }
            if (post >= 0) {
                return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
            }
            post = buf.length();
            break;
        case CURSOR_POS:
+            if (isSegment) {
+                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
+            }
            if (cursor >= 0) {
                return syntaxError(U_MULTIPLE_CURSORS, rule, start);
            }
            cursor = buf.length();
            break;
        case CURSOR_OFFSET:
+            if (isSegment) {
+                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
+            }
            if (cursorOffset < 0) {
                if (buf.length() > 0) {
                    return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
@ -695,69 +610,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
-        case DOT:
-            buf.append(parser.getDotStandIn());
-            break;
-        case KLEENE_STAR:
-        case ONE_OR_MORE:
-        case ZERO_OR_ONE:
-            // Quantifiers.  We handle single characters, quoted strings,
-            // variable references, and segments.
-            //  a+      matches  aaa
-            //  'foo'+  matches  foofoofoo
-            //  $v+     matches  xyxyxy if $v == xy
-            //  (seg)+  matches  segsegseg
-            {
-                int32_t start, limit;
-                UBool isOpenParen;
-                UBool isSegment = FALSE;
-                if (segments != 0 &&
-                    segments->getLastParenOffset(isOpenParen) == buf.length()) {
-                    // The */+ immediately follows a segment
-                    if (isOpenParen) {
-                        return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
-                    }
-                    if (!segments->extractLastParenSubstring(start, limit)) {
-                        return syntaxError(U_MISMATCHED_SEGMENT_DELIMITERS, rule, start);
-                    }
-                    isSegment = TRUE;
-                } else {
-                    // The */+ follows an isolated character or quote
-                    // or variable reference
-                    if (buf.length() == quoteLimit) {
-                        // The */+ follows a 'quoted string'
-                        start = quoteStart;
-                        limit = quoteLimit;
-                    } else if (buf.length() == varLimit) {
-                        // The */+ follows a $variableReference
-                        start = varStart;
-                        limit = varLimit;
-                    } else {
-                        // The */+ follows a single character
-                        start = buf.length() - 1;
-                        limit = start + 1;
-                    }
-                }
-                UnicodeMatcher *m =
-                    new StringMatcher(buf, start, limit, isSegment, *parser.data);
-                int32_t min = 0;
-                int32_t max = Quantifier::MAX;
-                switch (c) {
-                case ONE_OR_MORE:
-                    min = 1;
-                    break;
-                case ZERO_OR_ONE:
-                    min = 0;
-                    max = 1;
-                    break;
-                // case KLEENE_STAR:
-                //    do nothing -- min, max already set
-                }
-                m = new Quantifier(m, min, max);
-                buf.truncate(start);
-                buf.append(parser.generateStandInFor(m));
-            }
-            break;
+
+
+        //------------------------------------------------------
+        // Non-special characters
+        //------------------------------------------------------
        default:
            // Disallow unquoted characters other than [0-9A-Za-z]
            // in the printable ASCII range.  These characters are
@ -773,10 +630,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
        }
    }

-    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
-        return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
-    }
-    // text = buf.toString();
    return pos;
 }

@ -797,10 +650,15 @@ void RuleHalf::removeContext() {
 }

 /**
- * Create and return an int32_t[] array of segments.
+ * Create and return a UnicodeMatcher*[] array of segments,
+ * or NULL if there are no segments.
 */
-int32_t* RuleHalf::createSegments(UErrorCode& status) const {
-    return (segments == 0) ? 0 : segments->createArray(status);
+UnicodeMatcher** RuleHalf::createSegments(UErrorCode& status) const {
+    if (segments.size() == 0) {
+        return NULL;
+    }
+    UnicodeMatcher** result = new UnicodeMatcher*[segments.size()];
+    return (UnicodeMatcher**) segments.toArray((void**) result);
 }

 //----------------------------------------------------------------------
@ -1172,9 +1030,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
        return start;
    }

-    if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(pos++))) == NULL) {
+    if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
        return syntaxError(U_MISSING_OPERATOR, rule, start);
    }
+    ++pos;

    // Found an operator char.  Check for forward-reverse operator.
    if (op == REVERSE_RULE_OP &&
@ -1189,7 +1048,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    }

    if (pos < limit) {
-        if (rule.charAt(pos) == END_OF_RULE) {
+        if (rule.charAt(--pos) == END_OF_RULE) {
            ++pos;
        } else {
            // RuleHalf parser must have terminated at an operator
@ -1251,8 +1110,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // apply.
    if (op == FWDREV_RULE_OP) {
        right->removeContext();
-        delete right->segments;
-        right->segments = NULL;
+        right->segments.removeAllElements();
        left->cursor = left->maxRef = -1;
        left->cursorOffset = 0;
    }
@ -1272,7 +1130,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // cannot place the cursor outside the limits of the context.
    // Anchors are only allowed on the input side.
    if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
-        right->segments != NULL || left->maxRef >= 0 ||
+        right->segments.size() > 0 || left->maxRef >= 0 ||
        (right->cursorOffset != 0 && right->cursor < 0) ||
        // - The following two checks were used to ensure that the
        // - the cursor offset stayed within the ante- or postcontext.
@ -1288,20 +1146,15 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // Check integrity of segments and segment references.  Each
    // segment's start must have a corresponding limit, and the
    // references must not refer to segments that do not exist.
-    if (left->segments != NULL) {
-        if (!left->segments->validate()) {
-            return syntaxError(U_MISSING_SEGMENT_CLOSE, rule, start);
-        }
-        int32_t n = left->segments->count();
-        if (right->maxRef > n) {
-            return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
-        }
+    if (right->maxRef > left->segments.size()) {
+        return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
    }

    data->ruleSet.addRule(new TransliterationRule(
                                 left->text, left->ante, left->post,
                                 right->text, right->cursor, right->cursorOffset,
                                 left->createSegments(status),
+                                 left->segments.size(),
                                 left->anchorStart, left->anchorEnd,
                                 data,
                                 status), status);
@ -1366,7 +1219,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
    if (variableNext >= variableLimit) {
        // throw new RuntimeException("Private use variables exhausted");
        delete adopted;
-        status = U_ILLEGAL_ARGUMENT_ERROR;
+        status = U_VARIABLE_RANGE_EXHAUSTED;
        return 0;
    }
    variablesVector->addElement(adopted, status);
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -14,28 +14,11 @@
 #include "unicode/uniset.h"
 #include "unicode/unicode.h"
 #include "cmemory.h"
+#include "strmatch.h"

 static const UChar APOSTROPHE = 0x0027; // '\''
 static const UChar BACKSLASH  = 0x005C; // '\' 

-// To process segments we need to allocate arrays of integers.  We use
-// stack storage as long as the segment count is <= MAX_STATIC_SEGS.
-// Otherwise, we allocate heap space.
-#define MAX_STATIC_SEGS 20
-
-// Macros for accessing the array of integers encoding the position of
-// SEGMENTS_COUNT number of segments, n (half the number of parens)
-// SEGMENTS_LEN   length of the segments array (number of elements)
-// SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
-// SEGMENTS_NUM   index into segments to access POS of $1.open,
-//                $1.close, $2.open, $2.close,.., $n.open, $n.close
-//                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
-#define FIRST_SEG_POS_INDEX 2
-#define SEGMENTS_COUNT(x) x[0]
-#define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
-#define SEGMENTS_POS(x,i) x[FIRST_SEG_POS_INDEX+i]
-#define SEGMENTS_NUM(x,i) (x[x[1]+i]-FIRST_SEG_POS_INDEX)
-
 U_NAMESPACE_BEGIN

 const UChar TransliterationRule::ETHER = 0xFFFF;
@ -56,11 +39,10 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>.  If greater than
 * <code>output.length()</code> then an exception is thrown.
- * @param adoptedSegs array of 2n integers.  Each of n pairs consists of offset,
- * limit for a segment of the input string.  Characters in the output string
- * refer to these segments if they are in a special range determined by the
- * associated RuleBasedTransliterator.Data object.  May be null if there are
- * no segments.
+ * @param segs array of UnicodeMatcher corresponding to input pattern
+ * segments, or null if there are none.  The array itself is adopted,
+ * but the pointers within it are not.
+ * @param segsCount number of elements in segs[]
 * @param anchorStart TRUE if the the rule is anchored on the left to
 * the context start
 * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -70,7 +52,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
                                         int32_t anteContextPos, int32_t postContextPos,
                                         const UnicodeString& outputStr,
                                         int32_t cursorPosition, int32_t cursorOffset,
-                                         int32_t* adoptedSegs,
+                                         UnicodeMatcher** segs,
+                                         int32_t segsCount,
                                         UBool anchorStart, UBool anchorEnd,
                                         const TransliterationRuleData* theData,
                                         UErrorCode& status) :
@ -113,23 +96,11 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
    this->cursorPos = cursorPosition + cursorOffset;
    this->output = outputStr;
    // We don't validate the segments array.  The caller must
-    // guarantee that the segments are well-formed.
-    this->segments = adoptedSegs;
-    // Find the position of the first segment index that is after the
-    // anteContext (in the key).  Note that this may be a start or a
-    // limit index.  If all segments are in the ante context,
-    // firstKeySeg should point past the last segment -- that is, it
-    // should point at the end marker, which is -1.  This allows the
-    // code to back up by one to obtain the last ante context segment.
-    firstKeySeg = -1;
-    if (segments != 0) {
-        firstKeySeg = FIRST_SEG_POS_INDEX;
-        while (segments[firstKeySeg] >= 0 &&
-               segments[firstKeySeg] < anteContextLength) {
-            ++firstKeySeg;
-        }
-        firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
-    }
+    // guarantee that the segments are well-formed (that is, that
+    // all $n references in the output refer to indices of this
+    // array, and that no array elements are null).
+    this->segments = segs;
+    this->segmentsCount = segsCount;

    pattern = input;
    flags = 0;
@ -149,18 +120,17 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
 TransliterationRule::TransliterationRule(TransliterationRule& other) :
     pattern(other.pattern),
     output(other.output),
-    firstKeySeg(other.firstKeySeg),
     anteContextLength(other.anteContextLength),
     keyLength(other.keyLength),
     cursorPos(other.cursorPos),
     flags(other.flags),
     data(other.data) {
 
-    segments = 0;
-    if (other.segments != 0) {
-        int32_t len = SEGMENTS_LEN(other.segments);
-        segments = new int32_t[len];
-        uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
+    segments = NULL;
+    segmentsCount = other.segmentsCount; // must mirror 'other' so the per-match reset loop visits every segment
+    if (segmentsCount > 0) {
+        segments = new UnicodeMatcher*[segmentsCount]; // shallow copy: the array is owned, the matchers in it are not
+        uprv_memcpy(segments, other.segments, segmentsCount*sizeof(segments[0]));
     }
 }

@ -341,26 +311,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,

    // ============================ MATCH ===========================

-    // Record the actual positions, in the text, of the segments.
-	// These are recorded in the order that they occur in the pattern.
-
-    // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
-    // records the position in 'text' of each segment boundary, in
-    // the order that they occur in 'pattern'.
-    int32_t _segPos[2*MAX_STATIC_SEGS];
-    int32_t *segPos = _segPos;
-    if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
-        segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
+    // Reset segment match data
+    if (segments != NULL) {
+        for (int32_t i=0; i<segmentsCount; ++i) {
+            ((StringMatcher*) segments[i])->resetMatch();
+        }
    }
-    // iSeg is an index into segments[] that accesses the first
-    // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
-    // When indexing into segments[] FIRST_SEG_POS_INDEX must be
-    // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
-    int32_t iSeg = firstKeySeg - 1;
-    // nextSegPos is an offset in 'pattern'.  When the cursor is
-    // equal to nextSegPos, we are at a segment boundary, and we
-    // record the position in the real text in segPos[].
-    int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;

    UMatchDegree m;
    int32_t lenDelta, keyLimit;
@ -386,26 +342,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(oText)) {
                --oText;
            } else {
-                m = U_MISMATCH;
-                goto exit;
+                return U_MISMATCH;
            }
        } else {
            // Subtract 1 from contextStart to make it a reverse limit
            if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
                != U_MATCH) {
-                m = U_MISMATCH;
-                goto exit;
+                return U_MISMATCH;
            }
        }
-        while (nextSegPos == oPattern) {
-            segPos[iSeg] = oText;
-            if (oText >= 0) {
-                segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
-            } else {
-                ++segPos[iSeg];
-            }
-            nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
-        }
    }

    minOText = posAfter(text, oText);
@ -413,15 +358,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // ------------------------ Start Anchor ------------------------

    if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
-        m = U_MISMATCH;
-        goto exit;
+        return U_MISMATCH;
    }

    // -------------------- Key and Post Context --------------------

-    iSeg = firstKeySeg;
-    nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
-
    oPattern = 0;
    oText = pos.start;
    keyLimit = 0;
@ -429,8 +370,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        if (incremental && oText == pos.limit) {
            // We've reached the limit without a mismatch and
            // without completing our match.
-            m = U_PARTIAL_MATCH;
-            goto exit;
+            return U_PARTIAL_MATCH;
        }

        // It might seem that we could do a check like this here:
@ -445,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        // depending on whether we're in the key or in the post
        // context.

-        while (oPattern == nextSegPos) {
-            segPos[iSeg] = oText;
-            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
-        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -467,13 +403,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(oText)) {
                ++oText;
            } else {
-                m = U_MISMATCH;
-                goto exit;
+                return U_MISMATCH;
            }
        } else {
            m = matcher->matches(text, oText, matchLimit, incremental);
            if (m != U_MATCH) {
-                goto exit;
+                return m;
            }
        }

@ -486,10 +421,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        //!    // at the end of the key.
        //!    return UnicodeMatcher.U_MISMATCH;
        //!}
-    }
-    while (oPattern == nextSegPos) {
-        segPos[iSeg] = oText;
-        nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
    }
 	if (oPattern == keyLength) {
 		keyLimit = oText;
@ -509,8 +440,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // =========================== REPLACE ==========================

    // We have a full match.  The key is between pos.start and
-    // keyLimit.  Segment indices have been recorded in segPos[].
-    // Perform a replacement.
+    // keyLimit.

    if (segments == NULL) {
        text.handleReplaceBetween(pos.start, keyLimit, output);
@ -562,11 +492,22 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                    buf.remove();
                }
                // Copy segment with out-of-band data 
-                b *= 2;
-                int32_t start = segPos[SEGMENTS_NUM(segments,b)];
-                int32_t limit = segPos[SEGMENTS_NUM(segments,b+1)];
-                text.copy(start, limit, dest);
-                dest += limit - start;
+                StringMatcher* m = (StringMatcher*) segments[b];
+                int32_t start = m->getMatchStart();
+                int32_t limit = m->getMatchLimit();
+                // If there was no match, that means that a quantifier
+                // matched zero-length.  E.g., x (a)* y matched "xy".
+                if (start >= 0) {
+                    // Adjust indices for segments in post context
+                    // for any inserted text between the key and
+                    // the post context.
+                    if (start >= keyLimit) {
+                        start += dest - keyLimit;
+                        limit += dest - keyLimit;
+                    }
+                    text.copy(start, limit, dest);
+                    dest += limit - start;
+                }
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
@ -600,13 +541,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    pos.contextLimit += lenDelta;
    // Restrict new value of start to [minOText, min(oText, pos.limit)].
    pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
-    m = U_MATCH;
-    
-  exit:
-    if (segPos != _segPos) {
-        delete[] segPos;
-    }
-    return m;
+    return U_MATCH;
 }

 /**
@ -727,23 +662,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
                                           UBool escapeUnprintable) const {
    int32_t i;

-    // iseg indexes into segments[] directly (not offset from FSPI)
-    int32_t iseg = FIRST_SEG_POS_INDEX-1;
-    int32_t nextSeg = -1;
-    // Build an array of booleans specifying open vs. close paren
-    UBool _isOpen[2*MAX_STATIC_SEGS];
-    UBool *isOpen = _isOpen;
-    if (segments != 0) {
-        if (SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
-            isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
-        }
-        for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
-            isOpen[SEGMENTS_NUM(segments,i)  ] = TRUE;
-            isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
-        }
-        nextSeg = segments[++iseg];
-    }
-
    // Accumulate special characters (and non-specials following them)
    // into quoteBuf.  Append quoteBuf, within single quotes, when
    // a non-quoted element must be inserted.
@ -765,14 +683,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
            appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
        }

-        // Append either '(' or ')' if we are at a segment index
-        if (i == nextSeg) {
-            appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
-                             (UChar)0x0028 : (UChar)0x0029,
-                             TRUE, escapeUnprintable, quoteBuf);
-            nextSeg = segments[++iseg];
-        }
-
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
        }
@ -787,11 +697,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
        }
    }

-    if (i == nextSeg) {
-        // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
-        appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
-    }
-
    if (emitBraces && i == (anteContextLength + keyLength)) {
        appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
    }
@ -854,9 +759,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,

    appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);

-    if (isOpen != _isOpen) {
-        delete[] isOpen;
-    }
    return rule;
 }

--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -33,6 +33,16 @@ class TransliterationRuleData;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
+ * <p>A rule may contain segments in its input string and segment
+ * references in its output string.  A segment is a substring of the
+ * input pattern, indicated by an offset and limit.  The segment may
+ * be in the preceding or following context.  It may not span a
+ * context boundary.  A segment reference is a special character in
+ * the output string that causes a segment of the input string (not
+ * the input pattern) to be copied to the output string.  The range of
+ * special characters that represent segment references is defined by
+ * RuleBasedTransliterator.Data.
+ *
 * @author Alan Liu
 */
 class TransliterationRule {
@ -65,20 +75,20 @@ private:
    UnicodeString output;

    /**
-     * An array of integers encoding the position of the segments.
-     * See rbt_pars.cpp::Segments for more details.
+     * An array of matcher objects corresponding to the input pattern
+     * segments.  If there are no segments this is null.  N.B. This is
+     * a UnicodeMatcher for generality, but in practice it is always a
+     * StringMatcher.  In the future we may generalize this, but for
+     * now we sometimes cast down to StringMatcher.
+     *
+     * The array is owned, but the pointers within it are not.
     */
-    int32_t* segments;
+    UnicodeMatcher** segments;

    /**
-     * A value we compute from segments.  The first index into segments[]
-     * that is >= anteContextLength.  That is, the first one that is within
-     * the forward scanned part of the pattern -- the key or the postContext.
-     * If there are no segments, this has the value -1.  This index is relative
-     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
-     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
+     * The number of elements in segments[] or zero if segments is NULL.
     */
-    int32_t firstKeySeg;
+    int32_t segmentsCount;

    /**
     * The length of the string that must match before the key.  If
@ -143,11 +153,10 @@ public:
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param adoptedSegs array of 2n integers.  Each of n pairs consists of offset,
-     * limit for a segment of the input string.  Characters in the output string
-     * refer to these segments if they are in a special range determined by the
-     * associated RuleBasedTransliterator.Data object.  May be null if there are
-     * no segments.
+     * @param segs array of UnicodeMatcher corresponding to input pattern
+     * segments, or null if there are none.  The array itself is adopted,
+     * but the pointers within it are not.
+     * @param segsCount number of elements in segs[]
     * @param anchorStart TRUE if the the rule is anchored on the left to
     * the context start
     * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -157,7 +166,8 @@ public:
                        int32_t anteContextPos, int32_t postContextPos,
                        const UnicodeString& outputStr,
                        int32_t cursorPosition, int32_t cursorOffset,
-                        int32_t* adoptedSegs,
+                        UnicodeMatcher** segs,
+                        int32_t segsCount,
                        UBool anchorStart, UBool anchorEnd,
                        const TransliterationRuleData* data,
                        UErrorCode& status);
--- a/icu4c/source/i18n/strmatch.cpp
+++ b/icu4c/source/i18n/strmatch.cpp
@ -18,7 +18,9 @@ StringMatcher::StringMatcher(const UnicodeString& theString,
                             UBool isSeg,
                             const TransliterationRuleData& theData) :
    data(theData),
-    isSegment(isSeg)
+    isSegment(isSeg),
+    matchStart(-1),
+    matchLimit(-1)
 {
    theString.extractBetween(start, limit, pattern);
 }
@ -27,7 +29,9 @@ StringMatcher::StringMatcher(const StringMatcher& o) :
     UnicodeMatcher(o),
     pattern(o.pattern),
     data(o.data),
-    isSegment(o.isSegment)
+    isSegment(o.isSegment),
+    matchStart(o.matchStart),
+    matchLimit(o.matchLimit) // copy both ends of the recorded match range
 {
 }

@ -54,6 +58,7 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
    int32_t i;
    int32_t cursor = offset;
    if (limit < cursor) {
+        // Match in the reverse direction
        for (i=pattern.length()-1; i>=0; --i) {
            UChar keyChar = pattern.charAt(i);
            const UnicodeMatcher* subm = data.lookup(keyChar);
@ -72,6 +77,14 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
                }
            }
        }
+        // Record the match position, but adjust for a normal
+        // forward start, limit, and only if a prior match does not
+        // exist -- we want the rightmost match.
+        if (matchStart < 0) {
+            // cast away const -- should modify method to be non-const
+            ((StringMatcher*)this)->matchStart = cursor+1;
+            ((StringMatcher*)this)->matchLimit = offset+1;
+        }
    } else {
        for (i=0; i<pattern.length(); ++i) {
            if (incremental && cursor == limit) {
@ -99,6 +112,10 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
                }
            }
        }
+        // Record the match position
+        // cast away const -- should modify method to be non-const
+        ((StringMatcher*)this)->matchStart = offset;
+        ((StringMatcher*)this)->matchLimit = cursor;
    }

    offset = cursor;
@ -128,8 +145,8 @@ UnicodeString& StringMatcher::toPattern(UnicodeString& result,
        result.append((UChar)41); /*)*/
    }
    // Flush quoteBuf out to result
-    TransliterationRule::appendToRule(result, (UChar32)(isSegment?41/*)*/:-1),
-                                          TRUE, escapeUnprintable, quoteBuf);
+    TransliterationRule::appendToRule(result, -1,
+                                      TRUE, escapeUnprintable, quoteBuf);
    return result;
 }

@ -145,6 +162,32 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 }

+/**
+ * Discard any recorded match by resetting the match start and limit to -1.
+ * This must be called before performing a set of matches with this segment.
+ */
+void StringMatcher::resetMatch() {
+    matchStart = matchLimit = -1;
+}
+
+/**
+ * Return the start offset, in the match text, of the <em>rightmost</em>
+ * match.  This method may get moved up into the UnicodeMatcher if
+ * it turns out to be useful to generalize this.
+ */
+int32_t StringMatcher::getMatchStart() const {
+    return matchStart;
+}
+
+/**
+ * Return the limit offset, in the match text, of the <em>rightmost</em>
+ * match.  This method may get moved up into the UnicodeMatcher if
+ * it turns out to be useful to generalize this.
+ */
+int32_t StringMatcher::getMatchLimit() const {
+    return matchLimit;
+}
+
 U_NAMESPACE_END

 //eof
--- a/icu4c/source/i18n/strmatch.h
+++ b/icu4c/source/i18n/strmatch.h
@ -59,6 +59,26 @@ class StringMatcher : public UnicodeMatcher {
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

+    /**
+     * Remove any match data.  This must be called before performing a
+     * set of matches with this segment.
+     */
+    void resetMatch();
+
+    /**
+     * Return the start offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    int32_t getMatchStart() const;
+
+    /**
+     * Return the limit offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    int32_t getMatchLimit() const;
+
 private:

    UnicodeString pattern;
@ -66,6 +86,10 @@ class StringMatcher : public UnicodeMatcher {
    const TransliterationRuleData& data;

    UBool isSegment;
+
+    int32_t matchStart;
+
+    int32_t matchLimit;
 };

 U_NAMESPACE_END
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
- * $Date: 2001/10/26 22:59:26 $
- * $Revision: 1.57 $
+ * $Date: 2001/10/30 18:08:19 $
+ * $Revision: 1.58 $
 *
 *****************************************************************************************
 */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
               "c abc ababc",
               "d d abd");

+        // NOTE: The (ab)+ when referenced just yields a single "ab",
+        // not the full sequence of them.  This accords with perl behavior.
        expect("(ab)+ {x} > '(' $1 ')';",
               "x abx ababxy",
-               "x ab(ab) abab(abab)y");
+               "x ab(ab) abab(ab)y");

        expect("b+ > x;",
               "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
               "qa qab qaba qababc",
               "xa x xa xc");

-        // Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
-        // In perl, it only matches the first occurrence, so the output
-        // is "()a (ab) (ab)a (ab)c".
+        // NOTE: The (ab)* when referenced yields a single "ab" (or nothing if
+        // it matched zero times), not the full run.  This accords with perl behavior.
        expect("q(ab)* > '(' $1 ')';",
               "qa qab qaba qababc",
-               "()a (ab) (ab)a (abab)c");
+               "()a (ab) (ab)a (ab)c");

        // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
        // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
        expect(gr, "\u03B1\u0314", "ha");
    }

+    /**
+     * Test quantified segment behavior.  We want:
+     * ([abc])+ > x $1 x; applied to "cba" produces "xax"
+     */
+    public void TestQuantifiedSegment() {
+        // The normal case
+        expect("([abc]+) > x $1 x;", "cba", "xcbax");
+
+        // The tricky case; the quantifier is around the segment
+        expect("([abc])+ > x $1 x;", "cba", "xax");
+
+        // Tricky case in reverse direction
+        expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
+
+        // Check post-context segment
+        expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
+
+        // Test toRule/toPattern for non-quantified segment.
+        // Careful with spacing here.
+        String r = "([a-c]){q} > x $1 x;";
+        Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
+        String rr = t.toRules(true);
+        if (!r.equals(rr)) {
+            errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        } else {
+            logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        }
+
+        // Test toRule/toPattern for quantified segment.
+        // Careful with spacing here.
+        r = "([a-c])+{q} > x $1 x;";
+        t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
+        rr = t.toRules(true);
+        if (!r.equals(rr)) {
+            errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        } else {
+            logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        }
+    }
+
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
--- a/icu4j/src/com/ibm/icu/text/StringMatcher.java
+++ b/icu4j/src/com/ibm/icu/text/StringMatcher.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $ 
- * $Date: 2001/10/25 22:32:02 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/10/30 18:04:08 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {

    private boolean isSegment;

+    private int matchStart;
+
+    private int matchLimit;
+
    private final RuleBasedTransliterator.Data data;

+    public StringMatcher(String theString,
+                         boolean isSeg,
+                         RuleBasedTransliterator.Data theData) {
+        data = theData;
+        isSegment = isSeg;
+        pattern = theString;
+        matchStart = matchLimit = -1;
+    }
+
    public StringMatcher(String theString,
                         int start,
                         int limit,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
-        data = theData;
-        isSegment = isSeg;
-        pattern = theString.substring(start, limit);
+        this(theString.substring(start, limit), isSeg, theData);
    }

    /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
        int i;
        int[] cursor = new int[] { offset[0] };
        if (limit < cursor[0]) {
+            // Match in the reverse direction
            for (i=pattern.length()-1; i>=0; --i) {
                char keyChar = pattern.charAt(i);
                UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
+            // Record the match position, but adjust for a normal
+            // forward start, limit, and only if a prior match does not
+            // exist -- we want the rightmost match.
+            if (matchStart < 0) {
+                matchStart = cursor[0]+1;
+                matchLimit = offset[0]+1;
+            }
        } else {
            for (i=0; i<pattern.length(); ++i) {
                if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
+            // Record the match position
+            matchStart = offset[0];
+            matchLimit = cursor[0];
        }

        offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
            result.append(')');
        }
        // Flush quoteBuf out to result
-        TransliterationRule.appendToRule(result, (isSegment?')':-1),
+        TransliterationRule.appendToRule(result, -1,
                                         true, escapeUnprintable, quoteBuf);
        return result.toString();
    }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
        UnicodeMatcher m = data.lookup(c);
        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
    }
+
+    /**
+     * Remove any match data.  This must be called before performing a
+     * set of matches with this segment.
+     */
+    public void resetMatch() {
+        matchStart = matchLimit = -1;
+    }
+
+    /**
+     * Return the start offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    public int getMatchStart() {
+        return matchStart;
+    }
+
+    /**
+     * Return the limit offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    public int getMatchLimit() {
+        return matchLimit;
+    }
 }

 //eof
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
- * $Date: 2001/10/25 23:22:15 $
- * $Revision: 1.33 $
+ * $Date: 2001/10/30 18:04:08 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
- * <p>A rule may contain segments in its input string and segment references in
- * its output string.  A segment is a substring of the input pattern, indicated
- * by an offset and limit.  The segment may span the preceding or following
- * context.  A segment reference is a special character in the output string
- * that causes a segment of the input string (not the input pattern) to be
- * copied to the output string.  The range of special characters that represent
- * segment references is defined by RuleBasedTransliterator.Data.
+ * <p>A rule may contain segments in its input string and segment
+ * references in its output string.  A segment is a substring of the
+ * input pattern, indicated by an offset and limit.  The segment may
+ * be in the preceding or following context.  It may not span a
+ * context boundary.  A segment reference is a special character in
+ * the output string that causes a segment of the input string (not
+ * the input pattern) to be copied to the output string.  The range of
+ * special characters that represent segment references is defined by
+ * RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
 */
 class TransliterationRule {

@ -64,20 +66,13 @@ class TransliterationRule {
    private String output;

    /**
-     * An array of integers encoding the position of the segments.
-     * See RuleBasedTransliterator.Segments for more details.
+     * An array of matcher objects corresponding to the input pattern
+     * segments.  If there are no segments this is null.  N.B. This is
+     * a UnicodeMatcher for generality, but in practice it is always a
+     * StringMatcher.  In the future we may generalize this, but for
+     * now we sometimes cast down to StringMatcher.
     */
-    int[] segments;
-
-    /**
-     * A value we compute from segments.  The first index into segments[]
-     * that is >= anteContextLength.  That is, the first one that is within
-     * the forward scanned part of the pattern -- the key or the postContext.
-     * If there are no segments, this has the value -1.  This index is relative
-     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
-     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
-     */
-    int firstKeySeg;
+    UnicodeMatcher[] segments;

    /**
     * The length of the string that must match before the key.  If
@ -127,20 +122,6 @@ class TransliterationRule {
    private static final char APOSTROPHE = '\'';
    private static final char BACKSLASH  = '\\';

-    // Macros for accessing the array of integers encoding the position of
-    // the segments.  See RuleBasedTransliterator.Segments for more details.
-    // SEGMENTS_COUNT number of segments, n (half the number of parens)
-    // SEGMENTS_LEN   length of the segments array (number of elements)
-    // SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
-    // SEGMENTS_NUM   index into segments to access POS of $1.open,
-    //                $1.close, $2.open, $2.close,.., $n.open, $n.close
-    //                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
-    static final int FIRST_SEG_POS_INDEX = 2;
-    static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
-    static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
-    static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
-    static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
-
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999-2001. All rights reserved.";

@ -165,12 +146,8 @@ class TransliterationRule {
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param segs array of 2n integers.  Each of n pairs consists of offset,
-     * limit for a segment of the input string.  Characters in the output string
-     * refer to these segments if they are in a special range determined by the
-     * associated RuleBasedTransliterator.Data object.  May be null if there are
-     * no segments.  The caller is responsible for validating that segments
-     * are well-formed.
+     * @param segs array of UnicodeMatcher corresponding to input pattern
+     * segments, or null if there are none
     * @param anchorStart true if the rule is anchored on the left to
     * the context start
     * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos, int cursorOffset,
-                               int[] segs,
+                               UnicodeMatcher[] segs,
                               boolean anchorStart, boolean anchorEnd,
                               RuleBasedTransliterator.Data theData) {
        data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
        this.cursorPos = cursorPos + cursorOffset;
        this.output = output;
        // We don't validate the segments array.  The caller must
-        // guarantee that the segments are well-formed.
+        // guarantee that the segments are well-formed (that is, that
+        // all $n references in the output refer to indices of this
+        // array, and that no array elements are null).
        this.segments = segs;

-        // Find the position of the first segment index that is after the
-        // anteContext (in the key).  Note that this may be a start or a
-        // limit index.  If all segments are in the ante context,
-        // firstKeySeg should point past the last segment -- that is, it
-        // should point at the end marker, which is -1.  This allows the
-        // code to back up by one to obtain the last ante context segment.
-        firstKeySeg = -1;
-        if (segments != null) {
-            firstKeySeg = FIRST_SEG_POS_INDEX;
-            while (segments[firstKeySeg] >= 0 &&
-                   segments[firstKeySeg] < anteContextLength) {
-                ++firstKeySeg;
-            }
-            firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
-        }
-
        pattern = input;
        flags = 0;
        if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {

        // ============================ MATCH ===========================

-        // Record the actual positions, in the text, of the segments.
-        // These are recorded in the order that they occur in the pattern.
-
-        // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
-        // records the position in 'text' of each segment boundary, in
-        // the order that they occur in 'pattern'.
-        int[] segPos = null;
+        // Reset segment match data
        if (segments != null) {
-            segPos = new int[2*SEGMENTS_COUNT(segments)];
+            for (int i=0; i<segments.length; ++i) {
+                ((StringMatcher) segments[i]).resetMatch();
+            }
        }
-        // iSeg is an index into segments[] that accesses the first
-        // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
-        // When indexing into segments[] FIRST_SEG_POS_INDEX must be
-        // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
-        int iSeg = firstKeySeg - 1;
-        // nextSegPos is an offset in 'pattern'.  When the cursor is
-        // equal to nextSegPos, we are at a segment boundary, and we
-        // record the position in the real text in segPos[].
-        int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;

        int lenDelta, keyLimit;
        int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
                }
                oText = intRef[0];
            }
-            while (nextSegPos == oPattern) {
-                segPos[iSeg] = oText;
-                if (oText >= 0) {
-                    segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
-                } else {
-                    ++segPos[iSeg];
-                }
-                nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
-            }
        }

        minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {

        // -------------------- Key and Post Context --------------------

-        iSeg = firstKeySeg;
-        nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
-
        oPattern = 0;
        oText = pos.start;
        keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
            // depending on whether we're in the key or in the post
            // context.

-            while (oPattern == nextSegPos) {
-                segPos[iSeg] = oText;
-                nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
-            }
            if (oPattern == keyLength) {
                keyLimit = oText;
            }
@ -554,10 +488,6 @@ class TransliterationRule {
            //!    return UnicodeMatcher.U_MISMATCH;
            //!}
        }
-        while (oPattern == nextSegPos) {
-            segPos[iSeg] = oText;
-            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
-        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -576,8 +506,7 @@ class TransliterationRule {
        // =========================== REPLACE ==========================

        // We have a full match.  The key is between pos.start and
-        // keyLimit.  Segment indices have been recorded in segPos[].
-        // Perform a replacement.
+        // keyLimit.

        if (segments == null) {
            text.replace(pos.start, keyLimit, output);
@ -629,11 +558,22 @@ class TransliterationRule {
                        buf.setLength(0);
                    }
                    // Copy segment with out-of-band data
-                    b *= 2;
-                    int start = segPos[SEGMENTS_NUM(segments,b)];
-                    int limit = segPos[SEGMENTS_NUM(segments,b+1)];
-                    text.copy(start, limit, dest);
-                    dest += limit - start;
+                    StringMatcher m = (StringMatcher) segments[b];
+                    int start = m.getMatchStart();
+                    int limit = m.getMatchLimit();
+                    // If there was no match, that means that a quantifier
+                    // matched zero-length.  E.g., x (a)* y matched "xy".
+                    if (start >= 0) {
+                        // Adjust indices for segments in post context
+                        // for any inserted text between the key and
+                        // the post context.
+                        if (start >= keyLimit) {
+                            start += dest - keyLimit;
+                            limit += dest - keyLimit;
+                        }
+                        text.copy(start, limit, dest);
+                        dest += limit - start;
+                    }
                }
                oOutput += UTF16.getCharCount(c);
            }
@ -790,20 +730,6 @@ class TransliterationRule {

        StringBuffer rule = new StringBuffer();

-        // iseg indexes into segments[] directly (not offset from FSPI)
-        int iseg = FIRST_SEG_POS_INDEX-1;
-        int nextSeg = -1;
-        // Build an array of booleans specifying open vs. close paren
-        boolean[] isOpen = null;
-        if (segments != null) {
-            isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
-            for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
-                isOpen[SEGMENTS_NUM(segments,i)  ] = true;
-                isOpen[SEGMENTS_NUM(segments,i+1)] = false;
-            }
-            nextSeg = segments[++iseg];
-        }
-
        // Accumulate special characters (and non-specials following them)
        // into quoteBuf.  Append quoteBuf, within single quotes, when
        // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
                appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
            }

-            // Append either '(' or ')' if we are at a segment index
-            if (i == nextSeg) {
-                appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
-                                 '(' : ')',
-                                 true, escapeUnprintable, quoteBuf);
-                nextSeg = segments[++iseg];
-            }
-
            if (emitBraces && i == (anteContextLength + keyLength)) {
                appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
            }
@ -847,11 +765,6 @@ class TransliterationRule {
            }
        }

-        if (i == nextSeg) {
-            // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
-            appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
-        }
-
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
        }
@ -885,7 +798,7 @@ class TransliterationRule {
            } else {
                ++seg; // make 1-based
                appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
-                rule.append(0x24 /*$*/);
+                rule.append('$');
                boolean show = false; // true if we should display digits
                for (int p=9; p>=0; --p) {
                    int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {

 /**
 * $Log: TransliterationRule.java,v $
+ * Revision 1.34  2001/10/30 18:04:08  alan
+ * jitterbug 1406: make quantified segments behave like perl counterparts
+ *
 * Revision 1.33  2001/10/25 23:22:15  alan
 * jitterbug 73: changes to support zero-length matchers at end of key
 *
--- a/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
@ -4,8 +4,8 @@
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
-* $Date: 2001/10/24 00:03:38 $
-* $Revision: 1.7 $
+* $Date: 2001/10/30 18:04:09 $
+* $Revision: 1.8 $
 **********************************************************************
 */
 package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op

    private static final String OPERATORS = "=><";
+    private static final String HALF_ENDERS = "=><;";

    // Other special characters
    private static final char QUOTE               = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
    // private static final char ANCHOR_END       = '$';

    // Segments of the input string are delimited by "(" and ")".  In the
-    // output string these segments are referenced as "$1" through "$9".
+    // output string these segments are referenced as "$1", "$2", etc.
    private static final char SEGMENT_OPEN        = '(';
    private static final char SEGMENT_CLOSE       = ')';

@ -285,209 +286,6 @@ class TransliteratorParser {
        }
    };

-    //----------------------------------------------------------------------
-    // class Segments
-    //----------------------------------------------------------------------
-
-    /**
-     * Segments are parentheses-enclosed regions of the input string.
-     * These are referenced in the output string using the notation $1,
-     * $2, etc.  Numbering is in order of appearance of the left
-     * parenthesis.  Number is one-based.  Segments are defined as start,
-     * limit pairs.  Segments may nest.
-     *
-     * During parsing, segment data is encoded in an object of class
-     * Segments.  At runtime, the same data is encoded in compact form as
-     * an array of integers in a TransliterationRule.  The runtime encoding
-     * must satisfy three goals:
-     *
-     * 1. Iterate over the offsets in a pattern, from left to right,
-     *    and indicate all segment boundaries, in order.  This is done
-     *    during matching.
-     *
-     * 2. Given a reference $n, produce the start and limit offsets
-     *    for that segment.  This is done during replacement.
-     *
-     * 3. Similar to goal 1, but in addition, indicate whether each
-     *    segment boundary is a start or a limit, in other words, whether
-     *    each is an open paren or a close paren.  This is required by
-     *    the toRule() method.
-     *
-     * Goal 1 must be satisfied at high speed since this is done during
-     * matching.  Goal 2 is next most important.  Goal 3 is not performance
-     * critical since it is only needed by toRule().
-     *
-     * The array of integers is actually two arrays concatenated.  The
-     * first gives the index values of the open and close parentheses in
-     * the order they appear.  The second maps segment numbers to the
-     * indices of the first array.  The two arrays have the same length.
-     * Iterating over the first array satisfies goal 1.  Indexing into the
-     * second array satisfies goal 2.  Goal 3 is satisfied by iterating
-     * over the second array and constructing the required data when
-     * needed.  This is what toRule() does.
-     *
-     * Example:  (a b(c d)e f)
-     *            0 1 2 3 4 5 6
-     *
-     * First array: Indices are 0, 2, 4, and 6.
-
-     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
-     * second array is 0, 3, 1 2 -- these give the indices in the
-     * first array at which $1:open, $1:close, $2:open, and $2:close
-     * occur.
-     *
-     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
-     *
-     * Each subarray is terminated with a -1, and two leading entries
-     * give the number of segments and the offset to the first entry
-     * of the second array.  In addition, the second array value are
-     * all offset by 2 so they index directly into the final array.
-     * The total array size is 4*segments[0] + 4.  The second index is
-     * 2*segments[0] + 3.
-     *
-     * In the output string, a segment reference is indicated by a
-     * character in a special range, as defined by
-     * RuleBasedTransliterator.Data.
-     *
-     * Most rules have no segments, in which case segments is null, and the
-     * output string need not be checked for segment reference characters.
-     *
-     * See also rbt_rule.h/cpp.
-     */
-    private static class Segments {
-
-        private Vector offsets; // holds Integer objects
-
-        private Vector isOpenParen; // holds Boolean objects
-
-        private int offset(int i) {
-            return ((Integer) offsets.elementAt(i)).intValue();
-        }
-
-        private boolean isOpen(int i) {
-            return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
-        }
-
-        // size of the Vectors
-        private int size() {
-            // assert(offset.size() == isOpenParen.size());
-            return offsets.size();
-        }
-
-        public Segments() {
-            offsets = new Vector();
-            isOpenParen = new Vector();
-        }
-
-        public void addParenthesisAt(int offset, boolean isOpen) {
-            offsets.addElement(new Integer(offset));
-            isOpenParen.addElement(new Boolean(isOpen));
-        }
-
-        public int getLastParenOffset(boolean[] isOpenParen) {
-            if (size() == 0) {
-                return -1;
-            }
-            isOpenParen[0] = isOpen(size()-1);
-            return offset(size()-1);
-        }
-
-        // Remove the last (rightmost) segment.  Store its offsets in start
-        // and limit, and then convert all offsets at or after start to be
-        // equal to start.  Upon failure, return FALSE.  Assume that the
-        // caller has already called getLastParenOffset() and validated that
-        // there is at least one parenthesis and that the last one is a close
-        // paren.
-        public boolean extractLastParenSubstring(int[] start, int[] limit) {
-            // assert(offsets.size() > 0);
-            // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
-            int i = size() - 1;
-            int n = 1; // count of close parens we need to match
-            // Record position of the last close paren
-            limit[0] = offset(i);
-            --i; // back up to the one before the last one
-            while (i >= 0 && n != 0) {
-                n += isOpen(i) ? -1 : 1;
-            }
-            if (n != 0) {
-                return false;
-            }
-            // assert(i>=0);
-            start[0] = offset(i);
-            // Reset all segment pairs from i to size() - 1 to [start, start+1).
-            while (i<size()) {
-                int o = isOpen(i) ? start[0] : (start[0]+1);
-                offsets.setElementAt(new Integer(o), i);
-                ++i;
-            }
-            return true;
-        }
-
-        // Assume caller has already gotten a TRUE validate().
-        public int[] createArray() {
-            int c = count(); // number of segments
-            int arrayLen = 4*c + 4;
-            int[] array = new int[arrayLen];
-            int a2offset = 2*c + 3; // offset to array 2
-
-            array[0] = c;
-            array[1] = a2offset;
-            int i;
-            for (i=0; i<2*c; ++i) {
-                array[2+i] = offset(i);
-            }
-            array[a2offset-1] = -1;
-            array[arrayLen-1] = -1;
-            // Now walk through and match up segment numbers with parentheses.
-            // Number segments from 0.  We're going to offset all entries by 2
-            // to skip the first two elements, array[0] and array[1].
-            Stack stack = new Stack();
-            int nextOpen = 0; // seg # of next open, 0-based
-            for (i=0; i<2*c; ++i) {
-                boolean open = isOpen(i);
-                // Let seg be the zero-based segment number.
-                // Open parens are at 2*seg in array 2.
-                // Close parens are at 2*seg+1 in array 2.
-                if (open) {
-                    array[a2offset + 2*nextOpen] = 2+i;
-                    stack.push(new Integer(nextOpen));
-                    ++nextOpen;
-                } else {
-                    int nextClose = ((Integer) stack.pop()).intValue();
-                    array[a2offset + 2*nextClose+1] = 2+i;
-                }
-            }
-            // assert(stack.empty());
-
-            return array;
-        }
-
-        public boolean validate() {
-            // want number of parens >= 2
-            // want number of parens to be even
-            // want first paren '('
-            // want parens to match up in the end
-            if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
-                return false;
-            }
-            int n = 0;
-            for (int i=0; i<size(); ++i) {
-                n += isOpen(i) ? 1 : -1;
-                if (n < 0) {
-                    return false;
-                }
-            }
-            return n == 0;
-        }
-
-        // Number of segments
-        // Assume caller has already gotten a TRUE validate().
-        public int count() {
-            // assert(validate());
-            return size() / 2;
-        }
-    }
-
    //----------------------------------------------------------------------
    // class RuleHalf
    //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
        public int ante = -1;   // position of ante context marker '{' in text
        public int post = -1;   // position of post context marker '}' in text

-        // Record the position of the segment substrings and references.  A
-        // given side should have segments or segment references, but not
-        // both.
-        public Segments segments = null;
-        public int maxRef = -1; // index of largest ref (1..9)
+        public int maxRef = -1; // n where maximum segment ref is $n; 1-based

        // Record the offset to the cursor either to the left or to the
        // right of the key.  This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
        // output text.
        public int cursorOffset = 0; // only nonzero on output side

+        // Position of first CURSOR_OFFSET on _right_.  This will be -1
+        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
+        private int cursorOffsetPos = 0;
+
        public boolean anchorStart = false;
        public boolean anchorEnd   = false;

+        /**
+         * UnicodeMatcher objects corresponding to each segment.
+         */
+        public Vector segments = new Vector();
+        
+        /**
+         * The segment number from 0..n-1 of the next '(' we see
+         * during parsing; 0-based.
+         */
+        private int nextSegmentNumber = 0;
+
        /**
         * Parse one side of a rule, stopping at either the limit,
-         * the END_OF_RULE character, or an operator.  Return
-         * the pos of the terminating character (or limit).
+         * the END_OF_RULE character, or an operator.
+         * @return the index after the terminating character, or
+         * if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit,
                         TransliteratorParser parser) {
            int start = pos;
            StringBuffer buf = new StringBuffer();
+            pos = parseSection(rule, pos, limit, parser, buf, false);
+            text = buf.toString();
+
+            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
+                syntaxError("Misplaced " + CURSOR_POS, rule, start);
+            }
+
+            return pos;
+        }
+
+        /**
+         * Parse a section of one side of a rule, stopping at either
+         * the limit, the END_OF_RULE character, an operator, or a
+         * segment close character.  This method parses both a
+         * top-level rule half and a segment within such a rule half.
+         * It calls itself recursively to parse segments and nested
+         * segments.
+         * @param buf buffer into which to accumulate the rule pattern
+         * characters, either literal characters from the rule or
+         * standins for UnicodeMatcher objects including segments.
+         * @param isSegment if true, then we've already seen a '(' and
+         * pos on entry points right after it.  Accumulate everything
+         * up to the closing ')', put it in a segment matcher object,
+         * generate a standin for it, and add the standin to buf.  As
+         * a side effect, update the segments vector with a reference
+         * to the segment matcher.  This works recursively for nested
+         * segments.  If isSegment is false, just accumulate
+         * characters into buf.
+         * @return the index after the terminating character, or
+         * if limit was reached, limit
+         */
+        private int parseSection(String rule, int pos, int limit,
+                                 TransliteratorParser parser,
+                                 StringBuffer buf,
+                                 boolean isSegment) {
+            int start = pos;
            ParsePosition pp = null;
-            int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
-            boolean done = false;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];

+            // If isSegment, then bufSegStart is the offset in buf to
+            // the first character of the segment we are parsing.
+            int bufSegStart = 0;
+            int segmentNumber = 0;
+            if (isSegment) {
+                bufSegStart = buf.length();
+                segmentNumber = nextSegmentNumber++;
+            }
+
        main:
-            while (pos < limit && !done) {
+            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (Character.isWhitespace(c)) {
                    // Ignore whitespace.  Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
                    // whitespace likely to be seen in code.
                    continue;
                }
-                if (OPERATORS.indexOf(c) >= 0) {
-                    --pos; // Backup to point to operator
+                // HALF_ENDERS is all chars that end a rule half: "=><;"
+                if (HALF_ENDERS.indexOf(c) >= 0) {
+                    if (isSegment) {
+                        syntaxError("Unclosed segment", rule, start);
+                    }
                    break main;
                }
                if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
                    }
                    continue;
                }
+
                switch (c) {
+                    
+                //------------------------------------------------------
+                // Elements allowed within and out of segments
+                //------------------------------------------------------
                case ANCHOR_START:
                    if (buf.length() == 0 && !anchorStart) {
                        anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
                    }
                    break;
                case SEGMENT_OPEN:
-                case SEGMENT_CLOSE:
-                    // Handle segment definitions "(" and ")"
-                    // Parse "(", ")"
-                    if (segments == null) {
-                        segments = new Segments();
-                    }
-                    segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
+                    pos = parseSection(rule, pos, limit, parser, buf, true);
                    break;
-                case END_OF_RULE:
-                    --pos; // Backup to point to END_OF_RULE
-                    break main;
                case SymbolTable.SYMBOL_REF:
                     // Handle variable references and segment references "$1", "$2", etc.
                    {
@ -676,7 +528,7 @@ class TransliteratorParser {
                            }
                            pp.setIndex(pos);
                            String name = parser.parseData.
-                                            parseReference(rule, pp, limit);
+                                parseReference(rule, pp, limit);
                            if (name == null) {
                                // This means the '$' was not followed by a
                                // valid name.  Try to interpret it as an
@ -697,25 +549,129 @@ class TransliteratorParser {
                        }
                    }
                    break;
+                case DOT:
+                    buf.append(parser.getDotStandIn());
+                    break;
+                case KLEENE_STAR:
+                case ONE_OR_MORE:
+                case ZERO_OR_ONE:
+                    // Quantifiers.  We handle single characters, quoted strings,
+                    // variable references, and segments.
+                    //  a+      matches  aaa
+                    //  'foo'+  matches  foofoofoo
+                    //  $v+     matches  xyxyxy if $v == xy
+                    //  (seg)+  matches  segsegseg
+                    {
+                        if (isSegment && buf.length() == bufSegStart) {
+                            // The */+ immediately follows '('
+                            syntaxError("Misplaced quantifier", rule, start);
+                            break;
+                        } 
+ 
+                        int qstart, qlimit;
+                        // The */+ follows an isolated character or quote
+                        // or variable reference
+                        if (buf.length() == quoteLimit) {
+                            // The */+ follows a 'quoted string'
+                            qstart = quoteStart;
+                            qlimit = quoteLimit;
+                        } else if (buf.length() == varLimit) {
+                            // The */+ follows a $variableReference
+                            qstart = varStart;
+                            qlimit = varLimit;
+                        } else {
+                            // The */+ follows a single character, possibly
+                            // a segment standin
+                            qstart = buf.length() - 1;
+                            qlimit = qstart + 1;
+                        }
+
+                        UnicodeMatcher m =
+                            new StringMatcher(buf.toString(), qstart, qlimit,
+                                              false, parser.data);
+                        int min = 0;
+                        int max = Quantifier.MAX;
+                        switch (c) {
+                        case ONE_OR_MORE:
+                            min = 1;
+                            break;
+                        case ZERO_OR_ONE:
+                            min = 0;
+                            max = 1;
+                            break;
+                            // case KLEENE_STAR:
+                            //    do nothing -- min, max already set
+                        }
+                        m = new Quantifier(m, min, max);
+                        buf.setLength(qstart);
+                        buf.append(parser.generateStandInFor(m));
+                    }
+                    break;
+
+                //------------------------------------------------------
+                // Elements allowed ONLY WITHIN segments
+                //------------------------------------------------------
+                case SEGMENT_CLOSE:
+                    if (isSegment) {
+                        // We're done parsing a segment.  The relevant
+                        // characters are in buf, starting at offset
+                        // bufSegStart.  Extract them into a string
+                        // matcher, and replace them with a standin
+                        // for that matcher.
+                        StringMatcher m =
+                            new StringMatcher(buf.substring(bufSegStart),
+                                              true, parser.data);
+                        // Since we call parseSection() recursively,
+                        // nested segments will result in segment i+1
+                        // getting parsed and stored before segment i;
+                        // be careful with the vector handling here.
+                        if ((segmentNumber+1) > segments.size()) {
+                            segments.setSize(segmentNumber+1);
+                        }
+                        segments.setElementAt(m, segmentNumber);
+                        buf.setLength(bufSegStart);
+                        buf.append(parser.generateStandInFor(m));
+                        break main;
+                    }
+                    // If we aren't in a segment, then a segment close
+                    // character is a syntax error.
+                    syntaxError("Unquoted special", rule, start);
+                    break;
+
+                //------------------------------------------------------
+                // Elements allowed ONLY OUTSIDE segments
+                //------------------------------------------------------
                case CONTEXT_ANTE:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (ante >= 0) {
                        syntaxError("Multiple ante contexts", rule, start);
                    }
                    ante = buf.length();
                    break;
                case CONTEXT_POST:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (post >= 0) {
                        syntaxError("Multiple post contexts", rule, start);
                    }
                    post = buf.length();
                    break;
                case CURSOR_POS:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
                    }
                    cursor = buf.length();
                    break;
                case CURSOR_OFFSET:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (cursorOffset < 0) {
                        if (buf.length() > 0) {
                            syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
                        }
                    }
                    break;
-                case DOT:
-                    buf.append(parser.getDotStandIn());
-                    break;
-                case KLEENE_STAR:
-                case ONE_OR_MORE:
-                case ZERO_OR_ONE:
-                    // Quantifiers.  We handle single characters, quoted strings,
-                    // variable references, and segments.
-                    //  a+      matches  aaa
-                    //  'foo'+  matches  foofoofoo
-                    //  $v+     matches  xyxyxy if $v == xy
-                    //  (seg)+  matches  segsegseg
-                    {
-                        int qstart, qlimit;
-                        boolean[] isOpenParen = new boolean[1];
-                        boolean isSegment = false;
-                        if (segments != null &&
-                            segments.getLastParenOffset(isOpenParen) == buf.length()) {
-                            // The */+ immediately follows a segment
-                            if (isOpenParen[0]) {
-                                syntaxError("Misplaced quantifier", rule, start);
-                            }
-                            int[] startparam = new int[1];
-                            int[] limitparam = new int[1];
-                            if (!segments.extractLastParenSubstring(startparam, limitparam)) {
-                                syntaxError("Mismatched segment delimiters", rule, start);
-                            }
-                            qstart = startparam[0];
-                            qlimit = limitparam[0];
-                            isSegment = true;
-                        } else {
-                            // The */+ follows an isolated character or quote
-                            // or variable reference
-                            if (buf.length() == quoteLimit) {
-                                // The */+ follows a 'quoted string'
-                                qstart = quoteStart;
-                                qlimit = quoteLimit;
-                            } else if (buf.length() == varLimit) {
-                                // The */+ follows a $variableReference
-                                qstart = varStart;
-                                qlimit = varLimit;
-                            } else {
-                                // The */+ follows a single character
-                                qstart = buf.length() - 1;
-                                qlimit = qstart + 1;
-                            }
-                        }
-                        UnicodeMatcher m =
-                            new StringMatcher(buf.toString(), qstart, qlimit,
-                                              isSegment, parser.data);
-                        int min = 0;
-                        int max = Quantifier.MAX;
-                        switch (c) {
-                        case ONE_OR_MORE:
-                            min = 1;
-                            break;
-                        case ZERO_OR_ONE:
-                            min = 0;
-                            max = 1;
-                            break;
-                            // case KLEENE_STAR:
-                            //    do nothing -- min, max already set
-                        }
-                        m = new Quantifier(m, min, max);
-                        buf.setLength(qstart);
-                        buf.append(parser.generateStandInFor(m));
-                    }
-                    break;
+
+                //------------------------------------------------------
+                // Non-special characters
+                //------------------------------------------------------
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
                    break;
                }
            }
-
-            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
-                syntaxError("Misplaced " + CURSOR_POS, rule, start);
-            }
-            text = buf.toString();
            return pos;
        }

@ -838,10 +725,12 @@ class TransliteratorParser {
        }

        /**
-         * Create and return an int[] array of segments.
+         * Create and return a UnicodeMatcher[] array of segments,
+         * or null if there are no segments.
         */
-        int[] createSegments() {
-            return (segments == null) ? null : segments.createArray();
+        UnicodeMatcher[] createSegments() {
+            return (segments.size() == 0) ? null :
+                (UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
        }
    }

@ -1096,9 +985,10 @@ class TransliteratorParser {
        pos = left.parse(rule, pos, limit, this);

        if (pos == limit ||
-            OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
-            syntaxError("No operator", rule, start);
+            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
+            syntaxError("No operator pos=" + pos, rule, start);
        }
+        ++pos;

        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
        pos = right.parse(rule, pos, limit, this);

        if (pos < limit) {
-            if (rule.charAt(pos) == END_OF_RULE) {
+            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
-            right.segments = null;
+            right.segments.removeAllElements();
            left.cursor = left.maxRef = -1;
            left.cursorOffset = 0;
        }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
-            right.segments != null || left.maxRef >= 0 ||
+            right.segments.size() > 0 || left.maxRef >= 0 ||
            (right.cursorOffset != 0 && right.cursor < 0) ||
            // - The following two checks were used to ensure that the
            // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
        // Check integrity of segments and segment references.  Each
        // segment's start must have a corresponding limit, and the
        // references must not refer to segments that do not exist.
-        if (left.segments != null) {
-            if (!left.segments.validate()) {
-                syntaxError("Missing segment close", rule, start);
-            }
-            int n = left.segments.count();
-            if (right.maxRef > n) {
-                syntaxError("Undefined segment reference", rule, start);
-            }
+        if (right.maxRef > left.segments.size()) {
+            syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
        }

        data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
    char generateStandInFor(UnicodeMatcher matcher) {
        // assert(matcher != null);
        if (variableNext >= variableLimit) {
-            throw new RuntimeException("Private use variables exhausted");
+            throw new RuntimeException("Variable range exhausted");
        }
        variablesVector.addElement(matcher);
        return variableNext++;
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
- * $Date: 2001/10/26 22:59:26 $
- * $Revision: 1.57 $
+ * $Date: 2001/10/30 18:08:19 $
+ * $Revision: 1.58 $
 *
 *****************************************************************************************
 */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
               "c abc ababc",
               "d d abd");

+        // NOTE: The (ab)+ when referenced just yields a single "ab",
+        // not the full sequence of them.  This accords with perl behavior.
        expect("(ab)+ {x} > '(' $1 ')';",
               "x abx ababxy",
-               "x ab(ab) abab(abab)y");
+               "x ab(ab) abab(ab)y");

        expect("b+ > x;",
               "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
               "qa qab qaba qababc",
               "xa x xa xc");

-        // Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
-        // In perl, it only matches the first occurrence, so the output
-        // is "()a (ab) (ab)a (ab)c".
+        // NOTE: The (ab)* when referenced just yields a single "ab",
+        // not the full sequence of them.  This accords with perl behavior.
        expect("q(ab)* > '(' $1 ')';",
               "qa qab qaba qababc",
-               "()a (ab) (ab)a (abab)c");
+               "()a (ab) (ab)a (ab)c");

        // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
        // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
        expect(gr, "\u03B1\u0314", "ha");
    }

+    /**
+     * Test quantified segment behavior.  We want:
+     * ([abc])+ > x $1 x; applied to "cba" produces "xax"
+     */
+    public void TestQuantifiedSegment() {
+        // The normal case
+        expect("([abc]+) > x $1 x;", "cba", "xcbax");
+
+        // The tricky case; the quantifier is around the segment
+        expect("([abc])+ > x $1 x;", "cba", "xax");
+
+        // Tricky case in reverse direction
+        expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
+
+        // Check post-context segment
+        expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
+
+        // Test toRule/toPattern for non-quantified segment.
+        // Careful with spacing here.
+        String r = "([a-c]){q} > x $1 x;";
+        Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
+        String rr = t.toRules(true);
+        if (!r.equals(rr)) {
+            errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        } else {
+            logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        }
+
+        // Test toRule/toPattern for quantified segment.
+        // Careful with spacing here.
+        r = "([a-c])+{q} > x $1 x;";
+        t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
+        rr = t.toRules(true);
+        if (!r.equals(rr)) {
+            errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        } else {
+            logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
+        }
+    }
+
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
--- a/icu4j/src/com/ibm/text/StringMatcher.java
+++ b/icu4j/src/com/ibm/text/StringMatcher.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $ 
- * $Date: 2001/10/25 22:32:02 $ 
- * $Revision: 1.2 $
+ * $Date: 2001/10/30 18:04:08 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {

    private boolean isSegment;

+    private int matchStart;
+
+    private int matchLimit;
+
    private final RuleBasedTransliterator.Data data;

+    public StringMatcher(String theString,
+                         boolean isSeg,
+                         RuleBasedTransliterator.Data theData) {
+        data = theData;
+        isSegment = isSeg;
+        pattern = theString;
+        matchStart = matchLimit = -1;
+    }
+
    public StringMatcher(String theString,
                         int start,
                         int limit,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
-        data = theData;
-        isSegment = isSeg;
-        pattern = theString.substring(start, limit);
+        this(theString.substring(start, limit), isSeg, theData);
    }

    /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
        int i;
        int[] cursor = new int[] { offset[0] };
        if (limit < cursor[0]) {
+            // Match in the reverse direction
            for (i=pattern.length()-1; i>=0; --i) {
                char keyChar = pattern.charAt(i);
                UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
+            // Record the match position, converted to a normal forward
+            // start, limit pair, but only if no prior match has been
+            // recorded -- we want to keep the rightmost match.
+            if (matchStart < 0) {
+                matchStart = cursor[0]+1;
+                matchLimit = offset[0]+1;
+            }
        } else {
            for (i=0; i<pattern.length(); ++i) {
                if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
+            // Record the match position
+            matchStart = offset[0];
+            matchLimit = cursor[0];
        }

        offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
            result.append(')');
        }
        // Flush quoteBuf out to result
-        TransliterationRule.appendToRule(result, (isSegment?')':-1),
+        TransliterationRule.appendToRule(result, -1,
                                         true, escapeUnprintable, quoteBuf);
        return result.toString();
    }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
        UnicodeMatcher m = data.lookup(c);
        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
    }
+
+    /**
+     * Remove any match data.  This must be called before performing a
+     * set of matches with this segment.
+     */
+    public void resetMatch() {
+        matchStart = matchLimit = -1;
+    }
+
+    /**
+     * Return the start offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    public int getMatchStart() {
+        return matchStart;
+    }
+
+    /**
+     * Return the limit offset, in the match text, of the <em>rightmost</em>
+     * match.  This method may get moved up into the UnicodeMatcher if
+     * it turns out to be useful to generalize this.
+     */
+    public int getMatchLimit() {
+        return matchLimit;
+    }
 }

 //eof
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
- * $Date: 2001/10/25 23:22:15 $
- * $Revision: 1.33 $
+ * $Date: 2001/10/30 18:04:08 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
- * <p>A rule may contain segments in its input string and segment references in
- * its output string.  A segment is a substring of the input pattern, indicated
- * by an offset and limit.  The segment may span the preceding or following
- * context.  A segment reference is a special character in the output string
- * that causes a segment of the input string (not the input pattern) to be
- * copied to the output string.  The range of special characters that represent
- * segment references is defined by RuleBasedTransliterator.Data.
+ * <p>A rule may contain segments in its input string and segment
+ * references in its output string.  A segment is a substring of the
+ * input pattern, indicated by an offset and limit.  The segment may
+ * be in the preceding or following context.  It may not span a
+ * context boundary.  A segment reference is a special character in
+ * the output string that causes a segment of the input string (not
+ * the input pattern) to be copied to the output string.  The range of
+ * special characters that represent segment references is defined by
+ * RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
 */
 class TransliterationRule {

@ -64,20 +66,13 @@ class TransliterationRule {
    private String output;

    /**
-     * An array of integers encoding the position of the segments.
-     * See RuleBasedTransliterator.Segments for more details.
+     * An array of matcher objects corresponding to the input pattern
+     * segments.  If there are no segments this is null.  N.B. This is
+     * a UnicodeMatcher for generality, but in practice it is always a
+     * StringMatcher.  In the future we may generalize this, but for
+     * now we sometimes cast down to StringMatcher.
     */
-    int[] segments;
-
-    /**
-     * A value we compute from segments.  The first index into segments[]
-     * that is >= anteContextLength.  That is, the first one that is within
-     * the forward scanned part of the pattern -- the key or the postContext.
-     * If there are no segments, this has the value -1.  This index is relative
-     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
-     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
-     */
-    int firstKeySeg;
+    UnicodeMatcher[] segments;

    /**
     * The length of the string that must match before the key.  If
@ -127,20 +122,6 @@ class TransliterationRule {
    private static final char APOSTROPHE = '\'';
    private static final char BACKSLASH  = '\\';

-    // Macros for accessing the array of integers encoding the position of
-    // the segments.  See RuleBasedTransliterator.Segments for more details.
-    // SEGMENTS_COUNT number of segments, n (half the number of parens)
-    // SEGMENTS_LEN   length of the segments array (number of elements)
-    // SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
-    // SEGMENTS_NUM   index into segments to access POS of $1.open,
-    //                $1.close, $2.open, $2.close,.., $n.open, $n.close
-    //                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
-    static final int FIRST_SEG_POS_INDEX = 2;
-    static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
-    static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
-    static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
-    static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
-
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999-2001. All rights reserved.";

@ -165,12 +146,8 @@ class TransliterationRule {
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param segs array of 2n integers.  Each of n pairs consists of offset,
-     * limit for a segment of the input string.  Characters in the output string
-     * refer to these segments if they are in a special range determined by the
-     * associated RuleBasedTransliterator.Data object.  May be null if there are
-     * no segments.  The caller is responsible for validating that segments
-     * are well-formed.
+     * @param segs array of UnicodeMatcher corresponding to input pattern
+     * segments, or null if there are none
     * @param anchorStart true if the the rule is anchored on the left to
     * the context start
     * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos, int cursorOffset,
-                               int[] segs,
+                               UnicodeMatcher[] segs,
                               boolean anchorStart, boolean anchorEnd,
                               RuleBasedTransliterator.Data theData) {
        data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
        this.cursorPos = cursorPos + cursorOffset;
        this.output = output;
        // We don't validate the segments array.  The caller must
-        // guarantee that the segments are well-formed.
+        // guarantee that the segments are well-formed (that is, that
+        // all $n references in the output refer to indices of this
+        // array, and that no array elements are null).
        this.segments = segs;

-        // Find the position of the first segment index that is after the
-        // anteContext (in the key).  Note that this may be a start or a
-        // limit index.  If all segments are in the ante context,
-        // firstKeySeg should point past the last segment -- that is, it
-        // should point at the end marker, which is -1.  This allows the
-        // code to back up by one to obtain the last ante context segment.
-        firstKeySeg = -1;
-        if (segments != null) {
-            firstKeySeg = FIRST_SEG_POS_INDEX;
-            while (segments[firstKeySeg] >= 0 &&
-                   segments[firstKeySeg] < anteContextLength) {
-                ++firstKeySeg;
-            }
-            firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
-        }
-
        pattern = input;
        flags = 0;
        if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {

        // ============================ MATCH ===========================

-        // Record the actual positions, in the text, of the segments.
-        // These are recorded in the order that they occur in the pattern.
-
-        // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
-        // records the position in 'text' of each segment boundary, in
-        // the order that they occur in 'pattern'.
-        int[] segPos = null;
+        // Reset segment match data
        if (segments != null) {
-            segPos = new int[2*SEGMENTS_COUNT(segments)];
+            for (int i=0; i<segments.length; ++i) {
+                ((StringMatcher) segments[i]).resetMatch();
+            }
        }
-        // iSeg is an index into segments[] that accesses the first
-        // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
-        // When indexing into segments[] FIRST_SEG_POS_INDEX must be
-        // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
-        int iSeg = firstKeySeg - 1;
-        // nextSegPos is an offset in 'pattern'.  When the cursor is
-        // equal to nextSegPos, we are at a segment boundary, and we
-        // record the position in the real text in segPos[].
-        int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;

        int lenDelta, keyLimit;
        int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
                }
                oText = intRef[0];
            }
-            while (nextSegPos == oPattern) {
-                segPos[iSeg] = oText;
-                if (oText >= 0) {
-                    segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
-                } else {
-                    ++segPos[iSeg];
-                }
-                nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
-            }
        }

        minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {

        // -------------------- Key and Post Context --------------------

-        iSeg = firstKeySeg;
-        nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
-
        oPattern = 0;
        oText = pos.start;
        keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
            // depending on whether we're in the key or in the post
            // context.

-            while (oPattern == nextSegPos) {
-                segPos[iSeg] = oText;
-                nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
-            }
            if (oPattern == keyLength) {
                keyLimit = oText;
            }
@ -554,10 +488,6 @@ class TransliterationRule {
            //!    return UnicodeMatcher.U_MISMATCH;
            //!}
        }
-        while (oPattern == nextSegPos) {
-            segPos[iSeg] = oText;
-            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
-        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -576,8 +506,7 @@ class TransliterationRule {
        // =========================== REPLACE ==========================

        // We have a full match.  The key is between pos.start and
-        // keyLimit.  Segment indices have been recorded in segPos[].
-        // Perform a replacement.
+        // keyLimit.

        if (segments == null) {
            text.replace(pos.start, keyLimit, output);
@ -629,11 +558,22 @@ class TransliterationRule {
                        buf.setLength(0);
                    }
                    // Copy segment with out-of-band data
-                    b *= 2;
-                    int start = segPos[SEGMENTS_NUM(segments,b)];
-                    int limit = segPos[SEGMENTS_NUM(segments,b+1)];
-                    text.copy(start, limit, dest);
-                    dest += limit - start;
+                    StringMatcher m = (StringMatcher) segments[b];
+                    int start = m.getMatchStart();
+                    int limit = m.getMatchLimit();
+                    // If there was no match, that means that a quantifier
+                    // matched zero-length.  E.g., x (a)* y matched "xy".
+                    if (start >= 0) {
+                        // Adjust indices for segments in post context
+                        // for any inserted text between the key and
+                        // the post context.
+                        if (start >= keyLimit) {
+                            start += dest - keyLimit;
+                            limit += dest - keyLimit;
+                        }
+                        text.copy(start, limit, dest);
+                        dest += limit - start;
+                    }
                }
                oOutput += UTF16.getCharCount(c);
            }
@ -790,20 +730,6 @@ class TransliterationRule {

        StringBuffer rule = new StringBuffer();

-        // iseg indexes into segments[] directly (not offset from FSPI)
-        int iseg = FIRST_SEG_POS_INDEX-1;
-        int nextSeg = -1;
-        // Build an array of booleans specifying open vs. close paren
-        boolean[] isOpen = null;
-        if (segments != null) {
-            isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
-            for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
-                isOpen[SEGMENTS_NUM(segments,i)  ] = true;
-                isOpen[SEGMENTS_NUM(segments,i+1)] = false;
-            }
-            nextSeg = segments[++iseg];
-        }
-
        // Accumulate special characters (and non-specials following them)
        // into quoteBuf.  Append quoteBuf, within single quotes, when
        // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
                appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
            }

-            // Append either '(' or ')' if we are at a segment index
-            if (i == nextSeg) {
-                appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
-                                 '(' : ')',
-                                 true, escapeUnprintable, quoteBuf);
-                nextSeg = segments[++iseg];
-            }
-
            if (emitBraces && i == (anteContextLength + keyLength)) {
                appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
            }
@ -847,11 +765,6 @@ class TransliterationRule {
            }
        }

-        if (i == nextSeg) {
-            // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
-            appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
-        }
-
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
        }
@ -885,7 +798,7 @@ class TransliterationRule {
            } else {
                ++seg; // make 1-based
                appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
-                rule.append(0x24 /*$*/);
+                rule.append('$');
                boolean show = false; // true if we should display digits
                for (int p=9; p>=0; --p) {
                    int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {

 /**
 * $Log: TransliterationRule.java,v $
+ * Revision 1.34  2001/10/30 18:04:08  alan
+ * jitterbug 1406: make quantified segments behave like perl counterparts
+ *
 * Revision 1.33  2001/10/25 23:22:15  alan
 * jitterbug 73: changes to support zero-length matchers at end of key
 *
--- a/icu4j/src/com/ibm/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/text/TransliteratorParser.java
@ -4,8 +4,8 @@
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
-* $Date: 2001/10/24 00:03:38 $
-* $Revision: 1.7 $
+* $Date: 2001/10/30 18:04:09 $
+* $Revision: 1.8 $
 **********************************************************************
 */
 package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op

    private static final String OPERATORS = "=><";
+    private static final String HALF_ENDERS = "=><;";

    // Other special characters
    private static final char QUOTE               = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
    // private static final char ANCHOR_END       = '$';

    // Segments of the input string are delimited by "(" and ")".  In the
-    // output string these segments are referenced as "$1" through "$9".
+    // output string these segments are referenced as "$1", "$2", etc.
    private static final char SEGMENT_OPEN        = '(';
    private static final char SEGMENT_CLOSE       = ')';

@ -285,209 +286,6 @@ class TransliteratorParser {
        }
    };

-    //----------------------------------------------------------------------
-    // class Segments
-    //----------------------------------------------------------------------
-
-    /**
-     * Segments are parentheses-enclosed regions of the input string.
-     * These are referenced in the output string using the notation $1,
-     * $2, etc.  Numbering is in order of appearance of the left
-     * parenthesis.  Number is one-based.  Segments are defined as start,
-     * limit pairs.  Segments may nest.
-     *
-     * During parsing, segment data is encoded in an object of class
-     * Segments.  At runtime, the same data is encoded in compact form as
-     * an array of integers in a TransliterationRule.  The runtime encoding
-     * must satisfy three goals:
-     *
-     * 1. Iterate over the offsets in a pattern, from left to right,
-     *    and indicate all segment boundaries, in order.  This is done
-     *    during matching.
-     *
-     * 2. Given a reference $n, produce the start and limit offsets
-     *    for that segment.  This is done during replacement.
-     *
-     * 3. Similar to goal 1, but in addition, indicate whether each
-     *    segment boundary is a start or a limit, in other words, whether
-     *    each is an open paren or a close paren.  This is required by
-     *    the toRule() method.
-     *
-     * Goal 1 must be satisfied at high speed since this is done during
-     * matching.  Goal 2 is next most important.  Goal 3 is not performance
-     * critical since it is only needed by toRule().
-     *
-     * The array of integers is actually two arrays concatenated.  The
-     * first gives the index values of the open and close parentheses in
-     * the order they appear.  The second maps segment numbers to the
-     * indices of the first array.  The two arrays have the same length.
-     * Iterating over the first array satisfies goal 1.  Indexing into the
-     * second array satisfies goal 2.  Goal 3 is satisfied by iterating
-     * over the second array and constructing the required data when
-     * needed.  This is what toRule() does.
-     *
-     * Example:  (a b(c d)e f)
-     *            0 1 2 3 4 5 6
-     *
-     * First array: Indices are 0, 2, 4, and 6.
-
-     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
-     * second array is 0, 3, 1 2 -- these give the indices in the
-     * first array at which $1:open, $1:close, $2:open, and $2:close
-     * occur.
-     *
-     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
-     *
-     * Each subarray is terminated with a -1, and two leading entries
-     * give the number of segments and the offset to the first entry
-     * of the second array.  In addition, the second array value are
-     * all offset by 2 so they index directly into the final array.
-     * The total array size is 4*segments[0] + 4.  The second index is
-     * 2*segments[0] + 3.
-     *
-     * In the output string, a segment reference is indicated by a
-     * character in a special range, as defined by
-     * RuleBasedTransliterator.Data.
-     *
-     * Most rules have no segments, in which case segments is null, and the
-     * output string need not be checked for segment reference characters.
-     *
-     * See also rbt_rule.h/cpp.
-     */
-    private static class Segments {
-
-        private Vector offsets; // holds Integer objects
-
-        private Vector isOpenParen; // holds Boolean objects
-
-        private int offset(int i) {
-            return ((Integer) offsets.elementAt(i)).intValue();
-        }
-
-        private boolean isOpen(int i) {
-            return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
-        }
-
-        // size of the Vectors
-        private int size() {
-            // assert(offset.size() == isOpenParen.size());
-            return offsets.size();
-        }
-
-        public Segments() {
-            offsets = new Vector();
-            isOpenParen = new Vector();
-        }
-
-        public void addParenthesisAt(int offset, boolean isOpen) {
-            offsets.addElement(new Integer(offset));
-            isOpenParen.addElement(new Boolean(isOpen));
-        }
-
-        public int getLastParenOffset(boolean[] isOpenParen) {
-            if (size() == 0) {
-                return -1;
-            }
-            isOpenParen[0] = isOpen(size()-1);
-            return offset(size()-1);
-        }
-
-        // Remove the last (rightmost) segment.  Store its offsets in start
-        // and limit, and then convert all offsets at or after start to be
-        // equal to start.  Upon failure, return FALSE.  Assume that the
-        // caller has already called getLastParenOffset() and validated that
-        // there is at least one parenthesis and that the last one is a close
-        // paren.
-        public boolean extractLastParenSubstring(int[] start, int[] limit) {
-            // assert(offsets.size() > 0);
-            // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
-            int i = size() - 1;
-            int n = 1; // count of close parens we need to match
-            // Record position of the last close paren
-            limit[0] = offset(i);
-            --i; // back up to the one before the last one
-            while (i >= 0 && n != 0) {
-                n += isOpen(i) ? -1 : 1;
-            }
-            if (n != 0) {
-                return false;
-            }
-            // assert(i>=0);
-            start[0] = offset(i);
-            // Reset all segment pairs from i to size() - 1 to [start, start+1).
-            while (i<size()) {
-                int o = isOpen(i) ? start[0] : (start[0]+1);
-                offsets.setElementAt(new Integer(o), i);
-                ++i;
-            }
-            return true;
-        }
-
-        // Assume caller has already gotten a TRUE validate().
-        public int[] createArray() {
-            int c = count(); // number of segments
-            int arrayLen = 4*c + 4;
-            int[] array = new int[arrayLen];
-            int a2offset = 2*c + 3; // offset to array 2
-
-            array[0] = c;
-            array[1] = a2offset;
-            int i;
-            for (i=0; i<2*c; ++i) {
-                array[2+i] = offset(i);
-            }
-            array[a2offset-1] = -1;
-            array[arrayLen-1] = -1;
-            // Now walk through and match up segment numbers with parentheses.
-            // Number segments from 0.  We're going to offset all entries by 2
-            // to skip the first two elements, array[0] and array[1].
-            Stack stack = new Stack();
-            int nextOpen = 0; // seg # of next open, 0-based
-            for (i=0; i<2*c; ++i) {
-                boolean open = isOpen(i);
-                // Let seg be the zero-based segment number.
-                // Open parens are at 2*seg in array 2.
-                // Close parens are at 2*seg+1 in array 2.
-                if (open) {
-                    array[a2offset + 2*nextOpen] = 2+i;
-                    stack.push(new Integer(nextOpen));
-                    ++nextOpen;
-                } else {
-                    int nextClose = ((Integer) stack.pop()).intValue();
-                    array[a2offset + 2*nextClose+1] = 2+i;
-                }
-            }
-            // assert(stack.empty());
-
-            return array;
-        }
-
-        public boolean validate() {
-            // want number of parens >= 2
-            // want number of parens to be even
-            // want first paren '('
-            // want parens to match up in the end
-            if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
-                return false;
-            }
-            int n = 0;
-            for (int i=0; i<size(); ++i) {
-                n += isOpen(i) ? 1 : -1;
-                if (n < 0) {
-                    return false;
-                }
-            }
-            return n == 0;
-        }
-
-        // Number of segments
-        // Assume caller has already gotten a TRUE validate().
-        public int count() {
-            // assert(validate());
-            return size() / 2;
-        }
-    }
-
    //----------------------------------------------------------------------
    // class RuleHalf
    //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
        public int ante = -1;   // position of ante context marker '{' in text
        public int post = -1;   // position of post context marker '}' in text

-        // Record the position of the segment substrings and references.  A
-        // given side should have segments or segment references, but not
-        // both.
-        public Segments segments = null;
-        public int maxRef = -1; // index of largest ref (1..9)
+        public int maxRef = -1; // n where maximum segment ref is $n; 1-based

        // Record the offset to the cursor either to the left or to the
        // right of the key.  This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
        // output text.
        public int cursorOffset = 0; // only nonzero on output side

+        // Position of first CURSOR_OFFSET on _right_.  This will be -1
+        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
+        private int cursorOffsetPos = 0;
+
        public boolean anchorStart = false;
        public boolean anchorEnd   = false;

+        /**
+         * UnicodeMatcher objects corresponding to each segment.
+         */
+        public Vector segments = new Vector();
+        
+        /**
+         * The segment number from 0..n-1 of the next '(' we see
+         * during parsing; 0-based.
+         */
+        private int nextSegmentNumber = 0;
+
        /**
         * Parse one side of a rule, stopping at either the limit,
-         * the END_OF_RULE character, or an operator.  Return
-         * the pos of the terminating character (or limit).
+         * the END_OF_RULE character, or an operator.
+         * @return the index after the terminating character, or
+         * if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit,
                         TransliteratorParser parser) {
            int start = pos;
            StringBuffer buf = new StringBuffer();
+            pos = parseSection(rule, pos, limit, parser, buf, false);
+            text = buf.toString();
+
+            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
+                syntaxError("Misplaced " + CURSOR_POS, rule, start);
+            }
+
+            return pos;
+        }
+
+        /**
+         * Parse a section of one side of a rule, stopping at either
+         * the limit, the END_OF_RULE character, an operator, or a
+         * segment close character.  This method parses both a
+         * top-level rule half and a segment within such a rule half.
+         * It calls itself recursively to parse segments and nested
+         * segments.
+         * @param buf buffer into which to accumulate the rule pattern
+         * characters, either literal characters from the rule or
+         * standins for UnicodeMatcher objects including segments.
+         * @param isSegment if true, then we've already seen a '(' and
+         * pos on entry points right after it.  Accumulate everything
+         * up to the closing ')', put it in a segment matcher object,
+         * generate a standin for it, and add the standin to buf.  As
+         * a side effect, update the segments vector with a reference
+         * to the segment matcher.  This works recursively for nested
+         * segments.  If isSegment is false, just accumulate
+         * characters into buf.
+         * @return the index after the terminating character, or
+         * if limit was reached, limit
+         */
+        private int parseSection(String rule, int pos, int limit,
+                                 TransliteratorParser parser,
+                                 StringBuffer buf,
+                                 boolean isSegment) {
+            int start = pos;
            ParsePosition pp = null;
-            int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
-            boolean done = false;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];

+            // If isSegment, then bufSegStart is the offset in buf to
+            // the first character of the segment we are parsing.
+            int bufSegStart = 0;
+            int segmentNumber = 0;
+            if (isSegment) {
+                bufSegStart = buf.length();
+                segmentNumber = nextSegmentNumber++;
+            }
+
        main:
-            while (pos < limit && !done) {
+            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (Character.isWhitespace(c)) {
                    // Ignore whitespace.  Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
                    // whitespace likely to be seen in code.
                    continue;
                }
-                if (OPERATORS.indexOf(c) >= 0) {
-                    --pos; // Backup to point to operator
+                // HALF_ENDERS is all chars that end a rule half: "<>=;"
+                if (HALF_ENDERS.indexOf(c) >= 0) {
+                    if (isSegment) {
+                        syntaxError("Unclosed segment", rule, start);
+                    }
                    break main;
                }
                if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
                    }
                    continue;
                }
+
                switch (c) {
+                    
+                //------------------------------------------------------
+                // Elements allowed within and out of segments
+                //------------------------------------------------------
                case ANCHOR_START:
                    if (buf.length() == 0 && !anchorStart) {
                        anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
                    }
                    break;
                case SEGMENT_OPEN:
-                case SEGMENT_CLOSE:
-                    // Handle segment definitions "(" and ")"
-                    // Parse "(", ")"
-                    if (segments == null) {
-                        segments = new Segments();
-                    }
-                    segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
+                    pos = parseSection(rule, pos, limit, parser, buf, true);
                    break;
-                case END_OF_RULE:
-                    --pos; // Backup to point to END_OF_RULE
-                    break main;
                case SymbolTable.SYMBOL_REF:
                    // Handle variable references and segment references "$1" .. "$9"
                    {
@ -676,7 +528,7 @@ class TransliteratorParser {
                            }
                            pp.setIndex(pos);
                            String name = parser.parseData.
-                                            parseReference(rule, pp, limit);
+                                parseReference(rule, pp, limit);
                            if (name == null) {
                                // This means the '$' was not followed by a
                                // valid name.  Try to interpret it as an
@ -697,25 +549,129 @@ class TransliteratorParser {
                        }
                    }
                    break;
+                case DOT:
+                    buf.append(parser.getDotStandIn());
+                    break;
+                case KLEENE_STAR:
+                case ONE_OR_MORE:
+                case ZERO_OR_ONE:
+                    // Quantifiers.  We handle single characters, quoted strings,
+                    // variable references, and segments.
+                    //  a+      matches  aaa
+                    //  'foo'+  matches  foofoofoo
+                    //  $v+     matches  xyxyxy if $v == xy
+                    //  (seg)+  matches  segsegseg
+                    {
+                        if (buf.length() == bufSegStart) {
+                            // Nothing to quantify: the quantifier directly follows '(' or begins the rule half (bufSegStart is 0 at top level)
+                            syntaxError("Misplaced quantifier", rule, start);
+                            break;
+                        }
+ 
+                        int qstart, qlimit;
+                        // The */+ follows an isolated character or quote
+                        // or variable reference
+                        if (buf.length() == quoteLimit) {
+                            // The */+ follows a 'quoted string'
+                            qstart = quoteStart;
+                            qlimit = quoteLimit;
+                        } else if (buf.length() == varLimit) {
+                            // The */+ follows a $variableReference
+                            qstart = varStart;
+                            qlimit = varLimit;
+                        } else {
+                            // The */+ follows a single character, possibly
+                            // a segment standin
+                            qstart = buf.length() - 1;
+                            qlimit = qstart + 1;
+                        }
+
+                        UnicodeMatcher m =
+                            new StringMatcher(buf.toString(), qstart, qlimit,
+                                              false, parser.data);
+                        int min = 0;
+                        int max = Quantifier.MAX;
+                        switch (c) {
+                        case ONE_OR_MORE:
+                            min = 1;
+                            break;
+                        case ZERO_OR_ONE:
+                            min = 0;
+                            max = 1;
+                            break;
+                            // case KLEENE_STAR:
+                            //    do nothing -- min, max already set
+                        }
+                        m = new Quantifier(m, min, max);
+                        buf.setLength(qstart);
+                        buf.append(parser.generateStandInFor(m));
+                    }
+                    break;
+
+                //------------------------------------------------------
+                // Elements allowed ONLY WITHIN segments
+                //------------------------------------------------------
+                case SEGMENT_CLOSE:
+                    if (isSegment) {
+                        // We're done parsing a segment.  The relevant
+                        // characters are in buf, starting at offset
+                        // bufSegStart.  Extract them into a string
+                        // matcher, and replace them with a standin
+                        // for that matcher.
+                        StringMatcher m =
+                            new StringMatcher(buf.substring(bufSegStart),
+                                              true, parser.data);
+                        // Since we call parseSection() recursively,
+                        // nested segments will result in segment i+1
+                        // getting parsed and stored before segment i;
+                        // be careful with the vector handling here.
+                        if ((segmentNumber+1) > segments.size()) {
+                            segments.setSize(segmentNumber+1);
+                        }
+                        segments.setElementAt(m, segmentNumber);
+                        buf.setLength(bufSegStart);
+                        buf.append(parser.generateStandInFor(m));
+                        break main;
+                    }
+                    // If we aren't in a segment, then a segment close
+                    // character is a syntax error.
+                    syntaxError("Unquoted special", rule, start);
+                    break;
+
+                //------------------------------------------------------
+                // Elements allowed ONLY OUTSIDE segments
+                //------------------------------------------------------
                case CONTEXT_ANTE:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (ante >= 0) {
                        syntaxError("Multiple ante contexts", rule, start);
                    }
                    ante = buf.length();
                    break;
                case CONTEXT_POST:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (post >= 0) {
                        syntaxError("Multiple post contexts", rule, start);
                    }
                    post = buf.length();
                    break;
                case CURSOR_POS:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
                    }
                    cursor = buf.length();
                    break;
                case CURSOR_OFFSET:
+                    if (isSegment) {
+                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
+                    }
                    if (cursorOffset < 0) {
                        if (buf.length() > 0) {
                            syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
                        }
                    }
                    break;
-                case DOT:
-                    buf.append(parser.getDotStandIn());
-                    break;
-                case KLEENE_STAR:
-                case ONE_OR_MORE:
-                case ZERO_OR_ONE:
-                    // Quantifiers.  We handle single characters, quoted strings,
-                    // variable references, and segments.
-                    //  a+      matches  aaa
-                    //  'foo'+  matches  foofoofoo
-                    //  $v+     matches  xyxyxy if $v == xy
-                    //  (seg)+  matches  segsegseg
-                    {
-                        int qstart, qlimit;
-                        boolean[] isOpenParen = new boolean[1];
-                        boolean isSegment = false;
-                        if (segments != null &&
-                            segments.getLastParenOffset(isOpenParen) == buf.length()) {
-                            // The */+ immediately follows a segment
-                            if (isOpenParen[0]) {
-                                syntaxError("Misplaced quantifier", rule, start);
-                            }
-                            int[] startparam = new int[1];
-                            int[] limitparam = new int[1];
-                            if (!segments.extractLastParenSubstring(startparam, limitparam)) {
-                                syntaxError("Mismatched segment delimiters", rule, start);
-                            }
-                            qstart = startparam[0];
-                            qlimit = limitparam[0];
-                            isSegment = true;
-                        } else {
-                            // The */+ follows an isolated character or quote
-                            // or variable reference
-                            if (buf.length() == quoteLimit) {
-                                // The */+ follows a 'quoted string'
-                                qstart = quoteStart;
-                                qlimit = quoteLimit;
-                            } else if (buf.length() == varLimit) {
-                                // The */+ follows a $variableReference
-                                qstart = varStart;
-                                qlimit = varLimit;
-                            } else {
-                                // The */+ follows a single character
-                                qstart = buf.length() - 1;
-                                qlimit = qstart + 1;
-                            }
-                        }
-                        UnicodeMatcher m =
-                            new StringMatcher(buf.toString(), qstart, qlimit,
-                                              isSegment, parser.data);
-                        int min = 0;
-                        int max = Quantifier.MAX;
-                        switch (c) {
-                        case ONE_OR_MORE:
-                            min = 1;
-                            break;
-                        case ZERO_OR_ONE:
-                            min = 0;
-                            max = 1;
-                            break;
-                            // case KLEENE_STAR:
-                            //    do nothing -- min, max already set
-                        }
-                        m = new Quantifier(m, min, max);
-                        buf.setLength(qstart);
-                        buf.append(parser.generateStandInFor(m));
-                    }
-                    break;
+
+                //------------------------------------------------------
+                // Non-special characters
+                //------------------------------------------------------
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
                    break;
                }
            }
-
-            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
-                syntaxError("Misplaced " + CURSOR_POS, rule, start);
-            }
-            text = buf.toString();
            return pos;
        }

@ -838,10 +725,12 @@ class TransliteratorParser {
        }

        /**
-         * Create and return an int[] array of segments.
+         * Create and return a UnicodeMatcher[] array of segments,
+         * or null if there are no segments.
         */
-        int[] createSegments() {
-            return (segments == null) ? null : segments.createArray();
+        UnicodeMatcher[] createSegments() {
+            return (segments.size() == 0) ? null :
+                (UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
        }
    }

@ -1096,9 +985,10 @@ class TransliteratorParser {
        pos = left.parse(rule, pos, limit, this);

        if (pos == limit ||
-            OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
-            syntaxError("No operator", rule, start);
+            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
+            syntaxError("No operator pos=" + pos, rule, start);
        }
+        ++pos;

        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
        pos = right.parse(rule, pos, limit, this);

        if (pos < limit) {
-            if (rule.charAt(pos) == END_OF_RULE) {
+            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
-            right.segments = null;
+            right.segments.removeAllElements();
            left.cursor = left.maxRef = -1;
            left.cursorOffset = 0;
        }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
-            right.segments != null || left.maxRef >= 0 ||
+            right.segments.size() > 0 || left.maxRef >= 0 ||
            (right.cursorOffset != 0 && right.cursor < 0) ||
            // - The following two checks were used to ensure that the
            // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
        // Check integrity of segments and segment references.  Each
        // segment's start must have a corresponding limit, and the
        // references must not refer to segments that do not exist.
-        if (left.segments != null) {
-            if (!left.segments.validate()) {
-                syntaxError("Missing segment close", rule, start);
-            }
-            int n = left.segments.count();
-            if (right.maxRef > n) {
-                syntaxError("Undefined segment reference", rule, start);
-            }
+        if (right.maxRef > left.segments.size()) {
+            syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
        }

        data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
    char generateStandInFor(UnicodeMatcher matcher) {
        // assert(matcher != null);
        if (variableNext >= variableLimit) {
-            throw new RuntimeException("Private use variables exhausted");
+            throw new RuntimeException("Variable range exhausted");
        }
        variablesVector.addElement(matcher);
        return variableNext++;