ICU-1406 make quantified segments behave like perl counterparts

X-SVN-Rev: 6493
2001-10-30 18:08:53 +00:00 · 2001-10-30 18:08:53 +00:00 · 2c2b11dfe8
commit 2c2b11dfe8
parent 0d08aaadcc
13 changed files with 1073 additions and 1463 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -63,6 +63,10 @@ static const UChar gOPERATORS[] = {
    0x3D, 0x3E, 0x3C, 0     // "=><"
 };
 static const UChar HALF_ENDERS[] = {
    0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
 };
 // These are also used in Transliterator::toRules()
 static const int32_t ID_TOKEN_LEN = 2;
 static const UChar   ID_TOKEN[]   = { 0x3A, 0x3A }; // ':', ':'
@ -147,256 +151,6 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
    return result;
 }
 //----------------------------------------------------------------------
 // Segments
 //----------------------------------------------------------------------
 /**
 * Segments are parentheses-enclosed regions of the input string.
 * These are referenced in the output string using the notation $1,
 * $2, etc.  Numbering is in order of appearance of the left
 * parenthesis.  Number is one-based.  Segments are defined as start,
 * limit pairs.  Segments may nest.
 *
 * During parsing, segment data is encoded in an object of class
 * Segments.  At runtime, the same data is encoded in compact form as
 * an array of integers in a TransliterationRule.  The runtime encoding
 * must satisfy three goals:
 *
 * 1. Iterate over the offsets in a pattern, from left to right,
 *    and indicate all segment boundaries, in order.  This is done
 *    during matching.
 *
 * 2. Given a reference $n, produce the start and limit offsets
 *    for that segment.  This is done during replacement.
 *
 * 3. Similar to goal 1, but in addition, indicate whether each
 *    segment boundary is a start or a limit, in other words, whether
 *    each is an open paren or a close paren.  This is required by
 *    the toRule() method.
 *
 * Goal 1 must be satisfied at high speed since this is done during
 * matching.  Goal 2 is next most important.  Goal 3 is not performance
 * critical since it is only needed by toRule().
 *
 * The array of integers is actually two arrays concatenated.  The
 * first gives the index values of the open and close parentheses in
 * the order they appear.  The second maps segment numbers to the
 * indices of the first array.  The two arrays have the same length.
 * Iterating over the first array satisfies goal 1.  Indexing into the
 * second array satisfies goal 2.  Goal 3 is satisfied by iterating
 * over the second array and constructing the required data when
 * needed.  This is what toRule() does.
 *
 * Example:  (a b(c d)e f)
 *            0 1 2 3 4 5 6
 *
 * First array: Indices are 0, 2, 4, and 6.
 * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
 * second array is 0, 3, 1, 2 -- these give the indices in the
 * first array at which $1:open, $1:close, $2:open, and $2:close
 * occur.
 *
 * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
 *
 * Each subarray is terminated with a -1, and two leading entries
 * give the number of segments and the offset to the first entry
 * of the second array.  In addition, the second array values are
 * all offset by 2 so they index directly into the final array.
 * The total array size is 4*segments[0] + 4.  The second index is
 * 2*segments[0] + 3.
 *
 * In the output string, a segment reference is indicated by a
 * character in a special range, as defined by
 * RuleBasedTransliterator.Data.
 *
 * Most rules have no segments, in which case segments is null, and the
 * output string need not be checked for segment reference characters.
 *
 * See also rbt_rule.h/cpp.
 */
 class Segments {
 private:
    // Parallel vectors with one entry per parenthesis, in the order the
    // parentheses were added.  offsets holds the offset passed to
    // addParenthesisAt(); isOpenParen holds 1 for an open paren and 0
    // for a close paren.
    UVector offsets;
    UVector isOpenParen;

    // Offset recorded for the i-th parenthesis.
    int32_t offset(int32_t i) const;

    // TRUE if the i-th parenthesis was recorded as an open paren.
    UBool isOpen(int32_t i) const;

    // Number of parentheses recorded (the size of the UVectors).
    int32_t size() const;

 public:
    Segments(UErrorCode &status);
    ~Segments();

    // Append one parenthesis at the given offset.
    void addParenthesisAt(int32_t offset, UBool isOpenParen, UErrorCode &status);

    // Return the offset of the most recently added parenthesis and
    // report its kind through isOpenParen; return -1 if there is none.
    int32_t getLastParenOffset(UBool& isOpenParen) const;

    // Locate the segment closed by the last parenthesis, report its
    // bounds through start and limit, and collapse the stored offsets of
    // its parentheses.  Returns FALSE if no matching open paren exists.
    UBool extractLastParenSubstring(int32_t& start, int32_t& limit);

    // Build the compact integer array encoding of the segments.
    // Call only after validate() has returned TRUE.
    int32_t* createArray(UErrorCode &status) const;

    // TRUE if the recorded parentheses are non-empty and balanced.
    UBool validate() const;

    // Number of segments, that is, half the number of parentheses.
    int32_t count() const;
 };
 // Return the offset stored for the i-th parenthesis.
 int32_t Segments::offset(int32_t i) const {
    const int32_t position = offsets.elementAti(i);
    return position;
 }
 // Return TRUE if the i-th parenthesis was stored with a nonzero
 // "open" flag, FALSE otherwise.
 UBool Segments::isOpen(int32_t i) const {
    const int32_t flag = isOpenParen.elementAti(i);
    return flag != 0;
 }
 // Return the number of parentheses recorded so far.  Only the offsets
 // vector is consulted; addParenthesisAt() appends to both vectors
 // together.
 int32_t Segments::size() const {
    // assert(offsets.size() == isOpenParen.size());
    const int32_t parenCount = offsets.size();
    return parenCount;
 }
 // Construct an empty Segments object.  Both member vectors are
 // constructed with the caller's status.
 Segments::Segments(UErrorCode &status) :
    offsets(status),
    isOpenParen(status) {
 }
 // No explicit cleanup; the member vectors are destroyed automatically.
 Segments::~Segments() {
 }
 // Record one parenthesis: its offset goes into offsets and a 1 (open)
 // or 0 (close) flag goes into isOpenParen, keeping the two vectors in
 // step.  status is passed through to both vector appends.
 void Segments::addParenthesisAt(int32_t offset, UBool isOpen, UErrorCode &status) {
    const int32_t openFlag = isOpen ? 1 : 0;
    offsets.addElement(offset, status);
    isOpenParen.addElement(openFlag, status);
 }
 // Return the offset of the most recently added parenthesis, or -1 if
 // no parenthesis has been added.
 //
 // isOpenParenReturn receives TRUE if that parenthesis is an open paren
 // and FALSE if it is a close paren.  When there is no parenthesis it is
 // set to FALSE so that the caller never reads an indeterminate value.
 int32_t Segments::getLastParenOffset(UBool& isOpenParenReturn) const {
    const int32_t parenCount = size();
    if (parenCount == 0) {
        isOpenParenReturn = FALSE;
        return -1;
    }
    const int32_t last = parenCount - 1;
    isOpenParenReturn = isOpen(last);
    return offset(last);
 }
 // Extract the last (rightmost) segment, that is, the segment closed by
 // the final parenthesis.  Store the offset of its open paren in start
 // and the offset of its close paren in limit, then rewrite the stored
 // offsets of every parenthesis from the matching open paren through
 // the end: open parens become start and close parens become start+1,
 // so each affected pair covers [start, start+1).
 //
 // Return FALSE if there are no parentheses or if no open paren matches
 // the final close paren.  In the latter case limit has already been
 // written; start is left untouched.  The caller is expected to have
 // called getLastParenOffset() and checked that the last parenthesis is
 // a close paren.
 UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
    // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
    const int32_t parenCount = size();
    if (parenCount == 0) {
        return FALSE;
    }

    int32_t i = parenCount - 1;
    // Record position of the last close paren
    limit = offset(i);

    // Walk backwards from the parenthesis before the last one, counting
    // close parens that still need a matching open paren.  Stop ON the
    // open paren that balances the final close paren.  The index must
    // move on every iteration; otherwise a nested close paren would
    // increment n forever.
    int32_t n = 1; // count of close parens we need to match
    for (--i; i >= 0; --i) {
        n += isOpen(i) ? -1 : 1;
        if (n == 0) {
            break;
        }
    }
    if (n != 0) {
        return FALSE;
    }

    // assert(i>=0);
    start = offset(i);

    // Reset all segment pairs from i to size() - 1 to [start, start+1).
    for (; i < parenCount; ++i) {
        int32_t o = isOpen(i) ? start : (start+1);
        offsets.setElementAt(o, i);
    }
    return TRUE;
 }
 // Assume caller has already gotten a TRUE validate().
 //
 // Build the integer array encoding of the recorded segments.  With c
 // segments the array has 4*c + 4 entries:
 //
 //   array[0]               c, the number of segments
 //   array[1]               a2offset = 2*c + 3, start of the second subarray
 //   array[2 .. 2*c+1]      offsets of the parentheses, in recorded order
 //   array[a2offset-1]      -1 terminator of the first subarray
 //   array[a2offset+2*s]    index in this array of segment s's open paren
 //   array[a2offset+2*s+1]  index in this array of segment s's close paren
 //   array[4*c+3]           -1 terminator of the second subarray
 //
 // Segments are numbered from 0 in order of their open parens.
 //
 // Returns a new[]-allocated array, or NULL on failure.  On allocation
 // failure status is set to U_MEMORY_ALLOCATION_ERROR.  If status
 // indicates failure at any point the array is freed before returning
 // NULL, so nothing is leaked.
 int32_t* Segments::createArray(UErrorCode &status) const {
    int32_t c = count(); // number of segments
    int32_t arrayLen = 4*c + 4;
    int32_t *array = new int32_t[arrayLen];
    int32_t a2offset = 2*c + 3; // offset to array 2
    if (array == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    array[0] = c;
    array[1] = a2offset;
    int32_t i;
    for (i=0; i<2*c; ++i) {
        array[2+i] = offset(i);
    }
    array[a2offset-1] = -1;
    array[arrayLen-1] = -1;
    // Now walk through and match up segment numbers with parentheses.
    // Number segments from 0.  We're going to offset all entries by 2
    // to skip the first two elements, array[0] and array[1].
    UStack stack(status);
    int32_t nextOpen = 0; // seg # of next open, 0-based
    if (U_FAILURE(status)) {
        // Free the array we own before bailing out.
        delete[] array;
        return NULL;
    }
    for (i=0; i<2*c; ++i) {
        UBool open = isOpen(i);
        // Let seg be the zero-based segment number.
        // Open parens are at 2*seg in array 2.
        // Close parens are at 2*seg+1 in array 2.
        if (open) {
            array[a2offset + 2*nextOpen] = 2+i;
            stack.push(nextOpen, status);
            ++nextOpen;
        } else {
            int32_t nextClose = stack.popi();
            array[a2offset + 2*nextClose+1] = 2+i;
        }
    }
    if (U_FAILURE(status)) {
        // A push failed, so the second subarray cannot be trusted.
        delete[] array;
        return NULL;
    }
    // assert(stack.empty());
    // Perform a series of checks on the array.  DO NOT COMPILE INTO
    // PRODUCTION CODE.  Use to debug array building problems.
    //
    //::if (!stack.empty()) {
    //::    __asm int 03;
    //::}
    //::// check the array
    //::if (array[0] < 1) {
    //::    __asm int 03;
    //::}
    //::if (array[1] < 5) {
    //::    __asm int 03;
    //::}
    //::for (i=2; i<2+array[0]*2; ++i) {
    //::    if (array[i] < 0) { // array[i] is an offset into the rule
    //::        __asm int 03;
    //::    }
    //::}
    //::if (array[2+array[0]*2] != -1) {
    //::    __asm int 03;
    //::}
    //::for (i=array[1]; i<array[1]+array[0]*2; ++i) {
    //::    if (array[i] < 2 || array[i] >= (2+2*array[0])) {
    //::        __asm int 03;
    //::    }
    //::}
    //::if (array[array[1]+array[0]*2] != -1) {
    //::    __asm int 03;
    //::}
    return array;
 }
 // Check that the recorded parentheses form a well-formed, non-empty
 // set of segments:
 //   - there are at least two parentheses,
 //   - their number is even,
 //   - the first one is an open paren,
 //   - scanning left to right, closes never outnumber opens, and the
 //     two counts are equal at the end.
 UBool Segments::validate() const {
    const int32_t parenCount = size();
    if (parenCount < 2) {
        return FALSE;
    }
    if (parenCount % 2 != 0) {
        return FALSE;
    }
    if (!isOpen(0)) {
        return FALSE;
    }

    // depth is the number of currently unclosed open parens.
    int32_t depth = 0;
    int32_t idx = 0;
    while (idx < parenCount) {
        if (isOpen(idx)) {
            ++depth;
        } else {
            --depth;
        }
        if (depth < 0) {
            return FALSE;
        }
        ++idx;
    }
    return depth == 0;
 }
 // Return the number of segments, computed as half the number of
 // recorded parentheses.  Assume caller has already gotten a TRUE
 // validate().
 int32_t Segments::count() const {
    // assert(validate());
    const int32_t parenCount = size();
    return parenCount / 2;
 }
 //----------------------------------------------------------------------
 // BEGIN RuleHalf
 //----------------------------------------------------------------------
@ -416,11 +170,7 @@ public:
    int32_t ante;   // position of ante context marker '{' in text
    int32_t post;   // position of post context marker '}' in text
-    // Record the position of the segment substrings and references.  A
+    int32_t maxRef; // n where maximum segment ref is $n; 1-based
    // given side should have segments or segment references, but not
    // both.
    Segments* segments;
    int32_t maxRef;       // index of largest ref ($n) on the right
    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
@ -432,9 +182,26 @@ public:
    // output text.
    int32_t cursorOffset; // only nonzero on output side
    // Position of first CURSOR_OFFSET on _right_.  This will be -1
    // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
    int32_t cursorOffsetPos;
    UBool anchorStart;
    UBool anchorEnd;
    UErrorCode ec;
    /**
     * UnicodeMatcher objects corresponding to each segment.
     */
    UVector segments;
    /**
     * The segment number from 0..n-1 of the next '(' we see
     * during parsing; 0-based.
     */
    int32_t nextSegmentNumber;
    TransliteratorParser& parser;
    //--------------------------------------------------
@ -443,22 +210,22 @@ public:
    RuleHalf(TransliteratorParser& parser);
    ~RuleHalf();
    /**
     * Parse one side of a rule, stopping at either the limit,
     * the END_OF_RULE character, or an operator.  Return
     * the pos of the terminating character (or limit).
     */
    int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);
    int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
                         UnicodeString& buf,
                         UBool isSegment);
    /**
     * Remove context.
     */
    void removeContext();
    /**
-     * Create and return an int[] array of segments.
+     * Create and return a UnicodeMatcher*[] array of segments,
     * or NULL if there are no segments.
     */
-    int32_t* createSegments(UErrorCode& status) const;
+    UnicodeMatcher** createSegments(UErrorCode& status) const;
    int syntaxError(UErrorCode code,
                    const UnicodeString& rule,
@ -472,30 +239,69 @@ private:
    RuleHalf& operator=(const RuleHalf&);
 };
-RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
+RuleHalf::RuleHalf(TransliteratorParser& p) :
    ec(U_ZERO_ERROR),
    segments(ec),
    parser(p)
 {
    cursor = -1;
    ante = -1;
    post = -1;
    segments = NULL;
    maxRef = -1;
    cursorOffset = 0;
    cursorOffsetPos = 0;
    anchorStart = anchorEnd = FALSE;
    segments.removeAllElements();
    nextSegmentNumber = 0;
 }
 RuleHalf::~RuleHalf() {
    delete segments;
 }
 /**
 * Parse one side of a rule, stopping at either the limit,
- * the END_OF_RULE character, or an operator.  Return
+ * the END_OF_RULE character, or an operator.
- * the pos of the terminating character (or limit).
+ * @return the index after the terminating character, or
 * if limit was reached, limit
 */
 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
    int32_t start = pos;
-    UnicodeString& buf = text;
+    text.truncate(0);
    pos = parseSection(rule, pos, limit, text, FALSE);
    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
        return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
    }
    return pos;
 }
 /**
 * Parse a section of one side of a rule, stopping at either
 * the limit, the END_OF_RULE character, an operator, or a
 * segment close character.  This method parses both a
 * top-level rule half and a segment within such a rule half.
 * It calls itself recursively to parse segments and nested
 * segments.
 * @param buf buffer into which to accumulate the rule pattern
 * characters, either literal characters from the rule or
 * standins for UnicodeMatcher objects including segments.
 * @param isSegment if true, then we've already seen a '(' and
 * pos on entry points right after it.  Accumulate everything
 * up to the closing ')', put it in a segment matcher object,
 * generate a standin for it, and add the standin to buf.  As
 * a side effect, update the segments vector with a reference
 * to the segment matcher.  This works recursively for nested
 * segments.  If isSegment is false, just accumulate
 * characters into buf.
 * @return the index after the terminating character, or
 * if limit was reached, limit
 */
 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
                               UnicodeString& buf,
                               UBool isSegment) {
    int32_t start = pos;
    ParsePosition pp;
    int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
    UnicodeString scratch;
    UBool done = FALSE;
    int32_t quoteStart = -1; // Most recent 'single quoted string'
@ -503,6 +309,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
    int32_t varStart = -1; // Most recent $variableReference
    int32_t varLimit = -1;
    // If isSegment, then bufSegStart is the offset in buf to
    // the first character of the segment we are parsing.
    int32_t bufSegStart = 0;
    int32_t segmentNumber = 0;
    if (isSegment) {
        bufSegStart = buf.length();
        segmentNumber = nextSegmentNumber++;
    }
    while (pos < limit && !done) {
        UChar c = rule.charAt(pos++);
        if (u_isWhitespace(c)) {
@ -511,8 +326,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            // whitespace likely to be seen in code.
            continue;
        }
-        if (u_strchr(gOPERATORS, c) != NULL) {
+        if (u_strchr(HALF_ENDERS, c) != NULL) {
-            --pos; // Backup to point to operator
+            if (isSegment) {
                // Unclosed segment
                return syntaxError(U_UNCLOSED_SEGMENT, rule, start);
            }
            break;
        }
        if (anchorEnd) {
@ -575,6 +393,10 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            continue;
        }
        switch (c) {
        //------------------------------------------------------
        // Elements allowed within and out of segments
        //------------------------------------------------------
        case ANCHOR_START:
            if (buf.length() == 0 && !anchorStart) {
                anchorStart = TRUE;
@ -584,17 +406,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            }
          break;
        case SEGMENT_OPEN:
-        case SEGMENT_CLOSE:
+            pos = parseSection(rule, pos, limit, buf, TRUE);
            // Handle segment definitions "(" and ")"
            // Parse "(", ")"
            if (segments == NULL) {
                segments = new Segments(parser.status);
            }
            segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN, parser.status);
            break;
        case END_OF_RULE:
            --pos; // Backup to point to END_OF_RULE
            done = TRUE;
            break;
        case SymbolTable::SYMBOL_REF:
            // Handle variable references and segment references "$1" .. "$9"
@ -655,25 +467,128 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
        case DOT:
            buf.append(parser.getDotStandIn());
            break;
        case KLEENE_STAR:
        case ONE_OR_MORE:
        case ZERO_OR_ONE:
            // Quantifiers.  We handle single characters, quoted strings,
            // variable references, and segments.
            //  a+      matches  aaa
            //  'foo'+  matches  foofoofoo
            //  $v+     matches  xyxyxy if $v == xy
            //  (seg)+  matches  segsegseg
            {
                if (isSegment && buf.length() == bufSegStart) {
                    // The */+ immediately follows '('
                    return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
                }
                int32_t qstart, qlimit;
                // The */+ follows an isolated character or quote
                // or variable reference
                if (buf.length() == quoteLimit) {
                    // The */+ follows a 'quoted string'
                    qstart = quoteStart;
                    qlimit = quoteLimit;
                } else if (buf.length() == varLimit) {
                    // The */+ follows a $variableReference
                    qstart = varStart;
                    qlimit = varLimit;
                } else {
                    // The */+ follows a single character, possibly
                    // a segment standin
                    qstart = buf.length() - 1;
                    qlimit = qstart + 1;
                }
                UnicodeMatcher *m =
                    new StringMatcher(buf, qstart, qlimit, FALSE, *parser.data);
                int32_t min = 0;
                int32_t max = Quantifier::MAX;
                switch (c) {
                case ONE_OR_MORE:
                    min = 1;
                    break;
                case ZERO_OR_ONE:
                    min = 0;
                    max = 1;
                    break;
                // case KLEENE_STAR:
                //    do nothing -- min, max already set
                }
                m = new Quantifier(m, min, max);
                buf.truncate(qstart);
                buf.append(parser.generateStandInFor(m));
            }
            break;
        //------------------------------------------------------
        // Elements allowed ONLY WITHIN segments
        //------------------------------------------------------
        case SEGMENT_CLOSE:
            if (isSegment) {
                // We're done parsing a segment.  The relevant
                // characters are in buf, starting at offset
                // bufSegStart.  Extract them into a string
                // matcher, and replace them with a standin
                // for that matcher.
                StringMatcher *m =
                    new StringMatcher(buf, bufSegStart, buf.length(),
                                      TRUE, *parser.data);
                // Since we call parseSection() recursively,
                // nested segments will result in segment i+1
                // getting parsed and stored before segment i;
                // be careful with the vector handling here.
                if ((segmentNumber+1) > segments.size()) {
                    segments.setSize(segmentNumber+1);
                }
                segments.setElementAt(m, segmentNumber);
                buf.truncate(bufSegStart);
                buf.append(parser.generateStandInFor(m));
                done = TRUE;
                break;
            }
            // If we aren't in a segment, then a segment close
            // character is a syntax error.
            return syntaxError(U_UNQUOTED_SPECIAL, rule, start);
        //------------------------------------------------------
        // Elements allowed ONLY OUTSIDE segments
        //------------------------------------------------------
        case CONTEXT_ANTE:
            if (isSegment) {
                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
            }
            if (ante >= 0) {
                return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
            }
            ante = buf.length();
            break;
        case CONTEXT_POST:
            if (isSegment) {
                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
            }
            if (post >= 0) {
                return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
            }
            post = buf.length();
            break;
        case CURSOR_POS:
            if (isSegment) {
                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
            }
            if (cursor >= 0) {
                return syntaxError(U_MULTIPLE_CURSORS, rule, start);
            }
            cursor = buf.length();
            break;
        case CURSOR_OFFSET:
            if (isSegment) {
                return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
            }
            if (cursorOffset < 0) {
                if (buf.length() > 0) {
                    return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
@ -695,69 +610,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
-        case DOT:
+
-            buf.append(parser.getDotStandIn());
+
-            break;
+        //------------------------------------------------------
-        case KLEENE_STAR:
+        // Non-special characters
-        case ONE_OR_MORE:
+        //------------------------------------------------------
        case ZERO_OR_ONE:
            // Quantifiers.  We handle single characters, quoted strings,
            // variable references, and segments.
            //  a+      matches  aaa
            //  'foo'+  matches  foofoofoo
            //  $v+     matches  xyxyxy if $v == xy
            //  (seg)+  matches  segsegseg
            {
                int32_t start, limit;
                UBool isOpenParen;
                UBool isSegment = FALSE;
                if (segments != 0 &&
                    segments->getLastParenOffset(isOpenParen) == buf.length()) {
                    // The */+ immediately follows a segment
                    if (isOpenParen) {
                        return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
                    }
                    if (!segments->extractLastParenSubstring(start, limit)) {
                        return syntaxError(U_MISMATCHED_SEGMENT_DELIMITERS, rule, start);
                    }
                    isSegment = TRUE;
                } else {
                    // The */+ follows an isolated character or quote
                    // or variable reference
                    if (buf.length() == quoteLimit) {
                        // The */+ follows a 'quoted string'
                        start = quoteStart;
                        limit = quoteLimit;
                    } else if (buf.length() == varLimit) {
                        // The */+ follows a $variableReference
                        start = varStart;
                        limit = varLimit;
                    } else {
                        // The */+ follows a single character
                        start = buf.length() - 1;
                        limit = start + 1;
                    }
                }
                UnicodeMatcher *m =
                    new StringMatcher(buf, start, limit, isSegment, *parser.data);
                int32_t min = 0;
                int32_t max = Quantifier::MAX;
                switch (c) {
                case ONE_OR_MORE:
                    min = 1;
                    break;
                case ZERO_OR_ONE:
                    min = 0;
                    max = 1;
                    break;
                // case KLEENE_STAR:
                //    do nothing -- min, max already set
                }
                m = new Quantifier(m, min, max);
                buf.truncate(start);
                buf.append(parser.generateStandInFor(m));
            }
            break;
        default:
            // Disallow unquoted characters other than [0-9A-Za-z]
            // in the printable ASCII range.  These characters are
@ -773,10 +630,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
        }
    }
    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
        return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
    }
    // text = buf.toString();
    return pos;
 }
@ -797,10 +650,15 @@ void RuleHalf::removeContext() {
 }
 /**
- * Create and return an int32_t[] array of segments.
+ * Create and return a UnicodeMatcher*[] array of segments,
 * or NULL if there are no segments.
 */
-int32_t* RuleHalf::createSegments(UErrorCode& status) const {
+UnicodeMatcher** RuleHalf::createSegments(UErrorCode& status) const {
-    return (segments == 0) ? 0 : segments->createArray(status);
+    if (segments.size() == 0) {
        return NULL;
    }
    UnicodeMatcher** result = new UnicodeMatcher*[segments.size()];
    return (UnicodeMatcher**) segments.toArray((void**) result);
 }
 //----------------------------------------------------------------------
@ -1172,9 +1030,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
        return start;
    }
-    if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(pos++))) == NULL) {
+    if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
        return syntaxError(U_MISSING_OPERATOR, rule, start);
    }
    ++pos;
    // Found an operator char.  Check for forward-reverse operator.
    if (op == REVERSE_RULE_OP &&
@ -1189,7 +1048,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    }
    if (pos < limit) {
-        if (rule.charAt(pos) == END_OF_RULE) {
+        if (rule.charAt(--pos) == END_OF_RULE) {
            ++pos;
        } else {
            // RuleHalf parser must have terminated at an operator
@ -1251,8 +1110,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // apply.
    if (op == FWDREV_RULE_OP) {
        right->removeContext();
-        delete right->segments;
+        right->segments.removeAllElements();
        right->segments = NULL;
        left->cursor = left->maxRef = -1;
        left->cursorOffset = 0;
    }
@ -1272,7 +1130,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // cannot place the cursor outside the limits of the context.
    // Anchors are only allowed on the input side.
    if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
-        right->segments != NULL || left->maxRef >= 0 ||
+        right->segments.size() > 0 || left->maxRef >= 0 ||
        (right->cursorOffset != 0 && right->cursor < 0) ||
        // - The following two checks were used to ensure that the
        // - the cursor offset stayed within the ante- or postcontext.
@ -1288,20 +1146,15 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    // Check integrity of segments and segment references.  Each
    // segment's start must have a corresponding limit, and the
    // references must not refer to segments that do not exist.
-    if (left->segments != NULL) {
+    if (right->maxRef > left->segments.size()) {
        if (!left->segments->validate()) {
            return syntaxError(U_MISSING_SEGMENT_CLOSE, rule, start);
        }
        int32_t n = left->segments->count();
        if (right->maxRef > n) {
        return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
    }
    }
    data->ruleSet.addRule(new TransliterationRule(
                                 left->text, left->ante, left->post,
                                 right->text, right->cursor, right->cursorOffset,
                                 left->createSegments(status),
                                 left->segments.size(),
                                 left->anchorStart, left->anchorEnd,
                                 data,
                                 status), status);
@ -1366,7 +1219,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
    if (variableNext >= variableLimit) {
        // throw new RuntimeException("Private use variables exhausted");
        delete adopted;
-        status = U_ILLEGAL_ARGUMENT_ERROR;
+        status = U_VARIABLE_RANGE_EXHAUSTED;
        return 0;
    }
    variablesVector->addElement(adopted, status);
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -14,28 +14,11 @@
 #include "unicode/uniset.h"
 #include "unicode/unicode.h"
 #include "cmemory.h"
 #include "strmatch.h"
 static const UChar APOSTROPHE = 0x0027; // '\''
 static const UChar BACKSLASH  = 0x005C; // '\' 
 // To process segments we need to allocate arrays of integers.  We use
 // stack storage as long as the segment count is <= MAX_STATIC_SEGS.
 // Otherwise, we allocate heap space.
 #define MAX_STATIC_SEGS 20
 // Macros for accessing the array of integers encoding the position of
 // SEGMENTS_COUNT number of segments, n (half the number of parens)
 // SEGMENTS_LEN   length of the segments array (number of elements)
 // SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
 // SEGMENTS_NUM   index into segments to access POS of $1.open,
 //                $1.close, $2.open, $2.close,.., $n.open, $n.close
 //                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
 #define FIRST_SEG_POS_INDEX 2
 #define SEGMENTS_COUNT(x) x[0]
 #define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
 #define SEGMENTS_POS(x,i) x[FIRST_SEG_POS_INDEX+i]
 #define SEGMENTS_NUM(x,i) (x[x[1]+i]-FIRST_SEG_POS_INDEX)
 U_NAMESPACE_BEGIN
 const UChar TransliterationRule::ETHER = 0xFFFF;
@ -56,11 +39,10 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>.  If greater than
 * <code>output.length()</code> then an exception is thrown.
- * @param adoptedSegs array of 2n integers.  Each of n pairs consists of offset,
+ * @param segs array of UnicodeMatcher corresponding to input pattern
- * limit for a segment of the input string.  Characters in the output string
+ * segments, or null if there are none.  The array itself is adopted,
- * refer to these segments if they are in a special range determined by the
+ * but the pointers within it are not.
- * associated RuleBasedTransliterator.Data object.  May be null if there are
+ * @param segsCount number of elements in segs[]
 * no segments.
 * @param anchorStart TRUE if the rule is anchored on the left to
 * the context start
 * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -70,7 +52,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
                                         int32_t anteContextPos, int32_t postContextPos,
                                         const UnicodeString& outputStr,
                                         int32_t cursorPosition, int32_t cursorOffset,
-                                         int32_t* adoptedSegs,
+                                         UnicodeMatcher** segs,
                                         int32_t segsCount,
                                         UBool anchorStart, UBool anchorEnd,
                                         const TransliterationRuleData* theData,
                                         UErrorCode& status) :
@ -113,23 +96,11 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
    this->cursorPos = cursorPosition + cursorOffset;
    this->output = outputStr;
    // We don't validate the segments array.  The caller must
-    // guarantee that the segments are well-formed.
+    // guarantee that the segments are well-formed (that is, that
-    this->segments = adoptedSegs;
+    // all $n references in the output refer to indices of this
-    // Find the position of the first segment index that is after the
+    // array, and that no array elements are null).
-    // anteContext (in the key).  Note that this may be a start or a
+    this->segments = segs;
-    // limit index.  If all segments are in the ante context,
+    this->segmentsCount = segsCount;
    // firstKeySeg should point past the last segment -- that is, it
    // should point at the end marker, which is -1.  This allows the
    // code to back up by one to obtain the last ante context segment.
    firstKeySeg = -1;
    if (segments != 0) {
        firstKeySeg = FIRST_SEG_POS_INDEX;
        while (segments[firstKeySeg] >= 0 &&
               segments[firstKeySeg] < anteContextLength) {
            ++firstKeySeg;
        }
        firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
    }
    pattern = input;
    flags = 0;
@ -149,18 +120,17 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
 TransliterationRule::TransliterationRule(TransliterationRule& other) :
    pattern(other.pattern),
    output(other.output),
    firstKeySeg(other.firstKeySeg),
    anteContextLength(other.anteContextLength),
    keyLength(other.keyLength),
    cursorPos(other.cursorPos),
    flags(other.flags),
    data(other.data) {
-    segments = 0;
+    segments = NULL;
-    if (other.segments != 0) {
+    segmentsCount = 0;
-        int32_t len = SEGMENTS_LEN(other.segments);
+    if (other.segmentsCount > 0) {
-        segments = new int32_t[len];
+        // Duplicate the array only; the matcher pointers inside it are
+        // copied as-is, so both rules refer to the same matcher objects.
+        segments = new UnicodeMatcher*[other.segmentsCount];
-        uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
+        uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
+        // Keep the count in step with the copied array.  Leaving it at
+        // zero would make every loop bounded by segmentsCount (such as
+        // the per-match resetMatch() pass) skip the copied segments.
+        segmentsCount = other.segmentsCount;
    }
 }
@ -341,26 +311,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // ============================ MATCH ===========================
-    // Record the actual positions, in the text, of the segments.
+    // Reset segment match data
-	// These are recorded in the order that they occur in the pattern.
+    if (segments != NULL) {
-
+        for (int32_t i=0; i<segmentsCount; ++i) {
-    // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
+            ((StringMatcher*) segments[i])->resetMatch();
-    // records the position in 'text' of each segment boundary, in
+        }
    // the order that they occur in 'pattern'.
    int32_t _segPos[2*MAX_STATIC_SEGS];
    int32_t *segPos = _segPos;
    if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
        segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
    }
    // iSeg is an index into segments[] that accesses the first
    // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
    // When indexing into segments[] FIRST_SEG_POS_INDEX must be
    // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
    int32_t iSeg = firstKeySeg - 1;
    // nextSegPos is an offset in 'pattern'.  When the cursor is
    // equal to nextSegPos, we are at a segment boundary, and we
    // record the position in the real text in segPos[].
    int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
    UMatchDegree m;
    int32_t lenDelta, keyLimit;
@ -386,26 +342,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(oText)) {
                --oText;
            } else {
-                m = U_MISMATCH;
+                return U_MISMATCH;
                goto exit;
            }
        } else {
            // Subtract 1 from contextStart to make it a reverse limit
            if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
                != U_MATCH) {
-                m = U_MISMATCH;
+                return U_MISMATCH;
                goto exit;
            }
        }
        while (nextSegPos == oPattern) {
            segPos[iSeg] = oText;
            if (oText >= 0) {
                segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
            } else {
                ++segPos[iSeg];
            }
            nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
        }
    }
    minOText = posAfter(text, oText);
@ -413,15 +358,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // ------------------------ Start Anchor ------------------------
    if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
-        m = U_MISMATCH;
+        return U_MISMATCH;
        goto exit;
    }
    // -------------------- Key and Post Context --------------------
    iSeg = firstKeySeg;
    nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
    oPattern = 0;
    oText = pos.start;
    keyLimit = 0;
@ -429,8 +370,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        if (incremental && oText == pos.limit) {
            // We've reached the limit without a mismatch and
            // without completing our match.
-            m = U_PARTIAL_MATCH;
+            return U_PARTIAL_MATCH;
            goto exit;
        }
        // It might seem that we could do a check like this here:
@ -445,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        // depending on whether we're in the key or in the post
        // context.
        while (oPattern == nextSegPos) {
            segPos[iSeg] = oText;
            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -467,13 +403,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(oText)) {
                ++oText;
            } else {
-                m = U_MISMATCH;
+                return U_MISMATCH;
                goto exit;
            }
        } else {
            m = matcher->matches(text, oText, matchLimit, incremental);
            if (m != U_MATCH) {
-                goto exit;
+                return m;
            }
        }
@ -486,10 +421,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        //!    // at the end of the key.
        //!    return UnicodeMatcher.U_MISMATCH;
        //!}
    }
    while (oPattern == nextSegPos) {
        segPos[iSeg] = oText;
        nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
    }
 	if (oPattern == keyLength) {
 		keyLimit = oText;
@ -509,8 +440,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // =========================== REPLACE ==========================
    // We have a full match.  The key is between pos.start and
-    // keyLimit.  Segment indices have been recorded in segPos[].
+    // keyLimit.
    // Perform a replacement.
    if (segments == NULL) {
        text.handleReplaceBetween(pos.start, keyLimit, output);
@ -562,12 +492,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                    buf.remove();
                }
                // Copy segment with out-of-band data 
-                b *= 2;
+                StringMatcher* m = (StringMatcher*) segments[b];
-                int32_t start = segPos[SEGMENTS_NUM(segments,b)];
+                int32_t start = m->getMatchStart();
-                int32_t limit = segPos[SEGMENTS_NUM(segments,b+1)];
+                int32_t limit = m->getMatchLimit();
                // If there was no match, that means that a quantifier
                // matched zero-length.  E.g., x (a)* y matched "xy".
                if (start >= 0) {
                    // Adjust indices for segments in post context
                    // for any inserted text between the key and
                    // the post context.
                    if (start >= keyLimit) {
                        start += dest - keyLimit;
                        limit += dest - keyLimit;
                    }
                    text.copy(start, limit, dest);
                    dest += limit - start;
                }
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
@ -600,13 +541,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    pos.contextLimit += lenDelta;
    // Restrict new value of start to [minOText, min(oText, pos.limit)].
    pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
-    m = U_MATCH;
+    return U_MATCH;
  exit:
    if (segPos != _segPos) {
        delete[] segPos;
    }
    return m;
 }
 /**
@ -727,23 +662,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
                                           UBool escapeUnprintable) const {
    int32_t i;
    // iseg indexes into segments[] directly (not offset from FSPI)
    int32_t iseg = FIRST_SEG_POS_INDEX-1;
    int32_t nextSeg = -1;
    // Build an array of booleans specifying open vs. close paren
    UBool _isOpen[2*MAX_STATIC_SEGS];
    UBool *isOpen = _isOpen;
    if (segments != 0) {
        if (SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
            isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
        }
        for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
            isOpen[SEGMENTS_NUM(segments,i)  ] = TRUE;
            isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
        }
        nextSeg = segments[++iseg];
    }
    // Accumulate special characters (and non-specials following them)
    // into quoteBuf.  Append quoteBuf, within single quotes, when
    // a non-quoted element must be inserted.
@ -765,14 +683,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
            appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
        }
        // Append either '(' or ')' if we are at a segment index
        if (i == nextSeg) {
            appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
                             (UChar)0x0028 : (UChar)0x0029,
                             TRUE, escapeUnprintable, quoteBuf);
            nextSeg = segments[++iseg];
        }
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
        }
@ -787,11 +697,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
        }
    }
    if (i == nextSeg) {
        // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
        appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
    }
    if (emitBraces && i == (anteContextLength + keyLength)) {
        appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
    }
@ -854,9 +759,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
    appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
    if (isOpen != _isOpen) {
        delete[] isOpen;
    }
    return rule;
 }
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -33,6 +33,16 @@ class TransliterationRuleData;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
 * <p>A rule may contain segments in its input string and segment
 * references in its output string.  A segment is a substring of the
 * input pattern, indicated by an offset and limit.  The segment may
 * be in the preceding or following context.  It may not span a
 * context boundary.  A segment reference is a special character in
 * the output string that causes a segment of the input string (not
 * the input pattern) to be copied to the output string.  The range of
 * special characters that represent segment references is defined by
 * RuleBasedTransliterator.Data.
 *
 * @author Alan Liu
 */
 class TransliterationRule {
@ -65,20 +75,20 @@ private:
    UnicodeString output;
    /**
-     * An array of integers encoding the position of the segments.
+     * An array of matcher objects corresponding to the input pattern
-     * See rbt_pars.cpp::Segments for more details.
+     * segments.  If there are no segments this is null.  N.B. This is
     * a UnicodeMatcher for generality, but in practice it is always a
     * StringMatcher.  In the future we may generalize this, but for
     * now we sometimes cast down to StringMatcher.
     *
     * The array is owned, but the pointers within it are not.
     */
-    int32_t* segments;
+    UnicodeMatcher** segments;
    /**
-     * A value we compute from segments.  The first index into segments[]
+     * The number of elements in segments[] or zero if segments is NULL.
     * that is >= anteContextLength.  That is, the first one that is within
     * the forward scanned part of the pattern -- the key or the postContext.
     * If there are no segments, this has the value -1.  This index is relative
     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
     */
-    int32_t firstKeySeg;
+    int32_t segmentsCount;
    /**
     * The length of the string that must match before the key.  If
@ -143,11 +153,10 @@ public:
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param adoptedSegs array of 2n integers.  Each of n pairs consists of offset,
+     * @param segs array of UnicodeMatcher corresponding to input pattern
-     * limit for a segment of the input string.  Characters in the output string
+     * segments, or null if there are none.  The array itself is adopted,
-     * refer to these segments if they are in a special range determined by the
+     * but the pointers within it are not.
-     * associated RuleBasedTransliterator.Data object.  May be null if there are
+     * @param segsCount number of elements in segs[]
     * no segments.
     * @param anchorStart TRUE if the rule is anchored on the left to
     * the context start
     * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -157,7 +166,8 @@ public:
                        int32_t anteContextPos, int32_t postContextPos,
                        const UnicodeString& outputStr,
                        int32_t cursorPosition, int32_t cursorOffset,
-                        int32_t* adoptedSegs,
+                        UnicodeMatcher** segs,
                        int32_t segsCount,
                        UBool anchorStart, UBool anchorEnd,
                        const TransliterationRuleData* data,
                        UErrorCode& status);
--- a/icu4c/source/i18n/strmatch.cpp
+++ b/icu4c/source/i18n/strmatch.cpp
@ -18,7 +18,9 @@ StringMatcher::StringMatcher(const UnicodeString& theString,
                             UBool isSeg,
                             const TransliterationRuleData& theData) :
    data(theData),
-    isSegment(isSeg)
+    isSegment(isSeg),
    matchStart(-1),
    matchLimit(-1)
 {
    theString.extractBetween(start, limit, pattern);
 }
@ -27,7 +29,9 @@ StringMatcher::StringMatcher(const StringMatcher& o) :
    UnicodeMatcher(o),
    pattern(o.pattern),
    data(o.data),
-    isSegment(o.isSegment)
+    isSegment(o.isSegment),
    // Carry over both ends of the recorded match range so that the copy
    // reports the same getMatchStart()/getMatchLimit() as the source.
    matchStart(o.matchStart),
    matchLimit(o.matchLimit)
 {
 }
@ -54,6 +58,7 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
    int32_t i;
    int32_t cursor = offset;
    if (limit < cursor) {
        // Match in the reverse direction
        for (i=pattern.length()-1; i>=0; --i) {
            UChar keyChar = pattern.charAt(i);
            const UnicodeMatcher* subm = data.lookup(keyChar);
@ -72,6 +77,14 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
                }
            }
        }
        // Record the match position, but adjust for a normal
        // forward start, limit, and only if a prior match does not
        // exist -- we want the rightmost match.
        if (matchStart < 0) {
            // cast away const -- should modify method to be non-const
            ((StringMatcher*)this)->matchStart = cursor+1;
            ((StringMatcher*)this)->matchLimit = offset+1;
        }
    } else {
        for (i=0; i<pattern.length(); ++i) {
            if (incremental && cursor == limit) {
@ -99,6 +112,10 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
                }
            }
        }
        // Record the match position
        // cast away const -- should modify method to be non-const
        ((StringMatcher*)this)->matchStart = offset;
        ((StringMatcher*)this)->matchLimit = cursor;
    }
    offset = cursor;
@ -128,7 +145,7 @@ UnicodeString& StringMatcher::toPattern(UnicodeString& result,
        result.append((UChar)41); /*)*/
    }
    // Flush quoteBuf out to result
-    TransliterationRule::appendToRule(result, (UChar32)(isSegment?41/*)*/:-1),
+    TransliterationRule::appendToRule(result, -1,
                                      TRUE, escapeUnprintable, quoteBuf);
    return result;
 }
@ -145,6 +162,32 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
 }
 /**
 * Discard any recorded match range by setting both the start and the
 * limit back to -1.  Call this before beginning a new set of match
 * attempts with this segment.
 */
 void StringMatcher::resetMatch() {
    matchLimit = -1;
    matchStart = -1;
 }
 /**
 * Return the start offset, in the match text, of the <em>rightmost</em>
 * match.  This method may get moved up into the UnicodeMatcher if
 * it turns out to be useful to generalize this.
 * @return the recorded start offset, or -1 if no match is currently
 * recorded (the state that resetMatch() leaves behind)
 */
 int32_t StringMatcher::getMatchStart() const {
    return matchStart;
 }
 /**
 * Return the limit offset, in the match text, of the <em>rightmost</em>
 * match.  This method may get moved up into the UnicodeMatcher if
 * it turns out to be useful to generalize this.
 * @return the recorded limit offset, or -1 if no match is currently
 * recorded (the state that resetMatch() leaves behind)
 */
 int32_t StringMatcher::getMatchLimit() const {
    return matchLimit;
 }
 U_NAMESPACE_END
 //eof
--- a/icu4c/source/i18n/strmatch.h
+++ b/icu4c/source/i18n/strmatch.h
@ -59,6 +59,26 @@ class StringMatcher : public UnicodeMatcher {
     */
    virtual UBool matchesIndexValue(uint8_t v) const;
    /**
     * Remove any match data.  This must be called before performing a
     * set of matches with this segment.
     */
    void resetMatch();
    /**
     * Return the start offset, in the match text, of the <em>rightmost</em>
     * match.  This method may get moved up into the UnicodeMatcher if
     * it turns out to be useful to generalize this.
     */
    int32_t getMatchStart() const;
    /**
     * Return the limit offset, in the match text, of the <em>rightmost</em>
     * match.  This method may get moved up into the UnicodeMatcher if
     * it turns out to be useful to generalize this.
     */
    int32_t getMatchLimit() const;
 private:
    UnicodeString pattern;
@ -66,6 +86,10 @@ class StringMatcher : public UnicodeMatcher {
    const TransliterationRuleData& data;
    UBool isSegment;
    int32_t matchStart;
    int32_t matchLimit;
 };
 U_NAMESPACE_END
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
- * $Date: 2001/10/26 22:59:26 $
+ * $Date: 2001/10/30 18:08:19 $
- * $Revision: 1.57 $
+ * $Revision: 1.58 $
 *
 *****************************************************************************************
 */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
               "c abc ababc",
               "d d abd");
        // NOTE: The (ab)+ when referenced just yields a single "ab",
        // not the full sequence of them.  This accords with perl behavior.
        expect("(ab)+ {x} > '(' $1 ')';",
               "x abx ababxy",
-               "x ab(ab) abab(abab)y");
+               "x ab(ab) abab(ab)y");
        expect("b+ > x;",
               "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
               "qa qab qaba qababc",
               "xa x xa xc");
-        // Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
+        // NOTE: The (ab)+ when referenced just yields a single "ab",
-        // In perl, it only matches the first occurrence, so the output
+        // not the full sequence of them.  This accords with perl behavior.
        // is "()a (ab) (ab)a (ab)c".
        expect("q(ab)* > '(' $1 ')';",
               "qa qab qaba qababc",
-               "()a (ab) (ab)a (abab)c");
+               "()a (ab) (ab)a (ab)c");
        // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
        // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
        expect(gr, "\u03B1\u0314", "ha");
    }
    /**
     * Exercise segments that carry a quantifier.  A reference to a
     * quantified segment should yield a single repetition, so
     * ([abc])+ > x $1 x; applied to "cba" produces "xax".  Also checks
     * that toRules() reproduces segment rules exactly.
     */
    public void TestQuantifiedSegment() {
        // Quantifier inside the segment: the normal case
        expect("([abc]+) > x $1 x;", "cba", "xcbax");
        // Quantifier wrapped around the segment: the tricky case
        expect("([abc])+ > x $1 x;", "cba", "xax");
        // The tricky case again, with the segment ahead of the key so
        // that it is matched in the reverse direction
        expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
        // Quantified segment in the post context
        expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
        // Round-trip toRule/toPattern, first for a non-quantified
        // segment and then for a quantified one.  The comparison is
        // exact, so be careful with spacing in these rule strings.
        String[] roundTripRules = {
            "([a-c]){q} > x $1 x;",
            "([a-c])+{q} > x $1 x;"
        };
        for (int i = 0; i < roundTripRules.length; ++i) {
            String source = roundTripRules[i];
            Transliterator trans =
                Transliterator.createFromRules("ID", source, Transliterator.FORWARD);
            String regenerated = trans.toRules(true);
            if (source.equals(regenerated)) {
                logln("Ok: \"" + source + "\" x toRules() => \"" + regenerated + "\"");
            } else {
                errln("FAIL: \"" + source + "\" x toRules() => \"" + regenerated + "\"");
            }
        }
    }
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
--- a/icu4j/src/com/ibm/icu/text/StringMatcher.java
+++ b/icu4j/src/com/ibm/icu/text/StringMatcher.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $ 
- * $Date: 2001/10/25 22:32:02 $ 
+ * $Date: 2001/10/30 18:04:08 $ 
- * $Revision: 1.2 $
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
    private boolean isSegment;
    private int matchStart;
    private int matchLimit;
    private final RuleBasedTransliterator.Data data;
    /**
     * Construct a matcher for the given pattern string, with no match
     * recorded yet.
     * @param theString stored as this matcher's pattern
     * @param isSeg stored as this matcher's isSegment flag
     * @param theData stored as this matcher's data object
     */
    public StringMatcher(String theString,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
        this.pattern = theString;
        this.isSegment = isSeg;
        this.data = theData;
        // Same initial state that resetMatch() produces
        this.matchLimit = -1;
        this.matchStart = -1;
    }
    public StringMatcher(String theString,
                         int start,
                         int limit,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
-        data = theData;
+        this(theString.substring(start, limit), isSeg, theData);
        isSegment = isSeg;
        pattern = theString.substring(start, limit);
    }
    /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
        int i;
        int[] cursor = new int[] { offset[0] };
        if (limit < cursor[0]) {
            // Match in the reverse direction
            for (i=pattern.length()-1; i>=0; --i) {
                char keyChar = pattern.charAt(i);
                UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
            // Record the match position, but adjust for a normal
            // forward start, limit, and only if a prior match does not
            // exist -- we want the rightmost match.
            if (matchStart < 0) {
                matchStart = cursor[0]+1;
                matchLimit = offset[0]+1;
            }
        } else {
            for (i=0; i<pattern.length(); ++i) {
                if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
            // Record the match position
            matchStart = offset[0];
            matchLimit = cursor[0];
        }
        offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
            result.append(')');
        }
        // Flush quoteBuf out to result
-        TransliterationRule.appendToRule(result, (isSegment?')':-1),
+        TransliterationRule.appendToRule(result, -1,
                                         true, escapeUnprintable, quoteBuf);
        return result.toString();
    }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
        UnicodeMatcher m = data.lookup(c);
        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
    }
    /**
     * Discard any recorded match range by setting both the start and
     * the limit back to -1.  Call this before beginning a new set of
     * match attempts with this segment.
     */
    public void resetMatch() {
        matchLimit = -1;
        matchStart = -1;
    }
    /**
     * Return the start offset, in the match text, of the <em>rightmost</em>
     * match.  This method may get moved up into the UnicodeMatcher if
     * it turns out to be useful to generalize this.
     * @return the recorded start offset, or -1 if no match is currently
     * recorded (the state that resetMatch() leaves behind)
     */
    public int getMatchStart() {
        return matchStart;
    }
    /**
     * Return the limit offset, in the match text, of the <em>rightmost</em>
     * match.  This method may get moved up into the UnicodeMatcher if
     * it turns out to be useful to generalize this.
     * @return the recorded limit offset, or -1 if no match is currently
     * recorded (the state that resetMatch() leaves behind)
     */
    public int getMatchLimit() {
        return matchLimit;
    }
 }
 //eof
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
- * $Date: 2001/10/25 23:22:15 $
+ * $Date: 2001/10/30 18:04:08 $
- * $Revision: 1.33 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
- * <p>A rule may contain segments in its input string and segment references in
+ * <p>A rule may contain segments in its input string and segment
- * its output string.  A segment is a substring of the input pattern, indicated
+ * references in its output string.  A segment is a substring of the
- * by an offset and limit.  The segment may span the preceding or following
+ * input pattern, indicated by an offset and limit.  The segment may
- * context.  A segment reference is a special character in the output string
+ * be in the preceding or following context.  It may not span a
- * that causes a segment of the input string (not the input pattern) to be
+ * context boundary.  A segment reference is a special character in
- * copied to the output string.  The range of special characters that represent
+ * the output string that causes a segment of the input string (not
- * segment references is defined by RuleBasedTransliterator.Data.
+ * the input pattern) to be copied to the output string.  The range of
 * special characters that represent segment references is defined by
 * RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
 */
 class TransliterationRule {
@ -64,20 +66,13 @@ class TransliterationRule {
    private String output;
    /**
-     * An array of integers encoding the position of the segments.
+     * An array of matcher objects corresponding to the input pattern
-     * See RuleBasedTransliterator.Segments for more details.
+     * segments.  If there are no segments this is null.  N.B. This is
     * a UnicodeMatcher for generality, but in practice it is always a
     * StringMatcher.  In the future we may generalize this, but for
     * now we sometimes cast down to StringMatcher.
     */
-    int[] segments;
+    UnicodeMatcher[] segments;
    /**
     * A value we compute from segments.  The first index into segments[]
     * that is >= anteContextLength.  That is, the first one that is within
     * the forward scanned part of the pattern -- the key or the postContext.
     * If there are no segments, this has the value -1.  This index is relative
     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
     */
    int firstKeySeg;
    /**
     * The length of the string that must match before the key.  If
@ -127,20 +122,6 @@ class TransliterationRule {
    private static final char APOSTROPHE = '\'';
    private static final char BACKSLASH  = '\\';
    // Macros for accessing the array of integers encoding the position of
    // the segments.  See RuleBasedTransliterator.Segments for more details.
    // SEGMENTS_COUNT number of segments, n (half the number of parens)
    // SEGMENTS_LEN   length of the segments array (number of elements)
    // SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
    // SEGMENTS_NUM   index into segments to access POS of $1.open,
    //                $1.close, $2.open, $2.close,.., $n.open, $n.close
    //                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
    // Index in the segments array at which the first table begins.
    static final int FIRST_SEG_POS_INDEX = 2;
    // Returns element 0 of the segments array.
    static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
    // Returns the array length computed from the count: count*4 + 4.
    static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
    // Returns element i of the table that starts at FIRST_SEG_POS_INDEX.
    static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
    // Returns element i of the table whose start index is stored in x[1],
    // made relative to FIRST_SEG_POS_INDEX.
    static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
@ -165,12 +146,8 @@ class TransliterationRule {
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param segs array of 2n integers.  Each of n pairs consists of offset,
+     * @param segs array of UnicodeMatcher corresponding to input pattern
-     * limit for a segment of the input string.  Characters in the output string
+     * segments, or null if there are none
     * refer to these segments if they are in a special range determined by the
     * associated RuleBasedTransliterator.Data object.  May be null if there are
     * no segments.  The caller is responsible for validating that segments
     * are well-formed.
     * @param anchorStart true if the rule is anchored on the left to
     * the context start
     * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos, int cursorOffset,
-                               int[] segs,
+                               UnicodeMatcher[] segs,
                               boolean anchorStart, boolean anchorEnd,
                               RuleBasedTransliterator.Data theData) {
        data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
        this.cursorPos = cursorPos + cursorOffset;
        this.output = output;
        // We don't validate the segments array.  The caller must
-        // guarantee that the segments are well-formed.
+        // guarantee that the segments are well-formed (that is, that
        // all $n references in the output refer to indices of this
        // array, and that no array elements are null).
        this.segments = segs;
        // Find the position of the first segment index that is after the
        // anteContext (in the key).  Note that this may be a start or a
        // limit index.  If all segments are in the ante context,
        // firstKeySeg should point past the last segment -- that is, it
        // should point at the end marker, which is -1.  This allows the
        // code to back up by one to obtain the last ante context segment.
        firstKeySeg = -1;
        if (segments != null) {
            firstKeySeg = FIRST_SEG_POS_INDEX;
            while (segments[firstKeySeg] >= 0 &&
                   segments[firstKeySeg] < anteContextLength) {
                ++firstKeySeg;
            }
            firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
        }
        pattern = input;
        flags = 0;
        if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {
        // ============================ MATCH ===========================
-        // Record the actual positions, in the text, of the segments.
+        // Reset segment match data
        // These are recorded in the order that they occur in the pattern.
        // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
        // records the position in 'text' of each segment boundary, in
        // the order that they occur in 'pattern'.
        int[] segPos = null;
        if (segments != null) {
-            segPos = new int[2*SEGMENTS_COUNT(segments)];
+            for (int i=0; i<segments.length; ++i) {
                ((StringMatcher) segments[i]).resetMatch();
            }
        }
        // iSeg is an index into segments[] that accesses the first
        // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
        // When indexing into segments[] FIRST_SEG_POS_INDEX must be
        // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
        int iSeg = firstKeySeg - 1;
        // nextSegPos is an offset in 'pattern'.  When the cursor is
        // equal to nextSegPos, we are at a segment boundary, and we
        // record the position in the real text in segPos[].
        int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
        int lenDelta, keyLimit;
        int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
                }
                oText = intRef[0];
            }
            while (nextSegPos == oPattern) {
                segPos[iSeg] = oText;
                if (oText >= 0) {
                    segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
                } else {
                    ++segPos[iSeg];
                }
                nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
            }
        }
        minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {
        // -------------------- Key and Post Context --------------------
        iSeg = firstKeySeg;
        nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
        oPattern = 0;
        oText = pos.start;
        keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
            // depending on whether we're in the key or in the post
            // context.
            while (oPattern == nextSegPos) {
                segPos[iSeg] = oText;
                nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
            }
            if (oPattern == keyLength) {
                keyLimit = oText;
            }
@ -554,10 +488,6 @@ class TransliterationRule {
            //!    return UnicodeMatcher.U_MISMATCH;
            //!}
        }
        while (oPattern == nextSegPos) {
            segPos[iSeg] = oText;
            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -576,8 +506,7 @@ class TransliterationRule {
        // =========================== REPLACE ==========================
        // We have a full match.  The key is between pos.start and
-        // keyLimit.  Segment indices have been recorded in segPos[].
+        // keyLimit.
        // Perform a replacement.
        if (segments == null) {
            text.replace(pos.start, keyLimit, output);
@ -629,12 +558,23 @@ class TransliterationRule {
                        buf.setLength(0);
                    }
                    // Copy segment with out-of-band data
-                    b *= 2;
+                    StringMatcher m = (StringMatcher) segments[b];
-                    int start = segPos[SEGMENTS_NUM(segments,b)];
+                    int start = m.getMatchStart();
-                    int limit = segPos[SEGMENTS_NUM(segments,b+1)];
+                    int limit = m.getMatchLimit();
                    // If there was no match, that means that a quantifier
                    // matched zero-length.  E.g., x (a)* y matched "xy".
                    if (start >= 0) {
                        // Adjust indices for segments in post context
                        // for any inserted text between the key and
                        // the post context.
                        if (start >= keyLimit) {
                            start += dest - keyLimit;
                            limit += dest - keyLimit;
                        }
                        text.copy(start, limit, dest);
                        dest += limit - start;
                    }
                }
                oOutput += UTF16.getCharCount(c);
            }
            // Insert any accumulated straight text.
@ -790,20 +730,6 @@ class TransliterationRule {
        StringBuffer rule = new StringBuffer();
        // iseg indexes into segments[] directly (not offset from FSPI)
        int iseg = FIRST_SEG_POS_INDEX-1;
        int nextSeg = -1;
        // Build an array of booleans specifying open vs. close paren
        boolean[] isOpen = null;
        if (segments != null) {
            isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
            for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
                isOpen[SEGMENTS_NUM(segments,i)  ] = true;
                isOpen[SEGMENTS_NUM(segments,i+1)] = false;
            }
            nextSeg = segments[++iseg];
        }
        // Accumulate special characters (and non-specials following them)
        // into quoteBuf.  Append quoteBuf, within single quotes, when
        // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
                appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
            }
            // Append either '(' or ')' if we are at a segment index
            if (i == nextSeg) {
                appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
                                 '(' : ')',
                                 true, escapeUnprintable, quoteBuf);
                nextSeg = segments[++iseg];
            }
            if (emitBraces && i == (anteContextLength + keyLength)) {
                appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
            }
@ -847,11 +765,6 @@ class TransliterationRule {
            }
        }
        if (i == nextSeg) {
            // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
            appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
        }
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
        }
@ -885,7 +798,7 @@ class TransliterationRule {
            } else {
                ++seg; // make 1-based
                appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
-                rule.append(0x24 /*$*/);
+                rule.append('$');
                boolean show = false; // true if we should display digits
                for (int p=9; p>=0; --p) {
                    int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {
 /**
 * $Log: TransliterationRule.java,v $
 * Revision 1.34  2001/10/30 18:04:08  alan
 * jitterbug 1406: make quantified segments behave like perl counterparts
 *
 * Revision 1.33  2001/10/25 23:22:15  alan
 * jitterbug 73: changes to support zero-length matchers at end of key
 *
--- a/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
@ -4,8 +4,8 @@
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
-* $Date: 2001/10/24 00:03:38 $
+* $Date: 2001/10/30 18:04:09 $
-* $Revision: 1.7 $
+* $Revision: 1.8 $
 **********************************************************************
 */
 package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op
    private static final String OPERATORS = "=><";
    private static final String HALF_ENDERS = "=><;";
    // Other special characters
    private static final char QUOTE               = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
    // private static final char ANCHOR_END       = '$';
    // Segments of the input string are delimited by "(" and ")".  In the
-    // output string these segments are referenced as "$1" through "$9".
+    // output string these segments are referenced as "$1", "$2", etc.
    private static final char SEGMENT_OPEN        = '(';
    private static final char SEGMENT_CLOSE       = ')';
@ -285,209 +286,6 @@ class TransliteratorParser {
        }
    };
    //----------------------------------------------------------------------
    // class Segments
    //----------------------------------------------------------------------
    /**
     * Segments are parentheses-enclosed regions of the input string.
     * These are referenced in the output string using the notation $1,
     * $2, etc.  Numbering is in order of appearance of the left
     * parenthesis.  Number is one-based.  Segments are defined as start,
     * limit pairs.  Segments may nest.
     *
     * During parsing, segment data is encoded in an object of class
     * Segments.  At runtime, the same data is encoded in compact form as
     * an array of integers in a TransliterationRule.  The runtime encoding
     * must satisfy three goals:
     *
     * 1. Iterate over the offsets in a pattern, from left to right,
     *    and indicate all segment boundaries, in order.  This is done
     *    during matching.
     *
     * 2. Given a reference $n, produce the start and limit offsets
     *    for that segment.  This is done during replacement.
     *
     * 3. Similar to goal 1, but in addition, indicate whether each
     *    segment boundary is a start or a limit, in other words, whether
     *    each is an open paren or a close paren.  This is required by
     *    the toRule() method.
     *
     * Goal 1 must be satisfied at high speed since this is done during
     * matching.  Goal 2 is next most important.  Goal 3 is not performance
     * critical since it is only needed by toRule().
     *
     * The array of integers is actually two arrays concatenated.  The
     * first gives the index values of the open and close parentheses in
     * the order they appear.  The second maps segment numbers to the
     * indices of the first array.  The two arrays have the same length.
     * Iterating over the first array satisfies goal 1.  Indexing into the
     * second array satisfies goal 2.  Goal 3 is satisfied by iterating
     * over the second array and constructing the required data when
     * needed.  This is what toRule() does.
     *
     * Example:  (a b(c d)e f)
     *            0 1 2 3 4 5 6
     *
     * First array: Indices are 0, 2, 4, and 6.
     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
     * second array is 0, 3, 1, 2 -- these give the indices in the
     * first array at which $1:open, $1:close, $2:open, and $2:close
     * occur.
     *
     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
     *
     * Each subarray is terminated with a -1, and two leading entries
     * give the number of segments and the offset to the first entry
     * of the second array.  In addition, the second array's values are
     * all offset by 2 so they index directly into the final array.
     * The total array size is 4*segments[0] + 4.  The second index is
     * 2*segments[0] + 3.
     *
     * In the output string, a segment reference is indicated by a
     * character in a special range, as defined by
     * RuleBasedTransliterator.Data.
     *
     * Most rules have no segments, in which case segments is null, and the
     * output string need not be checked for segment reference characters.
     *
     * See also rbt_rule.h/cpp.
     */
    private static class Segments {
        // Parallel vectors with one entry per parenthesis, in the order
        // the parentheses were added.
        private Vector offsets; // holds Integer objects
        private Vector isOpenParen; // holds Boolean objects

        /** Offset recorded for the i-th parenthesis. */
        private int offset(int i) {
            return ((Integer) offsets.elementAt(i)).intValue();
        }

        /** True if the i-th parenthesis is an open paren. */
        private boolean isOpen(int i) {
            return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
        }

        // size of the Vectors
        private int size() {
            // assert(offsets.size() == isOpenParen.size());
            return offsets.size();
        }

        public Segments() {
            offsets = new Vector();
            isOpenParen = new Vector();
        }

        /**
         * Record a parenthesis.
         * @param offset the offset to store for this parenthesis
         * @param isOpen true for an open paren, false for a close paren
         */
        public void addParenthesisAt(int offset, boolean isOpen) {
            offsets.addElement(new Integer(offset));
            isOpenParen.addElement(new Boolean(isOpen));
        }

        /**
         * Return the offset of the most recently added parenthesis, or
         * -1 if none has been added.
         * @param isOpenParen one-element output array; when a parenthesis
         * exists, element 0 is set to true if it is an open paren
         */
        public int getLastParenOffset(boolean[] isOpenParen) {
            if (size() == 0) {
                return -1;
            }
            isOpenParen[0] = isOpen(size()-1);
            return offset(size()-1);
        }

        // Remove the last (rightmost) segment.  Store its offsets in start
        // and limit, and then convert all offsets at or after start to be
        // equal to start.  Upon failure, return FALSE.  Assume that the
        // caller has already called getLastParenOffset() and validated that
        // there is at least one parenthesis and that the last one is a close
        // paren.
        public boolean extractLastParenSubstring(int[] start, int[] limit) {
            // assert(offsets.size() > 0);
            // assert(!isOpen(size()-1));
            int i = size() - 1;
            int n = 1; // count of close parens we need to match
            // Record position of the last close paren
            limit[0] = offset(i);
            // Scan leftwards from the paren before the last one until the
            // open paren that balances the last close paren is found.  The
            // index must move on every step; otherwise a nested segment,
            // e.g. ((a)b)*, would keep re-reading the same inner close
            // paren and never terminate properly.
            --i;
            while (i >= 0) {
                n += isOpen(i) ? -1 : 1;
                if (n == 0) {
                    break; // i is the matching open paren
                }
                --i;
            }
            if (n != 0) {
                // Ran off the left end without balancing
                return false;
            }
            // assert(i>=0);
            start[0] = offset(i);
            // Reset all segment pairs from i to size() - 1 to [start, start+1).
            while (i<size()) {
                int o = isOpen(i) ? start[0] : (start[0]+1);
                offsets.setElementAt(new Integer(o), i);
                ++i;
            }
            return true;
        }

        /**
         * Build the int array encoding: two leading entries (segment
         * count and offset of the second subarray), then the paren
         * offsets in order, then the per-segment open/close indices,
         * with each subarray terminated by -1.
         * Assume caller has already gotten a TRUE validate().
         */
        public int[] createArray() {
            int c = count(); // number of segments
            int arrayLen = 4*c + 4;
            int[] array = new int[arrayLen];
            int a2offset = 2*c + 3; // offset to array 2
            array[0] = c;
            array[1] = a2offset;
            int i;
            for (i=0; i<2*c; ++i) {
                array[2+i] = offset(i);
            }
            array[a2offset-1] = -1;
            array[arrayLen-1] = -1;
            // Now walk through and match up segment numbers with parentheses.
            // Number segments from 0.  We're going to offset all entries by 2
            // to skip the first two elements, array[0] and array[1].
            Stack stack = new Stack();
            int nextOpen = 0; // seg # of next open, 0-based
            for (i=0; i<2*c; ++i) {
                boolean open = isOpen(i);
                // Let seg be the zero-based segment number.
                // Open parens are at 2*seg in array 2.
                // Close parens are at 2*seg+1 in array 2.
                if (open) {
                    array[a2offset + 2*nextOpen] = 2+i;
                    stack.push(new Integer(nextOpen));
                    ++nextOpen;
                } else {
                    int nextClose = ((Integer) stack.pop()).intValue();
                    array[a2offset + 2*nextClose+1] = 2+i;
                }
            }
            // assert(stack.empty());
            return array;
        }

        /**
         * Check that the recorded parentheses form well-nested pairs.
         * @return true if there are at least two parens, an even number
         * of them, the first is '(', no prefix has more closes than
         * opens, and opens and closes balance at the end
         */
        public boolean validate() {
            // want number of parens >= 2
            // want number of parens to be even
            // want first paren '('
            // want parens to match up in the end
            if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
                return false;
            }
            int n = 0;
            for (int i=0; i<size(); ++i) {
                n += isOpen(i) ? 1 : -1;
                if (n < 0) {
                    return false;
                }
            }
            return n == 0;
        }

        // Number of segments
        // Assume caller has already gotten a TRUE validate().
        public int count() {
            // assert(validate());
            return size() / 2;
        }
    }
    //----------------------------------------------------------------------
    // class RuleHalf
    //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
        public int ante = -1;   // position of ante context marker '{' in text
        public int post = -1;   // position of post context marker '}' in text
-        // Record the position of the segment substrings and references.  A
+        public int maxRef = -1; // n where maximum segment ref is $n; 1-based
        // given side should have segments or segment references, but not
        // both.
        public Segments segments = null;
        public int maxRef = -1; // index of largest ref (1..9)
        // Record the offset to the cursor either to the left or to the
        // right of the key.  This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
        // output text.
        public int cursorOffset = 0; // only nonzero on output side
        // Position of first CURSOR_OFFSET on _right_.  This will be -1
        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
        private int cursorOffsetPos = 0;
        public boolean anchorStart = false;
        public boolean anchorEnd   = false;
        /**
         * UnicodeMatcher objects corresponding to each segment.
         */
        public Vector segments = new Vector();
        /**
         * The segment number from 0..n-1 of the next '(' we see
         * during parsing; 0-based.
         */
        private int nextSegmentNumber = 0;
        /**
         * Parse one side of a rule, stopping at either the limit,
-         * the END_OF_RULE character, or an operator.  Return
+         * the END_OF_RULE character, or an operator.
-         * the pos of the terminating character (or limit).
+         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit,
                         TransliteratorParser parser) {
            // Remember where this rule half began so that an error can
            // be reported against it.
            final int halfStart = pos;

            // Top-level call: isSegment is false, so the pattern for the
            // whole half is accumulated into a fresh buffer.
            StringBuffer pattern = new StringBuffer();
            final int next = parseSection(rule, pos, limit, parser, pattern, false);
            text = pattern.toString();

            // When a positive cursor offset was recorded, the cursor must
            // be at cursorOffsetPos; anything else is a misplaced cursor.
            final boolean hasPositiveOffset = cursorOffset > 0;
            if (hasPositiveOffset && cursor != cursorOffsetPos) {
                syntaxError("Misplaced " + CURSOR_POS, rule, halfStart);
            }
            return next;
        }
        /**
         * Parse a section of one side of a rule, stopping at either
         * the limit, the END_OF_RULE character, an operator, or a
         * segment close character.  This method parses both a
         * top-level rule half and a segment within such a rule half.
         * It calls itself recursively to parse segments and nested
         * segments.
         * @param buf buffer into which to accumulate the rule pattern
         * characters, either literal characters from the rule or
         * standins for UnicodeMatcher objects including segments.
         * @param isSegment if true, then we've already seen a '(' and
         * pos on entry points right after it.  Accumulate everything
         * up to the closing ')', put it in a segment matcher object,
         * generate a standin for it, and add the standin to buf.  As
         * a side effect, update the segments vector with a reference
         * to the segment matcher.  This works recursively for nested
         * segments.  If isSegment is false, just accumulate
         * characters into buf.
         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        private int parseSection(String rule, int pos, int limit,
                                 TransliteratorParser parser,
                                 StringBuffer buf,
                                 boolean isSegment) {
            int start = pos;
            ParsePosition pp = null;
            int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
            boolean done = false;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];
            // If isSegment, then bufSegStart is the offset in buf to
            // the first character of the segment we are parsing.
            int bufSegStart = 0;
            int segmentNumber = 0;
            if (isSegment) {
                bufSegStart = buf.length();
                segmentNumber = nextSegmentNumber++;
            }
        main:
-            while (pos < limit && !done) {
+            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (Character.isWhitespace(c)) {
                    // Ignore whitespace.  Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
                    // whitespace likely to be seen in code.
                    continue;
                }
-                if (OPERATORS.indexOf(c) >= 0) {
+                // HALF_ENDERS is all chars that end a rule half: "<>=;"
-                    --pos; // Backup to point to operator
+                if (HALF_ENDERS.indexOf(c) >= 0) {
                    if (isSegment) {
                        syntaxError("Unclosed segment", rule, start);
                    }
                    break main;
                }
                if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
                    }
                    continue;
                }
                switch (c) {
                //------------------------------------------------------
                // Elements allowed within and out of segments
                //------------------------------------------------------
                case ANCHOR_START:
                    if (buf.length() == 0 && !anchorStart) {
                        anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
                    }
                    break;
                case SEGMENT_OPEN:
-                case SEGMENT_CLOSE:
+                    pos = parseSection(rule, pos, limit, parser, buf, true);
                    // Handle segment definitions "(" and ")"
                    // Parse "(", ")"
                    if (segments == null) {
                        segments = new Segments();
                    }
                    segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
                    break;
                case END_OF_RULE:
                    --pos; // Backup to point to END_OF_RULE
                    break main;
                case SymbolTable.SYMBOL_REF:
                    // Handle variable references and segment references "$1" .. "$9"
                    {
@ -697,25 +549,129 @@ class TransliteratorParser {
                        }
                    }
                    break;
                case DOT:
                    buf.append(parser.getDotStandIn());
                    break;
                case KLEENE_STAR:
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
                    // Quantifiers.  We handle single characters, quoted strings,
                    // variable references, and segments.
                    //  a+      matches  aaa
                    //  'foo'+  matches  foofoofoo
                    //  $v+     matches  xyxyxy if $v == xy
                    //  (seg)+  matches  segsegseg
                    {
                        if (isSegment && buf.length() == bufSegStart) {
                            // The */+ immediately follows '('
                            syntaxError("Misplaced quantifier", rule, start);
                            break;
                        } 
                        int qstart, qlimit;
                        // The */+ follows an isolated character or quote
                        // or variable reference
                        if (buf.length() == quoteLimit) {
                            // The */+ follows a 'quoted string'
                            qstart = quoteStart;
                            qlimit = quoteLimit;
                        } else if (buf.length() == varLimit) {
                            // The */+ follows a $variableReference
                            qstart = varStart;
                            qlimit = varLimit;
                        } else {
                            // The */+ follows a single character, possibly
                            // a segment standin
                            qstart = buf.length() - 1;
                            qlimit = qstart + 1;
                        }
                        UnicodeMatcher m =
                            new StringMatcher(buf.toString(), qstart, qlimit,
                                              false, parser.data);
                        int min = 0;
                        int max = Quantifier.MAX;
                        switch (c) {
                        case ONE_OR_MORE:
                            min = 1;
                            break;
                        case ZERO_OR_ONE:
                            min = 0;
                            max = 1;
                            break;
                            // case KLEENE_STAR:
                            //    do nothing -- min, max already set
                        }
                        m = new Quantifier(m, min, max);
                        buf.setLength(qstart);
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
                //------------------------------------------------------
                // Elements allowed ONLY WITHIN segments
                //------------------------------------------------------
                case SEGMENT_CLOSE:
                    if (isSegment) {
                        // We're done parsing a segment.  The relevant
                        // characters are in buf, starting at offset
                        // bufSegStart.  Extract them into a string
                        // matcher, and replace them with a standin
                        // for that matcher.
                        StringMatcher m =
                            new StringMatcher(buf.substring(bufSegStart),
                                              true, parser.data);
                        // Since we call parseSection() recursively,
                        // nested segments will result in segment i+1
                        // getting parsed and stored before segment i;
                        // be careful with the vector handling here.
                        if ((segmentNumber+1) > segments.size()) {
                            segments.setSize(segmentNumber+1);
                        }
                        segments.setElementAt(m, segmentNumber);
                        buf.setLength(bufSegStart);
                        buf.append(parser.generateStandInFor(m));
                        break main;
                    }
                    // If we aren't in a segment, then a segment close
                    // character is a syntax error.
                    syntaxError("Unquoted special", rule, start);
                    break;
                //------------------------------------------------------
                // Elements allowed ONLY OUTSIDE segments
                //------------------------------------------------------
                case CONTEXT_ANTE:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (ante >= 0) {
                        syntaxError("Multiple ante contexts", rule, start);
                    }
                    ante = buf.length();
                    break;
                case CONTEXT_POST:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (post >= 0) {
                        syntaxError("Multiple post contexts", rule, start);
                    }
                    post = buf.length();
                    break;
                case CURSOR_POS:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
                    }
                    cursor = buf.length();
                    break;
                case CURSOR_OFFSET:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (cursorOffset < 0) {
                        if (buf.length() > 0) {
                            syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
                        }
                    }
                    break;
-                case DOT:
+
-                    buf.append(parser.getDotStandIn());
+                //------------------------------------------------------
-                    break;
+                // Non-special characters
-                case KLEENE_STAR:
+                //------------------------------------------------------
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
                    // Quantifiers.  We handle single characters, quoted strings,
                    // variable references, and segments.
                    //  a+      matches  aaa
                    //  'foo'+  matches  foofoofoo
                    //  $v+     matches  xyxyxy if $v == xy
                    //  (seg)+  matches  segsegseg
                    {
                        int qstart, qlimit;
                        boolean[] isOpenParen = new boolean[1];
                        boolean isSegment = false;
                        if (segments != null &&
                            segments.getLastParenOffset(isOpenParen) == buf.length()) {
                            // The */+ immediately follows a segment
                            if (isOpenParen[0]) {
                                syntaxError("Misplaced quantifier", rule, start);
                            }
                            int[] startparam = new int[1];
                            int[] limitparam = new int[1];
                            if (!segments.extractLastParenSubstring(startparam, limitparam)) {
                                syntaxError("Mismatched segment delimiters", rule, start);
                            }
                            qstart = startparam[0];
                            qlimit = limitparam[0];
                            isSegment = true;
                        } else {
                            // The */+ follows an isolated character or quote
                            // or variable reference
                            if (buf.length() == quoteLimit) {
                                // The */+ follows a 'quoted string'
                                qstart = quoteStart;
                                qlimit = quoteLimit;
                            } else if (buf.length() == varLimit) {
                                // The */+ follows a $variableReference
                                qstart = varStart;
                                qlimit = varLimit;
                            } else {
                                // The */+ follows a single character
                                qstart = buf.length() - 1;
                                qlimit = qstart + 1;
                            }
                        }
                        UnicodeMatcher m =
                            new StringMatcher(buf.toString(), qstart, qlimit,
                                              isSegment, parser.data);
                        int min = 0;
                        int max = Quantifier.MAX;
                        switch (c) {
                        case ONE_OR_MORE:
                            min = 1;
                            break;
                        case ZERO_OR_ONE:
                            min = 0;
                            max = 1;
                            break;
                            // case KLEENE_STAR:
                            //    do nothing -- min, max already set
                        }
                        m = new Quantifier(m, min, max);
                        buf.setLength(qstart);
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
                    break;
                }
            }
            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
                syntaxError("Misplaced " + CURSOR_POS, rule, start);
            }
            text = buf.toString();
            return pos;
        }
@ -838,10 +725,12 @@ class TransliteratorParser {
        }
        /**
-         * Create and return an int[] array of segments.
+         * Create and return a UnicodeMatcher[] array of segments,
         * or null if there are no segments.
         */
-        int[] createSegments() {
+        UnicodeMatcher[] createSegments() {
-            return (segments == null) ? null : segments.createArray();
+            return (segments.size() == 0) ? null :
                (UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
        }
    }
@ -1096,9 +985,10 @@ class TransliteratorParser {
        pos = left.parse(rule, pos, limit, this);
        if (pos == limit ||
-            OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
+            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
-            syntaxError("No operator", rule, start);
+            syntaxError("No operator pos=" + pos, rule, start);
        }
        ++pos;
        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
        pos = right.parse(rule, pos, limit, this);
        if (pos < limit) {
-            if (rule.charAt(pos) == END_OF_RULE) {
+            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
-            right.segments = null;
+            right.segments.removeAllElements();
            left.cursor = left.maxRef = -1;
            left.cursorOffset = 0;
        }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
-            right.segments != null || left.maxRef >= 0 ||
+            right.segments.size() > 0 || left.maxRef >= 0 ||
            (right.cursorOffset != 0 && right.cursor < 0) ||
            // - The following two checks were used to ensure that the
            // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
        // Check integrity of segments and segment references.  Each
        // segment's start must have a corresponding limit, and the
        // references must not refer to segments that do not exist.
-        if (left.segments != null) {
+        if (right.maxRef > left.segments.size()) {
-            if (!left.segments.validate()) {
+            syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
                syntaxError("Missing segment close", rule, start);
            }
            int n = left.segments.count();
            if (right.maxRef > n) {
                syntaxError("Undefined segment reference", rule, start);
            }
        }
        data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
    char generateStandInFor(UnicodeMatcher matcher) {
        // assert(matcher != null);
        if (variableNext >= variableLimit) {
-            throw new RuntimeException("Private use variables exhausted");
+            throw new RuntimeException("Variable range exhausted");
        }
        variablesVector.addElement(matcher);
        return variableNext++;
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
- * $Date: 2001/10/26 22:59:26 $
+ * $Date: 2001/10/30 18:08:19 $
- * $Revision: 1.57 $
+ * $Revision: 1.58 $
 *
 *****************************************************************************************
 */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
               "c abc ababc",
               "d d abd");
        // NOTE: The (ab)+ when referenced just yields a single "ab",
        // not the full sequence of them.  This accords with perl behavior.
        expect("(ab)+ {x} > '(' $1 ')';",
               "x abx ababxy",
-               "x ab(ab) abab(abab)y");
+               "x ab(ab) abab(ab)y");
        expect("b+ > x;",
               "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
               "qa qab qaba qababc",
               "xa x xa xc");
-        // Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
+        // NOTE: The (ab)+ when referenced just yields a single "ab",
-        // In perl, it only matches the first occurrence, so the output
+        // not the full sequence of them.  This accords with perl behavior.
        // is "()a (ab) (ab)a (ab)c".
        expect("q(ab)* > '(' $1 ')';",
               "qa qab qaba qababc",
-               "()a (ab) (ab)a (abab)c");
+               "()a (ab) (ab)a (ab)c");
        // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
        // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
        expect(gr, "\u03B1\u0314", "ha");
    }
    /**
     * Exercise segments that carry a quantifier.  The key expectation
     * is that ([abc])+ > x $1 x; applied to "cba" produces "xax":
     * a reference to a quantified segment yields a single repetition,
     * not the whole run.
     */
    public void TestQuantifiedSegment() {
        // Quantifier inside the segment: $1 is the entire run
        expect("([abc]+) > x $1 x;", "cba", "xcbax");

        // Quantifier wrapped around the segment: the tricky case
        expect("([abc])+ > x $1 x;", "cba", "xax");

        // Same tricky case, with the quantified segment placed
        // before the key
        expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");

        // Quantified segment placed after the key
        expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");

        // Round-trip each rule through toRules(): first a plain
        // segment, then a quantified one.  The comparison is an exact
        // string match, so the spacing in these rules matters.
        String[] roundTripRules = new String[] {
            "([a-c]){q} > x $1 x;",
            "([a-c])+{q} > x $1 x;"
        };
        for (int i = 0; i < roundTripRules.length; ++i) {
            String r = roundTripRules[i];
            Transliterator t =
                Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
            String rr = t.toRules(true);
            if (r.equals(rr)) {
                logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
            } else {
                errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
            }
        }
    }
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
--- a/icu4j/src/com/ibm/text/StringMatcher.java
+++ b/icu4j/src/com/ibm/text/StringMatcher.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $ 
- * $Date: 2001/10/25 22:32:02 $ 
+ * $Date: 2001/10/30 18:04:08 $ 
- * $Revision: 1.2 $
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
    private boolean isSegment;
    private int matchStart;
    private int matchLimit;
    private final RuleBasedTransliterator.Data data;
    /**
     * Construct a matcher whose pattern is the whole of the given
     * string.  The matcher starts out with no recorded match.
     * @param theString the pattern; stored without modification
     * @param isSeg the value to store in the isSegment flag
     * @param theData the data object used to look up pattern
     * characters when matching
     */
    public StringMatcher(String theString,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
        this.pattern = theString;
        this.isSegment = isSeg;
        this.data = theData;
        // A negative position means "no match recorded"
        this.matchStart = -1;
        this.matchLimit = -1;
    }
    public StringMatcher(String theString,
                         int start,
                         int limit,
                         boolean isSeg,
                         RuleBasedTransliterator.Data theData) {
-        data = theData;
+        this(theString.substring(start, limit), isSeg, theData);
        isSegment = isSeg;
        pattern = theString.substring(start, limit);
    }
    /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
        int i;
        int[] cursor = new int[] { offset[0] };
        if (limit < cursor[0]) {
            // Match in the reverse direction
            for (i=pattern.length()-1; i>=0; --i) {
                char keyChar = pattern.charAt(i);
                UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
            // Record the match position, but adjust for a normal
            // forward start, limit, and only if a prior match does not
            // exist -- we want the rightmost match.
            if (matchStart < 0) {
                matchStart = cursor[0]+1;
                matchLimit = offset[0]+1;
            }
        } else {
            for (i=0; i<pattern.length(); ++i) {
                if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
                    }
                }
            }
            // Record the match position
            matchStart = offset[0];
            matchLimit = cursor[0];
        }
        offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
            result.append(')');
        }
        // Flush quoteBuf out to result
-        TransliterationRule.appendToRule(result, (isSegment?')':-1),
+        TransliterationRule.appendToRule(result, -1,
                                         true, escapeUnprintable, quoteBuf);
        return result.toString();
    }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
        UnicodeMatcher m = data.lookup(c);
        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
    }
    /**
     * Discard any recorded match position by setting both the start
     * and the limit back to -1.  Callers must invoke this before
     * starting a new set of matches with this segment.
     */
    public void resetMatch() {
        matchLimit = -1;
        matchStart = -1;
    }
    /**
     * Return the recorded start offset, in the match text, of the
     * <em>rightmost</em> match.  The value is -1 after resetMatch()
     * if nothing has been recorded since.  This accessor may be
     * hoisted into UnicodeMatcher if generalizing it proves useful.
     */
    public int getMatchStart() {
        return this.matchStart;
    }
    /**
     * Return the recorded limit offset, in the match text, of the
     * <em>rightmost</em> match.  The value is -1 after resetMatch()
     * if nothing has been recorded since.  This accessor may be
     * hoisted into UnicodeMatcher if generalizing it proves useful.
     */
    public int getMatchLimit() {
        return this.matchLimit;
    }
 }
 //eof
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
- * $Date: 2001/10/25 23:22:15 $
+ * $Date: 2001/10/30 18:04:08 $
- * $Revision: 1.33 $
+ * $Revision: 1.34 $
 *
 *****************************************************************************************
 */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
- * <p>A rule may contain segments in its input string and segment references in
+ * <p>A rule may contain segments in its input string and segment
- * its output string.  A segment is a substring of the input pattern, indicated
+ * references in its output string.  A segment is a substring of the
- * by an offset and limit.  The segment may span the preceding or following
+ * input pattern, indicated by an offset and limit.  The segment may
- * context.  A segment reference is a special character in the output string
+ * be in the preceding or following context.  It may not span a
- * that causes a segment of the input string (not the input pattern) to be
+ * context boundary.  A segment reference is a special character in
- * copied to the output string.  The range of special characters that represent
+ * the output string that causes a segment of the input string (not
- * segment references is defined by RuleBasedTransliterator.Data.
+ * the input pattern) to be copied to the output string.  The range of
 * special characters that represent segment references is defined by
 * RuleBasedTransliterator.Data.
 *
 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
 * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
 */
 class TransliterationRule {
@ -64,20 +66,13 @@ class TransliterationRule {
    private String output;
    /**
-     * An array of integers encoding the position of the segments.
+     * An array of matcher objects corresponding to the input pattern
-     * See RuleBasedTransliterator.Segments for more details.
+     * segments.  If there are no segments this is null.  N.B. This is
     * a UnicodeMatcher for generality, but in practice it is always a
     * StringMatcher.  In the future we may generalize this, but for
     * now we sometimes cast down to StringMatcher.
     */
-    int[] segments;
+    UnicodeMatcher[] segments;
    /**
     * A value we compute from segments.  The first index into segments[]
     * that is >= anteContextLength.  That is, the first one that is within
     * the forward scanned part of the pattern -- the key or the postContext.
     * If there are no segments, this has the value -1.  This index is relative
     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
     */
    int firstKeySeg;
    /**
     * The length of the string that must match before the key.  If
@ -127,20 +122,6 @@ class TransliterationRule {
    private static final char APOSTROPHE = '\'';
    private static final char BACKSLASH  = '\\';
    // Macros for accessing the array of integers encoding the position of
    // the segments.  See RuleBasedTransliterator.Segments for more details.
    // SEGMENTS_COUNT number of segments, n (half the number of parens)
    // SEGMENTS_LEN   length of the segments array (number of elements)
    // SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
    // SEGMENTS_NUM   index into segments to access POS of $1.open,
    //                $1.close, $2.open, $2.close,.., $n.open, $n.close
    //                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
    static final int FIRST_SEG_POS_INDEX = 2;
    static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
    static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
    static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
    static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
@ -165,12 +146,8 @@ class TransliterationRule {
     * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     * of -3.
-     * @param segs array of 2n integers.  Each of n pairs consists of offset,
+     * @param segs array of UnicodeMatcher corresponding to input pattern
-     * limit for a segment of the input string.  Characters in the output string
+     * segments, or null if there are none
     * refer to these segments if they are in a special range determined by the
     * associated RuleBasedTransliterator.Data object.  May be null if there are
     * no segments.  The caller is responsible for validating that segments
     * are well-formed.
     * @param anchorStart true if the rule is anchored on the left to
     * the context start
     * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos, int cursorOffset,
-                               int[] segs,
+                               UnicodeMatcher[] segs,
                               boolean anchorStart, boolean anchorEnd,
                               RuleBasedTransliterator.Data theData) {
        data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
        this.cursorPos = cursorPos + cursorOffset;
        this.output = output;
        // We don't validate the segments array.  The caller must
-        // guarantee that the segments are well-formed.
+        // guarantee that the segments are well-formed (that is, that
        // all $n references in the output refer to indices of this
        // array, and that no array elements are null).
        this.segments = segs;
        // Find the position of the first segment index that is after the
        // anteContext (in the key).  Note that this may be a start or a
        // limit index.  If all segments are in the ante context,
        // firstKeySeg should point past the last segment -- that is, it
        // should point at the end marker, which is -1.  This allows the
        // code to back up by one to obtain the last ante context segment.
        firstKeySeg = -1;
        if (segments != null) {
            firstKeySeg = FIRST_SEG_POS_INDEX;
            while (segments[firstKeySeg] >= 0 &&
                   segments[firstKeySeg] < anteContextLength) {
                ++firstKeySeg;
            }
            firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
        }
        pattern = input;
        flags = 0;
        if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {
        // ============================ MATCH ===========================
-        // Record the actual positions, in the text, of the segments.
+        // Reset segment match data
        // These are recorded in the order that they occur in the pattern.
        // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
        // records the position in 'text' of each segment boundary, in
        // the order that they occur in 'pattern'.
        int[] segPos = null;
        if (segments != null) {
-            segPos = new int[2*SEGMENTS_COUNT(segments)];
+            for (int i=0; i<segments.length; ++i) {
                ((StringMatcher) segments[i]).resetMatch();
            }
        }
        // iSeg is an index into segments[] that accesses the first
        // array.  As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
        // When indexing into segments[] FIRST_SEG_POS_INDEX must be
        // added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
        int iSeg = firstKeySeg - 1;
        // nextSegPos is an offset in 'pattern'.  When the cursor is
        // equal to nextSegPos, we are at a segment boundary, and we
        // record the position in the real text in segPos[].
        int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
        int lenDelta, keyLimit;
        int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
                }
                oText = intRef[0];
            }
            while (nextSegPos == oPattern) {
                segPos[iSeg] = oText;
                if (oText >= 0) {
                    segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
                } else {
                    ++segPos[iSeg];
                }
                nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
            }
        }
        minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {
        // -------------------- Key and Post Context --------------------
        iSeg = firstKeySeg;
        nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
        oPattern = 0;
        oText = pos.start;
        keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
            // depending on whether we're in the key or in the post
            // context.
            while (oPattern == nextSegPos) {
                segPos[iSeg] = oText;
                nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
            }
            if (oPattern == keyLength) {
                keyLimit = oText;
            }
@ -554,10 +488,6 @@ class TransliterationRule {
            //!    return UnicodeMatcher.U_MISMATCH;
            //!}
        }
        while (oPattern == nextSegPos) {
            segPos[iSeg] = oText;
            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
        }
        if (oPattern == keyLength) {
            keyLimit = oText;
        }
@ -576,8 +506,7 @@ class TransliterationRule {
        // =========================== REPLACE ==========================
        // We have a full match.  The key is between pos.start and
-        // keyLimit.  Segment indices have been recorded in segPos[].
+        // keyLimit.
        // Perform a replacement.
        if (segments == null) {
            text.replace(pos.start, keyLimit, output);
@ -629,12 +558,23 @@ class TransliterationRule {
                        buf.setLength(0);
                    }
                    // Copy segment with out-of-band data
-                    b *= 2;
+                    StringMatcher m = (StringMatcher) segments[b];
-                    int start = segPos[SEGMENTS_NUM(segments,b)];
+                    int start = m.getMatchStart();
-                    int limit = segPos[SEGMENTS_NUM(segments,b+1)];
+                    int limit = m.getMatchLimit();
                    // If there was no match, that means that a quantifier
                    // matched zero-length.  E.g., x (a)* y matched "xy".
                    if (start >= 0) {
                        // Adjust indices for segments in post context
                        // for any inserted text between the key and
                        // the post context.
                        if (start >= keyLimit) {
                            start += dest - keyLimit;
                            limit += dest - keyLimit;
                        }
                        text.copy(start, limit, dest);
                        dest += limit - start;
                    }
                }
                oOutput += UTF16.getCharCount(c);
            }
            // Insert any accumulated straight text.
@ -790,20 +730,6 @@ class TransliterationRule {
        StringBuffer rule = new StringBuffer();
        // iseg indexes into segments[] directly (not offset from FSPI)
        int iseg = FIRST_SEG_POS_INDEX-1;
        int nextSeg = -1;
        // Build an array of booleans specifying open vs. close paren
        boolean[] isOpen = null;
        if (segments != null) {
            isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
            for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
                isOpen[SEGMENTS_NUM(segments,i)  ] = true;
                isOpen[SEGMENTS_NUM(segments,i+1)] = false;
            }
            nextSeg = segments[++iseg];
        }
        // Accumulate special characters (and non-specials following them)
        // into quoteBuf.  Append quoteBuf, within single quotes, when
        // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
                appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
            }
            // Append either '(' or ')' if we are at a segment index
            if (i == nextSeg) {
                appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
                                 '(' : ')',
                                 true, escapeUnprintable, quoteBuf);
                nextSeg = segments[++iseg];
            }
            if (emitBraces && i == (anteContextLength + keyLength)) {
                appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
            }
@ -847,11 +765,6 @@ class TransliterationRule {
            }
        }
        if (i == nextSeg) {
            // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
            appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
        }
        if (emitBraces && i == (anteContextLength + keyLength)) {
            appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
        }
@ -885,7 +798,7 @@ class TransliterationRule {
            } else {
                ++seg; // make 1-based
                appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
-                rule.append(0x24 /*$*/);
+                rule.append('$');
                boolean show = false; // true if we should display digits
                for (int p=9; p>=0; --p) {
                    int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {
 /**
 * $Log: TransliterationRule.java,v $
 * Revision 1.34  2001/10/30 18:04:08  alan
 * jitterbug 1406: make quantified segments behave like perl counterparts
 *
 * Revision 1.33  2001/10/25 23:22:15  alan
 * jitterbug 73: changes to support zero-length matchers at end of key
 *
--- a/icu4j/src/com/ibm/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/text/TransliteratorParser.java
@ -4,8 +4,8 @@
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
-* $Date: 2001/10/24 00:03:38 $
+* $Date: 2001/10/30 18:04:09 $
-* $Revision: 1.7 $
+* $Revision: 1.8 $
 **********************************************************************
 */
 package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op
    private static final String OPERATORS = "=><";
    private static final String HALF_ENDERS = "=><;";
    // Other special characters
    private static final char QUOTE               = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
    // private static final char ANCHOR_END       = '$';
    // Segments of the input string are delimited by "(" and ")".  In the
-    // output string these segments are referenced as "$1" through "$9".
+    // output string these segments are referenced as "$1", "$2", etc.
    private static final char SEGMENT_OPEN        = '(';
    private static final char SEGMENT_CLOSE       = ')';
@ -285,209 +286,6 @@ class TransliteratorParser {
        }
    };
    //----------------------------------------------------------------------
    // class Segments
    //----------------------------------------------------------------------
    /**
     * Segments are parentheses-enclosed regions of the input string.
     * These are referenced in the output string using the notation $1,
     * $2, etc.  Numbering is in order of appearance of the left
     * parenthesis.  Number is one-based.  Segments are defined as start,
     * limit pairs.  Segments may nest.
     *
     * During parsing, segment data is encoded in an object of class
     * Segments.  At runtime, the same data is encoded in compact form as
     * an array of integers in a TransliterationRule.  The runtime encoding
     * must satisfy three goals:
     *
     * 1. Iterate over the offsets in a pattern, from left to right,
     *    and indicate all segment boundaries, in order.  This is done
     *    during matching.
     *
     * 2. Given a reference $n, produce the start and limit offsets
     *    for that segment.  This is done during replacement.
     *
     * 3. Similar to goal 1, but in addition, indicate whether each
     *    segment boundary is a start or a limit, in other words, whether
     *    each is an open paren or a close paren.  This is required by
     *    the toRule() method.
     *
     * Goal 1 must be satisfied at high speed since this is done during
     * matching.  Goal 2 is next most important.  Goal 3 is not performance
     * critical since it is only needed by toRule().
     *
     * The array of integers is actually two arrays concatenated.  The
     * first gives the index values of the open and close parentheses in
     * the order they appear.  The second maps segment numbers to the
     * indices of the first array.  The two arrays have the same length.
     * Iterating over the first array satisfies goal 1.  Indexing into the
     * second array satisfies goal 2.  Goal 3 is satisfied by iterating
     * over the second array and constructing the required data when
     * needed.  This is what toRule() does.
     *
     * Example:  (a b(c d)e f)
     *            0 1 2 3 4 5 6
     *
     * First array: Indices are 0, 2, 4, and 6.
     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
     * second array is 0, 3, 1, 2 -- these give the indices in the
     * first array at which $1:open, $1:close, $2:open, and $2:close
     * occur.
     *
     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
     *
     * Each subarray is terminated with a -1, and two leading entries
     * give the number of segments and the offset to the first entry
     * of the second array.  In addition, the second array values are
     * all offset by 2 so they index directly into the final array.
     * The total array size is 4*segments[0] + 4.  The second index is
     * 2*segments[0] + 3.
     *
     * In the output string, a segment reference is indicated by a
     * character in a special range, as defined by
     * RuleBasedTransliterator.Data.
     *
     * Most rules have no segments, in which case segments is null, and the
     * output string need not be checked for segment reference characters.
     *
     * See also rbt_rule.h/cpp.
     */
    /**
     * Parse-time record of the segment parentheses seen in a rule half.
     * Each parenthesis is stored as a pair of parallel entries: its
     * offset and a flag telling whether it is an open or a close paren.
     * Entries are kept in the order in which addParenthesisAt() was
     * called.
     */
    private static class Segments {
        private Vector offsets; // holds Integer objects
        private Vector isOpenParen; // holds Boolean objects
        /** Return the offset recorded for the i-th parenthesis. */
        private int offset(int i) {
            return ((Integer) offsets.elementAt(i)).intValue();
        }
        /** Return true if the i-th parenthesis is an open paren. */
        private boolean isOpen(int i) {
            return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
        }
        /** Return the number of parentheses recorded (size of the Vectors). */
        private int size() {
            // assert(offsets.size() == isOpenParen.size());
            return offsets.size();
        }
        /** Construct an empty parenthesis list. */
        public Segments() {
            offsets = new Vector();
            isOpenParen = new Vector();
        }
        /**
         * Record a parenthesis.
         * @param offset the offset to associate with the parenthesis
         * @param isOpen true for an open paren, false for a close paren
         */
        public void addParenthesisAt(int offset, boolean isOpen) {
            offsets.addElement(new Integer(offset));
            isOpenParen.addElement(new Boolean(isOpen));
        }
        /**
         * Return the offset of the most recently recorded parenthesis,
         * or -1 if none has been recorded.
         * @param isOpenParen output parameter; on a non-negative return,
         * element 0 is set to true if that parenthesis is an open paren.
         * Left untouched when -1 is returned.  Note that this parameter
         * shadows the field of the same name.
         */
        public int getLastParenOffset(boolean[] isOpenParen) {
            if (size() == 0) {
                return -1;
            }
            isOpenParen[0] = isOpen(size()-1);
            return offset(size()-1);
        }
        /**
         * Locate the last (rightmost) segment.  Store its offsets in
         * start[0] and limit[0], and then collapse every parenthesis from
         * its open paren onward: open parens are moved to start and close
         * parens to start+1.  The caller is expected to have called
         * getLastParenOffset() and validated that there is at least one
         * parenthesis and that the last one is a close paren.
         * @param start output parameter; element 0 receives the offset of
         * the open paren that matches the last close paren
         * @param limit output parameter; element 0 receives the offset of
         * the last close paren
         * @return true on success; false if there are no parentheses or
         * no open paren matches the last close paren.  On a false return
         * the recorded offsets are not modified (limit[0] may have been
         * set).
         */
        public boolean extractLastParenSubstring(int[] start, int[] limit) {
            // assert(offsets.size() > 0);
            // assert(!isOpen(size()-1));
            int i = size() - 1;
            if (i < 0) {
                return false; // nothing recorded; nothing to extract
            }
            int n = 1; // count of close parens we need to match
            // Record position of the last close paren
            limit[0] = offset(i);
            // Walk leftward from the paren before the last one.  Each
            // close paren adds one to the number of opens still needed;
            // each open paren removes one.  Stop ON the open paren that
            // brings the count to zero so that i indexes it.  The index
            // must advance on every step, otherwise a nested close paren
            // would be re-read forever.
            for (--i; i >= 0; --i) {
                n += isOpen(i) ? -1 : 1;
                if (n == 0) {
                    break;
                }
            }
            if (n != 0) {
                return false;
            }
            // assert(i>=0);
            start[0] = offset(i);
            // Reset all segment pairs from i to size() - 1 to [start, start+1).
            while (i<size()) {
                int o = isOpen(i) ? start[0] : (start[0]+1);
                offsets.setElementAt(new Integer(o), i);
                ++i;
            }
            return true;
        }
        /**
         * Build the compact int[] encoding of the segments.  Layout, for
         * c segments: [0] = c, [1] = index of the second subarray,
         * [2..2c+1] = paren offsets in recorded order, then -1, then for
         * each segment the indices (into this same array) of its open and
         * close paren, then -1.  Total length is 4*c + 4.
         * Assume caller has already gotten a TRUE validate().
         */
        public int[] createArray() {
            int c = count(); // number of segments
            int arrayLen = 4*c + 4;
            int[] array = new int[arrayLen];
            int a2offset = 2*c + 3; // offset to array 2
            array[0] = c;
            array[1] = a2offset;
            int i;
            for (i=0; i<2*c; ++i) {
                array[2+i] = offset(i);
            }
            array[a2offset-1] = -1;
            array[arrayLen-1] = -1;
            // Now walk through and match up segment numbers with parentheses.
            // Number segments from 0.  We're going to offset all entries by 2
            // to skip the first two elements, array[0] and array[1].
            Stack stack = new Stack();
            int nextOpen = 0; // seg # of next open, 0-based
            for (i=0; i<2*c; ++i) {
                boolean open = isOpen(i);
                // Let seg be the zero-based segment number.
                // Open parens are at 2*seg in array 2.
                // Close parens are at 2*seg+1 in array 2.
                if (open) {
                    array[a2offset + 2*nextOpen] = 2+i;
                    stack.push(new Integer(nextOpen));
                    ++nextOpen;
                } else {
                    // A close paren belongs to the most recently opened
                    // segment that is still unclosed.
                    int nextClose = ((Integer) stack.pop()).intValue();
                    array[a2offset + 2*nextClose+1] = 2+i;
                }
            }
            // assert(stack.empty());
            return array;
        }
        /**
         * Return true if the recorded parentheses form a well-formed,
         * non-empty set of segments.
         */
        public boolean validate() {
            // want number of parens >= 2
            // want number of parens to be even
            // want first paren '('
            // want parens to match up in the end
            if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
                return false;
            }
            int n = 0; // current nesting depth
            for (int i=0; i<size(); ++i) {
                n += isOpen(i) ? 1 : -1;
                if (n < 0) {
                    // A close paren appeared with no open paren pending
                    return false;
                }
            }
            return n == 0;
        }
        /**
         * Return the number of segments, that is, half the number of
         * recorded parentheses.
         * Assume caller has already gotten a TRUE validate().
         */
        public int count() {
            // assert(validate());
            return size() / 2;
        }
    }
    //----------------------------------------------------------------------
    // class RuleHalf
    //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
        public int ante = -1;   // position of ante context marker '{' in text
        public int post = -1;   // position of post context marker '}' in text
-        // Record the position of the segment substrings and references.  A
+        public int maxRef = -1; // n where maximum segment ref is $n; 1-based
        // given side should have segments or segment references, but not
        // both.
        public Segments segments = null;
        public int maxRef = -1; // index of largest ref (1..9)
        // Record the offset to the cursor either to the left or to the
        // right of the key.  This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
        // output text.
        public int cursorOffset = 0; // only nonzero on output side
        // Position of first CURSOR_OFFSET on _right_.  This will be -1
        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
        private int cursorOffsetPos = 0;
        public boolean anchorStart = false;
        public boolean anchorEnd   = false;
        /**
         * UnicodeMatcher objects corresponding to each segment.
         */
        public Vector segments = new Vector();
        /**
         * The segment number from 0..n-1 of the next '(' we see
         * during parsing; 0-based.
         */
        private int nextSegmentNumber = 0;
        /**
         * Parse one side of a rule, stopping at either the limit,
-         * the END_OF_RULE character, or an operator.  Return
+         * the END_OF_RULE character, or an operator.
-         * the pos of the terminating character (or limit).
+         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit,
                         TransliteratorParser parser) {
            // Keep the entry position; it is what gets handed to
            // syntaxError() if this half turns out to be malformed.
            final int halfStart = pos;
            // Pattern characters for this half are accumulated here by
            // parseSection(); false == we are not inside a segment.
            StringBuffer pattern = new StringBuffer();
            final int next =
                parseSection(rule, pos, limit, parser, pattern, false);
            text = pattern.toString();
            // With a positive cursor offset, the cursor must sit exactly
            // at the recorded cursorOffsetPos.
            boolean cursorMisplaced =
                cursorOffset > 0 && cursor != cursorOffsetPos;
            if (cursorMisplaced) {
                syntaxError("Misplaced " + CURSOR_POS, rule, halfStart);
            }
            return next;
        }
        /**
         * Parse a section of one side of a rule, stopping at either
         * the limit, the END_OF_RULE character, an operator, or a
         * segment close character.  This method parses both a
         * top-level rule half and a segment within such a rule half.
         * It calls itself recursively to parse segments and nested
         * segments.
         * @param buf buffer into which to accumulate the rule pattern
         * characters, either literal characters from the rule or
         * standins for UnicodeMatcher objects including segments.
         * @param isSegment if true, then we've already seen a '(' and
         * pos on entry points right after it.  Accumulate everything
         * up to the closing ')', put it in a segment matcher object,
         * generate a standin for it, and add the standin to buf.  As
         * a side effect, update the segments vector with a reference
         * to the segment matcher.  This works recursively for nested
         * segments.  If isSegment is false, just accumulate
         * characters into buf.
         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        private int parseSection(String rule, int pos, int limit,
                                 TransliteratorParser parser,
                                 StringBuffer buf,
                                 boolean isSegment) {
            int start = pos;
            ParsePosition pp = null;
            int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
            boolean done = false;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];
            // If isSegment, then bufSegStart is the offset in buf to
            // the first character of the segment we are parsing.
            int bufSegStart = 0;
            int segmentNumber = 0;
            if (isSegment) {
                bufSegStart = buf.length();
                segmentNumber = nextSegmentNumber++;
            }
        main:
-            while (pos < limit && !done) {
+            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (Character.isWhitespace(c)) {
                    // Ignore whitespace.  Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
                    // whitespace likely to be seen in code.
                    continue;
                }
-                if (OPERATORS.indexOf(c) >= 0) {
+                // HALF_ENDERS is all chars that end a rule half: "<>=;"
-                    --pos; // Backup to point to operator
+                if (HALF_ENDERS.indexOf(c) >= 0) {
                    if (isSegment) {
                        syntaxError("Unclosed segment", rule, start);
                    }
                    break main;
                }
                if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
                    }
                    continue;
                }
                switch (c) {
                //------------------------------------------------------
                // Elements allowed within and out of segments
                //------------------------------------------------------
                case ANCHOR_START:
                    if (buf.length() == 0 && !anchorStart) {
                        anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
                    }
                    break;
                case SEGMENT_OPEN:
-                case SEGMENT_CLOSE:
+                    pos = parseSection(rule, pos, limit, parser, buf, true);
                    // Handle segment definitions "(" and ")"
                    // Parse "(", ")"
                    if (segments == null) {
                        segments = new Segments();
                    }
                    segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
                    break;
                case END_OF_RULE:
                    --pos; // Backup to point to END_OF_RULE
                    break main;
                case SymbolTable.SYMBOL_REF:
                    // Handle variable references and segment references "$1" .. "$9"
                    {
@ -697,25 +549,129 @@ class TransliteratorParser {
                        }
                    }
                    break;
                case DOT:
                    buf.append(parser.getDotStandIn());
                    break;
                case KLEENE_STAR:
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
                    // Quantifiers.  We handle single characters, quoted strings,
                    // variable references, and segments.
                    //  a+      matches  aaa
                    //  'foo'+  matches  foofoofoo
                    //  $v+     matches  xyxyxy if $v == xy
                    //  (seg)+  matches  segsegseg
                    {
                        if (isSegment && buf.length() == bufSegStart) {
                            // The */+ immediately follows '('
                            syntaxError("Misplaced quantifier", rule, start);
                            break;
                        } 
                        int qstart, qlimit;
                        // The */+ follows an isolated character or quote
                        // or variable reference
                        if (buf.length() == quoteLimit) {
                            // The */+ follows a 'quoted string'
                            qstart = quoteStart;
                            qlimit = quoteLimit;
                        } else if (buf.length() == varLimit) {
                            // The */+ follows a $variableReference
                            qstart = varStart;
                            qlimit = varLimit;
                        } else {
                            // The */+ follows a single character, possibly
                            // a segment standin
                            qstart = buf.length() - 1;
                            qlimit = qstart + 1;
                        }
                        UnicodeMatcher m =
                            new StringMatcher(buf.toString(), qstart, qlimit,
                                              false, parser.data);
                        int min = 0;
                        int max = Quantifier.MAX;
                        switch (c) {
                        case ONE_OR_MORE:
                            min = 1;
                            break;
                        case ZERO_OR_ONE:
                            min = 0;
                            max = 1;
                            break;
                            // case KLEENE_STAR:
                            //    do nothing -- min, max already set
                        }
                        m = new Quantifier(m, min, max);
                        buf.setLength(qstart);
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
                //------------------------------------------------------
                // Elements allowed ONLY WITHIN segments
                //------------------------------------------------------
                case SEGMENT_CLOSE:
                    if (isSegment) {
                        // We're done parsing a segment.  The relevant
                        // characters are in buf, starting at offset
                        // bufSegStart.  Extract them into a string
                        // matcher, and replace them with a standin
                        // for that matcher.
                        StringMatcher m =
                            new StringMatcher(buf.substring(bufSegStart),
                                              true, parser.data);
                        // Since we call parseSection() recursively,
                        // nested segments will result in segment i+1
                        // getting parsed and stored before segment i;
                        // be careful with the vector handling here.
                        if ((segmentNumber+1) > segments.size()) {
                            segments.setSize(segmentNumber+1);
                        }
                        segments.setElementAt(m, segmentNumber);
                        buf.setLength(bufSegStart);
                        buf.append(parser.generateStandInFor(m));
                        break main;
                    }
                    // If we aren't in a segment, then a segment close
                    // character is a syntax error.
                    syntaxError("Unquoted special", rule, start);
                    break;
                //------------------------------------------------------
                // Elements allowed ONLY OUTSIDE segments
                //------------------------------------------------------
                case CONTEXT_ANTE:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (ante >= 0) {
                        syntaxError("Multiple ante contexts", rule, start);
                    }
                    ante = buf.length();
                    break;
                case CONTEXT_POST:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (post >= 0) {
                        syntaxError("Multiple post contexts", rule, start);
                    }
                    post = buf.length();
                    break;
                case CURSOR_POS:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
                    }
                    cursor = buf.length();
                    break;
                case CURSOR_OFFSET:
                    if (isSegment) {
                        syntaxError("Illegal character '" + c + "' in segment", rule, start);
                    }
                    if (cursorOffset < 0) {
                        if (buf.length() > 0) {
                            syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
                        }
                    }
                    break;
-                case DOT:
+
-                    buf.append(parser.getDotStandIn());
+                //------------------------------------------------------
-                    break;
+                // Non-special characters
-                case KLEENE_STAR:
+                //------------------------------------------------------
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
                    // Quantifiers.  We handle single characters, quoted strings,
                    // variable references, and segments.
                    //  a+      matches  aaa
                    //  'foo'+  matches  foofoofoo
                    //  $v+     matches  xyxyxy if $v == xy
                    //  (seg)+  matches  segsegseg
                    {
                        int qstart, qlimit;
                        boolean[] isOpenParen = new boolean[1];
                        boolean isSegment = false;
                        if (segments != null &&
                            segments.getLastParenOffset(isOpenParen) == buf.length()) {
                            // The */+ immediately follows a segment
                            if (isOpenParen[0]) {
                                syntaxError("Misplaced quantifier", rule, start);
                            }
                            int[] startparam = new int[1];
                            int[] limitparam = new int[1];
                            if (!segments.extractLastParenSubstring(startparam, limitparam)) {
                                syntaxError("Mismatched segment delimiters", rule, start);
                            }
                            qstart = startparam[0];
                            qlimit = limitparam[0];
                            isSegment = true;
                        } else {
                            // The */+ follows an isolated character or quote
                            // or variable reference
                            if (buf.length() == quoteLimit) {
                                // The */+ follows a 'quoted string'
                                qstart = quoteStart;
                                qlimit = quoteLimit;
                            } else if (buf.length() == varLimit) {
                                // The */+ follows a $variableReference
                                qstart = varStart;
                                qlimit = varLimit;
                            } else {
                                // The */+ follows a single character
                                qstart = buf.length() - 1;
                                qlimit = qstart + 1;
                            }
                        }
                        UnicodeMatcher m =
                            new StringMatcher(buf.toString(), qstart, qlimit,
                                              isSegment, parser.data);
                        int min = 0;
                        int max = Quantifier.MAX;
                        switch (c) {
                        case ONE_OR_MORE:
                            min = 1;
                            break;
                        case ZERO_OR_ONE:
                            min = 0;
                            max = 1;
                            break;
                            // case KLEENE_STAR:
                            //    do nothing -- min, max already set
                        }
                        m = new Quantifier(m, min, max);
                        buf.setLength(qstart);
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
                    break;
                }
            }
            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
                syntaxError("Misplaced " + CURSOR_POS, rule, start);
            }
            text = buf.toString();
            return pos;
        }
@ -838,10 +725,12 @@ class TransliteratorParser {
        }
        /**
-         * Create and return an int[] array of segments.
+         * Create and return a UnicodeMatcher[] array of segments,
         * or null if there are no segments.
         */
-        int[] createSegments() {
+        UnicodeMatcher[] createSegments() {
-            return (segments == null) ? null : segments.createArray();
+            return (segments.size() == 0) ? null :
                (UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
        }
    }
@ -1096,9 +985,10 @@ class TransliteratorParser {
        pos = left.parse(rule, pos, limit, this);
        if (pos == limit ||
-            OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
+            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
-            syntaxError("No operator", rule, start);
+            syntaxError("No operator pos=" + pos, rule, start);
        }
        ++pos;
        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
        pos = right.parse(rule, pos, limit, this);
        if (pos < limit) {
-            if (rule.charAt(pos) == END_OF_RULE) {
+            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
-            right.segments = null;
+            right.segments.removeAllElements();
            left.cursor = left.maxRef = -1;
            left.cursorOffset = 0;
        }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
-            right.segments != null || left.maxRef >= 0 ||
+            right.segments.size() > 0 || left.maxRef >= 0 ||
            (right.cursorOffset != 0 && right.cursor < 0) ||
            // - The following two checks were used to ensure that the
            // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
        // Check integrity of segments and segment references.  Each
        // segment's start must have a corresponding limit, and the
        // references must not refer to segments that do not exist.
-        if (left.segments != null) {
+        if (right.maxRef > left.segments.size()) {
-            if (!left.segments.validate()) {
+            syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
                syntaxError("Missing segment close", rule, start);
            }
            int n = left.segments.count();
            if (right.maxRef > n) {
                syntaxError("Undefined segment reference", rule, start);
            }
        }
        data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
    char generateStandInFor(UnicodeMatcher matcher) {
        // assert(matcher != null);
        if (variableNext >= variableLimit) {
-            throw new RuntimeException("Private use variables exhausted");
+            throw new RuntimeException("Variable range exhausted");
        }
        variablesVector.addElement(matcher);
        return variableNext++;