From 5d8012a121f0d695aac4be3d5030e90c0c6b7be6 Mon Sep 17 00:00:00 2001
From: Alan Liu <alansliu@gmail.com>
Date: Wed, 26 Sep 2001 18:04:13 +0000
Subject: [PATCH] ICU-1243 sync parsers in icu4j, icu4c, esp. segment code

X-SVN-Rev: 5930
---
 icu4c/source/i18n/rbt_pars.cpp |   2 +-
 icu4c/source/i18n/rbt_rule.cpp | 123 ++++++++++++++++++---------------
 icu4c/source/i18n/rbt_rule.h   |   4 +-
 icu4c/source/i18n/translit.cpp |   5 +-
 4 files changed, 74 insertions(+), 60 deletions(-)

diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp
index c46b51220e..a3045cd886 100644
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@@ -597,7 +597,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                 int32_t r = u_charDigitValue(c);
                 if (r >= 1 && r <= 9) {
                     ++pos;
-                    for (;;) {
+                    while (pos < limit) {
                         c = rule.charAt(pos);
                         int32_t d = u_charDigitValue(c);
                         if (d < 0) {
diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp
index f6ea63c941..66595b8e89 100644
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@@ -26,17 +26,17 @@ static const UChar BACKSLASH  = 0x005C; // '\'
 #define MAX_STATIC_SEGS 20
 
 // Macros for accessing the array of integers encoding the position of
-// the segments.  See rbt_pars.cpp::Segments for more details.
 // SEGMENTS_COUNT number of segments, n (half the number of parens)
 // SEGMENTS_LEN   length of the segments array (number of elements)
-// SEGMENTS_POS   position of parenthesis i, where i=0..2n-1
+// SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
 // SEGMENTS_NUM   index into segments to access POS of $1.open,
 //                $1.close, $2.open, $2.close,.., $n.open, $n.close
+//                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
 #define FIRST_SEG_POS_INDEX 2
 #define SEGMENTS_COUNT(x) x[0]
 #define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
 #define SEGMENTS_POS(x,i) x[FIRST_SEG_POS_INDEX+i]
-#define SEGMENTS_NUM(x,i) x[x[1]+i]
+#define SEGMENTS_NUM(x,i) (x[x[1]+i]-FIRST_SEG_POS_INDEX)
 
 /**
  * Construct a new rule with the given input, output text, and other
@@ -126,6 +126,7 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
                segments[firstKeySeg] < anteContextLength) {
             ++firstKeySeg;
         }
+        firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
     }
 
     pattern = input;
@@ -341,13 +342,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
     // Record the actual positions, in the text, of the segments.
 	// These are recorded in the order that they occur in the pattern.
 
+    // segPos[] is an array of 2*SEGMENTS_COUNT elements.  It
+    // records the position in 'text' of each segment boundary, in
+    // the order that they occur in 'pattern'.
     int32_t _segPos[2*MAX_STATIC_SEGS];
     int32_t *segPos = _segPos;
     if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
         segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
     }
+    // iSeg is an index into the first array stored in segments[]
+    // (the parenthesis positions), relative to FIRST_SEG_POS_INDEX.
+    // Valid values range from 0 to SEGMENTS_COUNT*2 - 1; the element
+    // it refers to is segments[FIRST_SEG_POS_INDEX + iSeg].
     int32_t iSeg = firstKeySeg - 1;
-    int32_t nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
+    // nextSegPos is an offset in 'pattern'.  When the pattern offset
+    // (oPattern) equals nextSegPos, we are at a segment boundary, and
+    // we record the position in the real text in segPos[].
+    int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
 
     UMatchDegree m;
     int32_t lenDelta, keyLimit;
@@ -357,49 +368,49 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
     // A mismatch in the ante context, or with the start anchor,
     // is an outright U_MISMATCH regardless of whether we are
     // incremental or not.
-    int32_t cursor;
+    int32_t oText; // offset into 'text'
     int32_t newStart = 0;
-    int32_t minCursor;
-    int32_t i;
+    int32_t minOText;
+    int32_t oPattern; // offset into 'pattern'
 
-    // Backup cursor by one
-    cursor = posBefore(text, pos.start);
+    // Backup oText by one
+    oText = posBefore(text, pos.start);
 
-    for (i=anteContextLength-1; i>=0; --i) {
-        UChar keyChar = pattern.charAt(i);
+    for (oPattern=anteContextLength-1; oPattern>=0; --oPattern) {
+        UChar keyChar = pattern.charAt(oPattern);
         const UnicodeMatcher* matcher = data->lookup(keyChar);
         if (matcher == 0) {
-            if (cursor >= pos.contextStart &&
-                keyChar == text.charAt(cursor)) {
-                --cursor;
+            if (oText >= pos.contextStart &&
+                keyChar == text.charAt(oText)) {
+                --oText;
             } else {
                 m = U_MISMATCH;
                 goto exit;
             }
         } else {
             // Subtract 1 from contextStart to make it a reverse limit
-            if (matcher->matches(text, cursor, pos.contextStart-1, FALSE)
+            if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
                 != U_MATCH) {
                 m = U_MISMATCH;
                 goto exit;
             }
         }
-        while (nextSegPos == i) {
-            segPos[iSeg] = cursor;
-            if (cursor >= 0) {
-                segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(cursor));
+        while (nextSegPos == oPattern) {
+            segPos[iSeg] = oText;
+            if (oText >= 0) {
+                segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
             } else {
                 ++segPos[iSeg];
             }
-            nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[iSeg] : -1;
+            nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
         }
     }
 
-    minCursor = posAfter(text, cursor);
+    minOText = posAfter(text, oText);
 
     // ------------------------ Start Anchor ------------------------
 
-    if ((flags & ANCHOR_START) && cursor != posBefore(text, pos.contextStart)) {
+    if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
         m = U_MISMATCH;
         goto exit;
     }
@@ -407,63 +418,63 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
     // -------------------- Key and Post Context --------------------
 
     iSeg = firstKeySeg;
-    nextSegPos = (iSeg >= FIRST_SEG_POS_INDEX) ? (segments[iSeg] - anteContextLength) : -1;
+    nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
 
-    i = 0;
-    cursor = pos.start;
+    oPattern = 0;
+    oText = pos.start;
     keyLimit = 0;
-    while (i < (pattern.length() - anteContextLength)) {
-        if (incremental && cursor == pos.contextLimit) {
+    while (oPattern < (pattern.length() - anteContextLength)) {
+        if (incremental && oText == pos.contextLimit) {
             // We've reached the context limit without a mismatch and
             // without completing our match.
             m = U_PARTIAL_MATCH;
             goto exit;
         }
-        if (cursor == pos.limit && i < keyLength) {
+        if (oText == pos.limit && oPattern < keyLength) {
             // We're still in the pattern key but we're entering the
             // post context.
             m = U_MISMATCH;
             goto exit;
         }
-        while (i == nextSegPos) {
-            segPos[iSeg] = cursor;
-            nextSegPos = segments[++iSeg] - anteContextLength;
+        while (oPattern == nextSegPos) {
+            segPos[iSeg] = oText;
+            nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
         }
-        if (i == keyLength) {
-            keyLimit = cursor;
+        if (oPattern == keyLength) {
+            keyLimit = oText;
         }
-        UChar keyChar = pattern.charAt(anteContextLength + i++);
+        UChar keyChar = pattern.charAt(anteContextLength + oPattern++);
         const UnicodeMatcher* matcher = data->lookup(keyChar);
         if (matcher == 0) {
-            // Don't need the cursor < pos.contextLimit check if
+            // Don't need the oText < pos.contextLimit check if
             // incremental is TRUE (because it's done above); do need
             // it otherwise.
-            if (cursor < pos.contextLimit &&
-                keyChar == text.charAt(cursor)) {
-                ++cursor;
+            if (oText < pos.contextLimit &&
+                keyChar == text.charAt(oText)) {
+                ++oText;
             } else {
                 m = U_MISMATCH;
                 goto exit;
             }
         } else {
-            m = matcher->matches(text, cursor, pos.contextLimit, incremental);
+            m = matcher->matches(text, oText, pos.contextLimit, incremental);
             if (m != U_MATCH) {
                 goto exit;
             }
         }
     }
-    while (i == nextSegPos) {
-        segPos[iSeg] = cursor;
-        nextSegPos = segments[++iSeg] - anteContextLength;
+    while (oPattern == nextSegPos) {
+        segPos[iSeg] = oText;
+        nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
     }
-	if (i == keyLength) {
-		keyLimit = cursor;
+	if (oPattern == keyLength) {
+		keyLimit = oText;
 	}
 
     // ------------------------- Stop Anchor ------------------------
 
     if ((flags & ANCHOR_END) != 0) {
-        if (cursor != pos.contextLimit) {
+        if (oText != pos.contextLimit) {
             return U_MISMATCH;
         }
         if (incremental) {
@@ -508,12 +519,13 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
          */
         int32_t dest = keyLimit; // copy new text to here
         UnicodeString buf;
-        for (i=0; i<output.length(); ) {
-            if (i == cursorPos) {
+        int oOutput; // offset into 'output'
+        for (oOutput=0; oOutput<output.length(); ) {
+            if (oOutput == cursorPos) {
                 // Record the position of the cursor
                 newStart = dest - (keyLimit - pos.start);
             }
-            UChar32 c = output.char32At(i);
+            UChar32 c = output.char32At(oOutput);
             int32_t b = data->lookupSegmentReference(c);
             if (b < 0) {
                 // Accumulate straight (non-segment) text.
@@ -532,14 +544,14 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                 text.copy(start, limit, dest);
                 dest += limit - start;
             }
-            i += UTF_CHAR_LENGTH(c);
+            oOutput += UTF_CHAR_LENGTH(c);
         }
         // Insert any accumulated straight text.
         if (buf.length() > 0) {
             text.handleReplaceBetween(dest, dest, buf);
             dest += buf.length();
         }
-        if (i == cursorPos) {
+        if (oOutput == cursorPos) {
             // Record the position of the cursor
             newStart = dest - (keyLimit - pos.start);
         }
@@ -559,11 +571,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
         }
     }
     
-    cursor += lenDelta;
+    oText += lenDelta;
     pos.limit += lenDelta;
     pos.contextLimit += lenDelta;
-    // Restrict new value of start to [minCursor, min(cursor, pos.limit)].
-    pos.start = uprv_max(minCursor, uprv_min(uprv_min(cursor, pos.limit), newStart));
+    // Restrict new value of start to [minOText, min(oText, pos.limit)].
+    pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
     m = U_MATCH;
     
   exit:
@@ -691,6 +703,7 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
                                            UBool escapeUnprintable) const {
     int32_t i;
 
+    // iseg indexes into segments[] directly (not offset from FIRST_SEG_POS_INDEX)
     int32_t iseg = FIRST_SEG_POS_INDEX-1;
     int32_t nextSeg = -1;
     // Build an array of booleans specifying open vs. close paren
@@ -701,8 +714,8 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
             isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
         }
         for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
-            isOpen[SEGMENTS_NUM(segments,i)  -FIRST_SEG_POS_INDEX] = TRUE;
-            isOpen[SEGMENTS_NUM(segments,i+1)-FIRST_SEG_POS_INDEX] = FALSE;
+            isOpen[SEGMENTS_NUM(segments,i)  ] = TRUE;
+            isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
         }
         nextSeg = segments[++iseg];
     }
diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h
index ff82d37a7f..f1edfe2d43 100644
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@@ -72,7 +72,9 @@ private:
      * A value we compute from segments.  The first index into segments[]
      * that is >= anteContextLength.  That is, the first one that is within
      * the forward scanned part of the pattern -- the key or the postContext.
-     * If there are no segments, this has the value -1.
+     * If there are no segments, this has the value -1.  This index is relative
+     * to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
+     * segments[FIRST_SEG_POS_INDEX + firstKeySeg].
      */
     int32_t firstKeySeg;
 
diff --git a/icu4c/source/i18n/translit.cpp b/icu4c/source/i18n/translit.cpp
index 19e4284b17..24c98881d6 100644
--- a/icu4c/source/i18n/translit.cpp
+++ b/icu4c/source/i18n/translit.cpp
@@ -953,9 +953,8 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
     skipSpaces(ID, limit);
     sawDelimiter = (limit < ID.length() && ID.charAt(limit) == ID_DELIM);
     if (sawDelimiter) {
-        ++limit;
-    }
-    skipSpaces(ID, limit);
+        skipSpaces(ID, ++limit);
+   }
 
     if (!create) {
         // TODO Improve performance by scanning the UnicodeSet pattern