ICU-1076 allow unlimited @ operator; make @ increment by code points forward or backward; pin cursor to beginning of antecontext and limit

X-SVN-Rev: 5440
2001-08-04 00:15:12 +00:00 · 2001-08-04 00:15:12 +00:00 · 264c1f3bcc
commit 264c1f3bcc
parent c427ceba1c
3 changed files with 70 additions and 110 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -1091,8 +1091,12 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
    if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
        right->segments != NULL || left->maxRef >= 0 ||
        (right->cursorOffset != 0 && right->cursor < 0) ||
-        (right->cursorOffset > (left->text.length() - left->post)) ||
-        (-right->cursorOffset > left->ante) ||
+        // - The following two checks were used to ensure that
+        // - the cursor offset stayed within the ante- or postcontext.
+        // - However, with the addition of quantifiers, we have to
+        // - allow arbitrary cursor offsets and do runtime checking.
+        //(right->cursorOffset > (left->text.length() - left->post)) ||
+        //(-right->cursorOffset > left->ante) ||
        right->anchorStart || right->anchorEnd) {

        return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rule, start);
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -68,67 +68,7 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
                                         const TransliterationRuleData& theData,
                                         UErrorCode& status) :
    data(theData) {
-    init(input, anteContextPos, postContextPos,
-         outputStr, cursorPosition, cursorOffset, adoptedSegs,
-         anchorStart, anchorEnd, status);
-}

-/**
- * Construct a new rule with the given input, output text, and other
- * attributes.  A cursor position may be specified for the output text.
- * @param input input string, including key and optional ante and
- * post context
- * @param anteContextPos offset into input to end of ante context, or -1 if
- * none.  Must be <= input.length() if not -1.
- * @param postContextPos offset into input to start of post context, or -1
- * if none.  Must be <= input.length() if not -1, and must be >=
- * anteContextPos.
- * @param output output string
- * @param cursorPosition offset into output at which cursor is located, or -1 if
- * none.  If less than zero, then the cursor is placed after the
- * <code>output</code>; that is, -1 is equivalent to
- * <code>output.length()</code>.  If greater than
- * <code>output.length()</code> then an exception is thrown.
- */
-TransliterationRule::TransliterationRule(const UnicodeString& input,
-                                         int32_t anteContextPos, int32_t postContextPos,
-                                         const UnicodeString& outputStr,
-                                         int32_t cursorPosition,
-                                         const TransliterationRuleData& theData,
-                                         UErrorCode& status) :
-    data(theData) {
-    init(input, anteContextPos, postContextPos,
-         outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status);
-}
-
-/**
- * Copy constructor.
- */
-TransliterationRule::TransliterationRule(TransliterationRule& other) :
-    pattern(other.pattern),
-    output(other.output),
-    anteContextLength(other.anteContextLength),
-    keyLength(other.keyLength),
-    cursorPos(other.cursorPos),
-    flags(other.flags),
-    firstKeySeg(other.firstKeySeg),
-    data(other.data) {
-
-    segments = 0;
-    if (other.segments != 0) {
-        int32_t len = SEGMENTS_LEN;
-        segments = new int32_t[len];
-        uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
-    }
-}
-
-void TransliterationRule::init(const UnicodeString& input,
-                               int32_t anteContextPos, int32_t postContextPos,
-                               const UnicodeString& outputStr,
-                               int32_t cursorPosition, int32_t cursorOffset,
-                               int32_t* adoptedSegs,
-                               UBool anchorStart, UBool anchorEnd,
-                               UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }
@ -193,6 +133,27 @@ void TransliterationRule::init(const UnicodeString& input,
    }
 }

+/**
+ * Copy constructor.
+ */
+TransliterationRule::TransliterationRule(TransliterationRule& other) :
+    pattern(other.pattern),
+    output(other.output),
+    anteContextLength(other.anteContextLength),
+    keyLength(other.keyLength),
+    cursorPos(other.cursorPos),
+    flags(other.flags),
+    firstKeySeg(other.firstKeySeg),
+    data(other.data) {
+
+    segments = 0;
+    if (other.segments != 0) {
+        int32_t len = SEGMENTS_LEN;
+        segments = new int32_t[len];
+        uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
+    }
+}
+
 TransliterationRule::~TransliterationRule() {
    delete[] segments;
 }
@ -326,6 +287,18 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
        0 == r2.pattern.compare(left2 - left, len, pattern);
 }

+inline int32_t posBefore(const Replaceable& str, int32_t pos) {
+    return (pos > 0) ?
+        pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) :
+        pos - 1;
+}
+
+inline int32_t posAfter(const Replaceable& str, int32_t pos) {
+    return (pos < str.length()) ?
+        pos + UTF_CHAR_LENGTH(str.char32At(pos)) :
+        pos + 1;
+}
+
 /**
 * Attempt a match and replacement at the given position.  Return
 * the degree of match between this rule and the given text.  The
@ -385,16 +358,13 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // A mismatch in the ante context, or with the start anchor,
    // is an outright U_MISMATCH regardless of whether we are
    // incremental or not.
-    int32_t cursor = pos.start;
+    int32_t cursor;
    int32_t newStart = 0;
+    int32_t minCursor;
    int32_t i;

    // Backup cursor by one
-    if (cursor > 0) {
-        cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1));
-    } else {
-        --cursor;
-    }
+    cursor = posBefore(text, pos.start);

    for (i=anteContextLength-1; i>=0; --i) {
        UChar keyChar = pattern.charAt(i);
@ -415,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                goto exit;
            }
        }
-        if (cursorPos == (i - anteContextLength)) {
-            // Record the position of the cursor
-            newStart = cursor;
-        }
        while (nextSegPos == i) {
            segPos[iSeg] = cursor;
            if (cursor >= 0) {
@ -430,9 +396,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        }
    }

+    minCursor = posAfter(text, cursor);
+
    // ------------------------ Start Anchor ------------------------

-    if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
+    if ((flags & ANCHOR_START) && cursor != posBefore(text, pos.contextStart)) {
        m = U_MISMATCH;
        goto exit;
    }
@ -513,8 +481,18 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    if (segments == NULL) {
        text.handleReplaceBetween(pos.start, keyLimit, output);
        lenDelta = output.length() - (keyLimit - pos.start);
-        if (cursorPos >= 0) {
-            newStart = pos.start + cursorPos;
+        newStart = pos.start;
+        int32_t n = cursorPos;
+        // cursorPos counts 16-bit code units
+        while (n > 0) {
+            int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart));
+            n -= l;
+            newStart += l;
+        }
+        while (n < 0) {
+            int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart-1));
+            n += l;
+            newStart -= l;
        }
    } else {
        /* When there are segments to be copied, use the Replaceable.copy()
@ -567,11 +545,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        buf.remove();
        text.handleReplaceBetween(pos.start, keyLimit, buf);
        lenDelta = dest - keyLimit - (keyLimit - pos.start);
+        // Handle cursor in postContext
+        if (cursorPos > output.length()) {
+            newStart = pos.start + (dest - keyLimit);
+            int32_t n = cursorPos - output.length();
+            // cursorPos counts 16-bit code units
+            while (n > 0) {
+                int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart));
+                n -= l;
+                newStart += l;
+            }
+        }
    }
    
    pos.limit += lenDelta;
    pos.contextLimit += lenDelta;
-    pos.start = newStart;
+    // Restrict new value of start to [minCursor, pos.limit].
+    pos.start = uprv_max(minCursor, uprv_min(pos.limit, newStart));
    m = U_MATCH;
    
  exit:
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -158,30 +158,6 @@ public:
                        const TransliterationRuleData& data,
                        UErrorCode& status);

-    /**
-     * Construct a new rule with the given input, output text, and other
-     * attributes.  A cursor position may be specified for the output text.
-     * @param input input string, including key and optional ante and
-     * post context
-     * @param anteContextPos offset into input to end of ante context, or -1 if
-     * none.  Must be <= input.length() if not -1.
-     * @param postContextPos offset into input to start of post context, or -1
-     * if none.  Must be <= input.length() if not -1, and must be >=
-     * anteContextPos.
-     * @param output output string
-     * @param cursorPosition offset into output at which cursor is located, or -1 if
-     * none.  If less than zero, then the cursor is placed after the
-     * <code>output</code>; that is, -1 is equivalent to
-     * <code>output.length()</code>.  If greater than
-     * <code>output.length()</code> then an exception is thrown.
-     */
-    TransliterationRule(const UnicodeString& input,
-                        int32_t anteContextPos, int32_t postContextPos,
-                        const UnicodeString& outputStr,
-                        int32_t cursorPosition,
-                        const TransliterationRuleData& data,
-                        UErrorCode& status);
-
    /**
     * Copy constructor.
     */
@ -268,16 +244,6 @@ public:
     */
    virtual UnicodeString& toRule(UnicodeString& pat,
                                  UBool escapeUnprintable) const;
- private:
-
-    void init(const UnicodeString& input,
-              int32_t anteContextPos, int32_t postContextPos,
-              const UnicodeString& output,
-              int32_t cursorPos, int32_t cursorOffset,
-              int32_t* adoptedSegs,
-              UBool anchorStart, UBool anchorEnd,
-              UErrorCode& status);
-
 private:

    friend class StringMatcher;