ICU-199 rewrite UnicodeSet pattern parser

X-SVN-Rev: 547
2000-01-12 18:03:53 +00:00 · 2000-01-12 18:03:53 +00:00 · 13955b6ce2
commit 13955b6ce2
parent b06425886b
2 changed files with 257 additions and 200 deletions
--- a/icu4c/source/i18n/unicode/uniset.h
+++ b/icu4c/source/i18n/unicode/uniset.h
@ -14,6 +14,7 @@
 #include "unicode/unistr.h"

 class ParsePosition;
+class TransliterationRuleData;

 /**
 * A mutable set of Unicode characters.  Objects of this class
@ -203,6 +204,24 @@ class U_I18N_API UnicodeSet {
     */
    static UnicodeString* CATEGORY_PAIRS_CACHE;

+    /**
+     * Delimiter string used in patterns to close a category reference:
+     * ":]".  Example: "[:Lu:]".
+     */
+    static const UnicodeString CATEGORY_CLOSE;
+
+    /**
+     * Delimiter char beginning a variable reference:
+     * "{".  Example: "{var}".
+     */
+    static const UChar VARIABLE_REF_OPEN;
+    
+    /**
+     * Delimiter char ending a variable reference:
+     * "}".  Example: "{var}".
+     */
+    static const UChar VARIABLE_REF_CLOSE;
+
    //----------------------------------------------------------------
    // Debugging and testing
    //----------------------------------------------------------------
@ -503,6 +522,7 @@ private:
    static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
                                const UnicodeString& pattern,
                                ParsePosition& pos,
+                                const TransliterationRuleData* data,
                                UErrorCode& status);

    //----------------------------------------------------------------
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -10,6 +10,7 @@

 #include "unicode/uniset.h"
 #include "unicode/parsepos.h"
+#include "rbt_data.h"

 // N.B.: This mapping is different in ICU and Java
 const UnicodeString UnicodeSet::CATEGORY_NAMES(
@ -23,6 +24,24 @@ const UnicodeString UnicodeSet::CATEGORY_NAMES(
 UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
     new UnicodeString[Unicode::GENERAL_TYPES_COUNT];

+/**
+ * Delimiter string used in patterns to close a category reference:
+ * ":]".  Example: "[:Lu:]".
+ */
+const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
+
+/**
+ * Delimiter char beginning a variable reference:
+ * "{".  Example: "{var}".
+ */
+const UChar UnicodeSet::VARIABLE_REF_OPEN = '{';
+
+/**
+ * Delimiter char ending a variable reference:
+ * "}".  Example: "{var}".
+ */
+const UChar UnicodeSet::VARIABLE_REF_CLOSE = '}';
+
 //----------------------------------------------------------------
 // Debugging and testing
 //----------------------------------------------------------------
@ -175,7 +194,14 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
        }
    }

-    parse(pairs, *pat, pos, status);
+    parse(pairs, *pat, pos, NULL, status);
+
+    // Skip over trailing whitespace -- clean up later
+    while (pos.getIndex() < pat->length() &&
+           Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
+        pos.setIndex(pos.getIndex() + 1);
+    }
+
    if (pos.getIndex() != pat->length()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
@ -418,6 +444,7 @@ void UnicodeSet::clear(void) {
 UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                                 const UnicodeString& pattern,
                                 ParsePosition& pos,
+                                 const TransliterationRuleData* data,
                                 UErrorCode& status) {
    if (U_FAILURE(status)) {
        return pairsBuf;
@ -426,34 +453,96 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
    bool_t invert = FALSE;
    pairsBuf.remove();

-    /**
-     * Nodes:  0 - idle, waiting for '['
-     *        10 - like 11, but immediately after "[" or "[^"
-     *        11 - awaiting x, "]", "[...]", or "[:...:]"
-     *        21 - after x
-     *        23 - after x-
-     * 
-     * The parsing state machine moves from node 0 through zero or more
-     * other nodes back to node 0, in a U_SUCCESSful parse.
+    int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
+    UChar lastOp = 0;
+
+    /* This loop iterates over the characters in the pattern.  We start at
+     * the position specified by pos.  We exit the loop when either a
+     * matching closing ']' is seen, or we read all characters of the
+     * pattern.  In the latter case an error will be thrown.
     */
-    int32_t node = 0;
-    UChar first = 0;
-    int32_t i;
-    /**
-     * This loop iterates over the characters in the pattern.  We
-     * start at the position specified by pos.  We exit the loop
-     * when either a matching closing ']' is seen, or we read all
-     * characters of the pattern.
+
+    /* Pattern syntax:
+     *  pat := '[' '^'? elem* ']'
+     *  elem := a | a '-' a | set | set op set
+     *  set := pat | (a set variable)
+     *  op := '&' | '-'
+     *  a := (a character, possibly defined by a var)
     */
-    for (i=pos.getIndex(); i<pattern.length(); ++i) {
-        UChar c = pattern.charAt(i);            /**
-         * Handle escapes here.  If a character is escaped, then
-         * it assumes its literal value.  This is true for all
-         * characters, both special characters and characters with
-         * no special meaning.  We also interpret '\\uxxxx' Unicode
-         * escapes here.
+
+    // mode 0: No chars parsed yet; next must be '['
+    // mode 1: '[' seen; if next is '^' or ':' then special
+    // mode 2: '[' '^'? seen; parse pattern and close with ']'
+    // mode 3: '[:' seen; parse category and close with ':]'
+    int8_t mode = 0;
+    int32_t openPos = 0; // offset to opening '['
+    int32_t i = pos.getIndex();
+    int32_t limit = pattern.length();
+    UnicodeString nestedAux;
+    UnicodeString* nestedPairs;
+    UnicodeString scratch;
+    for (; i<limit; ++i) {
+        /* If the next element is a single character, c will be set to it,
+         * and nestedPairs will be null.  In this case isLiteral indicates
+         * whether the character should assume special meaning if it has
+         * one.  If the next element is a nested set, either via a variable
+         * reference, or via an embedded "[..]"  or "[:..:]" pattern, then
+         * nestedPairs will be set to the pairs list for the nested set, and
+         * c's value should be ignored.
         */
+        UChar c = pattern.charAt(i);
+        nestedPairs = NULL;
        bool_t isLiteral = FALSE;
+
+        // Ignore whitespace.  This is not Unicode whitespace, but Java
+        // whitespace, a subset of Unicode whitespace.
+        if (Unicode::isWhitespace(c)) {
+            continue;
+        }
+
+        // Parse the opening '[' and optional following '^'
+        switch (mode) {
+        case 0:
+            if (c == '[') {
+                mode = 1; // Next look for '^'
+                openPos = i;
+                continue;
+            } else {
+                // throw new IllegalArgumentException("Missing opening '['");
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+                return pairsBuf;
+            }
+        case 1:
+            mode = 2;
+            switch (c) {
+            case '^':
+                invert = TRUE;
+                continue; // Back to top to fetch next character
+            case ':':
+                if (i == openPos+1) {
+                    // '[:' cannot have whitespace in it
+                    --i;
+                    c = '[';
+                    mode = 3;
+                    // Fall through and parse category normally
+                }
+                break; // Fall through
+            case '-':
+                isLiteral = TRUE; // Treat leading '-' as a literal
+                break; // Fall through
+            }
+            // else fall through and parse this character normally
+        }
+
+        // After opening matter is parsed ("[", "[^", or "[:"), the mode
+        // will be 2 if we want a closing ']', or 3 if we should parse a
+        // category and close with ":]".
+
+        /* Handle escapes.  If a character is escaped, then it assumes its
+         * literal value.  This is true for all characters, both special
+         * characters and characters with no special meaning.  We also
+         * interpret '\\uxxxx' Unicode escapes here (as literals).
+         */
        if (c == '\\') {
            ++i;
            if (i < pattern.length()) {
@ -480,216 +569,164 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                return pairsBuf;
            }
        }
-        /**
-         * Within this loop, we handle each of the four
-         * conditions: '[', ']', '-', other.  The first three
-         * characters must not be escaped.
-         */

-        /**
-         * An opening bracket indicates either the first bracket
-         * of the entire subpattern we are parsing, in which case
-         * we are in node 0 and move into node 10.  We also check
-         * for an immediately following '^', indicating the
-         * complement of the following pattern.  ('^' is any other
-         * position has no special meaning.)  If we are not in
-         * node 0, '[' represents a nested subpattern that must be
-         * recursively parsed and checked for following operators
-         * ('&' or '|').  If two nested subpatterns follow one
-         * another with no operator, their union is formed, just
-         * as with any other elements that follow one another
-         * without intervening operator.  The other thing we
-         * handle here is the syntax "[:Xx:]" or "[:X:]" that
-         * indicates a Unicode category or supercategory.
+        /* Parse variable references.  These are treated as literals.  If a
+         * variable refers to a UnicodeSet, nestedPairs is assigned here.
+         * Variable names are only parsed if varNameToChar is not null.
+         * Set variables are only looked up if varCharToSet is not null.
         */
-        if (!isLiteral && c == '[') {
-            bool_t parseOp = FALSE;
+        else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
+            ++i;
+            int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
+            if (i == j || j < 0) { // empty or unterminated
+                // throw new IllegalArgumentException("Illegal variable reference");
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+            } else {
+                scratch.truncate(0);
+                pattern.extractBetween(i, j, scratch);
+                ++j;
+                c = data->lookupVariable(scratch, status);
+            }
+            if (U_FAILURE(status)) {
+                // Either the reference was ill-formed (empty name, or no
+                // closing '}', or the specified name is not defined.
+                return pairsBuf;
+            }
+            isLiteral = TRUE;
+
+            UnicodeSet* set = data->lookupSet(c);
+            if (set != NULL) {
+                nestedPairs = &set->pairs;
+            }
+        }
+
+        /* An opening bracket indicates the first bracket of a nested
+         * subpattern, either a normal pattern or a category pattern.  We
+         * recognize these here and set nestedPairs accordingly.
+         */
+        else if (!isLiteral && c == '[') {
+            // Handle "[:...:]", representing a character category
            UChar d = charAfter(pattern, i);
-            // "[:...:]" represents a character category
            if (d == ':') {
-                if (node == 23) {
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
-                    return pairsBuf;
-                }
-                if (node == 21) {
-                    addPair(pairsBuf, first, first);
-                    node = 11;
-                }
                i += 2;
-                int32_t j = pattern.indexOf(":]", i);
+                int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
                if (j < 0) {
+                    // throw new IllegalArgumentException("Missing \":]\"");
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                    return pairsBuf;
                }
-                UnicodeString categoryName;
-                pattern.extract(i, j-i, categoryName);
-                UnicodeString temp;
-                doUnion(pairsBuf,
-                        getCategoryPairs(temp, categoryName, status));
+                scratch.truncate(0);
+                pattern.extractBetween(i, j, scratch);
+                nestedPairs = &getCategoryPairs(nestedAux, scratch, status);
                if (U_FAILURE(status)) {
                    return pairsBuf;
                }
-                i = j+1;
-                if (node == 10) {
-                    node = 11;
-                    parseOp = TRUE;
-                } else if (node == 0) {
+                i = j+1; // Make i point to ']'
+                if (mode == 3) {
+                    // Entire pattern is a category; leave parse loop
+                    pairsBuf.append(*nestedPairs);
                    break;
                }
            } else {
-                if (node == 0) {
-                    node = 10;
-                    if (d == '^') {
-                        invert = TRUE;
-                        ++i;
-                    }
-                } else {
-                    // Nested '['
-                    pos.setIndex(i);
-                    UnicodeString subPairs; // Pairs for the nested []
-                    doUnion(pairsBuf, parse(subPairs, pattern, pos, status));
-                    if (U_FAILURE(status)) {
-                        return pairsBuf;
-                    }
-                    i = pos.getIndex() - 1; // Subtract 1 to point at ']'
-                    parseOp = TRUE;
+                // Recurse to get the pairs for this nested set.
+                pos.setIndex(i);
+                nestedPairs = &parse(nestedAux, pattern, pos, data, status);
+                if (U_FAILURE(status)) {
+                    return pairsBuf;
                }
-            }
-            /**
-             * parseOp is true after "[:...:]" or a nested
-             * "[...]".  It is false only after the final closing
-             * ']'.  If parseOp is true, we look past the closing
-             * ']' to see if we have an operator character.  If
-             * so, we parse the subsequent "[...]" recursively,
-             * then perform the operation.  We do this in a loop
-             * until there are no more operators.  Note that this
-             * means the operators have equal precedence and are
-             * bound left-to-right.
-             */
-            if (parseOp) {
-                for (;;) {
-                    // Is the next character an operator?
-                    UChar op = charAfter(pattern, i);
-                    if (op == '-' || op == '&') {
-                        pos.setIndex(i+2); // Add 2 to point AFTER op
-                        UnicodeString rhs;
-                        parse(rhs, pattern, pos, status);
-                        if (U_FAILURE(status)) {
-                            return pairsBuf;
-                        }
-                        if (op == '-') {
-                            doDifference(pairsBuf, rhs);
-                        } else if (op == '&') {
-                            doIntersection(pairsBuf, rhs);
-                        }
-                        i = pos.getIndex() - 1; // - 1 to point at ']'
-                    } else {
-                        break;
-                    }
-                }
-            }          
-        }
-        /**
-         * A closing bracket can only be a closing bracket for
-         * "[...]", since the closing bracket for "[:...:]" is
-         * taken care of when the initial "[:" is seen.  When we
-         * see a closing bracket, we then know, if we were in node
-         * 21 (after x) or 23 (after x-) that nothing more is
-         * coming, and we add the last character(s) we saw to the
-         * set.  Note that a trailing '-' assumes its literal
-         * meaning, just as a leading '-' after "[" or "[^".
-         */
-        else if (!isLiteral && c == ']') {
-            if (node == 0) {
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return pairsBuf;
-            }
-            if (node == 21 || node == 23) {
-                addPair(pairsBuf, first, first);
-                if (node == 23) {
-                    addPair(pairsBuf, '-', '-');
-                }
-            }
-            node = 0;
-            break;
-        }
-        /**
-         * '-' has the following interpretations: 1. Within
-         * "[...]", between two letters, it indicates a range.
-         * 2. Between two nested bracket patterns, "[[...]-[...]",
-         * it indicates asymmetric difference.  3. At the start of
-         * a bracket pattern, "[-...]", "[^-...]", it indicates
-         * the literal character '-'.  4. At the end of a bracket
-         * pattern, "[...-]", it indicates the literal character
-         * '-'.
-         *
-         * We handle cases 1 and 3 here.  Cases 2 and 4 are
-         * handled in the ']' parsing code.
-         */
-        else if (!isLiteral && c == '-') {
-            if (node == 10) {
-                addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]"
-            } else if (node == 21) {
-                node = 23;
-            } else {
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return pairsBuf;
+                i = pos.getIndex() - 1; // - 1 to point at ']'
            }
        }
-        /**
-         * If we fall through to this point, we have a literal
-         * character, either one that has been escaped with a
-         * backslash, escaped with a backslash u, or that isn't
-         * a special '[', ']', or '-'.
-         *
-         * Literals can either start a range "x-...", end a range,
-         * "...-x", or indicate a single character "x".
+
+        /* At this point we have either a character c, or a nested set.  If
+         * we have encountered a nested set, either embedded in the pattern,
+         * or as a variable, we have a non-null nestedPairs, and c should be
+         * ignored.  Otherwise c is the current character, and isLiteral
+         * indicates whether it is an escaped literal (or variable) or a
+         * normal unescaped character.  Unescaped characters '-', '&', and
+         * ']' have special meanings.
         */
-        else {
-            if (node == 10 || node == 11) {
-                first = c;
-                node = 21;
-            } else if (node == 21) {
-                addPair(pairsBuf, first, first);
-                first = c;
-                node = 21;
-            } else if (node == 23) {
-                if (c < first) {
+        if (nestedPairs != NULL) {
+            if (lastChar >= 0) {
+                if (lastOp != 0) {
+                    // throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                    return pairsBuf;
                }
-                addPair(pairsBuf, first, c);
-                node = 11;
-            } else {
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return pairsBuf;
+                addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
+                lastChar = -1;
            }
+            switch (lastOp) {
+            case '-':
+                doDifference(pairsBuf, *nestedPairs);
+                break;
+            case '&':
+                doIntersection(pairsBuf, *nestedPairs);
+                break;
+            case 0:
+                doUnion(pairsBuf, *nestedPairs);
+                break;
+            }
+            lastOp = 0;
+        } else if (!isLiteral && c == ']') {
+            // Final closing delimiter.  This is the only way we leave this
+            // loop if the pattern is well-formed.
+            break;
+        } else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
+            lastOp = c;
+        } else if (lastOp == '-') {
+            addPair(pairsBuf, (UChar)lastChar, c);
+            lastOp = 0;
+            lastChar = -1;
+        } else if (lastOp != 0) {
+            // We have <set>&<char> or <char>&<char>
+            // throw new IllegalArgumentException("Unquoted " + lastOp);
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return pairsBuf;
+        } else {
+            if (lastChar >= 0) {
+                // We have <char><char>
+                addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
+            }
+            lastChar = c;
        }
    }

-    if (node != 0) {
+    // Handle unprocessed stuff preceding the closing ']'
+    if (lastOp == '-') {
+        // Trailing '-' is treated as literal
+        addPair(pairsBuf, lastOp, lastOp);
+    } else if (lastOp == '&') {
+        // throw new IllegalArgumentException("Unquoted trailing " + lastOp);
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return pairsBuf;
    }
+    if (lastChar >= 0) {
+        addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);                    
+    }
+
    /**
-     * i indexes the last character we parsed or is
-     * pattern.length().  In the latter case, the node will not be
-     * zero, since we have run off the end without finding a
-     * closing ']'.  Therefore, the above statement will have
-     * thrown an exception, and we'll never get here.  If we get
-     * here, we know i < pattern.length(), and we set the
-     * ParsePosition to the next character to be parsed.
-     */
-    pos.setIndex(i+1);
-    /**
-     * If we saw a '^' after the initial '[' of this pattern, then
-     * perform the complement.  (Inversion after '[:' is handled
-     * elsewhere.)
+     * If we saw a '^' after the initial '[' of this pattern, then perform
+     * the complement.  (Inversion after '[:' is handled elsewhere.)
     */
    if (invert) {
        doComplement(pairsBuf);
    }

+    /**
+     * i indexes the last character we parsed or is pattern.length().  In
+     * the latter case, we have run off the end without finding a closing
+     * ']'.  Otherwise, we know i < pattern.length(), and we set the
+     * ParsePosition to the next character to be parsed.
+     */
+    if (i == limit) {
+        // throw new IllegalArgumentException("Missing ']'");
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return pairsBuf;
+    }
+
+    pos.setIndex(i+1);
+
    return pairsBuf;
 }