ICU-990 add toRules API to TransliterationRule and TransliterationRuleSet

X-SVN-Rev: 4970
2001-06-12 18:02:16 +00:00 · 2001-06-12 18:02:16 +00:00 · c898f4fb24
commit c898f4fb24
parent 4bbf14902b
4 changed files with 230 additions and 36 deletions
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -12,6 +12,7 @@
 #include "rbt_data.h"
 #include "unicode/unifilt.h"
 #include "unicode/uniset.h"
+#include "unicode/unicode.h"
 #include "cmemory.h"

 const UChar TransliterationRule::ETHER = 0xFFFF;
@ -484,24 +485,190 @@ UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text,
 }

 /**
- * Return true if the given key matches the given text.  This method
- * accounts for the fact that the key character may represent a character
- * set.  Note that the key and text characters may not be interchanged
- * without altering the results.
- * @param keyChar a character in the match key
- * @param textChar a character in the text being transliterated
- * @param data a dictionary of variables mapping <code>Character</code>
- * to <code>UnicodeSet</code>
- * @param filter the filter.  Any character for which
- * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
- * altered by this transliterator.  If <tt>filter</tt> is
- * <tt>null</tt> then no filtering is applied.
+ * Append a character to a rule that is being built up.
+ * @param rule the string to append the character to
+ * @param c the character to append
+ * @param isLiteral if true, then the given character should not be
+ * quoted or escaped.  Usually this means it is a syntactic element
+ * such as > or $
+ * @param escapeUnprintable if true, then unprintable characters
+ * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
+ * appear outside of quotes.
+ * @param quoteBuf a buffer which is used to build up quoted
+ * substrings.  The caller should initially supply an empty buffer,
+ * and thereafter should not modify the buffer.  The buffer should be
+ * cleared out by, at the end, calling this method with a literal
+ * character.
 */
-//[ANCHOR]UBool TransliterationRule::charMatches(UChar keyChar, UChar textChar,
-//[ANCHOR]                                        const TransliterationRuleData& data,
-//[ANCHOR]                                        const UnicodeFilter* filter) const {
-//[ANCHOR]    const UnicodeSet* set = 0;
-//[ANCHOR]    return (filter == 0 || filter->contains(textChar)) &&
-//[ANCHOR]        (((set = data.lookupSet(keyChar)) == 0) ?
-//[ANCHOR]         keyChar == textChar : set->contains(textChar));
-//[ANCHOR]}
+void TransliterationRule::_appendToRule(UnicodeString& rule,
+                                        UChar32 c,
+                                        UBool isLiteral,
+                                        UBool escapeUnprintable,
+                                        UnicodeString& quoteBuf) {
+    // Literals, and unprintables when escaping is requested, must be
+    // emitted outside quotes (\u and \U are not recognized within
+    // quotes), so flush any pending quoted run before appending c.
+    if (isLiteral ||
+        (escapeUnprintable && UnicodeSet::_isUnprintable(c))) {
+        if (quoteBuf.length() > 0) {
+            rule.append((UChar) 0x0027 /*'*/);
+            rule.append(quoteBuf);
+            rule.append((UChar) 0x0027 /*'*/);
+            quoteBuf.truncate(0);
+        }
+        // Escape only if the caller asked for it.  Previously a
+        // literal was handed to _escapeUnprintable() even when
+        // escapeUnprintable was FALSE, so literal text could be
+        // escaped against the caller's wishes.  When no escaping is
+        // wanted, or _escapeUnprintable() returns FALSE, c is
+        // appended unchanged.
+        if (!escapeUnprintable ||
+            !UnicodeSet::_escapeUnprintable(rule, c)) {
+            rule.append(c);
+        }
+    }
+
+    // Double ' and '\' and don't begin a quote just for them
+    else if (quoteBuf.length() == 0 &&
+             (c == (UChar) 0x0027 /*'*/ ||
+              c == (UChar) 0x005C /*\*/)) {
+        rule.append(c);
+        rule.append(c);
+    }
+
+    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
+    // whitespace need quoting.  Also append stuff to quotes if we are
+    // building up a quoted substring already.
+    else if ((c >= 0x0021 && c <= 0x007E &&
+              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
+                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
+                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
+             Unicode::isWhitespace(c) ||
+             quoteBuf.length() > 0) {
+        quoteBuf.append(c);
+        // Double ' within a quote
+        if (c == (UChar) 0x0027 /*'*/) {
+            quoteBuf.append(c);
+        }
+    }
+
+    // Otherwise just append
+    else {
+        rule.append(c);
+    }
+}
+
+/**
+ * Append a string to a rule that is being built up.  Each UChar of
+ * text is handed, in order, to the single-character _appendToRule()
+ * together with the same flags and the same quote buffer.
+ */
+void TransliterationRule::_appendToRule(UnicodeString& rule,
+                                        const UnicodeString& text,
+                                        UBool isLiteral,
+                                        UBool escapeUnprintable,
+                                        UnicodeString& quoteBuf) {
+    int32_t pos = 0;
+    while (pos < text.length()) {
+        _appendToRule(rule, text.charAt(pos), isLiteral, escapeUnprintable, quoteBuf);
+        ++pos;
+    }
+}
+
+/**
+ * Create a source string that represents this rule.  Append it to the
+ * given string.
+ * @param rule the string to which the rule source is appended
+ * @param data consulted to find the UnicodeSet, if any, that a
+ * pattern character stands for, and the segment reference, if any,
+ * that an output character stands for
+ * @param escapeUnprintable passed on to _appendToRule() and to
+ * UnicodeSet::toPattern()
+ * @return rule
+ */
+UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
+                                           const TransliterationRuleData& data,
+                                           UBool escapeUnprintable) const {
+    int32_t i;
+
+    // segments, when non-null, lists the pattern offsets at which a
+    // parenthesis is emitted; nextSeg is the next such offset.
+    int32_t iseg = 0;
+    int32_t nextSeg = -1;
+    if (segments != 0) {
+        nextSeg = segments[iseg++];
+    }
+
+    // Accumulate special characters (and non-specials following them)
+    // into quoteBuf.  Append quoteBuf, within single quotes, when
+    // a non-quoted element must be inserted.
+    UnicodeString str, quoteBuf;
+
+    // Do not emit the braces '{' '}' around the pattern if there
+    // is neither anteContext nor postContext.
+    UBool emitBraces =
+        (anteContextLength != 0) || (keyLength != pattern.length());
+
+    // Emit the input pattern
+    for (i=0; i<pattern.length(); ++i) {
+        if (emitBraces && i == anteContextLength) {
+            _appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
+        }
+
+        // Append either '(' or ')' if we are at a segment index.
+        // iseg has already been advanced past the offset being
+        // handled, so an odd iseg marks an opening parenthesis.
+        if (i == nextSeg) {
+            _appendToRule(rule, ((iseg % 2) == 0) ?
+                             (UChar)0x0029 : (UChar)0x0028,
+                             TRUE, escapeUnprintable, quoteBuf);
+            nextSeg = segments[iseg++];
+        }
+
+        if (emitBraces && i == (anteContextLength + keyLength)) {
+            _appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
+        }
+
+        // A pattern character either stands for a set, which is
+        // emitted as its pattern, or is emitted as itself.
+        UChar c = pattern.charAt(i);
+        const UnicodeSet *set = data.lookupSet(c);
+        if (set == 0) {
+            _appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
+        } else {
+            _appendToRule(rule, set->toPattern(str, escapeUnprintable),
+                          TRUE, escapeUnprintable, quoteBuf);
+        }
+    }
+
+    // Elements that fall at the very end of the pattern
+    if (i == nextSeg) {
+        // assert((iseg % 2) == 0);
+        _appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
+    }
+
+    if (emitBraces && i == (anteContextLength + keyLength)) {
+        _appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
+    }
+
+    _appendToRule(rule, UnicodeString(" > ", ""), TRUE, escapeUnprintable, quoteBuf);
+
+    // Emit the output pattern
+
+    // Handle a cursor preceding the output: one '@' is emitted for
+    // each step from cursorPos up to 0.  The loop must stop with
+    // cursor == 0 exactly (a post-incrementing "while (cursor++ < 0)"
+    // overshoots to 1) so that the '|' below is appended ahead of the
+    // first output character.
+    int32_t cursor = cursorPos;
+    for (; cursor < 0; ++cursor) {
+        _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+    }
+
+    for (i=0; i<output.length(); ++i) {
+        if (i == cursor) {
+            _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
+        }
+        UChar c = output.charAt(i);
+        int32_t seg = data.lookupSegmentReference(c);
+        if (seg < 0) {
+            _appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
+        } else {
+            // Emit " $n " as literal text.  The explicit cast avoids
+            // an int-to-UChar narrowing inside the braced initializer.
+            UChar segRef[4] = {
+                0x0020 /* */,
+                0x0024 /*$*/,
+                (UChar) (0x0031 + seg) /*'1' + seg*/,
+                0x0020 /* */
+            };
+            _appendToRule(rule, UnicodeString(FALSE, segRef, 4), TRUE, escapeUnprintable, quoteBuf);
+        }
+    }
+
+    // Handle a cursor after the output.  Use > rather than >= because
+    // if cursor == output.length() it is at the end of the output,
+    // which is the default position, so we need not emit it.
+    if (cursor > output.length()) {
+        cursor -= output.length();
+        while (cursor-- > 0) {
+            _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+        }
+        _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
+    }
+
+    // The terminating literal also flushes anything left in quoteBuf
+    _appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
+
+    return rule;
+}
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -362,23 +362,12 @@ public:
                              const UnicodeFilter* filter) const;

    /**
-     * Return true if the given key matches the given text.  This method
-     * accounts for the fact that the key character may represent a character
-     * set.  Note that the key and text characters may not be interchanged
-     * without altering the results.
-     * @param keyChar a character in the match key
-     * @param textChar a character in the text being transliterated
-     * @param data a dictionary of variables mapping <code>Character</code>
-     * to <code>UnicodeSet</code>
-     * @param filter the filter.  Any character for which
-     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
-     * altered by this transliterator.  If <tt>filter</tt> is
-     * <tt>null</tt> then no filtering is applied.
+     * Create a rule string that represents this rule object.  Append
+     * it to the given string.
     */
-//[ANCHOR]    virtual UBool charMatches(UChar keyChar, UChar textChar,
-//[ANCHOR]                               const TransliterationRuleData& data,
-//[ANCHOR]                               const UnicodeFilter* filter) const;
-
+    virtual UnicodeString& toRule(UnicodeString& pat,                   // string the rule text is appended to; also the return value
+                                  const TransliterationRuleData& data,  // consulted for set and segment-reference lookups
+                                  UBool escapeUnprintable) const;       // forwarded to _appendToRule() and UnicodeSet::toPattern()
 private:

    void init(const UnicodeString& input,
@ -389,6 +378,17 @@ private:
              UBool anchorStart, UBool anchorEnd,
              UErrorCode& status);

+    static void _appendToRule(UnicodeString& rule,       // rule text being built up
+                              UChar32 c,                 // character to append
+                              UBool isLiteral,           // if true, c is emitted outside quotes (e.g. > or $)
+                              UBool escapeUnprintable,   // if true, unprintable characters are escaped, outside quotes
+                              UnicodeString& quoteBuf);  // pending quoted substring; flushed by a later literal
+
+    static void _appendToRule(UnicodeString& rule,        // rule text being built up
+                              const UnicodeString& text,  // appended one UChar at a time via the UChar32 overload
+                              UBool isLiteral,            // applied to every character of text
+                              UBool escapeUnprintable,    // applied to every character of text
+                              UnicodeString& quoteBuf);   // pending quoted substring shared by all the calls
 };

 #endif
--- a/icu4c/source/i18n/rbt_set.cpp
+++ b/icu4c/source/i18n/rbt_set.cpp
@ -277,3 +277,21 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
    }
    return NULL;
 }
+
+/**
+ * Build the source text for every rule in this set.  The given string
+ * is emptied first; each rule then appends its own source to it, in
+ * array order, with a newline between consecutive rules and none
+ * after the last.
+ */
+UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
+                                               const TransliterationRuleData& data,
+                                               UBool escapeUnprintable) const {
+    const int32_t ruleCount = index[256];
+    ruleSource.truncate(0);
+    int32_t n = 0;
+    while (n < ruleCount) {
+        if (n > 0) {
+            ruleSource.append((UChar) 0x000A /*\n*/);
+        }
+        rules[n]->toRule(ruleSource, data, escapeUnprintable);
+        ++n;
+    }
+    return ruleSource;
+}
--- a/icu4c/source/i18n/rbt_set.h
+++ b/icu4c/source/i18n/rbt_set.h
@ -156,5 +156,14 @@ public:
                                              const TransliterationRuleData& data,
                                              UBool& isPartial,
                                              const UnicodeFilter* filter) const;
+
+    /**
+     * Create rule strings that represent this rule set.
+     * @param result string to receive the rule strings.  Current
+     * contents will be deleted.
+     * @param data passed to each rule's toRule()
+     * @param escapeUnprintable passed to each rule's toRule()
+     * @return result
+     */
+    virtual UnicodeString& toRules(UnicodeString& result,
+                                   const TransliterationRuleData& data,
+                                   UBool escapeUnprintable) const;
 };
 #endif