ICU-4078 Fix for AIX with Visual Age 5 compiler, and make sure that the

disentanglement is correct by putting all the UnicodeSet virtual functions in one file. Also move some of the rule whitespace handling into better locations. X-SVN-Rev: 16519
2004-10-15 22:56:26 +00:00 · 2004-10-15 22:56:26 +00:00 · e69fca9d5f
commit e69fca9d5f
parent 3ac97089c4
19 changed files with 240 additions and 214 deletions
--- a/icu4c/source/common/charstr.h
+++ b/icu4c/source/common/charstr.h
@ -8,6 +8,9 @@
 **********************************************************************
 */

+#ifndef CHARSTRING_H
+#define CHARSTRING_H
+
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
@ -78,4 +81,5 @@ inline CharString::~CharString() {

 U_NAMESPACE_END

+#endif
 //eof
--- a/icu4c/source/common/ruleiter.cpp
+++ b/icu4c/source/common/ruleiter.cpp
@ -12,7 +12,7 @@
 #include "unicode/parsepos.h"
 #include "unicode/unistr.h"
 #include "unicode/symtable.h"
-#include "uprops.h"
+#include "util.h"

 U_NAMESPACE_BEGIN

--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -534,7 +534,7 @@ public:
     * U+000A, U+0020..U+007E.
     * @stable ICU 2.0
     */
-    UnicodeString& toPattern(UnicodeString& result,
+    virtual UnicodeString& toPattern(UnicodeString& result,
                             UBool escapeUnprintable = FALSE) const;

    /**
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@ -23,6 +23,24 @@
 #include "uassert.h"
 #include "hash.h"

+// Define UChar constants using hex for EBCDIC compatibility
+// Used #define to reduce private static exports and memory access time.
+#define SET_OPEN        ((UChar)0x005B) /*[*/
+#define SET_CLOSE       ((UChar)0x005D) /*]*/
+#define HYPHEN          ((UChar)0x002D) /*-*/
+#define COMPLEMENT      ((UChar)0x005E) /*^*/
+#define COLON           ((UChar)0x003A) /*:*/
+#define BACKSLASH       ((UChar)0x005C) /*\*/
+#define INTERSECTION    ((UChar)0x0026) /*&*/
+#define UPPER_U         ((UChar)0x0055) /*U*/
+#define LOWER_U         ((UChar)0x0075) /*u*/
+#define OPEN_BRACE      ((UChar)123)    /*{*/
+#define CLOSE_BRACE     ((UChar)125)    /*}*/
+#define UPPER_P         ((UChar)0x0050) /*P*/
+#define LOWER_P         ((UChar)0x0070) /*p*/
+#define UPPER_N         ((UChar)78)     /*N*/
+#define EQUALS          ((UChar)0x003D) /*=*/
+
 // HIGH_VALUE > all valid values. 110000 for codepoints
 #define UNICODESET_HIGH 0x0110000

@ -1645,4 +1663,172 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
    pat.truncate(0);
 }

+/**
+ * Append the <code>toPattern()</code> representation of a
+ * string to the given <code>StringBuffer</code>.
+ */
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
+escapeUnprintable) {
+    UChar32 cp;
+    for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
+        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
+    }
+}
+
+/**
+ * Append the <code>toPattern()</code> representation of a
+ * character to the given <code>StringBuffer</code>.
+ */
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
+escapeUnprintable) {
+    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
+        // unprintable
+        if (ICU_Utility::escapeUnprintable(buf, c)) {
+            return;
+        }
+    }
+    // Okay to let ':' pass through
+    switch (c) {
+    case SET_OPEN:
+    case SET_CLOSE:
+    case HYPHEN:
+    case COMPLEMENT:
+    case INTERSECTION:
+    case BACKSLASH:
+    case OPEN_BRACE:
+    case CLOSE_BRACE:
+    case COLON:
+    case SymbolTable::SYMBOL_REF:
+        buf.append(BACKSLASH);
+        break;
+    default:
+        // Escape whitespace
+        if (uprv_isRuleWhiteSpace(c)) {
+            buf.append(BACKSLASH);
+        }
+        break;
+    }
+    buf.append(c);
+}
+
+/**
+ * Append a string representation of this set to result.  This will be
+ * a cleaned version of the string passed to applyPattern(), if there
+ * is one.  Otherwise it will be generated.
+ */
+UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
+                                      UBool escapeUnprintable) const {
+    if (pat.length() > 0) {
+        int32_t i;
+        int32_t backslashCount = 0;
+        for (i=0; i<pat.length(); ) {
+            UChar32 c = pat.char32At(i);
+            i += UTF_CHAR_LENGTH(c);
+            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+                // If the unprintable character is preceded by an odd
+                // number of backslashes, then it has been escaped.
+                // Before unescaping it, we delete the final
+                // backslash.
+                if ((backslashCount % 2) == 1) {
+                    result.truncate(result.length() - 1);
+                }
+                ICU_Utility::escapeUnprintable(result, c);
+                backslashCount = 0;
+            } else {
+                result.append(c);
+                if (c == BACKSLASH) {
+                    ++backslashCount;
+                } else {
+                    backslashCount = 0;
+                }
+            }
+        }
+        return result;
+    }
+    
+    return _generatePattern(result, escapeUnprintable);
+}
+
+/**
+ * Returns a string representation of this set.  If the result of
+ * calling this function is passed to a UnicodeSet constructor, it
+ * will produce another set that is equal to this one.
+ */
+UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable) const {
+    result.truncate(0);
+    return _toPattern(result, escapeUnprintable);
+}
+
+/**
+ * Generate and append a string representation of this set to result.
+ * This does not use this.pat, the cleaned up copy of the string
+ * passed to applyPattern().
+ */
+UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
+                                            UBool escapeUnprintable) const {
+    result.append(SET_OPEN);
+
+//  // Check against the predefined categories.  We implicitly build
+//  // up ALL category sets the first time toPattern() is called.
+//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
+//      if (*this == getCategorySet(cat)) {
+//          result.append(COLON);
+//          result.append(CATEGORY_NAMES, cat*2, 2);
+//          return result.append(CATEGORY_CLOSE);
+//      }
+//  }
+
+    int32_t count = getRangeCount();
+
+    // If the set contains at least 2 intervals and includes both
+    // MIN_VALUE and MAX_VALUE, then the inverse representation will
+    // be more economical.
+    if (count > 1 &&
+        getRangeStart(0) == MIN_VALUE &&
+        getRangeEnd(count-1) == MAX_VALUE) {
+
+        // Emit the inverse
+        result.append(COMPLEMENT);
+
+        for (int32_t i = 1; i < count; ++i) {
+            UChar32 start = getRangeEnd(i-1)+1;
+            UChar32 end = getRangeStart(i)-1;
+            _appendToPat(result, start, escapeUnprintable);
+            if (start != end) {
+                if ((start+1) != end) {
+                    result.append(HYPHEN);
+                }
+                _appendToPat(result, end, escapeUnprintable);
+            }
+        }
+    }
+
+    // Default; emit the ranges as pairs
+    else {
+        for (int32_t i = 0; i < count; ++i) {
+            UChar32 start = getRangeStart(i);
+            UChar32 end = getRangeEnd(i);
+            _appendToPat(result, start, escapeUnprintable);
+            if (start != end) {
+                if ((start+1) != end) {
+                    result.append(HYPHEN);
+                }
+                _appendToPat(result, end, escapeUnprintable);
+            }
+        }
+    }
+    
+    for (int32_t i = 0; i<strings->size(); ++i) {
+        result.append(OPEN_BRACE);
+        _appendToPat(result,
+                     *(const UnicodeString*) strings->elementAt(i),
+                     escapeUnprintable);
+        result.append(CLOSE_BRACE);
+    }
+    return result.append(SET_CLOSE);
+}
+
+
 U_NAMESPACE_END
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@ -397,171 +397,6 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
        resemblesPropertyPattern(pattern, pos);
 }

-/**
- * Append the <code>toPattern()</code> representation of a
- * string to the given <code>StringBuffer</code>.
- */
-void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
-    UChar32 cp;
-    for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
-        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
-    }
-}
-
-/**
- * Append the <code>toPattern()</code> representation of a
- * character to the given <code>StringBuffer</code>.
- */
-void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
-    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
-        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
-        // unprintable
-        if (ICU_Utility::escapeUnprintable(buf, c)) {
-            return;
-        }
-    }
-    // Okay to let ':' pass through
-    switch (c) {
-    case SET_OPEN:
-    case SET_CLOSE:
-    case HYPHEN:
-    case COMPLEMENT:
-    case INTERSECTION:
-    case BACKSLASH:
-    case 123/*{*/:
-    case 125/*}*/:
-    case SymbolTable::SYMBOL_REF:
-    case COLON:
-        buf.append(BACKSLASH);
-        break;
-    default:
-        // Escape whitespace
-        if (uprv_isRuleWhiteSpace(c)) {
-            buf.append(BACKSLASH);
-        }
-        break;
-    }
-    buf.append(c);
-}
-
-/**
- * Returns a string representation of this set.  If the result of
- * calling this function is passed to a UnicodeSet constructor, it
- * will produce another set that is equal to this one.
- */
-UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
-                                     UBool escapeUnprintable) const {
-    result.truncate(0);
-    return _toPattern(result, escapeUnprintable);
-}
-
-/**
- * Append a string representation of this set to result.  This will be
- * a cleaned version of the string passed to applyPattern(), if there
- * is one.  Otherwise it will be generated.
- */
-UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
-                                      UBool escapeUnprintable) const {
-    if (pat.length() > 0) {
-        int32_t i;
-        int32_t backslashCount = 0;
-        for (i=0; i<pat.length(); ) {
-            UChar32 c = pat.char32At(i);
-            i += UTF_CHAR_LENGTH(c);
-            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
-                // If the unprintable character is preceded by an odd
-                // number of backslashes, then it has been escaped.
-                // Before unescaping it, we delete the final
-                // backslash.
-                if ((backslashCount % 2) == 1) {
-                    result.truncate(result.length() - 1);
-                }
-                ICU_Utility::escapeUnprintable(result, c);
-                backslashCount = 0;
-            } else {
-                result.append(c);
-                if (c == BACKSLASH) {
-                    ++backslashCount;
-                } else {
-                    backslashCount = 0;
-                }
-            }
-        }
-        return result;
-    }
-    
-    return _generatePattern(result, escapeUnprintable);
-}
-
-/**
- * Generate and append a string representation of this set to result.
- * This does not use this.pat, the cleaned up copy of the string
- * passed to applyPattern().
- */
-UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
-                                            UBool escapeUnprintable) const {
-    result.append(SET_OPEN);
-
-//  // Check against the predefined categories.  We implicitly build
-//  // up ALL category sets the first time toPattern() is called.
-//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-//      if (*this == getCategorySet(cat)) {
-//          result.append(COLON);
-//          result.append(CATEGORY_NAMES, cat*2, 2);
-//          return result.append(CATEGORY_CLOSE);
-//      }
-//  }
-
-    int32_t count = getRangeCount();
-
-    // If the set contains at least 2 intervals and includes both
-    // MIN_VALUE and MAX_VALUE, then the inverse representation will
-    // be more economical.
-    if (count > 1 &&
-        getRangeStart(0) == MIN_VALUE &&
-        getRangeEnd(count-1) == MAX_VALUE) {
-
-        // Emit the inverse
-        result.append(COMPLEMENT);
-
-        for (int32_t i = 1; i < count; ++i) {
-            UChar32 start = getRangeEnd(i-1)+1;
-            UChar32 end = getRangeStart(i)-1;
-            _appendToPat(result, start, escapeUnprintable);
-            if (start != end) {
-                if ((start+1) != end) {
-                    result.append(HYPHEN);
-                }
-                _appendToPat(result, end, escapeUnprintable);
-            }
-        }
-    }
-
-    // Default; emit the ranges as pairs
-    else {
-        for (int32_t i = 0; i < count; ++i) {
-            UChar32 start = getRangeStart(i);
-            UChar32 end = getRangeEnd(i);
-            _appendToPat(result, start, escapeUnprintable);
-            if (start != end) {
-                if ((start+1) != end) {
-                    result.append(HYPHEN);
-                }
-                _appendToPat(result, end, escapeUnprintable);
-            }
-        }
-    }
-    
-    for (int32_t i = 0; i<strings->size(); ++i) {
-        result.append(OPEN_BRACE);
-        _appendToPat(result,
-                     *(const UnicodeString*) strings->elementAt(i),
-                     escapeUnprintable);
-        result.append(CLOSE_BRACE);
-    }
-    return result.append(SET_CLOSE);
-}
-
 //----------------------------------------------------------------
 // Implementation: Pattern parsing
 //----------------------------------------------------------------
--- a/icu4c/source/common/uprops.c
+++ b/icu4c/source/common/uprops.c
@ -142,30 +142,6 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
    return FALSE;
 }

-U_CAPI UBool U_EXPORT2
-uprv_isRuleWhiteSpace(UChar32 c) {
-    /* "white space" in the sense of ICU rule parsers
-       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
-       See UTR #31: http://www.unicode.org/reports/tr31/.
-       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
-    */
-    return (c >= 0x0009 && c <= 0x2029 &&
-            (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
-             c == 0x200E || c == 0x200F || c >= 0x2028));
-}
-
-static const UChar _PATTERN[] = {
-    /* "[[:Cf:][:WSpace:]]" */
-    91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
-    83, 112, 97, 99, 101, 58, 93, 93, 0
-};
-
-U_CAPI USet* U_EXPORT2
-uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
-    return uset_openPattern(_PATTERN,
-                            sizeof(_PATTERN)/sizeof(_PATTERN[0])-1, ec);
-}
-
 U_CAPI int32_t U_EXPORT2
 u_getIntPropertyValue(UChar32 c, UProperty which) {
    UErrorCode errorCode;
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@ -278,21 +278,6 @@ enum {
    ZWNBSP  =0xfeff
 };

-/**
- * Is this character a "white space" in the sense of ICU rule parsers?
- * @internal
- */
-U_CAPI UBool U_EXPORT2
-uprv_isRuleWhiteSpace(UChar32 c);
-
-/**
- * Get the set of "white space" characters in the sense of ICU rule
- * parsers.  Caller must close/delete result.
- * @internal
- */
-U_CAPI USet* U_EXPORT2
-uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
-
 /**
 * Get the maximum length of a (regular/1.0/extended) character name.
 * @return 0 if no character names available.
--- a/icu4c/source/common/uset_imp.h
+++ b/icu4c/source/common/uset_imp.h
@ -48,4 +48,13 @@ typedef struct USetAdder USetAdder;

 U_CDECL_END

+/**
+ * Get the set of "white space" characters in the sense of ICU rule
+ * parsers.  Caller must close/delete result.
+ * @internal
+ */
+U_CAPI USet* U_EXPORT2
+uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
+
 #endif
+
--- a/icu4c/source/common/uset_props.cpp
+++ b/icu4c/source/common/uset_props.cpp
@ -132,3 +132,15 @@ uset_toPattern(const USet* set,
    ((const UnicodeSet*) set)->toPattern(pat, escapeUnprintable);
    return pat.extract(result, resultCapacity, *ec);
 }
+
+U_CAPI USet* U_EXPORT2
+uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
+    static const UChar _PATTERN[] = {
+        /* "[[:Cf:][:WSpace:]]" */
+        91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
+        83, 112, 97, 99, 101, 58, 93, 93, 0
+    };
+    return uset_openPattern(_PATTERN,
+                            sizeof(_PATTERN)/sizeof(_PATTERN[0])-1, ec);
+}
+
--- a/icu4c/source/common/util.cpp
+++ b/icu4c/source/common/util.cpp
@ -606,4 +606,16 @@ void ICU_Utility::appendToRule(UnicodeString& rule,
    }
 }

+U_CAPI UBool U_EXPORT2
+uprv_isRuleWhiteSpace(UChar32 c) {
+    /* "white space" in the sense of ICU rule parsers
+       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
+       See UTR #31: http://www.unicode.org/reports/tr31/.
+       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
+    */
+    return (c >= 0x0009 && c <= 0x2029 &&
+            (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
+             c == 0x200E || c == 0x200F || c >= 0x2028));
+}
+
 //eof
--- a/icu4c/source/common/util.h
+++ b/icu4c/source/common/util.h
@ -231,6 +231,13 @@ private:
    ICU_Utility();
 };

+/**
+ * Is this character a "white space" in the sense of ICU rule parsers?
+ * @internal
+ */
+U_CAPI UBool U_EXPORT2
+uprv_isRuleWhiteSpace(UChar32 c);
+
 U_NAMESPACE_END

 #endif
--- a/icu4c/source/i18n/decimfmt.cpp
+++ b/icu4c/source/i18n/decimfmt.cpp
@ -50,7 +50,7 @@
 #include "unicode/uchar.h"
 #include "unicode/curramt.h"
 #include "ucurrimp.h"
-#include "uprops.h"
+#include "util.h"
 #include "digitlst.h"
 #include "cmemory.h"
 #include "cstring.h"
--- a/icu4c/source/i18n/msgfmt.cpp
+++ b/icu4c/source/i18n/msgfmt.cpp
@ -34,7 +34,7 @@
 #include "unicode/rbnf.h"
 #include "ustrfmt.h"
 #include "cmemory.h"
-#include "uprops.h"
+#include "util.h"
 #include "uassert.h"

 // *****************************************************************************
--- a/icu4c/source/i18n/nfrs.cpp
+++ b/icu4c/source/i18n/nfrs.cpp
@ -25,7 +25,7 @@
 #include "cmemory.h"
 #endif

-#include "uprops.h"
+#include "util.h"

 U_NAMESPACE_BEGIN

--- a/icu4c/source/i18n/nfrule.cpp
+++ b/icu4c/source/i18n/nfrule.cpp
@ -25,7 +25,7 @@
 #include "nfrlist.h"
 #include "nfsubs.h"

-#include "uprops.h"
+#include "util.h"

 U_NAMESPACE_BEGIN

--- a/icu4c/source/i18n/rbnf.cpp
+++ b/icu4c/source/i18n/rbnf.cpp
@ -22,7 +22,7 @@

 #include "cmemory.h"
 #include "cstring.h"
-#include "uprops.h"
+#include "util.h"

 // debugging
 // #define DEBUG
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -21,7 +21,7 @@
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"
 #include "unicode/regex.h"
-#include "uprops.h"
+#include "util.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "uvectr32.h"
--- a/icu4c/source/i18n/smpdtfmt.cpp
+++ b/icu4c/source/i18n/smpdtfmt.cpp
@ -39,7 +39,7 @@
 #include "unicode/dcfmtsym.h"
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
-#include "uprops.h"
+#include "util.h"
 #include "gregoimp.h" 
 #include "cstring.h"
 #include "uassert.h"
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@ -28,7 +28,7 @@
 
 #include "ucol_tok.h"
 #include "cmemory.h"
-#include "uprops.h"
+#include "util.h"

 U_CDECL_BEGIN
 static int32_t U_EXPORT2 U_CALLCONV