ICU-4078 Fix for AIX with Visual Age 5 compiler, and make sure that the

disentanglement is correct by putting all the UnicodeSet virtual functions in one file. Also move some of the rule whitespace handling into better locations. X-SVN-Rev: 16519
2004-10-15 22:56:26 +00:00 · 2004-10-15 22:56:26 +00:00 · e69fca9d5f
commit e69fca9d5f
parent 3ac97089c4
19 changed files with 240 additions and 214 deletions
--- a/icu4c/source/common/charstr.h
+++ b/icu4c/source/common/charstr.h
@ -8,6 +8,9 @@
 **********************************************************************
 */
 #ifndef CHARSTRING_H
 #define CHARSTRING_H
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
@ -78,4 +81,5 @@ inline CharString::~CharString() {
 U_NAMESPACE_END
 #endif
 //eof
--- a/icu4c/source/common/ruleiter.cpp
+++ b/icu4c/source/common/ruleiter.cpp
@ -12,7 +12,7 @@
 #include "unicode/parsepos.h"
 #include "unicode/unistr.h"
 #include "unicode/symtable.h"
-#include "uprops.h"
+#include "util.h"
 U_NAMESPACE_BEGIN
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -534,7 +534,7 @@ public:
     * U+000A, U+0020..U+007E.
     * @stable ICU 2.0
     */
-    UnicodeString& toPattern(UnicodeString& result,
+    virtual UnicodeString& toPattern(UnicodeString& result,
                             UBool escapeUnprintable = FALSE) const;
    /**
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@ -23,6 +23,24 @@
 #include "uassert.h"
 #include "hash.h"
 // UChar constants for the characters that have meaning in a set
 // pattern.  Each value is written as a hex code point for EBCDIC
 // compatibility.
 // #define is used to reduce private static exports and memory access time.
 #define SET_OPEN        ((UChar)0x005B) /*[*/
 #define SET_CLOSE       ((UChar)0x005D) /*]*/
 #define HYPHEN          ((UChar)0x002D) /*-*/
 #define COMPLEMENT      ((UChar)0x005E) /*^*/
 #define COLON           ((UChar)0x003A) /*:*/
 #define BACKSLASH       ((UChar)0x005C) /*\*/
 #define INTERSECTION    ((UChar)0x0026) /*&*/
 #define UPPER_U         ((UChar)0x0055) /*U*/
 #define LOWER_U         ((UChar)0x0075) /*u*/
 #define OPEN_BRACE      ((UChar)0x007B) /*{*/
 #define CLOSE_BRACE     ((UChar)0x007D) /*}*/
 #define UPPER_P         ((UChar)0x0050) /*P*/
 #define LOWER_P         ((UChar)0x0070) /*p*/
 #define UPPER_N         ((UChar)0x004E) /*N*/
 #define EQUALS          ((UChar)0x003D) /*=*/
 // HIGH_VALUE > all valid values. 110000 for codepoints
 #define UNICODESET_HIGH 0x0110000
@ -1645,4 +1663,172 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
    pat.truncate(0);
 }
 /**
 * Appends the <code>toPattern()</code> representation of every code
 * point in <code>s</code> to <code>buf</code> by handing each one to
 * the single-character overload of <code>_appendToPat()</code>.
 *
 * @param buf               pattern text being built; appended to
 * @param s                 string whose code points are emitted in order
 * @param escapeUnprintable forwarded unchanged to the single-character
 *                          overload
 */
 void UnicodeSet::_appendToPat(UnicodeString& buf,
                               const UnicodeString& s,
                               UBool escapeUnprintable) {
    int32_t offset = 0;
    while (offset < s.length()) {
        const UChar32 codePoint = s.char32At(offset);
        _appendToPat(buf, codePoint, escapeUnprintable);
        // Advance past all code units of the code point just emitted.
        offset += UTF_CHAR_LENGTH(codePoint);
    }
 }
 /**
 * Appends the <code>toPattern()</code> representation of the single
 * code point <code>c</code> to <code>buf</code>.
 *
 * When <code>escapeUnprintable</code> is set and
 * <code>ICU_Utility::isUnprintable(c)</code> holds, the character is
 * handed to <code>ICU_Utility::escapeUnprintable()</code>; if that call
 * reports success, nothing further is appended.  In every other case
 * the character is appended literally, preceded by a backslash when it
 * is a set-syntax character (this includes ':') or rule white space.
 *
 * @param buf               pattern text being built; appended to
 * @param c                 code point to emit
 * @param escapeUnprintable whether unprintable characters should be
 *                          written in escaped form
 */
 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c,
                               UBool escapeUnprintable) {
    // Hex escape notation (\uxxxx or \Uxxxxxxxx) for anything unprintable.
    if (escapeUnprintable &&
        ICU_Utility::isUnprintable(c) &&
        ICU_Utility::escapeUnprintable(buf, c)) {
        return;
    }
    // A backslash goes in front of pattern syntax characters and of
    // rule white space; everything else is appended as is.
    if (c == SET_OPEN ||
        c == SET_CLOSE ||
        c == HYPHEN ||
        c == COMPLEMENT ||
        c == INTERSECTION ||
        c == BACKSLASH ||
        c == OPEN_BRACE ||
        c == CLOSE_BRACE ||
        c == COLON ||
        c == SymbolTable::SYMBOL_REF ||
        uprv_isRuleWhiteSpace(c)) {
        buf.append(BACKSLASH);
    }
    buf.append(c);
 }
 /**
 * Appends a string representation of this set to <code>result</code>.
 * If a pattern string is stored in <code>pat</code> it is copied code
 * point by code point, rewriting unprintable characters in escaped
 * form when <code>escapeUnprintable</code> is set.  If <code>pat</code>
 * is empty the work is delegated to <code>_generatePattern()</code>.
 *
 * @param result            text to append to
 * @param escapeUnprintable whether unprintable characters should be
 *                          written in escaped form
 * @return <code>result</code>, or whatever
 *         <code>_generatePattern()</code> returns when it is used
 */
 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                                       UBool escapeUnprintable) const {
    if (pat.length() <= 0) {
        // No stored pattern: build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }
    // Length of the run of backslashes copied immediately before the
    // current position.
    int32_t trailingBackslashes = 0;
    int32_t pos = 0;
    while (pos < pat.length()) {
        const UChar32 ch = pat.char32At(pos);
        pos += UTF_CHAR_LENGTH(ch);
        if (!(escapeUnprintable && ICU_Utility::isUnprintable(ch))) {
            // Ordinary case: copy through and keep the run length current.
            result.append(ch);
            trailingBackslashes = (ch == BACKSLASH) ? trailingBackslashes + 1 : 0;
            continue;
        }
        // An odd run of backslashes means this unprintable character
        // was itself escaped; remove that last backslash before the
        // escaped form is written.
        if ((trailingBackslashes % 2) != 0) {
            result.truncate(result.length() - 1);
        }
        ICU_Utility::escapeUnprintable(result, ch);
        trailingBackslashes = 0;
    }
    return result;
 }
 /**
 * Returns a string representation of this set.  If the result of
 * calling this function is passed to a UnicodeSet constructor, it
 * will produce another set that is equal to this one.
 *
 * @param result            receives the pattern; any previous content
 *                          is discarded first
 * @param escapeUnprintable forwarded to <code>_toPattern()</code>
 * @return whatever <code>_toPattern()</code> returns
 */
 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
                                      UBool escapeUnprintable) const {
    // Start from an empty string; _toPattern() adds to what is there.
    result.truncate(0);
    UnicodeString& filled = _toPattern(result, escapeUnprintable);
    return filled;
 }
 /**
 * Builds a pattern from the contents of this set and appends it to
 * <code>result</code>.  The stored <code>pat</code> (the cleaned up
 * copy of the string passed to applyPattern()) is not used.
 *
 * Output shape: SET_OPEN; then either COMPLEMENT followed by the gaps
 * between the ranges, or the ranges themselves; then each string
 * wrapped in OPEN_BRACE / CLOSE_BRACE; then SET_CLOSE.
 *
 * @param result            text to append to
 * @param escapeUnprintable forwarded to <code>_appendToPat()</code>
 * @return the value of the final <code>result.append()</code> call
 */
 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                             UBool escapeUnprintable) const {
    result.append(SET_OPEN);
 //  // Check against the predefined categories.  We implicitly build
 //  // up ALL category sets the first time toPattern() is called.
 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
 //      if (*this == getCategorySet(cat)) {
 //          result.append(COLON);
 //          result.append(CATEGORY_NAMES, cat*2, 2);
 //          return result.append(CATEGORY_CLOSE);
 //      }
 //  }
    const int32_t rangeCount = getRangeCount();
    // With at least 2 intervals that include both MIN_VALUE and
    // MAX_VALUE, the inverse representation is more economical.
    const UBool emitInverse = (UBool)(rangeCount > 1 &&
                                      getRangeStart(0) == MIN_VALUE &&
                                      getRangeEnd(rangeCount-1) == MAX_VALUE);
    if (emitInverse) {
        result.append(COMPLEMENT);
    }
    // Inverse form walks the gaps between neighbouring ranges, so it
    // starts at index 1; the default form walks the ranges themselves.
    for (int32_t idx = emitInverse ? 1 : 0; idx < rangeCount; ++idx) {
        UChar32 first;
        UChar32 last;
        if (emitInverse) {
            first = getRangeEnd(idx-1)+1;
            last = getRangeStart(idx)-1;
        } else {
            first = getRangeStart(idx);
            last = getRangeEnd(idx);
        }
        _appendToPat(result, first, escapeUnprintable);
        if (first == last) {
            continue;
        }
        // Two adjacent code points are written side by side, without
        // a hyphen between them.
        if ((first+1) != last) {
            result.append(HYPHEN);
        }
        _appendToPat(result, last, escapeUnprintable);
    }
    // Multi-character strings follow the ranges, each inside braces.
    int32_t strIdx = 0;
    while (strIdx < strings->size()) {
        const UnicodeString& str = *(const UnicodeString*) strings->elementAt(strIdx);
        result.append(OPEN_BRACE);
        _appendToPat(result, str, escapeUnprintable);
        result.append(CLOSE_BRACE);
        ++strIdx;
    }
    return result.append(SET_CLOSE);
 }
 U_NAMESPACE_END
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@ -397,171 +397,6 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
        resemblesPropertyPattern(pattern, pos);
 }
 /**
 * Appends the <code>toPattern()</code> representation of every code
 * point in <code>s</code> to <code>buf</code> by handing each one to
 * the single-character overload of <code>_appendToPat()</code>.
 *
 * @param buf               pattern text being built; appended to
 * @param s                 string whose code points are emitted in order
 * @param escapeUnprintable forwarded unchanged to the single-character
 *                          overload
 */
 void UnicodeSet::_appendToPat(UnicodeString& buf,
                               const UnicodeString& s,
                               UBool escapeUnprintable) {
    int32_t offset = 0;
    while (offset < s.length()) {
        const UChar32 codePoint = s.char32At(offset);
        _appendToPat(buf, codePoint, escapeUnprintable);
        // Advance past all code units of the code point just emitted.
        offset += UTF_CHAR_LENGTH(codePoint);
    }
 }
 /**
 * Appends the <code>toPattern()</code> representation of the single
 * code point <code>c</code> to <code>buf</code>.
 *
 * When <code>escapeUnprintable</code> is set and
 * <code>ICU_Utility::isUnprintable(c)</code> holds, the character is
 * handed to <code>ICU_Utility::escapeUnprintable()</code>; if that call
 * reports success, nothing further is appended.  In every other case
 * the character is appended literally, preceded by a backslash when it
 * is a set-syntax character (this includes ':') or rule white space.
 *
 * @param buf               pattern text being built; appended to
 * @param c                 code point to emit
 * @param escapeUnprintable whether unprintable characters should be
 *                          written in escaped form
 */
 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c,
                               UBool escapeUnprintable) {
    // Hex escape notation (\uxxxx or \Uxxxxxxxx) for anything unprintable.
    if (escapeUnprintable &&
        ICU_Utility::isUnprintable(c) &&
        ICU_Utility::escapeUnprintable(buf, c)) {
        return;
    }
    // A backslash goes in front of pattern syntax characters and of
    // rule white space; everything else is appended as is.
    if (c == SET_OPEN ||
        c == SET_CLOSE ||
        c == HYPHEN ||
        c == COMPLEMENT ||
        c == INTERSECTION ||
        c == BACKSLASH ||
        c == 123/*{*/ ||
        c == 125/*}*/ ||
        c == SymbolTable::SYMBOL_REF ||
        c == COLON ||
        uprv_isRuleWhiteSpace(c)) {
        buf.append(BACKSLASH);
    }
    buf.append(c);
 }
 /**
 * Returns a string representation of this set.  If the result of
 * calling this function is passed to a UnicodeSet constructor, it
 * will produce another set that is equal to this one.
 *
 * @param result            receives the pattern; any previous content
 *                          is discarded first
 * @param escapeUnprintable forwarded to <code>_toPattern()</code>
 * @return whatever <code>_toPattern()</code> returns
 */
 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
                                      UBool escapeUnprintable) const {
    // Start from an empty string; _toPattern() adds to what is there.
    result.truncate(0);
    UnicodeString& filled = _toPattern(result, escapeUnprintable);
    return filled;
 }
 /**
 * Appends a string representation of this set to <code>result</code>.
 * If a pattern string is stored in <code>pat</code> it is copied code
 * point by code point, rewriting unprintable characters in escaped
 * form when <code>escapeUnprintable</code> is set.  If <code>pat</code>
 * is empty the work is delegated to <code>_generatePattern()</code>.
 *
 * @param result            text to append to
 * @param escapeUnprintable whether unprintable characters should be
 *                          written in escaped form
 * @return <code>result</code>, or whatever
 *         <code>_generatePattern()</code> returns when it is used
 */
 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                                       UBool escapeUnprintable) const {
    if (pat.length() <= 0) {
        // No stored pattern: build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }
    // Length of the run of backslashes copied immediately before the
    // current position.
    int32_t trailingBackslashes = 0;
    int32_t pos = 0;
    while (pos < pat.length()) {
        const UChar32 ch = pat.char32At(pos);
        pos += UTF_CHAR_LENGTH(ch);
        if (!(escapeUnprintable && ICU_Utility::isUnprintable(ch))) {
            // Ordinary case: copy through and keep the run length current.
            result.append(ch);
            trailingBackslashes = (ch == BACKSLASH) ? trailingBackslashes + 1 : 0;
            continue;
        }
        // An odd run of backslashes means this unprintable character
        // was itself escaped; remove that last backslash before the
        // escaped form is written.
        if ((trailingBackslashes % 2) != 0) {
            result.truncate(result.length() - 1);
        }
        ICU_Utility::escapeUnprintable(result, ch);
        trailingBackslashes = 0;
    }
    return result;
 }
 /**
 * Builds a pattern from the contents of this set and appends it to
 * <code>result</code>.  The stored <code>pat</code> (the cleaned up
 * copy of the string passed to applyPattern()) is not used.
 *
 * Output shape: SET_OPEN; then either COMPLEMENT followed by the gaps
 * between the ranges, or the ranges themselves; then each string
 * wrapped in OPEN_BRACE / CLOSE_BRACE; then SET_CLOSE.
 *
 * @param result            text to append to
 * @param escapeUnprintable forwarded to <code>_appendToPat()</code>
 * @return the value of the final <code>result.append()</code> call
 */
 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                             UBool escapeUnprintable) const {
    result.append(SET_OPEN);
 //  // Check against the predefined categories.  We implicitly build
 //  // up ALL category sets the first time toPattern() is called.
 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
 //      if (*this == getCategorySet(cat)) {
 //          result.append(COLON);
 //          result.append(CATEGORY_NAMES, cat*2, 2);
 //          return result.append(CATEGORY_CLOSE);
 //      }
 //  }
    const int32_t rangeCount = getRangeCount();
    // With at least 2 intervals that include both MIN_VALUE and
    // MAX_VALUE, the inverse representation is more economical.
    const UBool emitInverse = (UBool)(rangeCount > 1 &&
                                      getRangeStart(0) == MIN_VALUE &&
                                      getRangeEnd(rangeCount-1) == MAX_VALUE);
    if (emitInverse) {
        result.append(COMPLEMENT);
    }
    // Inverse form walks the gaps between neighbouring ranges, so it
    // starts at index 1; the default form walks the ranges themselves.
    for (int32_t idx = emitInverse ? 1 : 0; idx < rangeCount; ++idx) {
        UChar32 first;
        UChar32 last;
        if (emitInverse) {
            first = getRangeEnd(idx-1)+1;
            last = getRangeStart(idx)-1;
        } else {
            first = getRangeStart(idx);
            last = getRangeEnd(idx);
        }
        _appendToPat(result, first, escapeUnprintable);
        if (first == last) {
            continue;
        }
        // Two adjacent code points are written side by side, without
        // a hyphen between them.
        if ((first+1) != last) {
            result.append(HYPHEN);
        }
        _appendToPat(result, last, escapeUnprintable);
    }
    // Multi-character strings follow the ranges, each inside braces.
    int32_t strIdx = 0;
    while (strIdx < strings->size()) {
        const UnicodeString& str = *(const UnicodeString*) strings->elementAt(strIdx);
        result.append(OPEN_BRACE);
        _appendToPat(result, str, escapeUnprintable);
        result.append(CLOSE_BRACE);
        ++strIdx;
    }
    return result.append(SET_CLOSE);
 }
 //----------------------------------------------------------------
 // Implementation: Pattern parsing
 //----------------------------------------------------------------
--- a/icu4c/source/common/uprops.c
+++ b/icu4c/source/common/uprops.c
@ -142,30 +142,6 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
    return FALSE;
 }
 U_CAPI UBool U_EXPORT2
 uprv_isRuleWhiteSpace(UChar32 c) {
    /* "White space" in the sense of ICU rule parsers.
       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
       See UTR #31: http://www.unicode.org/reports/tr31/.
       Members: U+0009..U+000D, U+0020, U+0085, U+200E..U+200F,
       and U+2028..U+2029.
    */
    const UBool inLowRun = (UBool)(c >= 0x0009 && c <= 0x000D);
    const UBool isSingleton = (UBool)(c == 0x0020 || c == 0x0085 ||
                                      c == 0x200E || c == 0x200F);
    const UBool inHighRun = (UBool)(c >= 0x2028 && c <= 0x2029);
    return (UBool)(inLowRun || isSingleton || inHighRun);
 }
 /* The pattern "[[:Cf:][:WSpace:]]" written as numeric code units,
    followed by a terminating 0. */
 static const UChar _PATTERN[] = {
    91, 91, 58, 67, 102, 58, 93,              /* [[:Cf:]    */
    91, 58, 87, 83, 112, 97, 99, 101, 58, 93, /* [:WSpace:] */
    93, 0                                     /* ] and 0    */
 };
 /* Opens a set from _PATTERN via uset_openPattern().
    NOTE(review): this pattern is property based, while
    uprv_isRuleWhiteSpace() tests a fixed list -- confirm that the two
    are meant to describe the same characters. */
 U_CAPI USet* U_EXPORT2
 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    /* Number of array elements, not counting the trailing 0. */
    const int32_t patternLength =
        (int32_t)(sizeof(_PATTERN)/sizeof(_PATTERN[0])) - 1;
    return uset_openPattern(_PATTERN, patternLength, ec);
 }
 U_CAPI int32_t U_EXPORT2
 u_getIntPropertyValue(UChar32 c, UProperty which) {
    UErrorCode errorCode;
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@ -278,21 +278,6 @@ enum {
    ZWNBSP  =0xfeff
 };
 /**
 * Is this character a "white space" in the sense of ICU rule parsers?
 * The test is against a fixed list of code points:
 * U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029.
 * @param c the code point to test
 * @return non-zero if <code>c</code> is in the list, zero otherwise
 * @internal
 */
 U_CAPI UBool U_EXPORT2
 uprv_isRuleWhiteSpace(UChar32 c);
 /**
 * Get the set of "white space" characters in the sense of ICU rule
 * parsers.  Caller must close/delete result.
 * The set is opened from the pattern "[[:Cf:][:WSpace:]]" with
 * uset_openPattern().
 * @param ec error code, handed on to uset_openPattern()
 * @return the set returned by uset_openPattern()
 * @internal
 */
 U_CAPI USet* U_EXPORT2
 uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
 /**
 * Get the maximum length of a (regular/1.0/extended) character name.
 * @return 0 if no character names available.
--- a/icu4c/source/common/uset_imp.h
+++ b/icu4c/source/common/uset_imp.h
@ -48,4 +48,13 @@ typedef struct USetAdder USetAdder;
 U_CDECL_END
 /**
 * Get the set of "white space" characters in the sense of ICU rule
 * parsers.  Caller must close/delete result.
 * The set is opened from the pattern "[[:Cf:][:WSpace:]]" with
 * uset_openPattern().
 * @param ec error code, handed on to uset_openPattern()
 * @return the set returned by uset_openPattern()
 * @internal
 */
 U_CAPI USet* U_EXPORT2
 uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
 #endif
--- a/icu4c/source/common/uset_props.cpp
+++ b/icu4c/source/common/uset_props.cpp
@ -132,3 +132,15 @@ uset_toPattern(const USet* set,
    ((const UnicodeSet*) set)->toPattern(pat, escapeUnprintable);
    return pat.extract(result, resultCapacity, *ec);
 }
 // Opens a set from the pattern "[[:Cf:][:WSpace:]]" via uset_openPattern().
 // NOTE(review): this pattern is property based, while
 // uprv_isRuleWhiteSpace() tests a fixed list -- confirm that the two
 // are meant to describe the same characters.
 U_CAPI USet* U_EXPORT2
 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    // The pattern written as numeric code units, followed by a
    // terminating 0.
    static const UChar _PATTERN[] = {
        91, 91, 58, 67, 102, 58, 93,              /* [[:Cf:]    */
        91, 58, 87, 83, 112, 97, 99, 101, 58, 93, /* [:WSpace:] */
        93, 0                                     /* ] and 0    */
    };
    // Number of array elements, not counting the trailing 0.
    const int32_t patternLength =
        (int32_t)(sizeof(_PATTERN)/sizeof(_PATTERN[0])) - 1;
    return uset_openPattern(_PATTERN, patternLength, ec);
 }
--- a/icu4c/source/common/util.cpp
+++ b/icu4c/source/common/util.cpp
@ -606,4 +606,16 @@ void ICU_Utility::appendToRule(UnicodeString& rule,
    }
 }
 U_CAPI UBool U_EXPORT2
 uprv_isRuleWhiteSpace(UChar32 c) {
    /* "White space" in the sense of ICU rule parsers.
       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
       See UTR #31: http://www.unicode.org/reports/tr31/.
       Members: U+0009..U+000D, U+0020, U+0085, U+200E..U+200F,
       and U+2028..U+2029.
    */
    const UBool inLowRun = (UBool)(c >= 0x0009 && c <= 0x000D);
    const UBool isSingleton = (UBool)(c == 0x0020 || c == 0x0085 ||
                                      c == 0x200E || c == 0x200F);
    const UBool inHighRun = (UBool)(c >= 0x2028 && c <= 0x2029);
    return (UBool)(inLowRun || isSingleton || inHighRun);
 }
 //eof
--- a/icu4c/source/common/util.h
+++ b/icu4c/source/common/util.h
@ -231,6 +231,13 @@ private:
    ICU_Utility();
 };
 /**
 * Is this character a "white space" in the sense of ICU rule parsers?
 * The test is against a fixed list of code points:
 * U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029.
 * @param c the code point to test
 * @return non-zero if <code>c</code> is in the list, zero otherwise
 * @internal
 */
 U_CAPI UBool U_EXPORT2
 uprv_isRuleWhiteSpace(UChar32 c);
 U_NAMESPACE_END
 #endif
--- a/icu4c/source/i18n/decimfmt.cpp
+++ b/icu4c/source/i18n/decimfmt.cpp
@ -50,7 +50,7 @@
 #include "unicode/uchar.h"
 #include "unicode/curramt.h"
 #include "ucurrimp.h"
-#include "uprops.h"
+#include "util.h"
 #include "digitlst.h"
 #include "cmemory.h"
 #include "cstring.h"
--- a/icu4c/source/i18n/msgfmt.cpp
+++ b/icu4c/source/i18n/msgfmt.cpp
@ -34,7 +34,7 @@
 #include "unicode/rbnf.h"
 #include "ustrfmt.h"
 #include "cmemory.h"
-#include "uprops.h"
+#include "util.h"
 #include "uassert.h"
 // *****************************************************************************
--- a/icu4c/source/i18n/nfrs.cpp
+++ b/icu4c/source/i18n/nfrs.cpp
@ -25,7 +25,7 @@
 #include "cmemory.h"
 #endif
-#include "uprops.h"
+#include "util.h"
 U_NAMESPACE_BEGIN
--- a/icu4c/source/i18n/nfrule.cpp
+++ b/icu4c/source/i18n/nfrule.cpp
@ -25,7 +25,7 @@
 #include "nfrlist.h"
 #include "nfsubs.h"
-#include "uprops.h"
+#include "util.h"
 U_NAMESPACE_BEGIN
--- a/icu4c/source/i18n/rbnf.cpp
+++ b/icu4c/source/i18n/rbnf.cpp
@ -22,7 +22,7 @@
 #include "cmemory.h"
 #include "cstring.h"
-#include "uprops.h"
+#include "util.h"
 // debugging
 // #define DEBUG
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -21,7 +21,7 @@
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"
 #include "unicode/regex.h"
-#include "uprops.h"
+#include "util.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "uvectr32.h"
--- a/icu4c/source/i18n/smpdtfmt.cpp
+++ b/icu4c/source/i18n/smpdtfmt.cpp
@ -39,7 +39,7 @@
 #include "unicode/dcfmtsym.h"
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
-#include "uprops.h"
+#include "util.h"
 #include "gregoimp.h" 
 #include "cstring.h"
 #include "uassert.h"
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@ -28,7 +28,7 @@
 #include "ucol_tok.h"
 #include "cmemory.h"
-#include "uprops.h"
+#include "util.h"
 U_CDECL_BEGIN
 static int32_t U_EXPORT2 U_CALLCONV