ICU-4078 Fix for AIX with Visual Age 5 compiler, and make sure that the

disentanglement is correct by putting all the UnicodeSet virtual functions
in one file. Also move some of the rule whitespace handling into better
locations.

X-SVN-Rev: 16519
This commit is contained in:
George Rhoten 2004-10-15 22:56:26 +00:00
parent 3ac97089c4
commit e69fca9d5f
19 changed files with 240 additions and 214 deletions

View File

@ -8,6 +8,9 @@
**********************************************************************
*/
#ifndef CHARSTRING_H
#define CHARSTRING_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"
@ -78,4 +81,5 @@ inline CharString::~CharString() {
U_NAMESPACE_END
#endif
//eof

View File

@ -12,7 +12,7 @@
#include "unicode/parsepos.h"
#include "unicode/unistr.h"
#include "unicode/symtable.h"
#include "uprops.h"
#include "util.h"
U_NAMESPACE_BEGIN

View File

@ -534,7 +534,7 @@ public:
* U+000A, U+0020..U+007E.
* @stable ICU 2.0
*/
UnicodeString& toPattern(UnicodeString& result,
virtual UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable = FALSE) const;
/**

View File

@ -23,6 +23,24 @@
#include "uassert.h"
#include "hash.h"
// Define UChar constants using hex for EBCDIC compatibility: a numeric code
// unit value is the same on every platform, whereas a character literal
// would take on the value from the compiler's execution character set.
// Used #define to reduce private static exports and memory access time.
// Every constant below is written in hex so the list reads uniformly.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
// HIGH_VALUE > all valid values. 110000 for codepoints
#define UNICODESET_HIGH 0x0110000
@ -1645,4 +1663,172 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
pat.truncate(0);
}
/**
 * Append the <code>toPattern()</code> representation of every code
 * point of a string to the given buffer.
 *
 * @param buf the pattern text being built; it is appended to
 * @param s the string whose code points are appended, in order
 * @param escapeUnprintable forwarded unchanged to the single code
 * point overload of _appendToPat()
 */
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
                              escapeUnprintable) {
    int32_t pos = 0;
    while (pos < s.length()) {
        const UChar32 codePoint = s.char32At(pos);
        _appendToPat(buf, codePoint, escapeUnprintable);
        // Step over all code units that make up this code point.
        pos += UTF_CHAR_LENGTH(codePoint);
    }
}
/**
 * Append the <code>toPattern()</code> representation of a single
 * code point to the given buffer.
 *
 * @param buf the pattern text being built; it is appended to
 * @param c the code point to append
 * @param escapeUnprintable if TRUE and ICU_Utility::isUnprintable(c)
 * holds, ICU_Utility::escapeUnprintable() is asked to write c as a hex
 * escape (\uxxxx or \Uxxxxxxxx); when that call returns true nothing
 * else is appended
 */
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
                              escapeUnprintable) {
    if (escapeUnprintable && ICU_Utility::isUnprintable(c) &&
        ICU_Utility::escapeUnprintable(buf, c)) {
        return;
    }

    // Decide whether the code point must be preceded by a backslash.
    UBool needsBackslash;
    switch (c) {
    // Characters listed here are always backslash-escaped.
    case SET_OPEN:
    case SET_CLOSE:
    case HYPHEN:
    case COMPLEMENT:
    case INTERSECTION:
    case BACKSLASH:
    case OPEN_BRACE:
    case CLOSE_BRACE:
    case COLON:
    case SymbolTable::SYMBOL_REF:
        needsBackslash = TRUE;
        break;
    default:
        // Rule white space is escaped as well.
        needsBackslash = uprv_isRuleWhiteSpace(c);
        break;
    }

    if (needsBackslash) {
        buf.append(BACKSLASH);
    }
    buf.append(c);
}
/**
 * Append a string representation of this set to result. When a
 * pattern string is stored in <code>pat</code>, that text is copied
 * (optionally re-escaping unprintable code points); otherwise the
 * representation is produced by _generatePattern().
 *
 * @param result the string to append to; it is not cleared first
 * @param escapeUnprintable if TRUE, code points for which
 * ICU_Utility::isUnprintable() holds are written through
 * ICU_Utility::escapeUnprintable() instead of being copied
 * @return a reference to result
 */
UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                                      UBool escapeUnprintable) const {
    if (pat.length() <= 0) {
        // No stored pattern text: build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }

    // Length of the run of backslashes copied immediately before the
    // current position.
    int32_t pendingBackslashes = 0;
    int32_t pos = 0;
    while (pos < pat.length()) {
        const UChar32 ch = pat.char32At(pos);
        pos += UTF_CHAR_LENGTH(ch);

        if (!(escapeUnprintable && ICU_Utility::isUnprintable(ch))) {
            result.append(ch);
            pendingBackslashes = (ch == BACKSLASH) ? pendingBackslashes + 1 : 0;
            continue;
        }

        // An odd run of backslashes means the last one was escaping
        // this unprintable code point; drop it before writing the hex
        // escape in its place.
        if ((pendingBackslashes % 2) == 1) {
            result.truncate(result.length() - 1);
        }
        ICU_Utility::escapeUnprintable(result, ch);
        pendingBackslashes = 0;
    }
    return result;
}
/**
 * Returns a string representation of this set. If the result of
 * calling this function is passed to a UnicodeSet constructor, it
 * will produce another set that is equal to this one.
 *
 * @param result receives the pattern; any previous contents are
 * discarded before the pattern is written
 * @param escapeUnprintable forwarded unchanged to _toPattern()
 * @return a reference to result
 */
UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
                                     UBool escapeUnprintable) const
{
    // _toPattern() only appends, so start from an empty string.
    result.truncate(0);
    UnicodeString& filled = _toPattern(result, escapeUnprintable);
    return filled;
}
/**
 * Generate and append a string representation of this set to result.
 * This does not use this.pat, the cleaned up copy of the string
 * passed to applyPattern(); the text is built from the ranges and
 * strings the set currently holds.
 *
 * @param result the string to append to; it is not cleared first
 * @param escapeUnprintable forwarded unchanged to _appendToPat()
 * @return a reference to result
 */
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                            UBool escapeUnprintable) const {
    result.append(SET_OPEN);

    const int32_t rangeCount = getRangeCount();

    // If the set contains at least 2 intervals and includes both
    // MIN_VALUE and MAX_VALUE, the gaps between the ranges are emitted
    // after a COMPLEMENT sign instead of the ranges themselves.
    const UBool emitInverse = (rangeCount > 1 &&
                               getRangeStart(0) == MIN_VALUE &&
                               getRangeEnd(rangeCount-1) == MAX_VALUE);
    if (emitInverse) {
        result.append(COMPLEMENT);
    }

    // In inverse mode entry r describes the gap between range r-1 and
    // range r, so iteration starts at 1.
    for (int32_t r = emitInverse ? 1 : 0; r < rangeCount; ++r) {
        UChar32 first;
        UChar32 last;
        if (emitInverse) {
            first = getRangeEnd(r-1)+1;
            last = getRangeStart(r)-1;
        } else {
            first = getRangeStart(r);
            last = getRangeEnd(r);
        }

        _appendToPat(result, first, escapeUnprintable);
        if (first != last) {
            // Two adjacent code points are written side by side; a
            // hyphen is only used for spans of three or more.
            if ((first+1) != last) {
                result.append(HYPHEN);
            }
            _appendToPat(result, last, escapeUnprintable);
        }
    }

    // Each multi-character string is written inside braces.
    for (int32_t k = 0; k<strings->size(); ++k) {
        result.append(OPEN_BRACE);
        _appendToPat(result,
                     *(const UnicodeString*) strings->elementAt(k),
                     escapeUnprintable);
        result.append(CLOSE_BRACE);
    }
    return result.append(SET_CLOSE);
}
U_NAMESPACE_END

View File

@ -397,171 +397,6 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
resemblesPropertyPattern(pattern, pos);
}
/**
 * Append the <code>toPattern()</code> representation of every code
 * point of a string to the given buffer.
 *
 * @param buf the pattern text being built; it is appended to
 * @param s the string whose code points are appended, in order
 * @param escapeUnprintable forwarded unchanged to the single code
 * point overload of _appendToPat()
 */
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
    int32_t offset = 0;
    while (offset < s.length()) {
        const UChar32 current = s.char32At(offset);
        _appendToPat(buf, current, escapeUnprintable);
        // Advance past every code unit of the code point just handled.
        offset += UTF_CHAR_LENGTH(current);
    }
}
/**
 * Append the <code>toPattern()</code> representation of a single
 * code point to the given buffer.
 *
 * @param buf the pattern text being built; it is appended to
 * @param c the code point to append
 * @param escapeUnprintable if TRUE and ICU_Utility::isUnprintable(c)
 * holds, ICU_Utility::escapeUnprintable() is asked to write c as a hex
 * escape (\uxxxx or \Uxxxxxxxx); when that call returns true nothing
 * else is appended
 */
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
    if (escapeUnprintable && ICU_Utility::isUnprintable(c) &&
        ICU_Utility::escapeUnprintable(buf, c)) {
        return;
    }

    // Decide whether the code point must be preceded by a backslash.
    UBool mustEscape;
    switch (c) {
    // Characters listed here are always backslash-escaped.
    case SET_OPEN:
    case SET_CLOSE:
    case HYPHEN:
    case COMPLEMENT:
    case INTERSECTION:
    case BACKSLASH:
    case 0x007B: /* { */
    case 0x007D: /* } */
    case SymbolTable::SYMBOL_REF:
    case COLON:
        mustEscape = TRUE;
        break;
    default:
        // Rule white space is escaped as well.
        mustEscape = uprv_isRuleWhiteSpace(c);
        break;
    }

    if (mustEscape) {
        buf.append(BACKSLASH);
    }
    buf.append(c);
}
/**
 * Returns a string representation of this set. If the result of
 * calling this function is passed to a UnicodeSet constructor, it
 * will produce another set that is equal to this one.
 *
 * @param result receives the pattern; any previous contents are
 * discarded before the pattern is written
 * @param escapeUnprintable forwarded unchanged to _toPattern()
 * @return a reference to result
 */
UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
                                     UBool escapeUnprintable) const
{
    // _toPattern() only appends, so start from an empty string.
    result.truncate(0);
    UnicodeString& out = _toPattern(result, escapeUnprintable);
    return out;
}
/**
 * Append a string representation of this set to result. When a
 * pattern string is stored in <code>pat</code>, that text is copied
 * (optionally re-escaping unprintable code points); otherwise the
 * representation is produced by _generatePattern().
 *
 * @param result the string to append to; it is not cleared first
 * @param escapeUnprintable if TRUE, code points for which
 * ICU_Utility::isUnprintable() holds are written through
 * ICU_Utility::escapeUnprintable() instead of being copied
 * @return a reference to result
 */
UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                                      UBool escapeUnprintable) const {
    if (pat.length() <= 0) {
        // No stored pattern text: build one from the set contents.
        return _generatePattern(result, escapeUnprintable);
    }

    // Length of the run of backslashes copied immediately before the
    // current position.
    int32_t slashRun = 0;
    int32_t offset = 0;
    while (offset < pat.length()) {
        const UChar32 current = pat.char32At(offset);
        offset += UTF_CHAR_LENGTH(current);

        if (!(escapeUnprintable && ICU_Utility::isUnprintable(current))) {
            result.append(current);
            slashRun = (current == BACKSLASH) ? slashRun + 1 : 0;
            continue;
        }

        // An odd run of backslashes means the last one was escaping
        // this unprintable code point; drop it before writing the hex
        // escape in its place.
        if ((slashRun % 2) == 1) {
            result.truncate(result.length() - 1);
        }
        ICU_Utility::escapeUnprintable(result, current);
        slashRun = 0;
    }
    return result;
}
/**
 * Generate and append a string representation of this set to result.
 * This does not use this.pat, the cleaned up copy of the string
 * passed to applyPattern(); the text is built from the ranges and
 * strings the set currently holds.
 *
 * @param result the string to append to; it is not cleared first
 * @param escapeUnprintable forwarded unchanged to _appendToPat()
 * @return a reference to result
 */
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                            UBool escapeUnprintable) const {
    result.append(SET_OPEN);

    const int32_t total = getRangeCount();

    // If the set contains at least 2 intervals and includes both
    // MIN_VALUE and MAX_VALUE, the gaps between the ranges are emitted
    // after a COMPLEMENT sign instead of the ranges themselves.
    const UBool inverted = (total > 1 &&
                            getRangeStart(0) == MIN_VALUE &&
                            getRangeEnd(total-1) == MAX_VALUE);
    if (inverted) {
        result.append(COMPLEMENT);
    }

    // In inverse mode entry idx describes the gap between range idx-1
    // and range idx, so iteration starts at 1.
    for (int32_t idx = inverted ? 1 : 0; idx < total; ++idx) {
        UChar32 lo;
        UChar32 hi;
        if (inverted) {
            lo = getRangeEnd(idx-1)+1;
            hi = getRangeStart(idx)-1;
        } else {
            lo = getRangeStart(idx);
            hi = getRangeEnd(idx);
        }

        _appendToPat(result, lo, escapeUnprintable);
        if (lo != hi) {
            // Two adjacent code points are written side by side; a
            // hyphen is only used for spans of three or more.
            if ((lo+1) != hi) {
                result.append(HYPHEN);
            }
            _appendToPat(result, hi, escapeUnprintable);
        }
    }

    // Each multi-character string is written inside braces.
    for (int32_t sIdx = 0; sIdx<strings->size(); ++sIdx) {
        result.append(OPEN_BRACE);
        _appendToPat(result,
                     *(const UnicodeString*) strings->elementAt(sIdx),
                     escapeUnprintable);
        result.append(CLOSE_BRACE);
    }
    return result.append(SET_CLOSE);
}
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------

View File

@ -142,30 +142,6 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
return FALSE;
}
/*
 * Tests whether c is "white space" in the sense of ICU rule parsers.
 *
 * This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 * See UTR #31: http://www.unicode.org/reports/tr31/.
 * The list is U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and
 * U+2028..U+2029.
 *
 * Returns non-zero for a code point in the list, zero otherwise.
 */
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c) {
    /* Everything outside the overall span of the list is rejected first. */
    if (c < 0x0009 || c > 0x2029) {
        return FALSE;
    }
    /* Inside the span: the low run, three single values, and the high run. */
    return (UBool)(c <= 0x000D ||
                   c == 0x0020 ||
                   c == 0x0085 ||
                   c == 0x200E ||
                   c == 0x200F ||
                   c >= 0x2028);
}
/*
 * The pattern "[[:Cf:][:WSpace:]]" spelled out as code unit values,
 * followed by a terminating 0 that is not part of the pattern length.
 */
static const UChar _PATTERN[] = {
    0x5B,                                                        /* [         */
    0x5B, 0x3A, 0x43, 0x66, 0x3A, 0x5D,                          /* [:Cf:]    */
    0x5B, 0x3A, 0x57, 0x53, 0x70, 0x61, 0x63, 0x65, 0x3A, 0x5D,  /* [:WSpace:] */
    0x5D,                                                        /* ]         */
    0
};

/*
 * Opens a new set from _PATTERN via uset_openPattern(); ec is handed
 * through to that call. The caller owns the returned set.
 */
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    /* Number of code units in the pattern, excluding the terminator. */
    const int32_t patternLength =
        (int32_t)(sizeof(_PATTERN)/sizeof(_PATTERN[0])) - 1;
    return uset_openPattern(_PATTERN, patternLength, ec);
}
U_CAPI int32_t U_EXPORT2
u_getIntPropertyValue(UChar32 c, UProperty which) {
UErrorCode errorCode;

View File

@ -278,21 +278,6 @@ enum {
ZWNBSP =0xfeff
};
/**
* Is this character a "white space" in the sense of ICU rule parsers?
* The answer comes from a fixed list of code points (U+0009..U+000D,
* U+0020, U+0085, U+200E..U+200F, U+2028..U+2029), not from Unicode
* property data.
* @param c the code point to test
* @return TRUE if c is in the list, FALSE otherwise
* @internal
*/
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c);
/**
* Get the set of "white space" characters in the sense of ICU rule
* parsers. Caller must close/delete result.
* The set is opened with uset_openPattern() from the pattern
* "[[:Cf:][:WSpace:]]".
* @param ec error code, handed through to uset_openPattern()
* @return the newly opened set, owned by the caller
* @internal
*/
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
/**
* Get the maximum length of a (regular/1.0/extended) character name.
* @return 0 if no character names available.

View File

@ -48,4 +48,13 @@ typedef struct USetAdder USetAdder;
U_CDECL_END
/**
* Get the set of "white space" characters in the sense of ICU rule
* parsers. Caller must close/delete result.
* The set is opened with uset_openPattern() from the pattern
* "[[:Cf:][:WSpace:]]".
* @param ec error code, handed through to uset_openPattern()
* @return the newly opened set, owned by the caller
* @internal
*/
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
#endif

View File

@ -132,3 +132,15 @@ uset_toPattern(const USet* set,
((const UnicodeSet*) set)->toPattern(pat, escapeUnprintable);
return pat.extract(result, resultCapacity, *ec);
}
/*
 * Opens a new set from the pattern "[[:Cf:][:WSpace:]]" via
 * uset_openPattern(); ec is handed through to that call. The caller
 * owns the returned set.
 * NOTE(review): this pattern is not the same fixed list that
 * uprv_isRuleWhiteSpace() tests - confirm that the difference is intended.
 */
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    /* Pattern as code unit values; the trailing 0 is a terminator. */
    static const UChar kRuleWhiteSpacePattern[] = {
        0x5B,                                                        /* [         */
        0x5B, 0x3A, 0x43, 0x66, 0x3A, 0x5D,                          /* [:Cf:]    */
        0x5B, 0x3A, 0x57, 0x53, 0x70, 0x61, 0x63, 0x65, 0x3A, 0x5D,  /* [:WSpace:] */
        0x5D,                                                        /* ]         */
        0
    };
    /* Number of code units in the pattern, excluding the terminator. */
    const int32_t patternLength =
        (int32_t)(sizeof(kRuleWhiteSpacePattern)/sizeof(kRuleWhiteSpacePattern[0])) - 1;
    return uset_openPattern(kRuleWhiteSpacePattern, patternLength, ec);
}

View File

@ -606,4 +606,16 @@ void ICU_Utility::appendToRule(UnicodeString& rule,
}
}
/*
 * Tests whether c is "white space" in the sense of ICU rule parsers.
 *
 * This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 * See UTR #31: http://www.unicode.org/reports/tr31/.
 * The list is U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and
 * U+2028..U+2029.
 *
 * Returns non-zero for a code point in the list, zero otherwise.
 */
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c) {
    /* Everything outside the overall span of the list is rejected first. */
    if (c < 0x0009 || c > 0x2029) {
        return FALSE;
    }
    /* Inside the span: the low run, three single values, and the high run. */
    return (UBool)(c <= 0x000D ||
                   c == 0x0020 ||
                   c == 0x0085 ||
                   c == 0x200E ||
                   c == 0x200F ||
                   c >= 0x2028);
}
//eof

View File

@ -231,6 +231,13 @@ private:
ICU_Utility();
};
/**
* Is this character a "white space" in the sense of ICU rule parsers?
* The answer comes from a fixed list of code points (U+0009..U+000D,
* U+0020, U+0085, U+200E..U+200F, U+2028..U+2029), not from Unicode
* property data.
* @param c the code point to test
* @return TRUE if c is in the list, FALSE otherwise
* @internal
*/
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c);
U_NAMESPACE_END
#endif

View File

@ -50,7 +50,7 @@
#include "unicode/uchar.h"
#include "unicode/curramt.h"
#include "ucurrimp.h"
#include "uprops.h"
#include "util.h"
#include "digitlst.h"
#include "cmemory.h"
#include "cstring.h"

View File

@ -34,7 +34,7 @@
#include "unicode/rbnf.h"
#include "ustrfmt.h"
#include "cmemory.h"
#include "uprops.h"
#include "util.h"
#include "uassert.h"
// *****************************************************************************

View File

@ -25,7 +25,7 @@
#include "cmemory.h"
#endif
#include "uprops.h"
#include "util.h"
U_NAMESPACE_BEGIN

View File

@ -25,7 +25,7 @@
#include "nfrlist.h"
#include "nfsubs.h"
#include "uprops.h"
#include "util.h"
U_NAMESPACE_BEGIN

View File

@ -22,7 +22,7 @@
#include "cmemory.h"
#include "cstring.h"
#include "uprops.h"
#include "util.h"
// debugging
// #define DEBUG

View File

@ -21,7 +21,7 @@
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "uprops.h"
#include "util.h"
#include "cmemory.h"
#include "cstring.h"
#include "uvectr32.h"

View File

@ -39,7 +39,7 @@
#include "unicode/dcfmtsym.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "uprops.h"
#include "util.h"
#include "gregoimp.h"
#include "cstring.h"
#include "uassert.h"

View File

@ -28,7 +28,7 @@
#include "ucol_tok.h"
#include "cmemory.h"
#include "uprops.h"
#include "util.h"
U_CDECL_BEGIN
static int32_t U_EXPORT2 U_CALLCONV