diff --git a/icu4c/source/common/convert.cpp b/icu4c/source/common/convert.cpp index 206f600870..0e8557a29d 100644 --- a/icu4c/source/common/convert.cpp +++ b/icu4c/source/common/convert.cpp @@ -142,7 +142,7 @@ UnicodeConverterCPP::fromUnicodeString(char* target, mySourceLength = source.length(); - mySource = source.getUChars(); + mySource = source.getArrayStart(); myTarget = target; ucnv_fromUnicode(&myConverter, &myTarget, diff --git a/icu4c/source/common/cpputils.cpp b/icu4c/source/common/cpputils.cpp index 2d19bb9ada..9cb0136d09 100644 --- a/icu4c/source/common/cpputils.cpp +++ b/icu4c/source/common/cpputils.cpp @@ -13,26 +13,32 @@ * Simple utility to set output buffer parameters ******************************************************/ void T_fillOutputParams(const UnicodeString* temp, - UChar* result, - const int32_t resultLength, - int32_t* resultLengthOut, - UErrorCode* status) + UChar* result, + const int32_t resultLength, + int32_t* resultLengthOut, + UErrorCode* status) { - - const int32_t actual = temp->length(); - const bool_t overflowed = actual >= resultLength; - const int32_t returnedSize = uprv_min(actual, resultLength-1); - if ((temp->length() < resultLength) && (result != temp->getUChars()) && (returnedSize > 0)) { - u_strcpy(result, temp->getUChars()); - } - + int32_t actual = temp->length(); + if (resultLength > 0) { - result[returnedSize] = 0; - } - if (resultLengthOut) { - *resultLengthOut = actual; - if (U_SUCCESS(*status) && overflowed) { - *status = U_BUFFER_OVERFLOW_ERROR; + // copy the contents; extract() will check if it needs to copy anything at all + temp->extract(0, resultLength - 1, result, 0); + + // zero-terminate the result buffer + if (actual < resultLength) { + result[actual] = 0; + } else { + result[resultLength - 1] = 0; } } + + // set the output length to the actual string length + if (resultLengthOut != 0) { + *resultLengthOut = actual; + } + + // set the error code according to the necessary buffer length + if (actual >= 
resultLength && U_SUCCESS(*status)) { + *status = U_BUFFER_OVERFLOW_ERROR; + } } diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index a3699b9941..0d4fa66620 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -23,7 +23,7 @@ * getLanguagesForCountry() * 03/16/99 bertrand rehaul. * 07/21/99 stephen Added U_CFUNC setDefault -* 11/09/99 weiv Added const char * getName() const; +* 11/09/99 weiv Added const char * getName() const; ******************************************************************************* */ @@ -322,32 +322,7 @@ void Locale::setHashCode() { UnicodeString fullNameUString(language, ""); - fullNameUString += UnicodeString(country, ""); - fullNameUString += UnicodeString(variant, ""); - const UChar *key = fullNameUString.getUChars(); - int32_t len = fullNameUString.length(); - int32_t hash = 0; - const UChar *limit = key + len; - int32_t inc = (len >= 128 ? len/64 : 1); - - /* - We compute the hash by iterating sparsely over 64 (at most) characters - spaced evenly through the string. For each character, we multiply the - previous hash value by a prime number and add the new character in, - in the manner of a additive linear congruential random number generator, - thus producing a pseudorandom deterministic value which should be well - distributed over the output range. [LIU] - */ - - while(key < limit) - { - hash = (hash * 37) + (char)*key; - key += inc; - } - - if(hash == 0) hash = 1; - - khashCode = hash & 0x7FFFFFFF; + khashCode = fullNameUString.append(UnicodeString(country, "")).append(UnicodeString(variant, "")).hashCode(); } @@ -753,13 +728,15 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) // lookups. 
if(ctry2LangMapping == 0) { UErrorCode err = U_ZERO_ERROR; - UHashtable *temp = uhash_open(uhash_hashUChars, uhash_compareUChars, &err); + UHashtable *temp = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &err); if (U_FAILURE(err)) { - count = 0; - return NULL; + count = 0; + return NULL; } - + + uhash_setKeyDeleter(temp, uhash_deleteUnicodeString); + int32_t i = 0; int32_t j; int32_t count = sizeof(compressedCtry2LangMapping) / sizeof(compressedCtry2LangMapping[0]); @@ -768,15 +745,15 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) compressedCtry2LangMapping.extractBetween(i, i + 2, key); i += 2; for(j = i; j < count; j += 2) - if(Unicode::isUpperCase(compressedCtry2LangMapping[j])) - break; + if(Unicode::isUpperCase(compressedCtry2LangMapping[j])) + break; UnicodeString compressedValues; compressedCtry2LangMapping.extractBetween(i, j, compressedValues); UnicodeString *values = new UnicodeString[compressedValues.length() / 2]; int32_t valLen = sizeof(values) / sizeof(values[0]); for (int32_t k = 0; k < valLen; ++k) - compressedValues.extractBetween(k * 2, (k * 2) + 2, values[k]); - uhash_put(temp, (void*)key.getUChars(), values, &err); + compressedValues.extractBetween(k * 2, (k * 2) + 2, values[k]); + uhash_put(temp, new UnicodeString(key), values, &err); i = j; } @@ -786,9 +763,8 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) else ctry2LangMapping = temp; } - - const UnicodeString *result = (const UnicodeString*) - uhash_get(ctry2LangMapping, country.getUChars()); + + const UnicodeString *result = (const UnicodeString*)uhash_get(ctry2LangMapping, &country); if(result == 0) count = 0; else diff --git a/icu4c/source/common/uloc.c b/icu4c/source/common/uloc.c index d3bf654b18..0dba9e43c8 100644 --- a/icu4c/source/common/uloc.c +++ b/icu4c/source/common/uloc.c @@ -36,7 +36,7 @@ /* UnicodeString stuff */ typedef struct UnicodeString UnicodeString; -U_CAPI const UChar* 
T_UnicodeString_getUChars(const UnicodeString *s); +U_CFUNC int32_t T_UnicodeString_length(const UnicodeString *s); U_CAPI int32_t T_UnicodeString_extract(const UnicodeString *s, char *dst); @@ -1051,7 +1051,7 @@ void _lazyEvaluate_installedLocales() for (i = 0; i < _installedLocalesCount; i++) { - strSize = u_strlen(T_UnicodeString_getUChars(temp[i])); + strSize = T_UnicodeString_length(temp[i]); temp2[i] = (char*) uprv_malloc(sizeof(char) * (strSize + 1)); diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 0b95c12ee7..977976ebeb 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -27,12 +27,14 @@ #include "unicode/ucnv.h" #include "unicode/rep.h" -// Size of stack buffer for small strings -#define US_STACKBUF_SIZE 10 - class Locale; class UCharReference; class UnicodeStringStreamer; +class UnicodeConverterCPP; + +// for unistrm.h +class ostream; +U_COMMON_API ostream &operator<<(ostream& stream, const UnicodeString& s); /** * Unicode String literals in C++. @@ -59,9 +61,43 @@ class UnicodeStringStreamer; #endif /** - * UnicodeString is a concrete implementation of the abstract class - * UnicodeText. UnicodeString performs codeset conversion from char* - * data based on the type of data specified. + * UnicodeString is a concrete implementation of the abstract class Replaceable. + * It is a string class that stores Unicode characters directly and provides + * similar functionality as the Java string class. + * + * UnicodeString uses four storage models: + *
dst
itself as an external buffer,
+ * then extract() will not copy the contents.
+ *
* @param start offset of first character which will be copied into the array
* @param length the number of characters to extract
* @param dst array in which to copy characters. The length of dst
@@ -841,23 +880,6 @@ public:
*/
inline UnicodeString& setTo(const UnicodeString& srcText);
- /**
- * Set the characters in the UnicodeString object to the characters
- * in srcChars in the range
- * [srcStart, srcStart + srcLength).
- * srcChars is not modified.
- * @param srcChars the source for the new characters
- * @param srcStart the offset into srcChars where new characters
- * will be obtained
- * @param srcLength the number of characters in srcChars in the
- * replace string
- * @return a reference to this
- * @stable
- */
- inline UnicodeString& setTo(const UChar *srcChars,
- UTextOffset srcStart,
- int32_t srcLength);
-
/**
* Set the characters in the UnicodeString object to the characters
* in srcChars. srcChars is not modified.
@@ -879,6 +901,51 @@ public:
*/
UnicodeString& setTo(UChar srcChar);
+ /**
+ * Aliasing setTo() function, analogous to the readonly-aliasing UChar* constructor.
+ * The text will be used for the UnicodeString object, but
+ * it will not be released when the UnicodeString is destroyed.
+ * This has copy-on-write semantics:
+ * When the string is modified, then the buffer is first copied into
+ * newly allocated memory.
+ * The aliased buffer is never modified.
+ * In an assignment to another UnicodeString, the text will be aliased again,
+ * so that both strings then alias the same readonly-text.
+ *
+ * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
+ * This must be true if <code>textLength==-1</code>.
+ * @param text The characters to alias for the UnicodeString.
+ * @param textLength The number of Unicode characters in <code>text</code> to alias.
+ * If -1, then this function will determine the length
+ * by calling <code>u_strlen()</code>.
+ * @draft
+ */
+ UnicodeString &setTo(bool_t isTerminated,
+ const UChar *text,
+ int32_t textLength);
+
+ /**
+ * Aliasing setTo() function, analogous to the writeable-aliasing UChar* constructor.
+ * The text will be used for the UnicodeString object, but
+ * it will not be released when the UnicodeString is destroyed.
+ * This has write-through semantics:
+ * For as long as the capacity of the buffer is sufficient, write operations
+ * will directly affect the buffer. When more capacity is necessary, then
+ * a new buffer will be allocated and the contents copied as with regularly
+ * constructed strings.
+ * In an assignment to another UnicodeString, the buffer will be copied.
+ * The extract(UChar *dst) function detects whether the dst pointer is the same
+ * as the string buffer itself and will in this case not copy the contents.
+ *
+ * @param buffer The characters to alias for the UnicodeString.
+ * @param buffLength The number of Unicode characters in <code>buffer</code> to alias.
+ * @param buffCapacity The size of <code>buffer</code> in UChars.
+ * @draft
+ */
+ UnicodeString &setTo(UChar *buffer,
+ int32_t buffLength,
+ int32_t buffCapacity);
+
/**
* Set the character at the specified offset to the specified character.
* @param offset A valid offset into the text of the character to set
@@ -1426,12 +1493,15 @@ public:
int32_t textLength);
/**
- * Aliasing UChar* constructor.
- * The text will be used for the new UnicodeString object, but
+ * Readonly-aliasing UChar* constructor.
+ * The text will be used for the UnicodeString object, but
* it will not be released when the UnicodeString is destroyed.
- * Be careful not to attempt to modify the contents of the UnicodeString
- * if the text is read-only. Operations that allocate an entirely
- * new buffer are harmless.
+ * This has copy-on-write semantics:
+ * When the string is modified, then the buffer is first copied into
+ * newly allocated memory.
+ * The aliased buffer is never modified.
+ * In an assignment to another UnicodeString, the text will be aliased again,
+ * so that both strings then alias the same readonly-text.
*
* @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
* This must be true if <code>textLength==-1</code>.
@@ -1445,6 +1515,26 @@ public:
UChar *text,
int32_t textLength);
+ /**
+ * Writeable-aliasing UChar* constructor.
+ * The text will be used for the UnicodeString object, but
+ * it will not be released when the UnicodeString is destroyed.
+ * This has write-through semantics:
+ * For as long as the capacity of the buffer is sufficient, write operations
+ * will directly affect the buffer. When more capacity is necessary, then
+ * a new buffer will be allocated and the contents copied as with regularly
+ * constructed strings.
+ * In an assignment to another UnicodeString, the buffer will be copied.
+ * The extract(UChar *dst) function detects whether the dst pointer is the same
+ * as the string buffer itself and will in this case not copy the contents.
+ *
+ * @param buffer The characters to alias for the UnicodeString.
+ * @param buffLength The number of Unicode characters in <code>buffer</code> to alias.
+ * @param buffCapacity The size of <code>buffer</code> in UChars.
+ * @draft
+ */
+ UnicodeString(UChar *buffer, int32_t buffLength, int32_t buffCapacity);
+
/**
* char* constructor.
* @param codepageData an array of bytes, null-terminated
@@ -1480,7 +1570,7 @@ public:
* @param that The UnicodeString object to copy.
* @stable
*/
- inline UnicodeString(const UnicodeString& that);
+ UnicodeString(const UnicodeString& that);
/** Destructor.
* @stable
@@ -1519,24 +1609,6 @@ public:
UCharReference operator[] (UTextOffset pos);
- // {sfb} remove these later?
- /* Hack to avoid circular dependencies */
-
- /**
- * Convert the characters in this to UPPER CASE following the conventions of
- * the default locale.
- * @retrurn A reference to this.
- */
- // UnicodeString& toUpper();
-
- /**
- * Convert the characters in this to lower case following the conventions of
- * the default locale.
- * @retrurn A reference to this.
- */
- // UnicodeString& toLower();
-
-
//========================================
// Implementation methods
//========================================
@@ -1575,9 +1647,6 @@ private:
inline UChar doCharAt(UTextOffset offset) const;
- UnicodeString& doSetCharAt(UTextOffset offset,
- UChar c);
-
UnicodeString& doReplace(UTextOffset start,
int32_t length,
const UnicodeString& srcText,
@@ -1603,6 +1672,15 @@ private:
// get the "real" capacity of the array, adjusted for ref count
inline int32_t getCapacity(void) const;
+ // allocate the array; result may be fStackBuffer
+ // sets refCount to 1 if appropriate
+ // sets fArray, fCapacity, and fFlags
+ // returns boolean for success or failure
+ bool_t allocate(int32_t capacity);
+
+ // release the array if owned
+ inline void releaseArray();
+
// utility method to get around lack of exception handling
void setToBogus(void);
@@ -1621,36 +1699,53 @@ private:
* subset ("invariant characters") of the platform encoding. See utypes.h.
*/
void doCodepageCreate(const char *codepageData,
- int32_t dataLength,
- const char *codepage);
+ int32_t dataLength,
+ const char *codepage);
- // clones array if refCount > 1
- void cloneArrayIfNeeded(void);
+ /*
+ * This function is called when write access to the array
+ * is necessary.
+ *
+ * We need to make a copy of the array if
+ * the buffer is read-only, or
+ * the buffer is refCounted (shared), and refCount>1, or
+ * the buffer is too small.
+ *
+ * Return FALSE if memory could not be allocated.
+ */
+ bool_t cloneArrayIfNeeded(int32_t newCapacity = -1,
+ int32_t growCapacity = -1,
+ bool_t doCopyArray = TRUE,
+ int32_t **pBufferToDelete = 0);
// ref counting
- inline uint16_t addRef(void);
- inline uint16_t removeRef(void);
- inline uint16_t refCount(void) const;
- inline uint16_t setRefCount(uint16_t count);
-
- UChar fStackBuffer [ US_STACKBUF_SIZE ]; // buffer for small strings
- UChar *fArray; // the Unicode data
- int32_t fLength; // number characters in fArray
- int32_t fCapacity; // sizeof fArray
- int32_t fHashCode; // the hash code
- bool_t fRefCounted; // indicates if we own storage
- bool_t fBogus; // indicates if an operation failed
+ inline int32_t addRef(void);
+ inline int32_t removeRef(void);
+ inline int32_t refCount(void) const;
+ inline int32_t setRefCount(int32_t count);
// constants
- static const UChar fgInvalidUChar; // invalid UChar index
- static const int32_t kGrowSize; // grow size for this buffer
- static const int32_t kInvalidHashCode; // invalid hash code
- static const int32_t kEmptyHashCode; // hash code for empty string
-
+ enum {
+ US_STACKBUF_SIZE=9, // Size of stack buffer for small strings
+ kInvalidUChar=0xffff, // invalid UChar index
+ kGrowSize=128, // grow size for this buffer
+ kInvalidHashCode=0, // invalid hash code
+ kEmptyHashCode=1, // hash code for empty string
+
+ // bit flag values for fFlags
+ kIsBogus=1, // this string is bogus, i.e., not valid
+ kUsingStackBuffer=2, // fArray==fStackBuffer
+ kRefCounted=4, // there is a refCount field before the characters in fArray
+ kBufferIsReadonly=8, // do not write to this buffer
+
+ // combined values for convenience
+ kShortString=kUsingStackBuffer,
+ kLongString=kRefCounted,
+ kReadonlyAlias=kBufferIsReadonly,
+ kWriteableAlias=0
+ };
+
// statics
- inline static int32_t allocation(int32_t minSize); // allocation algorithm
- inline static UChar* allocate(int32_t minSize, // allocate buffer >= minSize
- int32_t& actualSize);
// default converter cache
static UConverter* getDefaultConverter(UErrorCode& status);
@@ -1659,6 +1754,27 @@ private:
static UConverter *fgDefaultConverter;
friend class UnicodeStringStreamer;
+ friend class UnicodeConverterCPP;
+ friend U_COMMON_API ostream &operator<<(ostream& stream, const UnicodeString& s);
+
+ /*
+ * The following are all the class fields that are stored
+ * in each UnicodeString object.
+ * Note that UnicodeString has virtual functions,
+ * therefore there is an implicit vtable pointer
+ * as the first real field.
+ * The fields should be aligned such that no padding is
+ * necessary, mostly by having larger types first.
+ * On 32-bit machines, the size should be 40 bytes,
+ * on 64-bit machines (8-byte pointers), it should be 48 bytes.
+ */
+ // (implicit) *vtable;
+ UChar *fArray; // the Unicode data
+ int32_t fLength; // number characters in fArray
+ int32_t fCapacity; // sizeof fArray
+ int32_t fHashCode; // the hash code
+ uint16_t fFlags; // bit flags: see constants above
+ UChar fStackBuffer [ US_STACKBUF_SIZE ]; // buffer for small strings
public:
@@ -1684,19 +1800,11 @@ public:
/* @deprecated */
inline void operator delete(void *location);
-
//========================================
// Non-public API - will be removed!
//========================================
- /* @deprecated */
- UnicodeString(UChar *buff, int32_t bufLength, int32_t buffCapacity);
- /* @deprecated */
- const UChar* getUChars(void) const;
- /* @deprecated */
- inline const UChar* getUniChars(void) const;
- /* @deprecated */
- UChar* orphanStorage(void);
-
+ /* @deprecated */
+ const UChar* getUChars() const;
};
//========================================
@@ -1716,12 +1824,6 @@ uprv_arrayCopy(const UnicodeString *src, int32_t srcStart,
//========================================
// Inline members
//========================================
-inline
-UnicodeString::UnicodeString(const UnicodeString& that)
- : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE), fHashCode(kEmptyHashCode), fBogus(FALSE)
-{ *this = that; }
-
//========================================
// Read-only alias methods
@@ -2089,10 +2191,10 @@ UnicodeString::extractBetween(UTextOffset start,
inline UChar
UnicodeString::doCharAt(UTextOffset offset) const
{
- if(offset < 0 || offset >= fLength)
- return fgInvalidUChar;
- // in ref-counted implementation, first char is ref count
- return fArray[ fRefCounted ? offset + 1 : offset ];
+ if(offset < 0 || offset >= fLength) { // out-of-range offsets are not an error, they yield a sentinel
+ return kInvalidUChar; // 0xffff, see the enum of constants in the class
+ }
+ return fArray[ offset ]; // no index adjustment: fArray[0] is the first character, not a ref count
}
inline UChar
@@ -2136,11 +2238,13 @@ inline UnicodeString&
UnicodeString::setTo(const UnicodeString& srcText)
{ return doReplace(0, fLength, srcText, 0, srcText.fLength); }
+#if 0
inline UnicodeString&
UnicodeString::setTo(const UChar *srcChars,
UTextOffset srcStart,
int32_t srcLength)
{ return doReplace(0, fLength, srcChars, srcStart, srcLength); }
+#endif
inline UnicodeString&
UnicodeString::setTo(const UChar *srcChars,
@@ -2311,7 +2415,7 @@ UnicodeString::reverse(UTextOffset start,
//========================================
inline bool_t
UnicodeString::isBogus() const
-{ return fBogus; }
+{ return fFlags & kIsBogus; } // bogus state is now one bit in fFlags; kIsBogus==1, so the result is 0 or 1
//========================================
@@ -2320,31 +2424,38 @@ UnicodeString::isBogus() const
inline UChar*
UnicodeString::getArrayStart()
-{ return (fRefCounted ? fArray + 1 : fArray); }
+{ return fArray; } // fArray points at the first character; a refCount, if present, sits in front of it
inline const UChar*
UnicodeString::getArrayStart() const
-{ return (fRefCounted ? fArray + 1 : fArray); }
+{ return fArray; } // const overload, same as above
inline int32_t
UnicodeString::getCapacity() const
-{ return (fRefCounted ? fCapacity - 1 : fCapacity); }
+{ return fCapacity; } // fCapacity counts characters only, no adjustment for a refCount slot
-inline uint16_t
+inline void
+UnicodeString::releaseArray() { // drop this string's reference to a refCounted buffer; fArray itself is not reset
+ if((fFlags & kRefCounted) && removeRef() == 0) { // only refCounted buffers are owned; free when the count reaches 0
+ delete [] ((int32_t *)fArray - 1); // the refCount word is read one int32_t before fArray, so that is the address freed
+ }
+}
+
+inline int32_t
UnicodeString::addRef()
-{ return ++(fArray[0]); }
+{ return ++*((int32_t *)fArray - 1); } // increments the int32_t stored directly before fArray; returns the new count
-inline uint16_t
+inline int32_t
UnicodeString::removeRef()
-{ return --(fArray[0]); }
+{ return --*((int32_t *)fArray - 1); } // decrements the int32_t stored directly before fArray; returns the new count
-inline uint16_t
+inline int32_t
UnicodeString::refCount() const
-{ return fArray[0]; }
+{ return *((int32_t *)fArray - 1); } // reads the int32_t stored directly before fArray
-inline uint16_t
-UnicodeString::setRefCount(uint16_t count)
-{ fRefCounted = TRUE; return (fArray[0] = count); }
+inline int32_t
+UnicodeString::setRefCount(int32_t count)
+{ return (*((int32_t *)fArray - 1) = count); } // unlike the removed version, this does not set any refCounted flag
// deprecated API - remove later
@@ -2352,10 +2463,6 @@ inline int32_t
UnicodeString::size() const
{ return fLength; }
-inline const UChar*
-UnicodeString::getUniChars() const
-{ return getUChars(); }
-
inline UnicodeString&
UnicodeString::findAndReplace(const UnicodeString& oldText,
const UnicodeString& newText,
@@ -2380,14 +2487,6 @@ UnicodeString::operator delete(void *location)
//========================================
// Static members
//========================================
-inline int32_t
-UnicodeString::allocation(int32_t minSize)
-{ return minSize < kGrowSize ? kGrowSize
- : (minSize * 2 + kGrowSize) & ~(kGrowSize - 1); }
-
-inline UChar*
-UnicodeString::allocate(int32_t minSize, int32_t& actualSize)
-{ actualSize = allocation(minSize); return new UChar[ actualSize ]; }
//========================================
// class UCharReference
@@ -2442,6 +2541,3 @@ UCharReference::operator UChar()
{ return fString->charAt(fPos); }
#endif
-
-
-
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index 5e95cb30d3..651ee35026 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -17,7 +17,6 @@
*******************************************************************************
*/
-
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/locid.h"
@@ -78,11 +77,6 @@ us_arrayCopy(const UChar *src, int32_t srcStart,
}
}
-// static initialization
-const UChar UnicodeString::fgInvalidUChar = 0xFFFF;
-const int32_t UnicodeString::kGrowSize = 0x80;
-const int32_t UnicodeString::kInvalidHashCode = 0;
-const int32_t UnicodeString::kEmptyHashCode = 1;
UConverter* UnicodeString::fgDefaultConverter = 0;
//========================================
@@ -92,58 +86,47 @@ UnicodeString::UnicodeString()
: fArray(fStackBuffer),
fLength(0),
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(kShortString)
{}
UnicodeString::UnicodeString(int32_t capacity)
: fArray(0),
fLength(0),
- fCapacity(0),
- fRefCounted(FALSE),
+ fCapacity(US_STACKBUF_SIZE), // placeholder; allocate() in the body assigns the real capacity
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(0) // placeholder; allocate() in the body assigns the real flags
{
- fArray = allocate(capacity, fCapacity);
- if(! fArray) {
- setToBogus();
- return;
- }
-
- setRefCount(1);
+ allocate(capacity); // chooses fStackBuffer or a heap array and sets fArray, fCapacity, fFlags; result intentionally ignored
}
UnicodeString::UnicodeString(UChar ch)
: fArray(fStackBuffer),
- fLength(0),
+ fLength(1), // exactly one character, stored in the body below
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
- fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fHashCode(kInvalidHashCode), // the string is not empty, so the empty-string hash code does not apply
+ fFlags(kShortString) // contents live in fStackBuffer
{
- doReplace(0, 0, &ch, 0, 1);
+ fStackBuffer[0] = ch; // store the character directly instead of going through doReplace()
}
UnicodeString::UnicodeString(const UChar *text)
: fArray(fStackBuffer),
fLength(0),
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(kShortString) // start as an empty stack-buffer string; doReplace() below inserts text (text goes to u_strlen() unchecked)
{
doReplace(0, 0, text, 0, u_strlen(text));
}
-UnicodeString::UnicodeString( const UChar *text,
- int32_t textLength)
+UnicodeString::UnicodeString(const UChar *text,
+ int32_t textLength)
: fArray(fStackBuffer),
fLength(0),
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(kShortString) // start as an empty stack-buffer string; doReplace() below inserts textLength characters
{
doReplace(0, 0, text, 0, textLength);
}
@@ -152,54 +135,113 @@ UnicodeString::UnicodeString(bool_t isTerminated,
UChar *text,
int32_t textLength)
: fArray(text),
- fLength(textLength != -1 || !isTerminated ? textLength : u_strlen(text)),
- fCapacity(isTerminated ? fLength + 1 : fLength),
- fRefCounted(FALSE),
+ fLength(textLength),
+ fCapacity(isTerminated ? textLength + 1 : textLength),
fHashCode(kInvalidHashCode),
- fBogus(FALSE)
+ fFlags(kReadonlyAlias)
{
- if(fLength < 0) {
+ if(text == 0 || textLength < -1 || textLength == -1 && !isTerminated) {
+ setToBogus();
+ } else if(textLength == -1) {
+ // text is terminated, or else it would have failed the above test
+ fLength = u_strlen(text);
+ fCapacity = fLength + 1;
+ }
+}
+
+UnicodeString::UnicodeString(UChar *buff, // writeable-aliasing constructor: uses the caller's buffer, no copy is made here
+ int32_t bufLength,
+ int32_t buffCapacity)
+ : fArray(buff),
+ fLength(bufLength),
+ fCapacity(buffCapacity),
+ fHashCode(kInvalidHashCode),
+ fFlags(kWriteableAlias) // value 0: not refCounted, not readonly, not the stack buffer
+{
+ if(buff == 0 || bufLength < 0 || bufLength > buffCapacity) { // reject a null buffer or a length outside [0, capacity]
+ setToBogus(); // releaseArray() inside is a no-op here because kRefCounted is not set
+ }
+}
UnicodeString::UnicodeString(const char *codepageData,
- const char *codepage)
+ const char *codepage)
: fArray(fStackBuffer),
fLength(0),
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(kShortString) // start as an empty stack-buffer string
{
- if(codepageData != 0)
+ if(codepageData != 0) { // a null pointer leaves the string empty
doCodepageCreate(codepageData, uprv_strlen(codepageData), codepage);
+ }
}
UnicodeString::UnicodeString(const char *codepageData,
- int32_t dataLength,
- const char *codepage)
+ int32_t dataLength,
+ const char *codepage)
: fArray(fStackBuffer),
fLength(0),
fCapacity(US_STACKBUF_SIZE),
- fRefCounted(FALSE),
fHashCode(kEmptyHashCode),
- fBogus(FALSE)
+ fFlags(kShortString) // start as an empty stack-buffer string; a null codepageData leaves it that way
{
if(codepageData != 0) {
doCodepageCreate(codepageData, dataLength, codepage);
}
}
+UnicodeString::UnicodeString(const UnicodeString& that) // copy constructor, no longer inline
+ : fArray(fStackBuffer), // first become a valid empty stack-buffer string ...
+ fLength(0),
+ fCapacity(US_STACKBUF_SIZE),
+ fHashCode(kEmptyHashCode),
+ fFlags(kShortString)
+{
+ *this = that; // ... then let operator= do the copying
+}
+
+//========================================
+// array allocation
+//========================================
+
+bool_t
+UnicodeString::allocate(int32_t capacity) { // sets fArray, fCapacity, fFlags; does not release a previous array and does not touch fLength
+ if(capacity <= US_STACKBUF_SIZE) { // small enough (this includes zero and negative values): use the in-object buffer
+ fArray = fStackBuffer;
+ fCapacity = US_STACKBUF_SIZE;
+ fFlags = kShortString;
+ } else {
+ // count bytes for the refCounter and the string capacity, and
+ // round up to a multiple of 16; then divide by 4 and allocate int32_t's
+ // to be safely aligned for the refCount
+ int32_t words = ((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2;
+ int32_t *array = new int32_t[words]; // NOTE(review): the null check below assumes operator new returns 0 on failure rather than throwing - confirm
+ if(array != 0) {
+ // set initial refCount and point behind the refCount
+ *array++ = 1;
+
+ // have fArray point to the first UChar
+ fArray = (UChar *)array;
+ fCapacity = (words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR); // UChars that fit in the words after the refCount; may exceed the request due to rounding
+ fFlags = kLongString;
+ } else {
+ fArray = 0;
+ fCapacity = 0;
+ fHashCode = kInvalidHashCode; // for constructor(capacity) to be correctly bogus
+ fFlags = kIsBogus;
+ return FALSE; // allocation failed: the string is now flagged bogus
+ }
+ }
+ return TRUE;
+}
+
//========================================
// Destructor
//========================================
UnicodeString::~UnicodeString()
{
- // decrement ref count and reclaim storage, if owned
- if(fRefCounted && removeRef() == 0)
- delete [] fArray;
+ releaseArray(); // decrement the ref count and reclaim the storage if the buffer is refCounted
}
//========================================
@@ -209,37 +251,62 @@ UnicodeString&
UnicodeString::operator= (const UnicodeString& src)
{
// if assigning to ourselves, do nothing
- if(this == &src) {
+ if(this == 0 || this == &src) {
return *this;
}
- // if src is bogus, set ourselves to bogus
- if(src.isBogus()) {
+ // is the right side bogus?
+ if(&src == 0 || src.isBogus()) {
setToBogus();
return *this;
}
- // if src is aliased or ref counted, point ourselves at its array
- if(src.fArray != src.fStackBuffer) {
+ // delete the current contents
+ releaseArray();
- // if we're ref counted, decrement our current ref count
- if(fRefCounted && removeRef() == 0)
- delete [] fArray;
+ // we always copy the length and the hash code
+ fLength = src.fLength;
+ fHashCode = src.fHashCode;
- fArray = src.fArray;
- fLength = src.fLength;
- fCapacity = src.fCapacity;
- fHashCode = src.fHashCode;
- fRefCounted = src.fRefCounted;
- if(fRefCounted) {
- addRef();
+ switch(src.fFlags) {
+ case kShortString:
+ // short string using the stack buffer, do the same
+ fArray = fStackBuffer;
+ fCapacity = US_STACKBUF_SIZE;
+ fFlags = kShortString;
+ if(fLength > 0) {
+ uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
}
- fBogus = FALSE;
- }
- // if src isn't ref counted, just do a replace
- else {
- doReplace(0, fLength, src.fArray, 0, src.fLength);
- fHashCode = src.fHashCode;
+ break;
+ case kLongString:
+ // src uses a refCounted string buffer, use that buffer with refCount
+ // src is const, use a cast - we don't really change it
+ ((UnicodeString &)src).addRef();
+ // fall through to readonly alias copying: copy all fields
+ case kReadonlyAlias:
+ // src is a readonly alias, do the same
+ fArray = src.fArray;
+ fCapacity = src.fCapacity;
+ fFlags = src.fFlags;
+ break;
+ case kWriteableAlias:
+ // src is a writeable alias; we make a copy of that instead
+ if(allocate(fLength)) {
+ if(fLength > 0) {
+ uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
+ }
+ break;
+ }
+ // if there is not enough memory, then fall through to setting to bogus
+ default:
+ // if src is bogus, set ourselves to bogus
+ // do not call setToBogus() here because fArray and fFlags are not consistent here
+ fArray = 0;
+ fLength = 0;
+ fCapacity = 0;
+ fHashCode = kInvalidHashCode;
+ fFlags = kIsBogus;
+ break;
}
return *this;
@@ -317,6 +384,11 @@ UnicodeString::doCompare( UTextOffset start,
// get the correct pointer
const UChar *chars = getArrayStart();
+ // are we comparing the same buffer contents?
+ if(chars + start == srcChars + srcStart) {
+ return 0;
+ }
+
UTextOffset minLength;
int8_t lengthResult;
@@ -374,12 +446,14 @@ UnicodeString::doExtract(UTextOffset start,
UChar *dst,
UTextOffset dstStart) const
{
- // pin indices to legal values
- pinIndices(start, length);
- us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
+ // do not copy anything if we alias dst itself
+ if(fArray + start != dst + dstStart) {
+ // pin indices to legal values
+ pinIndices(start, length);
+ us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
+ }
}
-
UTextOffset
UnicodeString::indexOf(const UChar *srcChars,
UTextOffset srcStart,
@@ -557,17 +631,79 @@ UnicodeString::findAndReplace(UTextOffset start,
// Write implementation
//========================================
+void
+UnicodeString::setToBogus() // puts the string into the bogus state: no buffer, zero length, kIsBogus flag
+{
+ releaseArray(); // needs the old fArray and fFlags, so it must run before the fields are reset
+
+ fArray = 0;
+ fCapacity = fLength = 0;
+ fHashCode = kInvalidHashCode;
+ fFlags = kIsBogus; // replaces all other flag bits
+}
+
+// setTo() analogous to the readonly-aliasing constructor with the same signature; aliases text without copying it
+UnicodeString &
+UnicodeString::setTo(bool_t isTerminated,
+ const UChar *text,
+ int32_t textLength)
+{
+ if(text == 0 || textLength < -1 || textLength == -1 && !isTerminated) {
+ setToBogus(); // no text, or a length that cannot be determined
+ return *this;
+ }
+
+ releaseArray(); // give up the current buffer before aliasing the new one
+
+ fArray = (UChar *)text; // const is cast away; kReadonlyAlias below marks the buffer as not writable
+ if(textLength != -1) {
+ fLength = textLength;
+ } else {
+ // text is terminated, or else it would have failed the above test
+ fLength = u_strlen(text);
+ }
+
+ fCapacity = isTerminated ? fLength + 1 : fLength; // use fLength, not textLength: textLength may be -1, which gave capacity 0
+ fHashCode = kInvalidHashCode;
+ fFlags = kReadonlyAlias;
+ return *this;
+}
+
+// setTo() analogous to the writeable-aliasing constructor with the same signature
+UnicodeString &
+UnicodeString::setTo(UChar *buffer,
+ int32_t buffLength,
+ int32_t buffCapacity) {
+ if(buffer == 0 || buffLength < 0 || buffLength > buffCapacity) { // reject a null buffer or a length outside [0, capacity]
+ setToBogus();
+ return *this;
+ }
+
+ releaseArray(); // give up the current buffer before aliasing the caller's one
+
+ fArray = buffer; // aliased, not copied
+ fLength = buffLength;
+ fCapacity = buffCapacity;
+ fHashCode = kInvalidHashCode;
+ fFlags = kWriteableAlias; // value 0: not refCounted, not readonly, not the stack buffer
+ return *this;
+}
+
UnicodeString&
UnicodeString::setCharAt(UTextOffset offset,
UChar c)
{
- if(offset < 0)
- offset = 0;
- else if(offset >= fLength)
- offset = fLength - 1;
+ if(fLength > 0 && cloneArrayIfNeeded()) { // an empty string has no character to set; otherwise get a writable buffer first
+ if(offset < 0) { // pin the offset to the valid range [0, fLength - 1]
+ offset = 0;
+ } else if(offset >= fLength) {
+ offset = fLength - 1; // fLength > 0 is checked above, so this can never become -1
+ }
- doSetCharAt(offset, c);
- fHashCode = kInvalidHashCode;
+ fArray[offset] = c;
+ fHashCode = kInvalidHashCode; // the contents changed, so the stored hash code no longer applies
+ }
return *this;
}
@@ -586,8 +722,16 @@ UnicodeString::toUpper(const Locale& locale)
UTextOffset limit = fLength;
UChar c;
UnicodeString lang;
+ char langChars[16];
+ if(!cloneArrayIfNeeded()) {
+ return *this;
+ }
+
+ // get char * locale language
locale.getLanguage(lang);
+ lang.extract(0, lang.length(), langChars, "");
+ langChars[lang.length()] = 0;
// The German sharp S character (U+00DF)'s uppercase equivalent is
// "SS", making it the only character that expands to two characters
@@ -598,56 +742,46 @@ UnicodeString::toUpper(const Locale& locale)
// string looking for sharp S characters and then go back and make
// room for the extra capital Ses if we find any. [For performance,
// we only do this extra work if the language is actually German]
- if(lang == "de") {
+ if(uprv_strcmp(langChars, "de") == 0) {
UChar SS [] = { 0x0053, 0x0053 };
while(start < limit) {
-
c = getArrayStart()[start];
// A sharp s needs to be replaced with two capital S's.
if(c == 0x00DF) {
- doReplace(start, 1, SS, 0, 2);
- start++;
- limit++;
+ doReplace(start, 1, SS, 0, 2);
+ start++;
+ limit++;
+ } else {
+ // Otherwise, the case conversion can be handled by the Unicode unit.
+ fArray[start] = Unicode::toUpperCase(c);
}
- // Otherwise, the case conversion can be handled by the Unicode unit.
- else if(Unicode::isLowerCase(c))
- doSetCharAt(start, Unicode::toUpperCase(c));
-
// If no conversion is necessary, do nothing
++start;
}
- }
-
- // If the specfied language is Turkish, then we have to special-case
- // for the Turkish dotted and dotless Is. The regular lowercase i
- // maps to the capital I with a dot (U+0130), and the lowercase i
- // without the dot (U+0131) maps to the regular capital I
- else if(lang == "tr") {
+ } else if(uprv_strcmp(langChars, "tr") == 0) {
+ // If the specified language is Turkish, then we have to special-case
+ // for the Turkish dotted and dotless Is. The regular lowercase i
+ // maps to the capital I with a dot (U+0130), and the lowercase i
+ // without the dot (U+0131) maps to the regular capital I
while(start < limit) {
c = getArrayStart()[start];
- if(c == 0x0069/*'i'*/)
- doSetCharAt(start, 0x0130);
- else if(c == 0x0131)
- doSetCharAt(start, 0x0049/*'I'*/);
- else if(Unicode::isLowerCase(c))
- doSetCharAt(start, Unicode::toUpperCase(c));
+ if(c == 0x0069/*'i'*/) {
+ fArray[start] = 0x0130;
+ } else if(c == 0x0131) {
+ fArray[start] = 0x0049/*'I'*/;
+ } else {
+ fArray[start] = Unicode::toUpperCase(c);
+ }
++start;
}
- }
-
- else {
- // clone our array, if necessary
- cloneArrayIfNeeded();
+ } else {
UChar *array = getArrayStart();
while(start < limit) {
- c = array[start];
- if(Unicode::isLowerCase(c)) {
- array[start] = Unicode::toUpperCase(c);
- }
+ array[start] = Unicode::toUpperCase(array[start]);
++start;
}
}
@@ -664,59 +798,60 @@ UnicodeString::toLower(const Locale& locale)
UTextOffset limit = fLength;
UChar c;
UnicodeString lang;
+ char langChars[16];
+ if(!cloneArrayIfNeeded()) {
+ return *this;
+ }
+
+ // get char * locale language
locale.getLanguage(lang);
+ lang.extract(0, lang.length(), langChars, "");
+ langChars[lang.length()] = 0;
// if the specfied language is Turkish, then we have to special-case
// for the Turkish dotted and dotless Is. The capital I with a dot
// (U+0130) maps to the regular lowercase i, and the regular capital
// I maps to the lowercase i without the dot (U+0131)
- if(lang == "tr") {
+ if(uprv_strcmp(langChars, "tr") == 0) {
while(start < limit) {
c = getArrayStart()[start];
if(c == 0x0049) // 'I'
- doSetCharAt(start, 0x0131);
+ fArray[start] = 0x0131;
else if(c == 0x0130)
- doSetCharAt(start, 0x0069); // 'i'
- else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c))
- doSetCharAt(start, Unicode::toLowerCase(c));
+ fArray[start] = 0x0069; // 'i'
+ else {
+ fArray[start] = Unicode::toLowerCase(c);
+ }
++start;
}
- }
-
- // if the specfied language is Greek, then we have to special-case
- // for the capital letter sigma (U+3A3), which has two lower-case
- // forms. If the character following the capital sigma is a letter,
- // we use the medial form (U+3C3); otherwise, we use the final form
- // (U+3C2).
- else if(lang == "el") {
+ } else if(uprv_strcmp(langChars, "el") == 0) {
+ // if the specified language is Greek, then we have to special-case
+ // for the capital letter sigma (U+3A3), which has two lower-case
+ // forms. If the character following the capital sigma is a letter,
+ // we use the medial form (U+3C3); otherwise, we use the final form
+ // (U+3C2).
while(start < limit) {
c = getArrayStart()[start];
if(c == 0x3a3) {
- if(start + 1 < limit && Unicode::isLetter(getArrayStart()[start + 1]))
- doSetCharAt(start, 0x3C3);
- else
- doSetCharAt(start, 0x3C2);
+ if(start + 1 < limit && Unicode::isLetter(getArrayStart()[start + 1])) {
+ fArray[start] = 0x3C3;
+ } else {
+ fArray[start] = 0x3C2;
+ }
+ } else {
+ fArray[start] = Unicode::toLowerCase(c);
}
- else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c))
- doSetCharAt(start, Unicode::toLowerCase(c));
++start;
}
- }
-
- // if the specified language is anything other than Turkish or
- // Greek, we rely on the Unicode class to do all our case mapping--
- // there are no other special cases
- else {
- // clone our array, if necessary
- cloneArrayIfNeeded();
+ } else {
+ // if the specified language is anything other than Turkish or
+ // Greek, we rely on the Unicode class to do all our case mapping--
+ // there are no other special cases
UChar *array = getArrayStart();
while(start < limit) {
- c = array[start];
- if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) {
- array[start] = Unicode::toLowerCase(c);
- }
+ array[start] = Unicode::toLowerCase(array[start]);
++start;
}
}
@@ -726,19 +861,6 @@ UnicodeString::toLower(const Locale& locale)
return *this;
}
-// for speed, no bounds checking is performed and the hash code isn't changed
-UnicodeString&
-UnicodeString::doSetCharAt(UTextOffset offset,
- UChar c)
-{
- // clone our array, if necessary
- cloneArrayIfNeeded();
-
- // set the character
- fArray[ (fRefCounted ? offset + 1 : offset) ] = c;
- return *this;
-}
-
UnicodeString&
UnicodeString::doReplace( UTextOffset start,
int32_t length,
@@ -766,70 +888,52 @@ UnicodeString::doReplace(UTextOffset start,
UTextOffset srcStart,
int32_t srcLength)
{
- // if we're bogus, do nothing
- if(fBogus)
- return *this;
+ // if we're bogus, set us to empty first
+ if(isBogus()) {
+ fArray = fStackBuffer;
+ fLength = 0;
+ fCapacity = US_STACKBUF_SIZE;
+ fHashCode = kEmptyHashCode;
+ fFlags = kShortString;
+ }
if(srcChars == 0) {
srcStart = srcLength = 0;
}
- bool_t deleteWhenDone = FALSE;
- UChar *bufferToDelete = 0;
+ int32_t *bufferToDelete = 0;
- // clone our array, if necessary
- cloneArrayIfNeeded();
+ // the following may change fArray but will not copy the current contents;
+ // therefore we need to keep the current fArray
+ UChar *oldArray = fArray;
+ int32_t oldLength = fLength;
// pin the indices to legal values
pinIndices(start, length);
// calculate the size of the string after the replace
- int32_t newSize = fLength - length + srcLength;
+ int32_t newSize = oldLength - length + srcLength;
- // allocate a bigger array if needed
- if( newSize > getCapacity() ) {
-
- // allocate at minimum needed space
- int32_t tempLength;
- UChar *temp = allocate(newSize + 1, tempLength);
- if(! temp) {
- setToBogus();
- return *this;
- }
-
- // if we're not currently ref counted, shift the array right by one
- if(fRefCounted == FALSE)
- us_arrayCopy(fArray, 0, temp, 1, fLength);
- // otherwise, copy the old array into temp, including the ref count
- else
- us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
-
- // delete the old array if we were ref counted
- if(fRefCounted && removeRef() == 0) {
- // if the srcChars array is the same as this object's array,
- // don't delete it until the end of the method. this can happen
- // in code like UnicodeString s = "foo"; s += s;
- if(srcChars != getArrayStart())
- delete [] fArray;
- else {
- deleteWhenDone = TRUE;
- bufferToDelete = fArray;
- }
- }
-
- // use the new array
- fCapacity = tempLength;
- fArray = temp;
- setRefCount(1);
+ // clone our array and allocate a bigger array if needed
+ if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
+ FALSE, &bufferToDelete)
+ ) {
+ return *this;
}
// now do the replace
- // first copy the portion that isn't changing, leaving a hole
- if(length != srcLength) {
- us_arrayCopy(getArrayStart(), start + length,
- getArrayStart(), start + srcLength,
- fLength - (start + length));
+ if(fArray != oldArray) {
+ // if fArray changed, then we need to copy everything except what will change
+ us_arrayCopy(oldArray, 0, fArray, 0, start);
+ us_arrayCopy(oldArray, start + length,
+ fArray, start + srcLength,
+ oldLength - (start + length));
+ } else if(length != srcLength) {
+ // fArray did not change; copy only the portion that isn't changing, leaving a hole
+ us_arrayCopy(oldArray, start + length,
+ fArray, start + srcLength,
+ oldLength - (start + length));
}
// now fill in the hole with the new string
@@ -838,8 +942,9 @@ UnicodeString::doReplace(UTextOffset start,
fLength = newSize;
fHashCode = kInvalidHashCode;
- if(deleteWhenDone)
- delete [] bufferToDelete;
+ // delayed delete in case srcChars == fArray when we started, and
+ // to keep oldArray alive for the above operations
+ delete [] bufferToDelete;
return *this;
}
@@ -859,11 +964,9 @@ UnicodeString::doReverse(UTextOffset start,
int32_t length)
{
// if we're bogus, do nothing
- if(fBogus)
+ if(isBogus() || !cloneArrayIfNeeded()) {
return *this;
-
- // clone our array, if necessary
- cloneArrayIfNeeded();
+ }
// pin the indices to legal values
pinIndices(start, length);
@@ -890,10 +993,9 @@ int32_t
UnicodeString::doHashCode()
{
const UChar *key = getArrayStart();
- int32_t len = fLength;
+ int32_t len = fLength;
int32_t hash = kInvalidHashCode;
- const UChar *limit = key + len;
- int32_t inc = (len >= 128 ? len/64 : 1);
+ const UChar *limit = key + len;
/*
We compute the hash by iterating sparsely over 64 (at most)
@@ -904,47 +1006,41 @@ UnicodeString::doHashCode()
deterministic value which should be well distributed over the
output range. [LIU] */
- while(key < limit) {
- hash = (hash * 37) + *key;
- key += inc;
+ if(len <= 64) {
+ while(key < limit) {
+ hash = (hash * 37) + *key++;
+ }
+ } else {
+ int32_t inc = (len+63)/64;
+
+ while(key < limit) {
+ hash = (hash * 37) + *key;
+ key += inc;
+ }
}
- if(hash == kInvalidHashCode)
+ hash &= 0x7fffffff;
+ if(hash == kInvalidHashCode) {
hash = kEmptyHashCode;
+ }
fHashCode = hash;
return fHashCode;
}
-//========================================
-// Bogusify?
-//========================================
-void
-UnicodeString::setToBogus()
-{
- if(fRefCounted && removeRef() == 0) {
- delete [] fArray;
- }
-
- fArray = 0;
- fCapacity = fLength = 0;
- fHashCode = kInvalidHashCode;
- fRefCounted = FALSE;
- fBogus = TRUE;
-}
-
//========================================
// Codeset conversion
//========================================
int32_t
UnicodeString::extract(UTextOffset start,
- int32_t length,
- char *dst,
- const char *codepage) const
+ int32_t length,
+ char *dst,
+ const char *codepage) const
{
// if we're bogus or there's nothing to convert, do nothing
- if(fBogus || length == 0)
+ if(isBogus() || length <= 0) {
return 0;
+ }
// pin the indices to legal values
pinIndices(start, length);
@@ -976,10 +1072,11 @@ UnicodeString::extract(UTextOffset start,
// if it is an empty string, then use the "invariant character" conversion
if(U_FAILURE(status)) {
// close the converter
- if(codepage == 0)
+ if(codepage == 0) {
releaseDefaultConverter(converter);
- else
+ } else {
ucnv_close(converter);
+ }
return 0;
}
@@ -997,17 +1094,19 @@ UnicodeString::extract(UTextOffset start,
myTargetLimit = myTarget + arraySize;
/* Pin the limit to U_MAX_PTR. NULL check is for AS/400. */
- if((myTargetLimit < myTarget) || (myTargetLimit == NULL))
- myTargetLimit = (char*)U_MAX_PTR;
+ if((myTargetLimit < myTarget) || (myTargetLimit == NULL)) {
+ myTargetLimit = (char*)U_MAX_PTR;
+ }
ucnv_fromUnicode(converter, &myTarget, myTargetLimit,
- &mySource, mySourceEnd, NULL, TRUE, &status);
+ &mySource, mySourceEnd, 0, TRUE, &status);
// close the converter
- if(codepage == 0)
+ if(codepage == 0) {
releaseDefaultConverter(converter);
- else
+ } else {
ucnv_close(converter);
+ }
return (myTarget - dst);
}
@@ -1018,35 +1117,29 @@ UnicodeString::doCodepageCreate(const char *codepageData,
const char *codepage)
{
// if there's nothing to convert, do nothing
- if(codepageData == 0 || dataLength == 0)
+ if(codepageData == 0 || dataLength <= 0) {
return;
+ }
- // set up the conversion parameters
- int32_t sourceLen = dataLength;
- const char *mySource = codepageData;
- const char *mySourceEnd = mySource + sourceLen;
- UChar *myTarget;
- UErrorCode status = U_ZERO_ERROR;
- int32_t arraySize = getCapacity();
+ UErrorCode status = U_ZERO_ERROR;
// create the converter
- UConverter *converter = 0;
-
// if the codepage is the default, use our cache
// if it is an empty string, then use the "invariant character" conversion
- converter = (codepage == 0 ?
- getDefaultConverter(status) :
- *codepage == 0 ?
- 0 :
- ucnv_open(codepage, &status));
+ UConverter *converter = (codepage == 0 ?
+ getDefaultConverter(status) :
+ *codepage == 0 ?
+ 0 :
+ ucnv_open(codepage, &status));
// if we failed, set the appropriate flags and return
if(U_FAILURE(status)) {
// close the converter
- if(codepage == 0)
+ if(codepage == 0) {
releaseDefaultConverter(converter);
- else
+ } else {
ucnv_close(converter);
+ }
setToBogus();
return;
}
@@ -1056,170 +1149,84 @@ UnicodeString::doCodepageCreate(const char *codepageData,
// perform the conversion
if(converter == 0) {
// use the "invariant characters" conversion
- if(arraySize < dataLength) {
- int32_t tempCapacity;
- // allocate enough space for the dataLength, the refCount, and a NUL
- UChar *temp = allocate(dataLength + 2, tempCapacity);
-
- if(temp == 0) {
- // set flags and return
- setToBogus();
- return;
- }
-
- fArray = temp;
- fCapacity = tempCapacity;
-
- setRefCount(1);
-
- u_charsToUChars(codepageData, fArray + 1, dataLength);
- fArray[dataLength + 1] = 0;
- } else {
+ if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
u_charsToUChars(codepageData, getArrayStart(), dataLength);
+ fLength = dataLength;
+ } else {
+ setToBogus();
}
- fLength = dataLength;
return;
}
- myTarget = getArrayStart();
+ // set up the conversion parameters
+ const char *mySource = codepageData;
+ const char *mySourceEnd = mySource + dataLength;
+ UChar *myTarget;
+
+ // estimate the size needed:
+ // 1.25 UChar's per source byte should cover most cases
+ int32_t arraySize = dataLength + (dataLength >> 2);
+
+ // we do not care about the current contents
+ bool_t doCopyArray = FALSE;
for(;;) {
- // reset the error code
- status = U_ZERO_ERROR;
+ if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
+ setToBogus();
+ break;
+ }
// perform the conversion
- ucnv_toUnicode(converter, &myTarget, myTarget + arraySize,
- &mySource, mySourceEnd, NULL, TRUE, &status);
+ myTarget = fArray + fLength;
+ ucnv_toUnicode(converter, &myTarget, fArray + fCapacity,
+ &mySource, mySourceEnd, 0, FALSE, &status);
// update the conversion parameters
- fLength = myTarget - getArrayStart();
+ fLength = myTarget - fArray;
// allocate more space and copy data, if needed
if(status == U_INDEX_OUTOFBOUNDS_ERROR) {
- int32_t tempCapacity;
- UChar *temp = allocate(fCapacity, tempCapacity);
+ // reset the error code
+ status = U_ZERO_ERROR;
- if(! temp) {
- // set flags and return
- setToBogus();
- break;
- }
+ // keep the previous conversion results
+ doCopyArray = TRUE;
- if(fRefCounted) {
- // copy the old array into temp
- us_arrayCopy(fArray, 1, temp, 1, fLength);
- delete [] fArray;
- } else {
- // if we're not currently ref counted, shift the array right by one
- us_arrayCopy(fArray, 0, temp, 1, fLength);
- }
-
- fArray = temp;
- fCapacity = tempCapacity;
-
- setRefCount(1);
-
- myTarget = getArrayStart() + fLength;
- arraySize = getCapacity() - fLength;
+ // estimate the new size needed, larger than before
+ // try 2 UChar's per remaining source byte
+ arraySize = fLength + 2 * (mySourceEnd - mySource);
} else {
break;
}
}
// close the converter
- if(codepage == 0)
+ if(codepage == 0) {
releaseDefaultConverter(converter);
- else
+ } else {
ucnv_close(converter);
+ }
}
//========================================
// External Buffer
//========================================
-UnicodeString::UnicodeString(UChar *buff,
- int32_t bufLength,
- int32_t buffCapacity)
- : fArray(buff),
- fLength(bufLength),
- fCapacity(buffCapacity),
- fRefCounted(FALSE),
- fHashCode(kInvalidHashCode),
- fBogus(FALSE)
-{}
-
+// ### TODO:
+// this is very, very dirty: we should not ever expose our array to the outside,
+// and this also violates the const-ness of this object
+// this must be removed when the resource bundle implementation does not need it any more!
const UChar*
-UnicodeString::getUChars() const
-{
+UnicodeString::getUChars() const {
// if we're bogus, do nothing
- if(fBogus)
+ if(isBogus()) {
return 0;
-
- // no room for null, resize
- if(getCapacity() <= fLength) {
- // allocate at minimum the current capacity + needed space
- int32_t tempLength;
- UChar *temp = allocate(fCapacity + 1, tempLength);
- if(! temp) {
- ((UnicodeString*)this)->setToBogus();
- return 0;
- }
-
- // if we're not currently ref counted, shift the array right by one
- if(fRefCounted == FALSE)
- us_arrayCopy(fArray, 0, temp, 1, fLength);
- // otherwise, copy the old array into temp, including the ref count
- else
- us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
-
- // delete the old array
- if(fRefCounted && ((UnicodeString*)this)->removeRef() == 0)
- delete [] ((UnicodeString*)this)->fArray;
-
- // use the new array
- ((UnicodeString*)this)->fCapacity = tempLength;
- ((UnicodeString*)this)->fArray = temp;
- ((UnicodeString*)this)->setRefCount(1);
}
- if(getArrayStart()[fLength] != 0) {
- // tack on a trailing null
- ((UChar *)getArrayStart())[fLength] = 0;
- }
-
- return getArrayStart();
-}
-
-UChar*
-UnicodeString::orphanStorage()
-{
- // if we're bogus, do nothing
- if(fBogus)
- return 0;
-
- UChar *retVal;
-
- // if we're ref counted, get rid of the leading ref count
- if(fRefCounted && removeRef() == 0) {
- retVal = fArray;
- } else {
- // if we don't own the memory, then we have to allocate it
- retVal = new UChar[fLength + 1];
- if(retVal == 0) {
- return 0;
+ if(fCapacity <= fLength || fArray[fLength] != 0) {
+ if(((UnicodeString &)*this).cloneArrayIfNeeded(fLength + 1)) {
+ fArray[fLength] = 0;
}
}
-
- // shift or copy characters
- us_arrayCopy(getArrayStart(), 0, retVal, 0, fLength);
- retVal[fLength] = 0;
-
- // set self to empty
- fArray = fStackBuffer;
- fLength = 0;
- fCapacity = US_STACKBUF_SIZE;
- fHashCode = kEmptyHashCode;
- fRefCounted = FALSE;
-
- return retVal;
+ return fArray;
}
//========================================
@@ -1230,67 +1237,91 @@ UnicodeString::pinIndices(UTextOffset& start,
int32_t& length) const
{
// pin indices
- if(length < 0 || start < 0)
+ if(length < 0 || start < 0) {
start = length = 0;
- else {
- if(length > (fLength - start))
- length = (fLength - start);
+ } else if(length > (fLength - start)) {
+ length = (fLength - start);
}
}
-void
-UnicodeString::cloneArrayIfNeeded()
-{
- // if we're aliased or ref counted, make a copy of the buffer if necessary
- if(fArray != fStackBuffer && (!fRefCounted || refCount() > 1)) {
- UChar *copy;
- bool_t refCounted;
- if(fLength <= US_STACKBUF_SIZE) {
- // a small string does not need allocation
- fCapacity = US_STACKBUF_SIZE;
- copy = fStackBuffer;
- refCounted = FALSE;
+bool_t
+UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
+ int32_t growCapacity,
+ bool_t doCopyArray,
+ int32_t **pBufferToDelete) {
+ // default parameters need to be static, therefore
+ // the defaults are -1 to have convenience defaults
+ if(newCapacity == -1) {
+ newCapacity = fCapacity;
+ }
+
+ /*
+ * We need to make a copy of the array if
+ * the buffer is read-only, or
+ * the buffer is refCounted (shared), and refCount>1, or
+ * the buffer is too small.
+ * Return FALSE if memory could not be allocated.
+ */
+ if(fFlags & kBufferIsReadonly ||
+ fFlags & kRefCounted && refCount() > 1 ||
+ newCapacity > fCapacity
+ ) {
+ // save old values
+ UChar *array = fArray;
+ uint16_t flags = fFlags;
+
+ // check growCapacity for default value and use of the stack buffer
+ if(growCapacity == -1) {
+ growCapacity = newCapacity;
+ } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
+ growCapacity = US_STACKBUF_SIZE;
+ }
+
+ // allocate a new array
+ if(allocate(growCapacity) ||
+ newCapacity < growCapacity && allocate(newCapacity)
+ ) {
+ if(doCopyArray) {
+ // copy the contents
+ // do not copy more than what fits - it may be smaller than before
+ if(fCapacity < fLength) {
+ fLength = fCapacity;
+ }
+ us_arrayCopy(array, 0, fArray, 0, fLength);
+ } else {
+ fLength = 0;
+ }
+
+ // release the old array
+ if(flags & kRefCounted) {
+ // the array is refCounted; decrement and release if 0
+ int32_t *pRefCount = ((int32_t *)array - 1);
+ if(--*pRefCount == 0) {
+ if(pBufferToDelete == 0) {
+ delete [] pRefCount;
+ } else {
+ // the caller requested to delete it himself
+ *pBufferToDelete = pRefCount;
+ }
+ }
+ }
} else {
- if(!fRefCounted) {
- // make room for the ref count
- ++fCapacity;
- }
- if(fCapacity - 1 <= fLength) {
- // make room for a terminating NUL
- fCapacity = fLength + 2;
- }
- copy = new UChar [ fCapacity ];
- if(copy == 0) {
- setToBogus();
- return;
- }
- refCounted = TRUE;
- }
-
- // copy the current shared array into our new array
- us_arrayCopy(getArrayStart(), 0, copy, refCounted ? 1 : 0, fLength);
-
- // remove a reference from the current shared array
- // if there are no more references to the current shared array,
- // after we remove the reference, delete the array
- if(fRefCounted && removeRef() == 0) {
- delete [] fArray;
- }
-
- // make our array point to the new copy and set the ref count to one
- fArray = copy;
- fRefCounted = refCounted;
- if(refCounted) {
- setRefCount(1);
+ // not enough memory for growCapacity and not even for the smaller newCapacity
+ // reset the old values for setToBogus() to release the array
+ fArray = array;
+ fFlags = flags;
+ setToBogus();
+ return FALSE;
}
}
+ return TRUE;
}
// private function for C API
-U_CFUNC const UChar*
-T_UnicodeString_getUChars(const UnicodeString *s)
+U_CFUNC int32_t
+T_UnicodeString_length(const UnicodeString *s)
{
- return s->getUChars();
+ return s->length();
}
// private function for C API
@@ -1323,8 +1354,9 @@ UnicodeString::getDefaultConverter(UErrorCode &status)
// if the cache was empty, create a converter
if(converter == 0) {
converter = ucnv_open(0, &status);
- if(U_FAILURE(status))
+ if(U_FAILURE(status)) {
return 0;
+ }
}
return converter;
@@ -1342,7 +1374,7 @@ UnicodeString::releaseDefaultConverter(UConverter *converter)
}
}
- // it's safe to close a NULL converter
+ // it's safe to close a 0 converter
ucnv_close(converter);
}
@@ -1427,14 +1459,16 @@ void
UnicodeStringStreamer::streamOut(const UnicodeString *s,
FileStream *os)
{
- if(!T_FileStream_error(os))
+ if(!T_FileStream_error(os)) {
writeLong(os, s->fLength);
+ }
const UChar *c = s->getArrayStart();
const UChar *end = c + s->fLength;
- while(c != end && ! T_FileStream_error(os))
+ while(c != end && ! T_FileStream_error(os)) {
writeUChar(os, *c++);
+ }
}
void
@@ -1456,40 +1490,16 @@ UnicodeStringStreamer::streamIn(UnicodeString *s,
}
// clone s's array, if needed
- s->cloneArrayIfNeeded();
-
- // if the string isn't big enough to hold the data, enlarge it
- if(s->getCapacity() < newSize) {
-
- int32_t tempLength;
- UChar *temp = s->allocate(newSize, tempLength);
- if(! temp) {
- s->setToBogus();
- return;
- }
-
- // if s is not currently ref counted, shift the array right by one
- if(s->fRefCounted == FALSE)
- us_arrayCopy(s->fArray, 0, temp, 1, s->fLength);
- // otherwise, copy the old array into temp, including the ref count
- else
- us_arrayCopy(s->fArray, 0, temp, 0, s->fLength + 1);
-
- // delete the old array if s is ref counted
- if(s->fRefCounted && s->removeRef() == 0)
- delete [] s->fArray;
-
- // use the new array
- s->fCapacity = tempLength;
- s->fArray = temp;
- s->setRefCount(1);
+ if(!s->cloneArrayIfNeeded(newSize, newSize, FALSE)) {
+ return;
}
UChar *c = s->getArrayStart();
UChar *end = c + newSize;
- while(c < end && ! (T_FileStream_error(is) || T_FileStream_eof(is)))
+ while(c < end && ! (T_FileStream_error(is) || T_FileStream_eof(is))) {
*c++ = readUChar(is);
+ }
// couldn't read all chars
if(c < end) {
@@ -1504,22 +1514,32 @@ UnicodeStringStreamer::streamIn(UnicodeString *s,
ostream&
operator<<(ostream& stream,
- const UnicodeString& s)
+ const UnicodeString& s)
{
- UTextOffset i;
- UChar c;
- int32_t saveFlags = stream.flags();
+ if(s.length() > 0) {
+ char buffer[200];
+ UConverter *converter;
+ UErrorCode errorCode = U_ZERO_ERROR;
- stream << hex;
+ // convert the text with the default converter, one buffer-sized chunk at a time
+ converter = UnicodeString::getDefaultConverter(errorCode);
+ if(U_SUCCESS(errorCode)) {
+ const UChar *us = s.getArrayStart(), *uLimit = us + s.length();
+ char *target, *targetLimit = buffer + sizeof(buffer); // named so they do not shadow the parameter s
+ do {
+ errorCode = U_ZERO_ERROR;
+ target = buffer; // every chunk is written from the start of the buffer
+ ucnv_fromUnicode(converter, &target, targetLimit, &us, uLimit, 0, FALSE, &errorCode);
- for(i = 0; i < s.length(); i++) {
- c = s.charAt(i);
- if((c >= ' ' && c <= '~') || c == '\n')
- stream << (char)c;
- else
- stream << "[0x" << c << "]";
+ // write this chunk
+ if(target > buffer) {
+ stream.write(buffer, target - buffer);
+ }
+ } while(errorCode == U_INDEX_OUTOFBOUNDS_ERROR); // loop again while this error code is returned (apparently the output buffer was full)
+ UnicodeString::releaseDefaultConverter(converter);
+ }
}
+
stream.flush();
- stream.setf(saveFlags & ios::basefield, ios::basefield);
return stream;
}
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp
index db6c746783..89c8b0e1a0 100644
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -549,17 +549,6 @@ UnicodeStringTest::TestMiscellaneous()
for (i = 0; i < test2.length(); i++)
if (test2[i] != test4[i])
errln(UnicodeString("getUChars() failed: strings differ at position ") + i);
-
- test4 = test1.orphanStorage();
-
- if (test1.length() != 0)
- errln("orphanStorage() failed: orphaned string's contents is " + test1);
-
- for (i = 0; i < test2.length(); i++)
- if (test2[i] != test4[i])
- errln(UnicodeString("orphanStorage() failed: strings differ at position ") + i);
-
- delete (UChar*)test4;
}
void
@@ -606,13 +595,9 @@ UnicodeStringTest::TestStackAllocation()
errln("insert() on stack-allocated UnicodeString didn't work right");
if (guardWord2 != 0x4DED)
errln("insert() on stack-allocated UnicodeString overwrote guard word!");
-#if 0
- // the current implementation will always reallocate the memory
- // after it was aliased in case it was read-only;
- // therefore, this test must fail and we don't perform it
+
if (workingBuffer[24] != 0x67)
errln("insert() on stack-allocated UnicodeString didn't affect backing store");
-#endif
*test += " to the aid of their country.";
if (*test != "Now is the time for all good men to come to the aid of their country.")
@@ -624,9 +609,32 @@ UnicodeStringTest::TestStackAllocation()
if (*test != "ha!")
errln("Assignment to stack-allocated UnicodeString didn't work");
if (workingBuffer[0] != 0x4e)
- errln("Change to UnicodeString after overflow are stil affecting original buffer");
+ errln("Change to UnicodeString after overflow are still affecting original buffer");
if (guardWord2 != 0x4DED)
errln("Change to UnicodeString after overflow overwrote guard word!");
+ // test read-only aliasing with setTo()
+ workingBuffer[0] = 0x20ac;
+ workingBuffer[1] = 0x125;
+ workingBuffer[2] = 0;
+ test->setTo(TRUE, workingBuffer, 2);
+ if(test->length() != 2 || test->charAt(0) != 0x20ac || test->charAt(1) != 0x125) {
+ errln("UnicodeString.setTo(readonly alias) does not alias correctly");
+ }
+ workingBuffer[1] = 0x109;
+ if(test->charAt(1) != 0x109) {
+ errln("UnicodeString.setTo(readonly alias) made a copy: did not see change in buffer");
+ }
+
+ test->setTo(TRUE, workingBuffer, -1);
+ if(test->length() != 2 || test->charAt(0) != 0x20ac || test->charAt(1) != 0x109) {
+ errln("UnicodeString.setTo(readonly alias, length -1) does not alias correctly");
+ }
+
+ test->setTo(FALSE, workingBuffer, -1);
+ if(!test->isBogus()) {
+ errln("UnicodeString.setTo(unterminated readonly alias, length -1) does not result in isBogus()");
+ }
+
delete test;
}