diff --git a/icu4c/source/common/convert.cpp b/icu4c/source/common/convert.cpp index 206f600870..0e8557a29d 100644 --- a/icu4c/source/common/convert.cpp +++ b/icu4c/source/common/convert.cpp @@ -142,7 +142,7 @@ UnicodeConverterCPP::fromUnicodeString(char* target, mySourceLength = source.length(); - mySource = source.getUChars(); + mySource = source.getArrayStart(); myTarget = target; ucnv_fromUnicode(&myConverter, &myTarget, diff --git a/icu4c/source/common/cpputils.cpp b/icu4c/source/common/cpputils.cpp index 2d19bb9ada..9cb0136d09 100644 --- a/icu4c/source/common/cpputils.cpp +++ b/icu4c/source/common/cpputils.cpp @@ -13,26 +13,32 @@ * Simple utility to set output buffer parameters ******************************************************/ void T_fillOutputParams(const UnicodeString* temp, - UChar* result, - const int32_t resultLength, - int32_t* resultLengthOut, - UErrorCode* status) + UChar* result, + const int32_t resultLength, + int32_t* resultLengthOut, + UErrorCode* status) { - - const int32_t actual = temp->length(); - const bool_t overflowed = actual >= resultLength; - const int32_t returnedSize = uprv_min(actual, resultLength-1); - if ((temp->length() < resultLength) && (result != temp->getUChars()) && (returnedSize > 0)) { - u_strcpy(result, temp->getUChars()); - } - + int32_t actual = temp->length(); + if (resultLength > 0) { - result[returnedSize] = 0; - } - if (resultLengthOut) { - *resultLengthOut = actual; - if (U_SUCCESS(*status) && overflowed) { - *status = U_BUFFER_OVERFLOW_ERROR; + // copy the contents; extract() will check if it needs to copy anything at all + temp->extract(0, resultLength - 1, result, 0); + + // zero-terminate the result buffer + if (actual < resultLength) { + result[actual] = 0; + } else { + result[resultLength - 1] = 0; } } + + // set the output length to the actual string length + if (resultLengthOut != 0) { + *resultLengthOut = actual; + } + + // set the error code according to the necessary buffer length + if (actual >= 
resultLength && U_SUCCESS(*status)) { + *status = U_BUFFER_OVERFLOW_ERROR; + } } diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index a3699b9941..0d4fa66620 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -23,7 +23,7 @@ * getLanguagesForCountry() * 03/16/99 bertrand rehaul. * 07/21/99 stephen Added U_CFUNC setDefault -* 11/09/99 weiv Added const char * getName() const; +* 11/09/99 weiv Added const char * getName() const; ******************************************************************************* */ @@ -322,32 +322,7 @@ void Locale::setHashCode() { UnicodeString fullNameUString(language, ""); - fullNameUString += UnicodeString(country, ""); - fullNameUString += UnicodeString(variant, ""); - const UChar *key = fullNameUString.getUChars(); - int32_t len = fullNameUString.length(); - int32_t hash = 0; - const UChar *limit = key + len; - int32_t inc = (len >= 128 ? len/64 : 1); - - /* - We compute the hash by iterating sparsely over 64 (at most) characters - spaced evenly through the string. For each character, we multiply the - previous hash value by a prime number and add the new character in, - in the manner of a additive linear congruential random number generator, - thus producing a pseudorandom deterministic value which should be well - distributed over the output range. [LIU] - */ - - while(key < limit) - { - hash = (hash * 37) + (char)*key; - key += inc; - } - - if(hash == 0) hash = 1; - - khashCode = hash & 0x7FFFFFFF; + khashCode = fullNameUString.append(UnicodeString(country, "")).append(UnicodeString(variant, "")).hashCode(); } @@ -753,13 +728,15 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) // lookups. 
if(ctry2LangMapping == 0) { UErrorCode err = U_ZERO_ERROR; - UHashtable *temp = uhash_open(uhash_hashUChars, uhash_compareUChars, &err); + UHashtable *temp = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &err); if (U_FAILURE(err)) { - count = 0; - return NULL; + count = 0; + return NULL; } - + + uhash_setKeyDeleter(temp, uhash_deleteUnicodeString); + int32_t i = 0; int32_t j; int32_t count = sizeof(compressedCtry2LangMapping) / sizeof(compressedCtry2LangMapping[0]); @@ -768,15 +745,15 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) compressedCtry2LangMapping.extractBetween(i, i + 2, key); i += 2; for(j = i; j < count; j += 2) - if(Unicode::isUpperCase(compressedCtry2LangMapping[j])) - break; + if(Unicode::isUpperCase(compressedCtry2LangMapping[j])) + break; UnicodeString compressedValues; compressedCtry2LangMapping.extractBetween(i, j, compressedValues); UnicodeString *values = new UnicodeString[compressedValues.length() / 2]; int32_t valLen = sizeof(values) / sizeof(values[0]); for (int32_t k = 0; k < valLen; ++k) - compressedValues.extractBetween(k * 2, (k * 2) + 2, values[k]); - uhash_put(temp, (void*)key.getUChars(), values, &err); + compressedValues.extractBetween(k * 2, (k * 2) + 2, values[k]); + uhash_put(temp, new UnicodeString(key), values, &err); i = j; } @@ -786,9 +763,8 @@ Locale::getLanguagesForCountry(const UnicodeString& country, int32_t& count) else ctry2LangMapping = temp; } - - const UnicodeString *result = (const UnicodeString*) - uhash_get(ctry2LangMapping, country.getUChars()); + + const UnicodeString *result = (const UnicodeString*)uhash_get(ctry2LangMapping, &country); if(result == 0) count = 0; else diff --git a/icu4c/source/common/uloc.c b/icu4c/source/common/uloc.c index d3bf654b18..0dba9e43c8 100644 --- a/icu4c/source/common/uloc.c +++ b/icu4c/source/common/uloc.c @@ -36,7 +36,7 @@ /* UnicodeString stuff */ typedef struct UnicodeString UnicodeString; -U_CAPI const UChar* 
T_UnicodeString_getUChars(const UnicodeString *s); +U_CFUNC int32_t T_UnicodeString_length(const UnicodeString *s); U_CAPI int32_t T_UnicodeString_extract(const UnicodeString *s, char *dst); @@ -1051,7 +1051,7 @@ void _lazyEvaluate_installedLocales() for (i = 0; i < _installedLocalesCount; i++) { - strSize = u_strlen(T_UnicodeString_getUChars(temp[i])); + strSize = T_UnicodeString_length(temp[i]); temp2[i] = (char*) uprv_malloc(sizeof(char) * (strSize + 1)); diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 0b95c12ee7..977976ebeb 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -27,12 +27,14 @@ #include "unicode/ucnv.h" #include "unicode/rep.h" -// Size of stack buffer for small strings -#define US_STACKBUF_SIZE 10 - class Locale; class UCharReference; class UnicodeStringStreamer; +class UnicodeConverterCPP; + +// for unistrm.h +class ostream; +U_COMMON_API ostream &operator<<(ostream& stream, const UnicodeString& s); /** * Unicode String literals in C++. @@ -59,9 +61,43 @@ class UnicodeStringStreamer; #endif /** - * UnicodeString is a concrete implementation of the abstract class - * UnicodeText. UnicodeString performs codeset conversion from char* - * data based on the type of data specified. + * UnicodeString is a concrete implementation of the abstract class Replaceable. + * It is a string class that stores Unicode characters directly and provides + * similar functionality as the Java string class. + * + * UnicodeString uses four storage models: + *
    + *
  1. Short strings are normally stored inside the UnicodeString object itself. + * The object has fields for the "bookkeeping" and a small UChar array. + * When the object is copied, then the internal characters are copied + * into the destination object.
  2. + *
  3. Longer strings are normally stored in allocated memory. + * The allocated UChar array is preceded by a reference counter. + * When the string object is copied, then the allocated buffer is shared by + * incrementing the reference counter.
  4. + *
  5. A UnicodeString can be constructed or setTo() such that it aliases a read-only + * buffer instead of copying the characters. In this case, the string object + * uses this aliased buffer for as long as it is not modified, and it will never + * attempt to modify or release the buffer. This has copy-on-write semantics: + * When the string object is modified, then the buffer contents are first copied + * into writeable memory (inside the object for short strings, or allocated + * buffer for longer strings). When a UnicodeString with a read-only alias + * is assigned to another UnicodeString, then both string objects will + * share the same read-only alias.
  6. + *
  7. A UnicodeString can be constructed or setTo() such that it aliases a writeable + * buffer instead of copying the characters. The difference from the above is that + * the string object will write through to this aliased buffer for write + * operations. Only when the capacity of the buffer is not sufficient is + * a new buffer allocated and the contents copied. + * An efficient way to get the string contents into the original buffer is + * to use the extract(..., UChar *dst, ...) function: It will only copy the + * string contents if the dst buffer is different from the buffer of the string + * object itself. If a string grows and shrinks during a sequence of operations, + * then it will not use the same buffer any more, but may fit into it again. + * When a UnicodeString with a writeable alias is assigned to another UnicodeString, + * then the contents are always copied. The destination string will not alias + * to the buffer that the source string aliases.
  8. + *
*/ class U_COMMON_API UnicodeString : public Replaceable { @@ -678,6 +714,9 @@ public: * Copy the characters in the range * [start, start + length) into the array dst, * beginning at dstStart. + * If the string aliases to dst itself as an external buffer, + * then extract() will not copy the contents. + * * @param start offset of first character which will be copied into the array * @param length the number of characters to extract * @param dst array in which to copy characters. The length of dst @@ -841,23 +880,6 @@ public: */ inline UnicodeString& setTo(const UnicodeString& srcText); - /** - * Set the characters in the UnicodeString object to the characters - * in srcChars in the range - * [srcStart, srcStart + srcLength). - * srcChars is not modified. - * @param srcChars the source for the new characters - * @param srcStart the offset into srcChars where new characters - * will be obtained - * @param srcLength the number of characters in srcChars in the - * replace string - * @return a reference to this - * @stable - */ - inline UnicodeString& setTo(const UChar *srcChars, - UTextOffset srcStart, - int32_t srcLength); - /** * Set the characters in the UnicodeString object to the characters * in srcChars. srcChars is not modified. @@ -879,6 +901,51 @@ public: */ UnicodeString& setTo(UChar srcChar); + /** + * Aliasing setTo() function, analogous to the readonly-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has copy-on-write semantics: + * When the string is modified, then the buffer is first copied into + * newly allocated memory. + * The aliased buffer is never modified. + * In an assignment to another UnicodeString, the text will be aliased again, + * so that both strings then alias the same readonly-text. + * + * @param isTerminated specifies if text is NUL-terminated. + * This must be true if textLength==-1. 
+ * @param text The characters to alias for the UnicodeString. + * @param textLength The number of Unicode characters in text to alias. + * If -1, then this constructor will determine the length + * by calling u_strlen(). + * @draft + */ + UnicodeString &setTo(bool_t isTerminated, + const UChar *text, + int32_t textLength); + + /** + * Aliasing setTo() function, analogous to the writeable-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has write-through semantics: + * For as long as the capacity of the buffer is sufficient, write operations + * will directly affect the buffer. When more capacity is necessary, then + * a new buffer will be allocated and the contents copied as with regularly + * constructed strings. + * In an assignment to another UnicodeString, the buffer will be copied. + * The extract(UChar *dst) function detects whether the dst pointer is the same + * as the string buffer itself and will in this case not copy the contents. + * + * @param buffer The characters to alias for the UnicodeString. + * @param buffLength The number of Unicode characters in buffer to alias. + * @param buffCapacity The size of buffer in UChars. + * @draft + */ + UnicodeString &setTo(UChar *buffer, + int32_t buffLength, + int32_t buffCapacity); + /** * Set the character at the specified offset to the specified character. * @param offset A valid offset into the text of the character to set @@ -1426,12 +1493,15 @@ public: int32_t textLength); /** - * Aliasing UChar* constructor. - * The text will be used for the new UnicodeString object, but + * Readonly-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but * it will not be released when the UnicodeString is destroyed. - * Be careful not to attempt to modify the contents of the UnicodeString - * if the text is read-only. 
Operations that allocate an entirely - * new buffer are harmless. + * This has copy-on-write semantics: + * When the string is modified, then the buffer is first copied into + * newly allocated memory. + * The aliased buffer is never modified. + * In an assignment to another UnicodeString, the text will be aliased again, + * so that both strings then alias the same readonly-text. * * @param isTerminated specifies if text is NUL-terminated. * This must be true if textLength==-1. @@ -1445,6 +1515,26 @@ public: UChar *text, int32_t textLength); + /** + * Writeable-aliasing UChar* constructor. + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has write-through semantics: + * For as long as the capacity of the buffer is sufficient, write operations + * will directly affect the buffer. When more capacity is necessary, then + * a new buffer will be allocated and the contents copied as with regularly + * constructed strings. + * In an assignment to another UnicodeString, the buffer will be copied. + * The extract(UChar *dst) function detects whether the dst pointer is the same + * as the string buffer itself and will in this case not copy the contents. + * + * @param buffer The characters to alias for the UnicodeString. + * @param buffLength The number of Unicode characters in buffer to alias. + * @param buffCapacity The size of buffer in UChars. + * @draft + */ + UnicodeString(UChar *buffer, int32_t buffLength, int32_t buffCapacity); + /** * char* constructor. * @param codepageData an array of bytes, null-terminated @@ -1480,7 +1570,7 @@ public: * @param that The UnicodeString object to copy. * @stable */ - inline UnicodeString(const UnicodeString& that); + UnicodeString(const UnicodeString& that); /** Destructor. * @stable @@ -1519,24 +1609,6 @@ public: UCharReference operator[] (UTextOffset pos); - // {sfb} remove these later? 
- /* Hack to avoid circular dependencies */ - - /** - * Convert the characters in this to UPPER CASE following the conventions of - * the default locale. - * @retrurn A reference to this. - */ - // UnicodeString& toUpper(); - - /** - * Convert the characters in this to lower case following the conventions of - * the default locale. - * @retrurn A reference to this. - */ - // UnicodeString& toLower(); - - //======================================== // Implementation methods //======================================== @@ -1575,9 +1647,6 @@ private: inline UChar doCharAt(UTextOffset offset) const; - UnicodeString& doSetCharAt(UTextOffset offset, - UChar c); - UnicodeString& doReplace(UTextOffset start, int32_t length, const UnicodeString& srcText, @@ -1603,6 +1672,15 @@ private: // get the "real" capacity of the array, adjusted for ref count inline int32_t getCapacity(void) const; + // allocate the array; result may be fStackBuffer + // sets refCount to 1 if appropriate + // sets fArray, fCapacity, and fFlags + // returns boolean for success or failure + bool_t allocate(int32_t capacity); + + // release the array if owned + inline void releaseArray(); + // utility method to get around lack of exception handling void setToBogus(void); @@ -1621,36 +1699,53 @@ private: * subset ("invariant characters") of the platform encoding. See utypes.h. */ void doCodepageCreate(const char *codepageData, - int32_t dataLength, - const char *codepage); + int32_t dataLength, + const char *codepage); - // clones array if refCount > 1 - void cloneArrayIfNeeded(void); + /* + * This function is called when write access to the array + * is necessary. + * + * We need to make a copy of the array if + * the buffer is read-only, or + * the buffer is refCounted (shared), and refCount>1, or + * the buffer is too small. + * + * Return FALSE if memory could not be allocated. 
+ */ + bool_t cloneArrayIfNeeded(int32_t newCapacity = -1, + int32_t growCapacity = -1, + bool_t doCopyArray = TRUE, + int32_t **pBufferToDelete = 0); // ref counting - inline uint16_t addRef(void); - inline uint16_t removeRef(void); - inline uint16_t refCount(void) const; - inline uint16_t setRefCount(uint16_t count); - - UChar fStackBuffer [ US_STACKBUF_SIZE ]; // buffer for small strings - UChar *fArray; // the Unicode data - int32_t fLength; // number characters in fArray - int32_t fCapacity; // sizeof fArray - int32_t fHashCode; // the hash code - bool_t fRefCounted; // indicates if we own storage - bool_t fBogus; // indicates if an operation failed + inline int32_t addRef(void); + inline int32_t removeRef(void); + inline int32_t refCount(void) const; + inline int32_t setRefCount(int32_t count); // constants - static const UChar fgInvalidUChar; // invalid UChar index - static const int32_t kGrowSize; // grow size for this buffer - static const int32_t kInvalidHashCode; // invalid hash code - static const int32_t kEmptyHashCode; // hash code for empty string - + enum { + US_STACKBUF_SIZE=9, // Size of stack buffer for small strings + kInvalidUChar=0xffff, // invalid UChar index + kGrowSize=128, // grow size for this buffer + kInvalidHashCode=0, // invalid hash code + kEmptyHashCode=1, // hash code for empty string + + // bit flag values for fFlags + kIsBogus=1, // this string is bogus, i.e., not valid + kUsingStackBuffer=2, // fArray==fStackBuffer + kRefCounted=4, // there is a refCount field before the characters in fArray + kBufferIsReadonly=8, // do not write to this buffer + + // combined values for convenience + kShortString=kUsingStackBuffer, + kLongString=kRefCounted, + kReadonlyAlias=kBufferIsReadonly, + kWriteableAlias=0 + }; + // statics - inline static int32_t allocation(int32_t minSize); // allocation algorithm - inline static UChar* allocate(int32_t minSize, // allocate buffer >= minSize - int32_t& actualSize); // default converter cache static 
UConverter* getDefaultConverter(UErrorCode& status); @@ -1659,6 +1754,27 @@ private: static UConverter *fgDefaultConverter; friend class UnicodeStringStreamer; + friend class UnicodeConverterCPP; + friend U_COMMON_API ostream &operator<<(ostream& stream, const UnicodeString& s); + + /* + * The following are all the class fields that are stored + * in each UnicodeString object. + * Note that UnicodeString has virtual functions, + * therefore there is an implicit vtable pointer + * as the first real field. + * The fields should be aligned such that no padding is + * necessary, mostly by having larger types first. + * On 32-bit machines, the size should be 40 bytes, + * on 64-bit machines (8-byte pointers), it should be 48 bytes. + */ + // (implicit) *vtable; + UChar *fArray; // the Unicode data + int32_t fLength; // number characters in fArray + int32_t fCapacity; // sizeof fArray + int32_t fHashCode; // the hash code + uint16_t fFlags; // bit flags: see constants above + UChar fStackBuffer [ US_STACKBUF_SIZE ]; // buffer for small strings public: @@ -1684,19 +1800,11 @@ public: /* @deprecated */ inline void operator delete(void *location); - //======================================== // Non-public API - will be removed! 
//======================================== - /* @deprecated */ - UnicodeString(UChar *buff, int32_t bufLength, int32_t buffCapacity); - /* @deprecated */ - const UChar* getUChars(void) const; - /* @deprecated */ - inline const UChar* getUniChars(void) const; - /* @deprecated */ - UChar* orphanStorage(void); - + /* @deprecated */ + const UChar* getUChars() const; }; //======================================== @@ -1716,12 +1824,6 @@ uprv_arrayCopy(const UnicodeString *src, int32_t srcStart, //======================================== // Inline members //======================================== -inline -UnicodeString::UnicodeString(const UnicodeString& that) - : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), fBogus(FALSE) -{ *this = that; } - //======================================== // Read-only alias methods @@ -2089,10 +2191,10 @@ UnicodeString::extractBetween(UTextOffset start, inline UChar UnicodeString::doCharAt(UTextOffset offset) const { - if(offset < 0 || offset >= fLength) - return fgInvalidUChar; - // in ref-counted implementation, first char is ref count - return fArray[ fRefCounted ? 
offset + 1 : offset ]; + if(offset < 0 || offset >= fLength) { + return kInvalidUChar; + } + return fArray[ offset ]; } inline UChar @@ -2136,11 +2238,13 @@ inline UnicodeString& UnicodeString::setTo(const UnicodeString& srcText) { return doReplace(0, fLength, srcText, 0, srcText.fLength); } +#if 0 inline UnicodeString& UnicodeString::setTo(const UChar *srcChars, UTextOffset srcStart, int32_t srcLength) { return doReplace(0, fLength, srcChars, srcStart, srcLength); } +#endif inline UnicodeString& UnicodeString::setTo(const UChar *srcChars, @@ -2311,7 +2415,7 @@ UnicodeString::reverse(UTextOffset start, //======================================== inline bool_t UnicodeString::isBogus() const -{ return fBogus; } +{ return fFlags & kIsBogus; } //======================================== @@ -2320,31 +2424,38 @@ UnicodeString::isBogus() const inline UChar* UnicodeString::getArrayStart() -{ return (fRefCounted ? fArray + 1 : fArray); } +{ return fArray; } inline const UChar* UnicodeString::getArrayStart() const -{ return (fRefCounted ? fArray + 1 : fArray); } +{ return fArray; } inline int32_t UnicodeString::getCapacity() const -{ return (fRefCounted ? 
fCapacity - 1 : fCapacity); } +{ return fCapacity; } -inline uint16_t +inline void +UnicodeString::releaseArray() { + if((fFlags & kRefCounted) && removeRef() == 0) { + delete [] ((int32_t *)fArray - 1); + } +} + +inline int32_t UnicodeString::addRef() -{ return ++(fArray[0]); } +{ return ++*((int32_t *)fArray - 1); } -inline uint16_t +inline int32_t UnicodeString::removeRef() -{ return --(fArray[0]); } +{ return --*((int32_t *)fArray - 1); } -inline uint16_t +inline int32_t UnicodeString::refCount() const -{ return fArray[0]; } +{ return *((int32_t *)fArray - 1); } -inline uint16_t -UnicodeString::setRefCount(uint16_t count) -{ fRefCounted = TRUE; return (fArray[0] = count); } +inline int32_t +UnicodeString::setRefCount(int32_t count) +{ return (*((int32_t *)fArray - 1) = count); } // deprecated API - remove later @@ -2352,10 +2463,6 @@ inline int32_t UnicodeString::size() const { return fLength; } -inline const UChar* -UnicodeString::getUniChars() const -{ return getUChars(); } - inline UnicodeString& UnicodeString::findAndReplace(const UnicodeString& oldText, const UnicodeString& newText, @@ -2380,14 +2487,6 @@ UnicodeString::operator delete(void *location) //======================================== // Static members //======================================== -inline int32_t -UnicodeString::allocation(int32_t minSize) -{ return minSize < kGrowSize ? 
kGrowSize - : (minSize * 2 + kGrowSize) & ~(kGrowSize - 1); } - -inline UChar* -UnicodeString::allocate(int32_t minSize, int32_t& actualSize) -{ actualSize = allocation(minSize); return new UChar[ actualSize ]; } //======================================== // class UCharReference @@ -2442,6 +2541,3 @@ UCharReference::operator UChar() { return fString->charAt(fPos); } #endif - - - diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp index 5e95cb30d3..651ee35026 100644 --- a/icu4c/source/common/unistr.cpp +++ b/icu4c/source/common/unistr.cpp @@ -17,7 +17,6 @@ ******************************************************************************* */ - #include "unicode/utypes.h" #include "unicode/putil.h" #include "unicode/locid.h" @@ -78,11 +77,6 @@ us_arrayCopy(const UChar *src, int32_t srcStart, } } -// static initialization -const UChar UnicodeString::fgInvalidUChar = 0xFFFF; -const int32_t UnicodeString::kGrowSize = 0x80; -const int32_t UnicodeString::kInvalidHashCode = 0; -const int32_t UnicodeString::kEmptyHashCode = 1; UConverter* UnicodeString::fgDefaultConverter = 0; //======================================== @@ -92,58 +86,47 @@ UnicodeString::UnicodeString() : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(kShortString) {} UnicodeString::UnicodeString(int32_t capacity) : fArray(0), fLength(0), - fCapacity(0), - fRefCounted(FALSE), + fCapacity(US_STACKBUF_SIZE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(0) { - fArray = allocate(capacity, fCapacity); - if(! 
fArray) { - setToBogus(); - return; - } - - setRefCount(1); + allocate(capacity); } UnicodeString::UnicodeString(UChar ch) : fArray(fStackBuffer), - fLength(0), + fLength(1), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), - fHashCode(kEmptyHashCode), - fBogus(FALSE) + fHashCode(kInvalidHashCode), + fFlags(kShortString) { - doReplace(0, 0, &ch, 0, 1); + fStackBuffer[0] = ch; } UnicodeString::UnicodeString(const UChar *text) : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(kShortString) { doReplace(0, 0, text, 0, u_strlen(text)); } -UnicodeString::UnicodeString( const UChar *text, - int32_t textLength) +UnicodeString::UnicodeString(const UChar *text, + int32_t textLength) : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(kShortString) { doReplace(0, 0, text, 0, textLength); } @@ -152,54 +135,113 @@ UnicodeString::UnicodeString(bool_t isTerminated, UChar *text, int32_t textLength) : fArray(text), - fLength(textLength != -1 || !isTerminated ? textLength : u_strlen(text)), - fCapacity(isTerminated ? fLength + 1 : fLength), - fRefCounted(FALSE), + fLength(textLength), + fCapacity(isTerminated ? 
textLength + 1 : textLength), fHashCode(kInvalidHashCode), - fBogus(FALSE) + fFlags(kReadonlyAlias) { - if(fLength < 0) { + if(text == 0 || textLength < -1 || textLength == -1 && !isTerminated) { + setToBogus(); + } else if(textLength == -1) { + // text is terminated, or else it would have failed the above test + fLength = u_strlen(text); + fCapacity = fLength + 1; + } +} + +UnicodeString::UnicodeString(UChar *buff, + int32_t bufLength, + int32_t buffCapacity) + : fArray(buff), + fLength(bufLength), + fCapacity(buffCapacity), + fHashCode(kInvalidHashCode), + fFlags(kWriteableAlias) +{ + if(buff == 0 || bufLength < 0 || bufLength > buffCapacity) { setToBogus(); } } UnicodeString::UnicodeString(const char *codepageData, - const char *codepage) + const char *codepage) : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(kShortString) { - if(codepageData != 0) + if(codepageData != 0) { doCodepageCreate(codepageData, uprv_strlen(codepageData), codepage); + } } UnicodeString::UnicodeString(const char *codepageData, - int32_t dataLength, - const char *codepage) + int32_t dataLength, + const char *codepage) : fArray(fStackBuffer), fLength(0), fCapacity(US_STACKBUF_SIZE), - fRefCounted(FALSE), fHashCode(kEmptyHashCode), - fBogus(FALSE) + fFlags(kShortString) { if(codepageData != 0) { doCodepageCreate(codepageData, dataLength, codepage); } } +UnicodeString::UnicodeString(const UnicodeString& that) + : fArray(fStackBuffer), + fLength(0), + fCapacity(US_STACKBUF_SIZE), + fHashCode(kEmptyHashCode), + fFlags(kShortString) +{ + *this = that; +} + +//======================================== +// array allocation +//======================================== + +bool_t +UnicodeString::allocate(int32_t capacity) { + if(capacity <= US_STACKBUF_SIZE) { + fArray = fStackBuffer; + fCapacity = US_STACKBUF_SIZE; + fFlags = kShortString; + } else { + // count bytes for the refCounter and the string capacity, 
and + // round up to a multiple of 16; then divide by 4 and allocate int32_t's + // to be safely aligned for the refCount + int32_t words = ((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2; + int32_t *array = new int32_t[words]; + if(array != 0) { + // set initial refCount and point behind the refCount + *array++ = 1; + + // have fArray point to the first UChar + fArray = (UChar *)array; + fCapacity = (words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR); + fFlags = kLongString; + } else { + fArray = 0; + fCapacity = 0; + fHashCode = kInvalidHashCode; // for constructor(capacity) to be correctly bogus + fFlags = kIsBogus; + return FALSE; + } + } + return TRUE; +} + //======================================== // Destructor //======================================== UnicodeString::~UnicodeString() { - // decrement ref count and reclaim storage, if owned - if(fRefCounted && removeRef() == 0) - delete [] fArray; + releaseArray(); } //======================================== @@ -209,37 +251,62 @@ UnicodeString& UnicodeString::operator= (const UnicodeString& src) { // if assigning to ourselves, do nothing - if(this == &src) { + if(this == 0 || this == &src) { return *this; } - // if src is bogus, set ourselves to bogus - if(src.isBogus()) { + // is the right side bogus? 
+ if(&src == 0 || src.isBogus()) { setToBogus(); return *this; } - // if src is aliased or ref counted, point ourselves at its array - if(src.fArray != src.fStackBuffer) { + // delete the current contents + releaseArray(); - // if we're ref counted, decrement our current ref count - if(fRefCounted && removeRef() == 0) - delete [] fArray; + // we always copy the length and the hash code + fLength = src.fLength; + fHashCode = src.fHashCode; - fArray = src.fArray; - fLength = src.fLength; - fCapacity = src.fCapacity; - fHashCode = src.fHashCode; - fRefCounted = src.fRefCounted; - if(fRefCounted) { - addRef(); + switch(src.fFlags) { + case kShortString: + // short string using the stack buffer, do the same + fArray = fStackBuffer; + fCapacity = US_STACKBUF_SIZE; + fFlags = kShortString; + if(fLength > 0) { + uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR); } - fBogus = FALSE; - } - // if src isn't ref counted, just do a replace - else { - doReplace(0, fLength, src.fArray, 0, src.fLength); - fHashCode = src.fHashCode; + break; + case kLongString: + // src uses a refCounted string buffer, use that buffer with refCount + // src is const, use a cast - we don't really change it + ((UnicodeString &)src).addRef(); + // fall through to readonly alias copying: copy all fields + case kReadonlyAlias: + // src is a readonly alias, do the same + fArray = src.fArray; + fCapacity = src.fCapacity; + fFlags = src.fFlags; + break; + case kWriteableAlias: + // src is a writeable alias; we make a copy of that instead + if(allocate(fLength)) { + if(fLength > 0) { + uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR); + } + break; + } + // if there is not enough memory, then fall through to setting to bogus + default: + // if src is bogus, set ourselves to bogus + // do not call setToBogus() here because fArray and fFlags are not consistent here + fArray = 0; + fLength = 0; + fCapacity = 0; + fHashCode = kInvalidHashCode; + fFlags = kIsBogus; + break; } return *this; 
@@ -317,6 +384,11 @@ UnicodeString::doCompare( UTextOffset start, // get the correct pointer const UChar *chars = getArrayStart(); + // are we comparing the same buffer contents? + if(chars + start == srcChars + srcStart) { + return 0; + } + UTextOffset minLength; int8_t lengthResult; @@ -374,12 +446,14 @@ UnicodeString::doExtract(UTextOffset start, UChar *dst, UTextOffset dstStart) const { - // pin indices to legal values - pinIndices(start, length); - us_arrayCopy(getArrayStart(), start, dst, dstStart, length); + // do not copy anything if we alias dst itself + if(fArray + start != dst + dstStart) { + // pin indices to legal values + pinIndices(start, length); + us_arrayCopy(getArrayStart(), start, dst, dstStart, length); + } } - UTextOffset UnicodeString::indexOf(const UChar *srcChars, UTextOffset srcStart, @@ -557,17 +631,79 @@ UnicodeString::findAndReplace(UTextOffset start, // Write implementation //======================================== +void +UnicodeString::setToBogus() +{ + releaseArray(); + + fArray = 0; + fCapacity = fLength = 0; + fHashCode = kInvalidHashCode; + fFlags = kIsBogus; +} + +// setTo() analogous to the readonly-aliasing constructor with the same signature +UnicodeString & +UnicodeString::setTo(bool_t isTerminated, + const UChar *text, + int32_t textLength) +{ + if(text == 0 || textLength < -1 || textLength == -1 && !isTerminated) { + setToBogus(); + return *this; + } + + releaseArray(); + + fArray = (UChar *)text; + if(textLength != -1) { + fLength = textLength; + } else { + // text is terminated, or else it would have failed the above test + fLength = u_strlen(text); + fCapacity = fLength + 1; + } + + fCapacity = isTerminated ? 
textLength + 1 : textLength; + fHashCode = kInvalidHashCode; + fFlags = kReadonlyAlias; + return *this; +} + +// setTo() analogous to the writeable-aliasing constructor with the same signature +UnicodeString & +UnicodeString::setTo(UChar *buffer, + int32_t buffLength, + int32_t buffCapacity) { + if(buffer == 0 || buffLength < 0 || buffLength > buffCapacity) { + setToBogus(); + return *this; + } + + releaseArray(); + + fArray = buffer; + fLength = buffLength; + fCapacity = buffCapacity; + fHashCode = kInvalidHashCode; + fFlags = kWriteableAlias; + return *this; +} + UnicodeString& UnicodeString::setCharAt(UTextOffset offset, UChar c) { - if(offset < 0) - offset = 0; - else if(offset >= fLength) - offset = fLength - 1; + if(cloneArrayIfNeeded()) { + if(offset < 0) { + offset = 0; + } else if(offset >= fLength) { + offset = fLength - 1; + } - doSetCharAt(offset, c); - fHashCode = kInvalidHashCode; + fArray[offset] = c; + fHashCode = kInvalidHashCode; + } return *this; } @@ -586,8 +722,16 @@ UnicodeString::toUpper(const Locale& locale) UTextOffset limit = fLength; UChar c; UnicodeString lang; + char langChars[16]; + if(!cloneArrayIfNeeded()) { + return *this; + } + + // get char * locale language locale.getLanguage(lang); + lang.extract(0, lang.length(), langChars, ""); + langChars[lang.length()] = 0; // The German sharp S character (U+00DF)'s uppercase equivalent is // "SS", making it the only character that expands to two characters @@ -598,56 +742,46 @@ UnicodeString::toUpper(const Locale& locale) // string looking for sharp S characters and then go back and make // room for the extra capital Ses if we find any. [For performance, // we only do this extra work if the language is actually German] - if(lang == "de") { + if(uprv_strcmp(langChars, "de") == 0) { UChar SS [] = { 0x0053, 0x0053 }; while(start < limit) { - c = getArrayStart()[start]; // A sharp s needs to be replaced with two capital S's. 
if(c == 0x00DF) { - doReplace(start, 1, SS, 0, 2); - start++; - limit++; + doReplace(start, 1, SS, 0, 2); + start++; + limit++; + } else { + // Otherwise, the case conversion can be handled by the Unicode unit. + fArray[start] = Unicode::toUpperCase(c); } - // Otherwise, the case conversion can be handled by the Unicode unit. - else if(Unicode::isLowerCase(c)) - doSetCharAt(start, Unicode::toUpperCase(c)); - // If no conversion is necessary, do nothing ++start; } - } - - // If the specfied language is Turkish, then we have to special-case - // for the Turkish dotted and dotless Is. The regular lowercase i - // maps to the capital I with a dot (U+0130), and the lowercase i - // without the dot (U+0131) maps to the regular capital I - else if(lang == "tr") { + } else if(uprv_strcmp(langChars, "tr") == 0) { + // If the specfied language is Turkish, then we have to special-case + // for the Turkish dotted and dotless Is. The regular lowercase i + // maps to the capital I with a dot (U+0130), and the lowercase i + // without the dot (U+0131) maps to the regular capital I while(start < limit) { c = getArrayStart()[start]; - if(c == 0x0069/*'i'*/) - doSetCharAt(start, 0x0130); - else if(c == 0x0131) - doSetCharAt(start, 0x0049/*'I'*/); - else if(Unicode::isLowerCase(c)) - doSetCharAt(start, Unicode::toUpperCase(c)); + if(c == 0x0069/*'i'*/) { + fArray[start] = 0x0130; + } else if(c == 0x0131) { + fArray[start] = 0x0049/*'I'*/; + } else { + fArray[start] = Unicode::toUpperCase(c); + } ++start; } - } - - else { - // clone our array, if necessary - cloneArrayIfNeeded(); + } else { UChar *array = getArrayStart(); while(start < limit) { - c = array[start]; - if(Unicode::isLowerCase(c)) { - array[start] = Unicode::toUpperCase(c); - } + array[start] = Unicode::toUpperCase(array[start]); ++start; } } @@ -664,59 +798,60 @@ UnicodeString::toLower(const Locale& locale) UTextOffset limit = fLength; UChar c; UnicodeString lang; + char langChars[16]; + if(!cloneArrayIfNeeded()) { + 
return *this; + } + + // get char * locale language locale.getLanguage(lang); + lang.extract(0, lang.length(), langChars, ""); + langChars[lang.length()] = 0; // if the specfied language is Turkish, then we have to special-case // for the Turkish dotted and dotless Is. The capital I with a dot // (U+0130) maps to the regular lowercase i, and the regular capital // I maps to the lowercase i without the dot (U+0131) - if(lang == "tr") { + if(uprv_strcmp(langChars, "tr") == 0) { while(start < limit) { c = getArrayStart()[start]; if(c == 0x0049) // 'I' - doSetCharAt(start, 0x0131); + fArray[start] = 0x0131; else if(c == 0x0130) - doSetCharAt(start, 0x0069); // 'i' - else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) - doSetCharAt(start, Unicode::toLowerCase(c)); + fArray[start] = 0x0069; // 'i' + else { + fArray[start] = Unicode::toLowerCase(c); + } ++start; } - } - - // if the specfied language is Greek, then we have to special-case - // for the capital letter sigma (U+3A3), which has two lower-case - // forms. If the character following the capital sigma is a letter, - // we use the medial form (U+3C3); otherwise, we use the final form - // (U+3C2). - else if(lang == "el") { + } else if(uprv_strcmp(langChars, "el") == 0) { + // if the specfied language is Greek, then we have to special-case + // for the capital letter sigma (U+3A3), which has two lower-case + // forms. If the character following the capital sigma is a letter, + // we use the medial form (U+3C3); otherwise, we use the final form + // (U+3C2). 
while(start < limit) { c = getArrayStart()[start]; if(c == 0x3a3) { - if(start + 1 < limit && Unicode::isLetter(getArrayStart()[start + 1])) - doSetCharAt(start, 0x3C3); - else - doSetCharAt(start, 0x3C2); + if(start + 1 < limit && Unicode::isLetter(getArrayStart()[start + 1])) { + fArray[start] = 0x3C3; + } else { + fArray[start] = 0x3C2; + } + } else { + fArray[start] = Unicode::toLowerCase(c); } - else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) - doSetCharAt(start, Unicode::toLowerCase(c)); ++start; } - } - - // if the specified language is anything other than Turkish or - // Greek, we rely on the Unicode class to do all our case mapping-- - // there are no other special cases - else { - // clone our array, if necessary - cloneArrayIfNeeded(); + } else { + // if the specified language is anything other than Turkish or + // Greek, we rely on the Unicode class to do all our case mapping-- + // there are no other special cases UChar *array = getArrayStart(); while(start < limit) { - c = array[start]; - if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) { - array[start] = Unicode::toLowerCase(c); - } + array[start] = Unicode::toLowerCase(array[start]); ++start; } } @@ -726,19 +861,6 @@ UnicodeString::toLower(const Locale& locale) return *this; } -// for speed, no bounds checking is performed and the hash code isn't changed -UnicodeString& -UnicodeString::doSetCharAt(UTextOffset offset, - UChar c) -{ - // clone our array, if necessary - cloneArrayIfNeeded(); - - // set the character - fArray[ (fRefCounted ? 
offset + 1 : offset) ] = c; - return *this; -} - UnicodeString& UnicodeString::doReplace( UTextOffset start, int32_t length, @@ -766,70 +888,52 @@ UnicodeString::doReplace(UTextOffset start, UTextOffset srcStart, int32_t srcLength) { - // if we're bogus, do nothing - if(fBogus) - return *this; + // if we're bogus, set us to empty first + if(isBogus()) { + fArray = fStackBuffer; + fLength = 0; + fCapacity = US_STACKBUF_SIZE; + fHashCode = kEmptyHashCode; + fFlags = kShortString; + } if(srcChars == 0) { srcStart = srcLength = 0; } - bool_t deleteWhenDone = FALSE; - UChar *bufferToDelete = 0; + int32_t *bufferToDelete = 0; - // clone our array, if necessary - cloneArrayIfNeeded(); + // the following may change fArray but will not copy the current contents; + // therefore we need to keep the current fArray + UChar *oldArray = fArray; + int32_t oldLength = fLength; // pin the indices to legal values pinIndices(start, length); // calculate the size of the string after the replace - int32_t newSize = fLength - length + srcLength; + int32_t newSize = oldLength - length + srcLength; - // allocate a bigger array if needed - if( newSize > getCapacity() ) { - - // allocate at minimum needed space - int32_t tempLength; - UChar *temp = allocate(newSize + 1, tempLength); - if(! temp) { - setToBogus(); - return *this; - } - - // if we're not currently ref counted, shift the array right by one - if(fRefCounted == FALSE) - us_arrayCopy(fArray, 0, temp, 1, fLength); - // otherwise, copy the old array into temp, including the ref count - else - us_arrayCopy(fArray, 0, temp, 0, fLength + 1); - - // delete the old array if we were ref counted - if(fRefCounted && removeRef() == 0) { - // if the srcChars array is the same as this object's array, - // don't delete it until the end of the method. 
this can happen - // in code like UnicodeString s = "foo"; s += s; - if(srcChars != getArrayStart()) - delete [] fArray; - else { - deleteWhenDone = TRUE; - bufferToDelete = fArray; - } - } - - // use the new array - fCapacity = tempLength; - fArray = temp; - setRefCount(1); + // clone our array and allocate a bigger array if needed + if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize, + FALSE, &bufferToDelete) + ) { + return *this; } // now do the replace - // first copy the portion that isn't changing, leaving a hole - if(length != srcLength) { - us_arrayCopy(getArrayStart(), start + length, - getArrayStart(), start + srcLength, - fLength - (start + length)); + if(fArray != oldArray) { + // if fArray changed, then we need to copy everything except what will change + us_arrayCopy(oldArray, 0, fArray, 0, start); + us_arrayCopy(oldArray, start + length, + fArray, start + srcLength, + oldLength - (start + length)); + } else if(length != srcLength) { + // fArray did not change; copy only the portion that isn't changing, leaving a hole + us_arrayCopy(oldArray, start + length, + fArray, start + srcLength, + oldLength - (start + length)); } // now fill in the hole with the new string @@ -838,8 +942,9 @@ UnicodeString::doReplace(UTextOffset start, fLength = newSize; fHashCode = kInvalidHashCode; - if(deleteWhenDone) - delete [] bufferToDelete; + // delayed delete in case srcChars == fArray when we started, and + // to keep oldArray alive for the above operations + delete [] bufferToDelete; return *this; } @@ -859,11 +964,9 @@ UnicodeString::doReverse(UTextOffset start, int32_t length) { // if we're bogus, do nothing - if(fBogus) + if(isBogus() || !cloneArrayIfNeeded()) { return *this; - - // clone our array, if necessary - cloneArrayIfNeeded(); + } // pin the indices to legal values pinIndices(start, length); @@ -890,10 +993,9 @@ int32_t UnicodeString::doHashCode() { const UChar *key = getArrayStart(); - int32_t len = fLength; + int32_t len = fLength; 
int32_t hash = kInvalidHashCode; - const UChar *limit = key + len; - int32_t inc = (len >= 128 ? len/64 : 1); + const UChar *limit = key + len; /* We compute the hash by iterating sparsely over 64 (at most) @@ -904,47 +1006,41 @@ UnicodeString::doHashCode() deterministic value which should be well distributed over the output range. [LIU] */ - while(key < limit) { - hash = (hash * 37) + *key; - key += inc; + if(len <= 64) { + while(key < limit) { + hash = (hash * 37) + *key++; + } + } else { + int32_t inc = (len+63)/64; + + while(key < limit) { + hash = (hash * 37) + *key; + key += inc; + } } - if(hash == kInvalidHashCode) + hash &= 0x7fffffff; + if(hash == kInvalidHashCode) { hash = kEmptyHashCode; + } fHashCode = hash; return fHashCode; } -//======================================== -// Bogusify? -//======================================== -void -UnicodeString::setToBogus() -{ - if(fRefCounted && removeRef() == 0) { - delete [] fArray; - } - - fArray = 0; - fCapacity = fLength = 0; - fHashCode = kInvalidHashCode; - fRefCounted = FALSE; - fBogus = TRUE; -} - //======================================== // Codeset conversion //======================================== int32_t UnicodeString::extract(UTextOffset start, - int32_t length, - char *dst, - const char *codepage) const + int32_t length, + char *dst, + const char *codepage) const { // if we're bogus or there's nothing to convert, do nothing - if(fBogus || length == 0) + if(isBogus() || length <= 0) { return 0; + } // pin the indices to legal values pinIndices(start, length); @@ -976,10 +1072,11 @@ UnicodeString::extract(UTextOffset start, // if it is an empty string, then use the "invariant character" conversion if(U_FAILURE(status)) { // close the converter - if(codepage == 0) + if(codepage == 0) { releaseDefaultConverter(converter); - else + } else { ucnv_close(converter); + } return 0; } @@ -997,17 +1094,19 @@ UnicodeString::extract(UTextOffset start, myTargetLimit = myTarget + arraySize; /* Pin the limit to 
U_MAX_PTR. NULL check is for AS/400. */ - if((myTargetLimit < myTarget) || (myTargetLimit == NULL)) - myTargetLimit = (char*)U_MAX_PTR; + if((myTargetLimit < myTarget) || (myTargetLimit == NULL)) { + myTargetLimit = (char*)U_MAX_PTR; + } ucnv_fromUnicode(converter, &myTarget, myTargetLimit, - &mySource, mySourceEnd, NULL, TRUE, &status); + &mySource, mySourceEnd, 0, TRUE, &status); // close the converter - if(codepage == 0) + if(codepage == 0) { releaseDefaultConverter(converter); - else + } else { ucnv_close(converter); + } return (myTarget - dst); } @@ -1018,35 +1117,29 @@ UnicodeString::doCodepageCreate(const char *codepageData, const char *codepage) { // if there's nothing to convert, do nothing - if(codepageData == 0 || dataLength == 0) + if(codepageData == 0 || dataLength <= 0) { return; + } - // set up the conversion parameters - int32_t sourceLen = dataLength; - const char *mySource = codepageData; - const char *mySourceEnd = mySource + sourceLen; - UChar *myTarget; - UErrorCode status = U_ZERO_ERROR; - int32_t arraySize = getCapacity(); + UErrorCode status = U_ZERO_ERROR; // create the converter - UConverter *converter = 0; - // if the codepage is the default, use our cache // if it is an empty string, then use the "invariant character" conversion - converter = (codepage == 0 ? - getDefaultConverter(status) : - *codepage == 0 ? - 0 : - ucnv_open(codepage, &status)); + UConverter *converter = (codepage == 0 ? + getDefaultConverter(status) : + *codepage == 0 ? 
+ 0 : + ucnv_open(codepage, &status)); // if we failed, set the appropriate flags and return if(U_FAILURE(status)) { // close the converter - if(codepage == 0) + if(codepage == 0) { releaseDefaultConverter(converter); - else + } else { ucnv_close(converter); + } setToBogus(); return; } @@ -1056,170 +1149,84 @@ UnicodeString::doCodepageCreate(const char *codepageData, // perform the conversion if(converter == 0) { // use the "invariant characters" conversion - if(arraySize < dataLength) { - int32_t tempCapacity; - // allocate enough space for the dataLength, the refCount, and a NUL - UChar *temp = allocate(dataLength + 2, tempCapacity); - - if(temp == 0) { - // set flags and return - setToBogus(); - return; - } - - fArray = temp; - fCapacity = tempCapacity; - - setRefCount(1); - - u_charsToUChars(codepageData, fArray + 1, dataLength); - fArray[dataLength + 1] = 0; - } else { + if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { u_charsToUChars(codepageData, getArrayStart(), dataLength); + fLength = dataLength; + } else { + setToBogus(); } - fLength = dataLength; return; } - myTarget = getArrayStart(); + // set up the conversion parameters + const char *mySource = codepageData; + const char *mySourceEnd = mySource + dataLength; + UChar *myTarget; + + // estimate the size needed: + // 1.25 UChar's per source byte should cover most cases + int32_t arraySize = dataLength + (dataLength >> 2); + + // we do not care about the current contents + bool_t doCopyArray = FALSE; for(;;) { - // reset the error code - status = U_ZERO_ERROR; + if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { + setToBogus(); + break; + } // perform the conversion - ucnv_toUnicode(converter, &myTarget, myTarget + arraySize, - &mySource, mySourceEnd, NULL, TRUE, &status); + myTarget = fArray + fLength; + ucnv_toUnicode(converter, &myTarget, fArray + fCapacity, + &mySource, mySourceEnd, 0, FALSE, &status); // update the conversion parameters - fLength = myTarget - getArrayStart(); + 
fLength = myTarget - fArray; // allocate more space and copy data, if needed if(status == U_INDEX_OUTOFBOUNDS_ERROR) { - int32_t tempCapacity; - UChar *temp = allocate(fCapacity, tempCapacity); + // reset the error code + status = U_ZERO_ERROR; - if(! temp) { - // set flags and return - setToBogus(); - break; - } + // keep the previous conversion results + doCopyArray = TRUE; - if(fRefCounted) { - // copy the old array into temp - us_arrayCopy(fArray, 1, temp, 1, fLength); - delete [] fArray; - } else { - // if we're not currently ref counted, shift the array right by one - us_arrayCopy(fArray, 0, temp, 1, fLength); - } - - fArray = temp; - fCapacity = tempCapacity; - - setRefCount(1); - - myTarget = getArrayStart() + fLength; - arraySize = getCapacity() - fLength; + // estimate the new size needed, larger than before + // try 2 UChar's per remaining source byte + arraySize = fLength + 2 * (mySourceEnd - mySource); } else { break; } } // close the converter - if(codepage == 0) + if(codepage == 0) { releaseDefaultConverter(converter); - else + } else { ucnv_close(converter); + } } //======================================== // External Buffer //======================================== -UnicodeString::UnicodeString(UChar *buff, - int32_t bufLength, - int32_t buffCapacity) - : fArray(buff), - fLength(bufLength), - fCapacity(buffCapacity), - fRefCounted(FALSE), - fHashCode(kInvalidHashCode), - fBogus(FALSE) -{} - +// ### TODO: +// this is very, very dirty: we should not ever expose our array to the outside, +// and this also violates the const-ness of this object +// this must be removed when the resource bundle implementation does not need it any more! 
const UChar* -UnicodeString::getUChars() const -{ +UnicodeString::getUChars() const { // if we're bogus, do nothing - if(fBogus) + if(isBogus()) { return 0; - - // no room for null, resize - if(getCapacity() <= fLength) { - // allocate at minimum the current capacity + needed space - int32_t tempLength; - UChar *temp = allocate(fCapacity + 1, tempLength); - if(! temp) { - ((UnicodeString*)this)->setToBogus(); - return 0; - } - - // if we're not currently ref counted, shift the array right by one - if(fRefCounted == FALSE) - us_arrayCopy(fArray, 0, temp, 1, fLength); - // otherwise, copy the old array into temp, including the ref count - else - us_arrayCopy(fArray, 0, temp, 0, fLength + 1); - - // delete the old array - if(fRefCounted && ((UnicodeString*)this)->removeRef() == 0) - delete [] ((UnicodeString*)this)->fArray; - - // use the new array - ((UnicodeString*)this)->fCapacity = tempLength; - ((UnicodeString*)this)->fArray = temp; - ((UnicodeString*)this)->setRefCount(1); } - if(getArrayStart()[fLength] != 0) { - // tack on a trailing null - ((UChar *)getArrayStart())[fLength] = 0; - } - - return getArrayStart(); -} - -UChar* -UnicodeString::orphanStorage() -{ - // if we're bogus, do nothing - if(fBogus) - return 0; - - UChar *retVal; - - // if we're ref counted, get rid of the leading ref count - if(fRefCounted && removeRef() == 0) { - retVal = fArray; - } else { - // if we don't own the memory, then we have to allocate it - retVal = new UChar[fLength + 1]; - if(retVal == 0) { - return 0; + if(fCapacity <= fLength || fArray[fLength] != 0) { + if(((UnicodeString &)*this).cloneArrayIfNeeded(fLength + 1)) { + fArray[fLength] = 0; } } - - // shift or copy characters - us_arrayCopy(getArrayStart(), 0, retVal, 0, fLength); - retVal[fLength] = 0; - - // set self to empty - fArray = fStackBuffer; - fLength = 0; - fCapacity = US_STACKBUF_SIZE; - fHashCode = kEmptyHashCode; - fRefCounted = FALSE; - - return retVal; + return fArray; } 
//======================================== @@ -1230,67 +1237,91 @@ UnicodeString::pinIndices(UTextOffset& start, int32_t& length) const { // pin indices - if(length < 0 || start < 0) + if(length < 0 || start < 0) { start = length = 0; - else { - if(length > (fLength - start)) - length = (fLength - start); + } else if(length > (fLength - start)) { + length = (fLength - start); } } -void -UnicodeString::cloneArrayIfNeeded() -{ - // if we're aliased or ref counted, make a copy of the buffer if necessary - if(fArray != fStackBuffer && (!fRefCounted || refCount() > 1)) { - UChar *copy; - bool_t refCounted; - if(fLength <= US_STACKBUF_SIZE) { - // a small string does not need allocation - fCapacity = US_STACKBUF_SIZE; - copy = fStackBuffer; - refCounted = FALSE; +bool_t +UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, + int32_t growCapacity, + bool_t doCopyArray, + int32_t **pBufferToDelete) { + // default parameters need to be static, therefore + // the defaults are -1 to have convenience defaults + if(newCapacity == -1) { + newCapacity = fCapacity; + } + + /* + * We need to make a copy of the array if + * the buffer is read-only, or + * the buffer is refCounted (shared), and refCount>1, or + * the buffer is too small. + * Return FALSE if memory could not be allocated. 
+ */ + if(fFlags & kBufferIsReadonly || + fFlags & kRefCounted && refCount() > 1 || + newCapacity > fCapacity + ) { + // save old values + UChar *array = fArray; + uint16_t flags = fFlags; + + // check growCapacity for default value and use of the stack buffer + if(growCapacity == -1) { + growCapacity = newCapacity; + } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { + growCapacity = US_STACKBUF_SIZE; + } + + // allocate a new array + if(allocate(growCapacity) || + newCapacity < growCapacity && allocate(newCapacity) + ) { + if(doCopyArray) { + // copy the contents + // do not copy more than what fits - it may be smaller than before + if(fCapacity < fLength) { + fLength = fCapacity; + } + us_arrayCopy(array, 0, fArray, 0, fLength); + } else { + fLength = 0; + } + + // release the old array + if(flags & kRefCounted) { + // the array is refCounted; decrement and release if 0 + int32_t *pRefCount = ((int32_t *)array - 1); + if(--*pRefCount == 0) { + if(pBufferToDelete == 0) { + delete [] pRefCount; + } else { + // the caller requested to delete it himself + *pBufferToDelete = pRefCount; + } + } + } } else { - if(!fRefCounted) { - // make room for the ref count - ++fCapacity; - } - if(fCapacity - 1 <= fLength) { - // make room for a terminating NUL - fCapacity = fLength + 2; - } - copy = new UChar [ fCapacity ]; - if(copy == 0) { - setToBogus(); - return; - } - refCounted = TRUE; - } - - // copy the current shared array into our new array - us_arrayCopy(getArrayStart(), 0, copy, refCounted ? 
1 : 0, fLength); - - // remove a reference from the current shared array - // if there are no more references to the current shared array, - // after we remove the reference, delete the array - if(fRefCounted && removeRef() == 0) { - delete [] fArray; - } - - // make our array point to the new copy and set the ref count to one - fArray = copy; - fRefCounted = refCounted; - if(refCounted) { - setRefCount(1); + // not enough memory for growCapacity and not even for the smaller newCapacity + // reset the old values for setToBogus() to release the array + fArray = array; + fFlags = flags; + setToBogus(); + return FALSE; } } + return TRUE; } // private function for C API -U_CFUNC const UChar* -T_UnicodeString_getUChars(const UnicodeString *s) +U_CFUNC int32_t +T_UnicodeString_length(const UnicodeString *s) { - return s->getUChars(); + return s->length(); } // private function for C API @@ -1323,8 +1354,9 @@ UnicodeString::getDefaultConverter(UErrorCode &status) // if the cache was empty, create a converter if(converter == 0) { converter = ucnv_open(0, &status); - if(U_FAILURE(status)) + if(U_FAILURE(status)) { return 0; + } } return converter; @@ -1342,7 +1374,7 @@ UnicodeString::releaseDefaultConverter(UConverter *converter) } } - // it's safe to close a NULL converter + // it's safe to close a 0 converter ucnv_close(converter); } @@ -1427,14 +1459,16 @@ void UnicodeStringStreamer::streamOut(const UnicodeString *s, FileStream *os) { - if(!T_FileStream_error(os)) + if(!T_FileStream_error(os)) { writeLong(os, s->fLength); + } const UChar *c = s->getArrayStart(); const UChar *end = c + s->fLength; - while(c != end && ! T_FileStream_error(os)) + while(c != end && ! 
T_FileStream_error(os)) { writeUChar(os, *c++); + } } void @@ -1456,40 +1490,16 @@ UnicodeStringStreamer::streamIn(UnicodeString *s, } // clone s's array, if needed - s->cloneArrayIfNeeded(); - - // if the string isn't big enough to hold the data, enlarge it - if(s->getCapacity() < newSize) { - - int32_t tempLength; - UChar *temp = s->allocate(newSize, tempLength); - if(! temp) { - s->setToBogus(); - return; - } - - // if s is not currently ref counted, shift the array right by one - if(s->fRefCounted == FALSE) - us_arrayCopy(s->fArray, 0, temp, 1, s->fLength); - // otherwise, copy the old array into temp, including the ref count - else - us_arrayCopy(s->fArray, 0, temp, 0, s->fLength + 1); - - // delete the old array if s is ref counted - if(s->fRefCounted && s->removeRef() == 0) - delete [] s->fArray; - - // use the new array - s->fCapacity = tempLength; - s->fArray = temp; - s->setRefCount(1); + if(!s->cloneArrayIfNeeded(newSize, newSize, FALSE)) { + return; } UChar *c = s->getArrayStart(); UChar *end = c + newSize; - while(c < end && ! (T_FileStream_error(is) || T_FileStream_eof(is))) + while(c < end && ! 
(T_FileStream_error(is) || T_FileStream_eof(is))) { *c++ = readUChar(is); + } // couldn't read all chars if(c < end) { @@ -1504,22 +1514,32 @@ UnicodeStringStreamer::streamIn(UnicodeString *s, ostream& operator<<(ostream& stream, - const UnicodeString& s) + const UnicodeString& s) { - UTextOffset i; - UChar c; - int32_t saveFlags = stream.flags(); + if(s.length() > 0) { + char buffer[200]; + UConverter *converter; + UErrorCode errorCode = U_ZERO_ERROR; - stream << hex; + // use the default converter to convert chunks of text + converter = UnicodeString::getDefaultConverter(errorCode); + if(U_SUCCESS(errorCode)) { + const UChar *us = s.getArrayStart(), *uLimit = us + s.length(); + char *s, *sLimit = buffer + sizeof(buffer); + do { + errorCode = U_ZERO_ERROR; + s = buffer; + ucnv_fromUnicode(converter, &s, sLimit, &us, uLimit, 0, FALSE, &errorCode); - for(i = 0; i < s.length(); i++) { - c = s.charAt(i); - if((c >= ' ' && c <= '~') || c == '\n') - stream << (char)c; - else - stream << "[0x" << c << "]"; + // write this chunk + if(s > buffer) { + stream.write(buffer, s - buffer); + } + } while(errorCode == U_INDEX_OUTOFBOUNDS_ERROR); + UnicodeString::releaseDefaultConverter(converter); + } } + stream.flush(); - stream.setf(saveFlags & ios::basefield, ios::basefield); return stream; } diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp index db6c746783..89c8b0e1a0 100644 --- a/icu4c/source/test/intltest/ustrtest.cpp +++ b/icu4c/source/test/intltest/ustrtest.cpp @@ -549,17 +549,6 @@ UnicodeStringTest::TestMiscellaneous() for (i = 0; i < test2.length(); i++) if (test2[i] != test4[i]) errln(UnicodeString("getUChars() failed: strings differ at position ") + i); - - test4 = test1.orphanStorage(); - - if (test1.length() != 0) - errln("orphanStorage() failed: orphaned string's contents is " + test1); - - for (i = 0; i < test2.length(); i++) - if (test2[i] != test4[i]) - errln(UnicodeString("orphanStorage() failed: strings differ at 
position ") + i); - - delete (UChar*)test4; } void @@ -606,13 +595,9 @@ UnicodeStringTest::TestStackAllocation() errln("insert() on stack-allocated UnicodeString didn't work right"); if (guardWord2 != 0x4DED) errln("insert() on stack-allocated UnicodeString overwrote guard word!"); -#if 0 - // the current implementation will always reallocate the memory - // after it was aliased in case it was read-only; - // therefore, this test must fail and we don't perform it + if (workingBuffer[24] != 0x67) errln("insert() on stack-allocated UnicodeString didn't affect backing store"); -#endif *test += " to the aid of their country."; if (*test != "Now is the time for all good men to come to the aid of their country.") @@ -624,9 +609,32 @@ UnicodeStringTest::TestStackAllocation() if (*test != "ha!") errln("Assignment to stack-allocated UnicodeString didn't work"); if (workingBuffer[0] != 0x4e) - errln("Change to UnicodeString after overflow are stil affecting original buffer"); + errln("Change to UnicodeString after overflow are still affecting original buffer"); if (guardWord2 != 0x4DED) errln("Change to UnicodeString after overflow overwrote guard word!"); + // test read-only aliasing with setTo() + workingBuffer[0] = 0x20ac; + workingBuffer[1] = 0x125; + workingBuffer[2] = 0; + test->setTo(TRUE, workingBuffer, 2); + if(test->length() != 2 || test->charAt(0) != 0x20ac || test->charAt(1) != 0x125) { + errln("UnicodeString.setTo(readonly alias) does not alias correctly"); + } + workingBuffer[1] = 0x109; + if(test->charAt(1) != 0x109) { + errln("UnicodeString.setTo(readonly alias) made a copy: did not see change in buffer"); + } + + test->setTo(TRUE, workingBuffer, -1); + if(test->length() != 2 || test->charAt(0) != 0x20ac || test->charAt(1) != 0x109) { + errln("UnicodeString.setTo(readonly alias, length -1) does not alias correctly"); + } + + test->setTo(FALSE, workingBuffer, -1); + if(!test->isBogus()) { + errln("UnicodeString.setTo(unterminated readonly alias, length -1) 
does not result in isBogus()"); + } + delete test; }