ICU-6606 optimized away calls to u_getDefaultConverter() from UnicodeString code (constructors and extract()) if U_CHARSET_IS_UTF8
X-SVN-Rev: 25571
This commit is contained in:
parent
c7b7271028
commit
e74be582d0
@ -727,14 +727,7 @@ ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UEr
|
||||
/* the default converter name is already canonical */
|
||||
#endif
|
||||
}
|
||||
else if((converterName[0] == 'U' ?
|
||||
( converterName[1] == 'T' && converterName[2] == 'F') :
|
||||
(converterName[0] == 'u' && converterName[1] == 't' && converterName[2] == 'f'))
|
||||
&&
|
||||
(converterName[3] == '-' ?
|
||||
(converterName[4] == '8' && converterName[5] == 0) :
|
||||
(converterName[3] == '8' && converterName[4] == 0)))
|
||||
{
|
||||
else if(UCNV_FAST_IS_UTF8(converterName)) {
|
||||
/* fastpath for UTF-8 */
|
||||
return (UConverterSharedData *)converterData[UCNV_UTF8];
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2007, International Business Machines
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
@ -27,6 +27,21 @@
|
||||
#include "unicode/uloc.h"
|
||||
#include "ucnv_bld.h"
|
||||
|
||||
/*
 * Fast check for whether a charset name is "UTF-8".
 * This does not recognize all of the variations that ucnv_open()
 * and other functions recognize, but it covers most cases:
 * only the exact spellings "UTF-8", "UTF8", "utf-8" and "utf8" match
 * (mixed case such as "Utf-8" does not).
 *
 * The argument is evaluated several times, so it must not have side effects.
 * It is parenthesized at every use so that pointer expressions
 * (for example, name+offset) can be passed safely.
 * The comparisons short-circuit, so no character after the
 * terminating NUL of a shorter name is ever read.
 *
 * @param name const char * charset name, must not be NULL
 * @return nonzero if name is one of the spellings listed above, otherwise 0
 */
#define UCNV_FAST_IS_UTF8(name) \
    ((((name)[0]=='U' ? \
        (                  (name)[1]=='T' && (name)[2]=='F') : \
        ((name)[0]=='u' && (name)[1]=='t' && (name)[2]=='f'))) \
    && ((name)[3]=='-' ? \
        ((name)[4]=='8' && (name)[5]==0) : \
        ((name)[3]=='8' && (name)[4]==0)))
|
||||
|
||||
/* figures out if we need to go to file to read in the data tables.
|
||||
* @param converterName The name of the converter
|
||||
* @param err The error code
|
||||
|
@ -3128,6 +3128,17 @@ protected:
|
||||
virtual UChar32 getChar32At(int32_t offset) const;
|
||||
|
||||
private:
|
||||
// For char* constructors. Could be made public.
|
||||
UnicodeString &setToUTF8(const StringPiece &utf8);
|
||||
// For extract(char*).
|
||||
// We could make a toUTF8(target, capacity, errorCode) public but not
|
||||
// this version: New API will be cleaner if we make callers create substrings
|
||||
// rather than having start+length on every method,
|
||||
// and it should take a UErrorCode&.
|
||||
int32_t
|
||||
toUTF8(int32_t start, int32_t len,
|
||||
char *target, int32_t capacity) const;
|
||||
|
||||
|
||||
inline int8_t
|
||||
doCompare(int32_t start,
|
||||
|
@ -295,6 +295,32 @@ UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
|
||||
}
|
||||
}
|
||||
|
||||
#if U_CHARSET_IS_UTF8
|
||||
|
||||
/*
 * Constructs a string from NUL-terminated char * text.
 * This variant sits inside the U_CHARSET_IS_UTF8 section, so the bytes are
 * handed straight to setToUTF8() without opening a converter.
 * A null pointer leaves the freshly initialized, zero-length string as is.
 */
UnicodeString::UnicodeString(const char *codepageData)
    : fShortLength(0),
      fFlags(kShortString) {
    if(codepageData == 0) {
        // nothing to convert
        return;
    }
    setToUTF8(StringPiece(codepageData));
}
|
||||
|
||||
/*
 * Constructs a string from char * text with an explicit byte length.
 * This variant sits inside the U_CHARSET_IS_UTF8 section, so the bytes are
 * handed straight to setToUTF8() without opening a converter.
 *
 * dataLength == -1 means that codepageData is NUL-terminated.
 * A null pointer, a zero length, or a length below -1 leaves the
 * freshly initialized, zero-length string as is.
 */
UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
    : fShortLength(0),
      fFlags(kShortString) {
    // Convert only when there is input: a real pointer plus a positive length or -1.
    if(codepageData != 0 && (dataLength > 0 || dataLength == -1)) {
        int32_t byteCount =
            dataLength == -1 ? (int32_t)uprv_strlen(codepageData) : dataLength;
        setToUTF8(StringPiece(codepageData, byteCount));
    }
}
|
||||
|
||||
// else see unistr_cnv.cpp
|
||||
#endif
|
||||
|
||||
UnicodeString::UnicodeString(const UnicodeString& that)
|
||||
: Replaceable(),
|
||||
fShortLength(0),
|
||||
@ -381,26 +407,7 @@ UnicodeString::~UnicodeString()
|
||||
|
||||
/*
 * Creates a UnicodeString from UTF-8 text.
 *
 * All of the work is delegated to setToUTF8(), which sizes the buffer,
 * converts with U+FFFD substitution and sets the string to bogus on failure.
 * The conversion used to be spelled out here as well, which made it run
 * twice (once inline, once via setToUTF8()) with the same end result.
 *
 * @param utf8 the UTF-8 input
 * @return the converted string, by value
 */
UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    UnicodeString result;
    result.setToUTF8(utf8);
    return result;
}
|
||||
|
||||
@ -772,6 +779,35 @@ UnicodeString::extract(int32_t start,
|
||||
return u_terminateChars(target, targetCapacity, length, &status);
|
||||
}
|
||||
|
||||
/*
 * Converts a substring of this string to UTF-8.
 *
 * start and len are first pinned to legal values with pinIndices().
 * Conversion uses U+FFFD as the substitution character.
 *
 * @param start    index of the first UTF-16 code unit to convert
 * @param len      number of UTF-16 code units to convert
 * @param target   output buffer for the UTF-8 bytes
 * @param capacity size of target in bytes
 * @return the UTF-8 length reported by u_strToUTF8WithSub(), or 0 if that
 *         function did not report a length.
 *         NOTE(review): presumably this is the full required length even when
 *         it exceeds capacity (preflighting) -- confirm against the
 *         u_strToUTF8WithSub() documentation.
 */
int32_t
UnicodeString::toUTF8(int32_t start, int32_t len,
                      char *target, int32_t capacity) const {
    pinIndices(start, len);
    // Start from 0 so that the return value is well-defined even if the
    // conversion function rejects its arguments before storing a length.
    int32_t length8 = 0;
    UErrorCode errorCode = U_ZERO_ERROR;
    u_strToUTF8WithSub(target, capacity, &length8,
                       getBuffer() + start, len,
                       0xFFFD,  // Standard substitution character.
                       NULL,    // Don't care about number of substitutions.
                       &errorCode);
    // errorCode is intentionally not propagated: this helper has no
    // UErrorCode parameter, so callers only see the resulting length.
    return length8;
}
|
||||
|
||||
#if U_CHARSET_IS_UTF8
|
||||
|
||||
/*
 * Extracts a substring into a char * buffer.
 * This variant sits inside the U_CHARSET_IS_UTF8 section and writes UTF-8
 * via toUTF8() without opening a converter.
 *
 * @param start   index of the first UTF-16 code unit to extract
 * @param len     number of UTF-16 code units to extract
 * @param target  output buffer; may be null only if dstSize is 0
 * @param dstSize size of target in bytes; unsigned, values above 0x7fffffff
 *                are pinned to 0x7fffffff
 * @return 0 for a null target with a non-zero dstSize,
 *         otherwise the result of toUTF8()
 */
int32_t
UnicodeString::extract(int32_t start, int32_t len,
                       char *target, uint32_t dstSize) const {
    // Illegal arguments: a buffer size was given but there is no buffer.
    // (dstSize is unsigned, so it cannot be negative.)
    if(target == 0 && dstSize != 0) {
        return 0;
    }
    // toUTF8() takes a signed capacity: pin large unsigned sizes to the int32_t maximum.
    int32_t capacity = 0x7fffffff;
    if(dstSize < 0x7fffffff) {
        capacity = (int32_t)dstSize;
    }
    return toUTF8(start, len, target, capacity);
}
|
||||
|
||||
// else see unistr_cnv.cpp
|
||||
#endif
|
||||
|
||||
void
|
||||
UnicodeString::extractBetween(int32_t start,
|
||||
int32_t limit,
|
||||
@ -1108,6 +1144,31 @@ UnicodeString::setTo(UChar *buffer,
|
||||
return *this;
|
||||
}
|
||||
|
||||
/*
 * Replaces the contents of this string with the UTF-16 form of UTF-8 text.
 *
 * Any bogus state is cleared first. Conversion uses U+FFFD as the
 * substitution character. If the conversion reports a failure,
 * the string is set to bogus.
 *
 * @param utf8 the UTF-8 input
 * @return *this
 */
UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
    unBogus();
    int32_t srcLength = utf8.length();
    // The UTF-16 string will be at most as long as the UTF-8 string.
    // Request at least US_STACKBUF_SIZE units; beyond that, the input
    // length +1 for the terminating NUL.
    int32_t minCapacity =
        srcLength <= US_STACKBUF_SIZE ? US_STACKBUF_SIZE : srcLength + 1;
    UChar *dest = getBuffer(minCapacity);
    int32_t destLength;
    UErrorCode status = U_ZERO_ERROR;
    u_strFromUTF8WithSub(dest, getCapacity(), &destLength,
                         utf8.data(), srcLength,
                         0xfffd,  // Substitution character.
                         NULL,    // Don't care about number of substitutions.
                         &status);
    releaseBuffer(destLength);
    if(U_FAILURE(status)) {
        setToBogus();
    }
    return *this;
}
|
||||
|
||||
UnicodeString&
|
||||
UnicodeString::setCharAt(int32_t offset,
|
||||
UChar c)
|
||||
|
@ -37,6 +37,8 @@ U_NAMESPACE_BEGIN
|
||||
// Constructors
|
||||
//========================================
|
||||
|
||||
#if !U_CHARSET_IS_UTF8
|
||||
|
||||
UnicodeString::UnicodeString(const char *codepageData)
|
||||
: fShortLength(0),
|
||||
fFlags(kShortString)
|
||||
@ -56,6 +58,9 @@ UnicodeString::UnicodeString(const char *codepageData,
|
||||
}
|
||||
}
|
||||
|
||||
// else see unistr.cpp
|
||||
#endif
|
||||
|
||||
UnicodeString::UnicodeString(const char *codepageData,
|
||||
const char *codepage)
|
||||
: fShortLength(0),
|
||||
@ -117,6 +122,9 @@ UnicodeString::UnicodeString(const char *src, int32_t srcLength,
|
||||
//========================================
|
||||
// Codeset conversion
|
||||
//========================================
|
||||
|
||||
#if !U_CHARSET_IS_UTF8
|
||||
|
||||
int32_t
|
||||
UnicodeString::extract(int32_t start,
|
||||
int32_t length,
|
||||
@ -125,6 +133,9 @@ UnicodeString::extract(int32_t start,
|
||||
return extract(start, length, target, dstSize, 0);
|
||||
}
|
||||
|
||||
// else see unistr.cpp
|
||||
#endif
|
||||
|
||||
int32_t
|
||||
UnicodeString::extract(int32_t start,
|
||||
int32_t length,
|
||||
@ -140,44 +151,59 @@ UnicodeString::extract(int32_t start,
|
||||
// pin the indices to legal values
|
||||
pinIndices(start, length);
|
||||
|
||||
// We need to cast dstSize to int32_t for all subsequent code.
|
||||
// I don't know why the API was defined with uint32_t but we are stuck with it.
|
||||
// Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
|
||||
// as a limit in some functions, it may wrap around and yield a pointer
|
||||
// that compares less-than target.
|
||||
int32_t capacity;
|
||||
if(dstSize < 0x7fffffff) {
|
||||
// Assume that the capacity is real and a limit pointer won't wrap around.
|
||||
capacity = (int32_t)dstSize;
|
||||
} else {
|
||||
char *targetLimit = target + 0x7fffffff;
|
||||
if(targetLimit < target) {
|
||||
// Pin the capacity so that a limit pointer does not wrap around.
|
||||
targetLimit = (char *)U_MAX_PTR(target);
|
||||
capacity = (int32_t)(targetLimit - target);
|
||||
} else {
|
||||
// Pin the capacity to the maximum int32_t value.
|
||||
capacity = 0x7fffffff;
|
||||
}
|
||||
}
|
||||
|
||||
// create the converter
|
||||
UConverter *converter;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// just write the NUL if the string length is 0
|
||||
if(length == 0) {
|
||||
if(dstSize >= 0x80000000) {
|
||||
// careful: dstSize is unsigned! (0xffffffff means "unlimited")
|
||||
// make sure that the NUL-termination works (takes int32_t)
|
||||
dstSize=0x7fffffff;
|
||||
}
|
||||
return u_terminateChars(target, dstSize, 0, &status);
|
||||
return u_terminateChars(target, capacity, 0, &status);
|
||||
}
|
||||
|
||||
// if the codepage is the default, use our cache
|
||||
// if it is an empty string, then use the "invariant character" conversion
|
||||
if (codepage == 0) {
|
||||
const char *defaultName = ucnv_getDefaultName();
|
||||
if(UCNV_FAST_IS_UTF8(defaultName)) {
|
||||
return toUTF8(start, length, target, capacity);
|
||||
}
|
||||
converter = u_getDefaultConverter(&status);
|
||||
} else if (*codepage == 0) {
|
||||
// use the "invariant characters" conversion
|
||||
int32_t destLength;
|
||||
// careful: dstSize is unsigned! (0xffffffff means "unlimited")
|
||||
if(dstSize >= 0x80000000) {
|
||||
destLength = length;
|
||||
// make sure that the NUL-termination works (takes int32_t)
|
||||
dstSize=0x7fffffff;
|
||||
} else if(length <= (int32_t)dstSize) {
|
||||
if(length <= capacity) {
|
||||
destLength = length;
|
||||
} else {
|
||||
destLength = (int32_t)dstSize;
|
||||
destLength = capacity;
|
||||
}
|
||||
u_UCharsToChars(getArrayStart() + start, target, destLength);
|
||||
return u_terminateChars(target, (int32_t)dstSize, length, &status);
|
||||
return u_terminateChars(target, capacity, length, &status);
|
||||
} else {
|
||||
converter = ucnv_open(codepage, &status);
|
||||
}
|
||||
|
||||
length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
|
||||
length = doExtract(start, length, target, capacity, converter, status);
|
||||
|
||||
// close the converter
|
||||
if (codepage == 0) {
|
||||
@ -298,20 +324,15 @@ UnicodeString::doCodepageCreate(const char *codepageData,
|
||||
// create the converter
|
||||
// if the codepage is the default, use our cache
|
||||
// if it is an empty string, then use the "invariant character" conversion
|
||||
UConverter *converter = (codepage == 0 ?
|
||||
u_getDefaultConverter(&status) :
|
||||
*codepage == 0 ?
|
||||
0 :
|
||||
ucnv_open(codepage, &status));
|
||||
|
||||
// if we failed, set the appropriate flags and return
|
||||
if(U_FAILURE(status)) {
|
||||
setToBogus();
|
||||
return;
|
||||
}
|
||||
|
||||
// perform the conversion
|
||||
if(converter == 0) {
|
||||
UConverter *converter;
|
||||
if (codepage == 0) {
|
||||
const char *defaultName = ucnv_getDefaultName();
|
||||
if(UCNV_FAST_IS_UTF8(defaultName)) {
|
||||
setToUTF8(StringPiece(codepageData, dataLength));
|
||||
return;
|
||||
}
|
||||
converter = u_getDefaultConverter(&status);
|
||||
} else if(*codepage == 0) {
|
||||
// use the "invariant characters" conversion
|
||||
if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
|
||||
u_charsToUChars(codepageData, getArrayStart(), dataLength);
|
||||
@ -320,9 +341,17 @@ UnicodeString::doCodepageCreate(const char *codepageData,
|
||||
setToBogus();
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
converter = ucnv_open(codepage, &status);
|
||||
}
|
||||
|
||||
// convert using the real converter
|
||||
// if we failed, set the appropriate flags and return
|
||||
if(U_FAILURE(status)) {
|
||||
setToBogus();
|
||||
return;
|
||||
}
|
||||
|
||||
// perform the conversion
|
||||
doCodepageCreate(codepageData, dataLength, converter, status);
|
||||
if(U_FAILURE(status)) {
|
||||
setToBogus();
|
||||
|
@ -232,6 +232,29 @@ UnicodeStringTest::TestBasicManipulation()
|
||||
errln("UnicodeString(const char *, length, cnv, errorCode) does not work with length==-1");
|
||||
}
|
||||
}
|
||||
|
||||
#if U_CHARSET_IS_UTF8
|
||||
{
|
||||
// Test the hardcoded-UTF-8 UnicodeString optimizations.
|
||||
static const uint8_t utf8[]={ 0x61, 0xC3, 0xA4, 0xC3, 0x9F, 0xE4, 0xB8, 0x80, 0 };
|
||||
static const UChar utf16[]={ 0x61, 0xE4, 0xDF, 0x4E00 };
|
||||
UnicodeString from8a = UnicodeString((const char *)utf8);
|
||||
UnicodeString from8b = UnicodeString((const char *)utf8, (int32_t)sizeof(utf8)-1);
|
||||
UnicodeString from16(FALSE, utf16, LENGTHOF(utf16));
|
||||
if(from8a != from16 || from8b != from16) {
|
||||
errln("UnicodeString(const char * U_CHARSET_IS_UTF8) failed");
|
||||
}
|
||||
char buffer[16];
|
||||
int32_t length8=from16.extract(0, 0x7fffffff, buffer, (uint32_t)sizeof(buffer));
|
||||
if(length8!=((int32_t)sizeof(utf8)-1) || 0!=uprv_memcmp(buffer, utf8, sizeof(utf8))) {
|
||||
errln("UnicodeString::extract(char * U_CHARSET_IS_UTF8) failed");
|
||||
}
|
||||
length8=from16.extract(1, 2, buffer, (uint32_t)sizeof(buffer));
|
||||
if(length8!=4 || buffer[length8]!=0 || 0!=uprv_memcmp(buffer, utf8+1, length8)) {
|
||||
errln("UnicodeString::extract(substring to char * U_CHARSET_IS_UTF8) failed");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
Loading…
Reference in New Issue
Block a user