ICU-6606 optimized away calls to u_getDefaultConverter() from UnicodeString code (constructors and extract()) if U_CHARSET_IS_UTF8

X-SVN-Rev: 25571
This commit is contained in:
Markus Scherer 2009-03-12 21:24:54 +00:00
parent c7b7271028
commit e74be582d0
6 changed files with 191 additions and 59 deletions

View File

@ -727,14 +727,7 @@ ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UEr
/* the default converter name is already canonical */
#endif
}
else if((converterName[0] == 'U' ?
( converterName[1] == 'T' && converterName[2] == 'F') :
(converterName[0] == 'u' && converterName[1] == 't' && converterName[2] == 'f'))
&&
(converterName[3] == '-' ?
(converterName[4] == '8' && converterName[5] == 0) :
(converterName[3] == '8' && converterName[4] == 0)))
{
else if(UCNV_FAST_IS_UTF8(converterName)) {
/* fastpath for UTF-8 */
return (UConverterSharedData *)converterData[UCNV_UTF8];
}

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2007, International Business Machines
* Copyright (C) 1999-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -27,6 +27,21 @@
#include "unicode/uloc.h"
#include "ucnv_bld.h"
/*
 * Fast check for whether a charset name is "UTF-8".
 * This does not recognize all of the variations that ucnv_open()
 * and other functions recognize, but it covers most cases.
 *
 * Exactly four spellings match: "UTF-8", "UTF8", "utf-8" and "utf8".
 * Mixed-case spellings such as "Utf-8" are not recognized.
 * The tests short-circuit at the first mismatch, so characters after
 * the terminating NUL of a shorter name are never read.
 *
 * The macro argument is fully parenthesized, so pointer expressions
 * (for example p+1) are accepted. It is evaluated several times and
 * therefore must not have side effects.
 *
 * @param name const char * charset name; NUL-terminated, must not be NULL
 * @return TRUE (nonzero) if name is one of the spellings above, otherwise FALSE (0)
 */
#define UCNV_FAST_IS_UTF8(name) \
    ((((name)[0]=='U' ? \
        (                   (name)[1]=='T' && (name)[2]=='F') : \
        ((name)[0]=='u' && (name)[1]=='t' && (name)[2]=='f'))) \
  && ((name)[3]=='-' ? \
        ((name)[4]=='8' && (name)[5]==0) : \
        ((name)[3]=='8' && (name)[4]==0)))
/* figures out if we need to go to file to read in the data tables.
* @param converterName The name of the converter
* @param err The error code

View File

@ -3128,6 +3128,17 @@ protected:
virtual UChar32 getChar32At(int32_t offset) const;
private:
// For char* constructors. Could be made public.
UnicodeString &setToUTF8(const StringPiece &utf8);
// For extract(char*).
// We could make a toUTF8(target, capacity, errorCode) public but not
// this version: New API will be cleaner if we make callers create substrings
// rather than having start+length on every method,
// and it should take a UErrorCode&.
int32_t
toUTF8(int32_t start, int32_t len,
char *target, int32_t capacity) const;
inline int8_t
doCompare(int32_t start,

View File

@ -295,6 +295,32 @@ UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
}
}
#if U_CHARSET_IS_UTF8
// char * constructor for builds where U_CHARSET_IS_UTF8 is set:
// the input is handed to setToUTF8() (via an implicit StringPiece)
// instead of going through a converter.
// A NULL pointer skips the conversion and keeps the initial member values.
UnicodeString::UnicodeString(const char *codepageData)
  : fShortLength(0),
    fFlags(kShortString) {
  if(codepageData == 0) {
    return;
  }
  setToUTF8(codepageData);
}
// char * + length constructor for builds where U_CHARSET_IS_UTF8 is set.
// dataLength == -1 means that the length is determined with uprv_strlen().
// Nothing is converted for a NULL pointer, a zero length,
// or a length below -1.
UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
  : fShortLength(0),
    fFlags(kShortString) {
  // Only -1 ("measure it") and positive lengths lead to a conversion.
  if(codepageData != 0 && (dataLength == -1 || dataLength > 0)) {
    const int32_t byteCount =
        (dataLength == -1) ? (int32_t)uprv_strlen(codepageData) : dataLength;
    setToUTF8(StringPiece(codepageData, byteCount));
  }
}
// else see unistr_cnv.cpp
#endif
UnicodeString::UnicodeString(const UnicodeString& that)
: Replaceable(),
fShortLength(0),
@ -381,26 +407,7 @@ UnicodeString::~UnicodeString()
// Creates a new UnicodeString from UTF-8 input.
// All of the work is delegated to setToUTF8(), which performs the
// UTF-8 to UTF-16 conversion with U+FFFD substitution and marks the
// string as bogus if the conversion reports a failure.
// Doing the same conversion inline here as well would only repeat that work.
// @param utf8 the UTF-8 input
// @return the converted string
UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
  UnicodeString result;
  result.setToUTF8(utf8);
  return result;
}
@ -772,6 +779,35 @@ UnicodeString::extract(int32_t start,
return u_terminateChars(target, targetCapacity, length, &status);
}
// Converts the substring [start, start+len) to UTF-8 in target.
// start and len are pinned to the string with pinIndices() first.
// Unpaired surrogates are replaced with U+FFFD.
// @param start    start index of the substring
// @param len      length of the substring
// @param target   output buffer for the UTF-8 bytes
// @param capacity size of target in bytes
// @return the length value reported by u_strToUTF8WithSub(),
//         or 0 if that function did not store a length
int32_t
UnicodeString::toUTF8(int32_t start, int32_t len,
                      char *target, int32_t capacity) const {
  pinIndices(start, len);
  // Initialized so that a defined value is returned even when
  // u_strToUTF8WithSub() rejects its arguments before writing the length.
  int32_t length8 = 0;
  // The error code is not reported: this private method has no UErrorCode
  // parameter, and callers only receive the length.
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strToUTF8WithSub(target, capacity, &length8,
                     getBuffer() + start, len,
                     0xFFFD,  // Standard substitution character.
                     NULL,    // Don't care about number of substitutions.
                     &errorCode);
  return length8;
}
#if U_CHARSET_IS_UTF8
// extract() to a char * buffer for builds where U_CHARSET_IS_UTF8 is set:
// writes the substring as UTF-8 through toUTF8().
// Returns 0 without writing anything if target is NULL
// while dstSize is nonzero.
int32_t
UnicodeString::extract(int32_t start, int32_t len,
                       char *target, uint32_t dstSize) const {
  // A NULL target is only acceptable together with a zero capacity.
  if(target == 0 && dstSize > 0) {
    return 0;
  }
  // dstSize is unsigned but toUTF8() takes an int32_t capacity:
  // pin values above the int32_t range to the maximum int32_t value.
  int32_t capacity;
  if(dstSize > 0x7fffffff) {
    capacity = 0x7fffffff;
  } else {
    capacity = (int32_t)dstSize;
  }
  return toUTF8(start, len, target, capacity);
}
// else see unistr_cnv.cpp
#endif
void
UnicodeString::extractBetween(int32_t start,
int32_t limit,
@ -1108,6 +1144,31 @@ UnicodeString::setTo(UChar *buffer,
return *this;
}
// Replaces the contents of this string with the UTF-16 form of utf8.
// Ill-formed input is replaced with U+FFFD by u_strFromUTF8WithSub().
// If the conversion reports a failure, this string is set to bogus.
// @param utf8 the UTF-8 input
// @return *this
UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
  unBogus();
  int32_t length = utf8.length();
  int32_t capacity;
  // The UTF-16 string will be at most as long as the UTF-8 string.
  if(length <= US_STACKBUF_SIZE) {
    capacity = US_STACKBUF_SIZE;
  } else {
    capacity = length + 1;  // +1 for the terminating NUL.
  }
  UChar *utf16 = getBuffer(capacity);
  // Initialized so that releaseBuffer() never receives an indeterminate
  // value when u_strFromUTF8WithSub() returns before writing the length.
  int32_t length16 = 0;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
                       utf8.data(), length,
                       0xfffd,  // Substitution character.
                       NULL,    // Don't care about number of substitutions.
                       &errorCode);
  releaseBuffer(length16);
  if(U_FAILURE(errorCode)) {
    setToBogus();
  }
  return *this;
}
UnicodeString&
UnicodeString::setCharAt(int32_t offset,
UChar c)

View File

@ -37,6 +37,8 @@ U_NAMESPACE_BEGIN
// Constructors
//========================================
#if !U_CHARSET_IS_UTF8
UnicodeString::UnicodeString(const char *codepageData)
: fShortLength(0),
fFlags(kShortString)
@ -56,6 +58,9 @@ UnicodeString::UnicodeString(const char *codepageData,
}
}
// else see unistr.cpp
#endif
UnicodeString::UnicodeString(const char *codepageData,
const char *codepage)
: fShortLength(0),
@ -117,6 +122,9 @@ UnicodeString::UnicodeString(const char *src, int32_t srcLength,
//========================================
// Codeset conversion
//========================================
#if !U_CHARSET_IS_UTF8
int32_t
UnicodeString::extract(int32_t start,
int32_t length,
@ -125,6 +133,9 @@ UnicodeString::extract(int32_t start,
return extract(start, length, target, dstSize, 0);
}
// else see unistr.cpp
#endif
int32_t
UnicodeString::extract(int32_t start,
int32_t length,
@ -140,44 +151,59 @@ UnicodeString::extract(int32_t start,
// pin the indices to legal values
pinIndices(start, length);
// We need to cast dstSize to int32_t for all subsequent code.
// I don't know why the API was defined with uint32_t but we are stuck with it.
// Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
// as a limit in some functions, it may wrap around and yield a pointer
// that compares less-than target.
int32_t capacity;
if(dstSize < 0x7fffffff) {
// Assume that the capacity is real and a limit pointer won't wrap around.
capacity = (int32_t)dstSize;
} else {
char *targetLimit = target + 0x7fffffff;
if(targetLimit < target) {
// Pin the capacity so that a limit pointer does not wrap around.
targetLimit = (char *)U_MAX_PTR(target);
capacity = (int32_t)(targetLimit - target);
} else {
// Pin the capacity to the maximum int32_t value.
capacity = 0x7fffffff;
}
}
// create the converter
UConverter *converter;
UErrorCode status = U_ZERO_ERROR;
// just write the NUL if the string length is 0
if(length == 0) {
if(dstSize >= 0x80000000) {
// careful: dstSize is unsigned! (0xffffffff means "unlimited")
// make sure that the NUL-termination works (takes int32_t)
dstSize=0x7fffffff;
}
return u_terminateChars(target, dstSize, 0, &status);
return u_terminateChars(target, capacity, 0, &status);
}
// if the codepage is the default, use our cache
// if it is an empty string, then use the "invariant character" conversion
if (codepage == 0) {
const char *defaultName = ucnv_getDefaultName();
if(UCNV_FAST_IS_UTF8(defaultName)) {
return toUTF8(start, length, target, capacity);
}
converter = u_getDefaultConverter(&status);
} else if (*codepage == 0) {
// use the "invariant characters" conversion
int32_t destLength;
// careful: dstSize is unsigned! (0xffffffff means "unlimited")
if(dstSize >= 0x80000000) {
destLength = length;
// make sure that the NUL-termination works (takes int32_t)
dstSize=0x7fffffff;
} else if(length <= (int32_t)dstSize) {
if(length <= capacity) {
destLength = length;
} else {
destLength = (int32_t)dstSize;
destLength = capacity;
}
u_UCharsToChars(getArrayStart() + start, target, destLength);
return u_terminateChars(target, (int32_t)dstSize, length, &status);
return u_terminateChars(target, capacity, length, &status);
} else {
converter = ucnv_open(codepage, &status);
}
length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
length = doExtract(start, length, target, capacity, converter, status);
// close the converter
if (codepage == 0) {
@ -298,20 +324,15 @@ UnicodeString::doCodepageCreate(const char *codepageData,
// create the converter
// if the codepage is the default, use our cache
// if it is an empty string, then use the "invariant character" conversion
UConverter *converter = (codepage == 0 ?
u_getDefaultConverter(&status) :
*codepage == 0 ?
0 :
ucnv_open(codepage, &status));
// if we failed, set the appropriate flags and return
if(U_FAILURE(status)) {
setToBogus();
return;
}
// perform the conversion
if(converter == 0) {
UConverter *converter;
if (codepage == 0) {
const char *defaultName = ucnv_getDefaultName();
if(UCNV_FAST_IS_UTF8(defaultName)) {
setToUTF8(StringPiece(codepageData, dataLength));
return;
}
converter = u_getDefaultConverter(&status);
} else if(*codepage == 0) {
// use the "invariant characters" conversion
if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
u_charsToUChars(codepageData, getArrayStart(), dataLength);
@ -320,9 +341,17 @@ UnicodeString::doCodepageCreate(const char *codepageData,
setToBogus();
}
return;
} else {
converter = ucnv_open(codepage, &status);
}
// convert using the real converter
// if we failed, set the appropriate flags and return
if(U_FAILURE(status)) {
setToBogus();
return;
}
// perform the conversion
doCodepageCreate(codepageData, dataLength, converter, status);
if(U_FAILURE(status)) {
setToBogus();

View File

@ -232,6 +232,29 @@ UnicodeStringTest::TestBasicManipulation()
errln("UnicodeString(const char *, length, cnv, errorCode) does not work with length==-1");
}
}
#if U_CHARSET_IS_UTF8
{
// Test the hardcoded-UTF-8 UnicodeString optimizations.
static const uint8_t utf8[]={ 0x61, 0xC3, 0xA4, 0xC3, 0x9F, 0xE4, 0xB8, 0x80, 0 };
static const UChar utf16[]={ 0x61, 0xE4, 0xDF, 0x4E00 };
UnicodeString from8a = UnicodeString((const char *)utf8);
UnicodeString from8b = UnicodeString((const char *)utf8, (int32_t)sizeof(utf8)-1);
UnicodeString from16(FALSE, utf16, LENGTHOF(utf16));
if(from8a != from16 || from8b != from16) {
errln("UnicodeString(const char * U_CHARSET_IS_UTF8) failed");
}
char buffer[16];
int32_t length8=from16.extract(0, 0x7fffffff, buffer, (uint32_t)sizeof(buffer));
if(length8!=((int32_t)sizeof(utf8)-1) || 0!=uprv_memcmp(buffer, utf8, sizeof(utf8))) {
errln("UnicodeString::extract(char * U_CHARSET_IS_UTF8) failed");
}
length8=from16.extract(1, 2, buffer, (uint32_t)sizeof(buffer));
if(length8!=4 || buffer[length8]!=0 || 0!=uprv_memcmp(buffer, utf8+1, length8)) {
errln("UnicodeString::extract(substring to char * U_CHARSET_IS_UTF8) failed");
}
}
#endif
}
void