ICU-6606 optimized away calls to u_getDefaultConverter() from UnicodeString code (constructors and extract()) if U_CHARSET_IS_UTF8

X-SVN-Rev: 25571
2009-03-12 21:24:54 +00:00 · 2009-03-12 21:24:54 +00:00 · e74be582d0
commit e74be582d0
parent c7b7271028
6 changed files with 191 additions and 59 deletions
--- a/icu4c/source/common/ucnv_bld.c
+++ b/icu4c/source/common/ucnv_bld.c
@ -727,14 +727,7 @@ ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UEr
        /* the default converter name is already canonical */
 #endif
    }
-    else if((converterName[0] == 'U' ?
-            (                           converterName[1] == 'T' && converterName[2] == 'F') :
-            (converterName[0] == 'u' && converterName[1] == 't' && converterName[2] == 'f'))
-        &&
-        (converterName[3] == '-' ?
-            (converterName[4] == '8' && converterName[5] == 0) :
-            (converterName[3] == '8' && converterName[4] == 0)))
-    {
+    else if(UCNV_FAST_IS_UTF8(converterName)) {
        /* fastpath for UTF-8 */
        return (UConverterSharedData *)converterData[UCNV_UTF8];
    }
--- a/icu4c/source/common/ucnv_imp.h
+++ b/icu4c/source/common/ucnv_imp.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2007, International Business Machines
+*   Copyright (C) 1999-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
@ -27,6 +27,21 @@
 #include "unicode/uloc.h"
 #include "ucnv_bld.h"

+/*
+ * Fast check for whether a charset name is "UTF-8".
+ * This does not recognize all of the variations that ucnv_open()
+ * and other functions recognize, but it covers most cases.
+ * @param name const char * NUL-terminated charset name; the argument is evaluated several times
+ * @return nonzero if name is exactly "UTF-8", "UTF8", "utf-8" or "utf8", otherwise 0
+ */
+#define UCNV_FAST_IS_UTF8(name) \
+    ((((name)[0]=='U' ? \
+      (                  (name)[1]=='T' && (name)[2]=='F') : \
+      ((name)[0]=='u' && (name)[1]=='t' && (name)[2]=='f'))) \
+  && ((name)[3]=='-' ? \
+     ((name)[4]=='8' && (name)[5]==0) : \
+     ((name)[3]=='8' && (name)[4]==0)))
+
 /* figures out if we need to go to file to read in the data tables.
 * @param converterName The name of the converter
 * @param err The error code
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@ -3128,6 +3128,17 @@ protected:
  virtual UChar32 getChar32At(int32_t offset) const;

 private:
+  // For char* constructors. Could be made public.
+  UnicodeString &setToUTF8(const StringPiece &utf8);
+  // For extract(char*).
+  // We could make a toUTF8(target, capacity, errorCode) public but not
+  // this version: New API will be cleaner if we make callers create substrings
+  // rather than having start+length on every method,
+  // and it should take a UErrorCode&.
+  int32_t
+  toUTF8(int32_t start, int32_t len,
+         char *target, int32_t capacity) const;
+

  inline int8_t
  doCompare(int32_t start,
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@ -295,6 +295,32 @@ UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
  }
 }

+#if U_CHARSET_IS_UTF8
+
+UnicodeString::UnicodeString(const char *codepageData)
+  : fShortLength(0),
+    fFlags(kShortString) {
+  if(codepageData != 0) {  // a NULL pointer leaves the string as initialized above
+    setToUTF8(codepageData);  // U_CHARSET_IS_UTF8 build: input is converted as UTF-8, no converter is opened here
+  }
+}
+
+UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
+  : fShortLength(0),
+    fFlags(kShortString) {
+  // Nothing to convert (NULL data, zero length, or a negative length other than -1): keep the initial state.
+  if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
+    return;
+  }
+  if(dataLength == -1) {  // -1 requests that the length be measured with uprv_strlen()
+    dataLength = (int32_t)uprv_strlen(codepageData);
+  }
+  setToUTF8(StringPiece(codepageData, dataLength));
+}
+
+// else see unistr_cnv.cpp
+#endif
+
 UnicodeString::UnicodeString(const UnicodeString& that)
  : Replaceable(),
    fShortLength(0),
@ -381,26 +407,7 @@ UnicodeString::~UnicodeString()

 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
  UnicodeString result;
-  int32_t length = utf8.length();
-  int32_t capacity;
-  // The UTF-16 string will be at most as long as the UTF-8 string.
-  if(length <= US_STACKBUF_SIZE) {
-    capacity = US_STACKBUF_SIZE;
-  } else {
-    capacity = length + 1;  // +1 for the terminating NUL.
-  }
-  UChar *utf16 = result.getBuffer(capacity);
-  int32_t length16;
-  UErrorCode errorCode = U_ZERO_ERROR;
-  u_strFromUTF8WithSub(utf16, result.getCapacity(), &length16,
-      utf8.data(), length,
-      0xfffd,  // Substitution character.
-      NULL,    // Don't care about number of substitutions.
-      &errorCode);
-  result.releaseBuffer(length16);
-  if(U_FAILURE(errorCode)) {
-    result.setToBogus();
-  }
+  result.setToUTF8(utf8);
  return result;
 }

@ -772,6 +779,35 @@ UnicodeString::extract(int32_t start,
  return u_terminateChars(target, targetCapacity, length, &status);
 }

+int32_t
+UnicodeString::toUTF8(int32_t start, int32_t len,
+                      char *target, int32_t capacity) const {
+  pinIndices(start, len);
+  int32_t length8 = 0;  // defined result even if the conversion fails before it stores a length
+  UErrorCode errorCode = U_ZERO_ERROR;  // not reported to the caller; only the length is returned
+  u_strToUTF8WithSub(target, capacity, &length8,
+                     getBuffer() + start, len,
+                     0xFFFD,  // Standard substitution character.
+                     NULL,    // Don't care about number of substitutions.
+                     &errorCode);
+  return length8;
+}
+
+#if U_CHARSET_IS_UTF8
+
+int32_t
+UnicodeString::extract(int32_t start, int32_t len,
+                       char *target, uint32_t dstSize) const {
+  // Illegal arguments (NULL target with a nonzero capacity): write nothing and return 0.
+  if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {  // dstSize is unsigned, so a "< 0" test could never be true
+    return 0;
+  }
+  return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);  // clamp to the int32_t range
+}
+
+// else see unistr_cnv.cpp
+#endif
+
 void 
 UnicodeString::extractBetween(int32_t start,
                  int32_t limit,
@ -1108,6 +1144,31 @@ UnicodeString::setTo(UChar *buffer,
  return *this;
 }

+UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
+  unBogus();
+  int32_t length = utf8.length();
+  int32_t capacity;
+  // The UTF-16 string will be at most as long as the UTF-8 string, so the capacity is derived from the UTF-8 length.
+  if(length <= US_STACKBUF_SIZE) {
+    capacity = US_STACKBUF_SIZE;  // never request less than US_STACKBUF_SIZE
+  } else {
+    capacity = length + 1;  // +1 for the terminating NUL.
+  }
+  UChar *utf16 = getBuffer(capacity);
+  int32_t length16;
+  UErrorCode errorCode = U_ZERO_ERROR;
+  u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
+      utf8.data(), length,
+      0xfffd,  // Substitution character.
+      NULL,    // Don't care about number of substitutions.
+      &errorCode);
+  releaseBuffer(length16);
+  if(U_FAILURE(errorCode)) {  // there is no error-code parameter, so failure is signalled by making the string bogus
+    setToBogus();
+  }
+  return *this;
+}
+
 UnicodeString&
 UnicodeString::setCharAt(int32_t offset,
             UChar c)
--- a/icu4c/source/common/unistr_cnv.cpp
+++ b/icu4c/source/common/unistr_cnv.cpp
@ -37,6 +37,8 @@ U_NAMESPACE_BEGIN
 // Constructors
 //========================================

+#if !U_CHARSET_IS_UTF8
+
 UnicodeString::UnicodeString(const char *codepageData)
  : fShortLength(0),
    fFlags(kShortString)
@ -56,6 +58,9 @@ UnicodeString::UnicodeString(const char *codepageData,
    }
 }

+// else see unistr.cpp
+#endif
+
 UnicodeString::UnicodeString(const char *codepageData,
                             const char *codepage)
  : fShortLength(0),
@ -117,6 +122,9 @@ UnicodeString::UnicodeString(const char *src, int32_t srcLength,
 //========================================
 // Codeset conversion
 //========================================
+
+#if !U_CHARSET_IS_UTF8
+
 int32_t
 UnicodeString::extract(int32_t start,
                       int32_t length,
@ -125,6 +133,9 @@ UnicodeString::extract(int32_t start,
    return extract(start, length, target, dstSize, 0);
 }

+// else see unistr.cpp
+#endif
+
 int32_t
 UnicodeString::extract(int32_t start,
                       int32_t length,
@ -140,44 +151,59 @@ UnicodeString::extract(int32_t start,
    // pin the indices to legal values
    pinIndices(start, length);

+    // We need to cast dstSize to int32_t for all subsequent code.
+    // I don't know why the API was defined with uint32_t but we are stuck with it.
+    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
+    // as a limit in some functions, it may wrap around and yield a pointer
+    // that compares less-than target.
+    int32_t capacity;
+    if(dstSize < 0x7fffffff) {
+        // Assume that the capacity is real and a limit pointer won't wrap around.
+        capacity = (int32_t)dstSize;
+    } else {
+        char *targetLimit = target + 0x7fffffff;
+        if(targetLimit < target) {
+            // Pin the capacity so that a limit pointer does not wrap around.
+            targetLimit = (char *)U_MAX_PTR(target);
+            capacity = (int32_t)(targetLimit - target);
+        } else {
+            // Pin the capacity to the maximum int32_t value.
+            capacity = 0x7fffffff;
+        }
+    }
+
    // create the converter
    UConverter *converter;
    UErrorCode status = U_ZERO_ERROR;

    // just write the NUL if the string length is 0
    if(length == 0) {
-        if(dstSize >= 0x80000000) {  
-            // careful: dstSize is unsigned! (0xffffffff means "unlimited")
-            // make sure that the NUL-termination works (takes int32_t)
-            dstSize=0x7fffffff;
-        }
-        return u_terminateChars(target, dstSize, 0, &status);
+        return u_terminateChars(target, capacity, 0, &status);
    }

    // if the codepage is the default, use our cache
    // if it is an empty string, then use the "invariant character" conversion
    if (codepage == 0) {
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            return toUTF8(start, length, target, capacity);
+        }
        converter = u_getDefaultConverter(&status);
    } else if (*codepage == 0) {
        // use the "invariant characters" conversion
        int32_t destLength;
-        // careful: dstSize is unsigned! (0xffffffff means "unlimited")
-        if(dstSize >= 0x80000000) {
-            destLength = length;
-            // make sure that the NUL-termination works (takes int32_t)
-            dstSize=0x7fffffff;
-        } else if(length <= (int32_t)dstSize) {
+        if(length <= capacity) {
            destLength = length;
        } else {
-            destLength = (int32_t)dstSize;
+            destLength = capacity;
        }
        u_UCharsToChars(getArrayStart() + start, target, destLength);
-        return u_terminateChars(target, (int32_t)dstSize, length, &status);
+        return u_terminateChars(target, capacity, length, &status);
    } else {
        converter = ucnv_open(codepage, &status);
    }

-    length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
+    length = doExtract(start, length, target, capacity, converter, status);

    // close the converter
    if (codepage == 0) {
@ -298,20 +324,15 @@ UnicodeString::doCodepageCreate(const char *codepageData,
    // create the converter
    // if the codepage is the default, use our cache
    // if it is an empty string, then use the "invariant character" conversion
-    UConverter *converter = (codepage == 0 ?
-                             u_getDefaultConverter(&status) :
-                             *codepage == 0 ?
-                               0 :
-                               ucnv_open(codepage, &status));
-
-    // if we failed, set the appropriate flags and return
-    if(U_FAILURE(status)) {
-        setToBogus();
-        return;
-    }
-
-    // perform the conversion
-    if(converter == 0) {
+    UConverter *converter;
+    if (codepage == 0) {
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            setToUTF8(StringPiece(codepageData, dataLength));
+            return;
+        }
+        converter = u_getDefaultConverter(&status);
+    } else if(*codepage == 0) {
        // use the "invariant characters" conversion
        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
            u_charsToUChars(codepageData, getArrayStart(), dataLength);
@ -320,9 +341,17 @@ UnicodeString::doCodepageCreate(const char *codepageData,
            setToBogus();
        }
        return;
+    } else {
+        converter = ucnv_open(codepage, &status);
    }

-    // convert using the real converter
+    // if we failed, set the appropriate flags and return
+    if(U_FAILURE(status)) {
+        setToBogus();
+        return;
+    }
+
+    // perform the conversion
    doCodepageCreate(codepageData, dataLength, converter, status);
    if(U_FAILURE(status)) {
        setToBogus();
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@ -232,6 +232,29 @@ UnicodeStringTest::TestBasicManipulation()
            errln("UnicodeString(const char *, length, cnv, errorCode) does not work with length==-1");
        }
    }
+
+#if U_CHARSET_IS_UTF8
+    {
+        // Test the hardcoded-UTF-8 UnicodeString optimizations.
+        static const uint8_t utf8[]={ 0x61, 0xC3, 0xA4, 0xC3, 0x9F, 0xE4, 0xB8, 0x80, 0 };
+        static const UChar utf16[]={ 0x61, 0xE4, 0xDF, 0x4E00 };
+        UnicodeString from8a = UnicodeString((const char *)utf8);
+        UnicodeString from8b = UnicodeString((const char *)utf8, (int32_t)sizeof(utf8)-1);
+        UnicodeString from16(FALSE, utf16, LENGTHOF(utf16));
+        if(from8a != from16 || from8b != from16) {
+            errln("UnicodeString(const char * U_CHARSET_IS_UTF8) failed");
+        }
+        char buffer[16];
+        int32_t length8=from16.extract(0, 0x7fffffff, buffer, (uint32_t)sizeof(buffer));
+        if(length8!=((int32_t)sizeof(utf8)-1) || 0!=uprv_memcmp(buffer, utf8, sizeof(utf8))) {
+            errln("UnicodeString::extract(char * U_CHARSET_IS_UTF8) failed");
+        }
+        length8=from16.extract(1, 2, buffer, (uint32_t)sizeof(buffer));
+        if(length8!=4 || buffer[length8]!=0 || 0!=uprv_memcmp(buffer, utf8+1, length8)) {
+            errln("UnicodeString::extract(substring to char * U_CHARSET_IS_UTF8) failed");
+        }
+    }
+#endif
 }

 void