diff --git a/icu4c/source/common/bytestream.cpp b/icu4c/source/common/bytestream.cpp
new file mode 100644
index 0000000000..0821f7ab36
--- /dev/null
+++ b/icu4c/source/common/bytestream.cpp
@@ -0,0 +1,75 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2007 Google Inc. All Rights Reserved.
+// Author: sanjay@google.com (Sanjay Ghemawat)
+
+#include "unicode/utypes.h"
+#include "unicode/bytestream.h"
+#include <string.h>  // memcpy()
+
+U_NAMESPACE_BEGIN
+
+// Default implementation: hands back the caller's scratch buffer, or NULL
+// with *result_capacity=0 when the arguments are inconsistent.
+char* ByteSink::GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity) {
+  if (min_capacity < 1 || scratch_capacity < min_capacity) {
+    *result_capacity = 0;
+    return NULL;
+  }
+  *result_capacity = scratch_capacity;
+  return scratch;
+}
+
+// Default implementation: does nothing.
+void ByteSink::Flush() {}
+
+// A negative capacity is treated as an empty output buffer.
+CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
+  : outbuf_(outbuf), capacity_(capacity < 0 ? 0 : capacity), size_(0), overflowed_(false) {
+}
+
+// Copies at most the remaining capacity; excess bytes are dropped and the
+// overflow flag is set.
+void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
+  if (n <= 0) {
+    return;
+  }
+  int32_t available = capacity_ - size_;
+  if (n > available) {
+    n = available;
+    overflowed_ = true;
+  }
+  // Skip the copy when the bytes already sit at the write position, i.e. the
+  // caller wrote directly into the buffer returned by GetAppendBuffer().
+  if (n > 0 && bytes != (outbuf_ + size_)) {
+    memcpy(outbuf_ + size_, bytes, n);
+  }
+  size_ += n;
+}
+
+// Returns the unused tail of the output array when it can hold min_capacity
+// bytes, otherwise the caller's scratch buffer.
+char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
+                                            int32_t desired_capacity_hint,
+                                            char* scratch,
+                                            int32_t scratch_capacity,
+                                            int32_t* result_capacity) {
+  if (min_capacity < 1 || scratch_capacity < min_capacity) {
+    *result_capacity = 0;
+    return NULL;
+  }
+  int32_t available = capacity_ - size_;
+  if (available >= min_capacity) {
+    *result_capacity = available;
+    return outbuf_ + size_;
+  } else {
+    *result_capacity = scratch_capacity;
+    return scratch;
+  }
+}
+
+U_NAMESPACE_END
diff --git a/icu4c/source/common/common.vcproj b/icu4c/source/common/common.vcproj
index 0a6f56ae94..28431912d6 100644
---
a/icu4c/source/common/common.vcproj
+++ b/icu4c/source/common/common.vcproj
@@ -3401,6 +3401,50 @@ + + + + + + + + + + + + + + + +
@@ -3549,6 +3593,50 @@ /> + + + + + + + + + + + + + + + +
diff --git a/icu4c/source/common/stringpiece.cpp b/icu4c/source/common/stringpiece.cpp
new file mode 100644
index 0000000000..82a034178b
--- /dev/null
+++ b/icu4c/source/common/stringpiece.cpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2004 and onwards Google Inc.
+//
+// Author: wilsonh@google.com (Wilson Hsieh)
+//
+
+#include "unicode/utypes.h"
+#include "unicode/stringpiece.h"
+#include "cstring.h"
+
+U_NAMESPACE_BEGIN
+
+// Wraps a NUL-terminated C string; a NULL pointer yields an empty piece.
+StringPiece::StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int32_t>(uprv_strlen(str))) { }
+
+// Suffix of x starting at pos; pos is pinned to [0, x.length()].
+StringPiece::StringPiece(const StringPiece& x, int32_t pos) {
+  if (pos < 0) {
+    pos = 0;
+  } else if (pos > x.length_) {
+    pos = x.length_;
+  }
+  ptr_ = x.ptr_ + pos;
+  length_ = x.length_ - pos;
+}
+
+// Substring of x; pos is pinned to [0, x.length()] and len to
+// [0, x.length() - pos].
+StringPiece::StringPiece(const StringPiece& x, int32_t pos, int32_t len) {
+  if (pos < 0) {
+    pos = 0;
+  } else if (pos > x.length_) {
+    pos = x.length_;
+  }
+  if (len < 0) {
+    len = 0;
+  } else if (len > x.length_ - pos) {
+    len = x.length_ - pos;
+  }
+  ptr_ = x.ptr_ + pos;
+  length_ = len;
+}
+
+// Out-of-class definition of the constant initialized in the class body.
+const int32_t StringPiece::npos;
+
+U_NAMESPACE_END
diff --git a/icu4c/source/common/unicode/bytestream.h b/icu4c/source/common/unicode/bytestream.h
new file mode 100644
index 0000000000..354a4ce3da
--- /dev/null
+++ b/icu4c/source/common/unicode/bytestream.h
@@ -0,0 +1,151 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2007 Google Inc. All Rights Reserved.
+// Author: sanjay@google.com (Sanjay Ghemawat)
+//
+// Abstract interface that consumes a sequence of bytes (ByteSink).
+//
+// Used so that we can write a single piece of code that can operate
+// on a variety of output string types.
+//
+// Various implementations of this interface are provided:
+//   ByteSink:
+//      CheckedArrayByteSink    Write to a flat array, with bounds checking
+//      StringByteSink          Write to an STL string
+
+#ifndef __BYTESTREAM_H__
+#define __BYTESTREAM_H__
+
+/**
+ * \file
+ * \brief C++ API: Interface for writing bytes, and implementation classes.
+ */
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/std_string.h"
+
+U_NAMESPACE_BEGIN
+
+// A ByteSink can be filled with bytes
+// @draft ICU 4.2
+class U_COMMON_API ByteSink : public UMemory {
+public:
+  // @draft ICU 4.2
+  ByteSink() { }
+  // @draft ICU 4.2
+  virtual ~ByteSink() { }
+
+  // Append "bytes[0,n-1]" to this.
+  // @draft ICU 4.2
+  virtual void Append(const char* bytes, int32_t n) = 0;
+
+  // Returns a writable buffer for appending and writes the buffer's capacity to
+  // *result_capacity. Guarantees *result_capacity>=min_capacity.
+  // May return a pointer to the caller-owned scratch buffer which must have
+  // scratch_capacity>=min_capacity.
+  // The returned buffer is only valid until the next operation
+  // on this ByteSink.
+  //
+  // After writing at most *result_capacity bytes, call Append() with the
+  // pointer returned from this function and the number of bytes written.
+  // Many Append() implementations will avoid copying bytes if this function
+  // returned an internal buffer.
+  //
+  // Partial usage example:
+  //  int32_t capacity;
+  //  char* buffer = sink->GetAppendBuffer(..., &capacity);
+  //  ... Write n bytes into buffer, with n <= capacity.
+  //  sink->Append(buffer, n);
+  // In many implementations, that call to Append will avoid copying bytes.
+  //
+  // If the ByteSink allocates or reallocates an internal buffer, it should use
+  // the desired_capacity_hint if appropriate.
+  // If a caller cannot provide a reasonable guess at the desired capacity,
+  // it should pass desired_capacity_hint=0.
+  //
+  // If a non-scratch buffer is returned, the caller may only pass
+  // a prefix to it to Append().
+  // That is, it is not correct to pass an interior pointer to Append().
+  //
+  // The default implementation always returns the scratch buffer.
+  // @draft ICU 4.2
+  virtual char* GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity);
+
+  // Flush internal buffers.
+  // Some byte sinks use internal buffers or provide buffering
+  // and require calling Flush() at the end of the stream.
+  // The default implementation of Flush() does nothing.
+  // @draft ICU 4.2
+  virtual void Flush();
+
+private:
+  ByteSink(const ByteSink &); // copy constructor not implemented
+  ByteSink &operator=(const ByteSink &); // assignment operator not implemented
+};
+
+// -------------------------------------------------------------
+// Some standard implementations
+
+// Implementation of ByteSink that writes to a flat byte array,
+// with bounds-checking:
+// This sink will not write more than capacity bytes to outbuf.
+// If more than capacity bytes are Append()ed, then excess bytes are ignored,
+// and Overflowed() will return true.
+// Overflow does not cause a runtime error.
+// @draft ICU 4.2
+class U_COMMON_API CheckedArrayByteSink : public ByteSink {
+public:
+  // @draft ICU 4.2
+  CheckedArrayByteSink(char* outbuf, int32_t capacity);
+  // @draft ICU 4.2
+  virtual void Append(const char* bytes, int32_t n);
+  // @draft ICU 4.2
+  virtual char* GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity);
+  // Returns the number of bytes actually written to the sink.
+  // @draft ICU 4.2
+  int32_t NumberOfBytesWritten() const { return size_; }
+  // Returns true if any bytes were discarded, i.e., if there was an
+  // attempt to write more than 'capacity' bytes.
+  // @draft ICU 4.2
+  UBool Overflowed() const { return overflowed_; }
+private:
+  char* outbuf_;
+  const int32_t capacity_;
+  int32_t size_;
+  bool overflowed_;
+  CheckedArrayByteSink(); // default constructor not implemented
+  CheckedArrayByteSink(const CheckedArrayByteSink &); // copy constructor not implemented
+  CheckedArrayByteSink &operator=(const CheckedArrayByteSink &); // assignment operator not implemented
+};
+
+#if U_HAVE_STD_STRING
+
+// Implementation of ByteSink that writes to a "string".
+// @draft ICU 4.2
+template<typename StringClass>
+class StringByteSink : public ByteSink {
+ public:
+  // @draft ICU 4.2
+  StringByteSink(StringClass* dest) : dest_(dest) { }
+  // @draft ICU 4.2
+  virtual void Append(const char* data, int32_t n) { dest_->append(data, n); }
+ private:
+  StringClass* dest_;
+  StringByteSink(); // default constructor not implemented
+  StringByteSink(const StringByteSink &); // copy constructor not implemented
+  StringByteSink &operator=(const StringByteSink &); // assignment operator not implemented
+};
+
+#endif
+
+U_NAMESPACE_END
+
+#endif  // __BYTESTREAM_H__
diff --git a/icu4c/source/common/unicode/stringpiece.h b/icu4c/source/common/unicode/stringpiece.h
new file mode 100644
index 0000000000..541c8f05df
--- /dev/null
+++ b/icu4c/source/common/unicode/stringpiece.h
@@ -0,0 +1,113 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2001 and onwards Google Inc.
+// Author: Sanjay Ghemawat
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+//
+// Arghh!  I wish C++ literals were "string".
+
+#ifndef __STRINGPIECE_H__
+#define __STRINGPIECE_H__
+
+/**
+ * \file
+ * \brief C++ API: Read-only byte string wrapper class.
+ */
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/std_string.h"
+
+U_NAMESPACE_BEGIN
+
+// @draft ICU 4.2
+class U_COMMON_API StringPiece : public UMemory {
+ private:
+  const char*   ptr_;
+  int32_t       length_;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  // @draft ICU 4.2
+  StringPiece() : ptr_(NULL), length_(0) { }
+  // @draft ICU 4.2
+  StringPiece(const char* str);
+#if U_HAVE_STD_STRING
+  // @draft ICU 4.2
+  StringPiece(const U_STD_NSQ string& str)
+    : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
+#endif
+  // @draft ICU 4.2
+  StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
+  // Substring of another StringPiece.
+  // pos must be non-negative and <= x.length().
+  // @draft ICU 4.2
+  StringPiece(const StringPiece& x, int32_t pos);
+  // Substring of another StringPiece.
+  // pos must be non-negative and <= x.length().
+  // len must be non-negative and will be pinned to at most x.length() - pos.
+  // @draft ICU 4.2
+  StringPiece(const StringPiece& x, int32_t pos, int32_t len);
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  // @draft ICU 4.2
+  const char* data() const { return ptr_; }
+  // @draft ICU 4.2
+  int32_t size() const { return length_; }
+  // @draft ICU 4.2
+  int32_t length() const { return length_; }
+  // @draft ICU 4.2
+  UBool empty() const { return length_ == 0; }
+
+  // @draft ICU 4.2
+  void clear() { ptr_ = NULL; length_ = 0; }
+
+  // @draft ICU 4.2
+  void remove_prefix(int32_t n) {
+    if (n >= 0) {
+      if (n > length_) {
+        n = length_;
+      }
+      ptr_ += n;
+      length_ -= n;
+    }
+  }
+
+  // @draft ICU 4.2
+  void remove_suffix(int32_t n) {
+    if (n >= 0) {
+      if (n <= length_) {
+        length_ -= n;
+      } else {
+        length_ = 0;
+      }
+    }
+  }
+
+  // @draft ICU 4.2
+  static const int32_t npos = 0x7fffffff;
+
+  // @draft ICU 4.2
+  StringPiece substr(int32_t pos, int32_t n = npos) const {
+    return StringPiece(*this, pos, n);
+  }
+};
+
+U_NAMESPACE_END
+
+#endif  // __STRINGPIECE_H__
diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h
index 65a444fc31..8eae28f217 100644
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@@ -26,7 +26,11 @@
  * \brief C++ API: Unicode String
  */
 
+#include "unicode/utypes.h"
 #include "unicode/rep.h"
+#include "unicode/std_string.h"
+#include "unicode/stringpiece.h"
+#include "unicode/bytestream.h"
 
 struct UConverter;          // unicode/ucnv.h
 class  StringThreadTest;
@@ -1532,6 +1536,40 @@ public:
                     UConverter *cnv,
                     UErrorCode &errorCode) const;
 
+#endif
+
+  /**
+   * Convert the UnicodeString to UTF-8 and write the result
+   * to a ByteSink. This is called by toUTF8String().
+   * Unpaired surrogates are replaced with U+FFFD.
+   * Calls u_strToUTF8WithSub().
+   *
+   * @param sink A ByteSink to which the UTF-8 version of the string is written.
+   * @draft ICU 4.2
+   * @see toUTF8String
+   */
+  void toUTF8(ByteSink &sink) const;
+
+#if U_HAVE_STD_STRING
+
+  /**
+   * Convert the UnicodeString to UTF-8 and append the result
+   * to a standard string.
+   * Unpaired surrogates are replaced with U+FFFD.
+   * Calls toUTF8().
+   *
+   * @param result A standard string (or a compatible object)
+   *        to which the UTF-8 version of the string is appended.
+   * @return The string object.
+   * @draft ICU 4.2
+   * @see toUTF8
+   */
+  template<typename StringClass>
+  StringClass &toUTF8String(StringClass &result) const {
+    StringByteSink<StringClass> sbs(&result);  // Named: toUTF8() takes a non-const reference.
+    toUTF8(sbs);
+    return result;
+  }
 #endif
 
   /**
@@ -2917,6 +2955,21 @@ public:
    */
   virtual ~UnicodeString();
 
+  /**
+   * Create a UnicodeString from a UTF-8 string.
+   * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
+   * Calls u_strFromUTF8WithSub().
+   *
+   * @param utf8 UTF-8 input string.
+   *             Note that a StringPiece can be implicitly constructed
+   *             from a std::string or a NUL-terminated const char * string.
+   * @return A UnicodeString with equivalent UTF-16 contents.
+   * @see toUTF8
+   * @see toUTF8String
+   * @draft ICU 4.2
+   */
+  static UnicodeString fromUTF8(const StringPiece &utf8);
+
   /**
    * Create a UnicodeString from a UTF-32 string.
    * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index a52d66d161..473b22352f 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -379,9 +379,34 @@ UnicodeString::~UnicodeString()
 // Factory methods
 //========================================
 
+UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
+  UnicodeString result;
+  int32_t length = utf8.length();
+  int32_t capacity;
+  // The UTF-16 string will be at most as long as the UTF-8 string.
+  if(length <= US_STACKBUF_SIZE) {
+    capacity = US_STACKBUF_SIZE;
+  } else {
+    capacity = length + 1;  // +1 for the terminating NUL.
+  }
+  UChar *utf16 = result.getBuffer(capacity);
+  int32_t length16 = 0;  // Defined value even if the conversion returns without setting it.
+  UErrorCode errorCode = U_ZERO_ERROR;
+  u_strFromUTF8WithSub(utf16, result.getCapacity(), &length16,
+                       utf8.data(), length,
+                       0xfffd,  // Substitution character.
+                       NULL,    // Don't care about number of substitutions.
+                       &errorCode);
+  result.releaseBuffer(length16);
+  if(U_FAILURE(errorCode)) {
+    result.setToBogus();
+  }
+  return result;
+}
+
 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
   UnicodeString result;
-  int32_t capacity = length;
+  int32_t capacity;
   // Most UTF-32 strings will be BMP-only and result in a same-length
   // UTF-16 string. We overestimate the capacity just slightly,
   // just in case there are a few supplementary characters.
@@ -756,6 +781,51 @@ UnicodeString::extractBetween(int32_t start,
   doExtract(start, limit - start, target);
 }
 
+// When converting from UTF-16 to UTF-8, the result will have at most 3 times
+// as many bytes as the source has UChars.
+// The "worst cases" are writing systems like Indic, Thai and CJK with
+// 3:1 bytes:UChars.
+void
+UnicodeString::toUTF8(ByteSink &sink) const {
+  int32_t length16 = length();
+  if(length16 != 0) {
+    char stackBuffer[1024];
+    int32_t capacity = (int32_t)sizeof(stackBuffer);
+    UBool utf8IsOwned = FALSE;
+    char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
+                                      length16 <= 0x7fffffff / 3 ? 3*length16 : 0x7fffffff,  // Hint, clamped to avoid overflow.
+                                      stackBuffer, capacity,
+                                      &capacity);
+    int32_t length8 = 0;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    u_strToUTF8WithSub(utf8, capacity, &length8,
+                       getBuffer(), length16,
+                       0xFFFD,  // Standard substitution character.
+                       NULL,    // Don't care about number of substitutions.
+                       &errorCode);
+    if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
+      utf8 = (char *)uprv_malloc(length8);
+      if(utf8 != NULL) {
+        utf8IsOwned = TRUE;
+        errorCode = U_ZERO_ERROR;
+        u_strToUTF8WithSub(utf8, length8, &length8,
+                           getBuffer(), length16,
+                           0xFFFD,  // Standard substitution character.
+                           NULL,    // Don't care about number of substitutions.
+                           &errorCode);
+      } else {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+      }
+    }
+    if(U_SUCCESS(errorCode)) {
+      sink.Append(utf8, length8);
+    }
+    if(utf8IsOwned) {
+      uprv_free(utf8);
+    }
+  }
+}
+
 int32_t
 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
   int32_t length32=0;
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp
index 28dacf07e5..7fb300342f 100644
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -5,6 +5,7 @@
  ********************************************************************/
 
 #include "ustrtest.h"
+#include "unicode/std_string.h"
 #include "unicode/unistr.h"
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
@@ -59,6 +60,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
         case 16: name = "TestCharString"; if (exec) TestCharString(); break;
         case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
         case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
+        case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;
 
         default: name = ""; break; //needed to end loop
     }
@@ -1739,3 +1741,71 @@ UnicodeStringTest::TestUTF32() {
         errln("UnicodeString::toUTF32() did not create the expected string.");
     }
 }
+
+void
+UnicodeStringTest::TestUTF8() {
+    static const uint8_t utf8[] = {
+        // Code points:
+        //   0x41, 0xd900,
+        //   0x61, 0xdc00,
+        //   0x110000, 0x5a,
+        //   0x50000, 0x7a,
+        //   0x10000, 0x20000,
+        //   0xe0000, 0x10ffff
+        0x41, 0xed, 0xa4, 0x80,
+        0x61, 0xed, 0xb0, 0x80,
+        0xf4, 0x90, 0x80, 0x80, 0x5a,
+        0xf1, 0x90, 0x80, 0x80, 0x7a,
+        0xf0, 0x90, 0x80, 0x80, 0xf0, 0xa0, 0x80, 0x80,
+        0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+    };
+    static const UChar expected_utf16[] = {
+        0x41, 0xfffd,
+        0x61, 0xfffd,
+        0xfffd, 0x5a,
+        0xd900, 0xdc00, 0x7a,
+        0xd800, 0xdc00, 0xd840, 0xdc00,
+        0xdb40, 0xdc00, 0xdbff, 0xdfff
+    };
+    UnicodeString from8 = UnicodeString::fromUTF8(StringPiece((const char *)utf8, (int32_t)sizeof(utf8)));
+    UnicodeString expected(FALSE, expected_utf16, LENGTHOF(expected_utf16));
+
+    if(from8 != expected) {
+        errln("UnicodeString::fromUTF8(StringPiece) did not create the expected string.");
+    }
+#if U_HAVE_STD_STRING
+    U_STD_NSQ string utf8_string((const char *)utf8, sizeof(utf8));
+    UnicodeString from8b = UnicodeString::fromUTF8(utf8_string);
+    if(from8b != expected) {
+        errln("UnicodeString::fromUTF8(std::string) did not create the expected string.");
+    }
+#endif
+
+    static const UChar utf16[] = {
+        0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0xd800, 0xdc00, 0xdbff, 0xdfff
+    };
+    static const uint8_t expected_utf8[] = {
+        0x41, 0xef, 0xbf, 0xbd, 0x61, 0xef, 0xbf, 0xbd, 0x5a, 0xf1, 0x90, 0x80, 0x80, 0x7a,
+        0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+    };
+    UnicodeString us(FALSE, utf16, LENGTHOF(utf16));
+
+    char buffer[64];
+    CheckedArrayByteSink sink(buffer, (int32_t)sizeof(buffer));
+    us.toUTF8(sink);
+    if( sink.NumberOfBytesWritten() != (int32_t)sizeof(expected_utf8) ||
+        0 != uprv_memcmp(buffer, expected_utf8, sizeof(expected_utf8))
+    ) {
+        errln("UnicodeString::toUTF8() did not create the expected string.");
+    }
+#if U_HAVE_STD_STRING
+    // Initial contents for testing that toUTF8String() appends.
+    U_STD_NSQ string result8 = "-->";
+    U_STD_NSQ string expected8 = "-->" + U_STD_NSQ string((const char *)expected_utf8, sizeof(expected_utf8));
+    // Use the return value just for testing.
+    U_STD_NSQ string &result8r = us.toUTF8String(result8);
+    if(result8r != expected8 || &result8r != &result8) {
+        errln("UnicodeString::toUTF8String() did not create the expected string.");
+    }
+#endif
+}
diff --git a/icu4c/source/test/intltest/ustrtest.h b/icu4c/source/test/intltest/ustrtest.h
index 96a9894639..5269c9ae13 100644
--- a/icu4c/source/test/intltest/ustrtest.h
+++ b/icu4c/source/test/intltest/ustrtest.h
@@ -76,6 +76,7 @@ public:
     void TestCharString(void);
     void TestNameSpace(void);
     void TestUTF32(void);
+    void TestUTF8(void);
 };
 
 class StringCaseTest: public IntlTest {