ICU-6765 add API: StringPiece, ByteSink(s), UnicodeString::toUTF8() and fromUTF8()

X-SVN-Rev: 25519
2009-03-06 19:19:00 +00:00 · 2009-03-06 19:19:00 +00:00 · 63ea2e06da
commit 63ea2e06da
parent 025e6a09ae
9 changed files with 656 additions and 1 deletions
--- a/icu4c/source/common/bytestream.cpp
+++ b/icu4c/source/common/bytestream.cpp
@ -0,0 +1,64 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2007 Google Inc. All Rights Reserved.
+// Author: sanjay@google.com (Sanjay Ghemawat)
+
+#include "unicode/utypes.h"
+#include "unicode/bytestream.h"
+
+U_NAMESPACE_BEGIN
+
+// Default implementation: there is no internal storage, so the caller's
+// scratch buffer is handed back whenever it can hold min_capacity bytes.
+// desired_capacity_hint is not used here.
+// Returns NULL with *result_capacity=0 if min_capacity<1 or if the scratch
+// buffer is smaller than min_capacity.
+char* ByteSink::GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity) {
+  if (min_capacity >= 1 && min_capacity <= scratch_capacity) {
+    *result_capacity = scratch_capacity;
+    return scratch;
+  }
+  *result_capacity = 0;
+  return NULL;
+}
+
+// Default implementation: does nothing.
+void ByteSink::Flush() {}
+
+// Starts with an empty sink over outbuf. Only the pointer is stored;
+// a negative capacity is treated as 0.
+CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
+    : outbuf_(outbuf),
+      capacity_(capacity > 0 ? capacity : 0),
+      size_(0),
+      overflowed_(false) {
+}
+
+// Appends up to n bytes; whatever does not fit into the remaining capacity
+// is dropped and the overflow flag is set.
+void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
+  // Empty or negative-length input: nothing to append.
+  if (n <= 0) {
+    return;
+  }
+  const int32_t room = capacity_ - size_;
+  if (room < n) {
+    // Keep only the part that fits and remember that bytes were discarded.
+    overflowed_ = true;
+    n = room;
+  }
+  char* const dest = outbuf_ + size_;
+  // If the caller wrote directly into the buffer returned by
+  // GetAppendBuffer(), the bytes are already in place and need no copy.
+  if (n > 0 && bytes != dest) {
+    memcpy(dest, bytes, n);
+  }
+  size_ += n;
+}
+
+// Returns the unused tail of the output array when it can hold at least
+// min_capacity bytes; otherwise falls back to the caller's scratch buffer.
+// Returns NULL with *result_capacity=0 if min_capacity<1 or if the scratch
+// buffer is smaller than min_capacity.
+// desired_capacity_hint is not used here.
+char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
+                                            int32_t desired_capacity_hint,
+                                            char* scratch,
+                                            int32_t scratch_capacity,
+                                            int32_t* result_capacity) {
+  if (min_capacity >= 1 && min_capacity <= scratch_capacity) {
+    const int32_t room = capacity_ - size_;
+    if (room < min_capacity) {
+      // Not enough space left in the array: let the caller use its scratch.
+      *result_capacity = scratch_capacity;
+      return scratch;
+    }
+    *result_capacity = room;
+    return outbuf_ + size_;
+  }
+  *result_capacity = 0;
+  return NULL;
+}
+
+U_NAMESPACE_END
--- a/icu4c/source/common/common.vcproj
+++ b/icu4c/source/common/common.vcproj
@ -3401,6 +3401,50 @@
 		<Filter
 			Name="strings"
 			>
+			<File
+				RelativePath=".\bytestream.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicode\bytestream.h"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+			</File>
 			<File
 				RelativePath=".\chariter.cpp"
 				>
@ -3549,6 +3593,50 @@
 					/>
 				</FileConfiguration>
 			</File>
+			<File
+				RelativePath=".\stringpiece.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicode\stringpiece.h"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
+						Outputs="..\..\include\unicode\$(InputFileName)"
+					/>
+				</FileConfiguration>
+			</File>
 			<File
 				RelativePath=".\ucasemap.c"
 				>
--- a/icu4c/source/common/stringpiece.cpp
+++ b/icu4c/source/common/stringpiece.cpp
@ -0,0 +1,45 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2004 and onwards Google Inc.
+//
+// Author: wilsonh@google.com (Wilson Hsieh)
+//
+
+#include "unicode/utypes.h"
+#include "unicode/stringpiece.h"
+#include "cstring.h"
+
+U_NAMESPACE_BEGIN
+
+// Wraps a NUL-terminated string; a NULL pointer yields length 0.
+StringPiece::StringPiece(const char* str)
+    : ptr_(str), length_(0) {
+  if (str != NULL) {
+    length_ = static_cast<int32_t>(uprv_strlen(str));
+  }
+}
+
+// Suffix of x starting at pos; pos is pinned to the range [0, x.length_].
+StringPiece::StringPiece(const StringPiece& x, int32_t pos) {
+  const int32_t limit = x.length_;
+  const int32_t start = pos < 0 ? 0 : (pos > limit ? limit : pos);
+  ptr_ = x.ptr_ + start;
+  length_ = limit - start;
+}
+
+// Substring of x: pos is pinned to [0, x.length_], and len is pinned to
+// [0, x.length_ - pos] (using the pinned pos).
+StringPiece::StringPiece(const StringPiece& x, int32_t pos, int32_t len) {
+  const int32_t limit = x.length_;
+  const int32_t start = pos < 0 ? 0 : (pos > limit ? limit : pos);
+  const int32_t maxLength = limit - start;
+  ptr_ = x.ptr_ + start;
+  length_ = len < 0 ? 0 : (len > maxLength ? maxLength : len);
+}
+
+const int32_t StringPiece::npos;
+
+U_NAMESPACE_END
--- a/icu4c/source/common/unicode/bytestream.h
+++ b/icu4c/source/common/unicode/bytestream.h
@ -0,0 +1,151 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2007 Google Inc. All Rights Reserved.
+// Author: sanjay@google.com (Sanjay Ghemawat)
+//
+// Abstract interface that consumes a sequence of bytes (ByteSink).
+//
+// Used so that we can write a single piece of code that can operate
+// on a variety of output string types.
+//
+// Various implementations of this interface are provided:
+//   ByteSink:
+//      CheckedArrayByteSink    Write to a flat array, with bounds checking
+//      StringByteSink          Write to an STL string
+
+#ifndef __BYTESTREAM_H__
+#define __BYTESTREAM_H__
+
+/**
+ * \file 
+ * \brief C++ API: Interface for writing bytes, and implementation classes.
+ */
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/std_string.h"
+
+U_NAMESPACE_BEGIN
+
+// A ByteSink can be filled with bytes.
+// This is an abstract interface: subclasses must implement Append();
+// GetAppendBuffer() and Flush() have default implementations.
+// Objects of this class cannot be copied or assigned.
+// @draft ICU 4.2
+class U_COMMON_API ByteSink : public UMemory {
+public:
+  // Default constructor.
+  // @draft ICU 4.2
+  ByteSink() { }
+  // Virtual destructor.
+  // @draft ICU 4.2
+  virtual ~ByteSink() { }
+
+  // Append "bytes[0,n-1]" to this.
+  // @draft ICU 4.2
+  virtual void Append(const char* bytes, int32_t n) = 0;
+
+  // Returns a writable buffer for appending and writes the buffer's capacity to
+  // *result_capacity. Guarantees *result_capacity>=min_capacity.
+  // May return a pointer to the caller-owned scratch buffer which must have
+  // scratch_capacity>=min_capacity.
+  // The returned buffer is only valid until the next operation
+  // on this ByteSink.
+  //
+  // After writing at most *result_capacity bytes, call Append() with the
+  // pointer returned from this function and the number of bytes written.
+  // Many Append() implementations will avoid copying bytes if this function
+  // returned an internal buffer.
+  //
+  // Partial usage example:
+  //  int32_t capacity;
+  //  char* buffer = sink->GetAppendBuffer(..., &capacity);
+  //  ... Write n bytes into buffer, with n <= capacity.
+  //  sink->Append(buffer, n);
+  // In many implementations, that call to Append will avoid copying bytes.
+  //
+  // If the ByteSink allocates or reallocates an internal buffer, it should use
+  // the desired_capacity_hint if appropriate.
+  // If a caller cannot provide a reasonable guess at the desired capacity,
+  // it should pass desired_capacity_hint=0.
+  //
+  // If a non-scratch buffer is returned, the caller may only pass
+  // a prefix to it to Append().
+  // That is, it is not correct to pass an interior pointer to Append().
+  //
+  // The default implementation always returns the scratch buffer.
+  // If min_capacity<1 or scratch_capacity<min_capacity, the default
+  // implementation returns NULL and sets *result_capacity=0 instead.
+  // @draft ICU 4.2
+  virtual char* GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity);
+
+  // Flush internal buffers.
+  // Some byte sinks use internal buffers or provide buffering
+  // and require calling Flush() at the end of the stream.
+  // The default implementation of Flush() does nothing.
+  // @draft ICU 4.2
+  virtual void Flush();
+
+private:
+  // Declared private and left unimplemented to forbid copying.
+  ByteSink(const ByteSink &); // copy constructor not implemented
+  ByteSink &operator=(const ByteSink &); // assignment operator not implemented
+};
+
+// -------------------------------------------------------------
+// Some standard implementations
+
+// Implementation of ByteSink that writes to a flat byte array,
+// with bounds-checking:
+// This sink will not write more than capacity bytes to outbuf.
+// If more than capacity bytes are Append()ed, then excess bytes are ignored,
+// and Overflowed() will return true.
+// Overflow does not cause a runtime error.
+// @draft ICU 4.2
+class U_COMMON_API CheckedArrayByteSink : public ByteSink {
+public:
+  // Constructs a sink that writes into outbuf.
+  // Only the pointer is stored; the array itself is not copied.
+  // A negative capacity is treated as 0.
+  // @draft ICU 4.2
+  CheckedArrayByteSink(char* outbuf, int32_t capacity);
+  // Appends bytes[0,n-1]; bytes beyond the remaining capacity are dropped
+  // and the overflow flag is set. Does nothing if n<=0.
+  // @draft ICU 4.2
+  virtual void Append(const char* bytes, int32_t n);
+  // Returns the unused tail of outbuf if it holds at least min_capacity
+  // bytes, otherwise the scratch buffer.
+  // Returns NULL and sets *result_capacity=0 if min_capacity<1 or
+  // scratch_capacity<min_capacity.
+  // @draft ICU 4.2
+  virtual char* GetAppendBuffer(int32_t min_capacity,
+                                int32_t desired_capacity_hint,
+                                char* scratch, int32_t scratch_capacity,
+                                int32_t* result_capacity);
+  // Returns the number of bytes actually written to the sink.
+  // @draft ICU 4.2
+  int32_t NumberOfBytesWritten() const { return size_; }
+  // Returns true if any bytes were discarded, i.e., if there was an
+  // attempt to write more than 'capacity' bytes.
+  // @draft ICU 4.2
+  UBool Overflowed() const { return overflowed_; }
+private:
+  char* outbuf_;             // Destination array passed to the constructor.
+  const int32_t capacity_;   // Size of outbuf_ in bytes, never negative.
+  int32_t size_;             // Number of bytes written so far.
+  // UBool rather than bool: matches the return type of Overflowed() and the
+  // boolean type used throughout the rest of this API.
+  UBool overflowed_;
+  CheckedArrayByteSink(); // default constructor not implemented
+  CheckedArrayByteSink(const CheckedArrayByteSink &); // copy constructor not implemented
+  CheckedArrayByteSink &operator=(const CheckedArrayByteSink &); // assignment operator not implemented
+};
+
+#if U_HAVE_STD_STRING
+
+// Implementation of ByteSink that writes to a "string".
+// StringClass must provide an append(const char*, int32_t)-compatible
+// member function, since that is what Append() calls on the destination.
+// Only the pointer to the destination string is stored.
+// @draft ICU 4.2
+template<typename StringClass>
+class StringByteSink : public ByteSink {
+ public:
+  // Constructs a sink that appends to *dest.
+  // @draft ICU 4.2
+  StringByteSink(StringClass* dest) : dest_(dest) { }
+  // Appends data[0,n-1] to the destination string.
+  // @draft ICU 4.2
+  virtual void Append(const char* data, int32_t n) { dest_->append(data, n); }
+ private:
+  StringClass* dest_;  // Destination string passed to the constructor.
+  StringByteSink(); // default constructor not implemented
+  StringByteSink(const StringByteSink &); // copy constructor not implemented
+  StringByteSink &operator=(const StringByteSink &); // assignment operator not implemented
+};
+
+#endif
+
+U_NAMESPACE_END
+
+#endif  // __BYTESTREAM_H__
--- a/icu4c/source/common/unicode/stringpiece.h
+++ b/icu4c/source/common/unicode/stringpiece.h
@ -0,0 +1,113 @@
+// Copyright (C) 2009, International Business Machines
+// Corporation and others. All Rights Reserved.
+//
+// Copyright 2001 and onwards Google Inc.
+// Author: Sanjay Ghemawat
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+//
+// Arghh!  I wish C++ literals were "string".
+
+#ifndef __STRINGPIECE_H__
+#define __STRINGPIECE_H__
+
+/**
+ * \file 
+ * \brief C++ API: Read-only byte string wrapper class.
+ */
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/std_string.h"
+
+U_NAMESPACE_BEGIN
+
+// A pointer plus a length that together describe a read-only piece of
+// character data. Only the pointer is stored; the bytes are not copied.
+// @draft ICU 4.2
+class U_COMMON_API StringPiece : public UMemory {
+ private:
+  const char*   ptr_;     // Start of the referenced bytes; may be NULL.
+  int32_t       length_;  // Number of bytes in the piece.
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+
+  // Default constructor: NULL pointer, length 0.
+  // @draft ICU 4.2
+  StringPiece() : ptr_(NULL), length_(0) { }
+  // Constructs from a NUL-terminated string; NULL yields length 0.
+  // @draft ICU 4.2
+  StringPiece(const char* str);
+#if U_HAVE_STD_STRING
+  // Constructs from a standard string, referencing its data() and size().
+  // @draft ICU 4.2
+  StringPiece(const U_STD_NSQ string& str)
+    : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
+#endif
+  // Constructs from a pointer and a length.
+  // Note: len is stored as is; it is not checked for negative values.
+  // @draft ICU 4.2
+  StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
+  // Substring of another StringPiece.
+  // pos should be non-negative and <= x.length();
+  // out-of-range values are pinned to that range.
+  // @draft ICU 4.2
+  StringPiece(const StringPiece& x, int32_t pos);
+  // Substring of another StringPiece.
+  // pos should be non-negative and <= x.length();
+  // out-of-range values are pinned to that range.
+  // len should be non-negative; a negative len is treated as 0, and len
+  // will be pinned to at most x.length() - pos.
+  // @draft ICU 4.2
+  StringPiece(const StringPiece& x, int32_t pos, int32_t len);
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  // @draft ICU 4.2
+  const char* data() const { return ptr_; }
+  // Returns the length in bytes; same as length().
+  // @draft ICU 4.2
+  int32_t size() const { return length_; }
+  // Returns the length in bytes; same as size().
+  // @draft ICU 4.2
+  int32_t length() const { return length_; }
+  // Returns true if the length is 0.
+  // @draft ICU 4.2
+  UBool empty() const { return length_ == 0; }
+
+  // Resets to a NULL pointer with length 0.
+  // @draft ICU 4.2
+  void clear() { ptr_ = NULL; length_ = 0; }
+
+  // Removes the first n bytes. n is pinned to at most the current length;
+  // a negative n is ignored.
+  // @draft ICU 4.2
+  void remove_prefix(int32_t n) {
+    if (n >= 0) {
+      if (n > length_) {
+        n = length_;
+      }
+      ptr_ += n;
+      length_ -= n;
+    }
+  }
+
+  // Removes the last n bytes. If n exceeds the current length, the piece
+  // becomes empty; a negative n is ignored.
+  // @draft ICU 4.2
+  void remove_suffix(int32_t n) {
+    if (n >= 0) {
+      if (n <= length_) {
+        length_ -= n;
+      } else {
+        length_ = 0;
+      }
+    }
+  }
+
+  // Maximum int32_t value; the default length argument of substr(),
+  // where it gets pinned to the remaining length.
+  // @draft ICU 4.2
+  static const int32_t npos = 0x7fffffff;
+
+  // Returns the substring starting at pos with at most n bytes,
+  // with pos and n pinned as in the three-argument constructor.
+  // @draft ICU 4.2
+  StringPiece substr(int32_t pos, int32_t n = npos) const {
+    return StringPiece(*this, pos, n);
+  }
+};
+
+U_NAMESPACE_END
+
+#endif  // __STRINGPIECE_H__
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@ -26,7 +26,11 @@
 * \brief C++ API: Unicode String 
 */

+#include "unicode/utypes.h"
 #include "unicode/rep.h"
+#include "unicode/std_string.h"
+#include "unicode/stringpiece.h"
+#include "unicode/bytestream.h"

 struct UConverter;          // unicode/ucnv.h
 class  StringThreadTest;
@ -1532,6 +1536,40 @@ public:
                  UConverter *cnv,
                  UErrorCode &errorCode) const;

+#endif
+
+  /**
+   * Convert the UnicodeString to UTF-8 and write the result
+   * to a ByteSink. This is called by toUTF8String().
+   * Unpaired surrogates are replaced with U+FFFD.
+   * Calls u_strToUTF8WithSub().
+   *
+   * @param sink A ByteSink to which the UTF-8 version of the string is written.
+   * @draft ICU 4.2
+   * @see toUTF8String
+   */
+  void toUTF8(ByteSink &sink) const;
+
+#if U_HAVE_STD_STRING
+
+  /**
+   * Convert the UnicodeString to UTF-8 and append the result
+   * to a standard string.
+   * Unpaired surrogates are replaced with U+FFFD.
+   * Calls toUTF8().
+   *
+   * @param result A standard string (or a compatible object)
+   *        to which the UTF-8 version of the string is appended.
+   * @return The string object.
+   * @draft ICU 4.2
+   * @see toUTF8
+   */
+  template<typename StringClass>
+  StringClass &toUTF8String(StringClass &result) const {
+    // toUTF8() takes a non-const ByteSink reference, which cannot bind to
+    // a temporary in standard C++; use a named sink object instead.
+    StringByteSink<StringClass> sink(&result);
+    toUTF8(sink);
+    return result;
+  }
+
 #endif

  /**
@ -2917,6 +2955,21 @@ public:
   */
  virtual ~UnicodeString();

+  /**
+   * Create a UnicodeString from a UTF-8 string.
+   * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
+   * Calls u_strFromUTF8WithSub().
+   *
+   * @param utf8 UTF-8 input string.
+   *             Note that a StringPiece can be implicitly constructed
+   *             from a std::string or a NUL-terminated const char * string.
+   * @return A UnicodeString with equivalent UTF-16 contents.
+   * @see toUTF8
+   * @see toUTF8String
+   * @draft ICU 4.2
+   */
+  static UnicodeString fromUTF8(const StringPiece &utf8);
+
  /**
   * Create a UnicodeString from a UTF-32 string.
   * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@ -379,9 +379,34 @@ UnicodeString::~UnicodeString()
 // Factory methods
 //========================================

+// Creates a UnicodeString from UTF-8 input via u_strFromUTF8WithSub(),
+// substituting U+FFFD for illegal sequences.
+// Returns a bogus string if the conversion reports a failure.
+UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
+  UnicodeString result;
+  int32_t length = utf8.length();
+  int32_t capacity;
+  // The UTF-16 string will be at most as long as the UTF-8 string.
+  if(length <= US_STACKBUF_SIZE) {
+    capacity = US_STACKBUF_SIZE;
+  } else {
+    capacity = length + 1;  // +1 for the terminating NUL.
+  }
+  UChar *utf16 = result.getBuffer(capacity);
+  // Start at 0 so that releaseBuffer() below never receives an
+  // indeterminate value if the conversion fails before it writes
+  // the output length (for example when it rejects its arguments).
+  int32_t length16 = 0;
+  UErrorCode errorCode = U_ZERO_ERROR;
+  u_strFromUTF8WithSub(utf16, result.getCapacity(), &length16,
+      utf8.data(), length,
+      0xfffd,  // Substitution character.
+      NULL,    // Don't care about number of substitutions.
+      &errorCode);
+  result.releaseBuffer(length16);
+  if(U_FAILURE(errorCode)) {
+    result.setToBogus();
+  }
+  return result;
+}
+
 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
  UnicodeString result;
-  int32_t capacity = length;
+  int32_t capacity;
  // Most UTF-32 strings will be BMP-only and result in a same-length
  // UTF-16 string. We overestimate the capacity just slightly,
  // just in case there are a few supplementary characters.
@ -756,6 +781,51 @@ UnicodeString::extractBetween(int32_t start,
  doExtract(start, limit - start, target);
 }

+// Converts the whole string to UTF-8 and appends the bytes to the sink.
+// UTF-8 output needs at most 3 bytes per source UChar (writing systems
+// like Indic, Thai and CJK are the 3:1 "worst cases"), which is why
+// 3*length is passed to the sink as the capacity hint.
+void
+UnicodeString::toUTF8(ByteSink &sink) const {
+  const int32_t srcLength = length();
+  if(srcLength == 0) {
+    return;  // Empty string: the sink is not touched.
+  }
+  char scratch[1024];
+  int32_t destCapacity = (int32_t)sizeof(scratch);
+  // Ask the sink for a buffer, offering the stack buffer as scratch space.
+  char *dest = sink.GetAppendBuffer(srcLength < destCapacity ? srcLength : destCapacity,
+                                    3*srcLength,
+                                    scratch, destCapacity,
+                                    &destCapacity);
+  int32_t destLength = 0;
+  UErrorCode status = U_ZERO_ERROR;
+  u_strToUTF8WithSub(dest, destCapacity, &destLength,
+                     getBuffer(), srcLength,
+                     0xFFFD,  // Standard substitution character.
+                     NULL,    // Don't care about number of substitutions.
+                     &status);
+  // Non-NULL only if the first attempt overflowed and a heap buffer
+  // of the reported size could be allocated.
+  char *heapBuffer = NULL;
+  if(status == U_BUFFER_OVERFLOW_ERROR) {
+    heapBuffer = (char *)uprv_malloc(destLength);
+    if(heapBuffer == NULL) {
+      status = U_MEMORY_ALLOCATION_ERROR;
+    } else {
+      // Convert again into the exactly-sized heap buffer.
+      dest = heapBuffer;
+      status = U_ZERO_ERROR;
+      u_strToUTF8WithSub(dest, destLength, &destLength,
+                         getBuffer(), srcLength,
+                         0xFFFD,  // Standard substitution character.
+                         NULL,    // Don't care about number of substitutions.
+                         &status);
+    }
+  }
+  // Nothing is appended if the conversion failed.
+  if(U_SUCCESS(status)) {
+    sink.Append(dest, destLength);
+  }
+  if(heapBuffer != NULL) {
+    uprv_free(heapBuffer);
+  }
+}
+
 int32_t
 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
  int32_t length32=0;
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@ -5,6 +5,7 @@
 ********************************************************************/

 #include "ustrtest.h"
+#include "unicode/std_string.h"
 #include "unicode/unistr.h"
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
@ -59,6 +60,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
        case 16: name = "TestCharString"; if (exec) TestCharString(); break;
        case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
        case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
+        case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;

        default: name = ""; break; //needed to end loop
    }
@ -1739,3 +1741,71 @@ UnicodeStringTest::TestUTF32() {
        errln("UnicodeString::toUTF32() did not create the expected string.");
    }
 }
+
+// Tests UnicodeString::fromUTF8(), toUTF8() and toUTF8String(),
+// including the substitution of U+FFFD for surrogates and an
+// out-of-range code point.
+void
+UnicodeStringTest::TestUTF8() {
+    // UTF-8 input for fromUTF8(): contains encodings of the surrogates
+    // 0xd900 and 0xdc00 and of the out-of-range value 0x110000, which are
+    // expected to be replaced with U+FFFD, plus valid supplementary code points.
+    static const uint8_t utf8[] = {
+        // Code points:
+        // 0x41, 0xd900,
+        // 0x61, 0xdc00,
+        // 0x110000, 0x5a,
+        // 0x50000, 0x7a,
+        // 0x10000, 0x20000,
+        // 0xe0000, 0x10ffff
+        0x41, 0xed, 0xa4, 0x80,
+        0x61, 0xed, 0xb0, 0x80,
+        0xf4, 0x90, 0x80, 0x80, 0x5a,
+        0xf1, 0x90, 0x80, 0x80, 0x7a,
+        0xf0, 0x90, 0x80, 0x80, 0xf0, 0xa0, 0x80, 0x80,
+        0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+    };
+    // Expected UTF-16, one row per row of the input above.
+    static const UChar expected_utf16[] = {
+        0x41, 0xfffd,
+        0x61, 0xfffd,
+        0xfffd, 0x5a,
+        0xd900, 0xdc00, 0x7a,
+        0xd800, 0xdc00, 0xd840, 0xdc00,
+        0xdb40, 0xdc00, 0xdbff, 0xdfff
+    };
+    // fromUTF8() with an explicit pointer+length StringPiece.
+    UnicodeString from8 = UnicodeString::fromUTF8(StringPiece((const char *)utf8, (int32_t)sizeof(utf8)));
+    UnicodeString expected(FALSE, expected_utf16, LENGTHOF(expected_utf16));
+
+    if(from8 != expected) {
+        errln("UnicodeString::fromUTF8(StringPiece) did not create the expected string.");
+    }
+#if U_HAVE_STD_STRING
+    // fromUTF8() with a standard string, implicitly converted to a StringPiece.
+    U_STD_NSQ string utf8_string((const char *)utf8, sizeof(utf8));
+    UnicodeString from8b = UnicodeString::fromUTF8(utf8_string);
+    if(from8b != expected) {
+        errln("UnicodeString::fromUTF8(std::string) did not create the expected string.");
+    }
+#endif
+
+    // UTF-16 input for toUTF8(): the unpaired surrogates 0xd900 and 0xdc00
+    // are expected to come out as U+FFFD (ef bf bd).
+    static const UChar utf16[] = {
+        0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0xd800, 0xdc00, 0xdbff, 0xdfff
+    };
+    static const uint8_t expected_utf8[] = {
+        0x41, 0xef, 0xbf, 0xbd, 0x61, 0xef, 0xbf, 0xbd, 0x5a, 0xf1, 0x90, 0x80, 0x80, 0x7a,
+        0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+    };
+    UnicodeString us(FALSE, utf16, LENGTHOF(utf16));
+
+    // toUTF8() into a fixed-size array sink that is larger than the expected output.
+    char buffer[64];
+    CheckedArrayByteSink sink(buffer, (int32_t)sizeof(buffer));
+    us.toUTF8(sink);
+    if( sink.NumberOfBytesWritten() != (int32_t)sizeof(expected_utf8) ||
+        0 != uprv_memcmp(buffer, expected_utf8, sizeof(expected_utf8))
+    ) {
+        errln("UnicodeString::toUTF8() did not create the expected string.");
+    }
+#if U_HAVE_STD_STRING
+    // Initial contents for testing that toUTF8String() appends.
+    U_STD_NSQ string result8 = "-->";
+    U_STD_NSQ string expected8 = "-->" + U_STD_NSQ string((const char *)expected_utf8, sizeof(expected_utf8));
+    // Use the return value just for testing.
+    U_STD_NSQ string &result8r = us.toUTF8String(result8);
+    // The returned reference must be the same object that was passed in.
+    if(result8r != expected8 || &result8r != &result8) {
+        errln("UnicodeString::toUTF8String() did not create the expected string.");
+    }
+#endif
+}
--- a/icu4c/source/test/intltest/ustrtest.h
+++ b/icu4c/source/test/intltest/ustrtest.h
@ -76,6 +76,7 @@ public:
    void TestCharString(void);
    void TestNameSpace(void);
    void TestUTF32(void);
+    void TestUTF8(void);
 };

 class StringCaseTest: public IntlTest {