ICU-6765 add API: StringPiece, ByteSink(s), UnicodeString::toUTF8() and fromUTF8()

X-SVN-Rev: 25519
This commit is contained in:
Markus Scherer 2009-03-06 19:19:00 +00:00
parent 025e6a09ae
commit 63ea2e06da
9 changed files with 656 additions and 1 deletions

View File

@ -0,0 +1,64 @@
// Copyright (C) 2009, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2007 Google Inc. All Rights Reserved.
// Author: sanjay@google.com (Sanjay Ghemawat)
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
U_NAMESPACE_BEGIN
// Default implementation: hands back the caller's scratch buffer.
// The request is honored only when min_capacity is at least 1 and the
// scratch buffer can hold min_capacity bytes; otherwise *result_capacity
// is set to 0 and NULL is returned.
// desired_capacity_hint is not used here because nothing is allocated.
char* ByteSink::GetAppendBuffer(int32_t min_capacity,
                                int32_t desired_capacity_hint,
                                char* scratch, int32_t scratch_capacity,
                                int32_t* result_capacity) {
  if (min_capacity >= 1 && min_capacity <= scratch_capacity) {
    // The whole scratch buffer is available to the caller.
    *result_capacity = scratch_capacity;
    return scratch;
  }
  *result_capacity = 0;
  return NULL;
}
// Default Flush(): intentionally does nothing.
// Subclasses that buffer internally override this.
void ByteSink::Flush() {}
// Sets up a sink that writes into outbuf[0..capacity-1].
// A negative capacity is treated as 0. The sink starts out empty and
// not overflowed.
CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
    : outbuf_(outbuf),
      capacity_(capacity > 0 ? capacity : 0),
      size_(0),
      overflowed_(false) {
}
// Appends up to n bytes to the output array.
// Calls with n <= 0 are ignored. If fewer than n bytes of room remain,
// only the bytes that fit are stored and the overflow flag is set.
// No copy is made when bytes already points at the next write position,
// which is the case for a buffer obtained from GetAppendBuffer().
void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
  if (n > 0) {
    const int32_t room = capacity_ - size_;
    if (room < n) {
      overflowed_ = true;
      n = room;  // truncate to what still fits
    }
    char* const dest = outbuf_ + size_;
    if (n > 0 && dest != bytes) {
      memcpy(dest, bytes, n);
    }
    size_ += n;
  }
}
// Returns a buffer the caller may fill before calling Append().
// Invalid requests (min_capacity < 1, or a scratch buffer smaller than
// min_capacity) yield NULL with *result_capacity set to 0.
// If the unused tail of the output array can hold min_capacity bytes,
// that tail is returned with its full remaining size; otherwise the
// caller's scratch buffer is returned.
// desired_capacity_hint is not used because the array is never resized.
char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
                                            int32_t desired_capacity_hint,
                                            char* scratch,
                                            int32_t scratch_capacity,
                                            int32_t* result_capacity) {
  if (min_capacity < 1 || scratch_capacity < min_capacity) {
    *result_capacity = 0;
    return NULL;
  }
  const int32_t room = capacity_ - size_;
  if (room < min_capacity) {
    // Not enough space left in the array: fall back to the scratch buffer.
    *result_capacity = scratch_capacity;
    return scratch;
  }
  *result_capacity = room;
  return outbuf_ + size_;
}
U_NAMESPACE_END

View File

@ -3401,6 +3401,50 @@
<Filter
Name="strings"
>
<File
RelativePath=".\bytestream.cpp"
>
</File>
<File
RelativePath=".\unicode\bytestream.h"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\chariter.cpp"
>
@ -3549,6 +3593,50 @@
/>
</FileConfiguration>
</File>
<File
RelativePath=".\stringpiece.cpp"
>
</File>
<File
RelativePath=".\unicode\stringpiece.h"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\ucasemap.c"
>

View File

@ -0,0 +1,45 @@
// Copyright (C) 2009, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2004 and onwards Google Inc.
//
// Author: wilsonh@google.com (Wilson Hsieh)
//
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "cstring.h"
U_NAMESPACE_BEGIN
// Wraps a NUL-terminated C string; the length is measured with
// uprv_strlen(). A NULL pointer produces a piece of length 0.
StringPiece::StringPiece(const char* str)
    : ptr_(str), length_(0) {
  if (str != NULL) {
    length_ = static_cast<int32_t>(uprv_strlen(str));
  }
}
// Suffix of x starting at pos.
// A negative pos is pinned to 0 and a pos beyond the end of x is pinned
// to x.length_, so the result never starts outside x.
StringPiece::StringPiece(const StringPiece& x, int32_t pos) {
  const int32_t start =
      (pos < 0) ? 0 : ((pos > x.length_) ? x.length_ : pos);
  ptr_ = x.ptr_ + start;
  length_ = x.length_ - start;
}
// Substring of x: at most len bytes starting at pos.
// pos is pinned to the range [0, x.length_]; len is then pinned to the
// range [0, x.length_ - pos] using the pinned pos.
StringPiece::StringPiece(const StringPiece& x, int32_t pos, int32_t len) {
  const int32_t start =
      (pos < 0) ? 0 : ((pos > x.length_) ? x.length_ : pos);
  const int32_t remaining = x.length_ - start;
  const int32_t count =
      (len < 0) ? 0 : ((len > remaining) ? remaining : len);
  ptr_ = x.ptr_ + start;
  length_ = count;
}
// Out-of-class definition of the static constant; its value (0x7fffffff)
// is given by the in-class initializer in unicode/stringpiece.h.
const int32_t StringPiece::npos;
U_NAMESPACE_END

View File

@ -0,0 +1,151 @@
// Copyright (C) 2009, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2007 Google Inc. All Rights Reserved.
// Author: sanjay@google.com (Sanjay Ghemawat)
//
// Abstract interface that consumes a sequence of bytes (ByteSink).
//
// Used so that we can write a single piece of code that can operate
// on a variety of output string types.
//
// Various implementations of this interface are provided:
// ByteSink:
// CheckedArrayByteSink Write to a flat array, with bounds checking
// StringByteSink Write to an STL string
#ifndef __BYTESTREAM_H__
#define __BYTESTREAM_H__
/**
* \file
* \brief C++ API: Interface for writing bytes, and implementation classes.
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/std_string.h"
U_NAMESPACE_BEGIN
// A ByteSink can be filled with bytes.
// Abstract base class: subclasses must implement Append(); GetAppendBuffer()
// and Flush() have default implementations. Instances cannot be copied
// (copy constructor and assignment operator are private and not implemented).
// @draft ICU 4.2
class U_COMMON_API ByteSink : public UMemory {
public:
// Default constructor; does nothing.
// @draft ICU 4.2
ByteSink() { }
// Virtual destructor, so that sinks can be destroyed via a ByteSink pointer.
// @draft ICU 4.2
virtual ~ByteSink() { }
// Append "bytes[0,n-1]" to this.
// @draft ICU 4.2
virtual void Append(const char* bytes, int32_t n) = 0;
// Returns a writable buffer for appending and writes the buffer's capacity to
// *result_capacity. On success, *result_capacity>=min_capacity.
// May return a pointer to the caller-owned scratch buffer which must have
// scratch_capacity>=min_capacity.
// If min_capacity<1 or scratch_capacity<min_capacity, the implementations
// in bytestream.cpp set *result_capacity=0 and return NULL.
// The returned buffer is only valid until the next operation
// on this ByteSink.
//
// After writing at most *result_capacity bytes, call Append() with the
// pointer returned from this function and the number of bytes written.
// Many Append() implementations will avoid copying bytes if this function
// returned an internal buffer.
//
// Partial usage example:
// int32_t capacity;
// char* buffer = sink->GetAppendBuffer(..., &capacity);
// ... Write n bytes into buffer, with n <= capacity.
// sink->Append(buffer, n);
// In many implementations, that call to Append will avoid copying bytes.
//
// If the ByteSink allocates or reallocates an internal buffer, it should use
// the desired_capacity_hint if appropriate.
// If a caller cannot provide a reasonable guess at the desired capacity,
// it should pass desired_capacity_hint=0.
//
// If a non-scratch buffer is returned, the caller may only pass
// a prefix to it to Append().
// That is, it is not correct to pass an interior pointer to Append().
//
// The default implementation returns the scratch buffer (with
// *result_capacity=scratch_capacity) whenever the arguments are valid.
// @draft ICU 4.2
virtual char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch, int32_t scratch_capacity,
int32_t* result_capacity);
// Flush internal buffers.
// Some byte sinks use internal buffers or provide buffering
// and require calling Flush() at the end of the stream.
// The default implementation of Flush() does nothing.
// @draft ICU 4.2
virtual void Flush();
private:
// Copying is disabled: declared private and intentionally left undefined.
ByteSink(const ByteSink &); // copy constructor not implemented
ByteSink &operator=(const ByteSink &); // assignment operator not implemented
};
// -------------------------------------------------------------
// Some standard implementations
// Implementation of ByteSink that writes to a flat byte array,
// with bounds-checking:
// This sink will not write more than capacity bytes to outbuf.
// If more than capacity bytes are Append()ed, then excess bytes are ignored,
// and Overflowed() will return true.
// Overflow does not cause a runtime error.
// @draft ICU 4.2
class U_COMMON_API CheckedArrayByteSink : public ByteSink {
public:
// Constructs a sink that writes to outbuf[0..capacity-1].
// A negative capacity is treated as 0.
// @draft ICU 4.2
CheckedArrayByteSink(char* outbuf, int32_t capacity);
// Appends bytes[0,n-1], truncating to the remaining capacity.
// Calls with n<=0 are ignored.
// @draft ICU 4.2
virtual void Append(const char* bytes, int32_t n);
// Returns the unused tail of the output array if it can hold at least
// min_capacity bytes, otherwise the caller's scratch buffer.
// Returns NULL (and *result_capacity=0) if min_capacity<1 or
// scratch_capacity<min_capacity.
// @draft ICU 4.2
virtual char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch, int32_t scratch_capacity,
int32_t* result_capacity);
// Returns the number of bytes actually written to the sink.
// @draft ICU 4.2
int32_t NumberOfBytesWritten() const { return size_; }
// Returns true if any bytes were discarded, i.e., if there was an
// attempt to write more than 'capacity' bytes.
// @draft ICU 4.2
UBool Overflowed() const { return overflowed_; }
private:
// Destination array supplied by the caller.
char* outbuf_;
// Size of the destination array in bytes; never negative.
const int32_t capacity_;
// Number of bytes stored so far.
int32_t size_;
// Set once an Append() had to drop bytes.
bool overflowed_;
// Construction without a buffer and copying are disabled:
// declared private and intentionally left undefined.
CheckedArrayByteSink(); // default constructor not implemented
CheckedArrayByteSink(const CheckedArrayByteSink &); // copy constructor not implemented
CheckedArrayByteSink &operator=(const CheckedArrayByteSink &); // assignment operator not implemented
};
#if U_HAVE_STD_STRING
// Implementation of ByteSink that writes to a "string".
// StringClass may be any type that offers an append(const char*, length)
// member function, for example std::string.
// GetAppendBuffer() and Flush() are inherited from ByteSink.
// @draft ICU 4.2
template<typename StringClass>
class StringByteSink : public ByteSink {
 public:
  // Constructs a sink that appends to *dest.
  // Only the pointer is stored; dest must stay valid while the sink is used.
  // @draft ICU 4.2
  StringByteSink(StringClass* dest) : dest_(dest) { }
  // Appends data[0,n-1] to the destination string.
  // A non-positive n is ignored, as in CheckedArrayByteSink::Append().
  // Without this check a negative n would be converted to a huge unsigned
  // count by string classes whose append() takes an unsigned length.
  // @draft ICU 4.2
  virtual void Append(const char* data, int32_t n) {
    if (n > 0) {
      dest_->append(data, n);
    }
  }
 private:
  // Destination string; supplied by the caller, never deleted here.
  StringClass* dest_;
  // Construction without a destination and copying are disabled:
  // declared private and intentionally left undefined.
  StringByteSink();  // default constructor not implemented
  StringByteSink(const StringByteSink &);  // copy constructor not implemented
  StringByteSink &operator=(const StringByteSink &);  // assignment operator not implemented
};
#endif
U_NAMESPACE_END
#endif // __BYTESTREAM_H__

View File

@ -0,0 +1,113 @@
// Copyright (C) 2009, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2001 and onwards Google Inc.
// Author: Sanjay Ghemawat
//
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#ifndef __STRINGPIECE_H__
#define __STRINGPIECE_H__
/**
* \file
* \brief C++ API: Read-only byte string wrapper class.
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/std_string.h"
U_NAMESPACE_BEGIN
// A read-only view of a byte string: a pointer plus a length.
// Only the pointer and the length are stored; the bytes are not copied.
// @draft ICU 4.2
class U_COMMON_API StringPiece : public UMemory {
 private:
  const char* ptr_;
  int32_t length_;

 public:
  // The single-argument constructors are deliberately not explicit, so
  // that a "const char*" or a "string" can be passed wherever a
  // "StringPiece" is expected.

  // Empty piece: NULL pointer, length 0.
  // @draft ICU 4.2
  StringPiece() : ptr_(NULL), length_(0) { }

  // Piece covering a NUL-terminated C string (NULL gives length 0).
  // @draft ICU 4.2
  StringPiece(const char* str);

#if U_HAVE_STD_STRING
  // Piece covering the contents of a standard string.
  // @draft ICU 4.2
  StringPiece(const U_STD_NSQ string& str)
      : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
#endif

  // Piece covering len bytes starting at offset. The arguments are stored
  // as given, without validation.
  // @draft ICU 4.2
  StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }

  // Substring of another StringPiece, from pos to the end.
  // pos should be non-negative and <= x.length(); values outside that
  // range are pinned to it.
  // @draft ICU 4.2
  StringPiece(const StringPiece& x, int32_t pos);

  // Substring of another StringPiece.
  // pos should be non-negative and <= x.length(); values outside that
  // range are pinned to it.
  // len should be non-negative and is pinned to at most x.length() - pos.
  // @draft ICU 4.2
  StringPiece(const StringPiece& x, int32_t pos, int32_t len);

  // Pointer to the first byte. The bytes may contain embedded NULs and
  // may or may not be followed by a terminating NUL, so data() should not
  // normally be handed to a routine expecting a NUL-terminated string.
  // @draft ICU 4.2
  const char* data() const { return ptr_; }

  // Number of bytes in the piece.
  // @draft ICU 4.2
  int32_t size() const { return length_; }

  // Number of bytes in the piece; same value as size().
  // @draft ICU 4.2
  int32_t length() const { return length_; }

  // True if the piece has length 0.
  // @draft ICU 4.2
  UBool empty() const { return length_ == 0; }

  // Resets the piece to the empty state (NULL pointer, length 0).
  // @draft ICU 4.2
  void clear() { ptr_ = NULL; length_ = 0; }

  // Drops the first n bytes. A negative n is ignored; an n larger than
  // the current length drops everything.
  // @draft ICU 4.2
  void remove_prefix(int32_t n) {
    if (n < 0) {
      return;
    }
    const int32_t dropped = (n > length_) ? length_ : n;
    ptr_ += dropped;
    length_ -= dropped;
  }

  // Drops the last n bytes. A negative n is ignored; an n larger than
  // the current length leaves a length of 0.
  // @draft ICU 4.2
  void remove_suffix(int32_t n) {
    if (n < 0) {
      return;
    }
    length_ = (n <= length_) ? (length_ - n) : 0;
  }

  // Largest int32_t value; used as the default length in substr().
  // @draft ICU 4.2
  static const int32_t npos = 0x7fffffff;

  // Returns the substring of at most n bytes starting at pos, built with
  // the (x, pos, len) constructor and therefore pinned the same way.
  // @draft ICU 4.2
  StringPiece substr(int32_t pos, int32_t n = npos) const {
    return StringPiece(*this, pos, n);
  }
};
U_NAMESPACE_END
#endif // __STRINGPIECE_H__

View File

@ -26,7 +26,11 @@
* \brief C++ API: Unicode String
*/
#include "unicode/utypes.h"
#include "unicode/rep.h"
#include "unicode/std_string.h"
#include "unicode/stringpiece.h"
#include "unicode/bytestream.h"
struct UConverter; // unicode/ucnv.h
class StringThreadTest;
@ -1532,6 +1536,40 @@ public:
UConverter *cnv,
UErrorCode &errorCode) const;
#endif
/**
* Convert the UnicodeString to UTF-8 and write the result
* to a ByteSink. This is called by toUTF8String().
* Unpaired surrogates are replaced with U+FFFD.
* Calls u_strToUTF8WithSub().
* Nothing is written for an empty string.
*
* @param sink A ByteSink to which the UTF-8 version of the string is written.
* @draft ICU 4.2
* @see toUTF8String
*/
void toUTF8(ByteSink &sink) const;
#if U_HAVE_STD_STRING
/**
* Convert the UnicodeString to UTF-8 and append the result
* to a standard string.
* Unpaired surrogates are replaced with U+FFFD.
* Calls toUTF8().
*
* @param A standard string (or a compatible object)
* to which the UTF-8 version of the string is appended.
* @return The string object.
* @draft ICU 4.2
* @see toUTF8
*/
template<typename StringClass>
StringClass &toUTF8String(StringClass &result) const {
toUTF8(StringByteSink<StringClass>(&result));
return result;
}
#endif
/**
@ -2917,6 +2955,21 @@ public:
*/
virtual ~UnicodeString();
/**
* Create a UnicodeString from a UTF-8 string.
* Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
* Calls u_strFromUTF8WithSub().
*
* @param utf8 UTF-8 input string; utf8.data() and utf8.length() are
* passed to the conversion function.
* Note that a StringPiece can be implicitly constructed
* from a std::string or a NUL-terminated const char * string.
* @return A UnicodeString with equivalent UTF-16 contents.
* @see toUTF8
* @see toUTF8String
* @draft ICU 4.2
*/
static UnicodeString fromUTF8(const StringPiece &utf8);
/**
* Create a UnicodeString from a UTF-32 string.
* Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.

View File

@ -379,9 +379,34 @@ UnicodeString::~UnicodeString()
// Factory methods
//========================================
// Factory: builds a UnicodeString from the UTF-8 bytes described by utf8.
// Ill-formed input is replaced with U+FFFD by u_strFromUTF8WithSub();
// any error reported by the conversion yields a bogus string.
UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
  UnicodeString result;
  int32_t length = utf8.length();
  int32_t capacity;
  // The UTF-16 string will be at most as long as the UTF-8 string.
  if(length <= US_STACKBUF_SIZE) {
    capacity = US_STACKBUF_SIZE;
  } else {
    capacity = length + 1;  // +1 for the terminating NUL.
  }
  UChar *utf16 = result.getBuffer(capacity);
  // Start from 0: the conversion function is not guaranteed to store an
  // output length when it rejects its arguments, and releaseBuffer() below
  // must not read an indeterminate value.
  int32_t length16 = 0;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strFromUTF8WithSub(utf16, result.getCapacity(), &length16,
                       utf8.data(), length,
                       0xfffd,  // Substitution character.
                       NULL,    // Don't care about number of substitutions.
                       &errorCode);
  result.releaseBuffer(length16);
  if(U_FAILURE(errorCode)) {
    result.setToBogus();
  }
  return result;
}
UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
UnicodeString result;
int32_t capacity = length;
int32_t capacity;
// Most UTF-32 strings will be BMP-only and result in a same-length
// UTF-16 string. We overestimate the capacity just slightly,
// just in case there are a few supplementary characters.
@ -756,6 +781,51 @@ UnicodeString::extractBetween(int32_t start,
doExtract(start, limit - start, target);
}
// Converts this string to UTF-8 and hands the bytes to the sink in a
// single Append() call.
// A UTF-16 to UTF-8 conversion produces at most 3 bytes per UChar
// (the "worst cases" are writing systems like Indic, Thai and CJK),
// which is why 3*length is passed to the sink as the capacity hint.
// The first attempt converts into the buffer offered by the sink (or the
// local stack buffer); if that overflows, a heap buffer of the exact size
// is allocated and the conversion is repeated.
// If the conversion or the allocation fails, nothing is appended.
void
UnicodeString::toUTF8(ByteSink &sink) const {
  const int32_t length16 = length();
  if(length16 == 0) {
    return;  // Nothing to convert.
  }

  char stackBuffer[1024];
  const int32_t scratchCapacity = (int32_t)sizeof(stackBuffer);
  const int32_t minCapacity = length16 < scratchCapacity ? length16 : scratchCapacity;
  int32_t capacity = scratchCapacity;
  char *utf8 = sink.GetAppendBuffer(minCapacity,
                                    3*length16,
                                    stackBuffer, scratchCapacity,
                                    &capacity);

  int32_t length8 = 0;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strToUTF8WithSub(utf8, capacity, &length8,
                     getBuffer(), length16,
                     0xFFFD,  // Standard substitution character.
                     NULL,    // Don't care about number of substitutions.
                     &errorCode);

  // Non-NULL only if a heap buffer had to be allocated for a second pass.
  char *ownedBuffer = NULL;
  if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    // length8 now holds the required size; retry with an exact-size buffer.
    ownedBuffer = (char *)uprv_malloc(length8);
    utf8 = ownedBuffer;
    if(ownedBuffer == NULL) {
      errorCode = U_MEMORY_ALLOCATION_ERROR;
    } else {
      errorCode = U_ZERO_ERROR;
      u_strToUTF8WithSub(utf8, length8, &length8,
                         getBuffer(), length16,
                         0xFFFD,  // Standard substitution character.
                         NULL,    // Don't care about number of substitutions.
                         &errorCode);
    }
  }

  if(U_SUCCESS(errorCode)) {
    sink.Append(utf8, length8);
  }
  if(ownedBuffer != NULL) {
    uprv_free(ownedBuffer);
  }
}
int32_t
UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
int32_t length32=0;

View File

@ -5,6 +5,7 @@
********************************************************************/
#include "ustrtest.h"
#include "unicode/std_string.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
@ -59,6 +60,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
case 16: name = "TestCharString"; if (exec) TestCharString(); break;
case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;
default: name = ""; break; //needed to end loop
}
@ -1739,3 +1741,71 @@ UnicodeStringTest::TestUTF32() {
errln("UnicodeString::toUTF32() did not create the expected string.");
}
}
// Tests the UTF-8 conversion API of UnicodeString:
// fromUTF8() from a StringPiece and (if available) from a std::string,
// toUTF8() into a CheckedArrayByteSink, and (if available) toUTF8String().
void
UnicodeStringTest::TestUTF8() {
// Input for fromUTF8(): mixes valid sequences with encoded surrogates
// and an out-of-range code point.
static const uint8_t utf8[] = {
// Code points:
// 0x41, 0xd900,
// 0x61, 0xdc00,
// 0x110000, 0x5a,
// 0x50000, 0x7a,
// 0x10000, 0x20000,
// 0xe0000, 0x10ffff
0x41, 0xed, 0xa4, 0x80,
0x61, 0xed, 0xb0, 0x80,
0xf4, 0x90, 0x80, 0x80, 0x5a,
0xf1, 0x90, 0x80, 0x80, 0x7a,
0xf0, 0x90, 0x80, 0x80, 0xf0, 0xa0, 0x80, 0x80,
0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
};
// Expected UTF-16: the encoded surrogates and 0x110000 each become U+FFFD,
// supplementary code points become surrogate pairs.
static const UChar expected_utf16[] = {
0x41, 0xfffd,
0x61, 0xfffd,
0xfffd, 0x5a,
0xd900, 0xdc00, 0x7a,
0xd800, 0xdc00, 0xd840, 0xdc00,
0xdb40, 0xdc00, 0xdbff, 0xdfff
};
// Explicit StringPiece(pointer, length) construction.
UnicodeString from8 = UnicodeString::fromUTF8(StringPiece((const char *)utf8, (int32_t)sizeof(utf8)));
UnicodeString expected(FALSE, expected_utf16, LENGTHOF(expected_utf16));
if(from8 != expected) {
errln("UnicodeString::fromUTF8(StringPiece) did not create the expected string.");
}
#if U_HAVE_STD_STRING
// Same input via a standard string, which is passed directly to fromUTF8()
// and thereby exercises the implicit conversion to StringPiece.
U_STD_NSQ string utf8_string((const char *)utf8, sizeof(utf8));
UnicodeString from8b = UnicodeString::fromUTF8(utf8_string);
if(from8b != expected) {
errln("UnicodeString::fromUTF8(std::string) did not create the expected string.");
}
#endif
// Input for toUTF8(): contains unpaired surrogates (0xd900 and 0xdc00
// not adjacent to a partner) as well as valid surrogate pairs.
static const UChar utf16[] = {
0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0xd800, 0xdc00, 0xdbff, 0xdfff
};
// Expected UTF-8: each unpaired surrogate becomes EF BF BD (U+FFFD).
static const uint8_t expected_utf8[] = {
0x41, 0xef, 0xbf, 0xbd, 0x61, 0xef, 0xbf, 0xbd, 0x5a, 0xf1, 0x90, 0x80, 0x80, 0x7a,
0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
};
UnicodeString us(FALSE, utf16, LENGTHOF(utf16));
// The 64-byte buffer is larger than the expected 22-byte output.
char buffer[64];
CheckedArrayByteSink sink(buffer, (int32_t)sizeof(buffer));
us.toUTF8(sink);
// Check both the byte count and the bytes themselves.
if( sink.NumberOfBytesWritten() != (int32_t)sizeof(expected_utf8) ||
0 != uprv_memcmp(buffer, expected_utf8, sizeof(expected_utf8))
) {
errln("UnicodeString::toUTF8() did not create the expected string.");
}
#if U_HAVE_STD_STRING
// Initial contents for testing that toUTF8String() appends.
U_STD_NSQ string result8 = "-->";
U_STD_NSQ string expected8 = "-->" + U_STD_NSQ string((const char *)expected_utf8, sizeof(expected_utf8));
// Use the return value just for testing.
U_STD_NSQ string &result8r = us.toUTF8String(result8);
// The returned reference must be the same object that was passed in.
if(result8r != expected8 || &result8r != &result8) {
errln("UnicodeString::toUTF8String() did not create the expected string.");
}
#endif
}

View File

@ -76,6 +76,7 @@ public:
void TestCharString(void);
void TestNameSpace(void);
void TestUTF32(void);
void TestUTF8(void);
};
class StringCaseTest: public IntlTest {