ICU-6765 add API: StringPiece, ByteSink(s), UnicodeString::toUTF8() and fromUTF8()
X-SVN-Rev: 25519
This commit is contained in:
parent
025e6a09ae
commit
63ea2e06da
64
icu4c/source/common/bytestream.cpp
Normal file
64
icu4c/source/common/bytestream.cpp
Normal file
@ -0,0 +1,64 @@
|
||||
// Copyright (C) 2009, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Copyright 2007 Google Inc. All Rights Reserved.
|
||||
// Author: sanjay@google.com (Sanjay Ghemawat)
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Default buffer provider: this base implementation has no internal storage,
// so it can only hand back the caller-owned scratch buffer.
// Writes 0 to *result_capacity and returns NULL when min_capacity is below 1
// or the scratch buffer is smaller than min_capacity; otherwise writes
// scratch_capacity to *result_capacity and returns scratch.
// desired_capacity_hint is not consulted here.
char* ByteSink::GetAppendBuffer(int32_t min_capacity,
                                int32_t desired_capacity_hint,
                                char* scratch, int32_t scratch_capacity,
                                int32_t* result_capacity) {
    const bool scratch_is_usable =
        min_capacity >= 1 && scratch_capacity >= min_capacity;
    *result_capacity = scratch_is_usable ? scratch_capacity : 0;
    return scratch_is_usable ? scratch : NULL;
}
|
||||
|
||||
// Default Flush(): intentionally empty; subclasses that buffer output may override it.
void ByteSink::Flush() {}
|
||||
|
||||
// Wraps the caller-provided array outbuf. A negative capacity is treated as 0.
// The sink starts out empty and not overflowed.
CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
    : outbuf_(outbuf),
      capacity_(capacity > 0 ? capacity : 0),
      size_(0),
      overflowed_(false) {
}
|
||||
|
||||
void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
|
||||
if (n <= 0) {
|
||||
return;
|
||||
}
|
||||
int32_t available = capacity_ - size_;
|
||||
if (n > available) {
|
||||
n = available;
|
||||
overflowed_ = true;
|
||||
}
|
||||
if (n > 0 && bytes != (outbuf_ + size_)) {
|
||||
memcpy(outbuf_ + size_, bytes, n);
|
||||
}
|
||||
size_ += n;
|
||||
}
|
||||
|
||||
char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
|
||||
int32_t desired_capacity_hint,
|
||||
char* scratch,
|
||||
int32_t scratch_capacity,
|
||||
int32_t* result_capacity) {
|
||||
if (min_capacity < 1 || scratch_capacity < min_capacity) {
|
||||
*result_capacity = 0;
|
||||
return NULL;
|
||||
}
|
||||
int32_t available = capacity_ - size_;
|
||||
if (available >= min_capacity) {
|
||||
*result_capacity = available;
|
||||
return outbuf_ + size_;
|
||||
} else {
|
||||
*result_capacity = scratch_capacity;
|
||||
return scratch;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
@ -3401,6 +3401,50 @@
|
||||
<Filter
|
||||
Name="strings"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\bytestream.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\bytestream.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\chariter.cpp"
|
||||
>
|
||||
@ -3549,6 +3593,50 @@
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\stringpiece.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\stringpiece.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucasemap.c"
|
||||
>
|
||||
|
45
icu4c/source/common/stringpiece.cpp
Normal file
45
icu4c/source/common/stringpiece.cpp
Normal file
@ -0,0 +1,45 @@
|
||||
// Copyright (C) 2009, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Copyright 2004 and onwards Google Inc.
|
||||
//
|
||||
// Author: wilsonh@google.com (Wilson Hsieh)
|
||||
//
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "cstring.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Wraps a NUL-terminated C string; a NULL pointer yields an empty piece.
StringPiece::StringPiece(const char* str)
    : ptr_(str), length_(0) {
    if (str != NULL) {
        length_ = static_cast<int32_t>(uprv_strlen(str));
    }
}
|
||||
|
||||
// Suffix of x starting at pos.
// pos is pinned: negative values become 0, values past the end become x.length_.
StringPiece::StringPiece(const StringPiece& x, int32_t pos) {
    const int32_t start = (pos < 0) ? 0 : ((pos > x.length_) ? x.length_ : pos);
    ptr_ = x.ptr_ + start;
    length_ = x.length_ - start;
}
|
||||
|
||||
// Substring of x: at most len bytes starting at pos.
// pos is pinned to [0, x.length_]; len is pinned to [0, x.length_ - pos].
StringPiece::StringPiece(const StringPiece& x, int32_t pos, int32_t len) {
    const int32_t start = (pos < 0) ? 0 : ((pos > x.length_) ? x.length_ : pos);
    const int32_t remaining = x.length_ - start;
    const int32_t count = (len < 0) ? 0 : ((len > remaining) ? remaining : len);
    ptr_ = x.ptr_ + start;
    length_ = count;
}
|
||||
|
||||
// Out-of-class definition of the static constant; its value (0x7fffffff)
// is given by the in-class initializer in unicode/stringpiece.h.
const int32_t StringPiece::npos;
|
||||
|
||||
U_NAMESPACE_END
|
151
icu4c/source/common/unicode/bytestream.h
Normal file
151
icu4c/source/common/unicode/bytestream.h
Normal file
@ -0,0 +1,151 @@
|
||||
// Copyright (C) 2009, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Copyright 2007 Google Inc. All Rights Reserved.
|
||||
// Author: sanjay@google.com (Sanjay Ghemawat)
|
||||
//
|
||||
// Abstract interface that consumes a sequence of bytes (ByteSink).
|
||||
//
|
||||
// Used so that we can write a single piece of code that can operate
|
||||
// on a variety of output string types.
|
||||
//
|
||||
// Various implementations of this interface are provided:
|
||||
// ByteSink:
|
||||
// CheckedArrayByteSink Write to a flat array, with bounds checking
|
||||
// StringByteSink Write to an STL string
|
||||
|
||||
#ifndef __BYTESTREAM_H__
|
||||
#define __BYTESTREAM_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Interface for writing bytes, and implementation classes.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/std_string.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// A ByteSink can be filled with bytes
|
||||
// @draft ICU 4.2
|
||||
class U_COMMON_API ByteSink : public UMemory {
public:
    /**
     * Default constructor.
     * @draft ICU 4.2
     */
    ByteSink() { }

    /**
     * Virtual destructor.
     * @draft ICU 4.2
     */
    virtual ~ByteSink() { }

    /**
     * Append "bytes[0,n-1]" to this.
     * @param bytes the pointer to the bytes
     * @param n the number of bytes; must be non-negative
     * @draft ICU 4.2
     */
    virtual void Append(const char* bytes, int32_t n) = 0;

    /**
     * Returns a writable buffer for appending and writes the buffer's
     * capacity to *result_capacity. Guarantees *result_capacity>=min_capacity.
     * May return a pointer to the caller-owned scratch buffer which must have
     * scratch_capacity>=min_capacity.
     * The returned buffer is only valid until the next operation
     * on this ByteSink.
     *
     * After writing at most *result_capacity bytes, call Append() with the
     * pointer returned from this function and the number of bytes written.
     * Many Append() implementations will avoid copying bytes if this function
     * returned an internal buffer.
     *
     * Partial usage example:
     *  int32_t capacity;
     *  char* buffer = sink->GetAppendBuffer(..., &capacity);
     *  ... Write n bytes into buffer, with n <= capacity.
     *  sink->Append(buffer, n);
     * In many implementations, that call to Append will avoid copying bytes.
     *
     * If the ByteSink allocates or reallocates an internal buffer, it should
     * use the desired_capacity_hint if appropriate.
     * If a caller cannot provide a reasonable guess at the desired capacity,
     * it should pass desired_capacity_hint=0.
     *
     * If a non-scratch buffer is returned, the caller may only pass
     * a prefix to it to Append().
     * That is, it is not correct to pass an interior pointer to Append().
     *
     * The default implementation always returns the scratch buffer.
     *
     * @param min_capacity required minimum capacity of the returned buffer;
     *                     must be at least 1
     * @param desired_capacity_hint desired capacity of the returned buffer;
     *                              0 if the caller has no guess
     * @param scratch caller-owned fallback buffer
     * @param scratch_capacity capacity of the scratch buffer
     * @param result_capacity output: capacity of the returned buffer
     * @return a buffer with *result_capacity>=min_capacity
     * @draft ICU 4.2
     */
    virtual char* GetAppendBuffer(int32_t min_capacity,
                                  int32_t desired_capacity_hint,
                                  char* scratch, int32_t scratch_capacity,
                                  int32_t* result_capacity);

    /**
     * Flush internal buffers.
     * Some byte sinks use internal buffers or provide buffering
     * and require calling Flush() at the end of the stream.
     * The default implementation of Flush() does nothing.
     * @draft ICU 4.2
     */
    virtual void Flush();

private:
    ByteSink(const ByteSink &);             // copy constructor not implemented
    ByteSink &operator=(const ByteSink &);  // assignment operator not implemented
};
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// Some standard implementations
|
||||
|
||||
// Implementation of ByteSink that writes to a flat byte array,
|
||||
// with bounds-checking:
|
||||
// This sink will not write more than capacity bytes to outbuf.
|
||||
// If more than capacity bytes are Append()ed, then excess bytes are ignored,
|
||||
// and Overflowed() will return true.
|
||||
// Overflow does not cause a runtime error.
|
||||
// @draft ICU 4.2
|
||||
class U_COMMON_API CheckedArrayByteSink : public ByteSink {
public:
    /**
     * Constructs a sink that writes into outbuf.
     * @param outbuf buffer to write to
     * @param capacity size of the buffer
     * @draft ICU 4.2
     */
    CheckedArrayByteSink(char* outbuf, int32_t capacity);

    /**
     * Append "bytes[0,n-1]" to this; bytes beyond the capacity are ignored.
     * @param bytes the pointer to the bytes
     * @param n the number of bytes
     * @draft ICU 4.2
     */
    virtual void Append(const char* bytes, int32_t n);

    /**
     * Returns a writable buffer for appending and writes the buffer's
     * capacity to *result_capacity.
     * @param min_capacity required minimum capacity of the returned buffer
     * @param desired_capacity_hint desired capacity of the returned buffer
     * @param scratch caller-owned fallback buffer
     * @param scratch_capacity capacity of the scratch buffer
     * @param result_capacity output: capacity of the returned buffer
     * @return the unused part of the output array, or the scratch buffer
     * @draft ICU 4.2
     */
    virtual char* GetAppendBuffer(int32_t min_capacity,
                                  int32_t desired_capacity_hint,
                                  char* scratch, int32_t scratch_capacity,
                                  int32_t* result_capacity);

    /**
     * Returns the number of bytes actually written to the sink.
     * @draft ICU 4.2
     */
    int32_t NumberOfBytesWritten() const { return size_; }

    /**
     * Returns true if any bytes were discarded, i.e., if there was an
     * attempt to write more than 'capacity' bytes.
     * @draft ICU 4.2
     */
    UBool Overflowed() const { return overflowed_; }

private:
    char* outbuf_;            // destination array (caller-owned)
    const int32_t capacity_;  // usable size of outbuf_
    int32_t size_;            // number of bytes stored so far
    bool overflowed_;         // set once any byte has been dropped

    CheckedArrayByteSink();                                         // default constructor not implemented
    CheckedArrayByteSink(const CheckedArrayByteSink &);             // copy constructor not implemented
    CheckedArrayByteSink &operator=(const CheckedArrayByteSink &);  // assignment operator not implemented
};
|
||||
|
||||
#if U_HAVE_STD_STRING
|
||||
|
||||
// Implementation of ByteSink that writes to a "string".
|
||||
// @draft ICU 4.2
|
||||
template<typename StringClass>
|
||||
class StringByteSink : public ByteSink {
|
||||
public:
|
||||
// @draft ICU 4.2
|
||||
StringByteSink(StringClass* dest) : dest_(dest) { }
|
||||
// @draft ICU 4.2
|
||||
virtual void Append(const char* data, int32_t n) { dest_->append(data, n); }
|
||||
private:
|
||||
StringClass* dest_;
|
||||
StringByteSink(); // default constructor not implemented
|
||||
StringByteSink(const StringByteSink &); // copy constructor not implemented
|
||||
StringByteSink &operator=(const StringByteSink &); // assignment operator not implemented
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __BYTESTREAM_H__
|
113
icu4c/source/common/unicode/stringpiece.h
Normal file
113
icu4c/source/common/unicode/stringpiece.h
Normal file
@ -0,0 +1,113 @@
|
||||
// Copyright (C) 2009, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// Copyright 2001 and onwards Google Inc.
|
||||
// Author: Sanjay Ghemawat
|
||||
//
|
||||
// A string-like object that points to a sized piece of memory.
|
||||
//
|
||||
// Functions or methods may use const StringPiece& parameters to accept either
|
||||
// a "const char*" or a "string" value that will be implicitly converted to
|
||||
// a StringPiece.
|
||||
//
|
||||
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
||||
// conversions from "const char*" to "string" and back again.
|
||||
//
|
||||
//
|
||||
// Arghh! I wish C++ literals were "string".
|
||||
|
||||
#ifndef __STRINGPIECE_H__
|
||||
#define __STRINGPIECE_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Read-only byte string wrapper class.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/std_string.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// @draft ICU 4.2
|
||||
class U_COMMON_API StringPiece : public UMemory {
private:
    const char* ptr_;   // start of the referenced bytes (not owned)
    int32_t length_;    // number of bytes referenced

public:
    // We provide non-explicit singleton constructors so users can pass
    // in a "const char*" or a "string" wherever a "StringPiece" is
    // expected.

    /**
     * Default constructor, creates an empty StringPiece.
     * @draft ICU 4.2
     */
    StringPiece() : ptr_(NULL), length_(0) { }

    /**
     * Constructs from a NUL-terminated const char * pointer.
     * @param str a NUL-terminated const char * pointer
     * @draft ICU 4.2
     */
    StringPiece(const char* str);

#if U_HAVE_STD_STRING
    /**
     * Constructs from a std::string.
     * @draft ICU 4.2
     */
    StringPiece(const U_STD_NSQ string& str)
        : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
#endif

    /**
     * Constructs from a const char * pointer and a specified length.
     * @param offset a const char * pointer (need not be terminated)
     * @param len the length of the string
     * @draft ICU 4.2
     */
    StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }

    /**
     * Substring of another StringPiece.
     * @param x the other StringPiece
     * @param pos start position in x; must be non-negative and <= x.length().
     * @draft ICU 4.2
     */
    StringPiece(const StringPiece& x, int32_t pos);

    /**
     * Substring of another StringPiece.
     * @param x the other StringPiece
     * @param pos start position in x; must be non-negative and <= x.length().
     * @param len length of the substring;
     *            must be non-negative and will be pinned to at most x.length() - pos.
     * @draft ICU 4.2
     */
    StringPiece(const StringPiece& x, int32_t pos, int32_t len);

    /**
     * Returns the string pointer. May be NULL if it is empty.
     *
     * data() may return a pointer to a buffer with embedded NULs, and the
     * returned buffer may or may not be null terminated.  Therefore it is
     * typically a mistake to pass data() to a routine that expects a NUL
     * terminated string.
     * @return the string pointer
     * @draft ICU 4.2
     */
    const char* data() const { return ptr_; }

    /**
     * Returns the string length. Same as length().
     * @return the string length
     * @draft ICU 4.2
     */
    int32_t size() const { return length_; }

    /**
     * Returns the string length. Same as size().
     * @return the string length
     * @draft ICU 4.2
     */
    int32_t length() const { return length_; }

    /**
     * Returns whether the string is empty.
     * @return TRUE if the string is empty
     * @draft ICU 4.2
     */
    UBool empty() const { return length_ == 0; }

    /**
     * Sets to an empty string.
     * @draft ICU 4.2
     */
    void clear() {
        ptr_ = NULL;
        length_ = 0;
    }

    /**
     * Removes the first n string units.
     * Negative n is ignored; n greater than the length is pinned to the length.
     * @param n prefix length
     * @draft ICU 4.2
     */
    void remove_prefix(int32_t n) {
        if (n < 0) {
            return;
        }
        if (n > length_) {
            n = length_;
        }
        ptr_ += n;
        length_ -= n;
    }

    /**
     * Removes the last n string units.
     * Negative n is ignored; n greater than the length empties the piece.
     * @param n suffix length
     * @draft ICU 4.2
     */
    void remove_suffix(int32_t n) {
        if (n < 0) {
            return;
        }
        length_ = (n <= length_) ? (length_ - n) : 0;
    }

    /**
     * Maximum integer, used as a default value for substring methods.
     * @draft ICU 4.2
     */
    static const int32_t npos = 0x7fffffff;

    /**
     * Returns a substring of this StringPiece.
     * @param pos start position
     * @param n length of the substring; defaults to npos (rest of the string)
     * @return the substring StringPiece
     * @draft ICU 4.2
     */
    StringPiece substr(int32_t pos, int32_t n = npos) const {
        return StringPiece(*this, pos, n);
    }
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __STRINGPIECE_H__
|
@ -26,7 +26,11 @@
|
||||
* \brief C++ API: Unicode String
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/rep.h"
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/bytestream.h"
|
||||
|
||||
struct UConverter; // unicode/ucnv.h
|
||||
class StringThreadTest;
|
||||
@ -1532,6 +1536,40 @@ public:
|
||||
UConverter *cnv,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Convert the UnicodeString to UTF-8 and write the result
|
||||
* to a ByteSink. This is called by toUTF8String().
|
||||
* Unpaired surrogates are replaced with U+FFFD.
|
||||
* Calls u_strToUTF8WithSub().
|
||||
*
|
||||
* @param sink A ByteSink to which the UTF-8 version of the string is written.
|
||||
* @draft ICU 4.2
|
||||
* @see toUTF8String
|
||||
*/
|
||||
void toUTF8(ByteSink &sink) const;
|
||||
|
||||
#if U_HAVE_STD_STRING
|
||||
|
||||
/**
|
||||
* Convert the UnicodeString to UTF-8 and append the result
|
||||
* to a standard string.
|
||||
* Unpaired surrogates are replaced with U+FFFD.
|
||||
* Calls toUTF8().
|
||||
*
|
||||
* @param A standard string (or a compatible object)
|
||||
* to which the UTF-8 version of the string is appended.
|
||||
* @return The string object.
|
||||
* @draft ICU 4.2
|
||||
* @see toUTF8
|
||||
*/
|
||||
template<typename StringClass>
|
||||
StringClass &toUTF8String(StringClass &result) const {
|
||||
toUTF8(StringByteSink<StringClass>(&result));
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -2917,6 +2955,21 @@ public:
|
||||
*/
|
||||
virtual ~UnicodeString();
|
||||
|
||||
/**
|
||||
* Create a UnicodeString from a UTF-8 string.
|
||||
* Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
|
||||
* Calls u_strFromUTF8WithSub().
|
||||
*
|
||||
* @param utf8 UTF-8 input string.
|
||||
* Note that a StringPiece can be implicitly constructed
|
||||
* from a std::string or a NUL-terminated const char * string.
|
||||
* @return A UnicodeString with equivalent UTF-16 contents.
|
||||
* @see toUTF8
|
||||
* @see toUTF8String
|
||||
* @draft ICU 4.2
|
||||
*/
|
||||
static UnicodeString fromUTF8(const StringPiece &utf8);
|
||||
|
||||
/**
|
||||
* Create a UnicodeString from a UTF-32 string.
|
||||
* Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
|
||||
|
@ -379,9 +379,34 @@ UnicodeString::~UnicodeString()
|
||||
// Factory methods
|
||||
//========================================
|
||||
|
||||
// Creates a UnicodeString from the UTF-8 bytes referenced by utf8.
// Illegal sequences are replaced with U+FFFD (via u_strFromUTF8WithSub());
// any conversion error results in a bogus string.
UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
  UnicodeString result;
  int32_t length = utf8.length();
  int32_t capacity;
  // The UTF-16 string will be at most as long as the UTF-8 string.
  if(length <= US_STACKBUF_SIZE) {
    capacity = US_STACKBUF_SIZE;
  } else {
    capacity = length + 1;  // +1 for the terminating NUL.
  }
  UChar *utf16 = result.getBuffer(capacity);
  // Initialized so that releaseBuffer() below never reads an indeterminate
  // value if the conversion fails before storing an output length.
  int32_t length16 = 0;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strFromUTF8WithSub(utf16, result.getCapacity(), &length16,
                       utf8.data(), length,
                       0xfffd,  // Substitution character.
                       NULL,    // Don't care about number of substitutions.
                       &errorCode);
  result.releaseBuffer(length16);
  if(U_FAILURE(errorCode)) {
    result.setToBogus();
  }
  return result;
}
|
||||
|
||||
UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
|
||||
UnicodeString result;
|
||||
int32_t capacity = length;
|
||||
int32_t capacity;
|
||||
// Most UTF-32 strings will be BMP-only and result in a same-length
|
||||
// UTF-16 string. We overestimate the capacity just slightly,
|
||||
// just in case there are a few supplementary characters.
|
||||
@ -756,6 +781,51 @@ UnicodeString::extractBetween(int32_t start,
|
||||
doExtract(start, limit - start, target);
|
||||
}
|
||||
|
||||
// When converting from UTF-16 to UTF-8, the result will have at most 3 times
|
||||
// as many bytes as the source has UChars.
|
||||
// The "worst cases" are writing systems like Indic, Thai and CJK with
|
||||
// 3:1 bytes:UChars.
|
||||
void
|
||||
UnicodeString::toUTF8(ByteSink &sink) const {
|
||||
int32_t length16 = length();
|
||||
if(length16 != 0) {
|
||||
char stackBuffer[1024];
|
||||
int32_t capacity = (int32_t)sizeof(stackBuffer);
|
||||
UBool utf8IsOwned = FALSE;
|
||||
char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
|
||||
3*length16,
|
||||
stackBuffer, capacity,
|
||||
&capacity);
|
||||
int32_t length8 = 0;
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
u_strToUTF8WithSub(utf8, capacity, &length8,
|
||||
getBuffer(), length16,
|
||||
0xFFFD, // Standard substitution character.
|
||||
NULL, // Don't care about number of substitutions.
|
||||
&errorCode);
|
||||
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
|
||||
utf8 = (char *)uprv_malloc(length8);
|
||||
if(utf8 != NULL) {
|
||||
utf8IsOwned = TRUE;
|
||||
errorCode = U_ZERO_ERROR;
|
||||
u_strToUTF8WithSub(utf8, length8, &length8,
|
||||
getBuffer(), length16,
|
||||
0xFFFD, // Standard substitution character.
|
||||
NULL, // Don't care about number of substitutions.
|
||||
&errorCode);
|
||||
} else {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
sink.Append(utf8, length8);
|
||||
}
|
||||
if(utf8IsOwned) {
|
||||
uprv_free(utf8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t
|
||||
UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
|
||||
int32_t length32=0;
|
||||
|
@ -5,6 +5,7 @@
|
||||
********************************************************************/
|
||||
|
||||
#include "ustrtest.h"
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
@ -59,6 +60,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
|
||||
case 16: name = "TestCharString"; if (exec) TestCharString(); break;
|
||||
case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
|
||||
case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
|
||||
case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;
|
||||
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
@ -1739,3 +1741,71 @@ UnicodeStringTest::TestUTF32() {
|
||||
errln("UnicodeString::toUTF32() did not create the expected string.");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
UnicodeStringTest::TestUTF8() {
|
||||
static const uint8_t utf8[] = {
|
||||
// Code points:
|
||||
// 0x41, 0xd900,
|
||||
// 0x61, 0xdc00,
|
||||
// 0x110000, 0x5a,
|
||||
// 0x50000, 0x7a,
|
||||
// 0x10000, 0x20000,
|
||||
// 0xe0000, 0x10ffff
|
||||
0x41, 0xed, 0xa4, 0x80,
|
||||
0x61, 0xed, 0xb0, 0x80,
|
||||
0xf4, 0x90, 0x80, 0x80, 0x5a,
|
||||
0xf1, 0x90, 0x80, 0x80, 0x7a,
|
||||
0xf0, 0x90, 0x80, 0x80, 0xf0, 0xa0, 0x80, 0x80,
|
||||
0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
|
||||
};
|
||||
static const UChar expected_utf16[] = {
|
||||
0x41, 0xfffd,
|
||||
0x61, 0xfffd,
|
||||
0xfffd, 0x5a,
|
||||
0xd900, 0xdc00, 0x7a,
|
||||
0xd800, 0xdc00, 0xd840, 0xdc00,
|
||||
0xdb40, 0xdc00, 0xdbff, 0xdfff
|
||||
};
|
||||
UnicodeString from8 = UnicodeString::fromUTF8(StringPiece((const char *)utf8, (int32_t)sizeof(utf8)));
|
||||
UnicodeString expected(FALSE, expected_utf16, LENGTHOF(expected_utf16));
|
||||
|
||||
if(from8 != expected) {
|
||||
errln("UnicodeString::fromUTF8(StringPiece) did not create the expected string.");
|
||||
}
|
||||
#if U_HAVE_STD_STRING
|
||||
U_STD_NSQ string utf8_string((const char *)utf8, sizeof(utf8));
|
||||
UnicodeString from8b = UnicodeString::fromUTF8(utf8_string);
|
||||
if(from8b != expected) {
|
||||
errln("UnicodeString::fromUTF8(std::string) did not create the expected string.");
|
||||
}
|
||||
#endif
|
||||
|
||||
static const UChar utf16[] = {
|
||||
0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0xd800, 0xdc00, 0xdbff, 0xdfff
|
||||
};
|
||||
static const uint8_t expected_utf8[] = {
|
||||
0x41, 0xef, 0xbf, 0xbd, 0x61, 0xef, 0xbf, 0xbd, 0x5a, 0xf1, 0x90, 0x80, 0x80, 0x7a,
|
||||
0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
|
||||
};
|
||||
UnicodeString us(FALSE, utf16, LENGTHOF(utf16));
|
||||
|
||||
char buffer[64];
|
||||
CheckedArrayByteSink sink(buffer, (int32_t)sizeof(buffer));
|
||||
us.toUTF8(sink);
|
||||
if( sink.NumberOfBytesWritten() != (int32_t)sizeof(expected_utf8) ||
|
||||
0 != uprv_memcmp(buffer, expected_utf8, sizeof(expected_utf8))
|
||||
) {
|
||||
errln("UnicodeString::toUTF8() did not create the expected string.");
|
||||
}
|
||||
#if U_HAVE_STD_STRING
|
||||
// Initial contents for testing that toUTF8String() appends.
|
||||
U_STD_NSQ string result8 = "-->";
|
||||
U_STD_NSQ string expected8 = "-->" + U_STD_NSQ string((const char *)expected_utf8, sizeof(expected_utf8));
|
||||
// Use the return value just for testing.
|
||||
U_STD_NSQ string &result8r = us.toUTF8String(result8);
|
||||
if(result8r != expected8 || &result8r != &result8) {
|
||||
errln("UnicodeString::toUTF8String() did not create the expected string.");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -76,6 +76,7 @@ public:
|
||||
void TestCharString(void);
|
||||
void TestNameSpace(void);
|
||||
void TestUTF32(void);
|
||||
void TestUTF8(void);
|
||||
};
|
||||
|
||||
class StringCaseTest: public IntlTest {
|
||||
|
Loading…
Reference in New Issue
Block a user