2001-09-18 03:41:09 +00:00
|
|
|
/*
|
|
|
|
******************************************************************************
|
|
|
|
*
|
2007-05-10 20:40:35 +00:00
|
|
|
* Copyright (C) 2001-2007, International Business Machines
|
2001-09-18 03:41:09 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
******************************************************************************
|
|
|
|
*
|
|
|
|
* File ustrtrns.c
|
|
|
|
*
|
|
|
|
* Modification History:
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 9/10/2001 Ram Creation.
|
|
|
|
******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
*
|
|
|
|
* u_strTo* and u_strFrom* APIs
|
2004-09-07 17:49:59 +00:00
|
|
|
* WCS functions moved to ustr_wcs.c for better modularization
|
2001-09-18 03:41:09 +00:00
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "unicode/putil.h"
|
|
|
|
#include "unicode/ustring.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "ustr_imp.h"
|
|
|
|
|
|
|
|
/*
 * Convert a UTF-32 string to UTF-16.
 *
 * dest/destCapacity  output buffer and its size in UChars; dest may be NULL
 *                    only if destCapacity is 0 (pure preflighting).
 * pDestLength        if not NULL, receives the number of UChars that the
 *                    complete output requires (which may exceed destCapacity).
 * src/srcLength      input code points; srcLength==-1 means NUL-terminated.
 * pErrorCode         in/out error code; nothing is done if it is NULL or
 *                    already indicates a failure.
 *
 * Returns dest, or NULL on an argument error or an invalid code point.
 *
 * Every code point is checked against 0x10FFFF, including those that no
 * longer fit into dest and only contribute to the required length, so the
 * outcome for a given input does not depend on destCapacity.
 * NOTE(review): values in the surrogate range 0xD800..0xDFFF are copied
 * through as single UChars, exactly as before; confirm whether callers rely
 * on that before tightening it.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar *dest,
               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
               UErrorCode *pErrorCode)
{
    const uint32_t *pSrc;
    const uint32_t *pSrcLimit;
    UChar *pDest;
    UChar *pDestLimit;
    int32_t reqLength = 0;   /* UChars required beyond those actually written */
    uint32_t ch;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Determine the end of the source so that one loop serves both input forms. */
    pSrc = (const uint32_t *)src;
    if(srcLength < 0) {
        pSrcLimit = pSrc;
        while(*pSrcLimit != 0) {
            ++pSrcLimit;
        }
    } else {
        pSrcLimit = pSrc + srcLength;
    }

    pDest = dest;
    pDestLimit = dest + destCapacity;

    while(pSrc < pSrcLimit) {
        ch = *pSrc++;
        if(ch <= 0xFFFF) {
            if(pDest < pDestLimit) {
                *(pDest++) = (UChar)ch;
            } else {
                ++reqLength;
            }
        } else if(ch <= 0x10FFFF) {
            if(pDest < pDestLimit) {
                *(pDest++) = UTF16_LEAD(ch);
                if(pDest < pDestLimit) {
                    *(pDest++) = UTF16_TRAIL(ch);
                } else {
                    /* only the lead surrogate fit; count the missing trail */
                    ++reqLength;
                }
            } else {
                reqLength += 2;
            }
        } else {
            /* beyond the Unicode code point range */
            *pErrorCode = U_INVALID_CHAR_FOUND;
            return NULL;
        }
    }

    reqLength += (int32_t)(pDest - dest);
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
|
|
|
|
|
|
|
|
|
2001-10-05 23:13:25 +00:00
|
|
|
/*
 * Convert a UTF-16 string to UTF-32.
 *
 * A lead surrogate immediately followed by a trail surrogate is combined
 * into one supplementary code point; any other code unit, including an
 * unpaired surrogate, is copied through unchanged.
 *
 * dest/destCapacity  output buffer and its size in code points; dest may be
 *                    NULL only if destCapacity is 0.
 * pDestLength        if not NULL, receives the number of code points the
 *                    complete output requires.
 * src/srcLength      input; srcLength==-1 means NUL-terminated.
 * pErrorCode         in/out error code; nothing is done if it is NULL or
 *                    already indicates a failure.
 *
 * Returns dest, or NULL on an argument error.
 */
U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const UChar *src,
             int32_t srcLength,
             UErrorCode *pErrorCode)
{
    const UChar *in;
    const UChar *inLimit;
    uint32_t *out = (uint32_t *)dest;
    uint32_t *outLimit = out + destCapacity;
    int32_t notWritten = 0;   /* code points that did not fit into dest */
    int32_t total;
    uint32_t cp;
    UChar next;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if(src==NULL || srcLength < -1 || destCapacity<0 || (dest==NULL && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Find the end of the input once so that a single loop handles both forms. */
    if(srcLength==-1) {
        inLimit = src;
        while(*inLimit != 0) {
            ++inLimit;
        }
    } else {
        inLimit = src + srcLength;
    }

    for(in = src; in < inLimit; ) {
        cp = *in++;
        if(UTF_IS_LEAD(cp) && in < inLimit) {
            next = *in;
            if(UTF_IS_TRAIL(next)) {
                ++in;
                cp = UTF16_GET_PAIR_VALUE(cp, next);
            }
        }
        if(out < outLimit) {
            *out++ = cp;
        } else {
            /* destination is full: keep counting for the required length */
            ++notWritten;
        }
    }

    total = notWritten + (int32_t)(out - (uint32_t *)dest);
    if(pDestLength){
        *pDestLength = total;
    }

    /* Terminate the buffer */
    u_terminateUChar32s(dest,destCapacity,total,pErrorCode);

    return dest;
}
|
|
|
|
|
2006-06-15 19:22:04 +00:00
|
|
|
/* for utf8_nextCharSafeBodyTerminated() */
|
|
|
|
static const UChar32
|
|
|
|
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Version of utf8_nextCharSafeBody() with the following differences:
|
|
|
|
* - checks for NUL termination instead of length
|
|
|
|
* - works with pointers instead of indexes
|
|
|
|
* - always strict (strict==-1)
|
|
|
|
*
|
|
|
|
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
|
|
|
* c is the lead byte.
|
|
|
|
* @return the code point, or U_SENTINEL
|
|
|
|
*/
|
|
|
|
static UChar32
|
|
|
|
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
|
|
|
|
const uint8_t *s=*ps;
|
|
|
|
uint8_t trail, illegal=0;
|
|
|
|
uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
|
|
|
|
UTF8_MASK_LEAD_BYTE((c), count);
|
|
|
|
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
|
|
|
switch(count) {
|
|
|
|
/* each branch falls through to the next one */
|
|
|
|
case 5:
|
|
|
|
case 4:
|
|
|
|
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
|
|
|
illegal=1;
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
trail=(uint8_t)(*s++ - 0x80);
|
|
|
|
c=(c<<6)|trail;
|
|
|
|
if(trail>0x3f || c>=0x110) {
|
|
|
|
/* not a trail byte, or code point>0x10ffff (outside Unicode) */
|
|
|
|
illegal=1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case 2:
|
|
|
|
trail=(uint8_t)(*s++ - 0x80);
|
|
|
|
if(trail>0x3f) {
|
|
|
|
/* not a trail byte */
|
|
|
|
illegal=1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
c=(c<<6)|trail;
|
|
|
|
case 1:
|
|
|
|
trail=(uint8_t)(*s++ - 0x80);
|
|
|
|
if(trail>0x3f) {
|
|
|
|
/* not a trail byte */
|
|
|
|
illegal=1;
|
|
|
|
}
|
|
|
|
c=(c<<6)|trail;
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
return U_SENTINEL;
|
|
|
|
/* no default branch to optimize switch() - all values are covered */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
|
|
|
/* illegal is also set if count>=4 */
|
|
|
|
if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
|
|
|
|
/* error handling */
|
|
|
|
/* don't go beyond this sequence */
|
|
|
|
s=*ps;
|
|
|
|
while(count>0 && UTF8_IS_TRAIL(*s)) {
|
|
|
|
++s;
|
|
|
|
--count;
|
|
|
|
}
|
|
|
|
c=U_SENTINEL;
|
|
|
|
}
|
|
|
|
*ps=s;
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Version of utf8_nextCharSafeBody() with the following differences:
|
|
|
|
* - works with pointers instead of indexes
|
|
|
|
* - always strict (strict==-1)
|
|
|
|
*
|
|
|
|
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
|
|
|
* c is the lead byte.
|
|
|
|
* @return the code point, or U_SENTINEL
|
|
|
|
*/
|
|
|
|
static UChar32
|
|
|
|
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
|
|
|
|
const uint8_t *s=*ps;
|
|
|
|
uint8_t trail, illegal=0;
|
|
|
|
uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
|
|
|
|
if((limit-s)>=count) {
|
|
|
|
UTF8_MASK_LEAD_BYTE((c), count);
|
|
|
|
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
|
|
|
switch(count) {
|
|
|
|
/* each branch falls through to the next one */
|
|
|
|
case 5:
|
|
|
|
case 4:
|
|
|
|
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
|
|
|
illegal=1;
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
trail=*s++;
|
|
|
|
c=(c<<6)|(trail&0x3f);
|
|
|
|
if(c<0x110) {
|
|
|
|
illegal|=(trail&0xc0)^0x80;
|
|
|
|
} else {
|
|
|
|
/* code point>0x10ffff, outside Unicode */
|
|
|
|
illegal=1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case 2:
|
|
|
|
trail=*s++;
|
|
|
|
c=(c<<6)|(trail&0x3f);
|
|
|
|
illegal|=(trail&0xc0)^0x80;
|
|
|
|
case 1:
|
|
|
|
trail=*s++;
|
|
|
|
c=(c<<6)|(trail&0x3f);
|
|
|
|
illegal|=(trail&0xc0)^0x80;
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
return U_SENTINEL;
|
|
|
|
/* no default branch to optimize switch() - all values are covered */
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
illegal=1; /* too few bytes left */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
|
|
|
/* illegal is also set if count>=4 */
|
|
|
|
if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
|
|
|
|
/* error handling */
|
|
|
|
/* don't go beyond this sequence */
|
|
|
|
s=*ps;
|
|
|
|
while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
|
|
|
|
++s;
|
|
|
|
--count;
|
|
|
|
}
|
|
|
|
c=U_SENTINEL;
|
|
|
|
}
|
|
|
|
*ps=s;
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2001-09-18 03:41:09 +00:00
|
|
|
/*
 * Convert a UTF-8 string to UTF-16, optionally replacing each ill-formed
 * byte sequence with a substitution character.
 *
 * dest/destCapacity  output buffer and its size in UChars; dest may be NULL
 *                    only if destCapacity is 0.
 * pDestLength        if not NULL, receives the number of UChars the complete
 *                    output requires.
 * src/srcLength      input bytes; srcLength==-1 means NUL-terminated.
 * subchar            code point written for each ill-formed sequence; a
 *                    negative value means "no substitution", in which case
 *                    an ill-formed sequence sets U_INVALID_CHAR_FOUND.
 *                    Values above 0x10ffff and surrogates are rejected.
 * pNumSubstitutions  if not NULL, receives the number of substitutions.
 * pErrorCode         in/out error code; nothing is done if it is NULL or
 *                    already indicates a failure.
 *
 * Returns dest, or NULL on an argument error or an unsubstituted ill-formed
 * sequence.
 *
 * Byte sequences for the most common characters are decoded inline:
 * ASCII first, then 3-byte sequences with lead bytes e1..ec
 * (U+1000..U+CFFF, tested before 2-byte ones to favor CJK text), then
 * 2-byte sequences with lead bytes c2..df (U+0080..U+07FF).
 * Everything else goes through the utf8_nextCharSafeBody...() helpers.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UChar32 subchar, int32_t *pNumSubstitutions,
              UErrorCode *pErrorCode){

    const uint8_t *srcPtr = (const uint8_t *)src;
    UChar *destPtr = dest;
    UChar *destEnd = dest+destCapacity;
    UChar32 cp;
    uint8_t trail1, trail2;       /* trail bytes */
    int32_t extraLength = 0;      /* UChars required beyond those written */
    int32_t substitutions = 0;
    int32_t totalLength;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( src==NULL || srcLength < -1 || destCapacity<0 || (dest==NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(srcLength < 0){
        /*
         * NUL-terminated input.
         * Only the lead byte position is tested for NUL explicitly;
         * a NUL in a trail byte position fails the trail byte range test.
         */
        while(((cp = *srcPtr) != 0) && (destPtr < destEnd)) {
            if(cp <= 0x7f){
                *destPtr++=(UChar)cp;
                ++srcPtr;
                continue;
            }
            if(0xe0 < cp && cp <= 0xec) {
                /* U+1000..U+CFFF */
                if( (trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f &&
                    (trail2 = (uint8_t)(srcPtr[2] - 0x80)) <= 0x3f
                ) {
                    /* no (cp & 0xf) needed: the upper bits are cut off by the cast to (UChar) after <<12 */
                    *destPtr++ = (UChar)((cp << 12) | (trail1 << 6) | trail2);
                    srcPtr += 3;
                    continue;
                }
            } else if(0xc2 <= cp && cp < 0xe0) {
                /* U+0080..U+07FF */
                if((trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f) {
                    *destPtr++ = (UChar)(((cp & 0x1f) << 6) | trail1);
                    srcPtr += 2;
                    continue;
                }
            }

            /* general decoder for all remaining and all ill-formed sequences */
            ++srcPtr; /* continue after the lead byte */
            cp=utf8_nextCharSafeBodyTerminated(&srcPtr, cp);
            if(cp<0) {
                ++substitutions;
                cp = subchar;
                if(cp<0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
            }
            if(cp<=0xFFFF) {
                *(destPtr++)=(UChar)cp;
            } else {
                *(destPtr++)=UTF16_LEAD(cp);
                if(destPtr<destEnd) {
                    *(destPtr++)=UTF16_TRAIL(cp);
                } else {
                    /* the trail surrogate did not fit */
                    extraLength++;
                    break;
                }
            }
        }

        /* Pre-flight the rest of the string. */
        while((cp = *srcPtr) != 0) {
            if(cp <= 0x7f){
                ++extraLength;
                ++srcPtr;
                continue;
            }
            if(0xe0 < cp && cp <= 0xec) {
                /* U+1000..U+CFFF */
                if( (uint8_t)(srcPtr[1] - 0x80) <= 0x3f &&
                    (uint8_t)(srcPtr[2] - 0x80) <= 0x3f
                ) {
                    ++extraLength;
                    srcPtr += 3;
                    continue;
                }
            } else if(0xc2 <= cp && cp < 0xe0) {
                /* U+0080..U+07FF */
                if((uint8_t)(srcPtr[1] - 0x80) <= 0x3f) {
                    ++extraLength;
                    srcPtr += 2;
                    continue;
                }
            }

            ++srcPtr; /* continue after the lead byte */
            cp=utf8_nextCharSafeBodyTerminated(&srcPtr, cp);
            if(cp<0) {
                ++substitutions;
                cp = subchar;
                if(cp<0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
            }
            extraLength += U16_LENGTH(cp);
        }
    } else /* srcLength >= 0 */ {
        const uint8_t *srcEnd = srcPtr + srcLength;
        int32_t budget;        /* iterations that need no limit checks */
        int32_t srcTriples;

        /* Faster loop without ongoing checking for srcEnd and destEnd. */
        for(;;) {
            /*
             * One inner iteration normally consumes at most 3 bytes and
             * produces one UChar, so min(remaining dest, remaining src/3)
             * iterations are safe. The rare cases that may use more spend
             * one extra unit of the budget below.
             */
            budget = (int32_t)(destEnd - destPtr);
            srcTriples = (int32_t)((srcEnd - srcPtr) / 3);
            if(budget > srcTriples) {
                budget = srcTriples;
            }
            if(budget < 3) {
                /* near the end the overhead is not worth it; use the checked loop */
                break;
            }

            do {
                cp = *srcPtr;
                if(cp <= 0x7f){
                    *destPtr++=(UChar)cp;
                    ++srcPtr;
                    continue;
                }
                if(0xe0 < cp && cp <= 0xec) {
                    /* U+1000..U+CFFF */
                    if( (trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f &&
                        (trail2 = (uint8_t)(srcPtr[2] - 0x80)) <= 0x3f
                    ) {
                        /* no (cp & 0xf) needed: the upper bits are cut off by the cast to (UChar) after <<12 */
                        *destPtr++ = (UChar)((cp << 12) | (trail1 << 6) | trail2);
                        srcPtr += 3;
                        continue;
                    }
                } else if(0xc2 <= cp && cp < 0xe0) {
                    /* U+0080..U+07FF */
                    if((trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f) {
                        *destPtr++ = (UChar)(((cp & 0x1f) << 6) | trail1);
                        srcPtr += 2;
                        continue;
                    }
                }

                if(cp >= 0xf0 || subchar > 0xffff) {
                    /*
                     * Up to six bytes may be read and up to two UChars
                     * written, more than one budget unit covers,
                     * so spend a second unit here.
                     */
                    if(--budget == 0) {
                        break;
                    }
                }

                /* general decoder for all remaining and all ill-formed sequences */
                ++srcPtr; /* continue after the lead byte */
                cp=utf8_nextCharSafeBodyPointer(&srcPtr, srcEnd, cp);
                if(cp<0) {
                    ++substitutions;
                    cp = subchar;
                    if(cp<0) {
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    }
                }
                if(cp<=0xFFFF){
                    *(destPtr++)=(UChar)cp;
                }else{
                    *(destPtr++)=UTF16_LEAD(cp);
                    if(destPtr<destEnd){
                        *(destPtr++)=UTF16_TRAIL(cp);
                    }else{
                        /* the trail surrogate did not fit */
                        extraLength++;
                        break;
                    }
                }
            } while(--budget > 0);
        }

        /* Checked loop: test both limits on every character. */
        while((srcPtr<srcEnd) && (destPtr<destEnd)) {
            cp = *srcPtr;
            if(cp <= 0x7f){
                *destPtr++=(UChar)cp;
                ++srcPtr;
                continue;
            }
            if(0xe0 < cp && cp <= 0xec) {
                /* U+1000..U+CFFF */
                if( ((srcEnd - srcPtr) >= 3) &&
                    (trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f &&
                    (trail2 = (uint8_t)(srcPtr[2] - 0x80)) <= 0x3f
                ) {
                    /* no (cp & 0xf) needed: the upper bits are cut off by the cast to (UChar) after <<12 */
                    *destPtr++ = (UChar)((cp << 12) | (trail1 << 6) | trail2);
                    srcPtr += 3;
                    continue;
                }
            } else if(0xc2 <= cp && cp < 0xe0) {
                /* U+0080..U+07FF */
                if( ((srcEnd - srcPtr) >= 2) &&
                    (trail1 = (uint8_t)(srcPtr[1] - 0x80)) <= 0x3f
                ) {
                    *destPtr++ = (UChar)(((cp & 0x1f) << 6) | trail1);
                    srcPtr += 2;
                    continue;
                }
            }

            /* general decoder for all remaining and all ill-formed sequences */
            ++srcPtr; /* continue after the lead byte */
            cp=utf8_nextCharSafeBodyPointer(&srcPtr, srcEnd, cp);
            if(cp<0) {
                ++substitutions;
                cp = subchar;
                if(cp<0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
            }
            if(cp<=0xFFFF){
                *(destPtr++)=(UChar)cp;
            }else{
                *(destPtr++)=UTF16_LEAD(cp);
                if(destPtr<destEnd){
                    *(destPtr++)=UTF16_TRAIL(cp);
                }else{
                    /* the trail surrogate did not fit */
                    extraLength++;
                    break;
                }
            }
        }

        /* The destination is full: only count the UChars still needed. */
        while(srcPtr < srcEnd){
            cp = *srcPtr;
            if(cp <= 0x7f){
                extraLength++;
                ++srcPtr;
                continue;
            }
            if(0xe0 < cp && cp <= 0xec) {
                /* U+1000..U+CFFF */
                if( ((srcEnd - srcPtr) >= 3) &&
                    (uint8_t)(srcPtr[1] - 0x80) <= 0x3f &&
                    (uint8_t)(srcPtr[2] - 0x80) <= 0x3f
                ) {
                    extraLength++;
                    srcPtr += 3;
                    continue;
                }
            } else if(0xc2 <= cp && cp < 0xe0) {
                /* U+0080..U+07FF */
                if( ((srcEnd - srcPtr) >= 2) &&
                    (uint8_t)(srcPtr[1] - 0x80) <= 0x3f
                ) {
                    extraLength++;
                    srcPtr += 2;
                    continue;
                }
            }

            ++srcPtr; /* continue after the lead byte */
            cp=utf8_nextCharSafeBodyPointer(&srcPtr, srcEnd, cp);
            if(cp<0) {
                ++substitutions;
                cp = subchar;
                if(cp<0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }
            }
            extraLength+=UTF_CHAR_LENGTH(cp);
        }
    }

    totalLength = extraLength + (int32_t)(destPtr - dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=substitutions;
    }

    if(pDestLength){
        *pDestLength = totalLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,totalLength,pErrorCode);

    return dest;
}
|
|
|
|
|
|
|
|
/*
 * Strict UTF-8 to UTF-16 conversion.
 *
 * Forwards to u_strFromUTF8WithSub() with U_SENTINEL as the substitution
 * character and without requesting a substitution count; all other
 * arguments and the return value are passed through unchanged.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UErrorCode *pErrorCode){
    UChar *result;

    result = u_strFromUTF8WithSub(dest, destCapacity, pDestLength,
                                  src, srcLength,
                                  U_SENTINEL, NULL,
                                  pErrorCode);
    return result;
}
|
|
|
|
|
|
|
|
/*
 * Convert a UTF-8 string to UTF-16 without validating the input.
 *
 * The lead byte alone selects the sequence length (below 0xc0: one byte,
 * below 0xe0: two, below 0xf0: three, otherwise four) and the trail bytes
 * are combined arithmetically without checking their range. A byte below
 * 0xc0 in lead position, including a stray trail byte, is copied as a
 * single UChar. A sequence that is cut short by the end of the input is
 * replaced by U+FFFD and ends the conversion.
 *
 * dest/destCapacity  output buffer and its size in UChars; dest may be NULL
 *                    only if destCapacity is 0.
 * pDestLength        if not NULL, receives the number of UChars required.
 * src/srcLength      input bytes; a negative srcLength (only -1 is accepted)
 *                    means NUL-terminated.
 *                    If srcLength>=0 then destCapacity>=srcLength is
 *                    required; otherwise U_BUFFER_OVERFLOW_ERROR is set,
 *                    *pDestLength is set to srcLength (likely an
 *                    overestimate) and NULL is returned.
 * pErrorCode         in/out error code; nothing is done if it is NULL or
 *                    already indicates a failure.
 *
 * Returns dest, or NULL on an argument error or the overflow case above.
 */
U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar *dest,
                     int32_t destCapacity,
                     int32_t *pDestLength,
                     const char *src,
                     int32_t srcLength,
                     UErrorCode *pErrorCode) {

    UChar *pDest = dest;
    UChar32 ch;
    int32_t reqLength = 0;
    const uint8_t* pSrc = (const uint8_t*) src;   /* the input is only read */

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(srcLength < 0) {
        /* Transform a NUL-terminated string. */
        UChar *pDestLimit = dest+destCapacity;
        uint8_t t1, t2, t3; /* trail bytes */

        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(UChar)ch;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if((t1 = pSrc[1]) != 0) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
                    pSrc += 4;
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    if(pDest < pDestLimit) {
                        *(pDest++) = U16_TRAIL(ch);
                    } else {
                        /* the trail surrogate did not fit */
                        reqLength = 1;
                        break;
                    }
                    continue;
                }
            }

            /* truncated character at the end */
            *pDest++ = 0xfffd;
            /* move to the terminating NUL so that the pre-flight loop below does nothing */
            while(*++pSrc != 0) {}
            break;
        }

        /* Pre-flight the rest of the string. */
        while((ch = *pSrc) != 0) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                ++reqLength;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc[1] != 0) {
                    ++reqLength;
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0) {
                    ++reqLength;
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
                    reqLength += 2;
                    pSrc += 4;
                    continue;
                }
            }

            /* truncated character at the end */
            ++reqLength;
            break;
        }
    } else /* srcLength >= 0 */ {
        const uint8_t *pSrcLimit = pSrc + srcLength;

        /*
         * This function requires that if srcLength is given, then it must be
         * destCapacity >= srcLength so that we need not check for
         * destination buffer overflow in the loop.
         */
        if(destCapacity < srcLength) {
            if(pDestLength != NULL) {
                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
            }
            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
            return NULL;
        }

        if((pSrcLimit - pSrc) >= 4) {
            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */

            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
            do {
                ch = *pSrc++;
                if(ch < 0xc0) {
                    /*
                     * ASCII, or a trail byte in lead position which is treated like
                     * a single-byte sequence for better character boundary
                     * resynchronization after illegal sequences.
                     */
                    *pDest++=(UChar)ch;
                } else if(ch < 0xe0) { /* U+0080..U+07FF */
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                }
            } while(pSrc < pSrcLimit);

            pSrcLimit += 3; /* restore original pSrcLimit */
        }

        /* Handle the last few bytes with a length check for every sequence. */
        while(pSrc < pSrcLimit) {
            ch = *pSrc++;
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(UChar)ch;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc < pSrcLimit) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((pSrcLimit - pSrc) >= 2) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
                    /* pSrc has already been advanced past both trail bytes by the reads above */
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((pSrcLimit - pSrc) >= 3) {
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                    /* pSrc has already been advanced past all three trail bytes by the reads above */
                    continue;
                }
            }

            /* truncated character at the end */
            *pDest++ = 0xfffd;
            break;
        }
    }

    reqLength+=(int32_t)(pDest - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
|
2001-11-30 22:15:36 +00:00
|
|
|
|
2002-11-30 04:41:53 +00:00
|
|
|
/*
 * Write the UTF-8 encoding of c at pDest and return the position after it.
 *
 * No validation and no capacity check are done here: callers are expected
 * to pass 0<=c<=0x10ffff, not a surrogate, and to have room for up to
 * four bytes.
 */
static U_INLINE uint8_t *
_appendUTF8(uint8_t *pDest, UChar32 c) {
    if(c>0xffff) {
        /* four bytes; values above 0x10ffff are not excluded here */
        pDest[0]=(uint8_t)((c>>18)|0xf0);
        pDest[1]=(uint8_t)(((c>>12)&0x3f)|0x80);
        pDest[2]=(uint8_t)(((c>>6)&0x3f)|0x80);
        pDest[3]=(uint8_t)((c&0x3f)|0x80);
        return pDest+4;
    }
    if(c>0x7ff) {
        /* three bytes */
        pDest[0]=(uint8_t)((c>>12)|0xe0);
        pDest[1]=(uint8_t)(((c>>6)&0x3f)|0x80);
        pDest[2]=(uint8_t)((c&0x3f)|0x80);
        return pDest+3;
    }
    if(c>0x7f) {
        /* two bytes */
        pDest[0]=(uint8_t)((c>>6)|0xc0);
        pDest[1]=(uint8_t)((c&0x3f)|0x80);
        return pDest+2;
    }
    /* single byte */
    pDest[0]=(uint8_t)c;
    return pDest+1;
}
|
|
|
|
|
|
|
|
|
2001-09-22 03:00:46 +00:00
|
|
|
U_CAPI char* U_EXPORT2
|
2006-06-15 19:22:04 +00:00
|
|
|
u_strToUTF8WithSub(char *dest,
|
2001-09-18 03:41:09 +00:00
|
|
|
int32_t destCapacity,
|
|
|
|
int32_t *pDestLength,
|
2006-06-15 19:22:04 +00:00
|
|
|
const UChar *pSrc,
|
2001-09-18 03:41:09 +00:00
|
|
|
int32_t srcLength,
|
2006-06-15 19:22:04 +00:00
|
|
|
UChar32 subchar, int32_t *pNumSubstitutions,
|
2001-09-18 03:41:09 +00:00
|
|
|
UErrorCode *pErrorCode){
|
|
|
|
|
|
|
|
int32_t reqLength=0;
|
2001-11-30 05:20:18 +00:00
|
|
|
uint32_t ch=0,ch2=0;
|
|
|
|
uint8_t *pDest = (uint8_t *)dest;
|
|
|
|
uint8_t *pDestLimit = pDest + destCapacity;
|
2006-06-15 19:22:04 +00:00
|
|
|
int32_t numSubstitutions;
|
2001-09-18 03:41:09 +00:00
|
|
|
|
|
|
|
/* args check */
|
2001-11-30 05:20:18 +00:00
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
2001-09-18 03:41:09 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
|
2006-06-15 19:22:04 +00:00
|
|
|
if( (pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
|
|
|
|
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
|
|
|
|
) {
|
2001-09-18 03:41:09 +00:00
|
|
|
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2006-06-15 19:22:04 +00:00
|
|
|
numSubstitutions=0;
|
|
|
|
|
2001-11-30 05:20:18 +00:00
|
|
|
if(srcLength==-1) {
|
2006-06-15 19:22:04 +00:00
|
|
|
while((ch=*pSrc)!=0) {
|
2001-11-30 05:20:18 +00:00
|
|
|
++pSrc;
|
2001-11-30 22:15:36 +00:00
|
|
|
if(ch <= 0x7f) {
|
2006-06-15 19:22:04 +00:00
|
|
|
if(pDest<pDestLimit) {
|
|
|
|
*pDest++ = (char)ch;
|
|
|
|
} else {
|
|
|
|
reqLength = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else if(ch <= 0x7ff) {
|
|
|
|
if((pDestLimit - pDest) >= 2) {
|
|
|
|
*pDest++=(uint8_t)((ch>>6)|0xc0);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else {
|
|
|
|
reqLength = 2;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else if(ch <= 0xd7ff || ch >= 0xe000) {
|
|
|
|
if((pDestLimit - pDest) >= 3) {
|
|
|
|
*pDest++=(uint8_t)((ch>>12)|0xe0);
|
|
|
|
*pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else {
|
|
|
|
reqLength = 3;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else /* ch is a surrogate */ {
|
|
|
|
int32_t length;
|
2001-11-30 22:15:36 +00:00
|
|
|
|
2006-06-15 19:22:04 +00:00
|
|
|
/*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
|
2002-07-13 00:46:18 +00:00
|
|
|
if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
|
|
|
|
++pSrc;
|
|
|
|
ch=UTF16_GET_PAIR_VALUE(ch, ch2);
|
2006-06-15 19:22:04 +00:00
|
|
|
} else if(subchar>=0) {
|
|
|
|
ch=subchar;
|
|
|
|
++numSubstitutions;
|
2002-07-13 00:46:18 +00:00
|
|
|
} else {
|
|
|
|
/* Unicode 3.2 forbids surrogate code points in UTF-8 */
|
|
|
|
*pErrorCode = U_INVALID_CHAR_FOUND;
|
|
|
|
return NULL;
|
|
|
|
}
|
2006-06-15 19:22:04 +00:00
|
|
|
|
|
|
|
length = U8_LENGTH(ch);
|
|
|
|
if((pDestLimit - pDest) >= length) {
|
|
|
|
/* convert and append*/
|
|
|
|
pDest=_appendUTF8(pDest, ch);
|
|
|
|
} else {
|
|
|
|
reqLength = length;
|
|
|
|
break;
|
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
}
|
2001-09-18 03:41:09 +00:00
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
while((ch=*pSrc++)!=0) {
|
2002-07-13 00:46:18 +00:00
|
|
|
if(ch<=0x7f) {
|
|
|
|
++reqLength;
|
|
|
|
} else if(ch<=0x7ff) {
|
|
|
|
reqLength+=2;
|
|
|
|
} else if(!UTF_IS_SURROGATE(ch)) {
|
|
|
|
reqLength+=3;
|
|
|
|
} else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
|
2001-11-30 05:20:18 +00:00
|
|
|
++pSrc;
|
|
|
|
reqLength+=4;
|
2006-06-15 19:22:04 +00:00
|
|
|
} else if(subchar>=0) {
|
|
|
|
reqLength+=U8_LENGTH(subchar);
|
|
|
|
++numSubstitutions;
|
2001-11-30 05:20:18 +00:00
|
|
|
} else {
|
2002-07-13 00:46:18 +00:00
|
|
|
/* Unicode 3.2 forbids surrogate code points in UTF-8 */
|
|
|
|
*pErrorCode = U_INVALID_CHAR_FOUND;
|
|
|
|
return NULL;
|
2001-11-30 05:20:18 +00:00
|
|
|
}
|
2001-09-18 03:41:09 +00:00
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
} else {
|
2006-06-15 19:22:04 +00:00
|
|
|
const UChar *pSrcLimit = pSrc+srcLength;
|
|
|
|
int32_t count;
|
|
|
|
|
|
|
|
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|
|
|
|
for(;;) {
|
|
|
|
/*
|
|
|
|
* Each iteration of the inner loop progresses by at most 3 UTF-8
|
|
|
|
* bytes and one UChar, for most characters.
|
|
|
|
* For supplementary code points (4 & 2), which are rare,
|
|
|
|
* there is an additional adjustment.
|
|
|
|
*/
|
|
|
|
count = (int32_t)((pDestLimit - pDest) / 3);
|
|
|
|
srcLength = (int32_t)(pSrcLimit - pSrc);
|
|
|
|
if(count > srcLength) {
|
|
|
|
count = srcLength; /* min(remaining dest/3, remaining src) */
|
|
|
|
}
|
|
|
|
if(count < 3) {
|
|
|
|
/*
|
|
|
|
* Too much overhead if we get near the end of the string,
|
|
|
|
* continue with the next loop.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
do {
|
|
|
|
ch=*pSrc++;
|
|
|
|
if(ch <= 0x7f) {
|
|
|
|
*pDest++ = (char)ch;
|
|
|
|
} else if(ch <= 0x7ff) {
|
|
|
|
*pDest++=(uint8_t)((ch>>6)|0xc0);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else if(ch <= 0xd7ff || ch >= 0xe000) {
|
|
|
|
*pDest++=(uint8_t)((ch>>12)|0xe0);
|
|
|
|
*pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else /* ch is a surrogate */ {
|
|
|
|
/*
|
|
|
|
* We will read two UChars and probably output four bytes,
|
|
|
|
* which we didn't account for with computing count,
|
|
|
|
* so we adjust it here.
|
|
|
|
*/
|
|
|
|
if(--count == 0) {
|
|
|
|
--pSrc; /* undo ch=*pSrc++ for the lead surrogate */
|
|
|
|
break; /* recompute count */
|
|
|
|
}
|
|
|
|
|
|
|
|
if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
|
|
|
|
++pSrc;
|
|
|
|
ch=UTF16_GET_PAIR_VALUE(ch, ch2);
|
|
|
|
|
|
|
|
/* writing 4 bytes per 2 UChars is ok */
|
|
|
|
*pDest++=(uint8_t)((ch>>18)|0xf0);
|
|
|
|
*pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
|
|
|
|
*pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else {
|
|
|
|
/* Unicode 3.2 forbids surrogate code points in UTF-8 */
|
|
|
|
if(subchar>=0) {
|
|
|
|
ch=subchar;
|
|
|
|
++numSubstitutions;
|
|
|
|
} else {
|
|
|
|
*pErrorCode = U_INVALID_CHAR_FOUND;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* convert and append*/
|
|
|
|
pDest=_appendUTF8(pDest, ch);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} while(--count > 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
while(pSrc<pSrcLimit) {
|
2001-11-30 05:20:18 +00:00
|
|
|
ch=*pSrc++;
|
2001-11-30 22:15:36 +00:00
|
|
|
if(ch <= 0x7f) {
|
2006-06-15 19:22:04 +00:00
|
|
|
if(pDest<pDestLimit) {
|
|
|
|
*pDest++ = (char)ch;
|
|
|
|
} else {
|
|
|
|
reqLength = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else if(ch <= 0x7ff) {
|
|
|
|
if((pDestLimit - pDest) >= 2) {
|
|
|
|
*pDest++=(uint8_t)((ch>>6)|0xc0);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else {
|
|
|
|
reqLength = 2;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else if(ch <= 0xd7ff || ch >= 0xe000) {
|
|
|
|
if((pDestLimit - pDest) >= 3) {
|
|
|
|
*pDest++=(uint8_t)((ch>>12)|0xe0);
|
|
|
|
*pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
|
|
|
|
*pDest++=(uint8_t)((ch&0x3f)|0x80);
|
|
|
|
} else {
|
|
|
|
reqLength = 3;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else /* ch is a surrogate */ {
|
|
|
|
int32_t length;
|
2001-11-30 22:15:36 +00:00
|
|
|
|
2002-07-13 00:46:18 +00:00
|
|
|
if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
|
|
|
|
++pSrc;
|
|
|
|
ch=UTF16_GET_PAIR_VALUE(ch, ch2);
|
2006-06-15 19:22:04 +00:00
|
|
|
} else if(subchar>=0) {
|
|
|
|
ch=subchar;
|
|
|
|
++numSubstitutions;
|
2002-07-13 00:46:18 +00:00
|
|
|
} else {
|
|
|
|
/* Unicode 3.2 forbids surrogate code points in UTF-8 */
|
|
|
|
*pErrorCode = U_INVALID_CHAR_FOUND;
|
|
|
|
return NULL;
|
|
|
|
}
|
2006-06-15 19:22:04 +00:00
|
|
|
|
|
|
|
length = U8_LENGTH(ch);
|
|
|
|
if((pDestLimit - pDest) >= length) {
|
|
|
|
/* convert and append*/
|
|
|
|
pDest=_appendUTF8(pDest, ch);
|
|
|
|
} else {
|
|
|
|
reqLength = length;
|
|
|
|
break;
|
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
}
|
2001-09-18 03:41:09 +00:00
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
while(pSrc<pSrcLimit) {
|
|
|
|
ch=*pSrc++;
|
2002-07-13 00:46:18 +00:00
|
|
|
if(ch<=0x7f) {
|
|
|
|
++reqLength;
|
|
|
|
} else if(ch<=0x7ff) {
|
|
|
|
reqLength+=2;
|
|
|
|
} else if(!UTF_IS_SURROGATE(ch)) {
|
|
|
|
reqLength+=3;
|
|
|
|
} else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
|
2001-11-30 05:20:18 +00:00
|
|
|
++pSrc;
|
|
|
|
reqLength+=4;
|
2006-06-15 19:22:04 +00:00
|
|
|
} else if(subchar>=0) {
|
|
|
|
reqLength+=U8_LENGTH(subchar);
|
|
|
|
++numSubstitutions;
|
2001-11-30 05:20:18 +00:00
|
|
|
} else {
|
2002-07-13 00:46:18 +00:00
|
|
|
/* Unicode 3.2 forbids surrogate code points in UTF-8 */
|
|
|
|
*pErrorCode = U_INVALID_CHAR_FOUND;
|
|
|
|
return NULL;
|
2001-11-30 05:20:18 +00:00
|
|
|
}
|
2001-09-18 03:41:09 +00:00
|
|
|
}
|
|
|
|
}
|
2001-11-30 05:20:18 +00:00
|
|
|
|
2006-06-15 19:22:04 +00:00
|
|
|
reqLength+=(int32_t)(pDest - (uint8_t *)dest);
|
|
|
|
|
|
|
|
if(pNumSubstitutions!=NULL) {
|
|
|
|
*pNumSubstitutions=numSubstitutions;
|
|
|
|
}
|
|
|
|
|
2001-09-18 03:41:09 +00:00
|
|
|
if(pDestLength){
|
|
|
|
*pDestLength = reqLength;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Terminate the buffer */
|
|
|
|
u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode);
|
|
|
|
|
2001-09-22 03:00:46 +00:00
|
|
|
return (char*)dest;
|
2001-09-18 03:41:09 +00:00
|
|
|
}
|
2006-06-15 19:22:04 +00:00
|
|
|
|
|
|
|
U_CAPI char* U_EXPORT2
|
|
|
|
u_strToUTF8(char *dest,
|
|
|
|
int32_t destCapacity,
|
|
|
|
int32_t *pDestLength,
|
|
|
|
const UChar *pSrc,
|
|
|
|
int32_t srcLength,
|
|
|
|
UErrorCode *pErrorCode){
|
|
|
|
return u_strToUTF8WithSub(
|
|
|
|
dest, destCapacity, pDestLength,
|
|
|
|
pSrc, srcLength,
|
|
|
|
U_SENTINEL, NULL,
|
|
|
|
pErrorCode);
|
|
|
|
}
|