ICU-7344 C functions for Java modified UTF-8

X-SVN-Rev: 27261
This commit is contained in:
Markus Scherer 2010-01-14 19:11:07 +00:00
parent e6e5208a8d
commit ad83876755
3 changed files with 977 additions and 19 deletions

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1998-2009, International Business Machines
* Copyright (C) 1998-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -1602,4 +1602,95 @@ u_strFromUTF32WithSub(UChar *dest,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode);
/**
* Convert a 16-bit Unicode string to Java Modified UTF-8.
* See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
*
* This function behaves according to the documentation for Java DataOutput.writeUTF()
* except that it does not encode the output length in the destination buffer
* and does not have an output length restriction.
* See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
*
* The input string need not be well-formed UTF-16.
* (Therefore there is no subchar parameter.)
* Each 16-bit unit is encoded on its own:
* U+0001..U+007F as one byte, U+0000 and U+0080..U+07FF as two bytes
* (U+0000 becomes 0xC0 0x80), and every other unit - including each
* surrogate, paired or not - as three bytes.
*
* @param dest A buffer for the result string. The result will be zero-terminated if
* the buffer is large enough.
* @param destCapacity The size of the buffer (number of chars). If it is 0, then
* dest may be NULL and the function will only return the length of the
* result without writing any of the result string (pre-flighting).
* @param pDestLength A pointer to receive the number of units written to the destination. If
* pDestLength!=NULL then *pDestLength is always set to the
* number of output units corresponding to the transformation of
* all the input units, even in case of a buffer overflow.
* It is not written when the function returns NULL.
* @param src The original source string
* @param srcLength The length of the original string. If -1, then src must be zero-terminated
* (and conversion stops at the first U+0000).
* @param pErrorCode Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return The pointer to destination buffer, or NULL if *pErrorCode indicated
* a failure on input or if an argument is illegal
* (U_ILLEGAL_ARGUMENT_ERROR).
* @draft ICU 4.4
* @see u_strToUTF8WithSub
* @see u_strFromJavaModifiedUTF8WithSub
*/
U_DRAFT char* U_EXPORT2
u_strToJavaModifiedUTF8(
char *dest,
int32_t destCapacity,
int32_t *pDestLength,
const UChar *src,
int32_t srcLength,
UErrorCode *pErrorCode);
/**
* Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
* If the input string is not well-formed, then each illegal sequence is
* replaced with subchar; if subchar is U_SENTINEL (negative), then the
* U_INVALID_CHAR_FOUND error code is set instead and NULL is returned.
*
* This function behaves according to the documentation for Java DataInput.readUTF()
* except that it takes a length parameter rather than
* interpreting the first two input bytes as the length.
* See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
*
* Only one-, two- and three-byte sequences are accepted; they are decoded
* without shortest-form or surrogate checks, so non-shortest forms
* (such as 0xC0 0x80 for U+0000) are allowed.
* The output string may not be well-formed UTF-16.
*
* @param dest A buffer for the result string. The result will be zero-terminated if
* the buffer is large enough.
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
* dest may be NULL and the function will only return the length of the
* result without writing any of the result string (pre-flighting).
* @param pDestLength A pointer to receive the number of units written to the destination. If
* pDestLength!=NULL then *pDestLength is always set to the
* number of output units corresponding to the transformation of
* all the input units, even in case of a buffer overflow.
* It is not written when the function returns NULL.
* @param src The original source string
* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
* @param subchar The substitution character to use in place of an illegal input sequence,
* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
* A substitution character can be any valid Unicode code point (up to U+10FFFF)
* except for surrogate code points (U+D800..U+DFFF).
* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
* Set to 0 if no substitutions occur or subchar<0.
* pNumSubstitutions can be NULL.
* @param pErrorCode Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return The pointer to destination buffer, or NULL if *pErrorCode indicated
* a failure on input, if an argument is illegal
* (U_ILLEGAL_ARGUMENT_ERROR), or if an illegal sequence was found
* while subchar<0 (U_INVALID_CHAR_FOUND).
* @see u_strFromUTF8WithSub
* @see u_strFromUTF8Lenient
* @see u_strToJavaModifiedUTF8
* @draft ICU 4.4
*/
U_DRAFT UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(
UChar *dest,
int32_t destCapacity,
int32_t *pDestLength,
const char *src,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode);
#endif

View File

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -396,7 +396,6 @@ u_strFromUTF8WithSub(UChar *dest,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
UChar32 ch;
@ -599,12 +598,7 @@ u_strFromUTF8WithSub(UChar *dest,
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=UTF16_LEAD(ch);
if(pDest<pDestLimit){
*(pDest++)=UTF16_TRAIL(ch);
}else{
reqLength++;
break;
}
*(pDest++)=UTF16_TRAIL(ch);
}
}
} while(--count > 0);
@ -659,7 +653,7 @@ u_strFromUTF8WithSub(UChar *dest,
}
}
}
/* donot fill the dest buffer just count the UChars needed */
/* do not fill the dest buffer just count the UChars needed */
while(pSrc < pSrcLimit){
ch = *pSrc;
if(ch <= 0x7f){
@ -738,7 +732,6 @@ u_strFromUTF8Lenient(UChar *dest,
const char *src,
int32_t srcLength,
UErrorCode *pErrorCode) {
UChar *pDest = dest;
UChar32 ch;
int32_t reqLength = 0;
@ -977,7 +970,6 @@ u_strToUTF8WithSub(char *dest,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
int32_t reqLength=0;
uint32_t ch=0,ch2=0;
uint8_t *pDest = (uint8_t *)dest;
@ -1006,7 +998,7 @@ u_strToUTF8WithSub(char *dest,
++pSrc;
if(ch <= 0x7f) {
if(pDest<pDestLimit) {
*pDest++ = (char)ch;
*pDest++ = (uint8_t)ch;
} else {
reqLength = 1;
break;
@ -1100,7 +1092,7 @@ u_strToUTF8WithSub(char *dest,
do {
ch=*pSrc++;
if(ch <= 0x7f) {
*pDest++ = (char)ch;
*pDest++ = (uint8_t)ch;
} else if(ch <= 0x7ff) {
*pDest++=(uint8_t)((ch>>6)|0xc0);
*pDest++=(uint8_t)((ch&0x3f)|0x80);
@ -1149,7 +1141,7 @@ u_strToUTF8WithSub(char *dest,
ch=*pSrc++;
if(ch <= 0x7f) {
if(pDest<pDestLimit) {
*pDest++ = (char)ch;
*pDest++ = (uint8_t)ch;
} else {
reqLength = 1;
break;
@ -1229,9 +1221,8 @@ u_strToUTF8WithSub(char *dest,
}
/* Terminate the buffer */
u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode);
return (char*)dest;
u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
return dest;
}
U_CAPI char* U_EXPORT2
@ -1247,3 +1238,397 @@ u_strToUTF8(char *dest,
U_SENTINEL, NULL,
pErrorCode);
}
/*
 * Convert Java Modified UTF-8 to UTF-16.
 *
 * Accepted input forms (no shortest-form or surrogate checks are made):
 *   00..7F                 -> one UChar
 *   C0..DF + 1 trail byte  -> one UChar
 *   E0..EF + 2 trail bytes -> one UChar
 * Anything else is an illegal sequence: it is replaced with subchar, or,
 * if subchar<0, the function sets U_INVALID_CHAR_FOUND and returns NULL.
 *
 * The work is split into three phases:
 *   1. a fast loop that runs while both buffers have enough slack that no
 *      per-unit limit checks are needed,
 *   2. a careful loop that checks both limits for every unit,
 *   3. a counting loop that only measures the output still required after
 *      dest has filled up (pre-flighting).
 *
 * Returns dest, or NULL if *pErrorCode was already a failure, an argument
 * is illegal, or an illegal sequence is found while subchar<0.
 * *pDestLength and *pNumSubstitutions (when non-NULL) receive the total
 * required length and the number of substitutions; *pDestLength is not
 * written on the NULL-returning paths.
 */
U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(
        UChar *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        const char *src,
        int32_t srcLength,
        UChar32 subchar, int32_t *pNumSubstitutions,
        UErrorCode *pErrorCode) {
    UChar *pDest = dest;
    UChar *pDestLimit;
    UChar32 ch;
    int32_t reqLength = 0;
    const uint8_t* pSrc = (const uint8_t*) src;
    const uint8_t *pSrcLimit;
    int32_t count;
    uint8_t t1, t2; /* trail bytes */
    int32_t numSubstitutions;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Only form the limit pointer once destCapacity is known to be valid. */
    pDestLimit = dest+destCapacity;

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength < 0) {
        /*
         * Transform a NUL-terminated ASCII string.
         * Handle non-ASCII strings with slower code.
         */
        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
            *pDest++=(UChar)ch;
            ++pSrc;
        }
        if(ch == 0) {
            /* The whole string was ASCII and fit: done. */
            reqLength=(int32_t)(pDest - dest);
            if(pDestLength) {
                *pDestLength = reqLength;
            }

            /* Terminate the buffer */
            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
            return dest;
        }
        /* Length of the part that has not been converted yet. */
        srcLength = (int32_t)uprv_strlen((const char *)pSrc);
    }

    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    pSrcLimit = pSrc + srcLength;
    for(;;) {
        count = (int32_t)(pDestLimit - pDest);
        srcLength = (int32_t)(pSrcLimit - pSrc);
        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
            /* fast ASCII loop */
            const uint8_t *prevSrc = pSrc;
            int32_t delta;
            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
                *pDest++=(UChar)ch;
                ++pSrc;
            }
            delta = (int32_t)(pSrc - prevSrc);
            count -= delta;
            srcLength -= delta;
        }
        /*
         * Each well-formed iteration of the inner loop progresses by at most
         * 3 UTF-8 bytes and one UChar.
         */
        srcLength /= 3;
        if(count > srcLength) {
            count = srcLength; /* min(remaining dest, remaining src/3) */
        }
        if(count < 3) {
            /*
             * Too much overhead if we get near the end of the string,
             * continue with the next loop.
             */
            break;
        }
        do {
            ch = *pSrc;
            if(ch <= 0x7f){
                *pDest++=(UChar)ch;
                ++pSrc;
            } else {
                if(ch >= 0xe0) {
                    if( /* handle U+0000..U+FFFF inline */
                        ch <= 0xef &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                        pSrc += 3;
                        continue;
                    }
                } else {
                    if( /* handle U+0000..U+07FF inline */
                        ch >= 0xc0 &&
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                        pSrc += 2;
                        continue;
                    }
                }

                if(subchar < 0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                } else if(subchar > 0xffff && --count == 0) {
                    /*
                     * We need to write two UChars, adjusted count for that,
                     * and ran out of space.
                     */
                    break;
                } else {
                    /* function call for error cases */
                    ++pSrc; /* continue after the lead byte */
                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                    ++numSubstitutions;
                    if(subchar<=0xFFFF) {
                        *(pDest++)=(UChar)subchar;
                    } else {
                        *(pDest++)=U16_LEAD(subchar);
                        *(pDest++)=U16_TRAIL(subchar);
                    }
                    /*
                     * The helper may skip more than 3 source bytes (for
                     * example a whole 4-byte sequence), which the
                     * 3-bytes-per-iteration budget in count does not cover.
                     * Leave the unchecked loop so that the outer loop
                     * recomputes the budget from the real pointers;
                     * otherwise a later iteration could read at or past
                     * pSrcLimit.
                     */
                    break;
                }
            }
        } while(--count > 0);
    }

    /* Careful loop: check both limits for every unit. */
    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
        ch = *pSrc;
        if(ch <= 0x7f){
            *pDest++=(UChar)ch;
            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    ((pSrcLimit - pSrc) >= 3) &&
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
                ) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
                    pSrc += 3;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    ((pSrcLimit - pSrc) >= 2) &&
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
                ) {
                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
                    pSrc += 2;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            } else {
                /* function call for error cases */
                ++pSrc; /* continue after the lead byte */
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                ++numSubstitutions;
                if(subchar<=0xFFFF) {
                    *(pDest++)=(UChar)subchar;
                } else {
                    *(pDest++)=U16_LEAD(subchar);
                    if(pDest<pDestLimit) {
                        *(pDest++)=U16_TRAIL(subchar);
                    } else {
                        /* No room for the trail surrogate: count it only. */
                        reqLength++;
                        break;
                    }
                }
            }
        }
    }

    /* do not fill the dest buffer just count the UChars needed */
    while(pSrc < pSrcLimit){
        ch = *pSrc;
        if(ch <= 0x7f) {
            reqLength++;
            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    ((pSrcLimit - pSrc) >= 3) &&
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    pSrc += 3;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    ((pSrcLimit - pSrc) >= 2) &&
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    pSrc += 2;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            } else {
                /* function call for error cases */
                ++pSrc; /* continue after the lead byte */
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                ++numSubstitutions;
                /*
                 * Count the substitution character that would be written,
                 * not the lead byte in ch: a supplementary subchar needs
                 * two UChars.
                 */
                reqLength+=U16_LENGTH(subchar);
            }
        }
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    /* Total = units counted after overflow + units actually written. */
    reqLength+=(int32_t)(pDest - dest);
    if(pDestLength) {
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
/*
 * Encode a UTF-16 string as Java Modified UTF-8.
 *
 * Every 16-bit unit is encoded independently:
 *   0001..007F        -> 1 byte
 *   0000, 0080..07FF  -> 2 bytes (so U+0000 becomes C0 80)
 *   0800..FFFF        -> 3 bytes (surrogates are not combined)
 *
 * Returns dest, or NULL if *pErrorCode was already a failure or an
 * argument is illegal. When pDestLength is non-NULL it receives the full
 * output length, including the part that did not fit into dest.
 */
U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(
        char *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        const UChar *src,
        int32_t srcLength,
        UErrorCode *pErrorCode) {
    uint8_t *out;
    uint8_t *outLimit;
    const UChar *srcLimit;
    uint32_t unit = 0;
    int32_t pending = 0;   /* output bytes required beyond those written */
    int32_t budget;

    /* Validate the arguments. */
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if( srcLength < -1 || (src == NULL && srcLength != 0) ||
        destCapacity < 0 || (dest == NULL && destCapacity != 0)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    out = (uint8_t *)dest;
    outLimit = out + destCapacity;

    if(srcLength == -1) {
        /* Copy the leading non-NUL ASCII run of a NUL-terminated string. */
        for(;;) {
            unit = *src;
            if(unit > 0x7f || unit == 0 || out >= outLimit) {
                break;
            }
            *out++ = (uint8_t)unit;
            ++src;
        }
        if(unit == 0) {
            /* Reached the terminator: everything was ASCII and fit. */
            pending = (int32_t)(out - (uint8_t *)dest);
            if(pDestLength) {
                *pDestLength = pending;
            }
            u_terminateChars(dest, destCapacity, pending, pErrorCode);
            return dest;
        }
        /* Measure what is left for the general code below. */
        srcLength = u_strlen(src);
    }

    /*
     * Unchecked phase: as long as there is room for at least three more
     * worst-case (3-byte) units, convert without per-unit limit checks.
     */
    srcLimit = src + srcLength;
    for(;;) {
        budget = (int32_t)(outLimit - out);
        srcLength = (int32_t)(srcLimit - src);
        if(budget >= srcLength && srcLength > 0 && *src <= 0x7f) {
            /* Run of non-NUL ASCII: one byte out per unit in. */
            const UChar *runStart = src;
            int32_t runLength;
            while(src < srcLimit && (unit = *src) <= 0x7f && unit != 0) {
                *out++ = (uint8_t)unit;
                ++src;
            }
            runLength = (int32_t)(src - runStart);
            budget -= runLength;
            srcLength -= runLength;
        }

        /* One iteration consumes one UChar and emits at most 3 bytes. */
        budget /= 3;
        if(budget > srcLength) {
            budget = srcLength;   /* min(remaining dest/3, remaining src) */
        }
        if(budget < 3) {
            /* Close to an end: let the checked loop finish the job. */
            break;
        }

        for(; budget > 0; --budget) {
            unit = *src++;
            if(unit > 0x7ff) {
                *out++ = (uint8_t)((unit >> 12) | 0xe0);
                *out++ = (uint8_t)(((unit >> 6) & 0x3f) | 0x80);
                *out++ = (uint8_t)((unit & 0x3f) | 0x80);
            } else if(unit > 0x7f || unit == 0) {
                *out++ = (uint8_t)((unit >> 6) | 0xc0);
                *out++ = (uint8_t)((unit & 0x3f) | 0x80);
            } else {
                *out++ = (uint8_t)unit;
            }
        }
    }

    /* Checked phase: verify the remaining capacity before each unit. */
    while(src < srcLimit) {
        int32_t needed;

        unit = *src++;
        if(unit > 0x7ff) {
            needed = 3;
        } else if(unit > 0x7f || unit == 0) {
            needed = 2;
        } else {
            needed = 1;
        }

        if((outLimit - out) < needed) {
            /* This unit does not fit: remember its size and only count from here on. */
            pending = needed;
            break;
        }

        switch(needed) {
        case 1:
            *out++ = (uint8_t)unit;
            break;
        case 2:
            *out++ = (uint8_t)((unit >> 6) | 0xc0);
            *out++ = (uint8_t)((unit & 0x3f) | 0x80);
            break;
        default:
            *out++ = (uint8_t)((unit >> 12) | 0xe0);
            *out++ = (uint8_t)(((unit >> 6) & 0x3f) | 0x80);
            *out++ = (uint8_t)((unit & 0x3f) | 0x80);
            break;
        }
    }

    /* Counting phase: measure the output for whatever input is left. */
    for(; src < srcLimit; ++src) {
        unit = *src;
        if(unit > 0x7ff) {
            pending += 3;
        } else if(unit > 0x7f || unit == 0) {
            pending += 2;
        } else {
            ++pending;
        }
    }

    /* Total length = bytes only counted + bytes actually written. */
    pending += (int32_t)(out - (uint8_t *)dest);
    if(pDestLength) {
        *pDestLength = pending;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, pending, pErrorCode);
    return dest;
}

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2009, International Business Machines Corporation and
* Copyright (c) 2001-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -17,6 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ures.h"
@ -40,6 +41,8 @@ static void Test_FromUTF8Lenient(void);
static void Test_UChar_WCHART_API(void);
static void Test_widestrs(void);
static void Test_WCHART_LongString(void);
static void Test_strToJavaModifiedUTF8(void);
static void Test_strFromJavaModifiedUTF8(void);
void
addUCharTransformTest(TestNode** root)
@ -56,6 +59,8 @@ addUCharTransformTest(TestNode** root)
#if !UCONFIG_NO_FILE_IO
addTest(root, &Test_WCHART_LongString, "custrtrn/Test_WCHART_LongString");
#endif
addTest(root, &Test_strToJavaModifiedUTF8, "custrtrn/Test_strToJavaModifiedUTF8");
addTest(root, &Test_strFromJavaModifiedUTF8, "custrtrn/Test_strFromJavaModifiedUTF8");
}
static const UChar32 src32[]={
@ -1473,3 +1478,480 @@ Test_WCHART_LongString(){
#endif
}
/*
 * Checks u_strToJavaModifiedUTF8() with a mixed input (ASCII, 2- and 3-byte
 * characters, paired and unpaired surrogates, an embedded U+0000),
 * with exact-fit, overflowing and pre-flighting buffers,
 * with NUL-terminated input, and with illegal arguments.
 */
static void Test_strToJavaModifiedUTF8() {
    static const UChar utf16Input[]={
        0x61, 0x62, 0x63, 0xe1, 0xe2, 0xe3,
        0xe01, 0xe02, 0xe03, 0xe001, 0xe002, 0xe003,
        0xd800, 0xdc00, 0xdc00, 0xd800, 0,
        0xdbff, 0xdfff,
        0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0xed, 0xe0e, 0x6f
    };
    static const uint8_t javaBytes[]={
        0x61, 0x62, 0x63, 0xc3, 0xa1, 0xc3, 0xa2, 0xc3, 0xa3,
        0xe0, 0xb8, 0x81, 0xe0, 0xb8, 0x82, 0xe0, 0xb8, 0x83,
        0xee, 0x80, 0x81, 0xee, 0x80, 0x82, 0xee, 0x80, 0x83,
        0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xc0, 0x80,
        0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf,
        0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0xc3, 0xad, 0xe0, 0xb8, 0x8e, 0x6f
    };
    static const UChar briefInput[]={
        0xe01, 0xe1, 0x61
    };
    static const uint8_t briefBytes[]={
        0xe0, 0xb8, 0x81, 0xc3, 0xa1, 0x61
    };
    static const UChar asciiInput[]={
        0x61, 0x62, 0x63, 0
    };
    static const uint8_t asciiBytes[]={
        0x61, 0x62, 0x63
    };
    char buffer[200];
    char *result;
    int32_t outLength, lengthUpToNul;
    UErrorCode status;

    /* Output length for the part of the input before its embedded U+0000 (encoded as C0 80). */
    lengthUpToNul=(int32_t)(strstr((const char *)javaBytes, "\xc0\x80")-
                            (const char *)javaBytes);

    /* Plenty of room, explicit source length. */
    status=U_ZERO_ERROR;
    outLength=-5;
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          outLength==LENGTHOF(javaBytes) && 0==memcmp(buffer, javaBytes, outLength) &&
          buffer[outLength]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(normal) failed - %s\n", u_errorName(status));
    }

    /* Same, but without a length output. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), NULL,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          0==memcmp(buffer, javaBytes, LENGTHOF(javaBytes)) &&
          buffer[LENGTHOF(javaBytes)]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(normal, pLength=NULL) failed - %s\n", u_errorName(status));
    }

    /* The buffer fits the output exactly, leaving no room for the terminator. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, LENGTHOF(javaBytes), &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!( status==U_STRING_NOT_TERMINATED_WARNING && result==buffer &&
          outLength==LENGTHOF(javaBytes) && 0==memcmp(buffer, javaBytes, outLength) &&
          buffer[outLength]==(char)0xff
    )) {
        log_err("u_strToJavaModifiedUTF8(tight) failed - %s\n", u_errorName(status));
    }

    /* srcLength=-1: conversion stops at the embedded U+0000. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), &outLength,
                                   utf16Input, -1, &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          outLength==lengthUpToNul && 0==memcmp(buffer, javaBytes, outLength) &&
          buffer[outLength]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(NUL-terminated) failed - %s\n", u_errorName(status));
    }

    /* srcLength=-1 without a length output. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), NULL,
                                   utf16Input, -1, &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          0==memcmp(buffer, javaBytes, lengthUpToNul) &&
          buffer[lengthUpToNul]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(NUL-terminated, pLength=NULL) failed - %s\n", u_errorName(status));
    }

    /* Buffer half as large as needed: overflow, but full length reported. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, LENGTHOF(javaBytes)/2, &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!( status==U_BUFFER_OVERFLOW_ERROR &&
          outLength==LENGTHOF(javaBytes) && buffer[LENGTHOF(javaBytes)/2]==(char)0xff
    )) {
        log_err("u_strToJavaModifiedUTF8(overflow) failed - %s\n", u_errorName(status));
    }

    /* No buffer at all: only the length is computed. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(NULL, 0, &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!( status==U_BUFFER_OVERFLOW_ERROR &&
          outLength==LENGTHOF(javaBytes) && buffer[0]==(char)0xff
    )) {
        log_err("u_strToJavaModifiedUTF8(pure preflighting) failed - %s\n", u_errorName(status));
    }

    /* A very short input. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), &outLength,
                                   briefInput, LENGTHOF(briefInput), &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          outLength==LENGTHOF(briefBytes) && 0==memcmp(buffer, briefBytes, outLength) &&
          buffer[outLength]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(short) failed - %s\n", u_errorName(status));
    }

    /* Pure ASCII, NUL-terminated. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), &outLength,
                                   asciiInput, -1, &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          outLength==LENGTHOF(asciiBytes) && 0==memcmp(buffer, asciiBytes, outLength) &&
          buffer[outLength]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(asciiNul) failed - %s\n", u_errorName(status));
    }

    /* Empty input given as NULL with length 0. */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, (int32_t)sizeof(buffer), &outLength,
                                   NULL, 0, &status);
    if(!( !U_FAILURE(status) && result==buffer &&
          outLength==0 && buffer[0]==0
    )) {
        log_err("u_strToJavaModifiedUTF8(empty) failed - %s\n", u_errorName(status));
    }

    /* illegal arguments: each must fail without touching the buffer */

    /* NULL destination with a non-zero capacity */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(NULL, sizeof(buffer), &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!(status==U_ILLEGAL_ARGUMENT_ERROR && buffer[0]==(char)0xff)) {
        log_err("u_strToJavaModifiedUTF8(dest=NULL) failed - %s\n", u_errorName(status));
    }

    /* negative capacity */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, -1, &outLength,
                                   utf16Input, LENGTHOF(utf16Input), &status);
    if(!(status==U_ILLEGAL_ARGUMENT_ERROR && buffer[0]==(char)0xff)) {
        log_err("u_strToJavaModifiedUTF8(destCapacity<0) failed - %s\n", u_errorName(status));
    }

    /* NULL source with a positive length */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, sizeof(buffer), &outLength,
                                   NULL, LENGTHOF(utf16Input), &status);
    if(!(status==U_ILLEGAL_ARGUMENT_ERROR && buffer[0]==(char)0xff)) {
        log_err("u_strToJavaModifiedUTF8(src=NULL) failed - %s\n", u_errorName(status));
    }

    /* NULL source claimed to be NUL-terminated */
    status=U_ZERO_ERROR;
    outLength=-5;
    memset(buffer, 0xff, sizeof(buffer));
    result=u_strToJavaModifiedUTF8(buffer, sizeof(buffer), &outLength,
                                   NULL, -1, &status);
    if(!(status==U_ILLEGAL_ARGUMENT_ERROR && buffer[0]==(char)0xff)) {
        log_err("u_strToJavaModifiedUTF8(src=NULL, srcLength<0) failed - %s\n", u_errorName(status));
    }
}
static void Test_strFromJavaModifiedUTF8() {
static const uint8_t src[]={
0x61, 0x62, 0x63, 0xc3, 0xa1, 0xc3, 0xa2, 0xc3, 0xa3,
0xe0, 0xb8, 0x81, 0xe0, 0xb8, 0x82, 0xe0, 0xb8, 0x83,
0xee, 0x80, 0x81, 0xee, 0x80, 0x82, 0xee, 0x80, 0x83,
0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0,
0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf,
0x81, 0xc0, 0xe0, 0xb8, 0xf0, 0x90, 0x80, 0x80, /* invalid sequences */
0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
0xe0, 0x81, 0xac, 0xe0, 0x83, 0xad, /* non-shortest forms are allowed */
0xe0, 0xb8, 0x8e, 0x6f
};
static const UChar expected[]={
0x61, 0x62, 0x63, 0xe1, 0xe2, 0xe3,
0xe01, 0xe02, 0xe03, 0xe001, 0xe002, 0xe003,
0xd800, 0xdc00, 0xdc00, 0xd800, 0,
0xdbff, 0xdfff,
0xfffd, 0xfffd, 0xfffd, 0xfffd,
0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
0x6c, 0xed,
0xe0e, 0x6f
};
static const uint8_t shortSrc[]={
0xe0, 0xb8, 0x81, 0xc3, 0xa1, 0x61
};
static const UChar shortExpected[]={
0xe01, 0xe1, 0x61
};
static const uint8_t asciiNul[]={
0x61, 0x62, 0x63, 0
};
static const UChar asciiNulExpected[]={
0x61, 0x62, 0x63
};
static const uint8_t invalid[]={
0x81, 0xc0, 0xe0, 0xb8, 0xf0, 0x90, 0x80, 0x80
};
static const UChar invalidExpectedFFFD[]={
0xfffd, 0xfffd, 0xfffd, 0xfffd
};
static const UChar invalidExpected50000[]={
0xd900, 0xdc00, 0xd900, 0xdc00, 0xd900, 0xdc00, 0xd900, 0xdc00
};
UChar dest[200];
UChar *p;
int32_t length, expectedTerminatedLength;
int32_t numSubstitutions;
UErrorCode errorCode;
expectedTerminatedLength=(int32_t)(u_strchr(expected, 0)-expected);
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(expected) || 0!=memcmp(dest, expected, length) ||
dest[length]!=0 ||
numSubstitutions!=LENGTHOF(invalidExpectedFFFD)
) {
log_err("u_strFromJavaModifiedUTF8WithSub(normal) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), NULL,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
0!=memcmp(dest, expected, LENGTHOF(expected)) ||
dest[LENGTHOF(expected)]!=0 ||
numSubstitutions!=LENGTHOF(invalidExpectedFFFD)
) {
log_err("u_strFromJavaModifiedUTF8WithSub(normal, pLength=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
0xfffd, NULL, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(expected) || 0!=memcmp(dest, expected, length) ||
dest[length]!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(normal, pNumSubstitutions=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, LENGTHOF(expected), &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if( errorCode!=U_STRING_NOT_TERMINATED_WARNING || p!=dest ||
length!=LENGTHOF(expected) || 0!=memcmp(dest, expected, length) ||
dest[length]!=0xffff ||
numSubstitutions!=LENGTHOF(invalidExpectedFFFD)
) {
log_err("u_strFromJavaModifiedUTF8WithSub(tight) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)src, -1,
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=expectedTerminatedLength || 0!=memcmp(dest, expected, length) ||
dest[length]!=0 ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(NUL-terminated) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), NULL,
(const char *)src, -1,
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
0!=memcmp(dest, expected, expectedTerminatedLength) ||
dest[expectedTerminatedLength]!=0 ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(NUL-terminated, pLength=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)src, -1,
0xfffd, NULL, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=expectedTerminatedLength || 0!=memcmp(dest, expected, length) ||
dest[length]!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(NUL-terminated, pNumSubstitutions=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, LENGTHOF(expected)/2, &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if( errorCode!=U_BUFFER_OVERFLOW_ERROR ||
length!=LENGTHOF(expected) || dest[LENGTHOF(expected)/2]!=0xffff
) {
log_err("u_strFromJavaModifiedUTF8WithSub(overflow) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(NULL, 0, &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if( errorCode!=U_BUFFER_OVERFLOW_ERROR ||
length!=LENGTHOF(expected) || dest[0]!=0xffff
) {
log_err("u_strFromJavaModifiedUTF8WithSub(pure preflighting) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)shortSrc, LENGTHOF(shortSrc),
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(shortExpected) || 0!=memcmp(dest, shortExpected, length) ||
dest[length]!=0 ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(short) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)asciiNul, -1,
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(asciiNulExpected) || 0!=memcmp(dest, asciiNulExpected, length) ||
dest[length]!=0 ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(asciiNul) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
NULL, 0, 0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=0 || dest[0]!=0 ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(empty) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)invalid, LENGTHOF(invalid),
0xfffd, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(invalidExpectedFFFD) || 0!=memcmp(dest, invalidExpectedFFFD, length) ||
dest[length]!=0 ||
numSubstitutions!=LENGTHOF(invalidExpectedFFFD)
) {
log_err("u_strFromJavaModifiedUTF8WithSub(invalid->fffd) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)invalid, LENGTHOF(invalid),
0x50000, &numSubstitutions, &errorCode);
if( U_FAILURE(errorCode) || p!=dest ||
length!=LENGTHOF(invalidExpected50000) || 0!=memcmp(dest, invalidExpected50000, length) ||
dest[length]!=0 ||
numSubstitutions!=LENGTHOF(invalidExpectedFFFD) /* not ...50000 */
) {
log_err("u_strFromJavaModifiedUTF8WithSub(invalid->50000) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)invalid, LENGTHOF(invalid),
U_SENTINEL, &numSubstitutions, &errorCode);
if(errorCode!=U_INVALID_CHAR_FOUND || dest[0]!=0xffff || numSubstitutions!=0) {
log_err("u_strFromJavaModifiedUTF8WithSub(invalid->error) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, (int32_t)sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
U_SENTINEL, &numSubstitutions, &errorCode);
if( errorCode!=U_INVALID_CHAR_FOUND ||
length>=LENGTHOF(expected) || dest[LENGTHOF(expected)-1]!=0xffff ||
numSubstitutions!=0
) {
log_err("u_strFromJavaModifiedUTF8WithSub(normal->error) failed - %s\n", u_errorName(errorCode));
}
/* illegal arguments */
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(NULL, sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(dest=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, -1, &length,
(const char *)src, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(destCapacity<0) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, sizeof(dest), &length,
NULL, LENGTHOF(src),
0xfffd, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(src=NULL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, sizeof(dest), &length,
NULL, -1, 0xfffd, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(src=NULL, srcLength<0) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
0x110000, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(subchar=U_SENTINEL) failed - %s\n", u_errorName(errorCode));
}
memset(dest, 0xff, sizeof(dest));
errorCode=U_ZERO_ERROR;
length=numSubstitutions=-5;
p=u_strFromJavaModifiedUTF8WithSub(dest, sizeof(dest), &length,
(const char *)src, LENGTHOF(src),
0xdfff, &numSubstitutions, &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0xffff) {
log_err("u_strFromJavaModifiedUTF8WithSub(subchar is surrogate) failed - %s\n", u_errorName(errorCode));
}
}