ICU-4223 do not write the UTF-16/32 BOM if there is no text to convert
X-SVN-Rev: 17866
This commit is contained in:
parent
a2e0e46710
commit
26091e6ae6
@ -23,6 +23,10 @@
|
||||
#include "ucnv_cnv.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
/*
 * State value stored in UConverter.fromUnicodeStatus by the UTF-16 converter.
 * _UTF16Reset() sets it when the fromUnicode direction is reset; the
 * fromUnicode functions test for it, write the byte order mark via
 * ucnv_fromUWriteBytes(), and then clear the status to 0.
 */
enum {
|
||||
UCNV_NEED_TO_WRITE_BOM=1
|
||||
};
|
||||
|
||||
/* UTF-16BE ----------------------------------------------------------------- */
|
||||
|
||||
#if U_IS_BIG_ENDIAN
|
||||
@ -39,7 +43,7 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
uint8_t *target;
|
||||
int32_t *offsets;
|
||||
|
||||
int32_t targetCapacity, length, count, sourceIndex;
|
||||
int32_t targetCapacity, length, sourceIndex;
|
||||
UChar c, trail;
|
||||
char overflow[4];
|
||||
|
||||
@ -50,13 +54,25 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
return;
|
||||
}
|
||||
|
||||
cnv=pArgs->converter;
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ (char)0xfe, (char)0xff };
|
||||
ucnv_fromUWriteBytes(cnv,
|
||||
bom, 2,
|
||||
&pArgs->target, pArgs->targetLimit,
|
||||
&pArgs->offsets, -1,
|
||||
pErrorCode);
|
||||
cnv->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
|
||||
if(targetCapacity<=0) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
cnv=pArgs->converter;
|
||||
target=(uint8_t *)pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
sourceIndex=0;
|
||||
@ -83,13 +99,13 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
cnv->fromUChar32=c=0;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*length;
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
if(c==0) {
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
int32_t count=2*length;
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
targetCapacity-=count;
|
||||
count>>=1;
|
||||
length-=count;
|
||||
@ -581,7 +597,7 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
uint8_t *target;
|
||||
int32_t *offsets;
|
||||
|
||||
int32_t targetCapacity, length, count, sourceIndex;
|
||||
int32_t targetCapacity, length, sourceIndex;
|
||||
UChar c, trail;
|
||||
char overflow[4];
|
||||
|
||||
@ -592,13 +608,25 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
return;
|
||||
}
|
||||
|
||||
cnv=pArgs->converter;
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ (char)0xff, (char)0xfe };
|
||||
ucnv_fromUWriteBytes(cnv,
|
||||
bom, 2,
|
||||
&pArgs->target, pArgs->targetLimit,
|
||||
&pArgs->offsets, -1,
|
||||
pErrorCode);
|
||||
cnv->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
|
||||
if(targetCapacity<=0) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
cnv=pArgs->converter;
|
||||
target=(uint8_t *)pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
sourceIndex=0;
|
||||
@ -625,13 +653,13 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
cnv->fromUChar32=c=0;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*length;
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
if(c==0) {
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
int32_t count=2*length;
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
targetCapacity-=count;
|
||||
count>>=1;
|
||||
length-=count;
|
||||
@ -1144,14 +1172,7 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
|
||||
}
|
||||
if(choice!=UCNV_RESET_TO_UNICODE) {
|
||||
/* reset fromUnicode: prepare to output the UTF-16PE BOM */
|
||||
cnv->charErrorBufferLength=2;
|
||||
#if U_IS_BIG_ENDIAN
|
||||
cnv->charErrorBuffer[0]=0xfe;
|
||||
cnv->charErrorBuffer[1]=0xff;
|
||||
#else
|
||||
cnv->charErrorBuffer[0]=0xff;
|
||||
cnv->charErrorBuffer[1]=0xfe;
|
||||
#endif
|
||||
cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,6 +34,10 @@
|
||||
/* -SURROGATE_LOW_START + HALF_BASE */
|
||||
#define SURROGATE_LOW_BASE 9216
|
||||
|
||||
/*
 * State value stored in UConverter.fromUnicodeStatus by the UTF-32 converter.
 * _UTF32Reset() sets it when the fromUnicode direction is reset; the
 * fromUnicode functions return early on empty input, otherwise test for this
 * value, write the 4-byte byte order mark via ucnv_fromUWriteBytes(), and
 * then clear the status to 0.
 */
enum {
|
||||
UCNV_NEED_TO_WRITE_BOM=1
|
||||
};
|
||||
|
||||
/* UTF-32BE ----------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
@ -204,13 +208,30 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
unsigned char *myTarget;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
||||
UChar32 ch, ch2;
|
||||
unsigned int indexToWrite;
|
||||
unsigned char temp[sizeof(uint32_t)];
|
||||
|
||||
if(mySource >= sourceLimit) {
|
||||
/* no input, nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
|
||||
ucnv_fromUWriteBytes(args->converter,
|
||||
bom, 4,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, -1,
|
||||
err);
|
||||
args->converter->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
myTarget = (unsigned char *) args->target;
|
||||
temp[0] = 0;
|
||||
|
||||
if (args->converter->fromUChar32) {
|
||||
@ -288,8 +309,8 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
int32_t *myOffsets = args->offsets;
|
||||
unsigned char *myTarget;
|
||||
int32_t *myOffsets;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
||||
UChar32 ch, ch2;
|
||||
@ -297,6 +318,24 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
|
||||
unsigned int indexToWrite;
|
||||
unsigned char temp[sizeof(uint32_t)];
|
||||
|
||||
if(mySource >= sourceLimit) {
|
||||
/* no input, nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
|
||||
ucnv_fromUWriteBytes(args->converter,
|
||||
bom, 4,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, -1,
|
||||
err);
|
||||
args->converter->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
myTarget = (unsigned char *) args->target;
|
||||
myOffsets = args->offsets;
|
||||
temp[0] = 0;
|
||||
|
||||
if (args->converter->fromUChar32) {
|
||||
@ -645,13 +684,30 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
unsigned char *myTarget;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
||||
UChar32 ch, ch2;
|
||||
unsigned int indexToWrite;
|
||||
unsigned char temp[sizeof(uint32_t)];
|
||||
|
||||
if(mySource >= sourceLimit) {
|
||||
/* no input, nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
|
||||
ucnv_fromUWriteBytes(args->converter,
|
||||
bom, 4,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, -1,
|
||||
err);
|
||||
args->converter->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
myTarget = (unsigned char *) args->target;
|
||||
temp[3] = 0;
|
||||
|
||||
if (args->converter->fromUChar32)
|
||||
@ -737,8 +793,8 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
int32_t *myOffsets = args->offsets;
|
||||
unsigned char *myTarget;
|
||||
int32_t *myOffsets;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
||||
UChar32 ch, ch2;
|
||||
@ -746,6 +802,24 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
|
||||
unsigned char temp[sizeof(uint32_t)];
|
||||
int32_t offsetNum = 0;
|
||||
|
||||
if(mySource >= sourceLimit) {
|
||||
/* no input, nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
|
||||
static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
|
||||
ucnv_fromUWriteBytes(args->converter,
|
||||
bom, 4,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, -1,
|
||||
err);
|
||||
args->converter->fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
myTarget = (unsigned char *) args->target;
|
||||
myOffsets = args->offsets;
|
||||
temp[3] = 0;
|
||||
|
||||
if (args->converter->fromUChar32)
|
||||
@ -948,18 +1022,7 @@ _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
|
||||
}
|
||||
if(choice!=UCNV_RESET_TO_UNICODE) {
|
||||
/* reset fromUnicode: prepare to output the UTF-32PE BOM */
|
||||
cnv->charErrorBufferLength=4;
|
||||
#if U_IS_BIG_ENDIAN
|
||||
cnv->charErrorBuffer[0]=0;
|
||||
cnv->charErrorBuffer[1]=0;
|
||||
cnv->charErrorBuffer[2]=0xfe;
|
||||
cnv->charErrorBuffer[3]=0xff;
|
||||
#else
|
||||
cnv->charErrorBuffer[0]=0xff;
|
||||
cnv->charErrorBuffer[1]=0xfe;
|
||||
cnv->charErrorBuffer[2]=0;
|
||||
cnv->charErrorBuffer[3]=0;
|
||||
#endif
|
||||
cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
|
||||
}
|
||||
}
|
||||
|
||||
|
27
icu4c/source/test/testdata/conversion.txt
vendored
27
icu4c/source/test/testdata/conversion.txt
vendored
@ -445,12 +445,21 @@ conversion {
|
||||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// Improve ucnv_ext.c code coverage:
|
||||
// There will be a partial match up to the lead surrogate of U+603ff
|
||||
// which then results in one more unit in the prefetch buffer
|
||||
// than the match length when converting one code unit at a time.
|
||||
// See ucnv_extContinueMatchFromU() comment
|
||||
// "the match did not use all of preFromU[] - keep the rest for replay"
|
||||
// UTF-16/32: do not output a BOM if there is no data at all
|
||||
{
|
||||
"UTF-16",
|
||||
"",
|
||||
:bin{ "" },
|
||||
:intvector{ },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
{
|
||||
"UTF-32",
|
||||
"",
|
||||
:bin{ "" },
|
||||
:intvector{ },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
|
||||
// do not convert SO/SI/ESC
|
||||
{
|
||||
@ -491,6 +500,12 @@ conversion {
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
|
||||
// Improve ucnv_ext.c code coverage:
|
||||
// There will be a partial match up to the lead surrogate of U+603ff
|
||||
// which then results in one more unit in the prefetch buffer
|
||||
// than the match length when converting one code unit at a time.
|
||||
// See ucnv_extContinueMatchFromU() comment
|
||||
// "the match did not use all of preFromU[] - keep the rest for replay"
|
||||
{
|
||||
"*test3",
|
||||
"\U00101234\U00101234\U00050005\U000603ff",
|
||||
|
Loading…
Reference in New Issue
Block a user