ICU-4223 do not write the UTF-16/32 BOM if there is no text to convert

X-SVN-Rev: 17866
This commit is contained in:
Markus Scherer 2005-06-12 21:15:18 +00:00
parent a2e0e46710
commit 26091e6ae6
3 changed files with 147 additions and 48 deletions

View File

@ -23,6 +23,10 @@
#include "ucnv_cnv.h"
#include "cmemory.h"
/*
 * Value for UConverter.fromUnicodeStatus:
 * _UTF16Reset() stores it when the fromUnicode direction is reset.
 * The UTF-16 fromUnicode functions in this file test for it once they
 * know that there is input to convert, write the BOM,
 * and then set fromUnicodeStatus back to 0.
 */
enum {
UCNV_NEED_TO_WRITE_BOM=1
};
/* UTF-16BE ----------------------------------------------------------------- */
#if U_IS_BIG_ENDIAN
@ -39,7 +43,7 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
uint8_t *target;
int32_t *offsets;
int32_t targetCapacity, length, count, sourceIndex;
int32_t targetCapacity, length, sourceIndex;
UChar c, trail;
char overflow[4];
@ -50,13 +54,25 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
return;
}
cnv=pArgs->converter;
/* write the BOM if necessary */
if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ (char)0xfe, (char)0xff };
ucnv_fromUWriteBytes(cnv,
bom, 2,
&pArgs->target, pArgs->targetLimit,
&pArgs->offsets, -1,
pErrorCode);
cnv->fromUnicodeStatus=0;
}
targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
if(targetCapacity<=0) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
cnv=pArgs->converter;
target=(uint8_t *)pArgs->target;
offsets=pArgs->offsets;
sourceIndex=0;
@ -83,13 +99,13 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
cnv->fromUChar32=c=0;
}
/* copy an even number of bytes for complete UChars */
count=2*length;
if(count>targetCapacity) {
count=targetCapacity&~1;
}
/* count is even */
if(c==0) {
/* copy an even number of bytes for complete UChars */
int32_t count=2*length;
if(count>targetCapacity) {
count=targetCapacity&~1;
}
/* count is even */
targetCapacity-=count;
count>>=1;
length-=count;
@ -581,7 +597,7 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
uint8_t *target;
int32_t *offsets;
int32_t targetCapacity, length, count, sourceIndex;
int32_t targetCapacity, length, sourceIndex;
UChar c, trail;
char overflow[4];
@ -592,13 +608,25 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
return;
}
cnv=pArgs->converter;
/* write the BOM if necessary */
if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ (char)0xff, (char)0xfe };
ucnv_fromUWriteBytes(cnv,
bom, 2,
&pArgs->target, pArgs->targetLimit,
&pArgs->offsets, -1,
pErrorCode);
cnv->fromUnicodeStatus=0;
}
targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
if(targetCapacity<=0) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
cnv=pArgs->converter;
target=(uint8_t *)pArgs->target;
offsets=pArgs->offsets;
sourceIndex=0;
@ -625,13 +653,13 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
cnv->fromUChar32=c=0;
}
/* copy an even number of bytes for complete UChars */
count=2*length;
if(count>targetCapacity) {
count=targetCapacity&~1;
}
/* count is even */
if(c==0) {
/* copy an even number of bytes for complete UChars */
int32_t count=2*length;
if(count>targetCapacity) {
count=targetCapacity&~1;
}
/* count is even */
targetCapacity-=count;
count>>=1;
length-=count;
@ -1144,14 +1172,7 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
}
if(choice!=UCNV_RESET_TO_UNICODE) {
/* reset fromUnicode: prepare to output the UTF-16PE BOM */
cnv->charErrorBufferLength=2;
#if U_IS_BIG_ENDIAN
cnv->charErrorBuffer[0]=0xfe;
cnv->charErrorBuffer[1]=0xff;
#else
cnv->charErrorBuffer[0]=0xff;
cnv->charErrorBuffer[1]=0xfe;
#endif
cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
}
}

View File

@ -34,6 +34,10 @@
/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE 9216
/*
 * Value for UConverter.fromUnicodeStatus:
 * _UTF32Reset() stores it when the fromUnicode direction is reset.
 * The UTF-32 fromUnicode functions in this file return early when there
 * is no input; otherwise they test for this value, write the 4-byte BOM,
 * and then set fromUnicodeStatus back to 0.
 */
enum {
UCNV_NEED_TO_WRITE_BOM=1
};
/* UTF-32BE ----------------------------------------------------------------- */
static void
@ -204,13 +208,30 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
unsigned char *myTarget;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UChar32 ch, ch2;
unsigned int indexToWrite;
unsigned char temp[sizeof(uint32_t)];
if(mySource >= sourceLimit) {
/* no input, nothing to do */
return;
}
/* write the BOM if necessary */
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
ucnv_fromUWriteBytes(args->converter,
bom, 4,
&args->target, args->targetLimit,
&args->offsets, -1,
err);
args->converter->fromUnicodeStatus=0;
}
myTarget = (unsigned char *) args->target;
temp[0] = 0;
if (args->converter->fromUChar32) {
@ -288,8 +309,8 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
int32_t *myOffsets = args->offsets;
unsigned char *myTarget;
int32_t *myOffsets;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UChar32 ch, ch2;
@ -297,6 +318,24 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
unsigned int indexToWrite;
unsigned char temp[sizeof(uint32_t)];
if(mySource >= sourceLimit) {
/* no input, nothing to do */
return;
}
/* write the BOM if necessary */
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
ucnv_fromUWriteBytes(args->converter,
bom, 4,
&args->target, args->targetLimit,
&args->offsets, -1,
err);
args->converter->fromUnicodeStatus=0;
}
myTarget = (unsigned char *) args->target;
myOffsets = args->offsets;
temp[0] = 0;
if (args->converter->fromUChar32) {
@ -645,13 +684,30 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
unsigned char *myTarget;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UChar32 ch, ch2;
unsigned int indexToWrite;
unsigned char temp[sizeof(uint32_t)];
if(mySource >= sourceLimit) {
/* no input, nothing to do */
return;
}
/* write the BOM if necessary */
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
ucnv_fromUWriteBytes(args->converter,
bom, 4,
&args->target, args->targetLimit,
&args->offsets, -1,
err);
args->converter->fromUnicodeStatus=0;
}
myTarget = (unsigned char *) args->target;
temp[3] = 0;
if (args->converter->fromUChar32)
@ -737,8 +793,8 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
int32_t *myOffsets = args->offsets;
unsigned char *myTarget;
int32_t *myOffsets;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UChar32 ch, ch2;
@ -746,6 +802,24 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
unsigned char temp[sizeof(uint32_t)];
int32_t offsetNum = 0;
if(mySource >= sourceLimit) {
/* no input, nothing to do */
return;
}
/* write the BOM if necessary */
if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
ucnv_fromUWriteBytes(args->converter,
bom, 4,
&args->target, args->targetLimit,
&args->offsets, -1,
err);
args->converter->fromUnicodeStatus=0;
}
myTarget = (unsigned char *) args->target;
myOffsets = args->offsets;
temp[3] = 0;
if (args->converter->fromUChar32)
@ -948,18 +1022,7 @@ _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
}
if(choice!=UCNV_RESET_TO_UNICODE) {
/* reset fromUnicode: prepare to output the UTF-32PE BOM */
cnv->charErrorBufferLength=4;
#if U_IS_BIG_ENDIAN
cnv->charErrorBuffer[0]=0;
cnv->charErrorBuffer[1]=0;
cnv->charErrorBuffer[2]=0xfe;
cnv->charErrorBuffer[3]=0xff;
#else
cnv->charErrorBuffer[0]=0xff;
cnv->charErrorBuffer[1]=0xfe;
cnv->charErrorBuffer[2]=0;
cnv->charErrorBuffer[3]=0;
#endif
cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
}
}

View File

@ -445,12 +445,21 @@ conversion {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
// Improve ucnv_ext.c code coverage:
// There will be a partial match up to the lead surrogate of U+603ff
// which then results in one more unit in the prefetch buffer
// than the match length when converting one code unit at a time.
// See ucnv_extContinueMatchFromU() comment
// "the match did not use all of preFromU[] - keep the rest for replay"
// UTF-16/32: do not output a BOM if there is no data at all
{
"UTF-16",
"",
:bin{ "" },
:intvector{ },
:int{1}, :int{1}, "", "?", ""
}
{
"UTF-32",
"",
:bin{ "" },
:intvector{ },
:int{1}, :int{1}, "", "?", ""
}
// do not convert SO/SI/ESC
{
@ -491,6 +500,12 @@ conversion {
:int{1}, :int{1}, "", "?", ""
}
// Improve ucnv_ext.c code coverage:
// There will be a partial match up to the lead surrogate of U+603ff
// which then results in one more unit in the prefetch buffer
// than the match length when converting one code unit at a time.
// See ucnv_extContinueMatchFromU() comment
// "the match did not use all of preFromU[] - keep the rest for replay"
{
"*test3",
"\U00101234\U00101234\U00050005\U000603ff",