ICU-4223 do not write the UTF-16/32 BOM if there is no text to convert

X-SVN-Rev: 17866
2005-06-12 21:15:18 +00:00 · 2005-06-12 21:15:18 +00:00 · 26091e6ae6
commit 26091e6ae6
parent a2e0e46710
3 changed files with 147 additions and 48 deletions
--- a/icu4c/source/common/ucnv_u16.c
+++ b/icu4c/source/common/ucnv_u16.c
@@ -23,6 +23,10 @@
 #include "ucnv_cnv.h"
 #include "cmemory.h"

+enum {
+    UCNV_NEED_TO_WRITE_BOM=1
+};
+
 /* UTF-16BE ----------------------------------------------------------------- */

 #if U_IS_BIG_ENDIAN
@@ -39,7 +43,7 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    uint8_t *target;
    int32_t *offsets;

-    int32_t targetCapacity, length, count, sourceIndex;
+    int32_t targetCapacity, length, sourceIndex;
    UChar c, trail;
    char overflow[4];

@@ -50,13 +54,25 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
        return;
    }

+    cnv=pArgs->converter;
+
+    /* write the BOM if necessary */
+    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(cnv,
+                             bom, 2,
+                             &pArgs->target, pArgs->targetLimit,
+                             &pArgs->offsets, -1,
+                             pErrorCode);
+        cnv->fromUnicodeStatus=0;
+    }
+
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    if(targetCapacity<=0) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

-    cnv=pArgs->converter;
    target=(uint8_t *)pArgs->target;
    offsets=pArgs->offsets;
    sourceIndex=0;
@@ -83,13 +99,13 @@ _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
        cnv->fromUChar32=c=0;
    }

-    /* copy an even number of bytes for complete UChars */
-    count=2*length;
-    if(count>targetCapacity) {
-        count=targetCapacity&~1;
-    }
-    /* count is even */
    if(c==0) {
+        /* copy an even number of bytes for complete UChars */
+        int32_t count=2*length;
+        if(count>targetCapacity) {
+            count=targetCapacity&~1;
+        }
+        /* count is even */
        targetCapacity-=count;
        count>>=1;
        length-=count;
@@ -581,7 +597,7 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    uint8_t *target;
    int32_t *offsets;

-    int32_t targetCapacity, length, count, sourceIndex;
+    int32_t targetCapacity, length, sourceIndex;
    UChar c, trail;
    char overflow[4];

@@ -592,13 +608,25 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
        return;
    }

+    cnv=pArgs->converter;
+
+    /* write the BOM if necessary */
+    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe };
+        ucnv_fromUWriteBytes(cnv,
+                             bom, 2,
+                             &pArgs->target, pArgs->targetLimit,
+                             &pArgs->offsets, -1,
+                             pErrorCode);
+        cnv->fromUnicodeStatus=0;
+    }
+
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    if(targetCapacity<=0) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

-    cnv=pArgs->converter;
    target=(uint8_t *)pArgs->target;
    offsets=pArgs->offsets;
    sourceIndex=0;
@@ -625,13 +653,13 @@ _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
        cnv->fromUChar32=c=0;
    }

-    /* copy an even number of bytes for complete UChars */
-    count=2*length;
-    if(count>targetCapacity) {
-        count=targetCapacity&~1;
-    }
-    /* count is even */
    if(c==0) {
+        /* copy an even number of bytes for complete UChars */
+        int32_t count=2*length;
+        if(count>targetCapacity) {
+            count=targetCapacity&~1;
+        }
+        /* count is even */
        targetCapacity-=count;
        count>>=1;
        length-=count;
@@ -1144,14 +1172,7 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
    }
    if(choice!=UCNV_RESET_TO_UNICODE) {
        /* reset fromUnicode: prepare to output the UTF-16PE BOM */
-        cnv->charErrorBufferLength=2;
-#if U_IS_BIG_ENDIAN
-        cnv->charErrorBuffer[0]=0xfe;
-        cnv->charErrorBuffer[1]=0xff;
-#else
-        cnv->charErrorBuffer[0]=0xff;
-        cnv->charErrorBuffer[1]=0xfe;
-#endif
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
    }
 }

--- a/icu4c/source/common/ucnv_u32.c
+++ b/icu4c/source/common/ucnv_u32.c
@@ -34,6 +34,10 @@
 /* -SURROGATE_LOW_START + HALF_BASE */
 #define SURROGATE_LOW_BASE      9216

+enum {
+    UCNV_NEED_TO_WRITE_BOM=1
+};
+
 /* UTF-32BE ----------------------------------------------------------------- */

 static void
@@ -204,13 +208,30 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
                                  UErrorCode * err)
 {
    const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];

+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
    temp[0] = 0;

    if (args->converter->fromUChar32) {
@@ -288,8 +309,8 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                               UErrorCode * err)
 {
    const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
@@ -297,6 +318,24 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];

+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
    temp[0] = 0;

    if (args->converter->fromUChar32) {
@@ -645,13 +684,30 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
                                  UErrorCode * err)
 {
    const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];

+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
    temp[3] = 0;

    if (args->converter->fromUChar32)
@@ -737,8 +793,8 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                               UErrorCode * err)
 {
    const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
@@ -746,6 +802,24 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    unsigned char temp[sizeof(uint32_t)];
    int32_t offsetNum = 0;

+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
    temp[3] = 0;

    if (args->converter->fromUChar32)
@@ -948,18 +1022,7 @@ _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
    }
    if(choice!=UCNV_RESET_TO_UNICODE) {
        /* reset fromUnicode: prepare to output the UTF-32PE BOM */
-        cnv->charErrorBufferLength=4;
-#if U_IS_BIG_ENDIAN
-        cnv->charErrorBuffer[0]=0;
-        cnv->charErrorBuffer[1]=0;
-        cnv->charErrorBuffer[2]=0xfe;
-        cnv->charErrorBuffer[3]=0xff;
-#else
-        cnv->charErrorBuffer[0]=0xff;
-        cnv->charErrorBuffer[1]=0xfe;
-        cnv->charErrorBuffer[2]=0;
-        cnv->charErrorBuffer[3]=0;
-#endif
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
    }
 }

--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@@ -445,12 +445,21 @@ conversion {
    fromUnicode {
      Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
      Cases {
-        // Improve ucnv_ext.c code coverage:
-        // There will be a partial match up to the lead surrogate of U+603ff
-        // which then results in one more unit in the prefetch buffer
-        // than the match length when converting one code unit at a time.
-        // See ucnv_extContinueMatchFromU() comment
-        // "the match did not use all of preFromU[] - keep the rest for replay"
+        // UTF-16/32: do not output a BOM if there is no data at all
+        {
+          "UTF-16",
+          "",
+          :bin{       "" },
+          :intvector{  },
+          :int{1}, :int{1}, "", "?", ""
+        }
+        {
+          "UTF-32",
+          "",
+          :bin{       "" },
+          :intvector{  },
+          :int{1}, :int{1}, "", "?", ""
+        }

        // do not convert SO/SI/ESC
        {
@@ -491,6 +500,12 @@ conversion {
          :int{1}, :int{1}, "", "?", ""
        }

+        // Improve ucnv_ext.c code coverage:
+        // There will be a partial match up to the lead surrogate of U+603ff
+        // which then results in one more unit in the prefetch buffer
+        // than the match length when converting one code unit at a time.
+        // See ucnv_extContinueMatchFromU() comment
+        // "the match did not use all of preFromU[] - keep the rest for replay"
        {
          "*test3",
          "\U00101234\U00101234\U00050005\U000603ff",