From 516103b627d1cb49b08f341064357c7c9851aa95 Mon Sep 17 00:00:00 2001 From: George Rhoten Date: Thu, 16 Nov 2000 17:20:03 +0000 Subject: [PATCH] ICU-206 Added UTF-32 converter X-SVN-Rev: 2917 --- icu4c/data/convrtrs.txt | 12 +- icu4c/source/common/ucnv_bld.c | 13 +- icu4c/source/common/ucnv_cnv.h | 4 +- icu4c/source/common/ucnv_utf.c | 482 +++++++++++++++++++++++- icu4c/source/common/unicode/ucnv.h | 8 +- icu4c/source/data/mappings/convrtrs.txt | 12 +- icu4c/source/test/cintltst/nucnvtst.c | 76 +++- 7 files changed, 578 insertions(+), 29 deletions(-) diff --git a/icu4c/data/convrtrs.txt b/icu4c/data/convrtrs.txt index 51bc606a63..3f25e9d3ad 100644 --- a/icu4c/data/convrtrs.txt +++ b/icu4c/data/convrtrs.txt @@ -63,11 +63,15 @@ # be changed - or else code and/or file names must also be changed. # Algorithmic -UTF8 utf-8 { MIME } ibm-1208 cp1208 -UTF16_BigEndian utf-16be { MIME } -UTF16_LittleEndian { MIME } utf-16le { MIME } -UTF16_PlatformEndian { MIME } ISO-10646-UCS-2 { IANA } csUnicode utf-16 { MIME } ibm-1200 cp1200 ucs-2 +UTF8 utf-8 { MIME } ibm-1208 cp1208 +UTF16_BigEndian utf-16be { MIME } +UTF16_LittleEndian utf-16le { MIME } +UTF16_PlatformEndian ISO-10646-UCS-2 { IANA } csUnicode utf-16 { MIME } ibm-1200 cp1200 ucs-2 UTF16_OppositeEndian +UTF32_BigEndian utf-32be { MIME } +UTF32_LittleEndian utf-32le { MIME } +UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4 +UTF32_OppositeEndian LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! 
There's whole lot of names for this ISO_2022 iso-2022 { MIME } 2022 cp2022 ISO_2022,locale=ja,version=0 ISO_2022_JP, ISO-2022-JP, csISO2022JP, iso-2022-jp { MIME } diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index d2f028989f..149054e461 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -47,8 +47,8 @@ extern void UCNV_DEBUG_LOG(char *what, char *who, void *p, int l); static const UConverterSharedData * converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ &_SBCSData, &_DBCSData, &_MBCSData, &_Latin1Data, - &_UTF8Data, &_UTF16BEData, &_UTF16LEData, &_EBCDICStatefulData, - &_ISO2022Data, + &_UTF8Data, &_UTF16BEData, &_UTF16LEData, &_UTF32BEData, &_UTF32LEData, + &_EBCDICStatefulData, &_ISO2022Data, &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6, &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19, &_HZData, @@ -68,6 +68,15 @@ static struct { #else { "UTF16_PlatformEndian", UCNV_UTF16_LittleEndian }, { "UTF16_OppositeEndian", UCNV_UTF16_BigEndian}, +#endif + { "UTF32_BigEndian", UCNV_UTF32_BigEndian }, + { "UTF32_LittleEndian", UCNV_UTF32_LittleEndian }, +#if U_IS_BIG_ENDIAN + { "UTF32_PlatformEndian", UCNV_UTF32_BigEndian }, + { "UTF32_OppositeEndian", UCNV_UTF32_LittleEndian }, +#else + { "UTF32_PlatformEndian", UCNV_UTF32_LittleEndian }, + { "UTF32_OppositeEndian", UCNV_UTF32_BigEndian}, #endif { "ISO_2022", UCNV_ISO_2022 }, { "LMBCS-1", UCNV_LMBCS_1 }, diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index bf87b61427..5a3303f735 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -205,8 +205,8 @@ struct UConverterImpl { extern const UConverterSharedData _SBCSData, _DBCSData, _MBCSData, _Latin1Data, - _UTF8Data, _UTF16BEData, _UTF16LEData, _EBCDICStatefulData, - _ISO2022Data, + _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData, + _EBCDICStatefulData, 
_ISO2022Data, _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,_HZData; diff --git a/icu4c/source/common/ucnv_utf.c b/icu4c/source/common/ucnv_utf.c index 4e92f91bba..982353dbcd 100644 --- a/icu4c/source/common/ucnv_utf.c +++ b/icu4c/source/common/ucnv_utf.c @@ -16,6 +16,7 @@ * 06/29/2000 helena Major rewrite of the callback APIs. * 07/20/2000 george Change the coding style to conform to the coding guidelines, * and a few miscellaneous bug fixes. +* 11/15/2000 george Added UTF-32 */ #include "cmemory.h" @@ -34,7 +35,7 @@ */ /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ static const uint32_t MAXIMUM_UCS2 = 0x0000FFFF; -static const uint32_t MAXIMUM_UTF16 = 0x0010FFFF; +static const uint32_t MAXIMUM_UTF = 0x0010FFFF; static const uint32_t MAXIMUM_UCS4 = 0x7FFFFFFF; static const int8_t HALF_SHIFT = 10; static const uint32_t HALF_BASE = 0x0010000; @@ -73,7 +74,8 @@ static const int8_t bytesFromUTF8[256] = { * * @returns true when callback fails */ -UBool T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args, +static UBool +T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args, UErrorCode *err) { UConverter *converter = args->converter; @@ -99,7 +101,8 @@ UBool T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args return (UBool)U_FAILURE(*err); } -UBool T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args, +static UBool +T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args, int32_t currentOffset, UErrorCode *err) { @@ -195,7 +198,7 @@ morebytes: /* Remove the acummulated high bits */ ch -= offsetsFromUTF8[inBytes]; - if (i == inBytes && ch <= MAXIMUM_UTF16) + if (i == inBytes && ch <= MAXIMUM_UTF) { /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ if (ch <= MAXIMUM_UCS2) @@ -345,7 +348,7 @@ morebytes: 
/* Remove the acummulated high bits */ ch -= offsetsFromUTF8[inBytes]; - if (i == inBytes && ch <= MAXIMUM_UTF16) + if (i == inBytes && ch <= MAXIMUM_UTF) { /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ if (ch <= MAXIMUM_UCS2) @@ -1030,7 +1033,7 @@ U_CFUNC void T_UConverter_toUnicode_UTF16_LE (UConverterToUnicodeArgs * args, args->source += mySourceIndex; } -U_CFUNC void T_UConverter_fromUnicode_UTF16_LE (UConverterFromUnicodeArgs * args, +U_CFUNC void T_UConverter_fromUnicode_UTF16_LE (UConverterFromUnicodeArgs * args, UErrorCode * err) { const UChar *mySource = args->source; @@ -1157,3 +1160,470 @@ const UConverterSharedData _UTF16LEData={ NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 0 }; + +/* UTF-32BE ----------------------------------------------------------------- */ + +void T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, + UErrorCode * err) +{ + const unsigned char *mySource = (unsigned char *) args->source; + UChar *myTarget = args->target; + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; + const UChar *targetLimit = args->targetLimit; + unsigned char *toUBytes = args->converter->toUBytes; + uint32_t ch, i; + + /* UTF-8 returns here for only non-offset, this needs to change.*/ + if (args->converter->toUnicodeStatus && myTarget < targetLimit) + { + i = args->converter->toULength; /* restore # of bytes consumed */ + + ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ + args->converter->toUnicodeStatus = 0; + goto morebytes; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + i = 0; + ch = 0; +morebytes: + while (i < sizeof(uint32_t)) + { + if (mySource < sourceLimit) + { + ch = (ch << 8) | (uint8_t)(*mySource); + toUBytes[i++] = (char) *(mySource++); + } + else + { + if (args->flush) + { + if (U_SUCCESS(*err)) + { + *err = U_TRUNCATED_CHAR_FOUND; + args->converter->toUnicodeStatus = MAXIMUM_UCS4; + } + 
} + else + { /* stores a partially calculated target*/ + /* + 1 to make 0 a valid character */ + args->converter->toUnicodeStatus = ch + 1; + args->converter->toULength = (int8_t) i; + } + goto donefornow; + } + } + + if (ch <= MAXIMUM_UTF) + { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= MAXIMUM_UCS2) + { + /* fits in 16 bits */ + *(myTarget++) = (UChar) ch; + } + else + { + /* write out the surrogates */ + ch -= HALF_BASE; + *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); + ch = (ch & HALF_MASK) + SURROGATE_LOW_START; + if (myTarget < targetLimit) + { + *(myTarget++) = (UChar)ch; + } + else + { + /* Put in overflow buffer (not handled here) */ + args->converter->UCharErrorBuffer[0] = (UChar) ch; + args->converter->UCharErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } + else + { + args->source = (const char *) mySource; + args->target = myTarget; + args->converter->invalidCharLength = (int8_t)i; + if (T_UConverter_toUnicode_InvalidChar_Callback(args, err)) + { + /* Stop if the error wasn't handled */ + break; + } + args->converter->invalidCharLength = 0; + mySource = (unsigned char *) args->source; + myTarget = args->target; + } + } + +donefornow: + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + /* End of target buffer */ + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = myTarget; + args->source = (const char *) mySource; +} + +void T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, + UErrorCode * err) +{ + const UChar *mySource = args->source; + unsigned char *myTarget = (unsigned char *) args->target; + const UChar *sourceLimit = args->sourceLimit; + const unsigned char *targetLimit = (unsigned char *) args->targetLimit; + UChar32 ch, ch2; + unsigned int indexToWrite; + unsigned char temp[sizeof(uint32_t)]; + + temp[0] = 0; + + if (args->converter->fromUnicodeStatus) + { + ch = 
args->converter->fromUnicodeStatus; + args->converter->fromUnicodeStatus = 0; + goto lowsurogate; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + + if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START) + { +lowsurogate: + if (mySource < sourceLimit) + { + ch2 = *mySource; + if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END) + { + ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; + mySource++; + } + } + else if (!args->flush) + { + /* ran out of source */ + args->converter->fromUnicodeStatus = ch; + break; + } + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + /* Todo: Can the & part be left off implicitly? Does it really save time? */ + temp[1] = (uint8_t) (ch >> 16 & 0x1F); + temp[2] = (uint8_t) (ch >> 8 & 0xFF); + temp[3] = (uint8_t) (ch & 0xFF); + + for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) + { + if (myTarget < targetLimit) + { + *(myTarget++) = temp[indexToWrite]; + } + else + { + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; + *err = U_BUFFER_OVERFLOW_ERROR; /* Todo: is this needed because of ending if */ + } + } + } + + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = (char *) myTarget; + args->source = mySource; +} + +/* +UChar32 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, + UErrorCode* err) +{ + *err = U_UNSUPPORTED_ERROR; + return 0; +} +*/ +static const UConverterImpl _UTF32BEImpl = { + UCNV_UTF32_BigEndian, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + T_UConverter_toUnicode_UTF32_BE, + NULL, +/* T_UConverter_toUnicode_UTF32_BE_OFFSETS_LOGIC, */ + T_UConverter_fromUnicode_UTF32_BE, + NULL, +/* T_UConverter_fromUnicode_UTF32_BE_OFFSETS_LOGIC, */ + NULL, +/* T_UConverter_getNextUChar_UTF32_BE, */ + + NULL +}; + +/** Todo: These numbers are probably in 
correct. */ +const UConverterStaticData _UTF32BEStaticData = { + sizeof(UConverterStaticData), +"UTF32_BigEndian", + 1200, UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, + { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +}; + + +const UConverterSharedData _UTF32BEData = { + sizeof(UConverterSharedData), ~((uint32_t) 0), + NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, + 0 +}; + +/* UTF-32LE ---------------------------------------------------------- */ + +void T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, + UErrorCode * err) +{ + const unsigned char *mySource = (unsigned char *) args->source; + UChar *myTarget = args->target; + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; + const UChar *targetLimit = args->targetLimit; + unsigned char *toUBytes = args->converter->toUBytes; + uint32_t ch, i; + + /* UTF-8 returns here for only non-offset, this needs to change.*/ + if (args->converter->toUnicodeStatus && myTarget < targetLimit) + { + i = args->converter->toULength; /* restore # of bytes consumed */ + + ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ + if (ch == -1) + ch = 0; + args->converter->toUnicodeStatus = 0; + goto morebytes; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + i = 0; + ch = 0; +morebytes: + while (i < sizeof(uint32_t)) + { + if (mySource < sourceLimit) + { + ch |= ((uint32_t)(uint8_t)(*mySource)) << (i * 8); /* widen first: shifting a promoted int into bit 31 is undefined */ + toUBytes[i++] = (char) *(mySource++); + } + else + { + if (args->flush) + { + if (U_SUCCESS(*err)) + { + *err = U_TRUNCATED_CHAR_FOUND; + args->converter->toUnicodeStatus = 0; + } + } + else + { /* stores a partially calculated target*/ + if (ch == 0) + { + args->converter->toUnicodeStatus = -1; + } + else + { + args->converter->toUnicodeStatus = ch; + } + args->converter->toULength = (int8_t) i; + } + goto donefornow; + } + } + + if (ch <= MAXIMUM_UTF) + { + /* Normal valid byte when the loop has not 
prematurely terminated (i < inBytes) */ + if (ch <= MAXIMUM_UCS2) + { + /* fits in 16 bits */ + *(myTarget++) = (UChar) ch; + } + else + { + /* write out the surrogates */ + ch -= HALF_BASE; + *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); + ch = (ch & HALF_MASK) + SURROGATE_LOW_START; + if (myTarget < targetLimit) + { + *(myTarget++) = (UChar)ch; + } + else + { + /* Put in overflow buffer (not handled here) */ + args->converter->UCharErrorBuffer[0] = (UChar) ch; + args->converter->UCharErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } + else + { + args->source = (const char *) mySource; + args->target = myTarget; + args->converter->invalidCharLength = (int8_t)i; + if (T_UConverter_toUnicode_InvalidChar_Callback(args, err)) + { + /* Stop if the error wasn't handled */ + break; + } + args->converter->invalidCharLength = 0; + mySource = (unsigned char *) args->source; + myTarget = args->target; + } + } + +donefornow: + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + /* End of target buffer */ + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = myTarget; + args->source = (const char *) mySource; +/* *err = U_UNSUPPORTED_ERROR; */ +} + +void T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, + UErrorCode * err) +{ + const UChar *mySource = args->source; + unsigned char *myTarget = (unsigned char *) args->target; + const UChar *sourceLimit = args->sourceLimit; + const unsigned char *targetLimit = (unsigned char *) args->targetLimit; + UChar32 ch, ch2; + unsigned int indexToWrite; + unsigned char temp[sizeof(uint32_t)]; + + temp[3] = 0; + + if (args->converter->fromUnicodeStatus) + { + ch = args->converter->fromUnicodeStatus; + args->converter->fromUnicodeStatus = 0; + goto lowsurogate; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + + if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START) + { +lowsurogate: + if (mySource 
< sourceLimit) + { + ch2 = *mySource; + if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END) + { + ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; + mySource++; + } + } + else if (!args->flush) + { + /* ran out of source */ + args->converter->fromUnicodeStatus = ch; + break; + } + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + /* Todo: Can the & part be left off implicitly? Does it really save time? */ + temp[2] = (uint8_t) (ch >> 16 & 0x1F); + temp[1] = (uint8_t) (ch >> 8 & 0xFF); + temp[0] = (uint8_t) (ch & 0xFF); + + for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) + { + if (myTarget < targetLimit) + { + *(myTarget++) = temp[indexToWrite]; + } + else + { + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; + *err = U_BUFFER_OVERFLOW_ERROR; /* Todo: is this needed because of ending if */ + } + } + } + + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = (char *) myTarget; + args->source = mySource; +/* *err = U_UNSUPPORTED_ERROR; */ +} + +/* +UChar32 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, + UErrorCode* err) +{ + *err = U_UNSUPPORTED_ERROR; + return 0; +} +*/ + +static const UConverterImpl _UTF32LEImpl = { + UCNV_UTF32_LittleEndian, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + T_UConverter_toUnicode_UTF32_LE, + NULL, +/* T_UConverter_toUnicode_UTF32_LE_OFFSETS_LOGIC, */ + T_UConverter_fromUnicode_UTF32_LE, + NULL, +/* T_UConverter_fromUnicode_UTF32_LE_OFFSETS_LOGIC, */ + NULL, +/* T_UConverter_getNextUChar_UTF32_LE, */ + + NULL +}; + +/** Todo: These numbers are probably incorrect. 
*/ +const UConverterStaticData _UTF32LEStaticData = { + sizeof(UConverterStaticData), +"UTF32_LittleEndian", + 1200, UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, + { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +}; + + +const UConverterSharedData _UTF32LEData = { + sizeof(UConverterSharedData), ~((uint32_t) 0), + NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, + 0 +}; diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 2a7916b9a1..5dd6458d12 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -50,10 +50,12 @@ typedef enum { UCNV_UTF8 = 4, UCNV_UTF16_BigEndian = 5, UCNV_UTF16_LittleEndian = 6, - UCNV_EBCDIC_STATEFUL = 7, - UCNV_ISO_2022 = 8, + UCNV_UTF32_BigEndian = 7, + UCNV_UTF32_LittleEndian = 8, + UCNV_EBCDIC_STATEFUL = 9, + UCNV_ISO_2022 = 10, - UCNV_LMBCS_1 = 9, + UCNV_LMBCS_1 = 11, UCNV_LMBCS_2, UCNV_LMBCS_3, UCNV_LMBCS_4, diff --git a/icu4c/source/data/mappings/convrtrs.txt b/icu4c/source/data/mappings/convrtrs.txt index 51bc606a63..3f25e9d3ad 100644 --- a/icu4c/source/data/mappings/convrtrs.txt +++ b/icu4c/source/data/mappings/convrtrs.txt @@ -63,11 +63,15 @@ # be changed - or else code and/or file names must also be changed. 
# Algorithmic -UTF8 utf-8 { MIME } ibm-1208 cp1208 -UTF16_BigEndian utf-16be { MIME } -UTF16_LittleEndian { MIME } utf-16le { MIME } -UTF16_PlatformEndian { MIME } ISO-10646-UCS-2 { IANA } csUnicode utf-16 { MIME } ibm-1200 cp1200 ucs-2 +UTF8 utf-8 { MIME } ibm-1208 cp1208 +UTF16_BigEndian utf-16be { MIME } +UTF16_LittleEndian utf-16le { MIME } +UTF16_PlatformEndian ISO-10646-UCS-2 { IANA } csUnicode utf-16 { MIME } ibm-1200 cp1200 ucs-2 UTF16_OppositeEndian +UTF32_BigEndian utf-32be { MIME } +UTF32_LittleEndian utf-32le { MIME } +UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4 +UTF32_OppositeEndian LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! There's whole lot of names for this ISO_2022 iso-2022 { MIME } 2022 cp2022 ISO_2022,locale=ja,version=0 ISO_2022_JP, ISO-2022-JP, csISO2022JP, iso-2022-jp { MIME } diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c index 0e00b88002..ef601da681 100644 --- a/icu4c/source/test/cintltst/nucnvtst.c +++ b/icu4c/source/test/cintltst/nucnvtst.c @@ -583,21 +583,64 @@ void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) /* etc */ - const uint8_t expectedUTF16LE[] = - { 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x8c, 0x4e, 0x09, 0x4e, 0x2e, 0x00 }; - int32_t toUTF16LEOffs[]= - { 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07}; - int32_t fmUTF16LEOffs[] = - { 0x0000, 0x0002, 0x0004, 0x0006, 0x0008, 0x000a, 0x000c, 0x000e }; - const uint8_t expectedUTF16BE[] = { 0x00, 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x00, 0x4e, 0x00, 0x4e, 0x8c, 0x4e, 0x09, 0x00, 0x2e }; int32_t toUTF16BEOffs[]= { 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07}; int32_t fmUTF16BEOffs[] = { 0x0000, 0x0002, 0x0004, 0x0006, 0x0008, 0x000a, 0x000c, 0x000e }; + + const uint8_t 
expectedUTF16LE[] = + { 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x8c, 0x4e, 0x09, 0x4e, 0x2e, 0x00 }; + int32_t toUTF16LEOffs[]= + { 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07}; + int32_t fmUTF16LEOffs[] = + { 0x0000, 0x0002, 0x0004, 0x0006, 0x0008, 0x000a, 0x000c, 0x000e }; - + const uint8_t expectedUTF32BE[] = + { 0x00, 0x00, 0x00, 0x31, + 0x00, 0x00, 0x00, 0x32, + 0x00, 0x00, 0x00, 0x33, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x4e, 0x00, + 0x00, 0x00, 0x4e, 0x8c, + 0x00, 0x00, 0x4e, 0x09, + 0x00, 0x00, 0x00, 0x2e }; + int32_t toUTF32BEOffs[]= + { 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, + 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08 }; + int32_t fmUTF32BEOffs[] = + { 0x0000, 0x0004, 0x0008, 0x000c, 0x0010, 0x0014, 0x0018, 0x001c }; + + const uint8_t expectedUTF32LE[] = + { 0x31, 0x00, 0x00, 0x00, + 0x32, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x4e, 0x00, 0x00, + 0x8c, 0x4e, 0x00, 0x00, + 0x09, 0x4e, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00 }; + int32_t toUTF32LEOffs[]= + { 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, + 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08 }; + int32_t fmUTF32LEOffs[] = + { 0x0000, 0x0004, 0x0008, 0x000c, 0x0010, 0x0014, 0x0018, 0x001c }; + @@ -651,6 +694,14 @@ void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) if(!testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]), expectedUTF16BE, sizeof(expectedUTF16BE), "utf-16be", toUTF16BEOffs )) log_err("u-> utf-16be did not match.\n"); + /*UTF32 LE*/ + if(!testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]), + expectedUTF32LE, sizeof(expectedUTF32LE), 
"utf-32le", toUTF32LEOffs )) + log_err("u-> utf-32le did not match.\n"); + /*UTF32 BE*/ + if(!testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]), + expectedUTF32BE, sizeof(expectedUTF32BE), "utf-32be", toUTF32BEOffs )) + log_err("u-> utf-32be did not match.\n"); /*LATIN_1*/ if(!testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]), expectedLATIN1, sizeof(expectedLATIN1), "LATIN_1", toLATIN1Offs )) @@ -703,6 +754,14 @@ void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) if(!testConvertToU(expectedUTF16BE, sizeof(expectedUTF16BE), sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-16be", fmUTF16BEOffs )) log_err("utf-16be -> u did not match.\n"); + /*UTF32 LE*/ + if(!testConvertToU(expectedUTF32LE, sizeof(expectedUTF32LE), + sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-32le", fmUTF32LEOffs )) + log_err("utf-32le -> u did not match.\n"); + /*UTF32 BE*/ + if(!testConvertToU(expectedUTF32BE, sizeof(expectedUTF32BE), + sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-32be", fmUTF32BEOffs )) + log_err("utf-32be -> u did not match.\n"); /*EBCDIC_STATEFUL*/ if(!testConvertToU(expectedIBM930, sizeof(expectedIBM930), sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "ibm-930", fmIBM930Offs )) @@ -712,6 +771,7 @@ void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "ibm-943", fmIBM943Offs )) log_err("ibm-943 -> u did not match.\n"); + /* Try it again to make sure it still works */ if(!testConvertToU(expectedUTF16LE, sizeof(expectedUTF16LE), sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-16le", fmUTF16LEOffs )) log_err("utf-16le -> u did not match.\n");