From 789228165c6254703e7f383b96130947ef60ab01 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 4 Dec 2003 19:36:20 +0000 Subject: [PATCH] ICU-3361 fix ucnv_getUnicodeSet(ISO-2022-xx) X-SVN-Rev: 14000 --- icu4c/source/common/ucnv2022.c | 36 ++++++--- icu4c/source/common/ucnvmbcs.c | 92 +++++++++++++++++++++++ icu4c/source/common/ucnvmbcs.h | 14 ++++ icu4c/source/test/testdata/conversion.txt | 36 +++++++++ 4 files changed, 168 insertions(+), 10 deletions(-) diff --git a/icu4c/source/common/ucnv2022.c b/icu4c/source/common/ucnv2022.c index c85ade6620..1a39e73370 100644 --- a/icu4c/source/common/ucnv2022.c +++ b/icu4c/source/common/ucnv2022.c @@ -488,9 +488,11 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti (myLocale[2]=='_' || myLocale[2]=='\0')){ /* open the required converters and cache them */ - myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode); - myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode); - myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode); + myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode); + if(version==1) { + myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode); + } + myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode); /*initialize the state variables*/ @@ -2793,12 +2795,14 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, if (U_FAILURE(*pErrorCode)) { return; } +#ifdef U_ENABLE_GENERIC_ISO_2022 if (cnv->sharedData == &_ISO2022Data) { /* We use UTF-8 in this case */ uset_addRange(set, 0, 0xd7FF); uset_addRange(set, 0xE000, 0x10FFFF); return; } +#endif cnvData = (UConverterDataISO2022*)cnv->extraInfo; if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) { @@ -2811,28 +2815,29 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, case 'j': if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { /* include 
Latin-1 for some variants of JP */ - cnvSet = uset_open(0, 0xff); + uset_addRange(set, 0, 0xff); } else { /* include ASCII for JP */ - cnvSet = uset_open(0, 0x7f); + uset_addRange(set, 0, 0x7f); } /* include half-width Katakana for JP */ - uset_addRange(cnvSet, 0xff61, 0xff9f); + uset_addRange(set, 0xff61, 0xff9f); break; case 'c': case 'z': /* include ASCII for CN */ - cnvSet = uset_open(0, 0x7f); + uset_addRange(set, 0, 0x7f); break; case 'k': /* there is only one converter for KR, and it is not in the myConverterArray[] */ ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode); return; default: - cnvSet = uset_open(1, 0); break; } + /* open a helper set because ucnv_getUnicodeSet() first empties its result set */ + cnvSet = uset_open(1, 0); if (!cnvSet) { *pErrorCode =U_MEMORY_ALLOCATION_ERROR; return; @@ -2847,8 +2852,19 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, */ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { if(cnvData->myConverterArray[i]!=NULL) { - ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode); - uset_addAll(set, cnvSet /* pErrorCode */); + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { + /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ + _MBCSGetUnicodeSetForBytes( + cnvData->myConverterArray[i], + set, UCNV_ROUNDTRIP_SET, + 0, 0x81, 0x82, + pErrorCode); + } else { + ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode); + uset_addAll(set, cnvSet /* pErrorCode */); + } } } uset_close(cnvSet); diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 897305aec3..3b32b29a2a 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -426,6 +426,98 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) { } } +/* similar to _MBCSGetNextUChar() but recursive */ +static void +_getUnicodeSetForBytes(const UConverter *cnv, + const int32_t (*stateTable)[256], const uint16_t 
*unicodeCodeUnits, + USet *set, + UConverterUnicodeSet which, + uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte, + + UErrorCode *pErrorCode) { + int32_t b, entry; + + for(b=lowByte; b<=highByte; ++b) { + entry=stateTable[state][b]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + _getUnicodeSetForBytes( + cnv, stateTable, unicodeCodeUnits, + set, which, + (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry), + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), + 0, 0xff, + pErrorCode); + } else { + UChar32 c; + int32_t rowOffset=offset; + uint8_t action; + + c=U_SENTINEL; + + /* + * An if-else-if chain provides more reliable performance for + * the most common cases compared to a switch. + */ + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); + if(action==MBCS_STATE_VALID_DIRECT_16) { + /* output BMP code point */ + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + } else if(action==MBCS_STATE_VALID_16) { + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + c=unicodeCodeUnits[offset]; + if(c<0xfffe) { + /* output BMP code point */ + } else { + c=U_SENTINEL; + } + } else if(action==MBCS_STATE_VALID_16_PAIR) { + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + c=unicodeCodeUnits[offset++]; + if(c<0xd800) { + /* output BMP code point below 0xd800 */ + } else if(c<=0xdbff) { + /* output roundtrip or fallback supplementary code point */ + c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); + } else if(c==0xe000) { + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ + c=unicodeCodeUnits[offset]; + } else { + c=U_SENTINEL; + } + } else if(action==MBCS_STATE_VALID_DIRECT_20) { + /* output supplementary code point */ + c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); + } + + if(c>=0) { + uset_add(set, c); + } + offset=rowOffset; + } + } +} + +/* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. 
+ * In the future, if we add support for reverse-fallback sets, this function + * needs to be updated, and called for each initial state. + * Does not currently handle extensions. + * Does not empty the set first. + */ +U_CFUNC void +_MBCSGetUnicodeSetForBytes(const UConverter *cnv, + USet *set, + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode) { + _getUnicodeSetForBytes( + cnv, cnv->sharedData->mbcs.stateTable, cnv->sharedData->mbcs.unicodeCodeUnits, + set, which, + state, 0, lowByte, highByte, + pErrorCode); +} + static void _MBCSGetUnicodeSet(const UConverter *cnv, USet *set, diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h index a763775768..4e9397a515 100644 --- a/icu4c/source/common/ucnvmbcs.h +++ b/icu4c/source/common/ucnvmbcs.h @@ -352,5 +352,19 @@ U_CFUNC void _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); +/* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. + * In the future, if we add support for reverse-fallback sets, this function + * needs to be updated, and called for each initial state. + * Does not currently handle extensions. + * Does not empty the set first. 
+ */ +U_CFUNC void +_MBCSGetUnicodeSetForBytes(const UConverter *cnv, + USet *set, + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); #endif diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 34b3527e15..b9bda32421 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -503,6 +503,42 @@ conversion { // which - numeric UConverterUnicodeSet value Headers { "charset", "map", "mapnot", "which" } Cases { + // ISO-2022-KR + { + "ISO-2022-KR", + "[\x00-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]", + "[\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]", + :int{0} + } + + // versions of ISO-2022-JP + { + "ISO-2022-JP", + "[\x00-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", + "[\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", + "[\x00-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\uffe7-\U0010ffff]", + :int{0} + } + + // versions of ISO-2022-CN + { + "ISO-2022-CN", + "[\x00-\x7f\u4e00\u4e01\u9f98\ufe6b]", + "[\u4e29\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-CN-EXT", + "[\x00-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]", + "[\U00020001\U00020002\U0002a6d7-\U0010ffff]", + :int{0} + } + // DBCS-only { "ibm-971",