ICU-3361 fix ucnv_getUnicodeSet(ISO-2022-xx)

X-SVN-Rev: 14000
2003-12-04 19:36:20 +00:00 · 2003-12-04 19:36:20 +00:00 · 789228165c
commit 789228165c
parent 315f0b4633
4 changed files with 168 additions and 10 deletions
--- a/icu4c/source/common/ucnv2022.c
+++ b/icu4c/source/common/ucnv2022.c
@ -488,9 +488,11 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
            (myLocale[2]=='_' || myLocale[2]=='\0')){

            /* open the required converters and cache them */
-            myConverterData->myConverterArray[GB2312_1]     = ucnv_open("ibm-5478",errorCode);
-            myConverterData->myConverterArray[ISO_IR_165]   = ucnv_open("iso-ir-165",errorCode);
-            myConverterData->myConverterArray[CNS_11643]    = ucnv_open("cns-11643-1992",errorCode);
+            myConverterData->myConverterArray[GB2312_1]         = ucnv_open("ibm-5478",errorCode);
+            if(version==1) {
+                myConverterData->myConverterArray[ISO_IR_165]   = ucnv_open("iso-ir-165",errorCode);
+            }
+            myConverterData->myConverterArray[CNS_11643]        = ucnv_open("cns-11643-1992",errorCode);


            /*initialize the state variables*/
@ -2793,12 +2795,14 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
    if (U_FAILURE(*pErrorCode)) {
        return;
    }
+#ifdef U_ENABLE_GENERIC_ISO_2022
    if (cnv->sharedData == &_ISO2022Data) {
        /* We use UTF-8 in this case */
        uset_addRange(set, 0, 0xd7FF);
        uset_addRange(set, 0xE000, 0x10FFFF);
        return;
    }
+#endif

    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
    if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) {
@ -2811,28 +2815,29 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
    case 'j':
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
            /* include Latin-1 for some variants of JP */
-            cnvSet = uset_open(0, 0xff);
+            uset_addRange(set, 0, 0xff);
        } else {
            /* include ASCII for JP */
-            cnvSet = uset_open(0, 0x7f);
+            uset_addRange(set, 0, 0x7f);
        }
        /* include half-width Katakana for JP */
-        uset_addRange(cnvSet, 0xff61, 0xff9f);
+        uset_addRange(set, 0xff61, 0xff9f);
        break;
    case 'c':
    case 'z':
        /* include ASCII for CN */
-        cnvSet = uset_open(0, 0x7f);
+        uset_addRange(set, 0, 0x7f);
        break;
    case 'k':
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
        ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
        return;
    default:
-        cnvSet = uset_open(1, 0);
        break;
    }

+    /* open a helper set because ucnv_getUnicodeSet() first empties its result set */
+    cnvSet = uset_open(1, 0);
    if (!cnvSet) {
        *pErrorCode =U_MEMORY_ALLOCATION_ERROR;
        return;
@ -2847,8 +2852,19 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
     */
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
        if(cnvData->myConverterArray[i]!=NULL) {
-            ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
-            uset_addAll(set, cnvSet /* pErrorCode */);
+            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+                cnvData->version==0 && i==CNS_11643
+            ) {
+                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
+                _MBCSGetUnicodeSetForBytes(
+                        cnvData->myConverterArray[i],
+                        set, UCNV_ROUNDTRIP_SET,
+                        0, 0x81, 0x82,
+                        pErrorCode);
+            } else {
+                ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
+                uset_addAll(set, cnvSet /* pErrorCode */);
+            }
        }
    }
    uset_close(cnvSet);
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@ -426,6 +426,98 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
    }
 }

+/*
+ * Similar to _MBCSGetNextUChar() but recursive.
+ * Looks up each byte lowByte..highByte (inclusive) in the given state:
+ * a transition entry is followed by recursing over all 256 bytes of the
+ * target state, and a final entry contributes its code point, if it has
+ * one, to the set.
+ * offset is the accumulated index into unicodeCodeUnits for the bytes
+ * consumed so far.
+ * cnv, which and pErrorCode are only passed along; they are not evaluated here.
+ */
+static void
+_getUnicodeSetForBytes(const UConverter *cnv,
+                       const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
+                       USet *set,
+                       UConverterUnicodeSet which,
+                       uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
+                       UErrorCode *pErrorCode) {
+    const int32_t *row=stateTable[state];
+    int32_t byteValue;
+
+    for(byteValue=lowByte; byteValue<=highByte; ++byteValue) {
+        int32_t entry=row[byteValue];
+        UChar32 codePoint=U_SENTINEL;   /* stays negative if the entry yields nothing */
+        uint32_t unitIndex;
+        uint16_t unit;
+        uint8_t action;
+
+        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+            /* not the last byte of a sequence: enumerate every possible next byte */
+            _getUnicodeSetForBytes(
+                cnv, stateTable, unicodeCodeUnits,
+                set, which,
+                (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
+                offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
+                0, 0xff,
+                pErrorCode);
+            continue;
+        }
+
+        /*
+         * Final entry. As in _MBCSGetNextUChar(), an if-else-if chain is used
+         * rather than a switch. Actions not listed here add nothing.
+         */
+        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+        if(action==MBCS_STATE_VALID_DIRECT_16) {
+            /* BMP code point stored directly in the entry */
+            codePoint=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+        } else if(action==MBCS_STATE_VALID_16) {
+            /* one code unit from the table; values from 0xfffe up are not added */
+            unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+            unit=unicodeCodeUnits[unitIndex];
+            if(unit<0xfffe) {
+                codePoint=unit;
+            }
+        } else if(action==MBCS_STATE_VALID_16_PAIR) {
+            /* two code units from the table; the first one selects the case */
+            unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+            unit=unicodeCodeUnits[unitIndex];
+            if(unit<0xd800) {
+                /* BMP code point below 0xd800 */
+                codePoint=unit;
+            } else if(unit<=0xdbff) {
+                /* lead surrogate: combine with the second unit into a supplementary code point */
+                codePoint=((unit&0x3ff)<<10)+unicodeCodeUnits[unitIndex+1]+(0x10000-0xdc00);
+            } else if(unit==0xe000) {
+                /* marker 0xe000: the BMP code point is in the second unit */
+                codePoint=unicodeCodeUnits[unitIndex+1];
+            }
+        } else if(action==MBCS_STATE_VALID_DIRECT_20) {
+            /* supplementary code point stored directly in the entry */
+            codePoint=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
+        }
+
+        if(codePoint>=0) {
+            uset_add(set, codePoint);
+        }
+    }
+}
+
+/*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Adds to the set the code points reached from the given state table state
+ * via the first bytes lowByte..highByte (inclusive).
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings
+ * ("which" is passed along but not evaluated).
+ * In the future, if we add support for reverse-fallback sets, this function
+ * needs to be updated, and called for each initial state.
+ * Does not currently handle extensions.
+ * Does not empty the set first.
+ *
+ * Does nothing if pErrorCode is NULL or already indicates a failure.
+ * Sets U_ILLEGAL_ARGUMENT_ERROR if cnv or set is NULL, or if a non-empty
+ * byte range reaches outside 0..0xff (each state table row has 256 entries).
+ * An empty range (lowByte>highByte) adds nothing and is not an error.
+ * The state value itself is not validated here.
+ */
+U_CFUNC void
+_MBCSGetUnicodeSetForBytes(const UConverter *cnv,
+                           USet *set,
+                           UConverterUnicodeSet which,
+                           uint8_t state, int32_t lowByte, int32_t highByte,
+                           UErrorCode *pErrorCode) {
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return;
+    }
+    if(cnv==NULL || set==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if(lowByte>highByte) {
+        /* empty range: nothing to enumerate */
+        return;
+    }
+    if(lowByte<0 || highByte>0xff) {
+        /* would index outside the 256-entry state table row */
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    _getUnicodeSetForBytes(
+        cnv, cnv->sharedData->mbcs.stateTable, cnv->sharedData->mbcs.unicodeCodeUnits,
+        set, which,
+        state, 0, lowByte, highByte,
+        pErrorCode);
+}
+
 static void
 _MBCSGetUnicodeSet(const UConverter *cnv,
                   USet *set,
--- a/icu4c/source/common/ucnvmbcs.h
+++ b/icu4c/source/common/ucnvmbcs.h
@ -352,5 +352,19 @@ U_CFUNC void
 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode);

+/*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+ * In the future, if we add support for reverse-fallback sets, this function
+ * needs to be updated, and called for each initial state.
+ * Does not currently handle extensions.
+ * Does not empty the set first.
+ *
+ * cnv        - converter whose MBCS state table and Unicode code unit table are walked
+ * set        - receives the code points; its existing contents are kept
+ * which      - requested set type; passed along but currently not evaluated
+ * state      - state table state in which the first byte is looked up
+ * lowByte,
+ * highByte   - inclusive range of first byte values to enumerate
+ * pErrorCode - ICU error code pointer
+ */
+U_CFUNC void
+_MBCSGetUnicodeSetForBytes(const UConverter *cnv,
+                           USet *set,
+                           UConverterUnicodeSet which,
+                           uint8_t state, int32_t lowByte, int32_t highByte,
+                           UErrorCode *pErrorCode);

 #endif
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@ -503,6 +503,42 @@ conversion {
      // which - numeric UConverterUnicodeSet value
      Headers { "charset", "map", "mapnot", "which" }
      Cases {
+        // ISO-2022-KR
+        {
+          "ISO-2022-KR",
+          "[\x00-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
+          "[\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
+          :int{0}
+        }
+
+        // versions of ISO-2022-JP
+        {
+          "ISO-2022-JP",
+          "[\x00-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
+          "[\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
+          :int{0}
+        }
+        {
+          "ISO-2022-JP-2",
+          "[\x00-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+          "[\uffe7-\U0010ffff]",
+          :int{0}
+        }
+
+        // versions of ISO-2022-CN
+        {
+          "ISO-2022-CN",
+          "[\x00-\x7f\u4e00\u4e01\u9f98\ufe6b]",
+          "[\u4e29\uffe6-\U0010ffff]",
+          :int{0}
+        }
+        {
+          "ISO-2022-CN-EXT",
+          "[\x00-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
+          "[\U00020001\U00020002\U0002a6d7-\U0010ffff]",
+          :int{0}
+        }
+
        // DBCS-only
        {
          "ibm-971",