ICU-5483 add comments and named constants

X-SVN-Rev: 22581
2007-08-30 23:35:19 +00:00 · 2007-08-30 23:35:19 +00:00 · aa5179f4bb
commit aa5179f4bb
parent ca7379f1a3
1 changed files with 37 additions and 8 deletions
--- a/icu4c/source/common/ucnv2022.c
+++ b/icu4c/source/common/ucnv2022.c
@ -84,6 +84,26 @@ static const char SHIFT_OUT_STR[] = "\x0E";
 #define V_TAB   0x0B
 #define SPACE   0x20

+enum {
+    HWKANA_START=0xff61,
+    HWKANA_END=0xff9f
+};
+
+/*
+ * 94-character sets with native byte values A1..FE are encoded in ISO 2022
+ * as bytes 21..7E. (Subtract 0x80.)
+ * 96-character sets with native byte values A0..FF are encoded in ISO 2022
+ * as bytes 20..7F. (Subtract 0x80.)
+ * Do not encode C1 control codes with native bytes 80..9F
+ * as bytes 00..1F (C0 control codes).
+ */
+enum {
+    GR94_START=0xa1,
+    GR94_END=0xfe,
+    GR96_START=0xa0,
+    GR96_END=0xff
+};
+
 /*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
@ -1471,7 +1491,7 @@ getTrail:
                    }
                    break;
                case ISO8859_1:
-                    if(0xa0 <= sourceChar && sourceChar <= 0xff) {
+                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
@ -1479,18 +1499,18 @@ getTrail:
                    }
                    break;
                case HWKANA_7BIT:
-                    if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
+                    if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
-                            targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
+                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
-                            targetValue = (uint32_t)(sourceChar - (0xff61 - 0xa1));
+                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
@ -1524,7 +1544,7 @@ getTrail:
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
-                    if(len2 != 0 && !(len2 < 0 && len != 0) && 0xa0 <= value && value <= 0xff) {
+                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
@ -1540,6 +1560,15 @@ getTrail:
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
+                            /*
+                             * Check for valid bytes for the encoding scheme.
+                             * This is necessary because the sub-converter (windows-949)
+                             * has a broader encoding scheme than is valid for 2022.
+                             *
+                             * Check that the result is a 2-byte value with each byte in the range A1..FE
+                             * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
+                             * to move it to the ISO 2022 range 21..7E.
+                             */
                            if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
                                (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
                            ) {
@ -1807,7 +1836,7 @@ escape:
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
-                    targetUniChar = mySourceChar + (0xff61 - 0xa1);
+                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
@ -1848,7 +1877,7 @@ escape:
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
-                        targetUniChar = mySourceChar + (0xff61 - 0x21);
+                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
@ -3170,7 +3199,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
        }
        if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
            /* include half-width Katakana for JP */
-            sa->addRange(sa->set, 0xff61, 0xff9f);
+            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
        }
        break;
    case 'c':