ICU-4548 Update Unicode CCSIDs from IBM.

X-SVN-Rev: 17751
2005-05-31 22:04:26 +00:00 · 2005-05-31 22:04:26 +00:00 · 86b1781bad
commit 86b1781bad
parent 66840a241d
6 changed files with 73 additions and 29 deletions
--- a/icu4c/source/common/ucnv_u16.c
+++ b/icu4c/source/common/ucnv_u16.c
@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2002-2004, International Business Machines
+*   Copyright (C) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_u16.c
@ -1328,7 +1328,7 @@ static const UConverterImpl _UTF16Impl = {
 static const UConverterStaticData _UTF16StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16",
-    0, /* ### TODO review correctness of all Unicode CCSIDs */
+    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
 #if U_IS_BIG_ENDIAN
    { 0xff, 0xfd, 0, 0 }, 2,
--- a/icu4c/source/common/ucnv_u32.c
+++ b/icu4c/source/common/ucnv_u32.c
@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2002-2004, International Business Machines
+*   Copyright (C) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_u32.c
@ -1156,10 +1156,11 @@ static const UConverterImpl _UTF32Impl = {
    ucnv_getCompleteUnicodeSet
 };

+/* CCSID 1236 refers to any version of Unicode encoded as UTF-32 with BOM-sensitive endianness */
 static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",
-    0, /* ### TODO review correctness of all Unicode CCSIDs */
+    1236,
    UCNV_IBM, UCNV_UTF32, 4, 4,
 #if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
--- a/icu4c/source/common/ucnv_u8.c
+++ b/icu4c/source/common/ucnv_u8.c
@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2002-2004, International Business Machines
+*   Copyright (C) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_u8.c
@ -798,7 +798,8 @@ static const UConverterImpl _CESU8Impl={
 static const UConverterStaticData _CESU8StaticData={
    sizeof(UConverterStaticData),
    "CESU-8",
-    0, UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
+    9400, /* CCSID for CESU-8 */
+    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
--- a/icu4c/source/common/ucnvbocu.c
+++ b/icu4c/source/common/ucnvbocu.c
@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2002-2004, International Business Machines
+*   Copyright (C) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@ -1374,7 +1374,7 @@ static const UConverterImpl _Bocu1Impl={
 static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
-    0, /* CCSID for BOCU-1 */
+    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
--- a/icu4c/source/common/ucnvscsu.c
+++ b/icu4c/source/common/ucnvscsu.c
@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2000-2004, International Business Machines
+*   Copyright (C) 2000-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@ -2008,7 +2008,7 @@ static const UConverterImpl _SCSUImpl={
 static const UConverterStaticData _SCSUStaticData={
    sizeof(UConverterStaticData),
    "SCSU",
-    0, /* CCSID for SCSU */
+    1212, /* CCSID for SCSU */
    UCNV_IBM, UCNV_SCSU,
    1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
    /*
--- a/icu4c/source/data/mappings/convrtrs.txt
+++ b/icu4c/source/data/mappings/convrtrs.txt
@ -134,36 +134,70 @@
 # Fully algorithmic converters

 UTF-8 { IANA* MIME* JAVA* WINDOWS }
-                                ibm-1208 { IBM* }
-                                ibm-1209 { IBM }
-                                ibm-5304 { IBM }
-                                ibm-5305 { IBM }
+                                ibm-1208 { IBM* } # UTF-8 with IBM PUA
+                                ibm-1209 { IBM }  # UTF-8
+                                ibm-5304 { IBM }  # Unicode 2.0, UTF-8 with IBM PUA
+                                ibm-5305 { IBM }  # Unicode 2.0, UTF-8
+                                ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA
+                                ibm-13497 { IBM } # Unicode 3.0, UTF-8
+                                ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA
+                                ibm-17593 { IBM } # Unicode 4.0, UTF-8
                                windows-65001 { WINDOWS* }
                                cp1208

 # The ICU 2.2 UTF-16/32 converters detect and write a BOM.
-UTF-16 { IANA* MIME* JAVA* }    ISO-10646-UCS-2 { IANA } unicode csUnicode ucs-2
+UTF-16 { IANA* MIME* JAVA* }    ISO-10646-UCS-2 { IANA }
+                                ibm-1204 { IBM* } # UTF-16 BOM sensitive with IBM PUA
+                                ibm-1205 { IBM }  # UTF-16 BOM sensitive
+                                unicode
+                                csUnicode
+                                ucs-2
+# The following IBM Unicode CCSIDs are not valid in ICU because they are
+# considered pure DBCS (exactly 2 bytes) encodings that cover only a subset
+# of Unicode. ICU does not support their encoding structures.
+# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688
 UTF-16BE { IANA* MIME* JAVA* }  x-utf-16be { JAVA }
-                                # iso-10646-ucs-2 { JAVA } # This is ambiguous
-                                ibm-1200 { IBM* }
-                                ibm-1201 { IBM }
-                                ibm-5297 { IBM }
-                                ibm-13488 { IBM }
-                                ibm-17584 { IBM }
+                                ibm-1200 { IBM* } # UTF-16 BE with IBM PUA
+                                ibm-1201 { IBM }  # UTF-16 BE
+                                ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA
+                                ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE
+                                ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA
+                                ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE
+                                ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA
+                                ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE
+                                ibm-61955 { IBM } # UTF-16 BE with Gaidai University (Japan) PUA
+                                ibm-61956 { IBM } # UTF-16 BE with HKSCS-Big 5 PUA
                                windows-1201 { WINDOWS* }
                                cp1200
                                cp1201
                                UTF16_BigEndian
+                                # ibm-5297 { IBM }  # Unicode 2.0, UTF-16 (BE) (reserved, never used)
+                                # iso-10646-ucs-2 { JAVA } # This is ambiguous
+                                # ibm-61952 is not a valid CCSID because it's Unicode 1.1
+                                # ibm-61953 is not a valid CCSID because it's Unicode 1.0
 UTF-16LE { IANA* MIME* JAVA* }  x-utf-16le { JAVA }
-                                ibm-1202 { IBM* }
-                                ibm-13490 { IBM }
-                                ibm-17586 { IBM }
+                                ibm-1202 { IBM* } # UTF-16 LE with IBM PUA
+                                ibm-1203 { IBM }  # UTF-16 LE
+                                ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA
+                                ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE
+                                ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA
+                                ibm-17587 { IBM } # Unicode 3.0, UTF-16 LE
+                                ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA
+                                ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE
                                UTF16_LittleEndian
                                windows-1200 { WINDOWS* }

-UTF-32 { IANA* MIME* }          ISO-10646-UCS-4 { IANA } csUCS4 ucs-4
-UTF-32BE { IANA* }              UTF32_BigEndian ibm-1232 { IBM* } ibm-1233 { IBM }
-UTF-32LE { IANA* }              UTF32_LittleEndian ibm-1234 { IBM* }
+UTF-32 { IANA* MIME* }          ISO-10646-UCS-4 { IANA }
+                                ibm-1236 { IBM* } # UTF-32 BOM sensitive with IBM PUA
+                                ibm-1237 { IBM }  # UTF-32 BOM sensitive
+                                csUCS4
+                                ucs-4
+UTF-32BE { IANA* }              UTF32_BigEndian
+                                ibm-1232 { IBM* } # UTF-32 BE with IBM PUA
+                                ibm-1233 { IBM }  # UTF-32 BE
+UTF-32LE { IANA* }              UTF32_LittleEndian
+                                ibm-1234 { IBM* } # UTF-32 LE with IBM PUA
+                                ibm-1235 { IBM }  # UTF-32 LE

 # ICU-specific names for special uses
 UTF16_PlatformEndian
@ -185,6 +219,9 @@ UTF32_OppositeEndian
 # For details about email headers see RFC 2047.
 UTF-7 { IANA* MIME* WINDOWS }   windows-65000 { WINDOWS* }

+# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
+#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
+
 # IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
 # It is a substantially modified UTF-7 encoding. See the specification in:
 #
@ -194,11 +231,16 @@ UTF-7 { IANA* MIME* WINDOWS }   windows-65000 { WINDOWS* }
 IMAP-mailbox-name

 SCSU { IANA* }
-BOCU-1 { IANA* } csBOCU-1 { IANA }
+    ibm-1212 { IBM* } # SCSU with IBM PUA
+    ibm-1213 { IBM }  # SCSU
+BOCU-1 { IANA* }
+    csBOCU-1 { IANA }
+    ibm-1214 { IBM* } # BOCU-1 with IBM PUA
+    ibm-1215 { IBM }  # BOCU-1

 # See http://www.unicode.org/unicode/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
 # The Unicode Consortium does not encourage the use of CESU-8
-CESU-8 { IANA* }
+CESU-8 { IANA* } ibm-9400 { IBM* }

 # Standard iso-8859-1, which does not have the Euro update.
 # See iso-8859-15 (latin9) for the Euro update