From 5a876de6741943ce7fe61abd9bab3b93158c4052 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 16 Apr 2009 22:18:03 +0000 Subject: [PATCH] ICU-5995 remove unused states from gb18030 and ibm-964 (EUC-TW) conversion tables X-SVN-Rev: 25801 --- icu4c/source/data/mappings/gb18030.ucm | 23 ++++++++------ .../data/mappings/ibm-964_P110-1999.ucm | 31 ++++++++++++++----- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/icu4c/source/data/mappings/gb18030.ucm b/icu4c/source/data/mappings/gb18030.ucm index b46166353f..05eb7adfb8 100644 --- a/icu4c/source/data/mappings/gb18030.ucm +++ b/icu4c/source/data/mappings/gb18030.ucm @@ -1,4 +1,4 @@ -# Copyright (C) 2000-2005, International Business Machines Corporation and others. +# Copyright (C) 2000-2009, International Business Machines Corporation and others. # All Rights Reserved. # ICU codepage data for GB 18030 @@ -16,23 +16,26 @@ # Similarly, some of the BMP mappings are marked as unassigned for the same reason. # Mostly assigned sequences, with branches in the lead bytes - 0-7f, 81:7, 82:8, 83:9, 84:a, 85-fe:4 - 30-39:2, 40-7e, 80-fe - 81-fe:3 +# The second line is commented out (and does not count) +# because the state table is hand-optimized and does not use what would be +# the natural path for the encoding scheme. 
+ 0-7f, 81:6, 82:7, 83:8, 84:9, 85-fe:3 +# 30-39:2, 40-7e, 80-fe + 81-fe:2 30-39 # All-unassigned 4-byte sequences - 30-39:5, 40-7e, 80-fe - 81-fe:6 + 30-39:4, 40-7e, 80-fe + 81-fe:5 30-39.u # Some unassigned 4-byte sequences, one state for each of the lead bytes 81-84 # Each of these states branches on the second of four bytes; for the third and fourth bytes, # unassigned sequences continue with state 4, assigned ones with state 1 - 30:2, 31-35:5, 36-39:2, 40-7e, 80-fe - 30-35:2, 36-39:5, 40-7e, 80-fe - 30-35:5, 36:2, 37-39:5, 40-7e, 80-fe - 30-31:2, 32-39:5, 40-7e, 80-fe + 30:1, 31-35:4, 36-39:1, 40-7e, 80-fe + 30-35:1, 36-39:4, 40-7e, 80-fe + 30-35:4, 36:1, 37-39:4, 40-7e, 80-fe + 30-31:1, 32-39:4, 40-7e, 80-fe # GB 18030 BMP mappings that are not handled algorithmically are # generated using gbmake4 and gbtoucm tools. Please see charset/source/gb18030/gb18030.html diff --git a/icu4c/source/data/mappings/ibm-964_P110-1999.ucm b/icu4c/source/data/mappings/ibm-964_P110-1999.ucm index 16476fd00b..93530b82a0 100644 --- a/icu4c/source/data/mappings/ibm-964_P110-1999.ucm +++ b/icu4c/source/data/mappings/ibm-964_P110-1999.ucm @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 1995-2007, International Business Machines +# * Copyright (C) 1995-2009, International Business Machines # * Corporation and others. All Rights Reserved. # * # *************************************************************************** @@ -18,15 +18,30 @@ "ASCII" "ibm-964_VPUA" - 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:5, c3:5, fe:5 +# The fourth line is commented out (and does not count) +# because the state table is hand-optimized and does not use what would be +# the natural path for the encoding scheme. +# The third line used to start with "a1-b0:3" but overrode every one +# of these byte values with a different state transition. 
+ +# 0: Initial state, single bytes and lead bytes + 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:4, c3:4, fe:4 +# 1: Trail byte state with mappings a1-fe - a1-b0:3, a1:4, a2:8, a3-ab:4, ac:7, ad:6, ae-b0:4 - a1-fe:1 - a1-fe:5 +# 2: Second of four bytes, follows lead byte 8e + a1:3, a2:7, a3-ab:3, ac:6, ad:5, ae-b0:3 +# (unreachable/optimized away) +# a1-fe:1 +# 3: Third of four bytes, 8e xx .. .. for most xx in a1-b0; all-unassigned + a1-fe:4 +# 4: All-unassigned trail byte state a1-fe.u - a1-a4:1, a5-fe:5 - a1-e2:1, e3-fe:5 - a1-f2:1, f3-fe:5 +# 5: 8e ad .. .. with some mappings + a1-a4:1, a5-fe:4 +# 6: 8e ac .. .. with some mappings + a1-e2:1, e3-fe:4 +# 7: 8e a2 .. .. with some mappings + a1-f2:1, f3-fe:4 CHARMAP \x00 |0