ICU-5995 remove unused states from gb18030 and ibm-964 (EUC-TW) conversion tables

X-SVN-Rev: 25801
This commit is contained in:
Markus Scherer 2009-04-16 22:18:03 +00:00
parent bda3b4e16c
commit 5a876de674
2 changed files with 36 additions and 18 deletions

View File

@ -1,4 +1,4 @@
# Copyright (C) 2000-2005, International Business Machines Corporation and others.
# Copyright (C) 2000-2009, International Business Machines Corporation and others.
# All Rights Reserved.
# ICU codepage data for GB 18030
@ -16,23 +16,26 @@
# Similarly, some of the BMP mappings are marked as unassigned for the same reason.
# Mostly assigned sequences, with branches in the lead bytes
<icu:state> 0-7f, 81:7, 82:8, 83:9, 84:a, 85-fe:4
<icu:state> 30-39:2, 40-7e, 80-fe
<icu:state> 81-fe:3
# The second <icu:state> line is commented out (and does not count)
# because the state table is hand-optimized and does not use what would be
# the natural path for the encoding scheme.
<icu:state> 0-7f, 81:6, 82:7, 83:8, 84:9, 85-fe:3
# <icu:state> 30-39:2, 40-7e, 80-fe
<icu:state> 81-fe:2
<icu:state> 30-39
# All-unassigned 4-byte sequences
<icu:state> 30-39:5, 40-7e, 80-fe
<icu:state> 81-fe:6
<icu:state> 30-39:4, 40-7e, 80-fe
<icu:state> 81-fe:5
<icu:state> 30-39.u
# Some unassigned 4-byte sequences, one state for each of the lead bytes 81-84
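# Each of these states branches on the second of four bytes; for the third and fourth bytes,
# unassigned sequences continue with state 4, assigned ones with state 1
# (these were states 5 and 2 before the unused second state was commented out and the rest renumbered)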
<icu:state> 30:2, 31-35:5, 36-39:2, 40-7e, 80-fe
<icu:state> 30-35:2, 36-39:5, 40-7e, 80-fe
<icu:state> 30-35:5, 36:2, 37-39:5, 40-7e, 80-fe
<icu:state> 30-31:2, 32-39:5, 40-7e, 80-fe
<icu:state> 30:1, 31-35:4, 36-39:1, 40-7e, 80-fe
<icu:state> 30-35:1, 36-39:4, 40-7e, 80-fe
<icu:state> 30-35:4, 36:1, 37-39:4, 40-7e, 80-fe
<icu:state> 30-31:1, 32-39:4, 40-7e, 80-fe
# GB 18030 BMP mappings that are not handled algorithmically are
# generated using gbmake4 and gbtoucm tools. Please see charset/source/gb18030/gb18030.html

View File

@ -1,6 +1,6 @@
# ***************************************************************************
# *
# * Copyright (C) 1995-2007, International Business Machines
# * Copyright (C) 1995-2009, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# ***************************************************************************
@ -18,15 +18,30 @@
<icu:charsetFamily> "ASCII"
<icu:alias> "ibm-964_VPUA"
<icu:state> 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:5, c3:5, fe:5
# The fourth <icu:state> line is commented out (and does not count)
# because the state table is hand-optimized and does not use what would be
# the natural path for the encoding scheme.
# The third <icu:state> used to start with "a1-b0:3" but overrode every one
# of these byte values with a different state transition.
# 0: Initial state, single bytes and lead bytes
<icu:state> 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:4, c3:4, fe:4
# 1: Trail byte state with mappings
<icu:state> a1-fe
<icu:state> a1-b0:3, a1:4, a2:8, a3-ab:4, ac:7, ad:6, ae-b0:4
<icu:state> a1-fe:1
<icu:state> a1-fe:5
# 2: Second of four bytes, follows lead byte 8e
<icu:state> a1:3, a2:7, a3-ab:3, ac:6, ad:5, ae-b0:3
# (unreachable/optimized away)
# <icu:state> a1-fe:1
# 3: Third of four bytes, 8e xx .. .. for most xx in a1-b0; all-unassigned
<icu:state> a1-fe:4
# 4: All-unassigned trail byte state
<icu:state> a1-fe.u
<icu:state> a1-a4:1, a5-fe:5
<icu:state> a1-e2:1, e3-fe:5
<icu:state> a1-f2:1, f3-fe:5
# 5: 8e ad .. .. with some mappings
<icu:state> a1-a4:1, a5-fe:4
# 6: 8e ac .. .. with some mappings
<icu:state> a1-e2:1, e3-fe:4
# 7: 8e a2 .. .. with some mappings
<icu:state> a1-f2:1, f3-fe:4
CHARMAP
<U0000> \x00 |0