ICU-544 fix mappings for GB+fe5e

X-SVN-Rev: 2818
This commit is contained in:
Markus Scherer 2000-10-27 18:04:54 +00:00
parent bfa7fdc918
commit 572ab3ff1d
5 changed files with 29 additions and 12 deletions

View File

@ -2753,7 +2753,7 @@ CHARMAP
<U2e94> \x81\x39\x81\x32 |0
<U2e95> \x81\x39\x81\x33 |0
<U2e96> \x81\x39\x81\x34 |0
<U2e97> \x81\x39\x81\x35 |0
<U2e97> \xfe\x5e |0
<U2e98> \x81\x39\x81\x36 |0
<U2e99> \x81\x39\x81\x37 |0
<U2e9a> \x81\x39\x81\x38 |0
@ -30141,7 +30141,7 @@ CHARMAP
<Ue820> \xfe\x5b |1
<Ue821> \xfe\x5c |1
<Ue822> \xfe\x5d |1
<Ue823> \xfe\x5e |0
<Ue823> \xfe\x5e |1
<Ue824> \xfe\x5f |1
<Ue825> \xfe\x60 |1
<Ue826> \xfe\x61 |0

View File

@ -2753,7 +2753,7 @@ CHARMAP
<U2e94> \x81\x39\x81\x32 |0
<U2e95> \x81\x39\x81\x33 |0
<U2e96> \x81\x39\x81\x34 |0
<U2e97> \x81\x39\x81\x35 |0
<U2e97> \xfe\x5e |0
<U2e98> \x81\x39\x81\x36 |0
<U2e99> \x81\x39\x81\x37 |0
<U2e9a> \x81\x39\x81\x38 |0
@ -30141,7 +30141,7 @@ CHARMAP
<Ue820> \xfe\x5b |1
<Ue821> \xfe\x5c |1
<Ue822> \xfe\x5d |1
<Ue823> \xfe\x5e |0
<Ue823> \xfe\x5e |1
<Ue824> \xfe\x5f |1
<Ue825> \xfe\x60 |1
<Ue826> \xfe\x61 |0

View File

@ -67,11 +67,12 @@ This section is most useful for understanding the genesis and structure of GB 18
This is not official at this point!</li>
<li>You should arrive at data like <a href="gbkuni30.txt">gbkuni30.txt</a>.
This file has the following simplified format on each line:<br>
<code>unicode (':' | '>') gb ['*']</code><br>
<code>unicode (':' | '>') gb ['*' ['*']]</code><br>
The left column contains the Unicode code point, the right column the byte sequence in GB 18030.
The delimiter is either a colon for roundtrip mappings or a greater-than sign
for fallbacks from Unicode to the codepage.
I have marked mappings of the appendix E characters with a star.</li>
I have marked mappings of the appendix E characters with a star.
In addition, I have marked one pair of mappings that <em>should be</em> in appendix E with a double star.</li>
<li>Now compile <a href="gbmake4.c">gbmake4</a> and run it with the above file as stdin input.
You will get as output all the four-byte mappings for all
BMP code points that do not have a one-byte or two-byte mapping.</li>

View File

@ -580,6 +580,7 @@
2e88:fe57*
2e8b:fe58*
2e8c:fe5d*
2e97:fe5e**
2ea7:fe6b*
2eaa:fe6e*
2eae:fe71*
@ -23908,7 +23909,7 @@ e81f>fe5a*
e820>fe5b*
e821>fe5c*
e822>fe5d*
e823:fe5e
e823>fe5e**
e824>fe5f*
e825>fe60*
e826:fe61

View File

@ -33,8 +33,13 @@
#include <stdlib.h>
#include <string.h>
/* in the printed standard, U+303e is mismapped; this sequence must be skipped */
static const unsigned char skip303eBytes[4]={ 0x81, 0x39, 0xa6, 0x34 };
/*
* In the printed standard, U+303e is mismapped; this sequence must be skipped.
* Also, GB+fe5e needs to be added to appendix E, mapping to U+2e97, which removes the four-byte sequence for U+2e97 (81 39 81 35), too.
*/
static const unsigned char
skip2e97Bytes[4]={ 0x81, 0x39, 0x81, 0x35 },
skip303eBytes[4]={ 0x81, 0x39, 0xa6, 0x34 };
/* array of flags for each Unicode BMP code point */
static char
@ -64,8 +69,10 @@ incFourGB18030(unsigned char bytes[4]) {
static void
incSkipFourGB18030(unsigned char bytes[4]) {
incFourGB18030(bytes);
if(0==memcmp(bytes, skip303eBytes, 4) && flags[0x303e]==1) {
/* make sure to skip the mismapped sequence if the data correctly maps U+303e==GB+a989 */
if( 0==memcmp(bytes, skip2e97Bytes, 4) && flags[0x2e97]==1 ||
0==memcmp(bytes, skip303eBytes, 4) && flags[0x303e]==1
) {
/* make sure to skip mismapped sequences if the two-byte data covers their Unicode code points */
incFourGB18030(bytes);
}
}
@ -107,6 +114,10 @@ readRanges() {
/* set the flags for all code points in this range */
while(c1<=c2) {
if(flags[c1]!=0) {
fprintf(stderr, "error: range covers already-assigned U+%04lx\n", c1);
return 1;
}
flags[c1++]=2;
}
}
@ -157,11 +168,15 @@ main(int argc, const char *argv[]) {
}
/* set the flag for the code point */
if(flags[c]!=0) {
fprintf(stderr, "error: duplicate assignment for U+%04lx\n", c);
return 1;
}
flags[c]= b<=0xffff ? 1 : 2;
}
if(argc<=1) {
/* generate all four-byte sequences that are no already in the input */
/* generate all four-byte sequences that are not already in the input */
for(c=0x81; c<=0xffff; ++c) {
if(flags[c]==0) {
printf("%04lx:%02x%02x%02x%02x\n", c, bytes[0], bytes[1], bytes[2], bytes[3]);