ICU-544 fix mappings for GB+fe5e
X-SVN-Rev: 2818
This commit is contained in:
parent
bfa7fdc918
commit
572ab3ff1d
@ -2753,7 +2753,7 @@ CHARMAP
|
||||
<U2e94> \x81\x39\x81\x32 |0
|
||||
<U2e95> \x81\x39\x81\x33 |0
|
||||
<U2e96> \x81\x39\x81\x34 |0
|
||||
<U2e97> \x81\x39\x81\x35 |0
|
||||
<U2e97> \xfe\x5e |0
|
||||
<U2e98> \x81\x39\x81\x36 |0
|
||||
<U2e99> \x81\x39\x81\x37 |0
|
||||
<U2e9a> \x81\x39\x81\x38 |0
|
||||
@ -30141,7 +30141,7 @@ CHARMAP
|
||||
<Ue820> \xfe\x5b |1
|
||||
<Ue821> \xfe\x5c |1
|
||||
<Ue822> \xfe\x5d |1
|
||||
<Ue823> \xfe\x5e |0
|
||||
<Ue823> \xfe\x5e |1
|
||||
<Ue824> \xfe\x5f |1
|
||||
<Ue825> \xfe\x60 |1
|
||||
<Ue826> \xfe\x61 |0
|
||||
|
@ -2753,7 +2753,7 @@ CHARMAP
|
||||
<U2e94> \x81\x39\x81\x32 |0
|
||||
<U2e95> \x81\x39\x81\x33 |0
|
||||
<U2e96> \x81\x39\x81\x34 |0
|
||||
<U2e97> \x81\x39\x81\x35 |0
|
||||
<U2e97> \xfe\x5e |0
|
||||
<U2e98> \x81\x39\x81\x36 |0
|
||||
<U2e99> \x81\x39\x81\x37 |0
|
||||
<U2e9a> \x81\x39\x81\x38 |0
|
||||
@ -30141,7 +30141,7 @@ CHARMAP
|
||||
<Ue820> \xfe\x5b |1
|
||||
<Ue821> \xfe\x5c |1
|
||||
<Ue822> \xfe\x5d |1
|
||||
<Ue823> \xfe\x5e |0
|
||||
<Ue823> \xfe\x5e |1
|
||||
<Ue824> \xfe\x5f |1
|
||||
<Ue825> \xfe\x60 |1
|
||||
<Ue826> \xfe\x61 |0
|
||||
|
@ -67,11 +67,12 @@ This section is most useful for understanding the genesis and structure of GB 18
|
||||
This is not official at this point!</li>
|
||||
<li>You should arrive at data like <a href="gbkuni30.txt">gbkuni30.txt</a>.
|
||||
This file has the following simplified format on each line:<br>
|
||||
<code>unicode (':' | '>') gb ['*']</code><br>
|
||||
<code>unicode (':' | '>') gb ['*' ['*']]</code><br>
|
||||
The left column contains the Unicode code point, the right column the byte sequence in GB 18030.
|
||||
The delimiter is either a colon for roundtrip mappings or a greater-than sign
|
||||
for fallbacks from Unicode to the codepage.
|
||||
I have marked mappings of the appendix E characters with a star.</li>
|
||||
I have marked mappings of the appendix E characters with a star.
|
||||
In addition, I have marked one pair of mappings that <em>should be</em> in appendix E with a double star.</li>
|
||||
<li>Now compile <a href="gbmake4.c">gbmake4</a> and run it with the above file as stdin input.
|
||||
You will get as output all the four-byte mappings for all
|
||||
BMP code points that do not have a one-byte or two-byte mapping.</li>
|
||||
|
@ -580,6 +580,7 @@
|
||||
2e88:fe57*
|
||||
2e8b:fe58*
|
||||
2e8c:fe5d*
|
||||
2e97:fe5e**
|
||||
2ea7:fe6b*
|
||||
2eaa:fe6e*
|
||||
2eae:fe71*
|
||||
@ -23908,7 +23909,7 @@ e81f>fe5a*
|
||||
e820>fe5b*
|
||||
e821>fe5c*
|
||||
e822>fe5d*
|
||||
e823:fe5e
|
||||
e823>fe5e**
|
||||
e824>fe5f*
|
||||
e825>fe60*
|
||||
e826:fe61
|
||||
|
@ -33,8 +33,13 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* in the printed standard, U+303e is mismapped; this sequence must be skipped */
|
||||
static const unsigned char skip303eBytes[4]={ 0x81, 0x39, 0xa6, 0x34 };
|
||||
/*
|
||||
* In the printed standard, U+303e is mismapped; this sequence must be skipped.
|
||||
* Also, GB+fe5e needs to be added to appendix E, mapping to U+2e97, so the four-byte sequence for U+2e97 (81 39 81 35) must be skipped, too.
|
||||
*/
|
||||
static const unsigned char
|
||||
skip2e97Bytes[4]={ 0x81, 0x39, 0x81, 0x35 },
|
||||
skip303eBytes[4]={ 0x81, 0x39, 0xa6, 0x34 };
|
||||
|
||||
/* array of flags for each Unicode BMP code point */
|
||||
static char
|
||||
@ -64,8 +69,10 @@ incFourGB18030(unsigned char bytes[4]) {
|
||||
static void
|
||||
incSkipFourGB18030(unsigned char bytes[4]) {
|
||||
incFourGB18030(bytes);
|
||||
if(0==memcmp(bytes, skip303eBytes, 4) && flags[0x303e]==1) {
|
||||
/* make sure to skip the mismapped sequence if the data correctly maps U+303e==GB+a989 */
|
||||
if( 0==memcmp(bytes, skip2e97Bytes, 4) && flags[0x2e97]==1 ||
|
||||
0==memcmp(bytes, skip303eBytes, 4) && flags[0x303e]==1
|
||||
) {
|
||||
/* make sure to skip mismapped sequences if the two-byte data covers their Unicode code points */
|
||||
incFourGB18030(bytes);
|
||||
}
|
||||
}
|
||||
@ -107,6 +114,10 @@ readRanges() {
|
||||
|
||||
/* set the flags for all code points in this range */
|
||||
while(c1<=c2) {
|
||||
if(flags[c1]!=0) {
|
||||
fprintf(stderr, "error: range covers already-assigned U+%04lx\n", c1);
|
||||
return 1;
|
||||
}
|
||||
flags[c1++]=2;
|
||||
}
|
||||
}
|
||||
@ -157,11 +168,15 @@ main(int argc, const char *argv[]) {
|
||||
}
|
||||
|
||||
/* set the flag for the code point */
|
||||
if(flags[c]!=0) {
|
||||
fprintf(stderr, "error: duplicate assignment for U+%04lx\n", c);
|
||||
return 1;
|
||||
}
|
||||
flags[c]= b<=0xffff ? 1 : 2;
|
||||
}
|
||||
|
||||
if(argc<=1) {
|
||||
/* generate all four-byte sequences that are no already in the input */
|
||||
/* generate all four-byte sequences that are not already in the input */
|
||||
for(c=0x81; c<=0xffff; ++c) {
|
||||
if(flags[c]==0) {
|
||||
printf("%04lx:%02x%02x%02x%02x\n", c, bytes[0], bytes[1], bytes[2], bytes[3]);
|
||||
|
Loading…
Reference in New Issue
Block a user