scuffed-code/icu4c/source/tools/makeconv/gb18030/gbmake4.c
2000-11-30 22:07:47 +00:00

253 lines
7.7 KiB
C

/*
*******************************************************************************
*
* Copyright (C) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: gbmake4.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2000oct19
* created by: Markus W. Scherer
*
* This tool reads and processes codepage mapping files for GB 18030.
* Its main function is to read a mapping table with the one- and two-byte
* mappings of GB 18030 and to then output a mapping table with all of the
* four-byte mappings for the BMP.
* Four-byte mappings that are included in the input are skipped in the output.
* When an "r" argument is specified, it will instead write a list of
* ranges of contiguous mappings where both Unicode code points and GB 18030
* four-byte sequences form contiguous blocks.
* This kind of output can be appended to a mapping table with a "ranges" line
* in between, and the resulting output will exclude the input ranges.
* This is useful for generating a partial mapping table and to handle the input
* ranges algorithmically in conversion.
*
* Single surrogates are excluded from the output.
*
* To compile, just call a C compiler/linker with this source file.
* On Windows: cl gbmake4.c
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* array of flags for each Unicode BMP code point */
static char
flags[0x10000]={ 0 };
/* flag values: 0: not assigned 1:from Unicode 2:to Unicode 4:four-byte sequence */
#define UNASSIGNED 0
#define FROMU 1
#define TOU 2
#define ROUNDTRIP 3
#define FOURBYTE 4
static void
incFourGB18030(unsigned char bytes[4]) {
if(bytes[3]<0x39) {
++bytes[3];
} else {
bytes[3]=0x30;
if(bytes[2]<0xfe) {
++bytes[2];
} else {
bytes[2]=0x81;
if(bytes[1]<0x39) {
++bytes[1];
} else {
bytes[1]=0x30;
++bytes[0];
}
}
}
}
static int
readRanges() {
char line[200];
char *s, *end;
unsigned long c1, c2;
/* parse the input file from stdin, in the format of gbkuni30.txt */
while(gets(line)!=NULL) {
/* skip empty and comment lines */
if(line[0]==0 || line[0]=='#') {
continue;
}
/* find the Unicode code point range */
s=strstr(line, "U+");
if(s==NULL) {
fprintf(stderr, "error parsing range from \"%s\"\n", line);
return 1;
}
/* read range */
s+=2;
c1=strtoul(s, &end, 16);
if(end==s || *end!='-') {
fprintf(stderr, "error parsing range start from \"%s\"\n", line);
return 1;
}
s=end+1;
c2=strtoul(s, &end, 16);
if(end==s || *end!=' ' && *end!=0) {
fprintf(stderr, "error parsing range end from \"%s\"\n", line);
return 1;
}
/* ignore ranges above the BMP */
if(c2>0xffff) {
c2=0xffff;
}
/* set the flags for all code points in this range */
while(c1<=c2) {
if(flags[c1]!=UNASSIGNED) {
fprintf(stderr, "error: range covers already-assigned U+%04lX\n", c1);
return 1;
}
flags[c1++]=ROUNDTRIP|FOURBYTE;
}
}
return 0;
}
extern int
main(int argc, const char *argv[]) {
char line[200];
char *end;
unsigned long c, b;
unsigned char bytes[4]={ 0x81, 0x30, 0x81, 0x30 };
char flag;
/* parse the input file from stdin, in the format of gbkuni30.txt */
while(gets(line)!=NULL) {
/* skip empty and comment lines */
if(line[0]==0 || line[0]=='#' || line[0]==0x1a) {
continue;
}
/* end of code points, beginning of ranges? */
if(0==strcmp(line, "ranges")) {
int result=readRanges();
if(result!=0) {
return result;
}
break;
}
/* read Unicode code point */
c=strtoul(line, &end, 16);
if(end==line) {
fprintf(stderr, "error: missing code point in \"%s\"\n", line);
return 1;
}
if(*end==':') {
flag=ROUNDTRIP;
} else if(*end=='>') {
flag=FROMU;
} else if(*end=='<') {
flag=TOU;
} else {
fprintf(stderr, "error: delimiter not one of :>< in \"%s\"\n", line);
return 1;
}
/* ignore non-BMP code points */
if(c>0xffff) {
continue;
}
/* read byte sequence as one long value */
b=strtoul(end+1, &end, 16);
if(*end!=0 && *end!='*') {
fprintf(stderr, "error parsing byte sequence from \"%s\"\n", line);
return 1;
}
if(b>0xffff) {
flag|=FOURBYTE;
}
/* set the flag for the code point, make sure the mapping from Unicode is not duplicate */
if((flags[c]&flag&FROMU)!=0) {
fprintf(stderr, "error: duplicate assignment for U+%04lX, old flags %u, new %s\n", c, flags[c], line);
return 1;
}
flags[c]|=flag;
}
if(argc<=1) {
/* generate all four-byte sequences that are not already in the input */
for(c=0x80; c<=0xffff; ++c) {
/* skip single surrogates */
if(c==0xd800) {
c=0xe000;
}
if(flags[c]==UNASSIGNED) {
printf("%04lX:%02X%02X%02X%02X\n", c, bytes[0], bytes[1], bytes[2], bytes[3]);
/* increment the sequence for the next code point */
incFourGB18030(bytes);
} else if(flags[c]&FOURBYTE) {
/* increment the four-byte sequence for each already-used four-byte sequence */
incFourGB18030(bytes);
}
}
} else if(0==strcmp(argv[1], "r")) {
/* generate ranges of contiguous code points with four-byte sequences for what is not covered by the input */
unsigned char b1[4], b2[4];
unsigned long c1, c2;
printf("ranges\n");
for(c1=0x80; c1<=0xffff;) {
/* skip single surrogates */
if(c1==0xd800) {
c1=0xe000;
}
/* get start bytes of range */
memcpy(b1, bytes, 4);
/* look for the first non-range code point */
for(c2=c1; c2<=0xffff && flags[c2]==UNASSIGNED && c2!=0xd800; ++c2) {
/* save this sequence to avoid decrementing it after this loop */
memcpy(b2, bytes, 4);
/* increment the sequence for the next code point */
incFourGB18030(bytes);
}
/* c2 is the first code point after the range; b2 are the bytes for the last code point in the range */
/* print this range, number of codes first for easy sorting */
printf("%06lX U+%04lX-%04lX GB+%02X%02X%02X%02X-%02X%02X%02X%02X\n",
c2-c1, c1, c2-1,
b1[0], b1[1], b1[2], b1[3],
b2[0], b2[1], b2[2], b2[3]);
/* skip single surrogates */
if(c2==0xd800) {
c2=0xe000;
}
/* skip all assigned Unicode BMP code points */
for(c1=c2; c1<=0xffff && flags[c1]!=UNASSIGNED; ++c1) {
if(flags[c1]&FOURBYTE) {
/* increment the four-byte sequence for each already-used four-byte sequence */
incFourGB18030(bytes);
}
}
}
} else {
fprintf(stderr, "unknown mode argument \"%s\"\n", argv[1]);
return 2;
}
return 0;
}