/* ******************************************************************************* * * Copyright (C) 2000, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: gbmake4.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2000oct19 * created by: Markus W. Scherer * * This tool reads and processes codepage mapping files for GB 18030. * Its main function is to read a mapping table with the one- and two-byte * mappings of GB 18030 and to then output a mapping table with all of the * four-byte mappings for the BMP. * Four-byte mappings that are included in the input are skipped in the output. * When an "r" argument is specified, it will instead write a list of * ranges of contiguous mappings where both Unicode code points and GB 18030 * four-byte sequences form contiguous blocks. * This kind of output can be appended to a mapping table with a "ranges" line * in between, and the resulting output will exclude the input ranges. * This is useful for generating a partial mapping table and to handle the input * ranges algorithmically in conversion. * * Single surrogates are excluded from the output. * * To compile, just call a C compiler/linker with this source file. * On Windows: cl gbmake4.c */ #include #include #include /* array of flags for each Unicode BMP code point */ static char flags[0x10000]={ 0 }; /* flag values: 0: not assigned 1:from Unicode 2:to Unicode 4:four-byte sequence */ #define UNASSIGNED 0 #define FROMU 1 #define TOU 2 #define ROUNDTRIP 3 #define FOURBYTE 4 static void incFourGB18030(unsigned char bytes[4]) { if(bytes[3]<0x39) { ++bytes[3]; } else { bytes[3]=0x30; if(bytes[2]<0xfe) { ++bytes[2]; } else { bytes[2]=0x81; if(bytes[1]<0x39) { ++bytes[1]; } else { bytes[1]=0x30; ++bytes[0]; } } } } static int readRanges() { char line[200]; char *s, *end; unsigned long c1, c2; /* parse the input file from stdin, in the format of gbkuni30.txt */ while(gets(line)!=NULL) { /* skip empty and comment lines */ if(line[0]==0 || line[0]=='#') { continue; } /* find the Unicode code point range */ s=strstr(line, "U+"); if(s==NULL) { fprintf(stderr, "error parsing range from \"%s\"\n", line); return 1; } /* read range */ s+=2; c1=strtoul(s, &end, 16); if(end==s || *end!='-') { fprintf(stderr, "error parsing range start from \"%s\"\n", line); return 1; } s=end+1; c2=strtoul(s, &end, 16); if(end==s || *end!=' ' && *end!=0) { fprintf(stderr, "error parsing range end from \"%s\"\n", line); return 1; } /* ignore ranges above the BMP */ if(c2>0xffff) { c2=0xffff; } /* set the flags for all code points in this range */ while(c1<=c2) { if(flags[c1]!=UNASSIGNED) { fprintf(stderr, "error: range covers already-assigned U+%04lX\n", c1); return 1; } flags[c1++]=ROUNDTRIP|FOURBYTE; } } return 0; } extern int main(int argc, const char *argv[]) { char line[200]; char *end; unsigned long c, b; unsigned char bytes[4]={ 0x81, 0x30, 0x81, 0x30 }; char flag; /* parse the input file from stdin, in the format of gbkuni30.txt */ while(gets(line)!=NULL) { /* skip empty and comment lines */ if(line[0]==0 || line[0]=='#' || line[0]==0x1a) { continue; } /* end of code points, beginning of ranges? */ if(0==strcmp(line, "ranges")) { int result=readRanges(); if(result!=0) { return result; } break; } /* read Unicode code point */ c=strtoul(line, &end, 16); if(end==line) { fprintf(stderr, "error: missing code point in \"%s\"\n", line); return 1; } if(*end==':') { flag=ROUNDTRIP; } else if(*end=='>') { flag=FROMU; } else if(*end=='<') { flag=TOU; } else { fprintf(stderr, "error: delimiter not one of :>< in \"%s\"\n", line); return 1; } /* ignore non-BMP code points */ if(c>0xffff) { continue; } /* read byte sequence as one long value */ b=strtoul(end+1, &end, 16); if(*end!=0 && *end!='*') { fprintf(stderr, "error parsing byte sequence from \"%s\"\n", line); return 1; } if(b>0xffff) { flag|=FOURBYTE; } /* set the flag for the code point, make sure the mapping from Unicode is not duplicate */ if((flags[c]&flag&FROMU)!=0) { fprintf(stderr, "error: duplicate assignment for U+%04lX, old flags %u, new %s\n", c, flags[c], line); return 1; } flags[c]|=flag; } if(argc<=1) { /* generate all four-byte sequences that are not already in the input */ for(c=0x80; c<=0xffff; ++c) { /* skip single surrogates */ if(c==0xd800) { c=0xe000; } if(flags[c]==UNASSIGNED) { printf("%04lX:%02X%02X%02X%02X\n", c, bytes[0], bytes[1], bytes[2], bytes[3]); /* increment the sequence for the next code point */ incFourGB18030(bytes); } else if(flags[c]&FOURBYTE) { /* increment the four-byte sequence for each already-used four-byte sequence */ incFourGB18030(bytes); } } } else if(0==strcmp(argv[1], "r")) { /* generate ranges of contiguous code points with four-byte sequences for what is not covered by the input */ unsigned char b1[4], b2[4]; unsigned long c1, c2; printf("ranges\n"); for(c1=0x80; c1<=0xffff;) { /* skip single surrogates */ if(c1==0xd800) { c1=0xe000; } /* get start bytes of range */ memcpy(b1, bytes, 4); /* look for the first non-range code point */ for(c2=c1; c2<=0xffff && flags[c2]==UNASSIGNED && c2!=0xd800; ++c2) { /* save this sequence to avoid decrementing it after this loop */ memcpy(b2, bytes, 4); /* increment the sequence for the next code point */ incFourGB18030(bytes); } /* c2 is the first code point after the range; b2 are the bytes for the last code point in the range */ /* print this range, number of codes first for easy sorting */ printf("%06lX U+%04lX-%04lX GB+%02X%02X%02X%02X-%02X%02X%02X%02X\n", c2-c1, c1, c2-1, b1[0], b1[1], b1[2], b1[3], b2[0], b2[1], b2[2], b2[3]); /* skip single surrogates */ if(c2==0xd800) { c2=0xe000; } /* skip all assigned Unicode BMP code points */ for(c1=c2; c1<=0xffff && flags[c1]!=UNASSIGNED; ++c1) { if(flags[c1]&FOURBYTE) { /* increment the four-byte sequence for each already-used four-byte sequence */ incFourGB18030(bytes); } } } } else { fprintf(stderr, "unknown mode argument \"%s\"\n", argv[1]); return 2; } return 0; }