/*
*******************************************************************************
*
*   Copyright (C) 2000, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  canonucm.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000nov08
*   created by: Markus W. Scherer
*
*   This tool reads a .ucm file and canonicalizes it: In the CHARMAP section,
*   - sort by Unicode code points
*   - print all code points in uppercase hexadecimal
*   - print all Unicode code points with 4, 5, or 6 digits as needed
*   - remove the comments
*   - remove unnecessary spaces
*
*   The file is read from stdin and the canonical form is written to stdout;
*   diagnostics go to stderr and the exit code is 1 on any error.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl canonucm.c
*/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum {
    /* size of the input line buffer, including the terminating NUL */
    LINE_CAPACITY=1024,
    /* the output code can print at most this many bytes per mapping */
    MAX_CODEPAGE_BYTES=4
};

/*
 * One CHARMAP entry:
 * u = Unicode code point, b = codepage bytes packed big-endian into one
 * integer, f = fallback indicator (0..3).
 */
typedef struct Mapping {
    unsigned long u, b, f;
} Mapping;

static Mapping
mappings[200000];

/* three-way comparison of two unsigned values: returns -1, 0, or 1 */
static int
compareUnsigned(unsigned long left, unsigned long right) {
    /* compare directly; subtracting and casting to a signed type can flip the sign */
    return (left>right)-(left<right);
}

/*
 * lexically compare Mappings for sorting (qsort callback):
 * order by code point, then by codepage bytes, then by fallback indicator
 */
static int
compareMappings(const void *left, const void *right) {
    const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right;
    int result;

    result=compareUnsigned(l->u, r->u);
    if(result!=0) {
        return result;
    }

    result=compareUnsigned(l->b, r->b);
    if(result!=0) {
        return result;
    }

    return compareUnsigned(l->f, r->f);
}

/*
 * Read one line from stdin into buffer and strip the trailing newline.
 * Returns 1 if a line was read, 0 at end of input (or on a read error).
 * A line that does not fit into the buffer is a fatal error: a message is
 * printed and the program exits with code 1 instead of overrunning the buffer.
 */
static int
readLine(char *buffer, size_t capacity) {
    size_t length;
    int c;

    if(fgets(buffer, (int)capacity, stdin)==NULL) {
        return 0;
    }

    length=strlen(buffer);
    if(length>0 && buffer[length-1]=='\n') {
        buffer[length-1]=0;
    } else if(length==capacity-1) {
        /*
         * The buffer is full and holds no newline: the line fits only if
         * the very next character ends it.
         */
        c=getc(stdin);
        if(c!=EOF && c!='\n') {
            fprintf(stderr, "error: input line longer than %lu characters\n",
                    (unsigned long)(capacity-1));
            exit(1);
        }
    }
    return 1;
}

/*
 * Parse one mapping line of the form "<UXXXX> \xXX[\xXX...] |F" into *mapping.
 * Returns 1 on success. On a syntax error, prints a message to stderr and
 * returns 0; *mapping may then be partially written.
 */
static int
parseMapping(const char *line, Mapping *mapping) {
    const char *s;
    char *end;
    unsigned long b;
    int byteCount;

    if(line[0]!='<' || line[1]!='U') {
        fprintf(stderr, "parse error (does not start with \"<U\") in mapping line \"%s\"\n", line);
        return 0;
    }

    /* require a hex digit first: strtoul() alone would also accept white space and a sign */
    if(!isxdigit((unsigned char)line[2])) {
        fprintf(stderr, "parse error (Unicode code point) in mapping line \"%s\"\n", line);
        return 0;
    }
    mapping->u=strtoul(line+2, &end, 16);
    if(end==line+2 || mapping->u>0x10ffff || *end!='>') {
        fprintf(stderr, "parse error (Unicode code point) in mapping line \"%s\"\n", line);
        return 0;
    }

    /* skip white space */
    s=end+1;
    while(*s==' ' || *s=='\t') {
        ++s;
    }

    /* parse codepage bytes, each written as \xXX */
    b=0;
    byteCount=0;
    while(*s=='\\') {
        if(s[1]!='x') {
            fprintf(stderr, "parse error (no 'x' in \"\\xXX\") in mapping line \"%s\"\n", line);
            return 0;
        }
        s+=2;

        /* both characters must be hex digits so that "\x-1" or "\x 1" is not taken as a byte */
        if(!isxdigit((unsigned char)s[0]) || !isxdigit((unsigned char)s[1])) {
            fprintf(stderr, "parse error (codepage byte) in mapping line \"%s\"\n", line);
            return 0;
        }
        if(byteCount==MAX_CODEPAGE_BYTES) {
            fprintf(stderr, "parse error (more than %d codepage bytes) in mapping line \"%s\"\n",
                    (int)MAX_CODEPAGE_BYTES, line);
            return 0;
        }

        b=(b<<8)|strtoul(s, &end, 16);
        if(end!=s+2) {
            /* more than two hex digits after "\x" */
            fprintf(stderr, "parse error (codepage byte) in mapping line \"%s\"\n", line);
            return 0;
        }
        s+=2;
        ++byteCount;
    }
    mapping->b=b;

    /* skip everything until the fallback indicator */
    while(*s!='|') {
        if(*s==0) {
            fprintf(stderr, "parse error (missing '|' fallback indicator) in mapping line \"%s\"\n", line);
            return 0;
        }
        ++s;
    }

    /* parse fallback indicator: a single digit 0..3 right after the '|' */
    if(s[1]<'0' || s[1]>'3') {
        fprintf(stderr, "parse error (fallback indicator not 0..3) in mapping line \"%s\"\n", line);
        return 0;
    }
    mapping->f=(unsigned long)(s[1]-'0');
    return 1;
}

/*
 * Write one mapping to stdout in canonical form: the code point with at least
 * four uppercase hex digits, then as many \xXX bytes as the packed value needs,
 * then the fallback indicator.
 */
static void
printMapping(const Mapping *mapping) {
    unsigned long b=mapping->b;

    if(b<=0xff) {
        printf("<U%04lX> \\x%02lX |%lu\n", mapping->u, b, mapping->f);
    } else if(b<=0xffff) {
        printf("<U%04lX> \\x%02lX\\x%02lX |%lu\n", mapping->u, b>>8, b&0xff, mapping->f);
    } else if(b<=0xffffff) {
        printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX |%lu\n", mapping->u, b>>16, (b>>8)&0xff, b&0xff, mapping->f);
    } else {
        printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX\\x%02lX |%lu\n", mapping->u, b>>24, (b>>16)&0xff, (b>>8)&0xff, b&0xff, mapping->f);
    }
}

/*
 * Copy the .ucm header from stdin to stdout up to and including the "CHARMAP"
 * line, copy empty and comment lines that precede the first mapping, then read
 * all mappings up to "END CHARMAP", sort them and print them canonically.
 * Comment and empty lines between mappings are dropped.
 * Returns 0 on success and 1 on any input error. The arguments are not used.
 */
extern int
main(int argc, const char *argv[]) {
    static char line[LINE_CAPACITY];
    size_t i, mappingsTop=0;

    (void)argc;
    (void)argv;

    /* parse the input file from stdin */
    /* read and copy header */
    do {
        if(!readLine(line, sizeof(line))) {
            fprintf(stderr, "error: no mapping section");
            return 1;
        }
        puts(line);
    } while(0!=strcmp(line, "CHARMAP"));

    /* copy empty and comment lines before the first mapping */
    for(;;) {
        if(!readLine(line, sizeof(line))) {
            fprintf(stderr, "error: no mappings");
            return 1;
        }
        if(line[0]!=0 && line[0]!='#') {
            break;
        }
        puts(line);
    }

    /* process the charmap section, start with the line read above */
    for(;;) {
        /* ignore empty and comment lines */
        if(line[0]!=0 && line[0]!='#') {
            if(0==strcmp(line, "END CHARMAP")) {
                break;
            }

            if(mappingsTop==sizeof(mappings)/sizeof(mappings[0])) {
                fprintf(stderr, "too many mappings\n");
                return 1;
            }
            if(!parseMapping(line, &mappings[mappingsTop])) {
                return 1;
            }
            ++mappingsTop;
        }

        /* read the next line */
        if(!readLine(line, sizeof(line))) {
            fprintf(stderr, "incomplete charmap section\n");
            return 1;
        }
    }

    /* sort and write all mappings */
    if(mappingsTop>0) {
        qsort(mappings, mappingsTop, sizeof(Mapping), compareMappings);
        for(i=0; i<mappingsTop; ++i) {
            printMapping(&mappings[i]);
        }
    }

    /* output "END CHARMAP" */
    puts(line);
    return 0;
}