ICU-701 standalone tool for canonicalizing .ucm files

X-SVN-Rev: 2885
This commit is contained in:
Markus Scherer 2000-11-08 19:28:01 +00:00
parent 21948fbb87
commit abc83a863c

View File

@@ -0,0 +1,171 @@
/*
*******************************************************************************
*
* Copyright (C) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: canonucm.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2000nov08
* created by: Markus W. Scherer
*
* This tool reads a .ucm file and canonicalizes it: In the CHARMAP section,
* - sort by Unicode code points
* - print all code points in uppercase hexadecimal
* - print all Unicode code points with 4, 5, or 6 digits as needed
* - remove the comments
* - remove unnecessary spaces
*
* To compile, just call a C compiler/linker with this source file.
* On Windows: cl canonucm.c
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One entry of the CHARMAP section.
 *   u - Unicode code point
 *   b - codepage bytes, packed into one integer with the first byte
 *       in the most significant position
 *   f - fallback indicator, the digit that follows '|'
 */
typedef struct Mapping {
    unsigned long u, b, f;
} Mapping;

/* all mappings collected from the CHARMAP section; sorted before output */
static Mapping
mappings[200000];

/*
 * qsort() comparison function for Mappings.
 *
 * Orders by Unicode code point first, then by the packed codepage bytes,
 * then by the fallback indicator.
 * Returns a negative value, 0, or a positive value if *left sorts before,
 * equal to, or after *right.
 *
 * The fields are compared explicitly rather than subtracted: the difference
 * of two unsigned long values does not always fit into a long (for example
 * with 4-byte codepage sequences where long has 32 bits), and casting such a
 * difference to a signed type yields the wrong sign.
 */
static int
compareMappings(const void *left, const void *right) {
    const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right;

    if(l->u!=r->u) {
        return l->u<r->u ? -1 : 1;
    }
    if(l->b!=r->b) {
        return l->b<r->b ? -1 : 1;
    }
    if(l->f!=r->f) {
        return l->f<r->f ? -1 : 1;
    }
    return 0;
}
extern int
main(int argc, const char *argv[]) {
char line[200];
char *s, *end;
unsigned long b, i, mappingsTop=0;
/* parse the input file from stdin */
/* read and copy header */
do {
if(gets(line)==NULL) {
fprintf(stderr, "error: no mapping section");
return 1;
}
puts(line);
} while(0!=strcmp(line, "CHARMAP"));
/* copy empty and comment lines before the first mapping */
for(;;) {
if(gets(line)==NULL) {
fprintf(stderr, "error: no mappings");
return 1;
}
if(line[0]!=0 && line[0]!='#') {
break;
}
puts(line);
}
/* process the charmap section, start with the line read above */
for(;;) {
/* ignore empty and comment lines */
if(line[0]!=0 && line[0]!='#') {
if(0!=strcmp(line, "END CHARMAP")) {
if(mappingsTop==sizeof(mappings)/sizeof(mappings[0])) {
fprintf(stderr, "too many mappings\n");
return 1;
}
/* parse mapping */
if(line[0]!='<' || line[1]!='U') {
fprintf(stderr, "parse error (does not start with \"<U\") in mapping line \"%s\"\n", line);
return 1;
}
/* parse Unicode code point */
mappings[mappingsTop].u=strtoul(line+2, &end, 16);
if(end==line+2 || mappings[mappingsTop].u>0x10ffff || *end!='>') {
fprintf(stderr, "parse error (Unicode code point) in mapping line \"%s\"\n", line);
return 1;
}
/* skip white space */
s=end+1;
while(*s==' ' || *s=='\t') {
++s;
}
/* parse codepage bytes */
b=0;
for(;;) {
if(*s!='\\') {
break;
}
if(s[1]!='x') {
fprintf(stderr, "parse error (no 'x' in \"\\xXX\") in mapping line \"%s\"\n", line);
return 1;
}
s+=2;
b=(b<<8)|strtoul(s, &end, 16);
if(end!=s+2) {
fprintf(stderr, "parse error (codepage byte) in mapping line \"%s\"\n", line);
return 1;
}
s+=2;
}
mappings[mappingsTop].b=b;
/* skip everything until the fallback indicator */
while(*s!='|') {
if(*s==0) {
fprintf(stderr, "parse error (missing '|' fallback indicator) in mapping line \"%s\"\n", line);
return 1;
}
++s;
}
/* parse fallback indicator */
i=s[1]-'0';
if(i>3) {
fprintf(stderr, "parse error (fallback indicator not 0..3) in mapping line \"%s\"\n", line);
return 1;
}
mappings[mappingsTop++].f=i;
} else {
/* sort and write all mappings */
if(mappingsTop>0) {
qsort(mappings, mappingsTop, sizeof(Mapping), compareMappings);
for(i=0; i<mappingsTop; ++i) {
b=mappings[i].b;
if(b<=0xff) {
printf("<U%04lX> \\x%02lX |%lu\n", mappings[i].u, b, mappings[i].f);
} else if(b<=0xffff) {
printf("<U%04lX> \\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>8, b&0xff, mappings[i].f);
} else if(b<=0xffffff) {
printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>16, (b>>8)&0xff, b&0xff, mappings[i].f);
} else {
printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>24, (b>>16)&0xff, (b>>8)&0xff, b&0xff, mappings[i].f);
}
}
}
/* output "END CHARMAP" */
puts(line);
return 0;
}
}
/* read the next line */
if(gets(line)==NULL) {
fprintf(stderr, "incomplete charmap section\n");
return 1;
}
}
}