ICU-701 standalone tool for canonicalizing .ucm files
X-SVN-Rev: 2885
This commit is contained in:
parent
21948fbb87
commit
abc83a863c
171
icu4c/source/tools/makeconv/misc/canonucm.c
Normal file
171
icu4c/source/tools/makeconv/misc/canonucm.c
Normal file
@ -0,0 +1,171 @@
|
||||
/*
*******************************************************************************
*
*   Copyright (C) 2000, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  canonucm.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000nov08
*   created by: Markus W. Scherer
*
*   This tool reads a .ucm file and canonicalizes it: In the CHARMAP section,
*   - sort by Unicode code points
*   - print all code points in uppercase hexadecimal
*   - print all Unicode code points with 4, 5, or 6 digits as needed
*   - remove the comments
*   - remove unnecessary spaces
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl canonucm.c
*/
|
||||
|
||||
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
/*
 * One mapping line from the CHARMAP section:
 *   u - Unicode code point
 *   b - codepage bytes, packed with the first byte in the most significant position
 *   f - fallback indicator (the digit after '|')
 */
typedef struct Mapping {
    unsigned long u, b, f;
} Mapping;

/* storage for all mappings of the CHARMAP section */
static Mapping
mappings[200000];

/*
 * Lexically compare two Mappings for qsort():
 * first by Unicode code point, then by codepage bytes, then by fallback indicator.
 *
 * Returns a negative value, 0, or a positive value if *left sorts before,
 * equal to, or after *right.
 *
 * The fields are compared explicitly instead of being subtracted:
 * the difference of two unsigned long values does not fit into a long when it
 * exceeds LONG_MAX (for example 4-byte codepage values with a 32-bit long),
 * which would yield the wrong sign.
 */
static int
compareMappings(const void *left, const void *right) {
    const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right;

    if(l->u!=r->u) {
        return l->u<r->u ? -1 : 1;
    }
    if(l->b!=r->b) {
        return l->b<r->b ? -1 : 1;
    }
    if(l->f!=r->f) {
        return l->f<r->f ? -1 : 1;
    }
    return 0;
}
|
||||
|
||||
extern int
|
||||
main(int argc, const char *argv[]) {
|
||||
char line[200];
|
||||
char *s, *end;
|
||||
unsigned long b, i, mappingsTop=0;
|
||||
|
||||
/* parse the input file from stdin */
|
||||
/* read and copy header */
|
||||
do {
|
||||
if(gets(line)==NULL) {
|
||||
fprintf(stderr, "error: no mapping section");
|
||||
return 1;
|
||||
}
|
||||
puts(line);
|
||||
} while(0!=strcmp(line, "CHARMAP"));
|
||||
|
||||
/* copy empty and comment lines before the first mapping */
|
||||
for(;;) {
|
||||
if(gets(line)==NULL) {
|
||||
fprintf(stderr, "error: no mappings");
|
||||
return 1;
|
||||
}
|
||||
if(line[0]!=0 && line[0]!='#') {
|
||||
break;
|
||||
}
|
||||
puts(line);
|
||||
}
|
||||
|
||||
/* process the charmap section, start with the line read above */
|
||||
for(;;) {
|
||||
/* ignore empty and comment lines */
|
||||
if(line[0]!=0 && line[0]!='#') {
|
||||
if(0!=strcmp(line, "END CHARMAP")) {
|
||||
if(mappingsTop==sizeof(mappings)/sizeof(mappings[0])) {
|
||||
fprintf(stderr, "too many mappings\n");
|
||||
return 1;
|
||||
}
|
||||
/* parse mapping */
|
||||
if(line[0]!='<' || line[1]!='U') {
|
||||
fprintf(stderr, "parse error (does not start with \"<U\") in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
/* parse Unicode code point */
|
||||
mappings[mappingsTop].u=strtoul(line+2, &end, 16);
|
||||
if(end==line+2 || mappings[mappingsTop].u>0x10ffff || *end!='>') {
|
||||
fprintf(stderr, "parse error (Unicode code point) in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
/* skip white space */
|
||||
s=end+1;
|
||||
while(*s==' ' || *s=='\t') {
|
||||
++s;
|
||||
}
|
||||
/* parse codepage bytes */
|
||||
b=0;
|
||||
for(;;) {
|
||||
if(*s!='\\') {
|
||||
break;
|
||||
}
|
||||
if(s[1]!='x') {
|
||||
fprintf(stderr, "parse error (no 'x' in \"\\xXX\") in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
s+=2;
|
||||
b=(b<<8)|strtoul(s, &end, 16);
|
||||
if(end!=s+2) {
|
||||
fprintf(stderr, "parse error (codepage byte) in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
s+=2;
|
||||
}
|
||||
mappings[mappingsTop].b=b;
|
||||
/* skip everything until the fallback indicator */
|
||||
while(*s!='|') {
|
||||
if(*s==0) {
|
||||
fprintf(stderr, "parse error (missing '|' fallback indicator) in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
/* parse fallback indicator */
|
||||
i=s[1]-'0';
|
||||
if(i>3) {
|
||||
fprintf(stderr, "parse error (fallback indicator not 0..3) in mapping line \"%s\"\n", line);
|
||||
return 1;
|
||||
}
|
||||
mappings[mappingsTop++].f=i;
|
||||
} else {
|
||||
/* sort and write all mappings */
|
||||
if(mappingsTop>0) {
|
||||
qsort(mappings, mappingsTop, sizeof(Mapping), compareMappings);
|
||||
for(i=0; i<mappingsTop; ++i) {
|
||||
b=mappings[i].b;
|
||||
if(b<=0xff) {
|
||||
printf("<U%04lX> \\x%02lX |%lu\n", mappings[i].u, b, mappings[i].f);
|
||||
} else if(b<=0xffff) {
|
||||
printf("<U%04lX> \\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>8, b&0xff, mappings[i].f);
|
||||
} else if(b<=0xffffff) {
|
||||
printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>16, (b>>8)&0xff, b&0xff, mappings[i].f);
|
||||
} else {
|
||||
printf("<U%04lX> \\x%02lX\\x%02lX\\x%02lX\\x%02lX |%lu\n", mappings[i].u, b>>24, (b>>16)&0xff, (b>>8)&0xff, b&0xff, mappings[i].f);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* output "END CHARMAP" */
|
||||
puts(line);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
/* read the next line */
|
||||
if(gets(line)==NULL) {
|
||||
fprintf(stderr, "incomplete charmap section\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user