/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* capacity, including the terminating NUL, of each of the line buffers */
enum { LINE_CAPACITY=2000 };

/*
 * Skip spaces and tabs.
 * Returns a pointer to the first character in s that is neither ' ' nor '\t'
 * (possibly the terminating NUL).
 */
static const char *
skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}

/*
 * Return the first character position after the end of the data in l.
 * The data ends at the terminating NUL or, if the line contains a '#' comment,
 * before the spaces/tabs that immediately precede the '#'.
 */
static const char *
endOfData(const char *l) {
    const char *end;

    end=strchr(l, '#');
    if(end!=NULL) {
        /* ignore whitespace before the comment */
        while(l!=end && (*(end-1)==' ' || *(end-1)=='\t')) {
            --end;
        }
    } else {
        end=strchr(l, 0);
    }
    return end;
}

/*
 * Compare the data portions of two lines: everything after the first
 * semicolon up to the end of data as determined by endOfData().
 * Returns nonzero if both portions are byte-for-byte identical.
 * A line without a semicolon never compares as the same.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}

/*
 * Read one line from in into buffer and remove its trailing newline.
 * Bounded replacement for gets(), which cannot limit how much it writes
 * and was removed from the language in C11.
 *
 * Returns 1 if a line was read, 0 at end of input (or on a read error),
 * and -1 if the line does not fit into capacity-1 characters.
 */
static int
readLine(char *buffer, size_t capacity, FILE *in) {
    size_t length;
    int next;

    if(fgets(buffer, (int)capacity, in)==NULL) {
        return 0;
    }

    length=strlen(buffer);
    if(length>0 && buffer[length-1]=='\n') {
        buffer[length-1]=0;
    } else if(length==capacity-1) {
        /*
         * The buffer is full and holds no newline.
         * This is only acceptable if the line ends right here:
         * consume the newline if it comes next, accept the end of input.
         */
        next=getc(in);
        if(next!=EOF && next!='\n') {
            return -1;
        }
    }
    return 1;
}

/*
 * Write the collected range first..last to stdout.
 * firstLine is the input line for first; lastLine is the input line for last
 * and is only examined when first!=last.
 */
static void
printRange(long first, long last, const char *firstLine, const char *lastLine) {
    const char *data, *comment;

    if(first==last) {
        /* there was no range, just output the one line we found */
        puts(firstLine);
        return;
    }

    /* there was a real range, merge their lines */
    data=strchr(firstLine, ';');    /* first line starting from the first ; */
    comment=strchr(lastLine, '#');
    if(comment==NULL) {
        /* no comment in second line */
        printf("%04lX..%04lX%s\n",
               (unsigned long)first, (unsigned long)last, data);
    } else if(strchr(firstLine, '#')==NULL) {
        /* no comment in first line: append the comment from the second line */
        printf("%04lX..%04lX%s%s\n",
               (unsigned long)first, (unsigned long)last, data, comment);
    } else {
        /* merge comments from both lines; second comment follows after # and spaces */
        printf("%04lX..%04lX%s..%s\n",
               (unsigned long)first, (unsigned long)last, data, skipWhitespace(comment+1));
    }
}

/*
 * Filter stdin to stdout.
 * Lines that start with a hexadecimal code point followed by a semicolon are
 * collected; runs of consecutive code points with identical data are written
 * as a single "first..last" range line. All other lines are copied unchanged.
 *
 * Returns 0 on success, EXIT_FAILURE if an input line is too long for the
 * line buffers.
 */
extern int
main(int argc, const char *argv[]) {
    static char line[LINE_CAPACITY], firstLine[LINE_CAPACITY], lastLine[LINE_CAPACITY];
    char *end;
    long first, last, c;
    int status, finished;

    (void)argc;
    (void)argv;

    first=last=-1;
    finished=0;

    for(;;) {
        status=readLine(line, sizeof(line), stdin);
        if(status<0) {
            fprintf(stderr, "ucdmerge: input line longer than %d characters\n", LINE_CAPACITY-1);
            return EXIT_FAILURE;
        } else if(status>0) {
            /* parse the initial code point, if any */
            c=strtol(line, &end, 16);
            if(end==line || c<0 || *skipWhitespace(end)!=';') {
                /* not a single code point followed by semicolon and data */
                c=-1;
            }
        } else {
            line[0]=0;
            c=-1;
            finished=1;
        }

        /*
         * c>=-1 here, so c-1 cannot overflow;
         * the original form last+1 could, for last==LONG_MAX.
         * sameData() is only reached when c==last+1, that is, when line has data.
         */
        if(last>=0 && ((c-1)!=last || !sameData(firstLine, line))) {
            /* output the current range */
            printRange(first, last, firstLine, lastLine);
            first=last=-1;
        }

        if(c<0) {
            if(finished) {
                break;
            }

            /* no data on this line, output as is */
            puts(line);
        } else if(last<0) {
            /* data on this line: set as the first line in a possible range */
            first=last=c;
            strcpy(firstLine, line);
            lastLine[0]=0;
        } else /* must be c==(last+1) && sameData() because of previous conditions */ {
            /* continue with the current range */
            last=c;
            strcpy(lastLine, line);
        }
    }

    return 0;
}