/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Skip leading horizontal whitespace.
 *
 * Returns a pointer to the first character in s that is neither a space
 * nor a tab. This may be the terminating NUL if s contains nothing else.
 * Newlines and other whitespace characters are not skipped.
 */
static const char *
skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}

/*
 * Return the first character position after the end of the data in l.
 *
 * If l contains a '#' comment, the result points at the start of the run of
 * spaces/tabs that immediately precedes the '#' (or at the '#' itself if
 * there is no such whitespace). Otherwise it points at the terminating NUL.
 *
 * The returned pointer aliases l; it is non-const only because strchr()
 * returns char *, so callers must not write through it unless l is writable.
 */
static char *
endOfData(const char *l) {
    char *end;
    char c;

    end=strchr(l, '#');
    if(end!=NULL) {
        /* ignore whitespace before the comment, but never back up past l */
        while(l!=end && ((c=*(end-1))==' ' || c=='\t')) {
            --end;
        }
    } else {
        /* no comment: the data runs to the end of the string */
        end=strchr(l, 0);
    }
    return end;
}

/*
 * Compare the data portions of two lines.
 *
 * The data portion of a line starts right after its first semicolon and ends
 * where endOfData() says it ends (before a '#' comment and the whitespace in
 * front of it, or at the end of the string).
 *
 * Returns non-zero if both data portions have the same length and bytes,
 * 0 otherwise. A line without any semicolon has no data portion and never
 * compares equal.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    ptrdiff_t length1, length2;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* callers only pass data lines, but do not do arithmetic on NULL */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length1=end1-l1;
    length2=end2-l2;
    return length1==length2 && 0==memcmp(l1, l2, (size_t)length1);
}

extern int
|
||
|
main(int argc, const char *argv[]) {
|
||
|
static char line[2000], firstLine[2000], lastLine[2000];
|
||
|
char *end;
|
||
|
long first, last, c;
|
||
|
int finished;
|
||
|
|
||
|
first=last=-1;
|
||
|
finished=0;
|
||
|
|
||
|
for(;;) {
|
||
|
if(gets(line)!=NULL) {
|
||
|
/* parse the initial code point, if any */
|
||
|
c=strtol(line, &end, 16);
|
||
|
if(end!=line && *skipWhitespace(end)==';') {
|
||
|
/* single code point followed by semicolon and data, keep c */
|
||
|
} else {
|
||
|
c=-1;
|
||
|
}
|
||
|
} else {
|
||
|
line[0]=0;
|
||
|
c=-1;
|
||
|
finished=1;
|
||
|
}
|
||
|
|
||
|
if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
|
||
|
/* output the current range */
|
||
|
if(first==last) {
|
||
|
/* there was no range, just output the one line we found */
|
||
|
puts(firstLine);
|
||
|
} else {
|
||
|
/* there was a real range, merge their lines */
|
||
|
end=strchr(lastLine, '#');
|
||
|
if(end==NULL) {
|
||
|
/* no comment in second line */
|
||
|
printf("%04lX..%04lX%s\n",
|
||
|
first, last, /* code point range */
|
||
|
strchr(firstLine, ';'));/* first line starting from the first ; */
|
||
|
} else if(strchr(firstLine, '#')==NULL) {
|
||
|
/* no comment in first line */
|
||
|
printf("%04lX..%04lX%s%s\n",
|
||
|
first, last, /* code point range */
|
||
|
strchr(firstLine, ';'), /* first line starting from the first ; */
|
||
|
end); /* comment from second line */
|
||
|
} else {
|
||
|
/* merge comments from both lines */
|
||
|
printf("%04lX..%04lX%s..%s\n",
|
||
|
first, last, /* code point range */
|
||
|
strchr(firstLine, ';'), /* first line starting from the first ; */
|
||
|
skipWhitespace(end+1)); /* comment from second line, after # and spaces */
|
||
|
}
|
||
|
}
|
||
|
first=last=-1;
|
||
|
}
|
||
|
|
||
|
if(c<0) {
|
||
|
if(finished) {
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* no data on this line, output as is */
|
||
|
puts(line);
|
||
|
} else {
|
||
|
/* data on this line, store for possible range compaction */
|
||
|
if(last<0) {
|
||
|
/* set as the first line in a possible range */
|
||
|
first=last=c;
|
||
|
strcpy(firstLine, line);
|
||
|
lastLine[0]=0;
|
||
|
} else /* must be c==(last+1) && sameData() because of previous conditions */ {
|
||
|
/* continue with the current range */
|
||
|
last=c;
|
||
|
strcpy(lastLine, line);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|