1999-12-10 01:44:19 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
1999-12-13 22:25:50 +00:00
|
|
|
*
|
|
|
|
* Copyright (C) 1999, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
1999-12-10 01:44:19 +00:00
|
|
|
*******************************************************************************
|
|
|
|
* file name: genprops.c
|
|
|
|
* encoding: US-ASCII
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
|
|
|
* created on: 1999dec08
|
|
|
|
* created by: Markus W. Scherer
|
|
|
|
*
|
|
|
|
* This program reads the Unicode character database text file,
|
|
|
|
* parses it, and extracts most of the properties for each character.
|
|
|
|
* It then writes a binary file containing the properties
|
|
|
|
* that is designed to be used directly for random-access to
|
|
|
|
* the properties of each Unicode character.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include "utypes.h"
|
|
|
|
#include "uchar.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "filestrm.h"
|
|
|
|
#include "udata.h"
|
|
|
|
#include "unewdata.h"
|
1999-12-13 22:25:50 +00:00
|
|
|
#include "genprops.h"
|
1999-12-10 01:44:19 +00:00
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
/*
 * Command-line switches, assigned in main():
 * the -v option controls beVerbose, the -c option controls haveCopyright.
 */
extern bool_t beVerbose=FALSE;
extern bool_t haveCopyright=TRUE;
|
1999-12-10 01:44:19 +00:00
|
|
|
|
|
|
|
/* general categories */
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
extern const char *const
|
|
|
|
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
1999-12-10 01:44:19 +00:00
|
|
|
NULL,
|
|
|
|
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
|
|
|
|
"Mc", "Nd", "Nl", "No",
|
|
|
|
"Zs", "Zl", "Zp",
|
|
|
|
"Cc", "Cf", "Co", "Cs",
|
|
|
|
"Pd", "Ps", "Pe", "Pc", "Po",
|
|
|
|
"Sm", "Sc", "Sk", "So",
|
|
|
|
"Pi", "Pf",
|
|
|
|
"Cn"
|
|
|
|
};
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
extern const char *const
|
|
|
|
bidiNames[U_CHAR_DIRECTION_COUNT]={
|
1999-12-10 01:44:19 +00:00
|
|
|
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
|
|
|
|
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
|
|
|
};
|
|
|
|
|
|
|
|
/* prototypes --------------------------------------------------------------- */
|
|
|
|
|
|
|
|
static void
|
|
|
|
init();
|
|
|
|
|
|
|
|
static void
|
|
|
|
parseDB(FileStream *in);
|
|
|
|
|
|
|
|
static int16_t
|
|
|
|
getField(char *line, int16_t start, int16_t limit);
|
|
|
|
|
|
|
|
static void
|
|
|
|
checkLineIndex(uint32_t code, int16_t limit, int16_t length);
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
extern int
|
|
|
|
main(int argc, char *argv[]) {
|
|
|
|
FileStream *in;
|
|
|
|
char *arg, *filename=NULL;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if(argc<=1) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
|
|
|
|
"\tread the UnicodeData.txt file and \n"
|
|
|
|
"\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
|
|
|
|
"\toptions:\n"
|
|
|
|
"\t\t-v[+|-] verbose output\n"
|
|
|
|
"\t\t-c[+|-] do (not) include a copyright notice\n"
|
|
|
|
"\t\tfilename absolute path/filename for the\n"
|
|
|
|
"\t\t\tUnicode database text file (default: standard input)\n",
|
|
|
|
argv[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
for(i=1; i<argc; ++i) {
|
|
|
|
arg=argv[i];
|
|
|
|
if(arg[0]=='-') {
|
|
|
|
switch(arg[1]) {
|
|
|
|
case 'v':
|
|
|
|
beVerbose= arg[2]=='+';
|
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
haveCopyright= arg[2]=='+';
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
filename=arg;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(filename==NULL) {
|
|
|
|
in=T_FileStream_stdin();
|
|
|
|
} else {
|
|
|
|
in=T_FileStream_open(filename, "r");
|
|
|
|
if(in==NULL) {
|
|
|
|
fprintf(stderr, "genprops: unable to open input file %s\n", filename);
|
|
|
|
exit(U_FILE_ACCESS_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
init();
|
1999-12-15 19:04:11 +00:00
|
|
|
initStore();
|
1999-12-10 01:44:19 +00:00
|
|
|
parseDB(in);
|
1999-12-15 04:42:56 +00:00
|
|
|
repeatProps();
|
1999-12-13 22:25:50 +00:00
|
|
|
compactProps();
|
1999-12-15 04:42:56 +00:00
|
|
|
compactStage3();
|
|
|
|
compactStage2();
|
1999-12-10 01:44:19 +00:00
|
|
|
generateData();
|
|
|
|
|
|
|
|
if(in!=T_FileStream_stdin()) {
|
|
|
|
T_FileStream_close(in);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Program-wide initialization hook.
 * main() calls it once before the input is parsed; there is currently nothing to set up.
 */
static void
init(void) {
}
|
|
|
|
|
|
|
|
/* parsing ------------------------------------------------------------------ */
|
|
|
|
|
|
|
|
static void
|
|
|
|
parseDB(FileStream *in) {
|
|
|
|
char line[300];
|
|
|
|
char *end;
|
|
|
|
Props p;
|
|
|
|
uint32_t value;
|
|
|
|
int16_t start, limit, length, i;
|
|
|
|
bool_t hasNumericValue;
|
|
|
|
|
|
|
|
while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) {
|
|
|
|
length=icu_strlen(line);
|
|
|
|
|
|
|
|
/* remove trailing newline characters */
|
|
|
|
while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
|
|
|
|
line[--length]=0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* reset the properties */
|
|
|
|
icu_memset(&p, 0, sizeof(p));
|
|
|
|
hasNumericValue=FALSE;
|
|
|
|
|
|
|
|
/* get the character code, field 0 */
|
|
|
|
p.code=icu_strtoul(line, &end, 16);
|
|
|
|
limit=end-line;
|
|
|
|
if(limit<1 || *end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 0 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* skip character name, field 1 */
|
|
|
|
checkLineIndex(p.code, ++limit, length);
|
|
|
|
limit=getField(line, limit, length);
|
|
|
|
|
|
|
|
/* get general category, field 2 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
limit=getField(line, start, length);
|
|
|
|
line[limit]=0;
|
|
|
|
for(i=1;;) {
|
|
|
|
if(icu_strcmp(line+start, genCategoryNames[i])==0) {
|
|
|
|
p.generalCategory=(uint8_t)i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(++i==U_CHAR_CATEGORY_COUNT) {
|
|
|
|
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", line+start, p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get canonical combining class, field 3 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
p.canonicalCombining=(uint8_t)icu_strtoul(line+start, &end, 10);
|
|
|
|
limit=end-line;
|
|
|
|
if(start>=limit || *end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get BiDi category, field 4 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
limit=getField(line, start, length);
|
|
|
|
line[limit]=0;
|
|
|
|
for(i=0;;) {
|
|
|
|
if(icu_strcmp(line+start, bidiNames[i])==0) {
|
|
|
|
p.bidi=(uint8_t)i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(++i==U_CHAR_DIRECTION_COUNT) {
|
|
|
|
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", line+start, p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* character decomposition mapping, field 5 */
|
|
|
|
/* ### skip for now */
|
|
|
|
checkLineIndex(p.code, ++limit, length);
|
|
|
|
limit=getField(line, limit, length);
|
|
|
|
|
|
|
|
/* decimal digit value, field 6 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
value=icu_strtoul(line+start, &end, 10);
|
|
|
|
if(*end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
limit=end-line;
|
|
|
|
if(start<limit) {
|
|
|
|
p.numericValue=value;
|
|
|
|
hasNumericValue=TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* digit value, field 7 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
value=icu_strtoul(line+start, &end, 10);
|
|
|
|
if(*end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
limit=end-line;
|
|
|
|
if(start<limit) {
|
|
|
|
if(hasNumericValue) {
|
|
|
|
if(p.numericValue!=value) {
|
|
|
|
fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
p.numericValue=value;
|
|
|
|
hasNumericValue=TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* numeric value, field 8 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
value=icu_strtoul(line+start, &end, 10);
|
|
|
|
if(value>0 && *end=='/') {
|
|
|
|
p.denominator=icu_strtoul(end+1, &end, 10);
|
|
|
|
}
|
|
|
|
if(*end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
limit=end-line;
|
|
|
|
if(start<limit) {
|
|
|
|
if(hasNumericValue) {
|
|
|
|
if(p.numericValue!=value) {
|
|
|
|
fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
p.numericValue=value;
|
|
|
|
hasNumericValue=TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get Mirrored flag, field 9 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
limit=getField(line, start, length);
|
|
|
|
if(line[start]=='Y') {
|
|
|
|
p.isMirrored=1;
|
|
|
|
} else if(limit-start!=1 || line[start]!='N') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* skip Unicode 1.0 character name, field 10 */
|
|
|
|
checkLineIndex(p.code, ++limit, length);
|
|
|
|
limit=getField(line, limit, length);
|
|
|
|
|
|
|
|
/* skip comment, field 11 */
|
|
|
|
checkLineIndex(p.code, ++limit, length);
|
|
|
|
limit=getField(line, limit, length);
|
|
|
|
|
|
|
|
/* get uppercase mapping, field 12 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
p.upperCase=icu_strtoul(line+start, &end, 16);
|
|
|
|
limit=end-line;
|
|
|
|
if(*end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get lowercase mapping, field 13 */
|
|
|
|
start=limit+1;
|
|
|
|
checkLineIndex(p.code, start, length);
|
|
|
|
p.lowerCase=icu_strtoul(line+start, &end, 16);
|
|
|
|
limit=end-line;
|
|
|
|
if(*end!=';') {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get titlecase mapping, field 14 */
|
|
|
|
start=limit+1;
|
|
|
|
if(start<length) {
|
|
|
|
/* this is the last field */
|
|
|
|
p.titleCase=icu_strtoul(line+start, &end, 16);
|
|
|
|
if(*end!=';' && *end!=0) {
|
|
|
|
fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n", p.code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
#if 0
|
|
|
|
/* debug output */
|
|
|
|
if(beVerbose) {
|
|
|
|
printf(
|
|
|
|
"0x%06lx "
|
|
|
|
"%s(%2d) "
|
|
|
|
"comb=%3d "
|
|
|
|
"bidi=%3s(%2d) "
|
|
|
|
"num=%7d/%7d "
|
|
|
|
"mirr=%d "
|
|
|
|
"u%06lx l%06lx t%06lx"
|
|
|
|
"\n",
|
|
|
|
p.code,
|
|
|
|
genCategoryNames[p.generalCategory], p.generalCategory,
|
|
|
|
p.canonicalCombining,
|
|
|
|
bidiNames[p.bidi], p.bidi,
|
|
|
|
p.numericValue, p.denominator,
|
|
|
|
p.isMirrored,
|
|
|
|
p.upperCase, p.lowerCase, p.titleCase);
|
|
|
|
}
|
|
|
|
#endif
|
1999-12-10 01:44:19 +00:00
|
|
|
|
|
|
|
addProps(&p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Find the end of the field that begins at line[start].
 * Scans forward, but not beyond limit, for the ';' field separator.
 * Returns the index of that ';', or limit if there is none before limit.
 * If start is not less than limit, start is returned unchanged.
 */
static int16_t
getField(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    for(pos=start; pos<limit; ++pos) {
        if(line[pos]==';') {
            break;
        }
    }
    return pos;
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
checkLineIndex(uint32_t code, int16_t index, int16_t length) {
|
|
|
|
if(index>=length) {
|
|
|
|
fprintf(stderr, "genprops: too few fields at code 0x%lx\n", code);
|
|
|
|
exit(U_PARSE_ERROR);
|
|
|
|
}
|
|
|
|
}
|