c5dda76d0a
X-SVN-Rev: 14097
1060 lines
35 KiB
C
1060 lines
35 KiB
C
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 1999-2003, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
* file name: genprops.c
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 1999dec08
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* This program reads several of the Unicode character database text files,
|
|
* parses them, and extracts most of the properties for each character.
|
|
* It then writes a binary file containing the properties
|
|
* that is designed to be used directly for random-access to
|
|
* the properties of each Unicode character.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uset.h"
|
|
#include "unicode/putil.h"
|
|
#include "unicode/uclean.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
#include "unewdata.h"
|
|
#include "uoptions.h"
|
|
#include "uparse.h"
|
|
#include "uprops.h"
|
|
#include "propsvec.h"
|
|
|
|
U_CDECL_BEGIN
|
|
#include "genprops.h"
|
|
U_CDECL_END
|
|
|
|
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
|
|
|
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
|
|
|
/*
|
|
* Unicode set collecting the case-sensitive characters;
|
|
* see uchar.h UCHAR_CASE_SENSITIVE.
|
|
* Add code points from case mappings/foldings in
|
|
* the root locale and with default options.
|
|
*/
|
|
static USet *caseSensitive;
|
|
|
|
/* prototypes --------------------------------------------------------------- */
|
|
|
|
static void
|
|
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
|
|
|
|
static void
|
|
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
|
|
|
|
static void
|
|
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
|
|
|
|
static void
|
|
parseDB(const char *filename, UErrorCode *pErrorCode);
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
|
|
enum
|
|
{
|
|
HELP_H,
|
|
HELP_QUESTION_MARK,
|
|
VERBOSE,
|
|
COPYRIGHT,
|
|
DESTDIR,
|
|
SOURCEDIR,
|
|
UNICODE_VERSION,
|
|
ICUDATADIR
|
|
};
|
|
|
|
/* Keep these values in sync with the above enums */
|
|
static UOption options[]={
|
|
UOPTION_HELP_H,
|
|
UOPTION_HELP_QUESTION_MARK,
|
|
UOPTION_VERBOSE,
|
|
UOPTION_COPYRIGHT,
|
|
UOPTION_DESTDIR,
|
|
UOPTION_SOURCEDIR,
|
|
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
|
|
UOPTION_ICUDATADIR
|
|
};
|
|
|
|
extern int
|
|
main(int argc, char* argv[]) {
|
|
char filename[300];
|
|
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
|
|
char *basename=NULL;
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
|
|
U_MAIN_INIT_ARGS(argc, argv);
|
|
|
|
/* preset then read command line options */
|
|
options[DESTDIR].value=u_getDataDirectory();
|
|
options[SOURCEDIR].value="";
|
|
options[UNICODE_VERSION].value="";
|
|
options[ICUDATADIR].value=u_getDataDirectory();
|
|
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
|
|
|
/* error handling, printing usage message */
|
|
if(argc<0) {
|
|
fprintf(stderr,
|
|
"error in command line argument \"%s\"\n",
|
|
argv[-argc]);
|
|
}
|
|
if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
|
|
/*
|
|
* Broken into chucks because the C89 standard says the minimum
|
|
* required supported string length is 509 bytes.
|
|
*/
|
|
fprintf(stderr,
|
|
"Usage: %s [-options] [suffix]\n"
|
|
"\n"
|
|
"read the UnicodeData.txt file and other Unicode properties files and\n"
|
|
"create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
|
|
"\n",
|
|
argv[0]);
|
|
fprintf(stderr,
|
|
"Options:\n"
|
|
"\t-h or -? or --help this usage text\n"
|
|
"\t-v or --verbose verbose output\n"
|
|
"\t-c or --copyright include a copyright notice\n"
|
|
"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
|
|
fprintf(stderr,
|
|
"\t-d or --destdir destination directory, followed by the path\n"
|
|
"\t-s or --sourcedir source directory, followed by the path\n"
|
|
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
|
|
"\t followed by path, defaults to %s\n"
|
|
"\tsuffix suffix that is to be appended with a '-'\n"
|
|
"\t to the source file basenames before opening;\n"
|
|
"\t 'genprops new' will read UnicodeData-new.txt etc.\n",
|
|
u_getDataDirectory());
|
|
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
|
}
|
|
|
|
/* get the options values */
|
|
beVerbose=options[VERBOSE].doesOccur;
|
|
haveCopyright=options[COPYRIGHT].doesOccur;
|
|
srcDir=options[SOURCEDIR].value;
|
|
destDir=options[DESTDIR].value;
|
|
|
|
if(argc>=2) {
|
|
suffix=argv[1];
|
|
} else {
|
|
suffix=NULL;
|
|
}
|
|
|
|
if(options[UNICODE_VERSION].doesOccur) {
|
|
setUnicodeVersion(options[UNICODE_VERSION].value);
|
|
}
|
|
/* else use the default dataVersion in store.c */
|
|
|
|
if (options[ICUDATADIR].doesOccur) {
|
|
u_setDataDirectory(options[ICUDATADIR].value);
|
|
}
|
|
|
|
/* prepare the filename beginning with the source dir */
|
|
uprv_strcpy(filename, srcDir);
|
|
basename=filename+uprv_strlen(filename);
|
|
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
|
|
*basename++=U_FILE_SEP_CHAR;
|
|
}
|
|
|
|
/* initialize */
|
|
initStore();
|
|
caseSensitive=uset_open(1, 0); /* empty set (start>end) */
|
|
|
|
/* process BidiMirroring.txt */
|
|
writeUCDFilename(basename, "BidiMirroring", suffix);
|
|
parseBidiMirroring(filename, &errorCode);
|
|
|
|
/* process SpecialCasing.txt */
|
|
writeUCDFilename(basename, "SpecialCasing", suffix);
|
|
parseSpecialCasing(filename, &errorCode);
|
|
|
|
/* process CaseFolding.txt */
|
|
writeUCDFilename(basename, "CaseFolding", suffix);
|
|
parseCaseFolding(filename, &errorCode);
|
|
|
|
/* process UnicodeData.txt */
|
|
writeUCDFilename(basename, "UnicodeData", suffix);
|
|
parseDB(filename, &errorCode);
|
|
|
|
/* process additional properties files */
|
|
*basename=0;
|
|
generateAdditionalProperties(filename, suffix, &errorCode);
|
|
|
|
/* process parsed data */
|
|
if(U_SUCCESS(errorCode)) {
|
|
/* write the properties data file */
|
|
generateData(destDir);
|
|
}
|
|
|
|
u_cleanup();
|
|
return errorCode;
|
|
}
|
|
|
|
U_CFUNC void
|
|
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
|
|
int32_t length=(int32_t)uprv_strlen(filename);
|
|
uprv_strcpy(basename, filename);
|
|
if(suffix!=NULL) {
|
|
basename[length++]='-';
|
|
uprv_strcpy(basename+length, suffix);
|
|
length+=(int32_t)uprv_strlen(suffix);
|
|
}
|
|
uprv_strcpy(basename+length, ".txt");
|
|
}
|
|
|
|
U_CFUNC UBool
|
|
isToken(const char *token, const char *s) {
|
|
const char *z;
|
|
int32_t j;
|
|
|
|
s=u_skipWhitespace(s);
|
|
for(j=0;; ++j) {
|
|
if(token[j]!=0) {
|
|
if(s[j]!=token[j]) {
|
|
break;
|
|
}
|
|
} else {
|
|
z=u_skipWhitespace(s+j);
|
|
if(*z==';' || *z==0) {
|
|
return TRUE;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
U_CFUNC int32_t
|
|
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
|
|
const char *t, *z;
|
|
int32_t i, j;
|
|
|
|
s=u_skipWhitespace(s);
|
|
for(i=0; i<countTokens; ++i) {
|
|
t=tokens[i];
|
|
if(t!=NULL) {
|
|
for(j=0;; ++j) {
|
|
if(t[j]!=0) {
|
|
if(s[j]!=t[j]) {
|
|
break;
|
|
}
|
|
} else {
|
|
z=u_skipWhitespace(s+j);
|
|
if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
|
|
return i;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
static void
|
|
_set_addAll(USet *set, const UChar *s, int32_t length) {
|
|
UChar32 c;
|
|
int32_t i;
|
|
|
|
/* needs length>=0 */
|
|
for(i=0; i<length; /* U16_NEXT advances i */) {
|
|
U16_NEXT(s, i, length, c);
|
|
uset_add(set, c);
|
|
}
|
|
}
|
|
|
|
/* parser for BidiMirroring.txt --------------------------------------------- */
|
|
|
|
#define MAX_MIRROR_COUNT 2000
|
|
|
|
static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
|
|
static int32_t mirrorCount=0;
|
|
|
|
static void U_CALLCONV
|
|
mirrorLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *end;
|
|
static uint32_t prevCode=0;
|
|
|
|
mirrorMappings[mirrorCount][0]=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
|
if(end<=fields[0][0] || end!=fields[0][1]) {
|
|
fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
mirrorMappings[mirrorCount][1]=(uint32_t)uprv_strtoul(fields[1][0], &end, 16);
|
|
if(end<=fields[1][0] || end!=fields[1][1]) {
|
|
fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* check that the code points (mirrorMappings[mirrorCount][0]) are in ascending order */
|
|
if(mirrorMappings[mirrorCount][0]<=prevCode && mirrorMappings[mirrorCount][0]>0) {
|
|
fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
|
|
(unsigned long)mirrorMappings[mirrorCount][0],
|
|
(unsigned long)prevCode);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
prevCode=mirrorMappings[mirrorCount][0];
|
|
|
|
if(++mirrorCount==MAX_MIRROR_COUNT) {
|
|
fprintf(stderr, "genprops: too many mirror mappings\n");
|
|
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
|
|
char *fields[2][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
|
|
}
|
|
|
|
/* parser for SpecialCasing.txt --------------------------------------------- */
|
|
|
|
#define MAX_SPECIAL_CASING_COUNT 500
|
|
|
|
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
|
|
static int32_t specialCasingCount=0;
|
|
|
|
static void U_CALLCONV
|
|
specialCasingLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *end;
|
|
|
|
/* get code point */
|
|
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
|
end=(char *)u_skipWhitespace(end);
|
|
if(end<=fields[0][0] || end!=fields[0][1]) {
|
|
fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* is this a complex mapping? */
|
|
if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
|
|
/* there is some condition text in the fifth field */
|
|
specialCasings[specialCasingCount].isComplex=TRUE;
|
|
|
|
/* do not store any actual mappings for this */
|
|
specialCasings[specialCasingCount].lowerCase[0]=0;
|
|
specialCasings[specialCasingCount].upperCase[0]=0;
|
|
specialCasings[specialCasingCount].titleCase[0]=0;
|
|
} else {
|
|
/* just set the "complex" flag and get the case mappings */
|
|
specialCasings[specialCasingCount].isComplex=FALSE;
|
|
specialCasings[specialCasingCount].lowerCase[0]=
|
|
(UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
|
|
specialCasings[specialCasingCount].upperCase[0]=
|
|
(UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
|
|
specialCasings[specialCasingCount].titleCase[0]=
|
|
(UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
|
|
_set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
|
|
_set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
|
|
_set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
|
|
}
|
|
|
|
if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
|
|
fprintf(stderr, "genprops: too many special casing mappings\n");
|
|
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
}
|
|
|
|
static int
|
|
compareSpecialCasings(const void *left, const void *right) {
|
|
return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
|
|
}
|
|
|
|
static void
|
|
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
|
|
char *fields[5][2];
|
|
int32_t i, j;
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
|
|
|
|
/* sort the special casing entries by code point */
|
|
if(specialCasingCount>0) {
|
|
qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
|
|
}
|
|
|
|
/* replace multiple entries for any code point by one "complex" one */
|
|
j=0;
|
|
for(i=1; i<specialCasingCount; ++i) {
|
|
if(specialCasings[i-1].code==specialCasings[i].code) {
|
|
/* there is a duplicate code point */
|
|
specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following qsort */
|
|
specialCasings[i].isComplex=TRUE; /* make the following one complex */
|
|
specialCasings[i].lowerCase[0]=0;
|
|
specialCasings[i].upperCase[0]=0;
|
|
specialCasings[i].titleCase[0]=0;
|
|
++j;
|
|
}
|
|
}
|
|
|
|
/* if some entries just were removed, then re-sort */
|
|
if(j>0) {
|
|
qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
|
|
specialCasingCount-=j;
|
|
}
|
|
|
|
/*
|
|
* Add one complex mapping to caseSensitive that was filtered out above:
|
|
* Greek final Sigma has a conditional mapping but not locale-sensitive,
|
|
* and it is taken when lowercasing just U+03A3 alone.
|
|
* 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
|
*/
|
|
uset_add(caseSensitive, 0x3c2);
|
|
}
|
|
|
|
/* parser for CaseFolding.txt ----------------------------------------------- */
|
|
|
|
#define MAX_CASE_FOLDING_COUNT 2000
|
|
|
|
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
|
|
static int32_t caseFoldingCount=0;
|
|
|
|
static void U_CALLCONV
|
|
caseFoldingLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *end;
|
|
static uint32_t prevCode=0;
|
|
int32_t count;
|
|
char status;
|
|
|
|
/* get code point */
|
|
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
|
end=(char *)u_skipWhitespace(end);
|
|
if(end<=fields[0][0] || end!=fields[0][1]) {
|
|
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* get the status of this mapping */
|
|
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
|
|
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
|
|
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
|
|
if(status=='L') {
|
|
return;
|
|
}
|
|
|
|
/* get the mapping */
|
|
count=caseFoldings[caseFoldingCount].full[0]=
|
|
(UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/* there is a simple mapping only if there is exactly one code point (count is in UChars) */
|
|
if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
|
|
caseFoldings[caseFoldingCount].simple=0;
|
|
}
|
|
|
|
/* update the case-sensitive set */
|
|
if(status!='T') {
|
|
uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
|
|
_set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
|
|
}
|
|
|
|
/* check the status */
|
|
if(status=='S') {
|
|
/* check if there was a full mapping for this code point before */
|
|
if( caseFoldingCount>0 &&
|
|
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
|
caseFoldings[caseFoldingCount-1].status=='F'
|
|
) {
|
|
/* merge the two entries */
|
|
caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
|
|
return;
|
|
}
|
|
} else if(status=='F') {
|
|
/* check if there was a simple mapping for this code point before */
|
|
if( caseFoldingCount>0 &&
|
|
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
|
caseFoldings[caseFoldingCount-1].status=='S'
|
|
) {
|
|
/* merge the two entries */
|
|
uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
|
|
return;
|
|
}
|
|
} else if(status=='I' || status=='T') {
|
|
/* check if there was a default mapping for this code point before (remove it) */
|
|
while(caseFoldingCount>0 &&
|
|
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
|
|
) {
|
|
prevCode=0;
|
|
--caseFoldingCount;
|
|
}
|
|
/* store only a marker for special handling for cases like dotless i */
|
|
caseFoldings[caseFoldingCount].simple=0;
|
|
caseFoldings[caseFoldingCount].full[0]=0;
|
|
}
|
|
|
|
/* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
|
|
if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
|
|
fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
|
|
(unsigned long)caseFoldings[caseFoldingCount].code,
|
|
(unsigned long)prevCode);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
prevCode=caseFoldings[caseFoldingCount].code;
|
|
|
|
if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
|
|
fprintf(stderr, "genprops: too many case folding mappings\n");
|
|
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
|
|
char *fields[3][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
|
|
}
|
|
|
|
/* parser for UnicodeData.txt ----------------------------------------------- */
|
|
|
|
/* general categories */
|
|
const char *const
|
|
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
|
"Cn",
|
|
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
|
|
"Mc", "Nd", "Nl", "No",
|
|
"Zs", "Zl", "Zp",
|
|
"Cc", "Cf", "Co", "Cs",
|
|
"Pd", "Ps", "Pe", "Pc", "Po",
|
|
"Sm", "Sc", "Sk", "So",
|
|
"Pi", "Pf"
|
|
};
|
|
|
|
const char *const
|
|
bidiNames[U_CHAR_DIRECTION_COUNT]={
|
|
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
|
|
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
|
};
|
|
|
|
const char *const
|
|
decompositionTypeNames[U_DT_COUNT]={
|
|
NULL,
|
|
NULL,
|
|
"compat",
|
|
"circle",
|
|
"final",
|
|
"font",
|
|
"fraction",
|
|
"initial",
|
|
"isolated",
|
|
"medial",
|
|
"narrow",
|
|
"noBreak",
|
|
"small",
|
|
"square",
|
|
"sub",
|
|
"super",
|
|
"vertical",
|
|
"wide"
|
|
};
|
|
|
|
static struct {
|
|
uint32_t first, last, props;
|
|
char name[80];
|
|
} unicodeAreas[32];
|
|
|
|
static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
|
|
|
|
static void U_CALLCONV
|
|
unicodeDataLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
Props p;
|
|
char *end;
|
|
static uint32_t prevCode=0;
|
|
uint32_t value;
|
|
int32_t i;
|
|
|
|
/* reset the properties */
|
|
uprv_memset(&p, 0, sizeof(Props));
|
|
|
|
/* get the character code, field 0 */
|
|
p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
|
if(end<=fields[0][0] || end!=fields[0][1]) {
|
|
fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* get general category, field 2 */
|
|
i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
|
|
if(i>=0) {
|
|
p.generalCategory=(uint8_t)i;
|
|
} else {
|
|
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
|
|
fields[2][0], (unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* get BiDi category, field 4 */
|
|
i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
|
|
if(i>=0) {
|
|
p.bidi=(uint8_t)i;
|
|
} else {
|
|
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
|
|
fields[4][0], (unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* get decomposition type, field 5 */
|
|
if(fields[5][0]<fields[5][1]) {
|
|
/* there is some decomposition */
|
|
if(*fields[5][0]!='<') {
|
|
/* canonical */
|
|
i=U_DT_CANONICAL;
|
|
} else {
|
|
/* get compatibility type */
|
|
end=fields[5][0]+1;
|
|
while(end<fields[5][1] && *end!='>') {
|
|
++end;
|
|
}
|
|
*end='#';
|
|
i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
|
|
if(i<0) {
|
|
fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
|
|
fields[5][0], (unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* decimal digit value, field 6 */
|
|
if(fields[6][0]<fields[6][1]) {
|
|
value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
|
|
if(end!=fields[6][1] || value>0x7fff) {
|
|
fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
p.numericValue=(int32_t)value;
|
|
p.numericType=1;
|
|
}
|
|
|
|
/* digit value, field 7 */
|
|
if(fields[7][0]<fields[7][1]) {
|
|
value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
|
|
if(end!=fields[7][1] || value>0x7fff) {
|
|
fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(p.numericType==0) {
|
|
p.numericValue=(int32_t)value;
|
|
p.numericType=2;
|
|
} else if((int32_t)value!=p.numericValue) {
|
|
fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
|
|
/* numeric value, field 8 */
|
|
if(fields[8][0]<fields[8][1]) {
|
|
char *s=fields[8][0];
|
|
UBool isNegative;
|
|
|
|
/* get a possible minus sign */
|
|
if(*s=='-') {
|
|
isNegative=TRUE;
|
|
++s;
|
|
} else {
|
|
isNegative=FALSE;
|
|
}
|
|
|
|
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
|
if(value>0 && *end=='/') {
|
|
/* field 8 may contain a fractional value, get the denominator */
|
|
if(p.numericType>0) {
|
|
fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
|
|
if(p.denominator==0) {
|
|
fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
if(end!=fields[8][1] || value>0x7fffffff) {
|
|
fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
if(p.numericType==0) {
|
|
if(isNegative) {
|
|
p.numericValue=-(int32_t)value;
|
|
} else {
|
|
p.numericValue=(int32_t)value;
|
|
}
|
|
p.numericType=3;
|
|
} else if((int32_t)value!=p.numericValue) {
|
|
fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
|
|
/* get Mirrored flag, field 9 */
|
|
if(*fields[9][0]=='Y') {
|
|
p.isMirrored=1;
|
|
} else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
|
|
fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* get uppercase mapping, field 12 */
|
|
value=(uint32_t)uprv_strtoul(fields[12][0], &end, 16);
|
|
if(end!=fields[12][1]) {
|
|
fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(value!=0 && value!=p.code) {
|
|
p.upperCase=value;
|
|
uset_add(caseSensitive, (UChar32)p.code);
|
|
uset_add(caseSensitive, (UChar32)value);
|
|
}
|
|
|
|
/* get lowercase value, field 13 */
|
|
value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
|
|
if(end!=fields[13][1]) {
|
|
fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(value!=0 && value!=p.code) {
|
|
p.lowerCase=value;
|
|
uset_add(caseSensitive, (UChar32)p.code);
|
|
uset_add(caseSensitive, (UChar32)value);
|
|
}
|
|
|
|
/* get titlecase value, field 14 */
|
|
value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
|
|
if(end!=fields[14][1]) {
|
|
fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(value!=0 && value!=p.code) {
|
|
p.titleCase=value;
|
|
uset_add(caseSensitive, (UChar32)p.code);
|
|
uset_add(caseSensitive, (UChar32)value);
|
|
}
|
|
|
|
/* set additional properties from previously parsed files */
|
|
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
|
|
p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
|
|
}
|
|
if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
|
|
p.specialCasing=specialCasings+specialCasingIndex++;
|
|
} else {
|
|
p.specialCasing=NULL;
|
|
}
|
|
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
|
|
p.caseFolding=caseFoldings+caseFoldingIndex++;
|
|
|
|
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
|
|
if( p.caseFolding->status=='C' &&
|
|
p.caseFolding->simple==p.lowerCase
|
|
) {
|
|
p.caseFolding=NULL;
|
|
}
|
|
} else {
|
|
p.caseFolding=NULL;
|
|
}
|
|
|
|
value=makeProps(&p);
|
|
|
|
if(*fields[1][0]=='<') {
|
|
/* first or last entry of a Unicode area */
|
|
size_t length=fields[1][1]-fields[1][0];
|
|
|
|
if(length<9) {
|
|
/* name too short for an area name */
|
|
} else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
|
|
/* set the current area */
|
|
if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
|
|
length-=9;
|
|
unicodeAreas[unicodeAreaIndex].first=p.code;
|
|
unicodeAreas[unicodeAreaIndex].props=value;
|
|
uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
|
|
unicodeAreas[unicodeAreaIndex].name[length]=0;
|
|
} else {
|
|
/* error: a previous area is incomplete */
|
|
fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
return;
|
|
} else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
|
|
/* check that the current area matches, and complete it with the last code point */
|
|
length-=8;
|
|
if( unicodeAreas[unicodeAreaIndex].props==value &&
|
|
0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
|
|
unicodeAreas[unicodeAreaIndex].name[length]==0 &&
|
|
unicodeAreas[unicodeAreaIndex].first<p.code
|
|
) {
|
|
unicodeAreas[unicodeAreaIndex].last=p.code;
|
|
if(beVerbose) {
|
|
printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
|
|
(unsigned long)unicodeAreas[unicodeAreaIndex].first,
|
|
(unsigned long)unicodeAreas[unicodeAreaIndex].last,
|
|
unicodeAreas[unicodeAreaIndex].name);
|
|
}
|
|
unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
|
|
} else {
|
|
/* error: different properties between first & last, different area name, first>=last */
|
|
fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
return;
|
|
} else {
|
|
/* not an area name */
|
|
}
|
|
}
|
|
|
|
/* check for non-character code points */
|
|
if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
|
|
fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
|
|
(unsigned long)p.code);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* check that the code points (p.code) are in ascending order */
|
|
if(p.code<=prevCode && p.code>0) {
|
|
fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
|
|
(unsigned long)p.code, (unsigned long)prevCode);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
prevCode=p.code;
|
|
|
|
/* properties for a single code point */
|
|
addProps(p.code, value);
|
|
}
|
|
|
|
/* set repeated properties for the areas */
|
|
static void
|
|
repeatAreaProps() {
|
|
uint32_t puaProps;
|
|
int32_t i;
|
|
UBool hasPlane15PUA, hasPlane16PUA;
|
|
UErrorCode errorCode;
|
|
|
|
/*
|
|
* UnicodeData.txt before 3.0.1 did not contain the PUAs on
|
|
* planes 15 and 16.
|
|
* If that is the case, then we add them here, using the properties
|
|
* from the BMP PUA.
|
|
*/
|
|
puaProps=0;
|
|
hasPlane15PUA=hasPlane16PUA=FALSE;
|
|
|
|
for(i=0; i<unicodeAreaIndex; ++i) {
|
|
repeatProps(unicodeAreas[i].first,
|
|
unicodeAreas[i].last,
|
|
unicodeAreas[i].props);
|
|
if(unicodeAreas[i].first==0xe000) {
|
|
puaProps=unicodeAreas[i].props;
|
|
} else if(unicodeAreas[i].first==0xf0000) {
|
|
hasPlane15PUA=TRUE;
|
|
} else if(unicodeAreas[i].first==0x100000) {
|
|
hasPlane16PUA=TRUE;
|
|
}
|
|
}
|
|
|
|
if(puaProps!=0) {
|
|
if(!hasPlane15PUA) {
|
|
repeatProps(0xf0000, 0xffffd, puaProps);
|
|
}
|
|
if(!hasPlane16PUA) {
|
|
repeatProps(0x100000, 0x10fffd, puaProps);
|
|
}
|
|
}
|
|
|
|
/* Hangul have canonical decompositions */
|
|
errorCode=U_ZERO_ERROR;
|
|
if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseDB(const char *filename, UErrorCode *pErrorCode) {
|
|
/* default Bidi classes for unassigned code points */
|
|
static const uint32_t defaultBidi[][2]={ /* { limit, class } */
|
|
{ 0x0590, U_LEFT_TO_RIGHT },
|
|
{ 0x0600, U_RIGHT_TO_LEFT },
|
|
{ 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
|
|
{ 0xFB1D, U_LEFT_TO_RIGHT },
|
|
{ 0xFB50, U_RIGHT_TO_LEFT },
|
|
{ 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
|
|
{ 0xFE70, U_LEFT_TO_RIGHT },
|
|
{ 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
|
|
{ 0x110000, U_LEFT_TO_RIGHT }
|
|
};
|
|
|
|
char *fields[15][2];
|
|
UChar32 start, end;
|
|
uint32_t prev;
|
|
int32_t i;
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Set default Bidi classes for unassigned code points.
|
|
* See table 3-7 "Bidirectional Character Types" in UAX #9.
|
|
* http://www.unicode.org/reports/tr9/
|
|
*/
|
|
prev=0;
|
|
for(i=0; i<LENGTHOF(defaultBidi); ++i) {
|
|
if(defaultBidi[i][1]!=0) {
|
|
repeatProps(prev, defaultBidi[i][0]-1, defaultBidi[i][1]<<UPROPS_BIDI_SHIFT);
|
|
}
|
|
prev=defaultBidi[i][0];
|
|
}
|
|
|
|
/* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
|
|
unicodeAreas[0].first=0xffffffff;
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
|
|
|
|
if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
|
|
fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
|
|
unicodeAreas[unicodeAreaIndex].name,
|
|
(unsigned long)unicodeAreas[unicodeAreaIndex].first);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
repeatAreaProps();
|
|
|
|
/* are all sub-properties consumed? */
|
|
if(mirrorIndex<mirrorCount) {
|
|
fprintf(stderr, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(specialCasingIndex<specialCasingCount) {
|
|
fprintf(stderr, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
if(caseFoldingIndex<caseFoldingCount) {
|
|
fprintf(stderr, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
for(i=0;
|
|
0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
|
|
++i
|
|
) {
|
|
addCaseSensitive(start, end);
|
|
}
|
|
if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
|
|
*pErrorCode=U_ZERO_ERROR;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Hey, Emacs, please set the following:
|
|
*
|
|
* Local Variables:
|
|
* indent-tabs-mode: nil
|
|
* End:
|
|
*
|
|
*/
|
|
|