/*
*******************************************************************************
*
*   Copyright (C) 2004-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  gencase.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004aug28
*   created by: Markus W. Scherer
*
*   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
*   It then writes a binary file containing the properties
*   that is designed to be used directly for random-access to
*   the properties of each Unicode character.
*/

/*
 * NOTE(review): the header name of the first include directive is missing in
 * this copy of the file. Presumably it was the standard I/O header, since
 * fprintf() and stderr are used below -- confirm against the upstream file.
 */
#include
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
#include "cstring.h"
#include "uarrsort.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"
#include "gencase.h"

/* number of elements of a true array (not valid for pointers) */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* data --------------------------------------------------------------------- */

/* properties vectors; opened in main() with upvec_open() and filled via upvec_setValue() */
uint32_t *pv;

/* command line flags, set in main() from the VERBOSE and COPYRIGHT options */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/*
 * Unicode set collecting the case-sensitive characters;
 * see uchar.h UCHAR_CASE_SENSITIVE.
 * Add code points from case mappings/foldings in
 * the root locale and with default options. 
 */
static USet *caseSensitive;

/* prototypes --------------------------------------------------------------- */

static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

static void
parseDB(const char *filename, UErrorCode *pErrorCode);

/* parse files with multiple binary properties ------------------------------ */

/* TODO: more common code, move functions to uparse.h|c */

/* TODO: similar to genprops/props2.c but not the same */

/*
 * One recognized property name and where its bits go in pv:
 * vecValue is stored under vecMask in vector word number vecWord
 * (see the upvec_setValue() call in binariesLineFn()).
 */
struct Binary {
    const char *propName;
    int32_t vecWord;
    uint32_t vecValue, vecMask;
};
typedef struct Binary Binary;

/*
 * The set of Binary entries to look for in one UCD file.
 * ucdFile is the file base name without suffix and ".txt" extension.
 */
struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
};
typedef struct Binaries Binaries;

static const Binary
propListNames[]={
    { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
};

static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};

static const Binary
derCorePropsNames[]={
    { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
    { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }
};

static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};

/* treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values) */
static const Binary
wordBreakNames[]={
    { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};

/*
 * Line callback that parseBinariesFile() hands to u_parseDelimitedFile().
 *
 * context is the const Binaries table for the file being parsed.
 * Field 0 is parsed as a code point range, field 1 is compared with
 * isToken() against each propName in the table. For the first matching
 * entry, its vecValue/vecMask are written into word vecWord of pv for the
 * parsed range. Lines whose property name is not in the table are skipped.
 *
 * The process exits if the range cannot be parsed, if the matching entry
 * has a zero mask, or if upvec_setValue() reports failure.
 * fieldCount is not used.
 */
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, limit;
    int32_t i;

    bin=(const Binaries *)context;

    u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }
    /* presumably turns an inclusive range end into an exclusive limit for upvec_setValue() -- TODO confirm */
    ++limit;

    /* parse binary property name */
    s=(char *)u_skipWhitespace(fields[1][0]);
    for(i=0;; ++i) {
        if(i==bin->binariesCount) {
            /* ignore unrecognized properties */
            return;
        }
        if(isToken(bin->binaries[i].propName, s)) {
            break;
        }
    }

    /* a zero mask in one of the static tables above is a programming error */
    if(bin->binaries[i].vecMask==0) {
        fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
                        (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
        fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
                        bin->binaries[i].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}

/*
 * Parse one UCD file for the binary properties listed in bin.
 *
 * filename  buffer holding the full path; basename points into it at the
 *           position where the file name proper is written
 * basename  receives bin->ucdFile plus optional suffix and ".txt"
 *           (see writeUCDFilename())
 * suffix    optional file name suffix, may be NULL
 * bin       table of property names to recognize
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * A parse failure is reported on stderr and left in *pErrorCode.
 */
static void
parseBinariesFile(char *filename, char *basename, const char *suffix,
                  const Binaries *bin,
                  UErrorCode *pErrorCode) {
    char *fields[2][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    writeUCDFilename(basename, bin->ucdFile, suffix);

    u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
    }
}

/* -------------------------------------------------------------------------- */

/* indexes into options[] below */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/* Keep these values in sync with the above enums */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
};

/*
 * Program entry point.
 *
 * Parses the command line, then reads SpecialCasing, CaseFolding, the
 * binary property files and UnicodeData (in that order) from the source
 * directory, post-processes the collected data with makeCaseClosure() and
 * makeExceptions(), and calls generateData() if no error was recorded.
 *
 * Returns the final UErrorCode value; for a usage request it returns
 * U_ZERO_ERROR, for a command line error U_ILLEGAL_ARGUMENT_ERROR.
 */
extern int
main(int argc, char* argv[]) {
    char filename[300];
    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    char *basename=NULL;
    UErrorCode errorCode=U_ZERO_ERROR;

    U_MAIN_INIT_ARGS(argc, argv);

    /* preset then read command line options */
    options[DESTDIR].value=u_getDataDirectory();
    options[SOURCEDIR].value="";
    options[UNICODE_VERSION].value="";
    options[ICUDATADIR].value=u_getDataDirectory();
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        /* a negative result is used here as the negated index of the offending argument */
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    }
    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
        /*
         * Broken into chunks because the C89 standard says the minimum
         * required supported string length is 509 bytes.
         */
        fprintf(stderr,
            "Usage: %s [-options] [suffix]\n"
            "\n"
            "read the UnicodeData.txt file and other Unicode properties files and\n"
            "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
            "\n",
            argv[0]);
        fprintf(stderr,
            "Options:\n"
            "\t-h or -? or --help this usage text\n"
            "\t-v or --verbose verbose output\n"
            "\t-c or --copyright include a copyright notice\n"
            "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
            "\t-C or --csource generate a .c source file rather than the .icu binary\n");
        fprintf(stderr,
            "\t-d or --destdir destination directory, followed by the path\n"
            "\t-s or --sourcedir source directory, followed by the path\n"
            "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
            "\t followed by path, defaults to %s\n"
            "\tsuffix suffix that is to be appended with a '-'\n"
            "\t to the source file basenames before opening;\n"
            "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
            u_getDataDirectory());
        return argc<0 ? 
            U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[VERBOSE].doesOccur;
    haveCopyright=options[COPYRIGHT].doesOccur;
    srcDir=options[SOURCEDIR].value;
    destDir=options[DESTDIR].value;

    /* the first remaining non-option argument, if any, is the file name suffix */
    if(argc>=2) {
        suffix=argv[1];
    } else {
        suffix=NULL;
    }

    if(options[UNICODE_VERSION].doesOccur) {
        setUnicodeVersion(options[UNICODE_VERSION].value);
    }
    /* else use the default dataVersion in store.c */

    if (options[ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ICUDATADIR].value);
    }

    /* prepare the filename beginning with the source dir */
    /*
     * NOTE(review): srcDir comes from the command line and is copied into the
     * 300-char filename buffer without a visible length check; the file names
     * written later by writeUCDFilename() are appended unchecked as well.
     */
    uprv_strcpy(filename, srcDir);
    basename=filename+uprv_strlen(filename);
    /* append a file separator unless the directory is empty or already ends with one */
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }

    /* initialize */
    pv=upvec_open(2, 10000);
    caseSensitive=uset_open(1, 0); /* empty set (start>end) */

    /* process SpecialCasing.txt */
    writeUCDFilename(basename, "SpecialCasing", suffix);
    parseSpecialCasing(filename, &errorCode);

    /* process CaseFolding.txt */
    writeUCDFilename(basename, "CaseFolding", suffix);
    parseCaseFolding(filename, &errorCode);

    /* process additional properties files */
    *basename=0;

    parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);

    parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);

    /* the word break file is only read for Unicode versions from UNI_4_1 on */
    if(ucdVersion>=UNI_4_1) {
        parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
    }

    /* process UnicodeData.txt */
    writeUCDFilename(basename, "UnicodeData", suffix);
    parseDB(filename, &errorCode);

    /* process parsed data */
    makeCaseClosure();

    makeExceptions();

    if(U_SUCCESS(errorCode)) {
        /* write the properties data file */
        generateData(destDir, options[CSOURCE].doesOccur);
    }

    u_cleanup();
    return errorCode;
}

/*
 * Write a UCD file name into basename: filename, then "-" and suffix if
 * suffix is not NULL, then ".txt".
 * The caller must provide enough room at basename; no size is passed
 * and no bounds are checked here.
 */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    int32_t length=(int32_t)uprv_strlen(filename);
    uprv_strcpy(basename, filename);
    if(suffix!=NULL) {
        basename[length++]='-';
        uprv_strcpy(basename+length, suffix);
        length+=(int32_t)uprv_strlen(suffix);
    }
    uprv_strcpy(basename+length, ".txt");
}

/* TODO: move to toolutil */
/*
 * Test whether s holds exactly the given token.
 * Leading whitespace in s is skipped; after the token only optional
 * whitespace followed by ';' or the end of the string may follow.
 * Returns TRUE on a match, FALSE otherwise.
 */
U_CFUNC UBool
isToken(const char *token, const char *s) {
    const char *z;
    int32_t j;

    s=u_skipWhitespace(s);
    for(j=0;; ++j) {
        if(token[j]!=0) {
            /* still inside the token: characters must agree */
            if(s[j]!=token[j]) {
                break;
            }
        } else {
            /* whole token matched: check what follows it in s */
            z=u_skipWhitespace(s+j);
            if(*z==';' || *z==0) {
                return TRUE;
            } else {
                break;
            }
        }
    }

    return FALSE;
}

/*
 * NOTE(review): from the first loop of getTokenIndex() up to the start of
 * parseSpecialCasing() this copy of the file appears to have lost text
 * (unbalanced braces, a stray comment terminator, a fused identifier).
 * The statements are reproduced unchanged; restore them from the upstream
 * file before relying on this section.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *t, *z;
    int32_t i, j;

    s=u_skipWhitespace(s);
    for(i=0; i=0 */ for(i=0; icode-((const SpecialCasing *)right)->code; }

/*
 * Parse the special casing file: run u_parseDelimitedFile() with 5 fields
 * and specialCasingLineFn, then sort specialCasings by code point with
 * compareSpecialCasings.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 *
 * NOTE(review): the loop that merges multiple entries per code point is
 * incomplete in this copy (text is missing after its loop header); it is
 * reproduced unchanged.
 */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
    char *fields[5][2];
    int32_t i, j;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);

    /* sort the special casing entries by code point */
    if(specialCasingCount>0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* replace multiple entries for any code point by one "complex" one */
    j=0;
    for(i=1; i0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
        specialCasingCount-=j;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*
     * Add one complex mapping to caseSensitive that was filtered out above:
     * Greek final Sigma has a conditional mapping but not locale-sensitive,
     * and it is taken when lowercasing just U+03A3 alone. 
     * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
     */
    uset_add(caseSensitive, 0x3c2);
}

/* parser for CaseFolding.txt ----------------------------------------------- */

/* capacity of caseFoldings[]; caseFoldingLineFn() exits when the count reaches it */
#define MAX_CASE_FOLDING_COUNT 2000

static CaseFolding
caseFoldings[MAX_CASE_FOLDING_COUNT];

/* number of entries stored in caseFoldings[] so far */
static int32_t caseFoldingCount=0;

/*
 * Line callback for CaseFolding.txt, used by parseCaseFolding().
 *
 * The line is parsed into caseFoldings[caseFoldingCount]:
 *   field 0  code point (hexadecimal)
 *   field 1  status letter, one of L E C S F I T
 *   field 2  mapping string
 * Lines with status L are dropped. Except for status T, the code point and
 * its mapping are added to the caseSensitive set. An S line directly after
 * an F line for the same code point (or F after S) is merged into the
 * previous entry instead of being stored separately. For I and T lines,
 * preceding entries for the same code point are removed and only a marker
 * entry (simple==0, empty full mapping) is kept.
 *
 * Stored entries must arrive in ascending code point order. The process
 * exits on syntax errors, on out-of-order entries and when the array
 * becomes full. context and fieldCount are not used.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    /* code point of the last stored entry, kept across calls for the order check */
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping */
    /* full[0] holds the length, the mapping itself starts at full[1] (at most 31 units) */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* resetting prevCode lets the order check below accept the re-stored code point */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* the slot at index caseFoldingCount is written on the next call, so stop before the array is full */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}

/*
 * Parse the case folding file with 3 fields per line via caseFoldingLineFn().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    char *fields[3][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
}

/* parser for UnicodeData.txt ----------------------------------------------- */

/* general categories */
/* the index of a name found by getTokenIndex() is stored as p.gc in unicodeDataLineFn() */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};

/* read positions in the previously parsed special casing and case folding data */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;

/*
 * Line callback for UnicodeData.txt, used by parseDB().
 *
 * Builds a Props record for one line:
 *   field 0   code point (hexadecimal)
 *   field 2   general category, looked up in genCategoryNames
 *   field 3   canonical combining class (decimal, at most 0xff)
 *   field 12  uppercase mapping (hexadecimal, may be empty)
 *   field 13  lowercase mapping (hexadecimal, may be empty)
 *   field 14  titlecase mapping (hexadecimal, may be empty)
 * A mapping is stored only if it is non-zero and differs from the code
 * point; in that case both code points are added to caseSensitive.
 * The record is then passed to setProps().
 *
 * The process exits on syntax errors, on non-character code points and on
 * code points that are not in ascending order.
 * context and fieldCount are not used.
 *
 * NOTE(review): the section that attaches special casing and case folding
 * data is incomplete in this copy (text is missing inside its first
 * condition); it is reproduced unchanged.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    /* code point of the previous line, kept across calls for the order check */
    static UChar32 prevCode=0;
    UChar32 value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.gc=(uint8_t)i;
    } else {
        fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
            fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get canonical combining class, field 3 */
    value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
        fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    p.cc=(uint8_t)value;

    /* get uppercase mapping, field 12 */
    /* unlike fields 0 and 3, an empty field is accepted here and yields value 0 */
    value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    if(end!=fields[12][1]) {
        /* NOTE(review): the format string below is split by a line break in this copy of the file */
        fprintf(stderr, "gencase: syntax error in field 12 at code 
0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.upperCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get lowercase value, field 13 */
    value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    if(end!=fields[13][1]) {
        fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.lowerCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get titlecase value, field 14 */
    value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    if(end!=fields[14][1]) {
        fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.titleCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* set additional properties from previously parsed files */
    if(specialCasingIndexstatus=='C' && p.caseFolding->simple==p.lowerCase ) { p.caseFolding=NULL; } } else { p.caseFolding=NULL; }

    /* check for non-character code points */
    /* code points ending in fffe or ffff, and the 0x20 code points starting at U+fdd0 */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* properties for a single code point */
    setProps(&p);

    prevCode=p.code;
}

/*
 * Parse UnicodeData.txt with 15 fields per line via unicodeDataLineFn().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 *
 * NOTE(review): this copy of the file ends inside this function; the
 * remainder is not visible here.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    char *fields[15][2];
    UChar32 start, end;
    int32_t i;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 
                         15, unicodeDataLineFn, NULL, pErrorCode);

    /* are all sub-properties consumed? */
    if(specialCasingIndex