From 3f657d5bdc23d6d9497f0c589fa7048426063946 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 25 Feb 2002 18:48:30 +0000 Subject: [PATCH] ICU-1721 parse and store additional UCD properties X-SVN-Rev: 7776 --- icu4c/source/tools/genprops/Makefile.in | 4 +- icu4c/source/tools/genprops/genprops.c | 70 ++++------ icu4c/source/tools/genprops/genprops.dsp | 12 ++ icu4c/source/tools/genprops/genprops.h | 16 ++- icu4c/source/tools/genprops/props2.c | 161 +++++++++++++++++++++++ icu4c/source/tools/genprops/store.c | 33 +++-- 6 files changed, 239 insertions(+), 57 deletions(-) create mode 100644 icu4c/source/tools/genprops/props2.c diff --git a/icu4c/source/tools/genprops/Makefile.in b/icu4c/source/tools/genprops/Makefile.in index aa1bca34b9..0add85a2f8 100644 --- a/icu4c/source/tools/genprops/Makefile.in +++ b/icu4c/source/tools/genprops/Makefile.in @@ -1,5 +1,5 @@ ## Makefile.in for ICU - tools/genprops -## Copyright (c) 1999-2000, International Business Machines Corporation and +## Copyright (c) 1999-2002, International Business Machines Corporation and ## others. All Rights Reserved. ## Steven R. Loomis @@ -43,7 +43,7 @@ endif LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS) LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@ -OBJECTS = genprops.o store.o +OBJECTS = genprops.o props2.o propsvec.o store.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/tools/genprops/genprops.c b/icu4c/source/tools/genprops/genprops.c index 7b30d308d5..a56fc278b4 100644 --- a/icu4c/source/tools/genprops/genprops.c +++ b/icu4c/source/tools/genprops/genprops.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2001, International Business Machines +* Copyright (C) 1999-2002, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -135,49 +135,25 @@ main(int argc, char* argv[]) { initStore(); /* process BidiMirroring.txt */ - if(suffix==NULL) { - uprv_strcpy(basename, "BidiMirroring.txt"); - } else { - uprv_strcpy(basename, "BidiMirroring"); - basename[6]='-'; - uprv_strcpy(basename+7, suffix); - uprv_strcat(basename+7, ".txt"); - } + writeUCDFilename(basename, "BidiMirroring", suffix); parseBidiMirroring(filename, &errorCode); /* process SpecialCasing.txt */ - if(suffix==NULL) { - uprv_strcpy(basename, "SpecialCasing.txt"); - } else { - uprv_strcpy(basename, "SpecialCasing"); - basename[13]='-'; - uprv_strcpy(basename+14, suffix); - uprv_strcat(basename+14, ".txt"); - } + writeUCDFilename(basename, "SpecialCasing", suffix); parseSpecialCasing(filename, &errorCode); /* process CaseFolding.txt */ - if(suffix==NULL) { - uprv_strcpy(basename, "CaseFolding.txt"); - } else { - uprv_strcpy(basename, "CaseFolding"); - basename[11]='-'; - uprv_strcpy(basename+12, suffix); - uprv_strcat(basename+12, ".txt"); - } + writeUCDFilename(basename, "CaseFolding", suffix); parseCaseFolding(filename, &errorCode); /* process UnicodeData.txt */ - if(suffix==NULL) { - uprv_strcpy(basename, "UnicodeData.txt"); - } else { - uprv_strcpy(basename, "UnicodeData"); - basename[11]='-'; - uprv_strcpy(basename+12, suffix); - uprv_strcat(basename+12, ".txt"); - } + writeUCDFilename(basename, "UnicodeData", suffix); parseDB(filename, &errorCode); + /* process additional properties files */ + *basename=0; + generateAdditionalProperties(filename, suffix, &errorCode); + /* process parsed data */ if(U_SUCCESS(errorCode)) { /* write the properties data file */ @@ -187,12 +163,16 @@ main(int argc, char* argv[]) { return errorCode; } -static const char * -skipWhitespace(const char *s) { - while(*s==' ' || *s=='\t') { - ++s; +U_CFUNC void +writeUCDFilename(char *basename, const char *filename, const char *suffix) { + int32_t 
length=uprv_strlen(filename); + uprv_strcpy(basename, filename); + if(suffix!=NULL) { + basename[length++]='-'; + uprv_strcpy(basename+length, suffix); + length+=uprv_strlen(suffix); } - return s; + uprv_strcpy(basename+length, ".txt"); } /* @@ -217,7 +197,7 @@ parseCodePoints(const char *s, count=0; i=1; /* leave dest[0] for the length value */ for(;;) { - s=skipWhitespace(s); + s=u_skipWhitespace(s); if(*s==';' || *s==0) { dest[0]=(UChar)(i-1); return count; @@ -321,8 +301,8 @@ specialCasingLineFn(void *context, char *end; /* get code point */ - specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16); - end=(char *)skipWhitespace(end); + specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); + end=(char *)u_skipWhitespace(end); if(end<=fields[0][0] || end!=fields[0][1]) { fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); *pErrorCode=U_PARSE_ERROR; @@ -330,7 +310,7 @@ specialCasingLineFn(void *context, } /* is this a complex mapping? 
*/ - if(*skipWhitespace(fields[4][0])!=0) { + if(*u_skipWhitespace(fields[4][0])!=0) { /* there is some condition text in the fifth field */ specialCasings[specialCasingCount].isComplex=TRUE; @@ -416,8 +396,8 @@ caseFoldingLineFn(void *context, char status; /* get code point */ - caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16); - end=(char *)skipWhitespace(end); + caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); + end=(char *)u_skipWhitespace(end); if(end<=fields[0][0] || end!=fields[0][1]) { fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); *pErrorCode=U_PARSE_ERROR; @@ -425,7 +405,7 @@ caseFoldingLineFn(void *context, } /* get the status of this mapping */ - caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]); + caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') { fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); *pErrorCode=U_PARSE_ERROR; diff --git a/icu4c/source/tools/genprops/genprops.dsp b/icu4c/source/tools/genprops/genprops.dsp index 90d4c0000a..2fc2a4eda9 100644 --- a/icu4c/source/tools/genprops/genprops.dsp +++ b/icu4c/source/tools/genprops/genprops.dsp @@ -109,6 +109,14 @@ SOURCE=.\genprops.c # End Source File # Begin Source File +SOURCE=.\props2.c +# End Source File +# Begin Source File + +SOURCE=.\propsvec.c +# End Source File +# Begin Source File + SOURCE=.\store.c # End Source File # End Group @@ -119,6 +127,10 @@ SOURCE=.\store.c SOURCE=.\genprops.h # End Source File +# Begin Source File + +SOURCE=.\propsvec.h +# End Source File # End Group # Begin Group "Resource Files" diff --git a/icu4c/source/tools/genprops/genprops.h b/icu4c/source/tools/genprops/genprops.h index 63b4558e0a..2cec055525 100644 --- 
a/icu4c/source/tools/genprops/genprops.h +++ b/icu4c/source/tools/genprops/genprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2001, International Business Machines +* Copyright (C) 1999-2002, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -18,6 +18,7 @@ #define __GENPROPS_H__ #include "unicode/utypes.h" +#include "utrie.h" /* file definitions */ #define DATA_NAME "uprops" @@ -59,6 +60,9 @@ extern const char *const genCategoryNames[]; /* prototypes */ +U_CFUNC void +writeUCDFilename(char *basename, const char *filename, const char *suffix); + extern void setUnicodeVersion(const char *v); @@ -74,8 +78,18 @@ addProps(uint32_t c, uint32_t props); extern void repeatProps(uint32_t first, uint32_t last, uint32_t props); +U_CAPI uint32_t U_EXPORT2 +getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset); + extern void generateData(const char *dataDir); +/* props2.c */ +U_CFUNC void +generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode); + +U_CFUNC int32_t +writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]); + #endif diff --git a/icu4c/source/tools/genprops/props2.c b/icu4c/source/tools/genprops/props2.c new file mode 100644 index 0000000000..a9b3173850 --- /dev/null +++ b/icu4c/source/tools/genprops/props2.c @@ -0,0 +1,161 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: props2.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002feb24 +* created by: Markus W. 
Scherer +* +* Parse more Unicode Character Database files and store +* additional Unicode character properties in bit set vectors. +*/ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "utrie.h" +#include "uprops.h" +#include "propsvec.h" +#include "uparse.h" +#include "genprops.h" + +/* data --------------------------------------------------------------------- */ + +static UNewTrie *trie; +static uint32_t *pv; +static int32_t pvCount; + +/* prototypes --------------------------------------------------------------- */ + +static void +parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode); + +/* -------------------------------------------------------------------------- */ + +U_CFUNC void +generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) { + char *basename; + + basename=filename+uprv_strlen(filename); + + pv=upvec_open(UPROPS_VECTOR_WORDS, 20000); + + /* process DerivedAge.txt */ + writeUCDFilename(basename, "DerivedAge", suffix); + parseAge(filename, pv, pErrorCode); + + trie=utrie_open(NULL, NULL, 50000, 0, FALSE); + if(trie==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + upvec_close(pv); + return; + } + + pvCount=upvec_toTrie(pv, trie, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + +static void +ageLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + uint32_t *pv=(uint32_t *)context; + char *s, *end; + uint32_t value, start, limit, version; + + u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]); + exit(*pErrorCode); + } + ++limit; + + /* parse version number */ + s=(char *)u_skipWhitespace(fields[1][0]); + value=(uint32_t)uprv_strtoul(s, &end, 10); 
+ if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) { + fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + version=value<<4; + + /* parse minor version number */ + if(*end=='.') { + s=(char *)u_skipWhitespace(end+1); + value=(uint32_t)uprv_strtoul(s, &end, 10); + if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) { + fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + version|=value; + } + + if(!upvec_setValue(pv, start, limit, 0, version<>2; - indexes[0]=offset; /* uint32_t offset to props[] */ + indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */ offset+=propsTop; - indexes[1]=offset; /* uint32_t offset to exceptions[] */ + indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */ offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */ - indexes[2]=offset; + indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset; /* round up UChar count to 4-alignement */ ucharsTop=(ucharsTop+1)&~1; offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */ - indexes[3]=offset; - - size=4*offset; /* total size of data */ + indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; if(beVerbose) { printf("trie size in bytes: %5u\n", trieSize); @@ -809,6 +817,12 @@ generateData(const char *dataDir) { printf("number of code points with exceptions: %5u\n", exceptionsCount); printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop); printf("number of UChars for special mappings: %5u\n", ucharsTop); + } + + additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes); + + size=4*offset+additionalPropsSize; /* total size of data */ + if(beVerbose) { printf("data size: %6lu\n", (unsigned long)size); } @@ -825,6 +839,7 @@ generateData(const char 
*dataDir) { udata_writeBlock(pData, props32, 4*propsTop); udata_writeBlock(pData, exceptions, 4*exceptionsTop); udata_writeBlock(pData, uchars, 2*ucharsTop); + udata_writeBlock(pData, additionalProps, additionalPropsSize); /* finish up */ dataLength=udata_finish(pData, &errorCode);