ICU-1721 parse and store additional UCD properties

X-SVN-Rev: 7776
This commit is contained in:
Markus Scherer 2002-02-25 18:48:30 +00:00
parent 669f5f5965
commit 3f657d5bdc
6 changed files with 239 additions and 57 deletions

View File

@ -1,5 +1,5 @@
## Makefile.in for ICU - tools/genprops ## Makefile.in for ICU - tools/genprops
## Copyright (c) 1999-2000, International Business Machines Corporation and ## Copyright (c) 1999-2002, International Business Machines Corporation and
## others. All Rights Reserved. ## others. All Rights Reserved.
## Steven R. Loomis ## Steven R. Loomis
@ -43,7 +43,7 @@ endif
LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS) LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS)
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@ LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@
OBJECTS = genprops.o store.o OBJECTS = genprops.o props2.o propsvec.o store.o
DEPS = $(OBJECTS:.o=.d) DEPS = $(OBJECTS:.o=.d)

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 1999-2001, International Business Machines * Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************
@ -135,49 +135,25 @@ main(int argc, char* argv[]) {
initStore(); initStore();
/* process BidiMirroring.txt */ /* process BidiMirroring.txt */
if(suffix==NULL) { writeUCDFilename(basename, "BidiMirroring", suffix);
uprv_strcpy(basename, "BidiMirroring.txt");
} else {
uprv_strcpy(basename, "BidiMirroring");
basename[6]='-';
uprv_strcpy(basename+7, suffix);
uprv_strcat(basename+7, ".txt");
}
parseBidiMirroring(filename, &errorCode); parseBidiMirroring(filename, &errorCode);
/* process SpecialCasing.txt */ /* process SpecialCasing.txt */
if(suffix==NULL) { writeUCDFilename(basename, "SpecialCasing", suffix);
uprv_strcpy(basename, "SpecialCasing.txt");
} else {
uprv_strcpy(basename, "SpecialCasing");
basename[13]='-';
uprv_strcpy(basename+14, suffix);
uprv_strcat(basename+14, ".txt");
}
parseSpecialCasing(filename, &errorCode); parseSpecialCasing(filename, &errorCode);
/* process CaseFolding.txt */ /* process CaseFolding.txt */
if(suffix==NULL) { writeUCDFilename(basename, "CaseFolding", suffix);
uprv_strcpy(basename, "CaseFolding.txt");
} else {
uprv_strcpy(basename, "CaseFolding");
basename[11]='-';
uprv_strcpy(basename+12, suffix);
uprv_strcat(basename+12, ".txt");
}
parseCaseFolding(filename, &errorCode); parseCaseFolding(filename, &errorCode);
/* process UnicodeData.txt */ /* process UnicodeData.txt */
if(suffix==NULL) { writeUCDFilename(basename, "UnicodeData", suffix);
uprv_strcpy(basename, "UnicodeData.txt");
} else {
uprv_strcpy(basename, "UnicodeData");
basename[11]='-';
uprv_strcpy(basename+12, suffix);
uprv_strcat(basename+12, ".txt");
}
parseDB(filename, &errorCode); parseDB(filename, &errorCode);
/* process additional properties files */
*basename=0;
generateAdditionalProperties(filename, suffix, &errorCode);
/* process parsed data */ /* process parsed data */
if(U_SUCCESS(errorCode)) { if(U_SUCCESS(errorCode)) {
/* write the properties data file */ /* write the properties data file */
@ -187,12 +163,16 @@ main(int argc, char* argv[]) {
return errorCode; return errorCode;
} }
static const char * U_CFUNC void
skipWhitespace(const char *s) { writeUCDFilename(char *basename, const char *filename, const char *suffix) {
while(*s==' ' || *s=='\t') { int32_t length=uprv_strlen(filename);
++s; uprv_strcpy(basename, filename);
if(suffix!=NULL) {
basename[length++]='-';
uprv_strcpy(basename+length, suffix);
length+=uprv_strlen(suffix);
} }
return s; uprv_strcpy(basename+length, ".txt");
} }
/* /*
@ -217,7 +197,7 @@ parseCodePoints(const char *s,
count=0; count=0;
i=1; /* leave dest[0] for the length value */ i=1; /* leave dest[0] for the length value */
for(;;) { for(;;) {
s=skipWhitespace(s); s=u_skipWhitespace(s);
if(*s==';' || *s==0) { if(*s==';' || *s==0) {
dest[0]=(UChar)(i-1); dest[0]=(UChar)(i-1);
return count; return count;
@ -321,8 +301,8 @@ specialCasingLineFn(void *context,
char *end; char *end;
/* get code point */ /* get code point */
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16); specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)skipWhitespace(end); end=(char *)u_skipWhitespace(end);
if(end<=fields[0][0] || end!=fields[0][1]) { if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR; *pErrorCode=U_PARSE_ERROR;
@ -330,7 +310,7 @@ specialCasingLineFn(void *context,
} }
/* is this a complex mapping? */ /* is this a complex mapping? */
if(*skipWhitespace(fields[4][0])!=0) { if(*u_skipWhitespace(fields[4][0])!=0) {
/* there is some condition text in the fifth field */ /* there is some condition text in the fifth field */
specialCasings[specialCasingCount].isComplex=TRUE; specialCasings[specialCasingCount].isComplex=TRUE;
@ -416,8 +396,8 @@ caseFoldingLineFn(void *context,
char status; char status;
/* get code point */ /* get code point */
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16); caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)skipWhitespace(end); end=(char *)u_skipWhitespace(end);
if(end<=fields[0][0] || end!=fields[0][1]) { if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR; *pErrorCode=U_PARSE_ERROR;
@ -425,7 +405,7 @@ caseFoldingLineFn(void *context,
} }
/* get the status of this mapping */ /* get the status of this mapping */
caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]); caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') { if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') {
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR; *pErrorCode=U_PARSE_ERROR;

View File

@ -109,6 +109,14 @@ SOURCE=.\genprops.c
# End Source File # End Source File
# Begin Source File # Begin Source File
SOURCE=.\props2.c
# End Source File
# Begin Source File
SOURCE=.\propsvec.c
# End Source File
# Begin Source File
SOURCE=.\store.c SOURCE=.\store.c
# End Source File # End Source File
# End Group # End Group
@ -119,6 +127,10 @@ SOURCE=.\store.c
SOURCE=.\genprops.h SOURCE=.\genprops.h
# End Source File # End Source File
# Begin Source File
SOURCE=.\propsvec.h
# End Source File
# End Group # End Group
# Begin Group "Resource Files" # Begin Group "Resource Files"

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 1999-2001, International Business Machines * Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************
@ -18,6 +18,7 @@
#define __GENPROPS_H__ #define __GENPROPS_H__
#include "unicode/utypes.h" #include "unicode/utypes.h"
#include "utrie.h"
/* file definitions */ /* file definitions */
#define DATA_NAME "uprops" #define DATA_NAME "uprops"
@ -59,6 +60,9 @@ extern const char *const
genCategoryNames[]; genCategoryNames[];
/* prototypes */ /* prototypes */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix);
extern void extern void
setUnicodeVersion(const char *v); setUnicodeVersion(const char *v);
@ -74,8 +78,18 @@ addProps(uint32_t c, uint32_t props);
extern void extern void
repeatProps(uint32_t first, uint32_t last, uint32_t props); repeatProps(uint32_t first, uint32_t last, uint32_t props);
U_CAPI uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
extern void extern void
generateData(const char *dataDir); generateData(const char *dataDir);
/* props2.c */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]);
#endif #endif

View File

@ -0,0 +1,161 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: props2.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb24
* created by: Markus W. Scherer
*
* Parse more Unicode Character Database files and store
* additional Unicode character properties in bit set vectors.
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie.h"
#include "uprops.h"
#include "propsvec.h"
#include "uparse.h"
#include "genprops.h"
/* data --------------------------------------------------------------------- */
/* trie created with utrie_open() and filled by upvec_toTrie() in generateAdditionalProperties(); serialized and closed in writeAdditionalData() */
static UNewTrie *trie;
/* properties vectors from upvec_open(); DerivedAge.txt values are stored into them via upvec_setValue() */
static uint32_t *pv;
/* return value of upvec_toTrie(); writeAdditionalData() copies pvCount*4 bytes from pv, i.e. it is treated as a count of 32-bit words */
static int32_t pvCount;
/* prototypes --------------------------------------------------------------- */
static void
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
/*
 * Parse the additional UCD files (currently only DerivedAge.txt) and build
 * the properties vectors and the trie that writeAdditionalData() outputs.
 *
 * @param filename   Writable path buffer; the UCD file name is appended at
 *                   its current end, so there must be room for it behind the
 *                   existing contents.
 * @param suffix     Optional file name suffix, passed on to
 *                   writeUCDFilename(); may be NULL.
 * @param pErrorCode In/out error code. Set to U_MEMORY_ALLOCATION_ERROR if
 *                   the vectors or the trie cannot be allocated.
 *
 * Exits the process if the trie cannot be built from the vectors.
 */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    char *basename;

    /* file names are written behind whatever filename already contains */
    basename=filename+uprv_strlen(filename);

    pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
    if(pv==NULL) {
        /*
         * NOTE(review): assumes upvec_open() signals allocation failure with
         * NULL, like utrie_open() below - confirm against propsvec.c.
         */
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    /* process DerivedAge.txt */
    writeUCDFilename(basename, "DerivedAge", suffix);
    parseAge(filename, pv, pErrorCode);

    trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
    if(trie==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        upvec_close(pv);
        pv=NULL; /* do not leave a dangling pointer in the file-scope static */
        return;
    }

    pvCount=upvec_toTrie(pv, trie, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Line callback for DerivedAge.txt, called by u_parseDelimitedFile().
 *
 * Field 0 is a code point or code point range, field 1 is a version number
 * "major" or "major.minor". The version is packed into one byte as
 * (major<<4)|minor and stored for the range in vector word 0 under
 * UPROPS_AGE_MASK. major must be 1..15 and minor 0..15.
 *
 * @param context    The properties vectors (uint32_t *) to store into.
 * @param fields     Start/limit pointers for each field of the line.
 * @param fieldCount Number of fields (not used here).
 * @param pErrorCode In/out error code.
 *
 * Any syntax or storage error prints a message and exits the process.
 */
static void
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode) {
    /* named differently from the file-scope pv to avoid shadowing it */
    uint32_t *vectors=(uint32_t *)context;
    char *s, *end;
    /*
     * Keep the full unsigned long result for the range checks: narrowing to
     * uint32_t first would let an oversized number wrap into the valid range
     * where long is wider than 32 bits.
     */
    unsigned long number;
    uint32_t start, limit, version;

    u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* presumably turns an inclusive range end into the exclusive limit that upvec_setValue() expects - TODO confirm */
    ++limit;

    /* parse version number */
    s=(char *)u_skipWhitespace(fields[1][0]);
    number=uprv_strtoul(s, &end, 10);
    if(s==end || number==0 || number>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    version=(uint32_t)number<<4;

    /* parse minor version number */
    if(*end=='.') {
        s=(char *)u_skipWhitespace(end+1);
        number=uprv_strtoul(s, &end, 10);
        if(s==end || number>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        version|=(uint32_t)number;
    }

    if(!upvec_setValue(vectors, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
        fprintf(stderr, "genprops: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Parse DerivedAge.txt and store the character ages into the vectors.
 *
 * @param filename   Full path of the DerivedAge file to read.
 * @param pv         Properties vectors handed to ageLineFn() as its context.
 * @param pErrorCode In/out error code; nothing is done if it is NULL or
 *                   already indicates a failure.
 *
 * A failure from u_parseDelimitedFile() is reported on stderr with the file
 * name and left in *pErrorCode for the caller.
 */
static void
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
    char *fields[2][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 2, ageLineFn, pv, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* name the file here; the caller only learns that building the trie failed */
        fprintf(stderr, "genprops error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    }
}
/*
 * Serialize the additional properties: the trie, followed directly by the
 * properties vectors.
 *
 * @param p        Output buffer, or NULL to only compute the length. With a
 *                 non-NULL p the index entries are set, the data is written,
 *                 and the trie and the vectors are closed afterwards.
 * @param capacity Size of the output buffer in bytes.
 * @param indexes  Index array; with a non-NULL p,
 *                 UPROPS_ADDITIONAL_VECTORS_INDEX and
 *                 UPROPS_ADDITIONAL_VECTORS_TOP_INDEX are set relative to
 *                 indexes[UPROPS_ADDITIONAL_TRIE_INDEX], which the caller
 *                 must have set already.
 * @return Number of bytes for the serialized trie plus the vectors.
 *
 * Exits the process if the trie cannot be serialized or if the vectors do
 * not fit into the rest of the buffer.
 */
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]) {
    int32_t length, vectorsLength;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }

    /* pvCount is a count of 32-bit words */
    vectorsLength=pvCount*4;

    if(p!=NULL) {
        p+=length;
        capacity-=length;
        if(beVerbose) {
            printf("size in bytes of additional props trie:%5u\n", (unsigned int)length);
        }

        /* set indexes */
        indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
            indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
        indexes[UPROPS_ADDITIONAL_VECTORS_TOP_INDEX]=
            indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;

        /*
         * The data structures are closed below, so a short buffer cannot be
         * retried. Skipping the copy silently would make the caller write
         * the returned number of bytes from a buffer that was never filled.
         */
        if(vectorsLength>capacity) {
            fprintf(stderr, "genprops error: not enough space for the additional properties vectors (%ld bytes needed, %ld available)\n",
                    (long)vectorsLength, (long)capacity);
            exit(U_BUFFER_OVERFLOW_ERROR);
        }

        uprv_memcpy(p, pv, vectorsLength);
        if(beVerbose) {
            printf("number of additional props vectors: %5u\n", (unsigned int)(pvCount/UPROPS_VECTOR_WORDS));
        }

        utrie_close(trie);
        upvec_close(pv);
        trie=NULL;
        pv=NULL;
    }

    return length+vectorsLength;
}

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 1999-2001, International Business Machines * Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************
@ -27,10 +27,19 @@
#include "utrie.h" #include "utrie.h"
#include "unicode/udata.h" #include "unicode/udata.h"
#include "unewdata.h" #include "unewdata.h"
#include "uprops.h"
#include "genprops.h" #include "genprops.h"
#define DO_DEBUG_OUT 0 #define DO_DEBUG_OUT 0
/*
* ### TODO
* document structure with additional properties
* increment version number
* use index enums in uchar.c
* improve UTrie compaction: remove identical data blocks before folding! - need to remember which ones are skipped?!
*/
/* Unicode character properties file format ------------------------------------ /* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data The file format prepared and written here contains several data
@ -739,7 +748,7 @@ compareProps(const void *l, const void *r) {
/* generate output data ----------------------------------------------------- */ /* generate output data ----------------------------------------------------- */
/* folding value: just store the offset (16 bits) if there is any non-0 entry */ /* folding value: just store the offset (16 bits) if there is any non-0 entry */
static uint32_t U_CALLCONV U_CAPI uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) { getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
uint32_t value; uint32_t value;
UChar32 limit; UChar32 limit;
@ -768,11 +777,12 @@ generateData(const char *dataDir) {
0, 0, 0, 0 0, 0, 0, 0
}; };
static uint8_t trieBlock[40000]; static uint8_t trieBlock[40000];
static uint8_t additionalProps[40000];
UNewDataMemory *pData; UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR; UErrorCode errorCode=U_ZERO_ERROR;
uint32_t size; uint32_t size;
int32_t trieSize, offset; int32_t trieSize, additionalPropsSize, offset;
long dataLength; long dataLength;
compactProps(); compactProps();
@ -788,20 +798,18 @@ generateData(const char *dataDir) {
/* round up trie size to 4-alignment */ /* round up trie size to 4-alignment */
trieSize=(trieSize+3)&~3; trieSize=(trieSize+3)&~3;
offset+=trieSize>>2; offset+=trieSize>>2;
indexes[0]=offset; /* uint32_t offset to props[] */ indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
offset+=propsTop; offset+=propsTop;
indexes[1]=offset; /* uint32_t offset to exceptions[] */ indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */ offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
indexes[2]=offset; indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
/* round up UChar count to 4-alignment */ /* round up UChar count to 4-alignment */
ucharsTop=(ucharsTop+1)&~1; ucharsTop=(ucharsTop+1)&~1;
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */ offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
indexes[3]=offset; indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
size=4*offset; /* total size of data */
if(beVerbose) { if(beVerbose) {
printf("trie size in bytes: %5u\n", trieSize); printf("trie size in bytes: %5u\n", trieSize);
@ -809,6 +817,12 @@ generateData(const char *dataDir) {
printf("number of code points with exceptions: %5u\n", exceptionsCount); printf("number of code points with exceptions: %5u\n", exceptionsCount);
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop); printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
printf("number of UChars for special mappings: %5u\n", ucharsTop); printf("number of UChars for special mappings: %5u\n", ucharsTop);
}
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
size=4*offset+additionalPropsSize; /* total size of data */
if(beVerbose) {
printf("data size: %6lu\n", (unsigned long)size); printf("data size: %6lu\n", (unsigned long)size);
} }
@ -825,6 +839,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, props32, 4*propsTop); udata_writeBlock(pData, props32, 4*propsTop);
udata_writeBlock(pData, exceptions, 4*exceptionsTop); udata_writeBlock(pData, exceptions, 4*exceptionsTop);
udata_writeBlock(pData, uchars, 2*ucharsTop); udata_writeBlock(pData, uchars, 2*ucharsTop);
udata_writeBlock(pData, additionalProps, additionalPropsSize);
/* finish up */ /* finish up */
dataLength=udata_finish(pData, &errorCode); dataLength=udata_finish(pData, &errorCode);