ICU-1721 parse and store additional UCD properties

X-SVN-Rev: 7776
This commit is contained in:
Markus Scherer 2002-02-25 18:48:30 +00:00
parent 669f5f5965
commit 3f657d5bdc
6 changed files with 239 additions and 57 deletions

View File

@ -1,5 +1,5 @@
## Makefile.in for ICU - tools/genprops
## Copyright (c) 1999-2000, International Business Machines Corporation and
## Copyright (c) 1999-2002, International Business Machines Corporation and
## others. All Rights Reserved.
## Steven R. Loomis
@ -43,7 +43,7 @@ endif
LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS)
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@
OBJECTS = genprops.o store.o
OBJECTS = genprops.o props2.o propsvec.o store.o
DEPS = $(OBJECTS:.o=.d)

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -135,49 +135,25 @@ main(int argc, char* argv[]) {
initStore();
/* process BidiMirroring.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "BidiMirroring.txt");
} else {
uprv_strcpy(basename, "BidiMirroring");
basename[6]='-';
uprv_strcpy(basename+7, suffix);
uprv_strcat(basename+7, ".txt");
}
writeUCDFilename(basename, "BidiMirroring", suffix);
parseBidiMirroring(filename, &errorCode);
/* process SpecialCasing.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "SpecialCasing.txt");
} else {
uprv_strcpy(basename, "SpecialCasing");
basename[13]='-';
uprv_strcpy(basename+14, suffix);
uprv_strcat(basename+14, ".txt");
}
writeUCDFilename(basename, "SpecialCasing", suffix);
parseSpecialCasing(filename, &errorCode);
/* process CaseFolding.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "CaseFolding.txt");
} else {
uprv_strcpy(basename, "CaseFolding");
basename[11]='-';
uprv_strcpy(basename+12, suffix);
uprv_strcat(basename+12, ".txt");
}
writeUCDFilename(basename, "CaseFolding", suffix);
parseCaseFolding(filename, &errorCode);
/* process UnicodeData.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "UnicodeData.txt");
} else {
uprv_strcpy(basename, "UnicodeData");
basename[11]='-';
uprv_strcpy(basename+12, suffix);
uprv_strcat(basename+12, ".txt");
}
writeUCDFilename(basename, "UnicodeData", suffix);
parseDB(filename, &errorCode);
/* process additional properties files */
*basename=0;
generateAdditionalProperties(filename, suffix, &errorCode);
/* process parsed data */
if(U_SUCCESS(errorCode)) {
/* write the properties data file */
@ -187,12 +163,16 @@ main(int argc, char* argv[]) {
return errorCode;
}
static const char *
skipWhitespace(const char *s) {
while(*s==' ' || *s=='\t') {
++s;
/*
 * Write a UCD data file name of the form "<filename>[-<suffix>].txt"
 * into basename.
 *
 * basename  destination buffer; it is written with plain string copies and
 *           no length is passed in, so the caller must provide enough room
 * filename  file name without extension, for example "UnicodeData"
 * suffix    optional suffix that is appended after a '-', or NULL for none
 */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
int32_t length=uprv_strlen(filename);
uprv_strcpy(basename, filename);
if(suffix!=NULL) {
basename[length++]='-';
uprv_strcpy(basename+length, suffix);
length+=uprv_strlen(suffix);
}
return s;
uprv_strcpy(basename+length, ".txt");
}
/*
@ -217,7 +197,7 @@ parseCodePoints(const char *s,
count=0;
i=1; /* leave dest[0] for the length value */
for(;;) {
s=skipWhitespace(s);
s=u_skipWhitespace(s);
if(*s==';' || *s==0) {
dest[0]=(UChar)(i-1);
return count;
@ -321,8 +301,8 @@ specialCasingLineFn(void *context,
char *end;
/* get code point */
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16);
end=(char *)skipWhitespace(end);
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)u_skipWhitespace(end);
if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
@ -330,7 +310,7 @@ specialCasingLineFn(void *context,
}
/* is this a complex mapping? */
if(*skipWhitespace(fields[4][0])!=0) {
if(*u_skipWhitespace(fields[4][0])!=0) {
/* there is some condition text in the fifth field */
specialCasings[specialCasingCount].isComplex=TRUE;
@ -416,8 +396,8 @@ caseFoldingLineFn(void *context,
char status;
/* get code point */
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16);
end=(char *)skipWhitespace(end);
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)u_skipWhitespace(end);
if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
@ -425,7 +405,7 @@ caseFoldingLineFn(void *context,
}
/* get the status of this mapping */
caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]);
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') {
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;

View File

@ -109,6 +109,14 @@ SOURCE=.\genprops.c
# End Source File
# Begin Source File
SOURCE=.\props2.c
# End Source File
# Begin Source File
SOURCE=.\propsvec.c
# End Source File
# Begin Source File
SOURCE=.\store.c
# End Source File
# End Group
@ -119,6 +127,10 @@ SOURCE=.\store.c
SOURCE=.\genprops.h
# End Source File
# Begin Source File
SOURCE=.\propsvec.h
# End Source File
# End Group
# Begin Group "Resource Files"

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -18,6 +18,7 @@
#define __GENPROPS_H__
#include "unicode/utypes.h"
#include "utrie.h"
/* file definitions */
#define DATA_NAME "uprops"
@ -59,6 +60,9 @@ extern const char *const
genCategoryNames[];
/* prototypes */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix);
extern void
setUnicodeVersion(const char *v);
@ -74,8 +78,18 @@ addProps(uint32_t c, uint32_t props);
extern void
repeatProps(uint32_t first, uint32_t last, uint32_t props);
U_CAPI uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
extern void
generateData(const char *dataDir);
/* props2.c */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]);
#endif

View File

@ -0,0 +1,161 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: props2.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb24
* created by: Markus W. Scherer
*
* Parse more Unicode Character Database files and store
* additional Unicode character properties in bit set vectors.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie.h"
#include "uprops.h"
#include "propsvec.h"
#include "uparse.h"
#include "genprops.h"
/* data --------------------------------------------------------------------- */
static UNewTrie *trie;
static uint32_t *pv;
static int32_t pvCount;
/* prototypes --------------------------------------------------------------- */
static void
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
/*
 * Parse the additional UCD property files (currently DerivedAge.txt) and
 * build the trie and the property vectors from them.
 *
 * filename    path buffer; the name of each UCD file is written at the
 *             current end of the string, so the caller must leave room for it
 * suffix      optional file name suffix (see writeUCDFilename()), may be NULL
 * pErrorCode  in/out ICU error code; nothing is done if it is NULL or
 *             already indicates a failure
 *
 * The results are kept in the file-static trie, pv and pvCount.
 * Allocation failures are reported through pErrorCode;
 * parse and trie-building failures print a message and terminate the program.
 */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    char *basename;

    /* do not start new work on top of an earlier failure */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    basename=filename+uprv_strlen(filename);

    pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
    if(pv==NULL) {
        /* NOTE(review): assumes upvec_open() returns NULL when it cannot allocate - confirm in propsvec.c */
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    /* process DerivedAge.txt */
    writeUCDFilename(basename, "DerivedAge", suffix);
    parseAge(filename, pv, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* report the parse failure here instead of as a misleading trie-building error below */
        fprintf(stderr, "genprops error: unable to parse %s: %s\n", filename, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
    if(trie==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        upvec_close(pv);
        pv=NULL; /* do not keep a pointer to the closed vectors */
        return;
    }

    pvCount=upvec_toTrie(pv, trie, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Line callback for DerivedAge.txt.
 *
 * context     the property vectors (uint32_t *) to store the age values in
 * fields      field 0: a code point or code point range
 *             field 1: a version number "major[.minor]"; major must be 1..15
 *             and minor, if present, 0..15
 * fieldCount  not used
 * pErrorCode  in/out ICU error code
 *
 * The age is stored as (major<<4)|minor in the age bits of vector word 0
 * for every code point of the range.
 * Any syntax or storage error prints a message and terminates the program.
 */
static void
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode) {
    uint32_t *vectors=(uint32_t *)context;
    char *digits, *rest;
    uint32_t first, last, major, minor, age;

    /* field 0: code point (range); last is inclusive as parsed */
    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* field 1: major version number, followed by '.', white space or the end of the field */
    digits=(char *)u_skipWhitespace(fields[1][0]);
    major=(uint32_t)uprv_strtoul(digits, &rest, 10);
    if( rest==digits || major==0 || major>15 ||
        !(*rest=='.' || *rest==' ' || *rest=='\t' || *rest==0)
    ) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* optional minor version number after the '.' */
    minor=0;
    if(*rest=='.') {
        digits=(char *)u_skipWhitespace(rest+1);
        minor=(uint32_t)uprv_strtoul(digits, &rest, 10);
        if( rest==digits || minor>15 ||
            !(*rest==' ' || *rest=='\t' || *rest==0)
        ) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* one nibble each for the major and minor version numbers */
    age=(major<<4)|minor;

    /* the vectors take an exclusive limit, hence last+1 */
    if(!upvec_setValue(vectors, first, last+1, 0, age<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
        fprintf(stderr, "genprops: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Parse DerivedAge.txt and store the character ages in the property vectors.
 *
 * filename    path of the file to parse
 * pv          property vectors, handed to ageLineFn() as its context
 * pErrorCode  in/out ICU error code; the file is not touched if this is NULL
 *             or already indicates a failure
 */
static void
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
    char *fields[2][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        /* two ';'-delimited fields per line: code point (range) and version */
        u_parseDelimitedFile(filename, ';', fields, 2, ageLineFn, pv, pErrorCode);
    }
}
/*
 * Serialize the additional-properties trie, followed by the property vectors.
 *
 * p         destination buffer; if NULL, no indexes are set, no vectors are
 *           copied, and the trie and vectors stay open
 * capacity  number of bytes available at p
 * indexes   index array of the data file; with a non-NULL p, the
 *           UPROPS_ADDITIONAL_VECTORS_INDEX and
 *           UPROPS_ADDITIONAL_VECTORS_TOP_INDEX entries are set relative to
 *           indexes[UPROPS_ADDITIONAL_TRIE_INDEX], which the caller must have set
 *
 * Returns the number of bytes of the serialized trie plus the vectors.
 *
 * With a non-NULL p, the trie and the vectors are closed before returning,
 * so the data can be written only once.
 * The program is terminated if the trie cannot be serialized or if the
 * vectors do not fit into the rest of the buffer; returning a length for
 * bytes that were never written would make the caller emit garbage.
 */
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]) {
    int32_t length, vectorsSize;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }

    /* the vectors consist of pvCount 32-bit words */
    vectorsSize=pvCount*4;

    if(p!=NULL) {
        p+=length;
        capacity-=length;
        if(beVerbose) {
            printf("size in bytes of additional props trie:%5u\n", (unsigned int)length);
        }

        /* set indexes; they count 32-bit units */
        indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
            indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
        indexes[UPROPS_ADDITIONAL_VECTORS_TOP_INDEX]=
            indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;

        if(vectorsSize>capacity) {
            /* no ICU error code is at hand here, so use the generic failure status */
            fprintf(stderr,
                    "genprops error: additional props vectors need %ld bytes but only %ld are left in the output buffer\n",
                    (long)vectorsSize, (long)capacity);
            exit(EXIT_FAILURE);
        }

        uprv_memcpy(p, pv, vectorsSize);
        if(beVerbose) {
            printf("number of additional props vectors:    %5u\n", (unsigned int)(pvCount/UPROPS_VECTOR_WORDS));
        }

        /* the data has been written out: release it and drop the stale pointers */
        utrie_close(trie);
        trie=NULL;
        upvec_close(pv);
        pv=NULL;
    }

    return length+vectorsSize;
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -27,10 +27,19 @@
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uprops.h"
#include "genprops.h"
#define DO_DEBUG_OUT 0
/*
* ### TODO
* document structure with additional properties
* increment version number
* use index enums in uchar.c
* improve UTrie compaction: remove identical data blocks before folding! - need to remember which ones are skipped?!
*/
/* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data
@ -739,7 +748,7 @@ compareProps(const void *l, const void *r) {
/* generate output data ----------------------------------------------------- */
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
static uint32_t U_CALLCONV
U_CAPI uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
uint32_t value;
UChar32 limit;
@ -768,11 +777,12 @@ generateData(const char *dataDir) {
0, 0, 0, 0
};
static uint8_t trieBlock[40000];
static uint8_t additionalProps[40000];
UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR;
uint32_t size;
int32_t trieSize, offset;
int32_t trieSize, additionalPropsSize, offset;
long dataLength;
compactProps();
@ -788,20 +798,18 @@ generateData(const char *dataDir) {
/* round up trie size to 4-alignment */
trieSize=(trieSize+3)&~3;
offset+=trieSize>>2;
indexes[0]=offset; /* uint32_t offset to props[] */
indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
offset+=propsTop;
indexes[1]=offset; /* uint32_t offset to exceptions[] */
indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
indexes[2]=offset;
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
/* round up UChar count to an even number for 4-alignment */
ucharsTop=(ucharsTop+1)&~1;
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
indexes[3]=offset;
size=4*offset; /* total size of data */
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
if(beVerbose) {
printf("trie size in bytes: %5u\n", trieSize);
@ -809,6 +817,12 @@ generateData(const char *dataDir) {
printf("number of code points with exceptions: %5u\n", exceptionsCount);
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
printf("number of UChars for special mappings: %5u\n", ucharsTop);
}
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
size=4*offset+additionalPropsSize; /* total size of data */
if(beVerbose) {
printf("data size: %6lu\n", (unsigned long)size);
}
@ -825,6 +839,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, props32, 4*propsTop);
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
udata_writeBlock(pData, uchars, 2*ucharsTop);
udata_writeBlock(pData, additionalProps, additionalPropsSize);
/* finish up */
dataLength=udata_finish(pData, &errorCode);