ICU-1721 parse and store additional UCD properties
X-SVN-Rev: 7776
This commit is contained in:
parent
669f5f5965
commit
3f657d5bdc
@ -1,5 +1,5 @@
|
||||
## Makefile.in for ICU - tools/genprops
|
||||
## Copyright (c) 1999-2000, International Business Machines Corporation and
|
||||
## Copyright (c) 1999-2002, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
## Steven R. Loomis
|
||||
|
||||
@ -43,7 +43,7 @@ endif
|
||||
LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS)
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@
|
||||
|
||||
OBJECTS = genprops.o store.o
|
||||
OBJECTS = genprops.o props2.o propsvec.o store.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -135,49 +135,25 @@ main(int argc, char* argv[]) {
|
||||
initStore();
|
||||
|
||||
/* process BidiMirroring.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "BidiMirroring.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "BidiMirroring");
|
||||
basename[6]='-';
|
||||
uprv_strcpy(basename+7, suffix);
|
||||
uprv_strcat(basename+7, ".txt");
|
||||
}
|
||||
writeUCDFilename(basename, "BidiMirroring", suffix);
|
||||
parseBidiMirroring(filename, &errorCode);
|
||||
|
||||
/* process SpecialCasing.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "SpecialCasing.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "SpecialCasing");
|
||||
basename[13]='-';
|
||||
uprv_strcpy(basename+14, suffix);
|
||||
uprv_strcat(basename+14, ".txt");
|
||||
}
|
||||
writeUCDFilename(basename, "SpecialCasing", suffix);
|
||||
parseSpecialCasing(filename, &errorCode);
|
||||
|
||||
/* process CaseFolding.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "CaseFolding.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "CaseFolding");
|
||||
basename[11]='-';
|
||||
uprv_strcpy(basename+12, suffix);
|
||||
uprv_strcat(basename+12, ".txt");
|
||||
}
|
||||
writeUCDFilename(basename, "CaseFolding", suffix);
|
||||
parseCaseFolding(filename, &errorCode);
|
||||
|
||||
/* process UnicodeData.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "UnicodeData.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "UnicodeData");
|
||||
basename[11]='-';
|
||||
uprv_strcpy(basename+12, suffix);
|
||||
uprv_strcat(basename+12, ".txt");
|
||||
}
|
||||
writeUCDFilename(basename, "UnicodeData", suffix);
|
||||
parseDB(filename, &errorCode);
|
||||
|
||||
/* process additional properties files */
|
||||
*basename=0;
|
||||
generateAdditionalProperties(filename, suffix, &errorCode);
|
||||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
/* write the properties data file */
|
||||
@ -187,12 +163,16 @@ main(int argc, char* argv[]) {
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
static const char *
|
||||
skipWhitespace(const char *s) {
|
||||
while(*s==' ' || *s=='\t') {
|
||||
++s;
|
||||
U_CFUNC void
|
||||
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
|
||||
int32_t length=uprv_strlen(filename);
|
||||
uprv_strcpy(basename, filename);
|
||||
if(suffix!=NULL) {
|
||||
basename[length++]='-';
|
||||
uprv_strcpy(basename+length, suffix);
|
||||
length+=uprv_strlen(suffix);
|
||||
}
|
||||
return s;
|
||||
uprv_strcpy(basename+length, ".txt");
|
||||
}
|
||||
|
||||
/*
|
||||
@ -217,7 +197,7 @@ parseCodePoints(const char *s,
|
||||
count=0;
|
||||
i=1; /* leave dest[0] for the length value */
|
||||
for(;;) {
|
||||
s=skipWhitespace(s);
|
||||
s=u_skipWhitespace(s);
|
||||
if(*s==';' || *s==0) {
|
||||
dest[0]=(UChar)(i-1);
|
||||
return count;
|
||||
@ -321,8 +301,8 @@ specialCasingLineFn(void *context,
|
||||
char *end;
|
||||
|
||||
/* get code point */
|
||||
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)skipWhitespace(end);
|
||||
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)u_skipWhitespace(end);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
@ -330,7 +310,7 @@ specialCasingLineFn(void *context,
|
||||
}
|
||||
|
||||
/* is this a complex mapping? */
|
||||
if(*skipWhitespace(fields[4][0])!=0) {
|
||||
if(*u_skipWhitespace(fields[4][0])!=0) {
|
||||
/* there is some condition text in the fifth field */
|
||||
specialCasings[specialCasingCount].isComplex=TRUE;
|
||||
|
||||
@ -416,8 +396,8 @@ caseFoldingLineFn(void *context,
|
||||
char status;
|
||||
|
||||
/* get code point */
|
||||
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)skipWhitespace(end);
|
||||
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)u_skipWhitespace(end);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
@ -425,7 +405,7 @@ caseFoldingLineFn(void *context,
|
||||
}
|
||||
|
||||
/* get the status of this mapping */
|
||||
caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]);
|
||||
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
|
||||
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') {
|
||||
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
|
@ -109,6 +109,14 @@ SOURCE=.\genprops.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\props2.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\propsvec.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\store.c
|
||||
# End Source File
|
||||
# End Group
|
||||
@ -119,6 +127,10 @@ SOURCE=.\store.c
|
||||
|
||||
SOURCE=.\genprops.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\propsvec.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -18,6 +18,7 @@
|
||||
#define __GENPROPS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "utrie.h"
|
||||
|
||||
/* file definitions */
|
||||
#define DATA_NAME "uprops"
|
||||
@ -59,6 +60,9 @@ extern const char *const
|
||||
genCategoryNames[];
|
||||
|
||||
/* prototypes */
|
||||
U_CFUNC void
|
||||
writeUCDFilename(char *basename, const char *filename, const char *suffix);
|
||||
|
||||
extern void
|
||||
setUnicodeVersion(const char *v);
|
||||
|
||||
@ -74,8 +78,18 @@ addProps(uint32_t c, uint32_t props);
|
||||
extern void
|
||||
repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
||||
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir);
|
||||
|
||||
/* props2.c */
|
||||
U_CFUNC void
|
||||
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC int32_t
|
||||
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]);
|
||||
|
||||
#endif
|
||||
|
||||
|
161
icu4c/source/tools/genprops/props2.c
Normal file
161
icu4c/source/tools/genprops/props2.c
Normal file
@ -0,0 +1,161 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: props2.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2002feb24
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Parse more Unicode Character Database files and store
|
||||
* additional Unicode character properties in bit set vectors.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "utrie.h"
|
||||
#include "uprops.h"
|
||||
#include "propsvec.h"
|
||||
#include "uparse.h"
|
||||
#include "genprops.h"
|
||||
|
||||
/* data --------------------------------------------------------------------- */
|
||||
|
||||
static UNewTrie *trie;
|
||||
static uint32_t *pv;
|
||||
static int32_t pvCount;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
U_CFUNC void
|
||||
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
|
||||
char *basename;
|
||||
|
||||
basename=filename+uprv_strlen(filename);
|
||||
|
||||
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
|
||||
|
||||
/* process DerivedAge.txt */
|
||||
writeUCDFilename(basename, "DerivedAge", suffix);
|
||||
parseAge(filename, pv, pErrorCode);
|
||||
|
||||
trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
|
||||
if(trie==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
upvec_close(pv);
|
||||
return;
|
||||
}
|
||||
|
||||
pvCount=upvec_toTrie(pv, trie, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
ageLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t *pv=(uint32_t *)context;
|
||||
char *s, *end;
|
||||
uint32_t value, start, limit, version;
|
||||
|
||||
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
++limit;
|
||||
|
||||
/* parse version number */
|
||||
s=(char *)u_skipWhitespace(fields[1][0]);
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
||||
if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
|
||||
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
version=value<<4;
|
||||
|
||||
/* parse minor version number */
|
||||
if(*end=='.') {
|
||||
s=(char *)u_skipWhitespace(end+1);
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
||||
if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
|
||||
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
version|=value;
|
||||
}
|
||||
|
||||
if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
|
||||
fprintf(stderr, "genprops: unable to set character age: %s\n", u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
|
||||
char *fields[2][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, ageLineFn, pv, pErrorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]) {
|
||||
int32_t length;
|
||||
UErrorCode errorCode;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
if(p!=NULL) {
|
||||
p+=length;
|
||||
capacity-=length;
|
||||
if(beVerbose) {
|
||||
printf("size in bytes of additional props trie:%5u\n", length);
|
||||
}
|
||||
|
||||
/* set indexes */
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
|
||||
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_TOP_INDEX]=
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
||||
}
|
||||
|
||||
if(p!=NULL && (pvCount*4)<=capacity) {
|
||||
uprv_memcpy(p, pv, pvCount*4);
|
||||
if(beVerbose) {
|
||||
printf("number of additional props vectors: %5u\n", pvCount/UPROPS_VECTOR_WORDS);
|
||||
}
|
||||
}
|
||||
length+=pvCount*4;
|
||||
|
||||
if(p!=NULL) {
|
||||
utrie_close(trie);
|
||||
upvec_close(pv);
|
||||
}
|
||||
return length;
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -27,10 +27,19 @@
|
||||
#include "utrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "uprops.h"
|
||||
#include "genprops.h"
|
||||
|
||||
#define DO_DEBUG_OUT 0
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
* document structure with additional properties
|
||||
* increment version number
|
||||
* use index enums in uchar.c
|
||||
* improve UTrie compaction: remove identical data blocks before folding! - need to remember which ones are skipped?!
|
||||
*/
|
||||
|
||||
/* Unicode character properties file format ------------------------------------
|
||||
|
||||
The file format prepared and written here contains several data
|
||||
@ -739,7 +748,7 @@ compareProps(const void *l, const void *r) {
|
||||
/* generate output data ----------------------------------------------------- */
|
||||
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
static uint32_t U_CALLCONV
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
@ -768,11 +777,12 @@ generateData(const char *dataDir) {
|
||||
0, 0, 0, 0
|
||||
};
|
||||
static uint8_t trieBlock[40000];
|
||||
static uint8_t additionalProps[40000];
|
||||
|
||||
UNewDataMemory *pData;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
uint32_t size;
|
||||
int32_t trieSize, offset;
|
||||
int32_t trieSize, additionalPropsSize, offset;
|
||||
long dataLength;
|
||||
|
||||
compactProps();
|
||||
@ -788,20 +798,18 @@ generateData(const char *dataDir) {
|
||||
/* round up trie size to 4-alignment */
|
||||
trieSize=(trieSize+3)&~3;
|
||||
offset+=trieSize>>2;
|
||||
indexes[0]=offset; /* uint32_t offset to props[] */
|
||||
indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
|
||||
|
||||
offset+=propsTop;
|
||||
indexes[1]=offset; /* uint32_t offset to exceptions[] */
|
||||
indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
|
||||
|
||||
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
|
||||
indexes[2]=offset;
|
||||
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
|
||||
|
||||
/* round up UChar count to 4-alignment */
|
||||
ucharsTop=(ucharsTop+1)&~1;
|
||||
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
|
||||
indexes[3]=offset;
|
||||
|
||||
size=4*offset; /* total size of data */
|
||||
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
|
||||
|
||||
if(beVerbose) {
|
||||
printf("trie size in bytes: %5u\n", trieSize);
|
||||
@ -809,6 +817,12 @@ generateData(const char *dataDir) {
|
||||
printf("number of code points with exceptions: %5u\n", exceptionsCount);
|
||||
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
|
||||
printf("number of UChars for special mappings: %5u\n", ucharsTop);
|
||||
}
|
||||
|
||||
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
|
||||
|
||||
size=4*offset+additionalPropsSize; /* total size of data */
|
||||
if(beVerbose) {
|
||||
printf("data size: %6lu\n", (unsigned long)size);
|
||||
}
|
||||
|
||||
@ -825,6 +839,7 @@ generateData(const char *dataDir) {
|
||||
udata_writeBlock(pData, props32, 4*propsTop);
|
||||
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
|
||||
udata_writeBlock(pData, uchars, 2*ucharsTop);
|
||||
udata_writeBlock(pData, additionalProps, additionalPropsSize);
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
|
Loading…
Reference in New Issue
Block a user