ICU-840 parse CaseFolding.txt and store the mappings in uprops.dat

X-SVN-Rev: 3617
This commit is contained in:
Markus Scherer 2001-02-14 00:45:29 +00:00
parent 299c7b148f
commit 7c708e636a
3 changed files with 199 additions and 39 deletions

View File

@ -49,6 +49,9 @@ parseMirror(const char *filename, UErrorCode *pErrorCode);
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
@ -149,6 +152,17 @@ main(int argc, char* argv[]) {
}
parseSpecialCasing(filename, &errorCode);
/* process CaseFolding.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "CaseFolding.txt");
} else {
uprv_strcpy(basename, "CaseFolding");
basename[11]='-';
uprv_strcpy(basename+12, suffix);
uprv_strcat(basename+12, ".txt");
}
parseCaseFolding(filename, &errorCode);
/* process UnicodeData.txt */
if(suffix==NULL) {
uprv_strcpy(basename, "UnicodeData.txt");
@ -186,20 +200,32 @@ skipWhitespace(const char *s) {
return s;
}
static void
/*
* parse a list of code points
* store them as a string in dest[destSize] with the string length in dest[0]
* set the first code point in *pFirst
* return the number of code points
*/
static int32_t
parseCodePoints(const char *s,
UChar *dest, int32_t destSize,
uint32_t *pFirst,
UErrorCode *pErrorCode) {
char *end;
uint32_t value;
int32_t i;
int32_t i, count;
if(pFirst!=NULL) {
*pFirst=0xffff;
}
count=0;
i=1; /* leave dest[0] for the length value */
for(;;) {
s=skipWhitespace(s);
if(*s==';' || *s==0) {
dest[0]=(UChar)(i-1);
return;
return count;
}
/* read one code point */
@ -207,7 +233,12 @@ parseCodePoints(const char *s,
if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
fprintf(stderr, "genprops: syntax error parsing code point at %s\n", s);
*pErrorCode=U_PARSE_ERROR;
return;
return -1;
}
/* store the first code point */
if(++count==1 && pFirst!=NULL) {
*pFirst=value;
}
/* append it to the destination array */
@ -217,7 +248,7 @@ parseCodePoints(const char *s,
if(i>=destSize) {
fprintf(stderr, "genprops: code point sequence too long at at %s\n", s);
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
return -1;
}
/* go to the following characters */
@ -281,7 +312,6 @@ static void
specialCasingLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
static int32_t mirrorIndex=0;
char *end;
/* get code point */
@ -305,9 +335,9 @@ specialCasingLineFn(void *context,
} else {
/* just set the "complex" flag and get the case mappings */
specialCasings[specialCasingCount].isComplex=FALSE;
parseCodePoints(fields[1][0], specialCasings[specialCasingCount].lowerCase, 32, pErrorCode);
parseCodePoints(fields[3][0], specialCasings[specialCasingCount].upperCase, 32, pErrorCode);
parseCodePoints(fields[2][0], specialCasings[specialCasingCount].titleCase, 32, pErrorCode);
parseCodePoints(fields[1][0], specialCasings[specialCasingCount].lowerCase, 32, NULL, pErrorCode);
parseCodePoints(fields[3][0], specialCasings[specialCasingCount].upperCase, 32, NULL, pErrorCode);
parseCodePoints(fields[2][0], specialCasings[specialCasingCount].titleCase, 32, NULL, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
exit(*pErrorCode);
@ -363,6 +393,100 @@ parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
}
}
/* parser for CaseFolding.txt ----------------------------------------------- */
/*
 * Size of the caseFoldings[] table.
 * caseFoldingLineFn() parses each line into the slot at index caseFoldingCount
 * before it decides whether to keep that entry, and it fails as soon as
 * caseFoldingCount reaches this value, so at most MAX_CASE_FOLDING_COUNT-1
 * entries are retained.
 */
#define MAX_CASE_FOLDING_COUNT 500
/* case folding entries collected by caseFoldingLineFn() */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
/* number of caseFoldings[] entries that are in use */
static int32_t caseFoldingCount=0;
/*
 * Line callback for CaseFolding.txt; parseCaseFolding() passes it to
 * u_parseDelimitedFile().
 *
 * fields[n][0] is used as the start of field n and fields[n][1] as the
 * position where the field must end:
 *   field 0: the source code point (hexadecimal)
 *   field 1: a one-letter status, one of L, E, C, S, F, I
 *   field 2: the list of code points that the source code point folds to
 *
 * The line is parsed into caseFoldings[caseFoldingCount], which serves as a
 * scratch slot; caseFoldingCount is only incremented if the entry is kept.
 *   - status L lines are dropped
 *   - an S line directly following an F line for the same code point (or the
 *     other way around) is merged into the previous entry instead of
 *     creating a new one
 *   - status I lines are stored as a marker with simple==0 and full[0]==0
 *
 * context and fieldCount are not used.
 * On any error, a message is printed to stderr, *pErrorCode is set,
 * and the process exits.
 */
static void
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    const char *start;
    char *end;
    unsigned long code;
    int32_t count;
    char status;

    /* get code point */
    start=skipWhitespace(fields[0][0]);
    code=uprv_strtoul(start, &end, 16);
    /*
     * end==start means that no digits were converted; compare with the start
     * of the number rather than the start of the field so that a field that
     * consists only of whitespace is not accepted as code point 0.
     * Check the range before narrowing the value to uint32_t.
     * Only whitespace may follow the number up to the end of the field.
     */
    if(end<=start || code>=0x110000 || skipWhitespace(end)!=fields[0][1]) {
        fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode = U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    caseFoldings[caseFoldingCount].code=(uint32_t)code;

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') {
        fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode = U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping: full[] receives the string, simple receives the first code point */
    count=parseCodePoints(fields[2][0], caseFoldings[caseFoldingCount].full, 32, &caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point */
    if(count!=1) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries; the scratch slot is not kept */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries; the scratch slot is not kept */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I') {
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /*
     * keep the entry; the index must stay below the array size
     * because the next line is parsed into caseFoldings[caseFoldingCount]
     */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "genprops: too many case folding mappings\n");
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * Read the semicolon-delimited file at filename with u_parseDelimitedFile(),
 * handing each line (split into 3 fields) to caseFoldingLineFn().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    char *fields[3][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
    }
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/* general categories */
@ -406,7 +530,7 @@ static void
unicodeDataLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
static int32_t mirrorIndex=0, specialCasingIndex=0;
static int32_t mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
Props p;
char *end;
uint32_t value;
@ -558,6 +682,18 @@ unicodeDataLineFn(void *context,
} else {
p.specialCasing=NULL;
}
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
p.caseFolding=caseFoldings+caseFoldingIndex++;
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
if( p.caseFolding->status=='C' &&
p.caseFolding->simple==p.lowerCase
) {
p.caseFolding=NULL;
}
} else {
p.caseFolding=NULL;
}
addProps(&p);
}

View File

@ -30,6 +30,13 @@ typedef struct {
UChar lowerCase[32], upperCase[32], titleCase[32];
} SpecialCasing;
/* one case folding mapping as parsed from CaseFolding.txt */
typedef struct {
    /* the code point that is mapped */
    uint32_t code;
    /* the single code point that code folds to, or 0 if there is no simple mapping */
    uint32_t simple;
    /* the status letter from the CaseFolding.txt line */
    char status;
    /* the full mapping as a string, with the string length in full[0] */
    UChar full[32];
} CaseFolding;
/* character properties */
typedef struct {
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
@ -38,6 +45,7 @@ typedef struct {
uint32_t denominator; /* 0: no value */
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, hasNumericValue;
SpecialCasing *specialCasing;
CaseFolding *caseFolding;
} Props;
/* global flags */

View File

@ -204,6 +204,7 @@ bit
5 has denominator value
6 has a mirror-image Unicode code point
7 has SpecialCasing.txt entries
8 has CaseFolding.txt entries
According to the flags in this word, one or more uint32_t words follow it
in the sequence of the bit flags in the flags word; if a flag is not set,
@ -245,6 +246,19 @@ One UChar value with lengths as follows:
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
For case folding mappings, the 32-bit exception word contains:
31..24 number of UChars used for the full mapping
23..16 reserved
15.. 0 UChar offset from the beginning of the UChars array where the
UChars for the case folding mappings are stored in the following format:
Format of case folding UChars:
Two UChars contain the simple mapping as follows:
0, 0 no simple mapping
BMP,0 a simple mapping to a BMP code point
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
This is followed by the UChars for the full case folding mappings.
Example:
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
mapping and a numeric value.
@ -267,7 +281,7 @@ static UDataInfo dataInfo={
0,
0x55, 0x50, 0x72, 0x6f, /* dataFormat="UPro" */
1, 2, 0, 0, /* formatVersion */
1, 3, 0, 0, /* formatVersion */
3, 0, 0, 0 /* dataVersion */
};
@ -365,35 +379,10 @@ addUChars(const UChar *s, uint32_t length);
/* -------------------------------------------------------------------------- */
/* ### this must become public in putil.c */
/*
 * Parse versionString, a sequence of decimal numbers separated by
 * U_VERSION_DELIMITER, into versionArray.
 * Each number is converted with uprv_strtoul() and narrowed to uint8_t.
 * Parsing stops at the first field without digits, at the first character
 * after a number that is not the delimiter, or after U_MAX_VERSION_LENGTH
 * fields; all remaining elements of versionArray are set to 0.
 * A NULL versionString yields an all-zero version.
 * Does nothing if versionArray is NULL.
 */
static void
__versionFromString(UVersionInfo versionArray, const char *versionString) {
    uint16_t filled=0;
    char *stop;

    if(versionArray==NULL) {
        return;
    }

    /* after the first pass versionString is never NULL, the loop ends via break */
    while(versionString!=NULL) {
        versionArray[filled]=(uint8_t)uprv_strtoul(versionString, &stop, 10);
        if(stop==versionString) {
            /* no digits: this element is not counted and gets zeroed below */
            break;
        }
        ++filled;
        if(filled==U_MAX_VERSION_LENGTH || *stop!=U_VERSION_DELIMITER) {
            break;
        }
        /* continue after the delimiter */
        versionString=stop+1;
    }

    /* zero-fill the fields that were not parsed */
    for(; filled<U_MAX_VERSION_LENGTH; ++filled) {
        versionArray[filled]=0;
    }
}
/*
 * Set the data version in dataInfo from the version string v:
 * v is parsed into a UVersionInfo, and 4 bytes of it are copied
 * into dataInfo.dataVersion.
 */
extern void
setUnicodeVersion(const char *v) {
UVersionInfo version;
__versionFromString(version, v);
u_versionFromString(version, v);
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
@ -529,6 +518,10 @@ addProps(Props *p) {
x=EXCEPTION_BIT;
++count;
}
if(p->caseFolding!=NULL) {
x=EXCEPTION_BIT;
++count;
}
/* handle exceptions */
if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
@ -598,9 +591,9 @@ addProps(Props *p) {
exceptions[value+length++]=p->mirrorMapping;
}
if(p->specialCasing!=NULL) {
first|=0x80;
if(p->specialCasing->isComplex) {
/* complex special casing */
first|=0x80;
exceptions[value+length++]=0x80000000;
} else {
/* unconditional special casing */
@ -630,10 +623,33 @@ addProps(Props *p) {
}
u[0]=entry;
first|=0x80;
exceptions[value+length++]=(i<<24)|addUChars(u, i);
}
}
if(p->caseFolding!=NULL) {
first|=0x100;
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
/* special case folding, store only a marker */
exceptions[value+length++]=0;
} else {
/* normal case folding with a simple and a full mapping */
UChar u[128];
uint16_t i;
/* store the simple mapping into the first two UChars */
i=0;
u[1]=0;
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
/* store the full mapping after that */
i=p->caseFolding->full[0];
if(i>0) {
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
}
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
}
}
exceptions[value]=first;
exceptionsTop+=length;
}