ICU-840 parse CaseFolding.txt and store the mappings in uprops.dat
X-SVN-Rev: 3617
This commit is contained in:
parent
299c7b148f
commit
7c708e636a
@ -49,6 +49,9 @@ parseMirror(const char *filename, UErrorCode *pErrorCode);
|
||||
static void
|
||||
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
@ -149,6 +152,17 @@ main(int argc, char* argv[]) {
|
||||
}
|
||||
parseSpecialCasing(filename, &errorCode);
|
||||
|
||||
/* process CaseFolding.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "CaseFolding.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "CaseFolding");
|
||||
basename[11]='-';
|
||||
uprv_strcpy(basename+12, suffix);
|
||||
uprv_strcat(basename+12, ".txt");
|
||||
}
|
||||
parseCaseFolding(filename, &errorCode);
|
||||
|
||||
/* process UnicodeData.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "UnicodeData.txt");
|
||||
@ -186,20 +200,32 @@ skipWhitespace(const char *s) {
|
||||
return s;
|
||||
}
|
||||
|
||||
static void
|
||||
/*
|
||||
* parse a list of code points
|
||||
* store them as a string in dest[destSize] with the string length in dest[0]
|
||||
* set the first code point in *pFirst
|
||||
* return the number of code points
|
||||
*/
|
||||
static int32_t
|
||||
parseCodePoints(const char *s,
|
||||
UChar *dest, int32_t destSize,
|
||||
uint32_t *pFirst,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
uint32_t value;
|
||||
int32_t i;
|
||||
int32_t i, count;
|
||||
|
||||
if(pFirst!=NULL) {
|
||||
*pFirst=0xffff;
|
||||
}
|
||||
|
||||
count=0;
|
||||
i=1; /* leave dest[0] for the length value */
|
||||
for(;;) {
|
||||
s=skipWhitespace(s);
|
||||
if(*s==';' || *s==0) {
|
||||
dest[0]=(UChar)(i-1);
|
||||
return;
|
||||
return count;
|
||||
}
|
||||
|
||||
/* read one code point */
|
||||
@ -207,7 +233,12 @@ parseCodePoints(const char *s,
|
||||
if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
|
||||
fprintf(stderr, "genprops: syntax error parsing code point at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* store the first code point */
|
||||
if(++count==1 && pFirst!=NULL) {
|
||||
*pFirst=value;
|
||||
}
|
||||
|
||||
/* append it to the destination array */
|
||||
@ -217,7 +248,7 @@ parseCodePoints(const char *s,
|
||||
if(i>=destSize) {
|
||||
fprintf(stderr, "genprops: code point sequence too long at at %s\n", s);
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* go to the following characters */
|
||||
@ -281,7 +312,6 @@ static void
|
||||
specialCasingLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
static int32_t mirrorIndex=0;
|
||||
char *end;
|
||||
|
||||
/* get code point */
|
||||
@ -305,9 +335,9 @@ specialCasingLineFn(void *context,
|
||||
} else {
|
||||
/* just set the "complex" flag and get the case mappings */
|
||||
specialCasings[specialCasingCount].isComplex=FALSE;
|
||||
parseCodePoints(fields[1][0], specialCasings[specialCasingCount].lowerCase, 32, pErrorCode);
|
||||
parseCodePoints(fields[3][0], specialCasings[specialCasingCount].upperCase, 32, pErrorCode);
|
||||
parseCodePoints(fields[2][0], specialCasings[specialCasingCount].titleCase, 32, pErrorCode);
|
||||
parseCodePoints(fields[1][0], specialCasings[specialCasingCount].lowerCase, 32, NULL, pErrorCode);
|
||||
parseCodePoints(fields[3][0], specialCasings[specialCasingCount].upperCase, 32, NULL, pErrorCode);
|
||||
parseCodePoints(fields[2][0], specialCasings[specialCasingCount].titleCase, 32, NULL, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
@ -363,6 +393,100 @@ parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for CaseFolding.txt ----------------------------------------------- */
|
||||
|
||||
#define MAX_CASE_FOLDING_COUNT 500
|
||||
|
||||
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
|
||||
static int32_t caseFoldingCount=0;
|
||||
|
||||
static void
|
||||
caseFoldingLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
int32_t count;
|
||||
char status;
|
||||
|
||||
/* get code point */
|
||||
caseFoldings[caseFoldingCount].code=uprv_strtoul(skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)skipWhitespace(end);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode = U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get the status of this mapping */
|
||||
caseFoldings[caseFoldingCount].status=status=*skipWhitespace(fields[1][0]);
|
||||
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I') {
|
||||
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
|
||||
*pErrorCode = U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
|
||||
if(status=='L') {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the mapping */
|
||||
count=parseCodePoints(fields[2][0], caseFoldings[caseFoldingCount].full, 32, &caseFoldings[caseFoldingCount].simple, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* there is a simple mapping only if there is exactly one code point */
|
||||
if(count!=1) {
|
||||
caseFoldings[caseFoldingCount].simple=0;
|
||||
}
|
||||
|
||||
/* check the status */
|
||||
if(status=='S') {
|
||||
/* check if there was a full mapping for this code point before */
|
||||
if( caseFoldingCount>0 &&
|
||||
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
||||
caseFoldings[caseFoldingCount-1].status=='F'
|
||||
) {
|
||||
/* merge the two entries */
|
||||
caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
|
||||
return;
|
||||
}
|
||||
} else if(status=='F') {
|
||||
/* check if there was a simple mapping for this code point before */
|
||||
if( caseFoldingCount>0 &&
|
||||
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
||||
caseFoldings[caseFoldingCount-1].status=='S'
|
||||
) {
|
||||
/* merge the two entries */
|
||||
uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
|
||||
return;
|
||||
}
|
||||
} else if(status=='I') {
|
||||
/* store only a marker for special handling for cases like dotless i */
|
||||
caseFoldings[caseFoldingCount].simple=0;
|
||||
caseFoldings[caseFoldingCount].full[0]=0;
|
||||
}
|
||||
|
||||
if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
|
||||
fprintf(stderr, "genprops: too many case folding mappings\n");
|
||||
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[3][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/* parser for UnicodeData.txt ----------------------------------------------- */
|
||||
|
||||
/* general categories */
|
||||
@ -406,7 +530,7 @@ static void
|
||||
unicodeDataLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
static int32_t mirrorIndex=0, specialCasingIndex=0;
|
||||
static int32_t mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
|
||||
Props p;
|
||||
char *end;
|
||||
uint32_t value;
|
||||
@ -558,6 +682,18 @@ unicodeDataLineFn(void *context,
|
||||
} else {
|
||||
p.specialCasing=NULL;
|
||||
}
|
||||
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
|
||||
p.caseFolding=caseFoldings+caseFoldingIndex++;
|
||||
|
||||
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
|
||||
if( p.caseFolding->status=='C' &&
|
||||
p.caseFolding->simple==p.lowerCase
|
||||
) {
|
||||
p.caseFolding=NULL;
|
||||
}
|
||||
} else {
|
||||
p.caseFolding=NULL;
|
||||
}
|
||||
|
||||
addProps(&p);
|
||||
}
|
||||
|
@ -30,6 +30,13 @@ typedef struct {
|
||||
UChar lowerCase[32], upperCase[32], titleCase[32];
|
||||
} SpecialCasing;
|
||||
|
||||
/* case folding data */
|
||||
typedef struct {
|
||||
uint32_t code, simple;
|
||||
char status;
|
||||
UChar full[32];
|
||||
} CaseFolding;
|
||||
|
||||
/* character properties */
|
||||
typedef struct {
|
||||
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
|
||||
@ -38,6 +45,7 @@ typedef struct {
|
||||
uint32_t denominator; /* 0: no value */
|
||||
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, hasNumericValue;
|
||||
SpecialCasing *specialCasing;
|
||||
CaseFolding *caseFolding;
|
||||
} Props;
|
||||
|
||||
/* global flags */
|
||||
|
@ -204,6 +204,7 @@ bit
|
||||
5 has denominator value
|
||||
6 has a mirror-image Unicode code point
|
||||
7 has SpecialCasing.txt entries
|
||||
8 has CaseFolding.txt entries
|
||||
|
||||
According to the flags in this word, one or more uint32_t words follow it
|
||||
in the sequence of the bit flags in the flags word; if a flag is not set,
|
||||
@ -245,6 +246,19 @@ One UChar value with lengths as follows:
|
||||
|
||||
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
|
||||
|
||||
For case folding mappings, the 32-bit exception word contains:
|
||||
31..24 number of UChars used for the full mapping
|
||||
23..16 reserved
|
||||
15.. 0 UChar offset from the beginning of the UChars array where the
|
||||
UChars for the case folding mappings are stored in the following format:
|
||||
|
||||
Format of case folding UChars:
|
||||
Two UChars contain the simple mapping as follows:
|
||||
0, 0 no simple mapping
|
||||
BMP,0 a simple mapping to a BMP code point
|
||||
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
|
||||
This is followed by the UChars for the full case folding mappings.
|
||||
|
||||
Example:
|
||||
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
|
||||
mapping and a numeric value.
|
||||
@ -267,7 +281,7 @@ static UDataInfo dataInfo={
|
||||
0,
|
||||
|
||||
0x55, 0x50, 0x72, 0x6f, /* dataFormat="UPro" */
|
||||
1, 2, 0, 0, /* formatVersion */
|
||||
1, 3, 0, 0, /* formatVersion */
|
||||
3, 0, 0, 0 /* dataVersion */
|
||||
};
|
||||
|
||||
@ -365,35 +379,10 @@ addUChars(const UChar *s, uint32_t length);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/* ### this must become public in putil.c */
|
||||
static void
|
||||
__versionFromString(UVersionInfo versionArray, const char *versionString) {
|
||||
char *end;
|
||||
uint16_t part=0;
|
||||
|
||||
if(versionArray==NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(versionString!=NULL) {
|
||||
for(;;) {
|
||||
versionArray[part]=(uint8_t)uprv_strtoul(versionString, &end, 10);
|
||||
if(end==versionString || ++part==U_MAX_VERSION_LENGTH || *end!=U_VERSION_DELIMITER) {
|
||||
break;
|
||||
}
|
||||
versionString=end+1;
|
||||
}
|
||||
}
|
||||
|
||||
while(part<U_MAX_VERSION_LENGTH) {
|
||||
versionArray[part++]=0;
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
setUnicodeVersion(const char *v) {
|
||||
UVersionInfo version;
|
||||
__versionFromString(version, v);
|
||||
u_versionFromString(version, v);
|
||||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||
}
|
||||
|
||||
@ -529,6 +518,10 @@ addProps(Props *p) {
|
||||
x=EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
if(p->caseFolding!=NULL) {
|
||||
x=EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
|
||||
/* handle exceptions */
|
||||
if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
|
||||
@ -598,9 +591,9 @@ addProps(Props *p) {
|
||||
exceptions[value+length++]=p->mirrorMapping;
|
||||
}
|
||||
if(p->specialCasing!=NULL) {
|
||||
first|=0x80;
|
||||
if(p->specialCasing->isComplex) {
|
||||
/* complex special casing */
|
||||
first|=0x80;
|
||||
exceptions[value+length++]=0x80000000;
|
||||
} else {
|
||||
/* unconditional special casing */
|
||||
@ -630,10 +623,33 @@ addProps(Props *p) {
|
||||
}
|
||||
u[0]=entry;
|
||||
|
||||
first|=0x80;
|
||||
exceptions[value+length++]=(i<<24)|addUChars(u, i);
|
||||
}
|
||||
}
|
||||
if(p->caseFolding!=NULL) {
|
||||
first|=0x100;
|
||||
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
|
||||
/* special case folding, store only a marker */
|
||||
exceptions[value+length++]=0;
|
||||
} else {
|
||||
/* normal case folding with a simple and a full mapping */
|
||||
UChar u[128];
|
||||
uint16_t i;
|
||||
|
||||
/* store the simple mapping into the first two UChars */
|
||||
i=0;
|
||||
u[1]=0;
|
||||
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
|
||||
|
||||
/* store the full mapping after that */
|
||||
i=p->caseFolding->full[0];
|
||||
if(i>0) {
|
||||
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
|
||||
}
|
||||
|
||||
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
|
||||
}
|
||||
}
|
||||
exceptions[value]=first;
|
||||
exceptionsTop+=length;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user