ICU-3969 store case-ignorable flags, maxFullLength; other fixes

X-SVN-Rev: 16258
This commit is contained in:
Markus Scherer 2004-09-06 16:00:23 +00:00
parent 29038e96b7
commit 057bcfd819
2 changed files with 125 additions and 50 deletions

View File

@ -224,7 +224,7 @@ main(int argc, char* argv[]) {
"Usage: %s [-options] [suffix]\n"
"\n"
"read the UnicodeData.txt file and other Unicode properties files and\n"
"create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the character properties\n"
"create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
"\n",
argv[0]);
fprintf(stderr,
@ -347,6 +347,34 @@ isToken(const char *token, const char *s) {
return FALSE;
}
/*
 * Look up the token at the start of s in a table of token strings.
 *
 * Leading white space in s is skipped first. A table entry matches when all
 * of its characters equal the corresponding characters of s and, after any
 * following white space, s continues with one of: ';', '#', CR, LF or the
 * terminating NUL. NULL table entries are ignored.
 *
 * @param tokens      table of NUL-terminated token strings (entries may be NULL)
 * @param countTokens number of entries in tokens
 * @param s           NUL-terminated input text
 * @return index of the first matching table entry, or -1 if none matches
 */
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *candidate, *rest;
    int32_t index, pos;

    s=u_skipWhitespace(s);

    for(index=0; index<countTokens; ++index) {
        candidate=tokens[index];
        if(candidate==NULL) {
            continue;
        }

        /* walk over the prefix that the candidate and the input share */
        pos=0;
        while(candidate[pos]!=0 && s[pos]==candidate[pos]) {
            ++pos;
        }
        if(candidate[pos]!=0) {
            /* the input differs before the candidate ends */
            continue;
        }

        /* the whole candidate matched: it must be followed by a field terminator */
        rest=u_skipWhitespace(s+pos);
        switch(*rest) {
        case ';':
        case 0:
        case '#':
        case '\r':
        case '\n':
            return index;
        default:
            break;
        }
    }

    return -1;
}
static void
_set_addAll(USet *set, const UChar *s, int32_t length) {
UChar32 c;
@ -596,6 +624,19 @@ parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
/* parser for UnicodeData.txt ----------------------------------------------- */
/* general categories */
/*
 * Two-letter general category abbreviations as they appear in field 2 of
 * UnicodeData.txt. The array index of a name is what gets stored as the
 * numeric general category (p.gc), so the order here must stay in sync with
 * the numeric category constants (e.g. U_TITLECASE_LETTER for "Lt").
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    /* unassigned */
    "Cn",
    /* letters */
    "Lu", "Ll", "Lt", "Lm", "Lo",
    /* marks */
    "Mn", "Me", "Mc",
    /* numbers */
    "Nd", "Nl", "No",
    /* separators */
    "Zs", "Zl", "Zp",
    /* other: control, format, private use, surrogate */
    "Cc", "Cf", "Co", "Cs",
    /* punctuation */
    "Pd", "Ps", "Pe", "Pc", "Po",
    /* symbols */
    "Sm", "Sc", "Sk", "So",
    /* initial/final quote punctuation */
    "Pi", "Pf"
};
/*
 * Cursors into specialCasings[] and caseFoldings[].
 * While UnicodeData.txt lines are processed, a cursor is advanced each time
 * the entry it points to has the same code point as the current line.
 * NOTE(review): this presumably relies on both tables and UnicodeData.txt
 * being ordered by code point - confirm against the parsers.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
static void U_CALLCONV
@ -606,7 +647,7 @@ unicodeDataLineFn(void *context,
char *end;
static UChar32 prevCode=0;
UChar32 value;
UBool something=FALSE;
int32_t i;
/* reset the properties */
uprv_memset(&p, 0, sizeof(Props));
@ -620,9 +661,14 @@ unicodeDataLineFn(void *context,
}
/* get general category, field 2 */
if(isToken("Lt", fields[2][0])) {
p.isTitle=TRUE;
something=TRUE;
i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
if(i>=0) {
p.gc=(uint8_t)i;
} else {
fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
fields[2][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get canonical combining class, field 3 */
@ -632,10 +678,7 @@ unicodeDataLineFn(void *context,
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value>0) {
p.cc=(uint8_t)value;
something=TRUE;
}
p.cc=(uint8_t)value;
/* get uppercase mapping, field 12 */
value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
@ -649,7 +692,6 @@ unicodeDataLineFn(void *context,
p.upperCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* get lowercase value, field 13 */
@ -664,7 +706,6 @@ unicodeDataLineFn(void *context,
p.lowerCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* get titlecase value, field 14 */
@ -679,19 +720,16 @@ unicodeDataLineFn(void *context,
p.titleCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* set additional properties from previously parsed files */
if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
p.specialCasing=specialCasings+specialCasingIndex++;
something=TRUE;
} else {
p.specialCasing=NULL;
}
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
p.caseFolding=caseFoldings+caseFoldingIndex++;
something=TRUE;
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
if( p.caseFolding->status=='C' &&
@ -720,9 +758,7 @@ unicodeDataLineFn(void *context,
}
/* properties for a single code point */
if(something) {
setProps(&p);
}
setProps(&p);
prevCode=p.code;
}

View File

@ -53,7 +53,9 @@ The file contains the following structures:
i2 trieSize; -- size in bytes of the case mapping properties trie
i3 exceptionsLength; -- length in uint16_t of the exceptions array
i4..indexes[i0] reservedIndexes; -- reserved values; 0 for now
i4..i14 reservedIndexes; -- reserved values; 0 for now
i15 maxFullLength; -- maximum length of a full case mapping/folding string
Serialized trie, see utrie.h;
@ -69,6 +71,9 @@ if(exception) {
if(not uncased) {
15..6 signed delta to simple case mapping code point
(add delta to input code point)
} else {
6 the code point is case-ignorable
(U+0307 is also case-ignorable but has an exception)
}
5..4 0 normal character with cc=0
1 soft-dotted character
@ -159,6 +164,9 @@ static uint16_t exceptionsTop=0;
static Props excProps[MAX_EXC_COUNT];
static uint16_t exceptionsCount=0;
/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
static int32_t maxFullLength=U16_MAX_LENGTH;
/* -------------------------------------------------------------------------- */
extern void
@ -173,16 +181,16 @@ setUnicodeVersion(const char *v) {
extern void
setProps(Props *p) {
UErrorCode errorCode;
uint32_t value;
uint32_t value, oldValue;
int32_t delta;
uint16_t count;
/* count the case mappings and other values competing for the value bit field */
value=upvec_getValue(pv, p->code, 0);
/* get the non-UnicodeData.txt properties */
value=oldValue=upvec_getValue(pv, p->code, 0);
/* default: map to self */
delta=0;
count=0;
if(p->isTitle) {
if(p->gc==U_TITLECASE_LETTER) {
/* the Titlecase property is read late, from UnicodeData.txt */
value|=UCASE_TITLE;
}
@ -197,7 +205,7 @@ setProps(Props *p) {
}
if(p->lowerCase!=0) {
/* lowercase mapping as delta if the character is uppercase or titlecase */
if((value&UCASE_TYPE_MASK)==UCASE_UPPER || (value&UCASE_TYPE_MASK)==UCASE_TITLE) {
if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
delta=p->lowerCase-p->code;
} else {
value|=UCASE_EXCEPTION;
@ -229,6 +237,33 @@ setProps(Props *p) {
}
}
/* encode case-ignorable as delta==1 on uncased characters */
if(
(value&UCASE_TYPE_MASK)==UCASE_NONE &&
p->code!=0x307 &&
((U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
p->code==0x27 || p->code==0xad || p->code==0x2019)
) {
/*
* We use one of the delta/exception bits, which works because we only
* store the case-ignorable flag for uncased characters.
* There is no delta for uncased characters (see checks above).
* If there is an exception for an uncased, case-ignorable character
* (although there should not be any case mappings if it's uncased)
* then we have a problem.
* There is one character which is case-ignorable but has an exception:
* U+0307 is uncased, Mn, has conditional special casing and
* is therefore handled in code instead.
*/
if(value&UCASE_EXCEPTION) {
fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
(unsigned long)p->code);
exit(U_INTERNAL_PROGRAM_ERROR);
}
delta=1;
}
/* handle exceptions */
if(value&UCASE_EXCEPTION) {
/* simply store exceptions for later processing and encoding */
@ -244,7 +279,9 @@ setProps(Props *p) {
}
errorCode=U_ZERO_ERROR;
if(!upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)) {
if( value!=oldValue &&
!upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)
) {
fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
u_errorName(errorCode));
exit(errorCode);
@ -305,6 +342,28 @@ makeException(uint32_t value, Props *p) {
/* copy and shift the soft-dotted bits */
excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
/* update maxFullLength */
if(p->specialCasing!=NULL) {
length=p->specialCasing->lowerCase[0];
if(length>maxFullLength) {
maxFullLength=length;
}
length=p->specialCasing->upperCase[0];
if(length>maxFullLength) {
maxFullLength=length;
}
length=p->specialCasing->titleCase[0];
if(length>maxFullLength) {
maxFullLength=length;
}
}
if(p->caseFolding!=NULL) {
length=p->caseFolding->full[0];
if(length>maxFullLength) {
maxFullLength=length;
}
}
/* set the bits for conditional mappings */
if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
@ -355,6 +414,7 @@ makeException(uint32_t value, Props *p) {
excWord|=U_MASK(UCASE_EXC_LOWER);
}
if( p->caseFolding!=NULL &&
p->caseFolding->simple!=0 &&
(p->lowerCase!=0 ?
p->caseFolding->simple!=p->lowerCase :
p->caseFolding->simple!=p->code)
@ -461,29 +521,6 @@ makeExceptions() {
/* generate output data ----------------------------------------------------- */
/* TODO: create/use default folding function?! */
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
/*
 * Compute the folded trie value for the block of 0x400 code points that
 * begins at start.
 *
 * The block is scanned for any non-zero trie value. Whenever utrie_get32()
 * reports that the code point lies in the all-zero data block, the scan
 * jumps ahead by UTRIE_DATA_BLOCK_LENGTH instead of by one.
 *
 * @param trie   the build-time trie to inspect
 * @param start  first code point of the range to scan
 * @param offset value to return, with bit 15 set, if the range has data
 * @return (offset|0x8000) if some code point in the range has a non-zero
 *         value, otherwise 0
 */
U_CFUNC uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    UChar32 c, end;
    uint32_t v;
    UBool isZeroBlock;

    end=start+0x400;
    for(c=start; c<end;) {
        v=utrie_get32(trie, c, &isZeroBlock);
        if(!isZeroBlock) {
            if(v!=0) {
                /* found data: store the offset and flag it */
                return (uint32_t)(offset|0x8000);
            }
            ++c;
        } else {
            /* nothing set anywhere in this data block, skip it */
            c+=UTRIE_DATA_BLOCK_LENGTH;
        }
    }

    return 0;
}
extern void
generateData(const char *dataDir) {
static int32_t indexes[UCASE_IX_TOP]={
@ -514,7 +551,7 @@ generateData(const char *dataDir) {
}
}
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
exit(errorCode);
@ -524,6 +561,8 @@ generateData(const char *dataDir) {
indexes[UCASE_IX_TRIE_SIZE]=trieSize;
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;
indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
if(beVerbose) {
printf("trie size in bytes: %5d\n", (int)trieSize);
printf("number of code points with exceptions: %5d\n", exceptionsCount);