ICU-96 tailoring cleanup, general collation cleanup

X-SVN-Rev: 3896
This commit is contained in:
Vladimir Weinstein 2001-03-03 09:27:42 +00:00
parent 8e85d63536
commit 454ade03e9
3 changed files with 113 additions and 73 deletions

View File

@ -344,7 +344,7 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
/* This loop has to change the CE at the end of contraction REDO!*/
uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, TRUE, status);
} else {
//fprintf(stderr, "Warning - trying to overwrite already existing data for codepoint %04X\n", element->cPoints[0]);
fprintf(stderr, "Warning - trying to overwrite already existing data for codepoint %04X\n", element->cPoints[0]);
//*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else {

View File

@ -212,8 +212,8 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokListHeader *lh) {
lh->gapsHi[3*i+1] = 0;
lh->gapsHi[3*i+2] = 0;
lh->gapsLo[3*i] = 0;
lh->gapsHi[3*i+1] = 0;
lh->gapsHi[3*i+1] = 0;
lh->gapsLo[3*i+1] = 0;
lh->gapsLo[3*i+2] = 0;
lh->numStr[i] = 0;
lh->fStrToken[i] = NULL;
lh->lStrToken[i] = NULL;
@ -236,7 +236,6 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokListHeader *lh) {
}
tok = tok->next;
}
if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
/* check if previous interval is the same and merge the intervals if it is so */
if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
@ -246,7 +245,6 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokListHeader *lh) {
lh->pos[tokStrength+1] = -1;
}
}
if(tok != NULL) {
tokStrength = tok->strength;
} else {
@ -333,8 +331,8 @@ ucol_close(UCollator *coll)
if(coll->mapping != NULL) {
ucmp32_close(coll->mapping);
}
if(coll->rules != NULL) {
uprv_free(coll->rules);
if(coll->rules != NULL && coll->freeRulesOnClose) {
uprv_free((UChar *)coll->rules);
}
if(coll->rb != NULL) { /* pointing to read-only memory */
ures_close(coll->rb);
@ -358,16 +356,30 @@ ucol_close(UCollator *coll)
U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g) {
g->current += (1<<(32-(g->byteSize*8)));
if(g->current > g->fLow && g->current < g->fHigh) {
g->current = g->fHigh;
}
return g->current;
}
U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t high, uint32_t count) {
static uint32_t fbHigh[3] = {0, /*0,*/UCOL_COMMON_TOP2, 0};
static uint32_t fbLow[3] = {0, /*0,*/UCOL_COMMON_BOT2, 0};
U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t high, UColToken *tok, uint32_t strength) {
uint32_t count = tok->toInsert;
g->fHigh = fbHigh[strength];
g->fLow = fbLow[strength];
uint32_t lobytes = 0, hibytes = 0;
ucol_countBytes(low, lobytes);
ucol_countBytes(high, hibytes);
if(low == high && strength > 0) {
high = (0xFF << (32-hibytes*8));
}
g->firstLow = low + (1 << (32-lobytes*8));
g->lastHigh = high - (1 << (32-hibytes*8));
@ -383,6 +395,19 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t
ucol_countBytes(g->lastMid, g->midByteCount);
ucol_countBytes(g->lastHigh, g->highByteCount);
if(g->firstLow < low || g->lastLow > high) {
g->firstLow = g->lastLow = 0;
g->lowByteCount = 0xFFFF;
}
if(g->firstMid < low || g->lastMid > high) {
g->firstMid = g->lastMid = 0;
g->midByteCount = 0xFFFF;
}
if(g->firstHigh < low || g->lastHigh > high) {
g->firstHigh = g->lastHigh = 0;
g->highByteCount = 0xFFFF;
}
g->lowCount = (g->lastLow - g->firstLow) >> (32-g->lowByteCount*8);
g->midCount = (g->lastMid - g->firstMid) >> (32-g->midByteCount*8);
@ -395,38 +420,38 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t
g->limit = 0;
/* Let's get the best one now */
if(g->lowCount > count ) {
if(g->lowCount > count+(g->fHigh - g->fLow) ) {
g->byteSize = g->lowByteCount;
g->start = g->firstLow;
g->limit = g->lastLow;
}
if(g->midCount > count && g->midByteCount < g->byteSize) {
if(g->midCount > count+(g->fHigh - g->fLow) && g->midByteCount < g->byteSize) {
g->byteSize = g->midByteCount;
g->start = g->firstMid;
g->limit = g->lastMid;
}
if(g->highCount > count && g->highByteCount < g->byteSize) {
if(g->highCount > count+(g->fHigh - g->fLow) && g->highByteCount < g->byteSize) {
g->byteSize = g->highByteCount;
g->start = g->firstHigh;
g->limit = g->lastHigh;
}
if(g->byteSize == 0xFFFFFFFF) { /* Still no solution */
if((g->lowCount)*254 > count ) {
if((g->lowCount)*254 > count+(g->fHigh - g->fLow) ) {
g->byteSize = g->lowByteCount+1;
g->start = g->firstLow | (0x02 << (32-g->byteSize*8));
g->limit = g->lastLow;
}
if((g->midCount)*254 > count && g->midByteCount+1 < g->byteSize) {
if((g->midCount)*254 > count+(g->fHigh - g->fLow) && g->midByteCount+1 < g->byteSize) {
g->byteSize = g->midByteCount+1;
g->start = g->firstMid | (0x02 << (32-g->byteSize*8));
g->limit = g->lastMid;
}
if((g->highCount)*254 > count && g->highByteCount+1 < g->byteSize) {
if((g->highCount)*254 > count+(g->fHigh - g->fLow) && g->highByteCount+1 < g->byteSize) {
g->byteSize = g->highByteCount+1;
g->start = g->firstHigh | (0x02 << (32-g->byteSize*8));
g->limit = g->lastHigh | (0xFF << (32-g->byteSize*8));
@ -447,6 +472,12 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t
g->limit = g->firstLow | (0xFFFF << (32-g->byteSize*8));
}
}
g->fLow = g->fLow << 24;
g->fHigh = g->fHigh << 24;
if(g->current > g->fLow && g->current < g->fHigh) {
g->current = g->fHigh;
}
return g->current;
}
@ -560,7 +591,6 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
*/
ucol_inv_getGapPositions(lh);
#if 0
fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
int32_t j = 2;
@ -569,7 +599,6 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
}
#endif
/* I strongly believe that this code can be refactored and simplified. */
/* have to do CE generation now, so let this soak a little bit */
@ -603,7 +632,7 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
}
CEparts[0] = lh->gapsLo[fStrength*3];
CEparts[1] = lh->gapsLo[fStrength*3+1];
CEparts[2] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY);
while(tok != NULL && tok->strength >= UCOL_TERTIARY) {
ucol_doCE(CEparts, tok, tailored, status);
@ -631,11 +660,11 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
/* &0 [, <funny_tertiary_different_zero>] ; <funny_secondary_different_zero> = FunnySecZero */
CEparts[0] = lh->gapsLo[fStrength*3];
CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok->toInsert);
CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);
if(tok->next->strength == UCOL_TERTIARY) {
CEparts[2] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next, UCOL_TERTIARY);
} else {
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
}
ucol_doCE(CEparts, tok, tailored, status);
@ -648,9 +677,9 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
} else if(tok->strength == UCOL_SECONDARY) {
CEparts[1] = ucol_getNextGenerated(&Gens[1]);
if(tok->next->strength == UCOL_SECONDARY) {
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
} else {
CEparts[2] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next, UCOL_TERTIARY);
}
ucol_doCE(CEparts, tok, tailored, status);
} else { /* Strength is identical */
@ -690,16 +719,16 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
/* How should the things be set up */
if(tok->next != NULL) {
CEparts[0] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok->toInsert);
CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);
if(tok->next->strength == UCOL_PRIMARY) {
CEparts[1] = 0x03000000;
CEparts[2] = 0x03000000;
} else { /* Secondaries will also be generated */
CEparts[1] = ucol_getCEGenerator(&Gens[1], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[1] = ucol_getCEGenerator(&Gens[1], 0x02000000, 0xFF000000, tok->next, 1);
if(tok->next->strength == UCOL_SECONDARY) {
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
} else {
CEparts[2] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next, UCOL_TERTIARY);
}
}
@ -718,23 +747,23 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
} else if(tok->strength == UCOL_SECONDARY) {
CEparts[1] = ucol_getNextGenerated(&Gens[1]);
if(tok->next->strength == UCOL_TERTIARY) {
CEparts[2] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next, UCOL_TERTIARY);
} else { /* UCOL_SECONDARY */
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
}
ucol_doCE(CEparts, tok, tailored, status);
} else {
CEparts[0] = ucol_getNextGenerated(&Gens[0]);
if(tok->next->strength == UCOL_PRIMARY) {
CEparts[1] = 0x03000000;
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
} else {
if(tok->next->strength == UCOL_SECONDARY) {
CEparts[2] = 0x03000000;
CEparts[UCOL_TERTIARY] = 0x03000000;
} else { /* UCOL_TERTIARY */
CEparts[2] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], 0x02000000, 0xFF000000, tok->next, UCOL_TERTIARY);
}
CEparts[1] = ucol_getCEGenerator(&Gens[1], 0x02000000, 0xFF000000, tok->next->toInsert);
CEparts[1] = ucol_getCEGenerator(&Gens[1], 0x02000000, 0xFF000000, tok->next, 1);
}
ucol_doCE(CEparts, tok, tailored, status);
}
@ -848,9 +877,9 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
#define STR_LENGTH_MASK 0x0003
int32_t uprv_ucol_decompose (UChar curChar, UChar *result) {
int32_t minDecomp = 0;
int32_t resSize = 0;
/* either 0 or MAX_COMPAT = 11177 if we want just canonical */
int32_t minDecomp = 11177;
int32_t resSize = 0;
uint16_t offset = ucmp16_getu(DecompData::offsets, curChar);
uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
@ -897,9 +926,10 @@ int32_t uprv_ucol_decompose (UChar curChar, UChar *result) {
}
void ucol_setupClosureCE(tempUCATable *t, UChar *decomp, uint32_t noOfDec, UCAElements *el, UErrorCode *status) {
uint32_t ucol_getDynamicCEs(tempUCATable *t, UChar *decomp, uint32_t noOfDec, uint32_t *result, uint32_t resultSize, UErrorCode *status) {
uint32_t j = 0, i = 0;
uint32_t CE = 0;
uint32_t resLen = 0;
collIterate colIt;
UBool lastNotFound = FALSE;
@ -912,11 +942,11 @@ void ucol_setupClosureCE(tempUCATable *t, UChar *decomp, uint32_t noOfDec, UCAEl
while(CE != UCOL_NO_MORE_CES) {
CE = ucol_getNextCE(UCA, &colIt, status);
if(CE != UCOL_NO_MORE_CES) {
el->CEs[el->noOfCEs++] = CE;
result[resLen++] = CE;
}
}
} else if(CE < UCOL_NOT_FOUND) { /*normal CE */
el->CEs[el->noOfCEs++] = CE;
result[resLen++] = CE;
} else { /* special CE, contraction, expansion or Thai */
for(;;) {
uint32_t tag = getCETag(CE);
@ -925,11 +955,11 @@ void ucol_setupClosureCE(tempUCATable *t, UChar *decomp, uint32_t noOfDec, UCAEl
uint32_t size = getExpansionCount(CE);
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
for(i = 1; i<size; i++) {
el->CEs[el->noOfCEs++] = *CEOffset++;
result[resLen++] = *CEOffset++;
}
} else { /* else, we do */
while(*CEOffset != 0) {
el->CEs[el->noOfCEs++] = *CEOffset++;
result[resLen++] = *CEOffset++;
}
}
break;
@ -954,7 +984,7 @@ void ucol_setupClosureCE(tempUCATable *t, UChar *decomp, uint32_t noOfDec, UCAEl
} else if(CE > UCOL_NOT_FOUND) {
continue;
} else {
el->CEs[el->noOfCEs++] = CE;
result[resLen++] = CE;
break;
}
}
@ -963,6 +993,7 @@ void ucol_setupClosureCE(tempUCATable *t, UChar *decomp, uint32_t noOfDec, UCAEl
}
j++;
}
return resLen;
}
UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
@ -1036,33 +1067,32 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
UCAElements el;
el.isThai = FALSE;
collIterate colIt;
/*
myData = uprv_uca_assembleTable(t, status);
UCollator *temp = ucol_initCollator(myData, NULL, status);
temp->rules = NULL;
temp->rb = NULL;
temp->hasRealData = FALSE;
*/
uint32_t decompCE[256];
uint32_t compCE[256];
uint32_t decompRes = 0, compRes = 0;
/* produce canonical & compatibility closure */
for(u = 0; u < 0x10000; u++) {
if((noOfDec = unorm_normalize((const UChar *)&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1) {
/*if((noOfDec = uprv_ucol_decompose ((UChar)u, decomp)) > 1) {*/
for(i = 0; i<noOfDec; i++) {
if((CE = ucmp32_get(t->mapping, decomp[i])) != UCOL_NOT_FOUND) {
/*if((noOfDec = unorm_normalize((const UChar *)&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1)*/
if((noOfDec = uprv_ucol_decompose ((UChar)u, decomp)) > 1) {
compRes = ucol_getDynamicCEs(t, (UChar *)&u, 1, compCE, 256, status);
decompRes = ucol_getDynamicCEs(t, decomp, noOfDec, decompCE, 256, status);
if((compRes != decompRes) || (uprv_memcmp(compCE, decompCE, compRes*sizeof(uint32_t)) != 0)) {
el.uchars[0] = (UChar)u;
el.cPoints = el.uchars;
el.codepoint = (UChar)u;
el.cSize = 1;
el.noOfCEs = 0;
ucol_setupClosureCE(t, decomp, noOfDec, &el, status);
el.noOfCEs = decompRes;
for(i = 0; i<decompRes; i++) {
el.CEs[i] = decompCE[i];
}
uprv_uca_addAnElement(t, &el, status);
break;
}
}
}
}
/*ucol_close(temp);*/
/* add latin-1 stuff */
for(u = 0; u<0x100; u++) {
@ -1173,8 +1203,8 @@ ucol_openRules( const UChar *rules,
result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
if(U_SUCCESS(*status)) {
result->rules = (UChar *)uprv_malloc((u_strlen(rules)+1)*sizeof(UChar));
u_strcpy(result->rules, rules);
u_strcpy((UChar *)result->rules, rules);
result->freeRulesOnClose = TRUE;
result->rb = 0;
} else {
if(table != NULL) {
@ -1632,7 +1662,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
}
/* Make up an artifical CE from code point as per UCA */
order = 0xD0800303 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11;
*(collationSource->CEpos++) = 0x04000083 | (ch & 0x001F) << 27;
*(collationSource->CEpos++) = 0x04000080 | (ch & 0x001F) << 27;
}
}
return order; /* return the CE */
@ -1764,7 +1794,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
*(collationSource->CEpos ++) = 0xD0800303 | (ch & 0xF000) << 12 |
(ch & 0x0FE0) << 11;
collationSource->toReturn = collationSource->CEpos;
order = 0x04000083 | (ch & 0x001F) << 27;
order = 0x04000080 | (ch & 0x001F) << 27;
*(collationSource->CEpos ++) = order;
}
}
@ -1843,15 +1873,16 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
/* First we position ourselves at the begining of contraction sequence */
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
/* we need to convey the notion of having a backward search - most probably through the context object */
/* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */
UCharOffset++; /* skip the backward offset, see above */
if (source->pos>=source->len) { /* this is the end of string */
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); /* So we'll pick whatever we have at the point... */
break;
}
/* we need to convey the notion of having a backward search - most probably through the context object */
/* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */
UCharOffset++; /* skip the backward offset, see above */
schar = *source->pos++;
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
@ -3328,8 +3359,8 @@ ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int3
if(delta == UCOL_FULL_RULES) {
UErrorCode status = U_ZERO_ERROR;
/* take the UCA rules and append real rules at the end */
/* UCA rules will be probably coming from the root RB */\
ucaRules = ures_getStringByKey(coll->rb,"%%UCARules",&UCAlen,&status);
/* UCA rules will be probably coming from the root RB */
ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status);
}
if(bufferLen >= len + UCAlen) {
@ -3346,15 +3377,21 @@ U_CAPI const UChar*
ucol_getRules( const UCollator *coll,
int32_t *length)
{
/*
const UnicodeString& rules = ((RuleBasedCollator*)coll)->getRules();
*length = rules.length();
return rules.getUChars();
*/
if(coll->rules != NULL) {
*length = u_strlen(coll->rules);
return coll->rules;
} else {
UErrorCode status = U_ZERO_ERROR;
if(coll->rb != NULL) {
UResourceBundle *collElem = ures_getByKey(coll->rb, "CollationElements", NULL, &status);
if(U_SUCCESS(status)) {
/*Semantic const */
((UCollator *)coll)->rules = ures_getStringByKey(collElem, "Sequence", length, &status);
((UCollator *)coll)->freeRulesOnClose = FALSE;
ures_close(collElem);
return coll->rules;
}
}
*length = 0;
return &coll->zero;
}

View File

@ -483,8 +483,10 @@ typedef struct {
uint32_t byteSize; uint32_t start; uint32_t limit;
int32_t count;
uint32_t current;
uint32_t fLow; /*forbidden Low */
uint32_t fHigh; /*forbidden High */
} ucolCEGenerator;
typedef struct {
int32_t size;
/* all the offsets are in bytes */
@ -557,7 +559,8 @@ struct UCollator {
UBool hasRealData; /* some collators have only options, like French, no rules */
/* to speed up things, we use the UCA image, but we don't want it */
/* to run around */
UChar *rules;
const UChar *rules;
UBool freeRulesOnClose;
UChar zero;
UDataInfo dataInfo; /* Data info of UCA table */