ICU-1602 long primary build code. Also fixes a bug that assigned values to implicit CEs in the CJK range (the resulting data was correct, but the bug increased the size of the data file)

X-SVN-Rev: 7411
This commit is contained in:
Vladimir Weinstein 2002-01-08 22:16:56 +00:00
parent a0d2227498
commit ae091aac7b

View File

@ -853,22 +853,6 @@ static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element,
UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
/*CE = ucmpe32_get(t->mapping, cp);*/
CE = utrie_get32(t->mapping, cp, NULL);
#if 0
UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
uprv_memcpy(composed, element, sizeof(UCAElements));
composed->cPoints = composed->uchars;
*composed->cPoints = *element->cPoints;
composed->cSize = unorm_normalize(element->cPoints+1, element->cSize-1, UNORM_NFC, 0, composed->cPoints+1, 128, status);
composed->cSize++;
if(composed->cSize != element->cSize || uprv_memcmp(composed->cPoints+1, element->cPoints+1, element->cSize-1)) {
// do it!
CE = uprv_uca_addContraction(t, CE, composed, status);
#ifdef UCOL_DEBUG
fprintf(stderr, "Adding composed for %04X\n", *element->cPoints);
#endif
}
uprv_free(composed);
#endif
CE = uprv_uca_addContraction(t, CE, element, status);
} else { /* easy case, */
@ -921,30 +905,57 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
element->mapCE = expansion;
}
} else {
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
& 0xFFFFF0);
for(i = 1; i<element->noOfCEs; i++) {
uprv_uca_addExpansion(expansions, element->CEs[i], status);
}
if(element->noOfCEs <= 0xF) {
expansion |= element->noOfCEs;
/* ICU 2.1 long primaries */
/* unfortunately, it looks like we have to look for a long primary here */
/* since in canonical closure we are going to hit some long primaries from */
/* the first phase, and they will come back as continuations/expansions */
/* destroying the effect of the previous optimization */
/* A long primary is a three byte primary with starting secondaries and tertiaries */
/* It can appear in long runs of only primary differences (like east Asian tailorings) */
/* also, it should not be an expansion, as expansions would break with this */
// This part came in from ucol_bld.cpp
//if(tok->expansion == 0
//&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
//&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
/* we will construct a special CE that will go unchanged to the table */
if(element->noOfCEs == 2 // a two CE expansion
&& isContinuation(element->CEs[1]) // which is a continuation
&& (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
&& (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
&& ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
) {
#ifdef UCOL_DEBUG
fprintf(stdout, "Long primary %04X\n", element->cPoints[0]);
#endif
element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
| ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
| ((element->CEs[1]>>24) & 0xFF); // third byte of primary
} else {
uprv_uca_addExpansion(expansions, 0, status);
}
element->mapCE = expansion;
uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
(uint8_t)element->noOfCEs,
t->maxExpansions,
status);
if(UCOL_ISJAMO(element->cPoints[0])) {
t->image->jamoSpecial = TRUE;
uprv_uca_setMaxJamoExpansion(element->cPoints[0],
element->CEs[element->noOfCEs - 1],
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
& 0xFFFFF0);
for(i = 1; i<element->noOfCEs; i++) {
uprv_uca_addExpansion(expansions, element->CEs[i], status);
}
if(element->noOfCEs <= 0xF) {
expansion |= element->noOfCEs;
} else {
uprv_uca_addExpansion(expansions, 0, status);
}
element->mapCE = expansion;
uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
(uint8_t)element->noOfCEs,
t->maxJamoExpansions,
t->maxExpansions,
status);
if(UCOL_ISJAMO(element->cPoints[0])) {
t->image->jamoSpecial = TRUE;
uprv_uca_setMaxJamoExpansion(element->cPoints[0],
element->CEs[element->noOfCEs - 1],
(uint8_t)element->noOfCEs,
t->maxJamoExpansions,
status);
}
}
}
@ -987,7 +998,7 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
CE = uprv_uca_finalizeAddition(t, element, status);
if(element->cSize > 1) { // this is a contraction, we should check whether a composed form should also be included
if(element->cSize > 1 && !(element->cSize==2 && UTF16_IS_LEAD(element->cPoints[0]) && UTF16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included
UChar composed[256];
uint32_t compLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, composed, 256, status);;
@ -1072,6 +1083,7 @@ static inline uint32_t getFoldedValue(UNewTrie *trie, UChar32 start, int32_t off
uint32_t tag;
UChar32 limit;
UBool inBlockZero;
static int32_t count = 1;
limit=start+0x400;
while(start<limit) {
@ -1079,7 +1091,10 @@ static inline uint32_t getFoldedValue(UNewTrie *trie, UChar32 start, int32_t off
tag = getCETag(value);
if(inBlockZero == TRUE) {
start+=UTRIE_DATA_BLOCK_LENGTH;
} else if(value!=0 && tag != IMPLICIT_TAG) {
} else if(value!=0 && tag != IMPLICIT_TAG && tag != NOT_FOUND_TAG) {
#ifdef UCOL_DEBUG
fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value);
#endif
return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset);
} else {
++start;
@ -1088,6 +1103,28 @@ static inline uint32_t getFoldedValue(UNewTrie *trie, UChar32 start, int32_t off
return 0;
}
#ifdef UCOL_DEBUG
// Debug-only range callback used to dump the contents of a trie.
// It is passed to utrie_enum() by the UCOL_DEBUG code that follows the
// utrie_unserialize() call in uprv_uca_assembleTable.
//
// context - not used by this callback
// start   - first code point of the range being reported
// limit   - limit code point of the range being reported
// value   - 32-bit trie value associated with the range
//
// One line is written to stdout per call. When start is 0x10000 or above,
// the UTF-16 lead/trail units of both start and limit are printed as well.
void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    (void)context; /* the dump needs no caller state */

    if(start>=0x10000) {
        /* supplementary range: show the surrogate pair next to each code point */
        fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n",
                start, UTF16_LEAD(start), UTF16_TRAIL(start),
                limit, UTF16_LEAD(limit), UTF16_TRAIL(limit),
                value);
        return;
    }

    /* range starting below 0x10000: plain code points are enough */
    fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
}
// Debug-only folding-offset callback; the UCOL_DEBUG dump code installs it
// as UCAt.getFoldingOffset before enumerating the unserialized trie.
//
// data - 32-bit value read from the trie
//
// Returns the low 24 bits of data when data is greater than UCOL_NOT_FOUND
// and carries SURROGATE_TAG (the layout getFoldedValue produces:
// UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset); returns 0 otherwise.
int32_t
myGetFoldingOffset(uint32_t data) {
    /* values up to and including UCOL_NOT_FOUND never carry an offset */
    if(data <= UCOL_NOT_FOUND) {
        return 0;
    }
    /* only surrogate specials hold a folding offset */
    if(getCETag(data) != SURROGATE_TAG) {
        return 0;
    }
    return (data&0xFFFFFF);
}
#endif
U_CAPI UCATableHeader* U_EXPORT2
uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
/*CompactEIntArray *mapping = t->mapping;*/
@ -1194,8 +1231,20 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
myData->mappingPosition = tableOffset;
utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status);
#ifdef UCOL_DEBUG
// This is debug code to dump the contents of the trie. It needs two functions defined above
{
UTrie UCAt = { 0 };
utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
UCAt.getFoldingOffset = myGetFoldingOffset;
if(U_SUCCESS(*status)) {
utrie_enum(&UCAt, NULL, enumRange, NULL);
}
}
#endif
tableOffset += paddedsize(mappingSize);
int32_t i = 0;
#if 0
/* construct the fast tracker for latin one*/