#include "ucol_bld.h" /* checkout this one - it might be replaceable by something faster */ #include "dcmpdata.h" static const InverseTableHeader* invUCA = NULL; static UBool U_CALLCONV isAcceptableInvUCA(void *context, const char *type, const char *name, const UDataInfo *pInfo){ /* context, type & name are intentionally not used */ if( pInfo->size>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x49 && /* dataFormat="InvC" */ pInfo->dataFormat[1]==0x6e && pInfo->dataFormat[2]==0x76 && pInfo->dataFormat[3]==0x43 && pInfo->formatVersion[0]==1 && pInfo->dataVersion[0]==3 && pInfo->dataVersion[1]==0 && pInfo->dataVersion[2]==0 && pInfo->dataVersion[3]==0) { return TRUE; } else { return FALSE; } } int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) { uint32_t bottom = 0, top = invUCA->tableSize; uint32_t i = 0; uint32_t first = 0, second = 0; uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); while(bottom < top-1) { i = (top+bottom)/2; first = *(CETable+3*i); second = *(CETable+3*i+1); if(first > CE) { top = i; } else if(first < CE) { bottom = i; } else { if(second > SecondCE) { top = i; } else if(second < SecondCE) { bottom = i; } else { break; } } } if((first == CE && second == SecondCE)) { return i; } else { return -1; } } static uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 0xFFFF0000, 0xFFFFFF00, 0xFFFFFFFF }; int32_t ucol_inv_getPrevious(UColTokListHeader *lh, uint32_t strength) { uint32_t CE = lh->baseCE; uint32_t SecondCE = lh->baseContCE; uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); uint32_t previousCE, previousContCE; int32_t iCE; iCE = ucol_inv_findCE(CE, SecondCE); if(iCE<0) { return -1; } CE &= strengthMask[strength]; SecondCE &= strengthMask[strength]; previousCE = CE; previousContCE = SecondCE; while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { previousCE = (*(CETable+3*(--iCE))); previousContCE = (*(CETable+3*(iCE)+1)); } lh->previousCE = previousCE; lh->previousContCE = previousContCE; return iCE; } int32_t ucol_inv_getNext(UColTokListHeader *lh, uint32_t strength) { uint32_t CE = lh->baseCE; uint32_t SecondCE = lh->baseContCE; uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); uint32_t nextCE, nextContCE; int32_t iCE; iCE = ucol_inv_findCE(CE, SecondCE); if(iCE<0) { return -1; } CE &= strengthMask[strength]; SecondCE &= strengthMask[strength]; nextCE = CE; nextContCE = SecondCE; while((nextCE & strengthMask[strength]) == CE && (nextContCE & strengthMask[strength]) == SecondCE) { nextCE = (*(CETable+3*(++iCE))); nextContCE = (*(CETable+3*(iCE)+1)); } lh->nextCE = nextCE; lh->nextContCE = nextContCE; return iCE; } U_CFUNC void ucol_inv_getGapPositions(UColTokListHeader *lh) { /* reset all the gaps */ int32_t i = 0; uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table); uint32_t st = 0; uint32_t t1, t2; int32_t pos; UColToken *tok = lh->first[UCOL_TOK_POLARITY_POSITIVE]; uint32_t tokStrength = tok->strength; for(i = 0; i<3; i++) { lh->gapsHi[3*i] = 0; lh->gapsHi[3*i+1] = 0; lh->gapsHi[3*i+2] = 0; lh->gapsLo[3*i] = 0; lh->gapsLo[3*i+1] = 0; lh->gapsLo[3*i+2] = 0; lh->numStr[i] = 0; lh->fStrToken[i] = NULL; lh->lStrToken[i] = NULL; lh->pos[i] = -1; } if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { lh->pos[0] = 0; t1 = UCOL_RESET_TOP_VALUE; t2 = 0; lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK); lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16; lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24; t1 = UCOL_NEXT_TOP_VALUE; t2 = 0; lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK); lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16; lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24; } else { for(;;) { if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { if((lh->pos[tokStrength] = ucol_inv_getNext(lh, tokStrength)) >= 0) { lh->fStrToken[tokStrength] = tok; } else { /* Error */ fprintf(stderr, "Error! couldn't find the CE!\n"); } } while(tok != NULL && tok->strength >= tokStrength) { if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { lh->lStrToken[tokStrength] = tok; } tok = tok->next; } if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { /* check if previous interval is the same and merge the intervals if it is so */ if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; lh->fStrToken[tokStrength+1] = NULL; lh->lStrToken[tokStrength+1] = NULL; lh->pos[tokStrength+1] = -1; } } if(tok != NULL) { tokStrength = tok->strength; } else { break; } } for(st = 0; st < 3; st++) { if((pos = lh->pos[st]) >= 0) { t1 = *(CETable+3*(pos)); t2 = *(CETable+3*(pos)+1); lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; pos--; t1 = *(CETable+3*(pos)); t2 = *(CETable+3*(pos)+1); lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; lh->gapsLo[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; } } } } #define ucol_countBytes(value, noOfBytes) \ { \ uint32_t mask = 0xFFFFFFFF; \ (noOfBytes) = 0; \ while(mask != 0) { \ if(((value) & mask) != 0) { \ (noOfBytes)++; \ } \ mask >>= 8; \ } \ } U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g) { g->current += (1<<(32-(g->byteSize*8))); if(g->current > g->fLow && g->current < g->fHigh) { g->current = g->fHigh; } return g->current; } static uint32_t fbHigh[3] = {0, /*0,*/UCOL_COMMON_TOP2, 0}; static uint32_t fbLow[3] = {0, /*0,*/UCOL_COMMON_BOT2, 0}; U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, uint32_t low, uint32_t high, UColToken *tok, uint32_t strength) { uint32_t count = tok->toInsert; uint32_t lobytes = 0xFFFFFFFF, hibytes = 0xFFFFFFFF; g->fHigh = fbHigh[strength]; g->fLow = fbLow[strength]; ucol_countBytes(low, lobytes); ucol_countBytes(high, hibytes); g->lowCount = high-low; g->midCount = 0; g->highCount = 0; g->count = count + g->fHigh-g->fLow; g->byteSize = 0xFFFFFFFF; g->start = 0; g->limit = 0; if(g->lowCount >= g->count) { g->byteSize = lobytes; g->start = low; g->limit = high; } else if((g->lowCount)*254 > g->count) { g->byteSize = lobytes+1; g->start = low | (0x02 << (32-g->byteSize*8)); g->limit = high; } g->current = g->start; g->fLow = g->fLow << 24; g->fHigh = g->fHigh << 24; if(g->current > g->fLow && g->current < g->fHigh) { g->current = g->fHigh; } return g->current; } U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, uint32_t strength) { uint32_t low = lows[fStrength*3+strength]; uint32_t high = highs[fStrength*3+strength]; uint32_t count = tok->toInsert; g->fHigh = fbHigh[strength]; g->fLow = fbLow[strength]; uint32_t lobytes = 0, hibytes = 0; ucol_countBytes(low, lobytes); ucol_countBytes(high, hibytes); if(low == high) { #ifdef UCOL_DEBUG fprintf(stderr, "problem? low:%08X is equal to high:%08X at the strength level %i\n", low, high, strength); #endif if(strength > 0 && lows[fStrength*3+(strength-1)] != highs[fStrength*3+(strength-1)]) { low = (0x02 << (32-lobytes*8)); high = (0xFF << (32-hibytes*8)); #ifdef UCOL_DEBUG fprintf(stderr, "resolved! stronger strengths do give solution: %08X != %08X!\n", lows[fStrength*3+(strength-1)], highs[fStrength*3+(strength-1)]); #endif } else { #ifdef UCOL_DEBUG fprintf(stderr, "bad! stronger strengths do not give solution: %08X == %08X!\n", lows[fStrength*3+(strength-1)], highs[fStrength*3+(strength-1)]); #endif } } g->firstLow = low + (1 << (32-lobytes*8)); g->lastHigh = high - (1 << (32-hibytes*8)); if(g->firstLow != g->lastHigh) { if(lobytes > 1) { g->firstMid = g->firstLow + (1 << (32-(lobytes-1)*8)) & (0xFFFFFF00 << (32-lobytes*8)); g->lastLow = g->firstMid - (1 << (32-lobytes*8)); } else if(lobytes < hibytes) { g->lastLow = g->lastHigh - (1 << (32-(hibytes-1)*8)) & (0xFFFFFF00 << (32-hibytes*8)); g->firstMid = g->lastMid = 0; } else { g->lastLow = g->lastHigh; g->firstMid = g->lastMid = 0; } if(hibytes > 1) { g->lastMid = g->lastHigh - (1 << (32-(hibytes-1)*8)) & (0xFFFFFF00 << (32-hibytes*8)); g->firstHigh = g->lastMid + (1 << (32-(hibytes-1)*8)) + (0x02 << (32-(hibytes)*8)); } else if(lobytes > hibytes) { g->firstHigh = g->firstLow + (1 << (32-(lobytes-1)*8)) & (0xFFFFFF00 << (32-lobytes*8)); g->firstMid = g->lastMid = 0; } else { g->firstHigh = g->firstLow; g->firstMid = g->lastMid = 0; } ucol_countBytes(g->lastLow, g->lowByteCount); ucol_countBytes(g->lastMid, g->midByteCount); ucol_countBytes(g->lastHigh, g->highByteCount); if(g->firstLow < low || g->lastLow > high) { g->firstLow = g->lastLow = 0; g->lowByteCount = 0xFFFF; } if(g->firstMid < low || g->lastMid > high) { g->firstMid = g->lastMid = 0; g->midByteCount = 0xFFFF; } if(g->firstHigh < low || g->lastHigh > high) { g->firstHigh = g->lastHigh = 0; g->highByteCount = 0xFFFF; } g->maxCount = g->lowCount = (g->lastLow - g->firstLow) >> (32-g->lowByteCount*8); g->midCount = (g->lastMid - g->firstMid) >> (32-g->midByteCount*8); if(g->midCount > g->maxCount) { g->maxCount = g->midCount; } g->highCount = (g->lastHigh - g->firstHigh) >> (32-g->highByteCount*8); if(g->highCount > g->maxCount) { g->maxCount = g->highCount; } g->count = count; g->byteSize = 0xFFFFFFFF; g->start = 0; g->limit = 0; /* Let's get the best one now */ if(g->maxCount > g->count) { if(g->lowCount > count+(g->fHigh - g->fLow) ) { g->byteSize = g->lowByteCount; g->start = g->firstLow; g->limit = g->lastLow; } if(g->midCount > count+(g->fHigh - g->fLow) && g->midByteCount < g->byteSize) { g->byteSize = g->midByteCount; g->start = g->firstMid; g->limit = g->lastMid; } if(g->highCount > count+(g->fHigh - g->fLow) && g->highByteCount < g->byteSize) { g->byteSize = g->highByteCount; g->start = g->firstHigh; g->limit = g->lastHigh; } } if(g->byteSize == 0xFFFFFFFF && g->maxCount*254 > g->count) { /* Still no solution */ if((g->lowCount)*254 > count+(g->fHigh - g->fLow) ) { g->byteSize = g->lowByteCount+1; g->start = g->firstLow | (0x02 << (32-g->byteSize*8)); g->limit = g->lastLow; } if((g->midCount)*254 > count+(g->fHigh - g->fLow) && g->midByteCount+1 < g->byteSize) { g->byteSize = g->midByteCount+1; g->start = g->firstMid | (0x02 << (32-g->byteSize*8)); g->limit = g->lastMid; } if((g->highCount)*254 > count+(g->fHigh - g->fLow) && g->highByteCount+1 < g->byteSize) { g->byteSize = g->highByteCount+1; g->start = g->firstHigh | (0x02 << (32-g->byteSize*8)); g->limit = g->lastHigh | (0xFF << (32-g->byteSize*8)); } } if(g->byteSize == 0xFFFFFFFF) { /* Still no solution, we need to see if we can do anything about it */ #ifdef UCOL_DEBUG fprintf(stderr, "Too many elements to fit! %08X in %08X, will try splitting\n", g->count, g->maxCount*254); #endif } g->current = g->start; } else { /* only trivial space size 1 */ if(count == 1) { g->byteSize = lobytes; g->current = g->start = g->limit = g->firstLow; } else if(count < 254) { g->byteSize = lobytes+1; g->current = g->start = g->firstLow | (0x02 << (32-g->byteSize*8)); g->limit = g->firstLow | (0xFF << (32-g->byteSize*8)); } else { g->byteSize = lobytes+2; g->current = g->start = g->firstLow | (0x0202 << (32-g->byteSize*8)); g->limit = g->firstLow | (0xFFFF << (32-g->byteSize*8)); } } g->fLow = g->fLow << 24; g->fHigh = g->fHigh << 24; if(g->current > g->fLow && g->current < g->fHigh) { g->current = g->fHigh; } return g->current; } U_CFUNC void ucol_doCE(uint32_t *CEparts, UColToken *tok, UHashtable *tailored, UErrorCode *status) { /* this one makes the table and stuff */ uint32_t noOfBytes[3]; uint32_t i; for(i = 0; i<3; i++) { ucol_countBytes(CEparts[i], noOfBytes[i]); } /* Here we have to pack CEs from parts */ uint32_t CEi = 0; uint32_t value = 0; while(2*CEi 0) { value = 0x80; /* Continuation marker */ } else { value = 0; } if(tok->caseBit == TRUE) { value |= 0x40; } if(2*CEi>(32-16*(CEi+1))) & 0xFFFF) << 16; } if(CEi>(32-8*(CEi+1))) & 0xFF) << 8; } if(CEi>(32-8*(CEi+1))) & 0x3F); } tok->CEs[CEi] = value; CEi++; } if(CEi == 0) { /* totally ignorable */ tok->noOfCEs = 1; tok->CEs[0] = 0; } else { /* there is at least something */ tok->noOfCEs = CEi; } /* We'll need to handle expansions slightly differently than in */ /* UCA generation since we don't know if the value for expansion is from UCA or is it tailored */ uhash_put(tailored, (void *)tok->source, tok, status); /* and add them to a data table */ #if UCOL_DEBUG==2 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); for(i = 0; inoOfCEs; i++) { fprintf(stderr, "%08X ", tok->CEs[i]); } fprintf(stderr, "\n"); #endif } U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErrorCode *status) { ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; uint32_t i = 0; UColToken *tok = lh->last[UCOL_TOK_POLARITY_POSITIVE]; uint32_t t[UCOL_STRENGTH_LIMIT]; for(i=0; itoInsert = 1; t[tok->strength] = 1; while(tok->previous != NULL) { if(tok->previous->strength < tok->strength) { /* going up */ t[tok->strength] = 0; t[tok->previous->strength]++; } else if(tok->previous->strength > tok->strength) { /* going down */ t[tok->previous->strength] = 1; } else { t[tok->strength]++; } tok=tok->previous; tok->toInsert = t[tok->strength]; } tok->toInsert = t[tok->strength]; /* tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; do { fprintf(stderr,"%i", tok->strength); tok = tok->next; } while(tok != NULL); fprintf(stderr, "\n"); tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; do { fprintf(stderr,"%i", tok->toInsert); tok = tok->next; } while(tok != NULL); */ ucol_inv_getGapPositions(lh); #if UCOL_DEBUG fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); int32_t j = 2; for(j = 2; j >= 0; j--) { fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); } #endif /* I strongly believe that this code can be refactored and simplified. */ /* have to do CE generation now, so let this soak a little bit */ tok = lh->first[UCOL_TOK_POLARITY_POSITIVE]; uint32_t fStrength = tok->strength; /* Treat starting identicals */ /* &0 = nula = zero */ if(tok != NULL && fStrength == UCOL_IDENTICAL) { CEparts[0] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; CEparts[1] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; CEparts[2] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; while(tok != NULL && tok->strength == UCOL_IDENTICAL) { ucol_doCE(CEparts, tok, tailored, status); tok = tok->next; } } if(tok != NULL && tok->strength == UCOL_TERTIARY) { /* starting with tertiary */ fStrength = tok->strength; if(lh->pos[fStrength] == -1) { while(lh->pos[fStrength] == -1 && fStrength > 0) { fStrength--; } if(lh->pos[fStrength] == -1) { fprintf(stderr, "OH MY GOD! NO PLACE TO PUT TERTIARIES!\n"); exit(-1); } } CEparts[0] = lh->gapsLo[fStrength*3]; CEparts[1] = lh->gapsLo[fStrength*3+1]; /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo, lh->gapsHi, tok, fStrength, UCOL_TERTIARY); while(tok != NULL && tok->strength >= UCOL_TERTIARY) { ucol_doCE(CEparts, tok, tailored, status); tok = tok->next; /* Treat identicals in starting tertiaries by NOT changing the tertiary value */ if(tok != NULL && tok->strength == UCOL_TERTIARY) { CEparts[2] = ucol_getNextGenerated(&Gens[2]); } } } if(tok != NULL && tok->strength == UCOL_SECONDARY) { /* secondaries */ fStrength = tok->strength; if(lh->pos[1] == -1) { fStrength = 0; if(lh->pos[fStrength] == -1) { fprintf(stderr, "OH MY GOD! NO PLACE TO PUT SECONDARIES!\n"); exit(-1); } } if(tok->next != NULL) { /* Treat identicals in starting secondaries*/ /* &0 [, ] ; = FunnySecZero */ CEparts[0] = lh->gapsLo[fStrength*3]; /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo, lh->gapsHi, tok, fStrength, 1); if(tok->next->strength == UCOL_TERTIARY) { CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[2], 0x03000000, 0xFF000000, tok->next, UCOL_TERTIARY); } else { CEparts[UCOL_TERTIARY] = 0x03000000; } ucol_doCE(CEparts, tok, tailored, status); tok = tok->next; while(tok->next != NULL && tok->next->strength > 0) { if(tok->strength == UCOL_TERTIARY) { CEparts[2] = ucol_getNextGenerated(&Gens[2]); ucol_doCE(CEparts, tok, tailored, status); } else if(tok->strength == UCOL_SECONDARY) { CEparts[1] = ucol_getNextGenerated(&Gens[1]); if(tok->next->strength == UCOL_SECONDARY) { CEparts[UCOL_TERTIARY] = 0x03000000; } else { CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[2], 0x03000000, 0xFF000000, tok->next, UCOL_TERTIARY); } ucol_doCE(CEparts, tok, tailored, status); } else { /* Strength is identical */ ucol_doCE(CEparts, tok, tailored, status); } tok = tok->next; } /* This is the last token in rule */ if(tok->strength == UCOL_TERTIARY) { CEparts[2] = ucol_getNextGenerated(&Gens[2]); } else if(tok->strength == UCOL_SECONDARY) { CEparts[1] = ucol_getNextGenerated(&Gens[1]); CEparts[2] = 0x03000000; } /* if the strength is identical, it will just repeat the last CE value */ ucol_doCE(CEparts, tok, tailored, status); tok = tok->next; } else { /* only one secondary at the end of the rule fragment */ CEparts[0] = lh->gapsLo[fStrength*3]; CEparts[1] = lh->gapsLo[fStrength*3+1]; CEparts[2] = lh->gapsLo[fStrength*3+2]; ucol_doCE(CEparts, tok, tailored, status); tok = NULL; } } /* This is essentialy the main loop. Two loops in front of this one were just for postponing with lower bounding weights */ if(tok != NULL) { /* regular primaries */ fStrength = tok->strength; if(lh->pos[0] == -1) { fprintf(stderr, "OH MY GOD! NO PLACE TO PUT PRIMARIES!\n"); exit(-1); } /* what if the next token is identical??? */ /* How should the things be set up */ if(tok->next != NULL) { /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo, lh->gapsHi, tok, fStrength, UCOL_PRIMARY); if(tok->next->strength == UCOL_PRIMARY) { CEparts[1] = 0x03000000; CEparts[2] = 0x03000000; } else { /* Secondaries will also be generated */ CEparts[1] = ucol_getSimpleCEGenerator(&Gens[1], 0x03000000, 0xFF000000, tok->next, 1); if(tok->next->strength == UCOL_SECONDARY) { CEparts[UCOL_TERTIARY] = 0x03000000; } else { CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[2], 0x03000000, 0xFF000000, tok->next, UCOL_TERTIARY); } } ucol_doCE(CEparts, tok, tailored, status); tok = tok->next; while(tok->next != NULL) { /* Treat identicals*/ /* < 1 = one = jedan < 2 = two = dva < 3 = three = tri ... */ if(tok->strength == UCOL_IDENTICAL) { ucol_doCE(CEparts, tok, tailored, status); } else if(tok->strength == UCOL_TERTIARY) { CEparts[2] = ucol_getNextGenerated(&Gens[2]); ucol_doCE(CEparts, tok, tailored, status); } else if(tok->strength == UCOL_SECONDARY) { CEparts[1] = ucol_getNextGenerated(&Gens[1]); if(tok->next->strength == UCOL_TERTIARY) { CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[2], 0x03000000, 0xFF000000, tok->next, UCOL_TERTIARY); } else { /* UCOL_SECONDARY */ CEparts[UCOL_TERTIARY] = 0x03000000; } ucol_doCE(CEparts, tok, tailored, status); } else { CEparts[0] = ucol_getNextGenerated(&Gens[0]); if(tok->next->strength == UCOL_PRIMARY) { CEparts[1] = 0x03000000; CEparts[UCOL_TERTIARY] = 0x03000000; } else { if(tok->next->strength == UCOL_SECONDARY) { CEparts[UCOL_TERTIARY] = 0x03000000; } else { /* UCOL_TERTIARY */ CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[2], 0x03000000, 0xFF000000, tok->next, UCOL_TERTIARY); } CEparts[1] = ucol_getSimpleCEGenerator(&Gens[1], 0x03000000, 0xFF000000, tok->next, 1); } ucol_doCE(CEparts, tok, tailored, status); } tok = tok->next; } /* OK, there are no next tokens, we just have to wrap up with the last one */ if(tok->strength == UCOL_TERTIARY) { CEparts[2] = ucol_getNextGenerated(&Gens[2]); } else if(tok->strength == UCOL_SECONDARY) { CEparts[1] = ucol_getNextGenerated(&Gens[1]); CEparts[2] = 0x03000000; } else if(tok->strength == UCOL_PRIMARY) { CEparts[0] = ucol_getNextGenerated(&Gens[0]); CEparts[1] = 0x03000000; CEparts[2] = 0x03000000; } /* else it is identical and do nothing */ ucol_doCE(CEparts, tok, tailored, status); } else { /* there is only one primary in this sequence and it ends with it */ CEparts[0] = lh->gapsLo[0]; CEparts[1] = lh->gapsLo[1]; CEparts[2] = lh->gapsLo[2]; ucol_doCE(CEparts, tok, tailored, status); } } } U_CFUNC uint32_t ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { collIterate colIt; uint32_t order; init_collIterate(&u, 1, &colIt, FALSE); order = ucol_getNextCE(coll, &colIt, status); /*UCOL_GETNEXTCE(order, coll, colIt, status);*/ return order; } U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UHashtable *tailored, UErrorCode *status) { UCAElements el; UColToken *tok = lh->first[UCOL_TOK_POLARITY_POSITIVE]; UColToken *expt = NULL; uint32_t i = 0; while(tok != NULL) { /* first, check if there are any expansions */ if(tok->expansion != 0) { if((expt = (UColToken *)uhash_get(tailored, (void *)tok->expansion)) != NULL) { /* expansion is tailored */ /* just copy CEs from tailored token to this one */ for(i = 0; inoOfCEs; i++) { tok->expCEs[i] = expt->CEs[i]; } tok->noOfExpCEs = expt->noOfCEs; } else { /* need to pick it from the UCA */ /* first, get the UChars from the rules */ /* then pick CEs out until there is no more and stuff them into expansion */ UChar source[256],buff[256]; collIterate s; uint32_t order = 0; uint32_t len = tok->expansion >> 24; uprv_memcpy(buff, (tok->expansion & 0x00FFFFFF) + src->source, len*sizeof(UChar)); unorm_normalize(buff, len, UNORM_NFD, 0, source, 256, status); init_collIterate(source, len, &s, FALSE); for(;;) { UCOL_GETNEXTCE(order, src->UCA, s, status); if(order == UCOL_NO_MORE_CES) { break; } tok->expCEs[tok->noOfExpCEs++] = order; } } } else { tok->noOfExpCEs = 0; } /* set the ucaelement with obtained values */ el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; /* copy CEs */ for(i = 0; inoOfCEs; i++) { el.CEs[i] = tok->CEs[i]; } for(i = 0; inoOfExpCEs; i++) { el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; } /* copy UChars */ /* key.source = newCharsLen << 24 | charsOffset; key.expansion = newExtensionsLen << 24 | extensionOffset; */ UChar buff[128]; uint32_t decompSize; uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar)); decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status); /*uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));*/ /* I think I don't want to have expansion chars in chars for UCAelement... HMMM! */ /*uprv_memcpy(el.uchars+(tok->source >> 24), (tok->expansion & 0x00FFFFFF) + src->source, (tok->expansion >> 24)*sizeof(UChar));*/ el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/ el.cPoints = el.uchars; el.caseBit = FALSE; /* how to see if there is case bit - pick it out from the UCA */ if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) { el.isThai = TRUE; } else { el.isThai = FALSE; } /* we also need a case bit here, and we'll fish it out from the UCA for the first codepoint */ uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status); if((caseCE & 0x40) != 0) { el.caseBit = TRUE; /* el.CEs[0] |= 0x40;*/ for(i = 0; inext; } } /* These are some normalizer constants */ #define STR_INDEX_SHIFT 2 //Must agree with the constants used in NormalizerBuilder #define STR_LENGTH_MASK 0x0003 int32_t uprv_ucol_decompose (UChar curChar, UChar *result) { /* either 0 or MAX_COMPAT = 11177 if we want just canonical */ int32_t minDecomp = 11177; int32_t resSize = 0; uint16_t offset = ucmp16_getu(DecompData::offsets, curChar); uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK); if (index > minDecomp) { if ((offset & DecompData::DECOMP_RECURSE) != 0) { // Let Normalizer::decompose() handle recursive decomp UnicodeString temp(curChar); UnicodeString res; UErrorCode status = U_ZERO_ERROR; Normalizer::decompose(temp, minDecomp > 0, /*hangul ? Normalizer::IGNORE_HANGUL : 0,*/ Normalizer::IGNORE_HANGUL, res, status); T_fillOutputParams(&res, result, 356, &resSize, &status); } else { const UChar *source = (const UChar*)&(DecompData::contents); uint16_t ind = (int16_t)(index >> STR_INDEX_SHIFT); uint16_t length = (int16_t)(index & STR_LENGTH_MASK); if (length == 0) { UChar ch; while ((ch = source[ind++]) != 0x0000) { result[resSize++] = ch; } } else { while (length-- > 0) { result[resSize++] = source[ind++]; } } } return resSize; } #if 0 else if (hangul && curChar >= Normalizer::HANGUL_BASE && curChar < Normalizer::HANGUL_LIMIT) { Normalizer::hangulToJamo(curChar, result, (uint16_t)minDecomp); /* this has something to do with jamo hangul, check tomorrow */ } #endif else { /*result += curChar; this doesn't decompose */ return 0; } } uint32_t ucol_getDynamicCEs(UColTokenParser *src, tempUCATable *t, UChar *decomp, uint32_t noOfDec, uint32_t *result, uint32_t resultSize, UErrorCode *status) { uint32_t j = 0, i = 0; uint32_t CE = 0; uint32_t resLen = 0; collIterate colIt; UBool lastNotFound = FALSE; while(jmapping, decomp[j]); if(CE == UCOL_NOT_FOUND || lastNotFound) { /* get it from the UCA */ lastNotFound = FALSE; init_collIterate(decomp+j, 1, &colIt, TRUE); while(CE != UCOL_NO_MORE_CES) { CE = ucol_getNextCE(src->UCA, &colIt, status); if(CE != UCOL_NO_MORE_CES) { result[resLen++] = CE; } } } else if(CE < UCOL_NOT_FOUND) { /*normal CE */ result[resLen++] = CE; } else { /* special CE, contraction, expansion or Thai */ for(;;) { uint32_t tag = getCETag(CE); if(tag == THAI_TAG || tag == EXPANSION_TAG) { uint32_t *CEOffset = t->expansions->CEs+(getExpansionOffset(CE) - (paddedsize(sizeof(UCATableHeader))>>2)); /* find the offset to expansion table */ uint32_t size = getExpansionCount(CE); if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ for(i = 1; icontractions->elements[getContractOffset(CE)]; UChar c = decomp[++j]; /* what if this is already over */ i = 0; while(c > ctb->codePoints[i] && i < ctb->position) { i++; } if(c == ctb->codePoints[i] && jCEs[i]; } else { CE = ctb->CEs[0]; j--; } if(CE == UCOL_NOT_FOUND) { lastNotFound = TRUE; j--; break; } else if(CE > UCOL_NOT_FOUND) { continue; } else { result[resLen++] = CE; break; } } } } if(resLen >= resultSize) { *status = U_MEMORY_ALLOCATION_ERROR; } j++; } return resLen; } UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { uint32_t i = 0; /* 2. Eliminate the negative lists by doing the following for each non-null negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, create new ListHeader X o reverse the list, add to the end of X's positive list. Reset the strength of the first item you add, based on the stronger strength levels of the two lists. */ /* 3. For each ListHeader with a non-null positive list: */ /* o Find all character strings with CEs between the baseCE and the next/previous CE, at the strength of the first token. Add these to the tailoring. ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has & x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... */ /* It is possible that this part should be done even while constructing list */ /* The problem is that it is unknown what is going to be the strongest weight */ /* So we might as well do it here */ /* o Allocate CEs for each token in the list, based on the total number N of the largest level difference, and the gap G between baseCE and nextCE at that level. The relation * between the last item and nextCE is the same as the strongest strength. o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3 primary items: a, d, e. Fit them into the primary gap. Then fit b and c into the secondary gap between a and d, then fit q into the tertiary gap between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q into the tertiary gap between b and c. o When incrementing primary values, we will not cross high byte boundaries except where there is only a single-byte primary. That is to ensure that the script reordering will continue to work. */ UHashtable *tailored = uhash_open(uhash_hashLong, uhash_compareLong, status); for(i = 0; iresultLen; i++) { /* now we need to generate the CEs */ /* We stuff the initial value in the buffers, and increase the appropriate buffer */ /* According to strength */ ucol_initBuffers(&src->lh[i], tailored, status); } tempUCATable *t = uprv_uca_initTempTable(src->image, src->UCA, status); /* After this, we have assigned CE values to all regular CEs */ /* now we will go through list once more and resolve expansions, */ /* make UCAElements structs and add them to table */ for(i = 0; iresultLen; i++) { /* now we need to generate the CEs */ /* We stuff the initial value in the buffers, and increase the appropriate buffer */ /* According to strength */ ucol_createElements(src, t, &src->lh[i], tailored, status); } UCATableHeader *myData = NULL; { UChar decomp[256]; uint32_t noOfDec = 0, i = 0, CE = UCOL_NOT_FOUND; uint32_t u = 0; UCAElements el; el.isThai = FALSE; collIterate colIt; /*uint32_t decompCE[256];*/ uint32_t compCE[256]; uint32_t compRes = 0; /* produce canonical closure */ for(u = 0; u < 0x10000; u++) { /*if((noOfDec = unorm_normalize((const UChar *)&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1 || (noOfDec == 1 && *decomp != (UChar)u))*/ if((noOfDec = uprv_ucol_decompose ((UChar)u, decomp)) > 1 || (noOfDec == 1 && *decomp != (UChar)u)) { compRes = ucol_getDynamicCEs(src, t, (UChar *)&u, 1, compCE, 256, status); el.noOfCEs = ucol_getDynamicCEs(src, t, decomp, noOfDec, el.CEs, 128, status); if((compRes != el.noOfCEs) || (uprv_memcmp(compCE, el.CEs, compRes*sizeof(uint32_t)) != 0)) { el.uchars[0] = (UChar)u; el.cPoints = el.uchars; el.cSize = 1; uprv_uca_addAnElement(t, &el, status); } } } /* still need to produce compatibility closure */ /* add latin-1 stuff */ for(u = 0; u<0x100; u++) { if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND /*) {*/ /* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */ /* the table, even if it results in more false closure elements */ || ((isContraction(CE)) && (uprv_cnttab_getCE(t->contractions, CE, 0, TRUE, status) == UCOL_NOT_FOUND)) ) { decomp[0] = (UChar)u; el.uchars[0] = (UChar)u; el.cPoints = el.uchars; el.cSize = 1; el.noOfCEs = 0; init_collIterate(decomp, 1, &colIt, TRUE); while(CE != UCOL_NO_MORE_CES) { CE = ucol_getNextCE(src->UCA, &colIt, status); /*UCOL_GETNEXTCE(CE, temp, colIt, status);*/ if(CE != UCOL_NO_MORE_CES) { el.CEs[el.noOfCEs++] = CE; } } uprv_uca_addAnElement(t, &el, status); } } } myData = uprv_uca_assembleTable(t, status); uhash_close(tailored); uprv_uca_closeTempTable(t); return myData; } const InverseTableHeader *ucol_initInverseUCA(UErrorCode *status) { if(U_FAILURE(*status)) return NULL; if(invUCA == NULL) { InverseTableHeader *newInvUCA = NULL; /*(InverseTableHeader *)uprv_malloc(sizeof(InverseTableHeader ));*/ UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); if(U_FAILURE(*status)) { udata_close(result); uprv_free(newInvUCA); } if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ newInvUCA = (InverseTableHeader *)udata_getMemory(result); umtx_lock(NULL); if(invUCA == NULL) { invUCA = newInvUCA; newInvUCA = NULL; } umtx_unlock(NULL); if(newInvUCA != NULL) { udata_close(result); uprv_free(newInvUCA); } } } return invUCA; }