ICU-96 fixed jamo special handling, made building more robust

X-SVN-Rev: 4274
This commit is contained in:
Vladimir Weinstein 2001-03-22 18:45:31 +00:00
parent 5735b8a078
commit 5bbb91861f
3 changed files with 165 additions and 35 deletions

View File

@ -245,8 +245,8 @@ ucol_openRules( const UChar *rules,
ucol_setOptionsFromHeader(result, src.image, status);
result->hasRealData = FALSE;
}
result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
if(U_SUCCESS(*status)) {
result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
result->rules = (UChar *)uprv_malloc((u_strlen(rules)+1)*sizeof(UChar));
u_strcpy((UChar *)result->rules, rules);
result->freeRulesOnClose = TRUE;
@ -489,6 +489,8 @@ uint32_t ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
return order;
}
#if 0
/* bogus code, based on the wrong assumption */
void getSpecialJamo(const UCollator *coll, uint32_t CE, uint32_t **buffer) {
for(;;) {
uint32_t tag = getCETag(CE);
@ -535,6 +537,8 @@ void ucol_getJamoCEs(const UCollator *coll, UChar ch, uint32_t **buffer) {
*(*buffer++) = order;
}
#endif
/* This function tries to get a CE from the UCA, which should always be available */
/* The UChar is passed in to speed things up */
/* Implicit CEs are also generated here */
@ -592,12 +596,39 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
return ucmp32_get(UCA->mapping, L); // return first one
} else { // Jamo is Special
collIterate jamos;
UChar jamoString[3];
uint32_t CE = UCOL_NOT_FOUND;
const UCollator *collator = collationSource->coll;
jamoString[0] = L;
jamoString[1] = V;
if (T != TBase) {
jamoString[2] = T;
init_collIterate(collator, jamoString, 3, &jamos, TRUE);
} else {
init_collIterate(collator, jamoString, 2, &jamos, TRUE);
}
CE = ucol_getNextCE(collator, &jamos, status);
while(CE != UCOL_NO_MORE_CES) {
*(collationSource->CEpos++) = CE;
CE = ucol_getNextCE(collator, &jamos, status);
}
return *(collationSource->toReturn++);
/* The code and pseudocode below are bogus - we didn't take into */
/* account that any combo of L,V,T could be */
/* in fact a contraction - we cannot look at them separately */
/*
ucol_getJamoCEs(collationSource->coll, L, &collationSource->CEpos);
ucol_getJamoCEs(collationSource->coll, V, &collationSource->CEpos);
if (T != TBase) {
ucol_getJamoCEs(collationSource->coll, T, &collationSource->CEpos);
}
return *(collationSource->toReturn++);
*/
/*
// do recursive processing of L, V, and T with fetchCE (but T only if not equal to TBase!!)
// Since fetchCE returns a CE, and (potentially) stuffs items into the ce buffer,
@ -718,6 +749,30 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
} else {
collIterate jamos;
UChar jamoString[3];
uint32_t CE = UCOL_NOT_FOUND;
const UCollator *collator = collationSource->coll;
jamoString[0] = L;
jamoString[1] = V;
if (T != TBase) {
jamoString[2] = T;
init_collIterate(collator, jamoString, 3, &jamos, TRUE);
} else {
init_collIterate(collator, jamoString, 2, &jamos, TRUE);
}
CE = ucol_getNextCE(collator, &jamos, status);
while(CE != UCOL_NO_MORE_CES) {
*(collationSource->CEpos++) = CE;
CE = ucol_getNextCE(collator, &jamos, status);
}
collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
/*return *(collationSource->toReturn++);*/
/*
ucol_getJamoCEs(collationSource->coll, L, &collationSource->CEpos);
ucol_getJamoCEs(collationSource->coll, V, &collationSource->CEpos);
if (T != TBase) {
@ -725,6 +780,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
}
collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
*/
/*
Jamo is Special
do recursive processing of L, V, and T with fetchCE (but T only if not
@ -2687,6 +2743,37 @@ static UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
return ((htbyte >> (hash & 7)) & 1) == 1;
}
/*
 * Internal API: reports whether the code unit u is tailored in coll.
 *
 * Returns FALSE when *status already indicates failure, when coll is NULL,
 * or when coll is the UCA collator itself. For u below 0x100 the CE is read
 * from coll->latinOneMapping and the answer is FALSE if it is identical to
 * the UCA's Latin-1 entry; for all other code units the CE comes from
 * coll->mapping. If the CE is a contraction CE, it is replaced by the entry
 * of coll->contractionCEs at the index the contraction start has within
 * coll->contractionIndex (presumably the CE of the starting code unit on
 * its own - confirm against the contraction table layout).
 * The result is TRUE unless the final CE is UCOL_NOT_FOUND.
 */
U_CAPI UBool isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
    uint32_t ce = UCOL_NOT_FOUND;
    const UChar *contractionStart = NULL;

    /* nothing to report on a prior error or without a collator */
    if(!U_SUCCESS(*status) || coll == NULL) {
        return FALSE;
    }

    /* the UCA itself is never tailored */
    if(coll == UCA) {
        return FALSE;
    }

    if(u < 0x100) {
        /* latin-1: same CE as in the UCA table means not tailored */
        ce = coll->latinOneMapping[u];
        if(ce == UCA->latinOneMapping[u]) {
            return FALSE;
        }
    } else {
        /* regular lookup in the collator's own mapping */
        ce = ucmp32_get(coll->mapping, u);
    }

    if(isContraction(ce)) {
        /* step from the contraction CE to the CE stored for its start */
        contractionStart = (UChar *)coll->image + getContractOffset(ce);
        ce = *(coll->contractionCEs + (contractionStart - coll->contractionIndex));
    }

    return (ce == UCOL_NOT_FOUND) ? FALSE : TRUE;
}
/****************************************************************************/
/* Following are the string compare functions */
@ -4029,12 +4116,35 @@ uint32_t ucol_getIncrementalUCA(UChar ch, incrementalContext *collationSource, U
return ucmp32_get(UCA->mapping, L); // return first one
} else { // Jamo is Special
collIterate jamos;
UChar jamoString[3];
uint32_t CE = UCOL_NOT_FOUND;
const UCollator *collator = collationSource->coll;
jamoString[0] = L;
jamoString[1] = V;
if (T != TBase) {
jamoString[2] = T;
init_collIterate(collator, jamoString, 3, &jamos, TRUE);
} else {
init_collIterate(collator, jamoString, 2, &jamos, TRUE);
}
CE = ucol_getNextCE(collator, &jamos, status);
while(CE != UCOL_NO_MORE_CES) {
*(collationSource->CEpos++) = CE;
CE = ucol_getNextCE(collator, &jamos, status);
}
return *(collationSource->toReturn++);
/*
ucol_getJamoCEs(collationSource->coll, L, &collationSource->CEpos);
ucol_getJamoCEs(collationSource->coll, V, &collationSource->CEpos);
if (T != TBase) {
ucol_getJamoCEs(collationSource->coll, T, &collationSource->CEpos);
}
return *(collationSource->toReturn++);
*/
/*
// do recursive processing of L, V, and T with fetchCE (but T only if not equal to TBase!!)

View File

@ -573,6 +573,14 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
el.isThai = FALSE;
}
if(src->UCA != NULL) {
for(i = 0; i<el.cSize; i++) {
if(UCOL_ISJAMO(el.cPoints[i])) {
t->image->jamoSpecial = TRUE;
}
}
}
/* we also need a case bit here, and we'll fish it out from the UCA for the first codepoint */
uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
if((caseCE & 0x40) != 0) {
@ -769,6 +777,9 @@ uint32_t ucol_getDynamicCEs(UColTokenParser *src, tempUCATable *t, UChar *decomp
UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
uint32_t i = 0;
if(U_FAILURE(*status)) {
return NULL;
}
/*
2. Eliminate the negative lists by doing the following for each non-null negative list:
o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
@ -814,7 +825,9 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
/* now we need to generate the CEs */
/* We stuff the initial value in the buffers, and increase the appropriate buffer */
/* According to strength */
ucol_initBuffers(&src->lh[i], tailored, status);
if(U_SUCCESS(*status)) {
ucol_initBuffers(&src->lh[i], tailored, status);
}
}
tempUCATable *t = uprv_uca_initTempTable(src->image, src->UCA, status);
@ -827,7 +840,9 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
/* now we need to generate the CEs */
/* We stuff the initial value in the buffers, and increase the appropriate buffer */
/* According to strength */
ucol_createElements(src, t, &src->lh[i], tailored, status);
if(U_SUCCESS(*status)) {
ucol_createElements(src, t, &src->lh[i], tailored, status);
}
}
UCATableHeader *myData = NULL;
@ -842,21 +857,23 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
uint32_t compCE[256];
uint32_t compRes = 0;
/* produce canonical closure */
for(u = 0; u < 0xFFFF; u++) {
if((noOfDec = unorm_normalize(&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1
|| (noOfDec == 1 && *decomp != (UChar)u))
/*if((noOfDec = uprv_ucol_decompose ((UChar)u, decomp)) > 1 || (noOfDec == 1 && *decomp != (UChar)u))*/
{
compRes = ucol_getDynamicCEs(src, t, (UChar *)&u, 1, compCE, 256, status);
el.noOfCEs = ucol_getDynamicCEs(src, t, decomp, noOfDec, el.CEs, 128, status);
if(U_SUCCESS(*status)) {
/* produce canonical closure */
for(u = 0; u < 0xFFFF; u++) {
if((noOfDec = unorm_normalize(&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1
|| (noOfDec == 1 && *decomp != (UChar)u))
/*if((noOfDec = uprv_ucol_decompose ((UChar)u, decomp)) > 1 || (noOfDec == 1 && *decomp != (UChar)u))*/
{
compRes = ucol_getDynamicCEs(src, t, (UChar *)&u, 1, compCE, 256, status);
el.noOfCEs = ucol_getDynamicCEs(src, t, decomp, noOfDec, el.CEs, 128, status);
if((compRes != el.noOfCEs) || (uprv_memcmp(compCE, el.CEs, compRes*sizeof(uint32_t)) != 0)) {
el.uchars[0] = (UChar)u;
el.cPoints = el.uchars;
el.cSize = 1;
if((compRes != el.noOfCEs) || (uprv_memcmp(compCE, el.CEs, compRes*sizeof(uint32_t)) != 0)) {
el.uchars[0] = (UChar)u;
el.cPoints = el.uchars;
el.cSize = 1;
uprv_uca_addAnElement(t, &el, status);
uprv_uca_addAnElement(t, &el, status);
}
}
}
}
@ -864,27 +881,29 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
/* still need to produce compatibility closure */
/* add latin-1 stuff */
for(u = 0; u<0x100; u++) {
if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND /*) {*/
/* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */
/* the table, even if it results in more false closure elements */
|| ((isContraction(CE)) &&
(uprv_cnttab_getCE(t->contractions, CE, 0, TRUE, status) == UCOL_NOT_FOUND))
) {
decomp[0] = (UChar)u;
el.uchars[0] = (UChar)u;
el.cPoints = el.uchars;
el.cSize = 1;
el.noOfCEs = 0;
init_collIterate(src->UCA, decomp, 1, &colIt, TRUE);
while(CE != UCOL_NO_MORE_CES) {
CE = ucol_getNextCE(src->UCA, &colIt, status);
/*UCOL_GETNEXTCE(CE, temp, colIt, status);*/
if(CE != UCOL_NO_MORE_CES) {
el.CEs[el.noOfCEs++] = CE;
if(U_SUCCESS(*status)) {
for(u = 0; u<0x100; u++) {
if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND /*) {*/
/* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */
/* the table, even if it results in more false closure elements */
|| ((isContraction(CE)) &&
(uprv_cnttab_getCE(t->contractions, CE, 0, TRUE, status) == UCOL_NOT_FOUND))
) {
decomp[0] = (UChar)u;
el.uchars[0] = (UChar)u;
el.cPoints = el.uchars;
el.cSize = 1;
el.noOfCEs = 0;
init_collIterate(src->UCA, decomp, 1, &colIt, TRUE);
while(CE != UCOL_NO_MORE_CES) {
CE = ucol_getNextCE(src->UCA, &colIt, status);
/*UCOL_GETNEXTCE(CE, temp, colIt, status);*/
if(CE != UCOL_NO_MORE_CES) {
el.CEs[el.noOfCEs++] = CE;
}
}
uprv_uca_addAnElement(t, &el, status);
}
uprv_uca_addAnElement(t, &el, status);
}
}
}

View File

@ -550,6 +550,7 @@ int32_t ucol_getIncrementalSpecialCE(const UCollator *coll, uint32_t CE, increme
void ucol_updateInternalState(UCollator *coll);
uint32_t ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status);
U_CAPI char U_EXPORT2 *ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len);
U_CAPI UBool isTailored(const UCollator *coll, const UChar u, UErrorCode *status);
#endif