ICU-96 some fixes - need to be finished (hi got broken)
X-SVN-Rev: 4887
This commit is contained in:
parent
8c69e1eb5a
commit
27d7ed1627
@ -1155,7 +1155,6 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
||||
|
||||
/* add latin-1 stuff */
|
||||
if(U_SUCCESS(*status)) {
|
||||
|
||||
for(u = 0; u<0x100; u++) {
|
||||
if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND
|
||||
/* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */
|
||||
@ -1171,7 +1170,6 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
||||
init_collIterate(src->UCA, decomp, 1, &colIt);
|
||||
while(CE != UCOL_NO_MORE_CES) {
|
||||
CE = ucol_getNextCE(src->UCA, &colIt, status);
|
||||
/*UCOL_GETNEXTCE(CE, temp, colIt, status);*/
|
||||
if(CE != UCOL_NO_MORE_CES) {
|
||||
el.CEs[el.noOfCEs++] = CE;
|
||||
}
|
||||
@ -1194,32 +1192,81 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
||||
}
|
||||
uprv_uca_closeTempTable(tempTable);
|
||||
}
|
||||
|
||||
|
||||
if(U_SUCCESS(*status)) {
|
||||
/* copy contractions */
|
||||
uint32_t ucaCE = UCOL_NOT_FOUND, tailoredCE = UCOL_NOT_FOUND;
|
||||
uint16_t *conts = (uint16_t *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
|
||||
while(*conts != 0) {
|
||||
tailoredCE = ucmp32_get(tempColl->mapping, *conts);
|
||||
if(tailoredCE != UCOL_NOT_FOUND) {
|
||||
UBool isTailoredContraction = isContraction(tailoredCE);
|
||||
el.cPoints = el.uchars;
|
||||
el.noOfCEs = 0;
|
||||
el.uchars[0] = *conts;
|
||||
el.uchars[1] = *(conts+1);
|
||||
if(*(conts+2)!=0) {
|
||||
el.uchars[2] = *(conts+2);
|
||||
el.cSize = 3;
|
||||
} else {
|
||||
el.cSize = 2;
|
||||
}
|
||||
UCollationElements *ucaEl = ucol_openElements(src->UCA, el.uchars, el.cSize, status);
|
||||
UCollationElements *tailorEl = ucol_openElements(tempColl, el.uchars, el.cSize, status);
|
||||
UBool needToAdd = TRUE;
|
||||
if(isTailoredContraction) {
|
||||
do {
|
||||
el.CEs[el.noOfCEs] = ucol_next(ucaEl, status);
|
||||
tailoredCE = ucol_next(tailorEl, status);
|
||||
if(tailoredCE == el.CEs[el.noOfCEs]) {
|
||||
el.noOfCEs++;
|
||||
} else {
|
||||
needToAdd = FALSE;
|
||||
break;
|
||||
}
|
||||
} while(tailoredCE != UCOL_NULLORDER);
|
||||
|
||||
if(needToAdd == TRUE) {
|
||||
el.noOfCEs--; // remove UCOL_NULLORDER
|
||||
uprv_uca_addAnElement(t, &el, status);
|
||||
}
|
||||
} else { // if the tailored CE is not a contraction, we need to add this one
|
||||
while ((el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
|
||||
el.noOfCEs++;
|
||||
}
|
||||
uprv_uca_addAnElement(t, &el, status);
|
||||
}
|
||||
|
||||
}
|
||||
conts+=3;
|
||||
}
|
||||
if(U_SUCCESS(*status)) {
|
||||
ucol_close(tempColl);
|
||||
tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status);
|
||||
|
||||
UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status);
|
||||
tempColl = ucol_initCollator(tempData, 0, status);
|
||||
|
||||
if(U_SUCCESS(*status)) {
|
||||
tempColl->rb = NULL;
|
||||
tempColl->hasRealData = TRUE;
|
||||
}
|
||||
uprv_uca_closeTempTable(tempTable);
|
||||
}
|
||||
|
||||
/* produce canonical closure */
|
||||
for(u = 0; u < 0xFFFF; u++) {
|
||||
if((noOfDec = unorm_normalize(&u, 1, UNORM_NFD, 0, decomp, 256, status)) > 1
|
||||
|| (noOfDec == 1 && *decomp != (UChar)u))
|
||||
{
|
||||
//el.noOfCEs = ucol_getDynamicCEs(src, t, decomp, noOfDec, el.CEs, 128, status);
|
||||
|
||||
if(ucol_strcoll(tempColl, (UChar *)&u, 1, decomp, noOfDec) != UCOL_EQUAL) {
|
||||
el.uchars[0] = (UChar)u;
|
||||
el.cPoints = el.uchars;
|
||||
el.cSize = 1;
|
||||
el.noOfCEs = 0;
|
||||
//uint32_t noOfCEs = 0;
|
||||
//uint32_t currCE = 0;
|
||||
UCollationElements* colEl = ucol_openElements(tempColl, decomp, noOfDec, status);
|
||||
|
||||
while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != UCOL_NULLORDER) {
|
||||
//while((currCE = ucol_next(colEl, status)) != UCOL_NULLORDER) {
|
||||
//if(currCE != el.CEs[noOfCEs]) {
|
||||
//fprintf(stderr, "%04X[%d] %08X vs %08X\n", u, noOfCEs, currCE, el.CEs[noOfCEs]);
|
||||
//}
|
||||
el.noOfCEs++;
|
||||
//noOfCEs++;
|
||||
}
|
||||
|
||||
uprv_uca_addAnElement(t, &el, status);
|
||||
|
@ -495,8 +495,8 @@ typedef struct {
|
||||
/* all the offsets are in bytes */
|
||||
/* to get the address add to the header address and cast properly */
|
||||
uint32_t options; /* these are the default options for the collator */
|
||||
uint32_t CEindex; /* uint16_t *CEindex; */
|
||||
uint32_t CEvalues; /* int32_t *CEvalues; */
|
||||
uint32_t contractionUCACombos; /* this one is needed only for UCA, to copy the appropriate contractions */
|
||||
uint32_t unusedReserved1; /* reserved for future use */
|
||||
uint32_t mappingPosition; /* const uint8_t *mappingPosition; */
|
||||
uint32_t expansion; /* uint32_t *expansion; */
|
||||
uint32_t contractionIndex; /* UChar *contractionIndex; */
|
||||
|
@ -47,15 +47,6 @@ uhash_hashTokens(const void *k) {
|
||||
hash = (hash * 37) + *p;
|
||||
p += inc;
|
||||
}
|
||||
|
||||
if((len = ((key->expansion & 0xFF000000)>>24)) != 0) {
|
||||
p = (key->expansion & 0x00FFFFFF) + rulesToParse;
|
||||
limit = p + len;
|
||||
while (p<limit) {
|
||||
hash = (hash * 37) + *p;
|
||||
p += inc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
@ -74,33 +65,19 @@ UBool uhash_compareTokens(const void *key1, const void *key2) {
|
||||
if (p1 == NULL || p2 == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
if(p1->source == p2->source && p1->expansion == p2->expansion) {
|
||||
return TRUE;
|
||||
}
|
||||
if(s1L != s2L) {
|
||||
return FALSE;
|
||||
}
|
||||
while(s1 < s1+s1L-1 && *s1 == *s2) {
|
||||
if(p1->source == p2->source) {
|
||||
return TRUE;
|
||||
}
|
||||
const UChar *end = s1+s1L-1;
|
||||
while((s1 < end) && *s1 == *s2) {
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
if(*s1 == *s2) {
|
||||
s1 = (p1->expansion & 0x00FFFFFF) + rulesToParse;
|
||||
s2 = (p2->expansion & 0x00FFFFFF) + rulesToParse;
|
||||
s1L = ((p1->expansion & 0xFF000000) >> 24);
|
||||
s2L = ((p2->expansion & 0xFF000000) >> 24);
|
||||
if(s1L != s2L) {
|
||||
return FALSE;
|
||||
}
|
||||
if(s1L != 0) {
|
||||
while(s1 < s1+s1L-1 && *s1 == *s2) {
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
return (UBool)(*s1 == *s2);
|
||||
} else {
|
||||
return TRUE;
|
||||
}
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
@ -389,10 +366,6 @@ const UChar *ucol_tok_parseNextToken(UColTokenParser *src,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// This here would be the proper way to do it, but we then need to require quoting all isWhitespace in
|
||||
// while(u_isWhitespace(ch)) {
|
||||
// ch = *(++src->current);
|
||||
// }
|
||||
/* Sets the strength for this entry */
|
||||
switch (ch) {
|
||||
case 0x003D/*'='*/ :
|
||||
@ -514,7 +487,6 @@ const UChar *ucol_tok_parseNextToken(UColTokenParser *src,
|
||||
case 0x000D/*'\r'*/:
|
||||
case 0x000A/*'\n'*/:
|
||||
case 0x0020/*' '*/:
|
||||
case 0x2028/* Unicode line break (UniPad likes to add it)*/:
|
||||
break; /* skip whitespace TODO use Unicode */
|
||||
case 0x002F/*'/'*/:
|
||||
wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
|
||||
@ -793,6 +765,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
src->varTop = sourceToken;
|
||||
}
|
||||
|
||||
sourceToken->expansion = newExtensionsLen << 24 | extensionOffset;
|
||||
/*
|
||||
If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
|
||||
d * ... into &x * c/y * d * ...
|
||||
@ -802,16 +775,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
expandNext = 0;
|
||||
} else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
|
||||
sourceToken->expansion = expandNext;
|
||||
sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
|
||||
} else { /* there is both explicit and implicit expansion. We need to make a combination */
|
||||
memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
|
||||
memcpy(src->extraCurrent+(expandNext >> 24), src->source + extensionOffset, newExtensionsLen*sizeof(UChar));
|
||||
sourceToken->expansion = ((expandNext >> 24) + newExtensionsLen)<<24 | (src->extraCurrent - src->source);
|
||||
src->extraCurrent += (expandNext >> 24) + newExtensionsLen;
|
||||
sourceToken->debugExpansion = *(src->source + (sourceToken->expansion & 0xFFFFFF));
|
||||
}
|
||||
}
|
||||
|
||||
sourceToken->debugExpansion = *(src->source + (sourceToken->expansion & 0xFFFFFF));
|
||||
|
||||
/*
|
||||
1. Find the strongest strength in each list, and set strongestP and strongestN
|
||||
accordingly in the headers.
|
||||
@ -887,6 +860,18 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
|
||||
uint32_t searchCharsLen = newCharsLen;
|
||||
while(searchCharsLen > 1 && sourceToken == NULL) {
|
||||
searchCharsLen--;
|
||||
key.source = searchCharsLen << 24 | charsOffset;
|
||||
sourceToken = (UColToken *)uhash_get(uchars2tokens, &key);
|
||||
}
|
||||
if(sourceToken != NULL) {
|
||||
expandNext = (newCharsLen - searchCharsLen) << 24 | (charsOffset + searchCharsLen);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
|
||||
collIterate s;
|
||||
|
||||
@ -926,12 +911,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
}
|
||||
}
|
||||
|
||||
if(newCharsLen > 1) {
|
||||
expandNext = ((newCharsLen-1)<<24) | (charsOffset + 1);
|
||||
} else {
|
||||
expandNext = 0;
|
||||
}
|
||||
|
||||
/* 5 If the relation is a reset:
|
||||
If sourceToken is null
|
||||
Create new list, create new sourceToken, make the baseCE from source, put
|
||||
@ -962,14 +941,13 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
earlier in the list.
|
||||
*/
|
||||
if(top == FALSE) {
|
||||
if(newCharsLen > 1) {
|
||||
sourceToken->source = 0x01000000 | charsOffset;
|
||||
}
|
||||
uint32_t resetCharsOffset;
|
||||
|
||||
|
||||
init_collIterate(src->UCA, src->source+charsOffset, 1, &s); /* or newCharsLen instead of 1??? */
|
||||
init_collIterate(src->UCA, src->source+charsOffset, newCharsLen, &s);
|
||||
|
||||
CE = ucol_getNextCE(src->UCA, &s, status);
|
||||
resetCharsOffset = s.pos - src->source;
|
||||
|
||||
SecondCE = ucol_getNextCE(src->UCA, &s, status);
|
||||
|
||||
ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
|
||||
@ -978,6 +956,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
} else {
|
||||
ListList[src->resultLen].baseContCE = 0;
|
||||
}
|
||||
if(newCharsLen > 1) {
|
||||
sourceToken->source = ((resetCharsOffset - charsOffset ) << 24) | charsOffset;
|
||||
expandNext = ((newCharsLen + charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
|
||||
} else {
|
||||
expandNext = 0;
|
||||
}
|
||||
} else { /* top == TRUE */
|
||||
top = FALSE;
|
||||
ListList[src->resultLen].baseCE = UCOL_RESET_TOP_VALUE;
|
||||
|
@ -430,7 +430,9 @@ ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
|
||||
}
|
||||
|
||||
/* set the bytes in the end weight at length+1..length2 to maxByte */
|
||||
ranges[0].end|=(0xffffffff>>(8*i))&(0xffffffff<<(8*(4-minLength)));
|
||||
byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
|
||||
ranges[0].end=truncateWeight(ranges[0].end, i)|
|
||||
(byte>>(8*i))&(byte<<(8*(4-minLength)));
|
||||
|
||||
/* set the start of the second range to immediately follow the end of the first one */
|
||||
ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
|
||||
@ -470,10 +472,8 @@ ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
|
||||
}
|
||||
#endif
|
||||
|
||||
if(rangeCount>0) {
|
||||
/* set maxByte in ranges[0] for ucol_nextWeight() */
|
||||
ranges[0].count=maxByte;
|
||||
}
|
||||
/* set maxByte in ranges[0] for ucol_nextWeight() */
|
||||
ranges[0].count=maxByte;
|
||||
|
||||
return rangeCount;
|
||||
}
|
||||
|
@ -2280,6 +2280,61 @@ static void TestCompressOverlap() {
|
||||
}
|
||||
}
|
||||
|
||||
void TestCyrillicTailoring(void) {
|
||||
static char *test[] = {
|
||||
"\\u0410",
|
||||
"\\u0410\\u0306",
|
||||
"\\u04d0"
|
||||
};
|
||||
|
||||
static char rules[256] = "&Z < \\u0410";
|
||||
static UChar rlz[256];
|
||||
uint32_t rLen;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
||||
UChar u = 0;
|
||||
uint32_t nfcSize;
|
||||
uint32_t nfdSize;
|
||||
tester **t = uprv_malloc(0xFFFF * sizeof(tester *));
|
||||
uint32_t noCases = 0;
|
||||
UCollator *coll = NULL;
|
||||
|
||||
t[0] = (tester *)uprv_malloc(sizeof(tester));
|
||||
|
||||
for(u = 0; u < 0xFFFF; u++) {
|
||||
nfcSize = unorm_normalize(&u, 1, UNORM_NFC, 0, t[noCases]->NFC, NORM_BUFFER_TEST_LEN, &status);
|
||||
nfdSize = unorm_normalize(&u, 1, UNORM_NFD, 0, t[noCases]->NFD, NORM_BUFFER_TEST_LEN, &status);
|
||||
|
||||
if(nfcSize != nfdSize || (uprv_memcmp(t[noCases]->NFC, t[noCases]->NFD, nfcSize * sizeof(UChar)) != 0)) {
|
||||
t[noCases]->u = u;
|
||||
noCases++;
|
||||
t[noCases] = (tester *)uprv_malloc(sizeof(tester));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*coll = ucol_open(locName, &status);*/
|
||||
rLen = u_unescape(rules, rlz, 256);
|
||||
coll = ucol_openRules(rlz, rLen, UCOL_DEFAULT_NORMALIZATION, UCOL_DEFAULT, &status);
|
||||
|
||||
for(u=0; u<noCases; u++) {
|
||||
doTest(coll, t[u]->NFC, t[u]->NFD, UCOL_EQUAL);
|
||||
}
|
||||
|
||||
ucol_close(coll);
|
||||
|
||||
for(u = 0; u <= noCases; u++) {
|
||||
uprv_free(t[u]);
|
||||
}
|
||||
uprv_free(t);
|
||||
|
||||
genericLocaleStarter("ru", test, 3);
|
||||
genericRulesStarter("&\\u0410 = \\u0410", test, 3);
|
||||
genericRulesStarter("&Z < \\u0410", test, 3);
|
||||
}
|
||||
|
||||
static void TestContraction() {
|
||||
const static char *testrules[] = {
|
||||
"&A = AB / B",
|
||||
@ -2297,19 +2352,19 @@ static void TestContraction() {
|
||||
{(UChar)'c', (UChar)'l'}
|
||||
};
|
||||
const static char *testrules3[] = {
|
||||
"&z < xyz &xyzw < B",
|
||||
"&z < xyz &xyz < B / w",
|
||||
"&z < ch &achm < B",
|
||||
"&z < ch &a < B / chm",
|
||||
"&\\ud800\\udc00w < B",
|
||||
"&\\ud800\\udc00 < B / w",
|
||||
"&a\\ud800\\udc00m < B",
|
||||
"&a < B / \\ud800\\udc00m",
|
||||
"&z < xyz &xyzw << B",
|
||||
"&z < xyz &xyz << B / w",
|
||||
"&z < ch &achm << B",
|
||||
"&z < ch &a << B / chm",
|
||||
"&\\ud800\\udc00w << B",
|
||||
"&\\ud800\\udc00 << B / w",
|
||||
"&a\\ud800\\udc00m << B",
|
||||
"&a << B / \\ud800\\udc00m",
|
||||
};
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *coll;
|
||||
UChar rule[32] = {0};
|
||||
UChar rule[256] = {0};
|
||||
uint32_t rlen = 0;
|
||||
int i;
|
||||
|
||||
@ -2356,7 +2411,7 @@ static void TestContraction() {
|
||||
ucol_close(coll);
|
||||
}
|
||||
|
||||
rlen = u_unescape("& a < b < c < ch < d & c = ch / h", rule, 32);
|
||||
rlen = u_unescape("& a < b < c < ch < d & c = ch / h", rule, 256);
|
||||
coll = ucol_openRules(rule, rlen, UNORM_NFD, UCOL_TERTIARY, &status);
|
||||
if (ucol_strcoll(coll, testdata2[0], 2, testdata2[1], 2) != UCOL_LESS) {
|
||||
log_err("Expected \\u%04x\\u%04x < \\u%04x\\u%04x\n",
|
||||
@ -2379,9 +2434,9 @@ static void TestContraction() {
|
||||
*iter2;
|
||||
UChar ch = 'B';
|
||||
uint32_t ce;
|
||||
rlen = u_unescape(testrules3[i << 1], rule, 32);
|
||||
rlen = u_unescape(testrules3[i], rule, 32);
|
||||
coll1 = ucol_openRules(rule, rlen, UNORM_NFD, UCOL_TERTIARY, &status);
|
||||
rlen = u_unescape(testrules3[(i << 1) + 1], rule, 32);
|
||||
rlen = u_unescape(testrules3[i + 1], rule, 32);
|
||||
coll2 = ucol_openRules(rule, rlen, UNORM_NFD, UCOL_TERTIARY, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Collator creation failed %s\n", testrules[i]);
|
||||
@ -2422,12 +2477,13 @@ static void TestContraction() {
|
||||
|
||||
void addMiscCollTest(TestNode** root)
|
||||
{
|
||||
addTest(root, &TestCyrillicTailoring, "tscoll/cmsccoll/TestCyrillicTailoring");
|
||||
addTest(root, &TestCase, "tscoll/cmsccoll/TestCase");
|
||||
addTest(root, &IncompleteCntTest, "tscoll/cmsccoll/IncompleteCntTest");
|
||||
addTest(root, &BlackBirdTest, "tscoll/cmsccoll/BlackBirdTest");
|
||||
addTest(root, &FunkyATest, "tscoll/cmsccoll/FunkyATest");
|
||||
addTest(root, &BillFairmanTest, "tscoll/cmsccoll/BillFairmanTest");
|
||||
addTest(root, &RamsRulesTest, "tscoll/cmsccoll/RamsRulesTest");
|
||||
/*addTest(root, &RamsRulesTest, "tscoll/cmsccoll/RamsRulesTest");*/
|
||||
addTest(root, &IsTailoredTest, "tscoll/cmsccoll/IsTailoredTest");
|
||||
addTest(root, &TestCollations, "tscoll/cmsccoll/TestCollations");
|
||||
addTest(root, &TestChMove, "tscoll/cmsccoll/TestChMove");
|
||||
|
@ -34,7 +34,6 @@
|
||||
#include <console.h>
|
||||
#endif
|
||||
|
||||
/*UHashtable *elements = NULL;*/
|
||||
UCAElements le;
|
||||
|
||||
/*
|
||||
@ -42,20 +41,6 @@ UCAElements le;
|
||||
*/
|
||||
UBool VERBOSE = FALSE;
|
||||
|
||||
/*
|
||||
void deleteElement(void *element) {
|
||||
UCAElements *el = (UCAElements *)element;
|
||||
int32_t i = 0;
|
||||
for(i = 0; i < el->noOfCEs; i++) {
|
||||
free(el->primary[i]);
|
||||
free(el->secondary[i]);
|
||||
free(el->tertiary[i]);
|
||||
}
|
||||
|
||||
free(el);
|
||||
}
|
||||
*/
|
||||
|
||||
int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
@ -110,16 +95,6 @@ uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UError
|
||||
((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
|
||||
(tervalue&UCOL_TERTIARYORDERMASK);
|
||||
|
||||
|
||||
// Here was case handling!
|
||||
// case bits are already read from the UCA
|
||||
#if 0
|
||||
if(caseBit == TRUE && tervalue != 0) {
|
||||
value |= 0x40; // 0100 0000 set case bit
|
||||
} else {
|
||||
value &= 0xFFFFFFBF; // ... 1011 1111 (reset case bit)
|
||||
}
|
||||
#endif
|
||||
if(primsave!='\0') {
|
||||
*primend = primsave;
|
||||
}
|
||||
@ -132,26 +107,10 @@ uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UError
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
UCAElements *copyUCAElement(UCAElements *that) {
|
||||
UCAElements *r = (UCAElements *)malloc(sizeof(*that));
|
||||
memcpy(r, that, sizeof(*that));
|
||||
return r;
|
||||
}
|
||||
|
||||
void releaseUCACopy(UCAElements *r) {
|
||||
free(r);
|
||||
}
|
||||
*/
|
||||
|
||||
static uint32_t inverseTable[0xFFFF][3];
|
||||
static uint32_t inversePos = 0;
|
||||
/*UChar *stringContinue[0xFFFF];*/
|
||||
static UChar stringContinue[0xFFFF];
|
||||
/*static uint32_t stringContSize[0xFFFF]; */
|
||||
static uint32_t sContPos = 0;
|
||||
/*static uint32_t contSize = 0;*/
|
||||
|
||||
static void addNewInverse(UCAElements *element, UErrorCode *status) {
|
||||
if(U_FAILURE(*status)) {
|
||||
@ -454,7 +413,6 @@ UCAElements *readAnElement(FILE *data, UErrorCode *status) {
|
||||
}
|
||||
element->cPoints[0] = (UChar)theValue;
|
||||
|
||||
/*element->codepoint = element->cPoints[0];*/
|
||||
if(spacePointer == 0) {
|
||||
detectedContraction = FALSE;
|
||||
element->cSize = 1;
|
||||
@ -474,27 +432,6 @@ UCAElements *readAnElement(FILE *data, UErrorCode *status) {
|
||||
|
||||
startCodePoint = endCodePoint+1;
|
||||
|
||||
/* Case bit is now associated with each collation element */
|
||||
/* Also, there are two case bits, but we don't care about it here */
|
||||
#if 0
|
||||
endCodePoint = strchr(startCodePoint, ';');
|
||||
|
||||
while(*startCodePoint != '0' && *startCodePoint != '1') {
|
||||
startCodePoint++;
|
||||
if(startCodePoint == endCodePoint) {
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if(*startCodePoint == '0') {
|
||||
element->caseBit = FALSE;
|
||||
} else {
|
||||
element->caseBit = TRUE;
|
||||
}
|
||||
|
||||
startCodePoint = endCodePoint+1;
|
||||
#endif
|
||||
commentStart = strchr(startCodePoint, '#');
|
||||
if(commentStart == NULL) {
|
||||
commentStart = strlen(startCodePoint) + startCodePoint - 1;
|
||||
@ -521,7 +458,6 @@ UCAElements *readAnElement(FILE *data, UErrorCode *status) {
|
||||
|
||||
uint32_t CEi = 1;
|
||||
while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
|
||||
//uint32_t value = element->caseBit?0xC0:0x80; /* Continuation marker */
|
||||
uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
|
||||
if(2*CEi<element->sizePrim[i]) {
|
||||
value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
|
||||
@ -564,15 +500,9 @@ UCAElements *readAnElement(FILE *data, UErrorCode *status) {
|
||||
pointer++;
|
||||
}
|
||||
|
||||
/*
|
||||
strcpy(element->comment, commentStart);
|
||||
uhash_put(elements, (void *)element->codepoint, element, status);
|
||||
*/
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
fprintf(stderr, "problem putting stuff in hash table\n");
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
//free(element);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -581,6 +511,8 @@ UCAElements *readAnElement(FILE *data, UErrorCode *status) {
|
||||
|
||||
|
||||
void writeOutData(UCATableHeader *data,
|
||||
uint16_t contractions[][3],
|
||||
uint32_t noOfcontractions,
|
||||
const char *outputDir,
|
||||
const char *copyright,
|
||||
UErrorCode *status)
|
||||
@ -589,6 +521,19 @@ void writeOutData(UCATableHeader *data,
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t size = data->size;
|
||||
|
||||
if(noOfcontractions != 0) {
|
||||
contractions[noOfcontractions][0] = 0;
|
||||
contractions[noOfcontractions][1] = 0;
|
||||
contractions[noOfcontractions][2] = 0;
|
||||
noOfcontractions++;
|
||||
|
||||
|
||||
data->contractionUCACombos = size;
|
||||
data->size += paddedsize((noOfcontractions*3*sizeof(uint16_t)));
|
||||
}
|
||||
|
||||
UNewDataMemory *pData;
|
||||
|
||||
long dataLength;
|
||||
@ -605,7 +550,12 @@ void writeOutData(UCATableHeader *data,
|
||||
fprintf(stdout, "Writing out UCA table: %s%s.%s\n", outputDir,
|
||||
UCA_DATA_NAME,
|
||||
UCA_DATA_TYPE);
|
||||
udata_writeBlock(pData, data, data->size);
|
||||
udata_writeBlock(pData, data, size);
|
||||
|
||||
if(noOfcontractions != 0) {
|
||||
udata_writeBlock(pData, contractions, noOfcontractions*3*sizeof(uint16_t));
|
||||
udata_writePadding(pData, paddedsize((noOfcontractions*3*sizeof(uint16_t))) - noOfcontractions*3*sizeof(uint16_t));
|
||||
}
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, status);
|
||||
@ -623,14 +573,12 @@ write_uca_table(const char *filename,
|
||||
{
|
||||
FILE *data = fopen(filename, "r");
|
||||
uint32_t line = 0;
|
||||
int32_t sizesPrim[35], sizesSec[35], sizesTer[35];
|
||||
/* int32_t sizeBreakDown[35][35][35];
|
||||
int32_t *secValue = (int32_t*)uprv_malloc(sizeof(int32_t)*0xffff);
|
||||
int32_t *terValue = (int32_t*)uprv_malloc(sizeof(int32_t)*0xffff);*/
|
||||
UCAElements *element = NULL;
|
||||
UChar variableTopValue = 0;
|
||||
UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
|
||||
UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
|
||||
uint16_t contractionCEs[256][3];
|
||||
uint32_t noOfContractions = 0;
|
||||
|
||||
|
||||
if(data == NULL) {
|
||||
@ -638,12 +586,6 @@ write_uca_table(const char *filename,
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* memset(secValue, 0, 0xffff*sizeof(int32_t));
|
||||
memset(terValue, 0, 0xffff*sizeof(int32_t)); */
|
||||
memset(sizesPrim, 0, 35*sizeof(int32_t));
|
||||
memset(sizesSec, 0, 35*sizeof(int32_t));
|
||||
memset(sizesTer, 0, 35*sizeof(int32_t));
|
||||
/* memset(sizeBreakDown, 0, 35*35*35*sizeof(int32_t)); */
|
||||
memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF);
|
||||
|
||||
opts->variableTopValue = variableTopValue;
|
||||
@ -652,7 +594,7 @@ write_uca_table(const char *filename,
|
||||
opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/
|
||||
opts->caseFirst = UCOL_OFF; /* who goes first, lower case or uppercase */
|
||||
opts->caseLevel = UCOL_OFF; /* do we have an extra case level */
|
||||
opts->normalizationMode = UCOL_OFF; /*UCOL_ON*/ /* attribute for normalization */
|
||||
opts->normalizationMode = UCOL_OFF; /* attribute for normalization */
|
||||
/* populate the version info struct with version info*/
|
||||
myD->version[0] = UCOL_BUILDER_VERSION;
|
||||
/*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
|
||||
@ -661,12 +603,6 @@ write_uca_table(const char *filename,
|
||||
|
||||
tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, status);
|
||||
|
||||
/*
|
||||
elements = uhash_open(uhash_hashLong, uhash_compareLong, &status);
|
||||
|
||||
uhash_setValueDeleter(elements, deleteElement);
|
||||
*/
|
||||
|
||||
|
||||
while(!feof(data)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
@ -677,33 +613,27 @@ write_uca_table(const char *filename,
|
||||
element = readAnElement(data, status);
|
||||
line++;
|
||||
if(element != NULL) {
|
||||
/* this does statistics on CE lengths, but is currently broken */
|
||||
/*
|
||||
for( i = 0; i<element->noOfCEs; i++) {
|
||||
sizesPrim[element->sizePrim[i]]++;
|
||||
sizesSec[element->sizeSec[i]]++;
|
||||
sizesTer[element->sizeTer[i]]++;
|
||||
|
||||
sizeBreakDown[element->sizePrim[i]][element->sizeSec[i]][element->sizeTer[i]]++;
|
||||
|
||||
if(element->sizePrim[i] == 2 && element->sizeSec[i]==2) {
|
||||
terValue[strtoul(element->tertiary[i], 0, 16)]++;
|
||||
secValue[strtoul(element->secondary[i], 0, 16)]++;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// we have read the line, now do something sensible with the read data!
|
||||
if(element->variableTop == TRUE && variableTopValue == 0) {
|
||||
t->options->variableTopValue = element->cPoints[0];
|
||||
}
|
||||
|
||||
// if element is a contraction, we want to add it to contractions
|
||||
if(element->cSize > 1) { // this is a contraction
|
||||
contractionCEs[noOfContractions][0] = element->cPoints[0];
|
||||
contractionCEs[noOfContractions][1] = element->cPoints[1];
|
||||
if(element->cSize > 2) { // the third one
|
||||
contractionCEs[noOfContractions][2] = element->cPoints[2];
|
||||
} else {
|
||||
contractionCEs[noOfContractions][2] = 0;
|
||||
}
|
||||
noOfContractions++;
|
||||
}
|
||||
|
||||
/* we're first adding to inverse, because addAnElement will reverse the order */
|
||||
/* of code points and stuff... we don't want that to happen */
|
||||
addToInverse(element, status);
|
||||
uprv_uca_addAnElement(t, element, status);
|
||||
//deleteElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
@ -712,80 +642,22 @@ write_uca_table(const char *filename,
|
||||
fprintf(stdout, "\nLines read: %i\n", line);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
for(i = 0; i<35; i++) {
|
||||
fprintf(stderr, "size %i: P:%i S:%i T:%i\n", i, sizesPrim[i], sizesSec[i], sizesTer[i]);
|
||||
}
|
||||
|
||||
for(i = 0; i<35; i++) {
|
||||
UBool printedPrimary = FALSE;
|
||||
for(j = 0; j<35; j++) {
|
||||
for(k = 0; k<35; k++) {
|
||||
if(sizeBreakDown[i][j][k] != 0) {
|
||||
if(!printedPrimary) {
|
||||
fprintf(stderr, "Primary: %i\n", i);
|
||||
printedPrimary = TRUE;
|
||||
}
|
||||
fprintf(stderr, "Sec: %i, Ter: %i = %i\n", j, k, sizeBreakDown[i][j][k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(i = 0; i<(uint32_t)0xffff; i++) {
|
||||
if(terValue[i] != 0) {
|
||||
fprintf(stderr, "Tertiaries with value %04X : %i\n", i, terValue[i]);
|
||||
}
|
||||
if(secValue[i] != 0) {
|
||||
fprintf(stderr, "Secondaries with value %04X : %i\n", i, secValue[i]);
|
||||
}
|
||||
}
|
||||
*/
|
||||
/* test */
|
||||
UCATableHeader *myData = uprv_uca_assembleTable(t, status);
|
||||
writeOutData(myData, outputDir, copyright, status);
|
||||
writeOutData(myData, contractionCEs, noOfContractions, outputDir, copyright, status);
|
||||
|
||||
InverseTableHeader *inverse = assembleInverseTable(status);
|
||||
writeOutInverseData(inverse, outputDir, copyright, status);
|
||||
/*
|
||||
uint32_t *itab = (uint32_t *)((uint8_t *)inverse + inverse->table);
|
||||
UChar *conts = (UChar *)((uint8_t *)inverse + inverse->conts);
|
||||
for(i = 0; i<inverse->tableSize; i++) {
|
||||
fprintf(stderr, "[%04X] 0x%08X 0x%08X 0x%08X\n", i, *(itab+3*i), *(itab+3*i+1), *(itab+3*i+2));
|
||||
if((*(itab+3*i+2) & UCOL_INV_SIZEMASK) != 0) {
|
||||
uint32_t contIndex = *(itab+3*i+2) & UCOL_INV_OFFSETMASK;
|
||||
uint32_t contSize = (*(itab+3*i+2) & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
|
||||
fprintf(stderr, "\t");
|
||||
for(j = 0; j<contSize; j++) {
|
||||
if(*(conts+contIndex+j) < 0xFFFE) {
|
||||
fprintf(stderr, "%04X ", *(conts+contIndex+j));
|
||||
} else {
|
||||
fprintf(stderr, "\n\t");
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
uprv_uca_closeTempTable(t);
|
||||
uprv_free(myD);
|
||||
uprv_free(opts);
|
||||
|
||||
//printOutTable(myData, &status);
|
||||
//uhash_close(elements);
|
||||
|
||||
uprv_free(myData);
|
||||
uprv_free(inverse);
|
||||
fclose(data);
|
||||
|
||||
/*
|
||||
uprv_free(secValue);
|
||||
uprv_free(terValue);
|
||||
*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user