ICU-96 Hangul tailoring fix, different case bit function, added comments to strcoll
X-SVN-Rev: 4761
This commit is contained in:
parent
94e1fd78a5
commit
152b11f484
@ -316,7 +316,7 @@ ucol_openRules( const UChar *rules,
|
||||
UCollationStrength strength,
|
||||
UErrorCode *status)
|
||||
{
|
||||
uint32_t listLen = 0;
|
||||
uint32_t listLen = 0, nSize = 0;
|
||||
UColTokenParser src;
|
||||
UColAttributeValue norm;
|
||||
|
||||
@ -342,9 +342,11 @@ ucol_openRules( const UChar *rules,
|
||||
|
||||
/*src.source = rules;*/
|
||||
src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
|
||||
uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
|
||||
nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src.source, rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
|
||||
//uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
|
||||
src.current = src.source;
|
||||
src.end = src.source+rulesLength;
|
||||
src.end = src.source+nSize;
|
||||
//src.end = src.source+rulesLength;
|
||||
src.sourceCurrent = src.source;
|
||||
src.extraCurrent = src.end;
|
||||
src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
|
||||
@ -4615,7 +4617,7 @@ ucol_strcoll( const UCollator *coll,
|
||||
}
|
||||
|
||||
|
||||
|
||||
// setting up the collator parameters
|
||||
UColAttributeValue strength = coll->strength;
|
||||
UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
|
||||
|
||||
@ -4628,63 +4630,69 @@ ucol_strcoll( const UCollator *coll,
|
||||
UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
|
||||
UBool qShifted = shifted && checkQuad;
|
||||
|
||||
uint8_t caseSwitch = coll->caseSwitch;
|
||||
uint8_t tertiaryMask = coll->tertiaryMask;
|
||||
|
||||
// This is the lowest primary value that will not be ignored if shifted
|
||||
uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
|
||||
|
||||
UCollationResult result = UCOL_EQUAL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// Preparing the context objects for iterating over strings
|
||||
collIterate sColl, tColl;
|
||||
|
||||
|
||||
IInit_collIterate(coll, source, sourceLength, &sColl);
|
||||
IInit_collIterate(coll, target, targetLength, &tColl);
|
||||
|
||||
// Preparing the CE buffers. They will be filled during the primary phase
|
||||
ucol_CEBuf sCEs;
|
||||
ucol_CEBuf tCEs;
|
||||
UCOL_INIT_CEBUF(&sCEs);
|
||||
UCOL_INIT_CEBUF(&tCEs);
|
||||
|
||||
uint8_t caseSwitch = coll->caseSwitch;
|
||||
uint8_t tertiaryMask = coll->tertiaryMask;
|
||||
|
||||
uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
|
||||
|
||||
uint32_t secS = 0, secT = 0;
|
||||
|
||||
uint32_t sOrder=0, tOrder=0;
|
||||
|
||||
// Non shifted primary processing is quite simple
|
||||
if(!shifted) {
|
||||
for(;;) {
|
||||
/* Get the next collation element in each of the strings, unless */
|
||||
/* we've been requested to skip it. */
|
||||
while(sOrder == 0) {
|
||||
sOrder = ucol_IGetNextCE(coll, &sColl, &status);
|
||||
UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
|
||||
sOrder &= UCOL_PRIMARYMASK;
|
||||
}
|
||||
|
||||
while(tOrder == 0) {
|
||||
// We fetch CEs until we hit a non ignorable primary or end.
|
||||
do {
|
||||
// We get the next CE
|
||||
sOrder = ucol_IGetNextCE(coll, &sColl, &status);
|
||||
// Stuff it in the buffer
|
||||
UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
|
||||
// And keep just the primary part.
|
||||
sOrder &= UCOL_PRIMARYMASK;
|
||||
} while(sOrder == 0);
|
||||
|
||||
// see the comments on the above block
|
||||
do {
|
||||
tOrder = ucol_IGetNextCE(coll, &tColl, &status);
|
||||
UCOL_CEBUF_PUT(&tCEs, tOrder, &tColl);
|
||||
tOrder &= UCOL_PRIMARYMASK;
|
||||
}
|
||||
} while(tOrder == 0);
|
||||
|
||||
// if both primaries are the same
|
||||
if(sOrder == tOrder) {
|
||||
// and there are no more CEs, we advance to the next level
|
||||
if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
|
||||
|
||||
break;
|
||||
} else {
|
||||
sOrder = 0; tOrder = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// if two primaries are different, we are done
|
||||
result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
|
||||
goto commonReturn;
|
||||
}
|
||||
} /* no primary difference... do the rest from the buffers */
|
||||
} else { /* shifted - do a slightly more complicated processing */
|
||||
} // no primary difference... do the rest from the buffers
|
||||
} else { // shifted - do a slightly more complicated processing :)
|
||||
for(;;) {
|
||||
UBool sInShifted = FALSE;
|
||||
UBool tInShifted = FALSE;
|
||||
|
||||
/* This is where abridged version for shifted should go */
|
||||
// This version of code can be refactored. However, it seems easier to understand this way.
|
||||
// Source loop. Same as the target loop.
|
||||
for(;;) {
|
||||
sOrder = ucol_IGetNextCE(coll, &sColl, &status);
|
||||
if(sOrder == UCOL_NO_MORE_CES) {
|
||||
|
@ -635,22 +635,75 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UHash
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status) {
|
||||
uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
|
||||
UChar n[128];
|
||||
UChar nu[128];
|
||||
//UChar nu[128];
|
||||
uint32_t i = 0;
|
||||
|
||||
uint32_t nLen = 0;
|
||||
uint32_t nuLen = 0;
|
||||
|
||||
nLen = unorm_normalize(s, len, UNORM_NFKD, 0, n, 128, status);
|
||||
collIterate s;
|
||||
uint32_t order = 0;
|
||||
|
||||
nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
|
||||
if(nuLen == nLen) {
|
||||
if(u_strncmp(n, nu, nuLen) == 0) {
|
||||
return UCOL_UPPER_CASE;
|
||||
uint8_t caseBits;
|
||||
UBool isMixed = FALSE;
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
return UCOL_LOWER_CASE;
|
||||
}
|
||||
|
||||
nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
|
||||
|
||||
init_collIterate(UCA, n, nLen, &s);
|
||||
|
||||
order = ucol_getNextCE(UCA, &s, status);
|
||||
if(isContinuation(order)) {
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
return UCOL_LOWER_CASE;
|
||||
}
|
||||
|
||||
caseBits = order & UCOL_CASE_BIT_MASK;
|
||||
for(;;) {
|
||||
order = ucol_getNextCE(UCA, &s, status);
|
||||
if(order == UCOL_NO_MORE_CES) {
|
||||
break;
|
||||
}
|
||||
if(isContinuation(order)) {
|
||||
continue;
|
||||
}
|
||||
if(caseBits != (order & UCOL_CASE_BIT_MASK)) {
|
||||
isMixed = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(isMixed == TRUE) {
|
||||
uint32_t noUpper = 0;
|
||||
uint32_t noLower = 0;
|
||||
|
||||
// Let's analyze again, letter by letter
|
||||
for(i = 0; i < nLen; i++) {
|
||||
if(u_isupper(n[i]) == TRUE) {
|
||||
noUpper++;
|
||||
}
|
||||
if(u_islower(n[i]) == TRUE) {
|
||||
noLower++;
|
||||
}
|
||||
if(u_istitle(n[i]) == TRUE) {
|
||||
return UCOL_MIXED_CASE;
|
||||
}
|
||||
}
|
||||
|
||||
if(noUpper > 0 && noLower > 0 && noUpper + noLower <= nLen) {
|
||||
return UCOL_MIXED_CASE;
|
||||
}
|
||||
}
|
||||
|
||||
return caseBits;
|
||||
|
||||
|
||||
#if 0
|
||||
nuLen = u_strToLower(nu, 128, n, nLen, "", status);
|
||||
if(nuLen == nLen) {
|
||||
if(u_strncmp(n, nu, nuLen) == 0) {
|
||||
@ -658,7 +711,14 @@ uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status)
|
||||
}
|
||||
}
|
||||
|
||||
nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
|
||||
if(nuLen == nLen) {
|
||||
if(u_strncmp(n, nu, nuLen) == 0) {
|
||||
return UCOL_UPPER_CASE;
|
||||
}
|
||||
}
|
||||
return UCOL_MIXED_CASE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
@ -699,13 +759,14 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
||||
/* will have to get one from UCA */
|
||||
/* first, get the UChars from the rules */
|
||||
/* then pick CEs out until there is no more and stuff them into expansion */
|
||||
UChar source[256],buff[256];
|
||||
//UChar source[256],buff[256];
|
||||
collIterate s;
|
||||
uint32_t order = 0;
|
||||
uint32_t normSize = 0;
|
||||
uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
|
||||
normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
|
||||
init_collIterate(src->UCA, source, normSize, &s);
|
||||
//uint32_t normSize = 0;
|
||||
//uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
|
||||
//normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
|
||||
//init_collIterate(src->UCA, source, normSize, &s);
|
||||
init_collIterate(src->UCA, expOffset + src->source, 1, &s);
|
||||
|
||||
for(;;) {
|
||||
order = ucol_getNextCE(src->UCA, &s, status);
|
||||
@ -735,11 +796,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
||||
|
||||
/* copy UChars */
|
||||
|
||||
UChar buff[128];
|
||||
uint32_t decompSize;
|
||||
uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
|
||||
decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
|
||||
el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
|
||||
//UChar buff[128];
|
||||
//uint32_t decompSize;
|
||||
//uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
|
||||
//decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
|
||||
//el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
|
||||
el.cSize = (tok->source >> 24);
|
||||
uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
|
||||
el.cPoints = el.uchars;
|
||||
|
||||
if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
|
||||
@ -760,7 +823,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
||||
el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
|
||||
if(el.cSize > 1) {
|
||||
// Do it manually
|
||||
el.CEs[0] |= ucol_uprv_getCaseBits(el.cPoints, el.cSize, status);
|
||||
el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
|
||||
} else {
|
||||
// Copy it from the UCA
|
||||
uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
|
||||
|
@ -734,6 +734,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
src->varTop = sourceToken;
|
||||
}
|
||||
|
||||
/*
|
||||
If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
|
||||
d * ... into &x * c/y * d * ...
|
||||
*/
|
||||
if(expandNext != 0 && sourceToken->expansion == 0) {
|
||||
sourceToken->expansion = expandNext;
|
||||
sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
|
||||
//expandNext = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Find the strongest strength in each list, and set strongestP and strongestN
|
||||
accordingly in the headers.
|
||||
@ -769,16 +779,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
lastToken->next = sourceToken;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
|
||||
d * ... into &x * c/y * d * ...
|
||||
*/
|
||||
if(expandNext != 0 && sourceToken->expansion == 0) {
|
||||
sourceToken->expansion = expandNext;
|
||||
sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
|
||||
expandNext = 0;
|
||||
}
|
||||
} else {
|
||||
/* Otherwise (when LAST is not a reset)
|
||||
if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
|
||||
|
@ -1954,7 +1954,56 @@ static void TestIncrementalNormalize() {
|
||||
uprv_free(strB);
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void TestGetCaseBit() {
|
||||
static const char *caseBitData[] = {
|
||||
"a", "A", "ch", "Ch", "CH",
|
||||
"\\uFF9E", "\\u0009"
|
||||
};
|
||||
|
||||
static const uint8_t results[] = {
|
||||
UCOL_LOWER_CASE, UCOL_UPPER_CASE, UCOL_LOWER_CASE, UCOL_MIXED_CASE, UCOL_UPPER_CASE,
|
||||
UCOL_UPPER_CASE, UCOL_LOWER_CASE
|
||||
};
|
||||
|
||||
uint32_t i, blen = 0;
|
||||
UChar b[256] = {0};
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *UCA = ucol_open("", &status);
|
||||
uint8_t res = 0;
|
||||
|
||||
for(i = 0; i<sizeof(results)/sizeof(results[0]); i++) {
|
||||
blen = u_unescape(caseBitData[i], b, 256);
|
||||
res = ucol_uprv_getCaseBits(UCA, b, blen, &status);
|
||||
if(results[i] != res) {
|
||||
log_err("Expected case = %02X, got %02X for %04X\n", results[i], res, b[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Test for the Hangul tailoring fix (ICU-96).
 *
 * The rules reset at U+AC00 and then chain a list of ideographs with
 * tertiary (<<<) relations.  koreanData lists the same code points in the
 * same order, one escaped code point per array element.  Both are handed to
 * genericRulesStarter() -- presumably it builds a collator from the rules
 * and verifies that the data sorts in the given order; confirm against its
 * definition.
 */
static void TestHangulTailoring() {
    /* Each element must be its own string literal: a comma inside the quotes
       would glue two neighbouring entries into a single bogus element. */
    static const char *koreanData[] = {
        "\\uac00", "\\u4f3d", "\\u4f73", "\\u5047", "\\u50f9", "\\u52a0", "\\u53ef", "\\u5475",
        "\\u54e5", "\\u5609", "\\u5ac1", "\\u5bb6", "\\u6687", "\\u67b6", "\\u67b7", "\\u67ef",
        "\\u6b4c", "\\u73c2", "\\u75c2", "\\u7a3c", "\\u82db", "\\u8304", "\\u8857", "\\u8888",
        "\\u8a36", "\\u8cc8", "\\u8dcf", "\\u8efb", "\\u8fe6", "\\u99d5",
        "\\u4EEE", "\\u50A2", "\\u5496", "\\u54FF", "\\u5777", "\\u5B8A", "\\u659D", "\\u698E",
        "\\u6A9F", "\\u73C8", "\\u7B33", "\\u801E", "\\u8238", "\\u846D", "\\u8B0C"
    };

    /* The rule text is a string, so it needs a pointer to char (not a single
       char) and a terminating semicolon. */
    const char *rules =
        "&\\uac00 <<< \\u4f3d <<< \\u4f73 <<< \\u5047 <<< \\u50f9 <<< \\u52a0 <<< \\u53ef <<< \\u5475 "
        "<<< \\u54e5 <<< \\u5609 <<< \\u5ac1 <<< \\u5bb6 <<< \\u6687 <<< \\u67b6 <<< \\u67b7 <<< \\u67ef "
        "<<< \\u6b4c <<< \\u73c2 <<< \\u75c2 <<< \\u7a3c <<< \\u82db <<< \\u8304 <<< \\u8857 <<< \\u8888 "
        "<<< \\u8a36 <<< \\u8cc8 <<< \\u8dcf <<< \\u8efb <<< \\u8fe6 <<< \\u99d5 "
        "<<< \\u4EEE <<< \\u50A2 <<< \\u5496 <<< \\u54FF <<< \\u5777 <<< \\u5B8A <<< \\u659D <<< \\u698E " /* k1 */
        "<<< \\u6A9F <<< \\u73C8 <<< \\u7B33 <<< \\u801E <<< \\u8238 <<< \\u846D <<< \\u8B0C";

    genericRulesStarter(rules, koreanData, sizeof(koreanData)/sizeof(koreanData[0]));
}
|
||||
|
||||
void addMiscCollTest(TestNode** root)
|
||||
{
|
||||
@ -1975,11 +2024,13 @@ void addMiscCollTest(TestNode** root)
|
||||
addTest(root, &TestJ831, "tscoll/cmsccoll/TestJ831");
|
||||
addTest(root, &TestBefore, "tscoll/cmsccoll/TestBefore");
|
||||
addTest(root, &TestRedundantRules, "tscoll/cmsccoll/TestRedundantRules");
|
||||
addTest(root, &TestHangulTailoring, "tscoll/cmsccoll/TestHangulTailoring");
|
||||
/*addTest(root, &TestUCAZero, "tscoll/cmsccoll/TestUCAZero");*/
|
||||
/*addTest(root, &TestUnmappedSpaces, "tscoll/cmsccoll/TestUnmappedSpaces");*/
|
||||
/*addTest(root, &PrintMarkDavis, "tscoll/cmsccoll/PrintMarkDavis");*/
|
||||
/*addTest(root, &TestVariableTop, "tscoll/cmsccoll/TestVariableTop");*/
|
||||
addTest(root, &TestIncrementalNormalize, "tscoll/cmsccoll/TestIncrementalNormalize");
|
||||
addTest(root, &TestComposeDecompose, "tscoll/cmsccoll/TestComposeDecompose");
|
||||
/*addTest(root, &TestGetCaseBit, "tscoll/cmsccoll/TestGetCaseBit");*/
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user