From 152b11f4841c9cebde27395329a57cd5cbceabc4 Mon Sep 17 00:00:00 2001 From: Vladimir Weinstein Date: Tue, 22 May 2001 22:26:58 +0000 Subject: [PATCH] ICU-96 Hangul tailoring fix, different case bit function, added comments to strcoll X-SVN-Rev: 4761 --- icu4c/source/i18n/ucol.cpp | 66 ++++++++++-------- icu4c/source/i18n/ucol_bld.cpp | 99 ++++++++++++++++++++++----- icu4c/source/i18n/ucol_tok.cpp | 20 +++--- icu4c/source/test/cintltst/cmsccoll.c | 51 ++++++++++++++ 4 files changed, 179 insertions(+), 57 deletions(-) diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 5ce5508aef..7aea9d4b59 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -316,7 +316,7 @@ ucol_openRules( const UChar *rules, UCollationStrength strength, UErrorCode *status) { - uint32_t listLen = 0; + uint32_t listLen = 0, nSize = 0; UColTokenParser src; UColAttributeValue norm; @@ -342,9 +342,11 @@ ucol_openRules( const UChar *rules, /*src.source = rules;*/ src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); - uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar)); + nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src.source, rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); + //uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar)); src.current = src.source; - src.end = src.source+rulesLength; + src.end = src.source+nSize; + //src.end = src.source+rulesLength; src.sourceCurrent = src.source; src.extraCurrent = src.end; src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; @@ -4615,7 +4617,7 @@ ucol_strcoll( const UCollator *coll, } - + // setting up the collator parameters UColAttributeValue strength = coll->strength; UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); @@ -4628,63 +4630,69 @@ ucol_strcoll( const UCollator *coll, UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); UBool qShifted = shifted && checkQuad; + uint8_t caseSwitch = coll->caseSwitch; + uint8_t 
tertiaryMask = coll->tertiaryMask; + + // This is the lowest primary value that will not be ignored if shifted + uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0; + UCollationResult result = UCOL_EQUAL; UErrorCode status = U_ZERO_ERROR; + // Preparing the context objects for iterating over strings collIterate sColl, tColl; - IInit_collIterate(coll, source, sourceLength, &sColl); IInit_collIterate(coll, target, targetLength, &tColl); + // Preparing the CE buffers. They will be filled during the primary phase ucol_CEBuf sCEs; ucol_CEBuf tCEs; UCOL_INIT_CEBUF(&sCEs); UCOL_INIT_CEBUF(&tCEs); - uint8_t caseSwitch = coll->caseSwitch; - uint8_t tertiaryMask = coll->tertiaryMask; - - uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0; - uint32_t secS = 0, secT = 0; - uint32_t sOrder=0, tOrder=0; + + // Non shifted primary processing is quite simple if(!shifted) { for(;;) { - /* Get the next collation element in each of the strings, unless */ - /* we've been requested to skip it. */ - while(sOrder == 0) { - sOrder = ucol_IGetNextCE(coll, &sColl, &status); - UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl); - sOrder &= UCOL_PRIMARYMASK; - } - while(tOrder == 0) { + // We fetch CEs until we hit a non ignorable primary or end. + do { + // We get the next CE + sOrder = ucol_IGetNextCE(coll, &sColl, &status); + // Stuff it in the buffer + UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl); + // And keep just the primary part. 
+ sOrder &= UCOL_PRIMARYMASK; + } while(sOrder == 0); + + // see the comments on the above block + do { tOrder = ucol_IGetNextCE(coll, &tColl, &status); UCOL_CEBUF_PUT(&tCEs, tOrder, &tColl); tOrder &= UCOL_PRIMARYMASK; - } + } while(tOrder == 0); + // if both primaries are the same if(sOrder == tOrder) { + // and there are no more CEs, we advance to the next level if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { - break; - } else { - sOrder = 0; tOrder = 0; - continue; - } + } } else { + // if two primaries are different, we are done result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; goto commonReturn; } - } /* no primary difference... do the rest from the buffers */ - } else { /* shifted - do a slightly more complicated processing */ + } // no primary difference... do the rest from the buffers + } else { // shifted - do a slightly more complicated processing :) for(;;) { UBool sInShifted = FALSE; UBool tInShifted = FALSE; - -/* This is where abridged version for shifted should go */ + // This version of code can be refactored. However, it seems easier to understand this way. + // Source loop. Same as the target loop. 
for(;;) { sOrder = ucol_IGetNextCE(coll, &sColl, &status); if(sOrder == UCOL_NO_MORE_CES) { diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp index 0afc8a494e..6abc7d0596 100644 --- a/icu4c/source/i18n/ucol_bld.cpp +++ b/icu4c/source/i18n/ucol_bld.cpp @@ -635,22 +635,75 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UHash } } -uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status) { +uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { UChar n[128]; - UChar nu[128]; + //UChar nu[128]; + uint32_t i = 0; uint32_t nLen = 0; uint32_t nuLen = 0; - nLen = unorm_normalize(s, len, UNORM_NFKD, 0, n, 128, status); + collIterate s; + uint32_t order = 0; - nuLen = u_strToUpper(nu, 128, n, nLen, "", status); - if(nuLen == nLen) { - if(u_strncmp(n, nu, nuLen) == 0) { - return UCOL_UPPER_CASE; + uint8_t caseBits; + UBool isMixed = FALSE; + + if(U_FAILURE(*status)) { + return UCOL_LOWER_CASE; + } + + nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); + + init_collIterate(UCA, n, nLen, &s); + + order = ucol_getNextCE(UCA, &s, status); + if(isContinuation(order)) { + *status = U_INTERNAL_PROGRAM_ERROR; + return UCOL_LOWER_CASE; + } + + caseBits = order & UCOL_CASE_BIT_MASK; + for(;;) { + order = ucol_getNextCE(UCA, &s, status); + if(order == UCOL_NO_MORE_CES) { + break; + } + if(isContinuation(order)) { + continue; + } + if(caseBits != (order & UCOL_CASE_BIT_MASK)) { + isMixed = TRUE; + break; } } + if(isMixed == TRUE) { + uint32_t noUpper = 0; + uint32_t noLower = 0; + + // Let's analyze again, letter by letter + for(i = 0; i < nLen; i++) { + if(u_isupper(n[i]) == TRUE) { + noUpper++; + } + if(u_islower(n[i]) == TRUE) { + noLower++; + } + if(u_istitle(n[i]) == TRUE) { + return UCOL_MIXED_CASE; + } + } + + if(noUpper > 0 && noLower > 0 && noUpper + noLower <= nLen) { + return UCOL_MIXED_CASE; + } + } + + return caseBits; + + +#if 0 
nuLen = u_strToLower(nu, 128, n, nLen, "", status); if(nuLen == nLen) { if(u_strncmp(n, nu, nuLen) == 0) { @@ -658,7 +711,14 @@ uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status) } } + nuLen = u_strToUpper(nu, 128, n, nLen, "", status); + if(nuLen == nLen) { + if(u_strncmp(n, nu, nuLen) == 0) { + return UCOL_UPPER_CASE; + } + } return UCOL_MIXED_CASE; +#endif } @@ -699,13 +759,14 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL /* will have to get one from UCA */ /* first, get the UChars from the rules */ /* then pick CEs out until there is no more and stuff them into expansion */ - UChar source[256],buff[256]; + //UChar source[256],buff[256]; collIterate s; uint32_t order = 0; - uint32_t normSize = 0; - uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar)); - normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status); - init_collIterate(src->UCA, source, normSize, &s); + //uint32_t normSize = 0; + //uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar)); + //normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status); + //init_collIterate(src->UCA, source, normSize, &s); + init_collIterate(src->UCA, expOffset + src->source, 1, &s); for(;;) { order = ucol_getNextCE(src->UCA, &s, status); @@ -735,11 +796,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL /* copy UChars */ - UChar buff[128]; - uint32_t decompSize; - uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar)); - decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status); - el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/ + //UChar buff[128]; + //uint32_t decompSize; + //uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar)); + //decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status); + //el.cSize = 
decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/ + el.cSize = (tok->source >> 24); + uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); el.cPoints = el.uchars; if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) { @@ -760,7 +823,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field if(el.cSize > 1) { // Do it manually - el.CEs[0] |= ucol_uprv_getCaseBits(el.cPoints, el.cSize, status); + el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status); } else { // Copy it from the UCA uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status); diff --git a/icu4c/source/i18n/ucol_tok.cpp b/icu4c/source/i18n/ucol_tok.cpp index a652f03ccc..e9389dcfae 100644 --- a/icu4c/source/i18n/ucol_tok.cpp +++ b/icu4c/source/i18n/ucol_tok.cpp @@ -734,6 +734,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu src->varTop = sourceToken; } + /* + If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * + d * ... into &x * c/y * d * ... + */ + if(expandNext != 0 && sourceToken->expansion == 0) { + sourceToken->expansion = expandNext; + sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF)); + //expandNext = 0; + } + /* 1. Find the strongest strength in each list, and set strongestP and strongestN accordingly in the headers. @@ -769,16 +779,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu lastToken->next = sourceToken; } } - - /* - If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * - d * ... into &x * c/y * d * ... 
- */ - if(expandNext != 0 && sourceToken->expansion == 0) { - sourceToken->expansion = expandNext; - sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF)); - expandNext = 0; - } } else { /* Otherwise (when LAST is not a reset) if polarity (LAST) == polarity(relation), insert sourceToken after LAST, diff --git a/icu4c/source/test/cintltst/cmsccoll.c b/icu4c/source/test/cintltst/cmsccoll.c index b875431ee9..9f25ae94b3 100644 --- a/icu4c/source/test/cintltst/cmsccoll.c +++ b/icu4c/source/test/cintltst/cmsccoll.c @@ -1954,7 +1954,56 @@ static void TestIncrementalNormalize() { uprv_free(strB); } +#if 0 +static void TestGetCaseBit() { + static const char *caseBitData[] = { + "a", "A", "ch", "Ch", "CH", + "\\uFF9E", "\\u0009" + }; + static const uint8_t results[] = { + UCOL_LOWER_CASE, UCOL_UPPER_CASE, UCOL_LOWER_CASE, UCOL_MIXED_CASE, UCOL_UPPER_CASE, + UCOL_UPPER_CASE, UCOL_LOWER_CASE + }; + + uint32_t i, blen = 0; + UChar b[256] = {0}; + UErrorCode status = U_ZERO_ERROR; + UCollator *UCA = ucol_open("", &status); + uint8_t res = 0; + + for(i = 0; i