diff --git a/icu4c/source/common/ucol_swp.cpp b/icu4c/source/common/ucol_swp.cpp index df1bbc9905..301aad0569 100644 --- a/icu4c/source/common/ucol_swp.cpp +++ b/icu4c/source/common/ucol_swp.cpp @@ -219,8 +219,6 @@ ucol_swapBinary(const UDataSwapper *ds, /* swap the necessary pieces in the order of their occurrence in the data */ - udata_printError(ds, "@@@@@ Here inside the collator data swapper\n"); - /* read more of the UCATableHeader (the size field was read above) */ header.options= ds->readUInt32(inHeader->options); header.UCAConsts= ds->readUInt32(inHeader->UCAConsts); diff --git a/icu4c/source/data/in/coll/ucadata.icu b/icu4c/source/data/in/coll/ucadata.icu index 13f3b6d8ef..ec89d9a3f8 100644 Binary files a/icu4c/source/data/in/coll/ucadata.icu and b/icu4c/source/data/in/coll/ucadata.icu differ diff --git a/icu4c/source/i18n/coll.cpp b/icu4c/source/i18n/coll.cpp index ab487e1a2f..9003a377d4 100644 --- a/icu4c/source/i18n/coll.cpp +++ b/icu4c/source/i18n/coll.cpp @@ -833,16 +833,16 @@ Collator::getFunctionalEquivalent(const char* keyword, const Locale& locale, return Locale::createFromName(loc); } -int32_t Collator::getScriptOrder(int32_t *dest, - const int32_t destCapacity, +uint32_t Collator::getScriptOrder(int32_t *dest, + const uint32_t destCapacity, UErrorCode& status) const { status = U_UNSUPPORTED_ERROR; - return 0; + return 0; } void Collator::setScriptOrder(const int32_t *scriptOrder, - const int32_t scriptOrderLength, + const uint32_t scriptOrderLength, UErrorCode& status) { status = U_UNSUPPORTED_ERROR; diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index 69be523553..eccc459c42 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -587,15 +587,15 @@ void RuleBasedCollator::setStrength(ECollationStrength newStrength) ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus); } -int32_t RuleBasedCollator::getScriptOrder(int32_t *dest, - const int32_t destCapacity, +uint32_t 
RuleBasedCollator::getScriptOrder(int32_t *dest, + const uint32_t destCapacity, UErrorCode& status) const { return ucol_getScriptOrder(ucollator, dest, destCapacity, &status); } void RuleBasedCollator::setScriptOrder(const int32_t *scriptOrder, - const int32_t scriptOrderLength, + const uint32_t scriptOrderLength, UErrorCode& status) { ucol_setScriptOrder(ucollator, scriptOrder, scriptOrderLength); diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 590b57bc70..bb1b81eef6 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -869,6 +869,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con result->rules = NULL; result->rulesLength = 0; result->freeRulesOnClose = FALSE; + result->scriptReorderTable = NULL; /* get the version info from UCATableHeader and populate the Collator struct*/ result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ @@ -907,13 +908,6 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con result->latinOneFailed = FALSE; result->UCA = UCA; - /* set attributes */ - ucol_setOptionsFromHeader( - result, - (UColOptionSet*)((uint8_t*)result->image+result->image->options), - status); - result->freeOptionsOnClose = FALSE; - /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ result->ucaRules = NULL; result->actualLocale = NULL; @@ -921,7 +915,13 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con result->requestedLocale = NULL; result->hasRealData = FALSE; // real data lives in .dat file... 
result->freeImageOnClose = FALSE; - result->scriptReorderTable = NULL; + + /* set attributes */ + ucol_setOptionsFromHeader( + result, + (UColOptionSet*)((uint8_t*)result->image+result->image->options), + status); + result->freeOptionsOnClose = FALSE; return result; } @@ -1134,6 +1134,7 @@ uprv_uca_getImplicitFromRaw(UChar32 cp) { static uint32_t U_EXPORT2 uprv_uca_getImplicitPrimary(UChar32 cp) { + //fprintf(stdout, "Incoming: %04x\n", cp); //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); cp = swapCJK(cp); @@ -1141,6 +1142,7 @@ uprv_uca_getImplicitPrimary(UChar32 cp) { // we now have a range of numbers from 0 to 21FFFF. //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); + //fprintf(stdout, "CJK swapped: %04x\n", cp); return uprv_uca_getImplicitFromRaw(cp); } @@ -2935,17 +2937,17 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ size = getExpansionCount(CE); CE = *CEOffset++; - //source->offsetRepeatCount = -1; + //source->offsetRepeatCount = -1; if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ for(i = 1; iCEpos++) = *CEOffset++; - source->offsetRepeatCount += 1; + source->offsetRepeatCount += 1; } } else { /* else, we do */ while(*CEOffset != 0) { *(source->CEpos++) = *CEOffset++; - source->offsetRepeatCount += 1; + source->offsetRepeatCount += 1; } } @@ -3565,14 +3567,14 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, return (uint32_t)UCOL_NULLORDER; } - if (source->offsetRepeatValue != 0) { + if (source->offsetRepeatValue != 0) { if (CECount > noChars) { - source->offsetRepeatCount += temp.offsetRepeatCount; + source->offsetRepeatCount += temp.offsetRepeatCount; } else { // **** does this really skip the right offsets? 
**** source->offsetReturn -= (noChars - CECount); } - } + } if (offsetBias >= 0) { source->offsetReturn = source->offsetStore - 1; @@ -5381,7 +5383,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); primary1 = (uint8_t)(order >> 8); - if(coll->scriptReorderTable != NULL && notIsContinuation){ + if (coll->scriptReorderTable != NULL && notIsContinuation) { primary1 = coll->scriptReorderTable[primary1]; } @@ -6584,7 +6586,8 @@ ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, { uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; UBool reverseSecondary = FALSE; - if(!isContinuation(CE)) { + UBool continuation = isContinuation(CE); + if(!continuation) { tertiary = (uint8_t)((CE & coll->tertiaryMask)); tertiary ^= coll->caseSwitch; reverseSecondary = TRUE; @@ -6599,6 +6602,10 @@ ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, primary1 = (uint8_t)(CE >> 8); if(primary1 != 0) { + if (coll->scriptReorderTable != NULL && !continuation) { + primary1 = coll->scriptReorderTable[primary1]; + } + coll->latinOneCEs[ch] |= (primary1 << *primShift); *primShift -= 8; } @@ -7111,22 +7118,21 @@ ucol_getStrength(const UCollator *coll) return ucol_getAttribute(coll, UCOL_STRENGTH, &status); } -U_INTERNAL int32_t U_EXPORT2 +U_INTERNAL uint32_t U_EXPORT2 ucol_getScriptOrder(const UCollator *coll, int32_t *dest, - const int32_t destCapacity, + const uint32_t destCapacity, UErrorCode *pErrorCode){ - int i; - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ + if (pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return NULL; } - if(coll->scriptOrder == NULL){ + if (coll->scriptOrder == NULL) { return 0; } - if(coll->scriptOrderLength > destCapacity){ + if (coll->scriptOrderLength > destCapacity) { *pErrorCode = U_BUFFER_OVERFLOW_ERROR; } - for(i = 0; (i < coll->scriptOrderLength) && (i < destCapacity); i++){ + for (uint32_t i = 0; (i < coll->scriptOrderLength) && (i < destCapacity); i++) { 
dest[i] = coll->scriptOrder[i]; } return coll->scriptOrderLength; @@ -7135,17 +7141,18 @@ ucol_getScriptOrder(const UCollator *coll, U_INTERNAL void U_EXPORT2 ucol_setScriptOrder(UCollator *coll, const int32_t *scriptOrder, - const int32_t scriptOrderLength){ - int i; + const uint32_t scriptOrderLength) { + UErrorCode status = U_ZERO_ERROR; if (coll->scriptOrder != NULL) { uprv_free(coll->scriptOrder); } coll->scriptOrder = (int32_t*) uprv_malloc(scriptOrderLength*sizeof(int32_t)); - for (i = 0; i < scriptOrderLength; i++) { + for (uint32_t i = 0; i < scriptOrderLength; i++) { coll->scriptOrder[i] = scriptOrder[i]; } coll->scriptOrderLength = scriptOrderLength; - ucol_buildScriptReorderTable(coll); + ucol_buildScriptReorderTable(coll, &status); + // TODO: something with the status if error condition } @@ -7483,11 +7490,6 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) tOrder &= UCOL_PRIMARYMASK; } while(tOrder == 0); - if(coll->scriptReorderTable != NULL){ - sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); - tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); - } - // if both primaries are the same if(sOrder == tOrder) { // and there are no more CEs, we advance to the next level @@ -7501,6 +7503,12 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) } } } else { + // only need to check one for continuation + // if one is then the other must be or the preceding CE would be a prefix of the other + if (coll->scriptReorderTable != NULL && !isContinuation(sOrder)) { + sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); + tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); + } // if two primaries are different, we are done result = (sOrder < tOrder) ? 
UCOL_LESS: UCOL_GREATER; goto commonReturn; @@ -8083,10 +8091,6 @@ ucol_strcollUseLatin1( const UCollator *coll, } } } - if(coll->scriptReorderTable != NULL){ - sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); - tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); - } if(endOfSource) { // source is finished, but target is not, say the result. return UCOL_LESS; } diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp index e52f51286b..be7216f0c3 100644 --- a/icu4c/source/i18n/ucol_bld.cpp +++ b/icu4c/source/i18n/ucol_bld.cpp @@ -1376,15 +1376,16 @@ ucol_initInverseUCA(UErrorCode *status) return _staticInvUCA; } -/* This is the data that is used for non-script reordering codes. +/* This is the data that is used for non-script reordering codes. These _must_ be kept + * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. */ const char* ReorderingTokenNames[] = { - "SPACE", - "PUNCT", - "SYMBOL", - "CURRENCY", - "DIGIT", - NULL + "SPACE", + "PUNCT", + "SYMBOL", + "CURRENCY", + "DIGIT", + NULL }; void toUpper(const char* src, char* dst, uint32_t length) { @@ -1396,14 +1397,14 @@ void toUpper(const char* src, char* dst, uint32_t length) { U_INTERNAL int32_t U_EXPORT2 ucol_findReorderingEntry(const char* name) { - char buffer[32]; - toUpper(name, buffer, 32); - for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) { - if (strcmp(buffer, ReorderingTokenNames[entry]) == 0) { - return entry + UCOL_REORDERCODE_FIRST; - } - } - return USCRIPT_INVALID_CODE; + char buffer[32]; + toUpper(name, buffer, 32); + for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) { + if (strcmp(buffer, ReorderingTokenNames[entry]) == 0) { + return entry + UCOL_REORDERCODE_FIRST; + } + } + return USCRIPT_INVALID_CODE; } #endif /* #if !UCONFIG_NO_COLLATION */ diff --git a/icu4c/source/i18n/ucol_imp.h b/icu4c/source/i18n/ucol_imp.h index 
357a5d7608..bdd7ef383c 100644 --- a/icu4c/source/i18n/ucol_imp.h +++ b/icu4c/source/i18n/ucol_imp.h @@ -770,7 +770,7 @@ typedef struct { /*UColAttributeValue*/ int32_t hiraganaQ; /* attribute for special Hiragana */ /*UColAttributeValue*/ int32_t numericCollation; /* attribute for numeric collation */ /* reorder code */ int32_t* scriptOrder; - int32_t scriptOrderLength; + uint32_t scriptOrderLength; uint32_t reserved[15]; /* for future use */ } UColOptionSet; @@ -1019,7 +1019,7 @@ struct UCollator { UVersionInfo dataVersion; /* Data info of UCA table */ int32_t* scriptOrder; - int32_t scriptOrderLength; + uint32_t scriptOrderLength; uint8_t* scriptReorderTable; }; @@ -1073,7 +1073,7 @@ uprv_uca_getCodePointFromRaw(UChar32 i); -U_CAPI void ucol_buildScriptReorderTable(UCollator *coll); +U_CAPI void ucol_buildScriptReorderTable(UCollator *coll, UErrorCode *status); #ifdef XP_CPLUSPLUS /* diff --git a/icu4c/source/i18n/ucol_res.cpp b/icu4c/source/i18n/ucol_res.cpp index 0205eab8c9..b1ebef3677 100644 --- a/icu4c/source/i18n/ucol_res.cpp +++ b/icu4c/source/i18n/ucol_res.cpp @@ -211,12 +211,11 @@ int ucol_getReorderCodesForLeadByte(UCollator *coll, int leadByte, int16_t* retu return reorderCodeCount; } -void ucol_buildScriptReorderTable(UCollator *coll) { - int32_t *next; +void ucol_buildScriptReorderTable(UCollator *coll, UErrorCode *status) { uint16_t leadBytesSize = 256; uint16_t leadBytes[256]; - uint16_t reorderCodesSize = 256; - int16_t reorderCodes[256]; + uint32_t internalScriptOrderLength = coll->scriptOrderLength + (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST); + int32_t* internalScriptOrder; // The lowest byte that hasn't been assigned a mapping int toBottom = 0x03; @@ -227,16 +226,17 @@ void ucol_buildScriptReorderTable(UCollator *coll) { bool fromTheBottom = true; // lead bytes that have alread been assigned to the permutation table - bool leadByteUsed[256]; + bool newLeadByteUsed[256]; // permutation table slots that have already been filled bool 
permutationSlotFilled[256]; // nothing to do - if (coll->scriptOrderLength == 0) { + if(U_FAILURE(*status) || coll == NULL || coll->scriptOrderLength == 0) { if (coll->scriptReorderTable != NULL) { uprv_free(coll->scriptReorderTable); coll->scriptReorderTable = NULL; } + coll->scriptOrderLength = 0; return; } @@ -244,14 +244,27 @@ void ucol_buildScriptReorderTable(UCollator *coll) { coll->scriptReorderTable = (uint8_t*)uprv_malloc(256*sizeof(uint8_t)); } + // prefill the reordering codes with the leading entries + internalScriptOrder = (int32_t*)uprv_malloc(internalScriptOrderLength * sizeof(int32_t)); + for (uint32_t codeIndex = 0; codeIndex < (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST); codeIndex++) { + internalScriptOrder[codeIndex] = UCOL_REORDERCODE_FIRST + codeIndex; + } + for (uint32_t codeIndex = 0; codeIndex < coll->scriptOrderLength; codeIndex++) { + uint32_t scriptOrderCode = coll->scriptOrder[codeIndex]; + internalScriptOrder[codeIndex + (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST)] = scriptOrderCode; + if (scriptOrderCode >= UCOL_REORDERCODE_FIRST && scriptOrderCode < UCOL_REORDERCODE_LIMIT) { + internalScriptOrder[scriptOrderCode - UCOL_REORDERCODE_FIRST] = UCOL_REORDERCODE_IGNORE; + } + } + for (int i = 0; i < 256; i++) { if (i < toBottom || i > toTop) { permutationSlotFilled[i] = true; - leadByteUsed[i] = true; + newLeadByteUsed[i] = true; coll->scriptReorderTable[i] = i; } else { permutationSlotFilled[i] = false; - leadByteUsed[i] = false; + newLeadByteUsed[i] = false; coll->scriptReorderTable[i] = 0; } } @@ -262,62 +275,122 @@ void ucol_buildScriptReorderTable(UCollator *coll) { * possible location. At each step, we also need to make sure that any scripts * that need to not be moved are copied to their same location in the final table. 
*/ - next = coll->scriptOrder; - while (next < coll->scriptOrder + coll->scriptOrderLength) { - if (*next == UCOL_REORDERCODE_IGNORE) { - next++; + for (int scriptOrderIndex = 0; scriptOrderIndex < internalScriptOrderLength; scriptOrderIndex++) { + int32_t next = internalScriptOrder[scriptOrderIndex]; + if (next == UCOL_REORDERCODE_IGNORE) { continue; } - if (*next == USCRIPT_UNKNOWN) { + if (next == USCRIPT_UNKNOWN) { if (fromTheBottom == false) { - //TODO - error condition - bad script order + // double turnaround + *status = U_ILLEGAL_ARGUMENT_ERROR; + if (coll->scriptReorderTable != NULL) { + uprv_free(coll->scriptReorderTable); + coll->scriptReorderTable = NULL; + } + coll->scriptOrderLength = 0; + if (internalScriptOrder != NULL) { + uprv_free(internalScriptOrder); + } + fprintf(stdout, "\treturn - next == USCRIPT_UNKNOWN\n"); + return; } - fromTheBottom = false; - next++; + fromTheBottom = false; continue; } - uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll, *next, leadBytes, leadBytesSize); + uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll, next, leadBytes, leadBytesSize); if (fromTheBottom) { for (int leadByteIndex = 0; leadByteIndex < leadByteCount; leadByteIndex++) { // don't place a lead byte twice in the permutation table - if (leadByteUsed[leadBytes[leadByteIndex]]) { - // TODO - or should this be an error condition? 
- continue; + if (permutationSlotFilled[leadBytes[leadByteIndex]]) { + // lead byte already used + *status = U_ILLEGAL_ARGUMENT_ERROR; + if (coll->scriptReorderTable != NULL) { + uprv_free(coll->scriptReorderTable); + coll->scriptReorderTable = NULL; + } + coll->scriptOrderLength = 0; + if (internalScriptOrder != NULL) { + uprv_free(internalScriptOrder); + } + fprintf(stdout, "\treturn - fromTheBottom reuse lead byte\n"); + return; } coll->scriptReorderTable[leadBytes[leadByteIndex]] = toBottom; - leadByteUsed[toBottom] = true; + newLeadByteUsed[toBottom] = true; permutationSlotFilled[leadBytes[leadByteIndex]] = true; toBottom++; } } else { for (int leadByteIndex = leadByteCount - 1; leadByteIndex >= 0; leadByteIndex--) { // don't place a lead byte twice in the permutation table - if (leadByteUsed[leadBytes[leadByteIndex]]) { - // TODO - or should this be an error condition? - continue; + if (permutationSlotFilled[leadBytes[leadByteIndex]]) { + // lead byte already used + *status = U_ILLEGAL_ARGUMENT_ERROR; + if (coll->scriptReorderTable != NULL) { + uprv_free(coll->scriptReorderTable); + coll->scriptReorderTable = NULL; + } + coll->scriptOrderLength = 0; + if (internalScriptOrder != NULL) { + uprv_free(internalScriptOrder); + } + fprintf(stdout, "\treturn - fromTheTop reuse lead byte\n"); + return; } coll->scriptReorderTable[leadBytes[leadByteIndex]] = toTop; - leadByteUsed[toTop] = true; + newLeadByteUsed[toTop] = true; permutationSlotFilled[leadBytes[leadByteIndex]] = true; toTop--; } } - next++; } + +#ifdef REORDER_DEBUG + fprintf(stdout, "\n@@@@ Partial Script Reordering Table\n"); + for (int i = 0; i < 256; i++) { + fprintf(stdout, "\t%02x = %02x\n", i, coll->scriptReorderTable[i]); + } + fprintf(stdout, "\n@@@@ Lead Byte Used Table\n"); + for (int i = 0; i < 256; i++) { + fprintf(stdout, "\t%02x = %02x\n", i, newLeadByteUsed[i]); + } + fprintf(stdout, "\n@@@@ Permutation Slot Filled Table\n"); + for (int i = 0; i < 256; i++) { + fprintf(stdout, "\t%02x = 
%02x\n", i, permutationSlotFilled[i]); + } +#endif /* Copy everything that's left over */ int reorderCode = 0; for (int i = 0; i < 256; i++) { if (!permutationSlotFilled[i]) { - while (reorderCode < 256 && leadByteUsed[reorderCode++]) { - ; + while (reorderCode < 256 && newLeadByteUsed[reorderCode]) { + reorderCode++; } coll->scriptReorderTable[i] = reorderCode; + permutationSlotFilled[i] = true; + newLeadByteUsed[reorderCode] = true; } - } + } + +#ifdef REORDER_DEBUG + fprintf(stdout, "\n@@@@ Script Reordering Table\n"); + for (int i = 0; i < 256; i++) { + fprintf(stdout, "\t%02x = %02x\n", i, coll->scriptReorderTable[i]); + } +#endif + + if (internalScriptOrder != NULL) { + uprv_free(internalScriptOrder); + } + + // force a regen of the latin one table since it is affected by the script reordering + coll->latinOneRegenTable = TRUE; + ucol_updateInternalState(coll, status); } // API in ucol_imp.h @@ -623,9 +696,9 @@ ucol_openRules( const UChar *rules, result->actualLocale = NULL; result->validLocale = NULL; result->requestedLocale = NULL; + ucol_buildScriptReorderTable(result, status); ucol_setAttribute(result, UCOL_STRENGTH, strength, status); ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status); - ucol_buildScriptReorderTable(result); } else { cleanup: if(result != NULL) { diff --git a/icu4c/source/i18n/ucol_tok.cpp b/icu4c/source/i18n/ucol_tok.cpp index 4fc865b45f..792a759c48 100644 --- a/icu4c/source/i18n/ucol_tok.cpp +++ b/icu4c/source/i18n/ucol_tok.cpp @@ -615,22 +615,22 @@ void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status){ int32_t codeCount = 0; int32_t codeIndex = 0; char conversion[64]; - int32_t tokenLength = 0; - const UChar* space; - + int32_t tokenLength = 0; + const UChar* space; + const UChar* current = src->current; const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); // eat leading whitespace - while(current < end && u_isWhitespace(*current)) { - current++; - } - + while(current < end && 
u_isWhitespace(*current)) { + current++; + } + while(current < end) { - space = u_memchr(current, 0x0020, end - current); - space = space == 0 ? end : space; - tokenLength = space - current; - if (tokenLength < 4) { + space = u_memchr(current, 0x0020, end - current); + space = space == 0 ? end : space; + tokenLength = space - current; + if (tokenLength < 4) { *status = U_INVALID_FORMAT_ERROR; return; } @@ -642,51 +642,42 @@ void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status){ } if (codeCount == 0) { - *status = U_INVALID_FORMAT_ERROR; - } + *status = U_INVALID_FORMAT_ERROR; + } - int32_t nonScriptReorderCodes = UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST; - codeCount += nonScriptReorderCodes; // to account for the non-script codes src->opts->scriptOrderLength = codeCount; src->opts->scriptOrder = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); - current = src->current; - - for (codeIndex = 0; codeIndex < nonScriptReorderCodes; codeIndex++) { - src->opts->scriptOrder[codeIndex] = UCOL_REORDERCODE_FIRST + codeIndex; - } - - // eat leading whitespace - while(current < end && u_isWhitespace(*current)) { - current++; - } + current = src->current; + + // eat leading whitespace + while(current < end && u_isWhitespace(*current)) { + current++; + } while(current < end) { - space = u_memchr(current, 0x0020, end - current); - space = space == 0 ? end : space; - tokenLength = space - current; - if (tokenLength < 4) { - *status = U_INVALID_FORMAT_ERROR; + space = u_memchr(current, 0x0020, end - current); + space = space == 0 ? 
end : space; + tokenLength = space - current; + if (tokenLength < 4) { + *status = U_ILLEGAL_ARGUMENT_ERROR; return; } else { u_UCharsToChars(current, conversion, tokenLength); - conversion[tokenLength] = '\0'; - src->opts->scriptOrder[codeIndex] = ucol_findReorderingEntry(conversion); - if (src->opts->scriptOrder[codeIndex] != USCRIPT_INVALID_CODE) { - // non-script reorder code used in rule so remove it from the leading slot - src->opts->scriptOrder[src->opts->scriptOrder[codeIndex] - UCOL_REORDERCODE_FIRST] = UCOL_REORDERCODE_IGNORE; - } else { - src->opts->scriptOrder[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); - } - if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) { - *status = U_INVALID_FORMAT_ERROR; - } + conversion[tokenLength] = '\0'; + src->opts->scriptOrder[codeIndex] = ucol_findReorderingEntry(conversion); + if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) { + src->opts->scriptOrder[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); + } + if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } } codeIndex++; current += tokenLength; while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ ++current; } - } + } } // reads and conforms to various options in rules diff --git a/icu4c/source/i18n/unicode/coll.h b/icu4c/source/i18n/unicode/coll.h index 5294cdb5f2..a0ed55d77f 100644 --- a/icu4c/source/i18n/unicode/coll.h +++ b/icu4c/source/i18n/unicode/coll.h @@ -606,9 +606,9 @@ public: * @see ucol_setScriptOrder * @internal */ - virtual int32_t getScriptOrder(int32_t *dest, - const int32_t destCapacity, - UErrorCode& status) const; + virtual uint32_t getScriptOrder(int32_t *dest, + const uint32_t destCapacity, + UErrorCode& status) const; /** * Set the ordering of scripts for this collator. 
@@ -618,7 +618,7 @@ public: * @internal */ virtual void setScriptOrder(const int32_t* scriptOrder, - const int32_t scriptOrderLength, + const uint32_t scriptOrderLength, UErrorCode& status) ; /** diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h index f55d5dac35..9d2a48aea6 100644 --- a/icu4c/source/i18n/unicode/tblcoll.h +++ b/icu4c/source/i18n/unicode/tblcoll.h @@ -675,9 +675,9 @@ public: * @see ucol_setScriptOrder * @internal */ - virtual int32_t getScriptOrder(int32_t* dest, - const int32_t destCapacity, - UErrorCode& status) const; + virtual uint32_t getScriptOrder(int32_t* dest, + const uint32_t destCapacity, + UErrorCode& status) const; /** * Set the ordering of scripts for this collator. @@ -687,7 +687,7 @@ public: * @internal */ virtual void setScriptOrder(const int32_t* scriptOrder, - const int32_t scriptOrderLength, + const uint32_t scriptOrderLength, UErrorCode& status); diff --git a/icu4c/source/i18n/unicode/ucol.h b/icu4c/source/i18n/unicode/ucol.h index 9351757dc4..def785250c 100644 --- a/icu4c/source/i18n/unicode/ucol.h +++ b/icu4c/source/i18n/unicode/ucol.h @@ -138,14 +138,14 @@ typedef enum { * @internal */ typedef enum { - UCOL_REORDERCODE_FIRST = 0x1000, - UCOL_REORDERCODE_SPACE = 0x1000, - UCOL_REORDERCODE_PUNCTUATION = 0x1001, - UCOL_REORDERCODE_SYMBOL = 0x1002, - UCOL_REORDERCODE_CURRENCY = 0x1003, - UCOL_REORDERCODE_DIGIT = 0x1004, - UCOL_REORDERCODE_LIMIT = 0x1005, - UCOL_REORDERCODE_IGNORE = 0x7FFF + UCOL_REORDERCODE_FIRST = 0x1000, + UCOL_REORDERCODE_SPACE = 0x1000, + UCOL_REORDERCODE_PUNCTUATION = 0x1001, + UCOL_REORDERCODE_SYMBOL = 0x1002, + UCOL_REORDERCODE_CURRENCY = 0x1003, + UCOL_REORDERCODE_DIGIT = 0x1004, + UCOL_REORDERCODE_LIMIT = 0x1005, + UCOL_REORDERCODE_IGNORE = 0x7FFF } UColReorderCode; /** @@ -547,10 +547,10 @@ ucol_setStrength(UCollator *coll, * @see ucol_setScriptOrder * @internal */ -U_INTERNAL int32_t U_EXPORT2 +U_INTERNAL uint32_t U_EXPORT2 ucol_getScriptOrder(const UCollator* coll, 
int32_t* dest, - const int32_t destCapacity, + const uint32_t destCapacity, UErrorCode *pErrorCode); /** @@ -564,7 +564,7 @@ ucol_getScriptOrder(const UCollator* coll, U_INTERNAL void U_EXPORT2 ucol_setScriptOrder(UCollator* coll, const int32_t* scriptOrder, - const int32_t scriptOrderLength); + const uint32_t scriptOrderLength); /** * Get the display name for a UCollator. diff --git a/icu4c/source/test/cintltst/callcoll.c b/icu4c/source/test/cintltst/callcoll.c index 021fad18c6..56aa1707f9 100644 --- a/icu4c/source/test/cintltst/callcoll.c +++ b/icu4c/source/test/cintltst/callcoll.c @@ -29,6 +29,7 @@ * equlivalent to word 'one'. */ +#include #include #include diff --git a/icu4c/source/test/cintltst/cmsccoll.c b/icu4c/source/test/cintltst/cmsccoll.c index cbf51980ab..388511a60d 100644 --- a/icu4c/source/test/cintltst/cmsccoll.c +++ b/icu4c/source/test/cintltst/cmsccoll.c @@ -950,8 +950,8 @@ static void testAgainstUCA(UCollator *coll, UCollator *UCA, const char *refName, src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; *first = *second = 0; - /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to - the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ + /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to + the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */ while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,status)) != NULL) { strength = src.parsedToken.strength; chOffset = src.parsedToken.charsOffset; @@ -5829,6 +5829,12 @@ static void TestBeforeRuleWithScriptReordering(void) UChar rules[500]; uint32_t rulesLength = 0; UScriptCode scriptOrder[1] = {USCRIPT_GREEK}; + UCollationResult collResult; + + uint8_t baseKey[256]; + uint32_t baseKeyLength; + uint8_t beforeKey[256]; + uint32_t beforeKeyLength; UChar base[] = { 0x03b1 }; /* base */ int32_t baseLen = 
sizeof(base)/sizeof(*base); @@ -5836,15 +5842,13 @@ static void TestBeforeRuleWithScriptReordering(void) UChar before[] = { 0x0e01 }; /* ko kai */ int32_t beforeLen = sizeof(before)/sizeof(*before); - UCollationResult collResult; - uint8_t baseKey[256]; - uint32_t baseKeyLength; - uint8_t beforeKey[256]; - uint32_t beforeKeyLength; + /*UChar *data[] = { before, base }; + genericRulesStarter(srules, data, 2);*/ + + log_verbose("Testing the &[before 1] rule with [scriptReorder grek]\n"); - log_verbose("Testing the &[before 1] rule with [scriptReorder grek]\n"); - - /* build collator */ + + /* build collator */ rulesLength = u_unescape(srules, rules, LEN(rules)); myCollation = ucol_openRules(rules, rulesLength, UCOL_ON, UCOL_TERTIARY, &error, &status); if(U_FAILURE(status)) { @@ -5852,85 +5856,174 @@ static void TestBeforeRuleWithScriptReordering(void) return; } - /* check collation results - before rule applied but not script reordering */ + /* check collation results - before rule applied but not script reordering */ collResult = ucol_strcoll(myCollation, base, baseLen, before, beforeLen); - if (collResult != UCOL_GREATER) { - log_err("Collation result not correct before script reordering = %d\n", collResult); - } + if (collResult != UCOL_GREATER) { + log_err("Collation result not correct before script reordering = %d\n", collResult); + } - /* check the lead byte of the collation keys before script reordering */ + /* check the lead byte of the collation keys before script reordering */ baseKeyLength = ucol_getSortKey(myCollation, base, baseLen, baseKey, 256); beforeKeyLength = ucol_getSortKey(myCollation, before, beforeLen, beforeKey, 256); if (baseKey[0] != beforeKey[0]) { log_err("Different lead byte for sort keys using before rule and before script reordering. 
base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]); } - /* reirder the scripts */ + /* reorder the scripts */ ucol_setScriptOrder(myCollation, scriptOrder, 1); - /* check collation results - before rule applied and after script reordering */ + /* check collation results - before rule applied and after script reordering */ collResult = ucol_strcoll(myCollation, base, baseLen, before, beforeLen); - if (collResult != UCOL_GREATER) { - log_err("Collation result not correct after script reordering = %d\n", collResult); - } - - /* check the lead byte of the collation keys after script reordering */ + if (collResult != UCOL_GREATER) { + log_err("Collation result not correct after script reordering = %d\n", collResult); + } + + /* check the lead byte of the collation keys after script reordering */ ucol_getSortKey(myCollation, base, baseLen, baseKey, 256); ucol_getSortKey(myCollation, before, beforeLen, beforeKey, 256); if (baseKey[0] != beforeKey[0]) { - log_err("Different lead byte for sort keys using before fule and after script reordering. base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]); + log_err("Different lead byte for sort keys using before fule and after script reordering. base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]); } ucol_close(myCollation); } +/* + * Utility function to test one collation reordering test case. + * @param testcases Array of test cases. + * @param n_testcases Size of the array testcases. + * @param str_rules Array of rules. These rules should be specifying the same rule in different formats. + * @param n_rules Size of the array str_rules. 
+ */ +static void doTestOneReorderingAPITestCase(const OneTestCase testCases[], uint32_t testCasesLen, const int32_t reorderTokens[], uint32_t reorderTokensLen) +{ + int testCaseNum; + UErrorCode status = U_ZERO_ERROR; + UCollator *myCollation; + + for (testCaseNum = 0; testCaseNum < testCasesLen; ++testCaseNum) { + myCollation = ucol_open("", &status); + if (U_FAILURE(status)) { + log_err_status(status, "ERROR: in creation of collator: %s\n", myErrorName(status)); + return; + } + /*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); + ucol_setStrength(myCollation, UCOL_TERTIARY);*/ + ucol_setScriptOrder(myCollation, reorderTokens, reorderTokensLen); + for (testCaseNum = 0; testCaseNum < testCasesLen; ++testCaseNum) { + doTest(myCollation, + testCases[testCaseNum].source, + testCases[testCaseNum].target, + testCases[testCaseNum].result + ); + } + ucol_close(myCollation); + } +} + static void TestGreekFirstReorder(void) { - const char* strRules[] = { - "[scriptReorder Grek]" - }; + const char* strRules[] = { + "[scriptReorder Grek]" + }; - const static OneTestCase privateUseCharacterStrings[] = { - { {0x0391}, {0x0391}, UCOL_EQUAL }, - { {0x0041}, {0x0391}, UCOL_GREATER }, - { {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_GREATER }, - { {0x0060}, {0x0391}, UCOL_LESS }, - { {0x0391}, {0xe2dc}, UCOL_LESS }, - { {0x0391}, {0x0060}, UCOL_GREATER }, - }; - doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules)); + const int32_t apiRules[] = { + USCRIPT_GREEK + }; + + const static OneTestCase privateUseCharacterStrings[] = { + { {0x0391}, {0x0391}, UCOL_EQUAL }, + { {0x0041}, {0x0391}, UCOL_GREATER }, + { {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_GREATER }, + { {0x0060}, {0x0391}, UCOL_LESS }, + { {0x0391}, {0xe2dc}, UCOL_LESS }, + { {0x0391}, {0x0060}, UCOL_GREATER }, + }; + + /* Test rules creation */ + doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules)); 
+
+    /* Test collation reordering API */
+    doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
 }
 
 static void TestGreekLastReorder(void)
 {
-    const char* strRules[] = {
-        "[scriptReorder Zzzz Grek]"
-    };
+    const char* strRules[] = { /* Greek-last ordering, written in rule-string form */
+        "[scriptReorder Zzzz Grek]"
+    };
 
-    const static OneTestCase privateUseCharacterStrings[] = {
-        { {0x0391}, {0x0391}, UCOL_EQUAL },
-        { {0x0041}, {0x0391}, UCOL_LESS },
-        { {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_LESS },
-        { {0x0060}, {0x0391}, UCOL_LESS },
-        { {0x0391}, {0xe2dc}, UCOL_GREATER },
-    };
-    doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
+    const int32_t apiRules[] = { /* the same order as script codes, for the API-based pass below */
+        USCRIPT_UNKNOWN, USCRIPT_GREEK
+    };
+
+    const static OneTestCase privateUseCharacterStrings[] = { /* presumably {source, target, expected result} -- confirm against the OneTestCase definition */
+        { {0x0391}, {0x0391}, UCOL_EQUAL }, /* U+0391 GREEK CAPITAL LETTER ALPHA against itself */
+        { {0x0041}, {0x0391}, UCOL_LESS }, /* U+0041 LATIN CAPITAL LETTER A vs. Greek alpha */
+        { {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_LESS }, /* the same pair behind a shared U+03B1 prefix */
+        { {0x0060}, {0x0391}, UCOL_LESS }, /* U+0060 GRAVE ACCENT vs. Greek alpha */
+        { {0x0391}, {0xe2dc}, UCOL_GREATER }, /* U+E2DC is a private-use code point */
+    };
+
+    /* First pass: hand the rule string(s) in strRules to doTestOneTestCase. */
+    doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
+
+    /* Second pass: same table, with the order supplied as apiRules to the reordering-API helper. */
+    doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
 }
 
 static void TestNonScriptReorder(void)
 {
-    const char* strRules[] = {
-        "[scriptReorder Grek Symbol DIGIT Latn Punct space Zzzz cURRENCy]"
-    };
+    const char* strRules[] = {
+        "[scriptReorder Grek Symbol DIGIT Latn Punct space Zzzz cURRENCy]"
+    };
 
-    const static OneTestCase privateUseCharacterStrings[] = {
-        { {0x0391}, {0x0041}, UCOL_LESS },
-        { {0x0041}, {0x0391}, UCOL_GREATER },
-        { {0x0060}, {0x0041}, UCOL_LESS },
-        { {0x0060}, {0x0391}, UCOL_GREATER },
-        { {0x0024}, {0x0041}, UCOL_GREATER },
-    };
-    doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
+    const int32_t apiRules[] = {
+
USCRIPT_GREEK, UCOL_REORDERCODE_SYMBOL, UCOL_REORDERCODE_DIGIT, USCRIPT_LATIN,
+        UCOL_REORDERCODE_PUNCTUATION, UCOL_REORDERCODE_SPACE, USCRIPT_UNKNOWN,
+        UCOL_REORDERCODE_CURRENCY
+    };
+
+    const static OneTestCase privateUseCharacterStrings[] = {
+        { {0x0391}, {0x0041}, UCOL_LESS },
+        { {0x0041}, {0x0391}, UCOL_GREATER },
+        { {0x0060}, {0x0041}, UCOL_LESS },
+        { {0x0060}, {0x0391}, UCOL_GREATER },
+        { {0x0024}, {0x0041}, UCOL_GREATER },
+    };
+
+    /* Test rules creation */
+    doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
+
+    /* Test collation reordering API */
+    doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
+}
+/* Han-first reordering ("[scriptReorder Hani]" / USCRIPT_HAN), run through both the rule-string helper and the API helper. */
+static void TestHaniReorder(void)
+{
+    const char* strRules[] = { /* rule-string form of the order under test */
+        "[scriptReorder Hani]"
+    };
+
+    const int32_t apiRules[] = { /* script-code form, passed to the API-based pass below */
+        USCRIPT_HAN
+    };
+
+    const static OneTestCase privateUseCharacterStrings[] = { /* presumably {source, target, expected result} -- confirm against the OneTestCase definition */
+        { {0x4e00}, {0x0041}, UCOL_LESS }, /* U+4E00 (CJK ideograph) vs. U+0041 LATIN CAPITAL LETTER A */
+        { {0x4e00}, {0x0060}, UCOL_GREATER }, /* U+4E00 vs. U+0060 GRAVE ACCENT */
+        { {0xD86D, 0xDF40}, {0x0041}, UCOL_LESS }, /* surrogate pair for U+2B740 vs. 'A' */
+        { {0xD86D, 0xDF40}, {0x0060}, UCOL_GREATER }, /* U+2B740 vs. grave accent */
+        { {0x4e00}, {0xD86D, 0xDF40}, UCOL_LESS }, /* BMP ideograph vs. supplementary ideograph */
+        { {0xfa27}, {0x0041}, UCOL_LESS }, /* U+FA27 lies in the CJK Compatibility Ideographs block */
+        { {0xD869, 0xDF00}, {0x0041}, UCOL_LESS }, /* surrogate pair for U+2A700 vs. 'A' */
+    };
+
+    /* First pass: hand the rule string(s) in strRules to doTestOneTestCase. */
+    doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
+
+    /* Second pass: same table, with the order supplied as apiRules to the reordering-API helper. */
+    doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
 }
 
@@ -6010,11 +6103,6 @@ void addMiscCollTest(TestNode** root)
     TEST(TestUCAPrecontext);
     TEST(TestOutOfBuffer5468);
     TEST(TestSameStrengthList);
-
-    TEST(TestGreekFirstReorder);
-    TEST(TestGreekLastReorder);
-    TEST(TestBeforeRuleWithScriptReordering);
-    TEST(TestNonScriptReorder);
     TEST(TestSameStrengthListQuoted);
     TEST(TestSameStrengthListSupplemental);
 
@@ -6027,6 +6115,12 @@ void addMiscCollTest(TestNode** root)
TEST(TestPrivateUseCharactersInList); TEST(TestPrivateUseCharactersInRange); TEST(TestInvalidListsAndRanges); + + TEST(TestGreekFirstReorder); + TEST(TestGreekLastReorder); + TEST(TestBeforeRuleWithScriptReordering); + TEST(TestNonScriptReorder); + TEST(TestHaniReorder); } #endif /* #if !UCONFIG_NO_COLLATION */ diff --git a/icu4c/source/tools/icupkg/icupkg.cpp b/icu4c/source/tools/icupkg/icupkg.cpp index 15feb6591c..cc99ef56c9 100644 --- a/icu4c/source/tools/icupkg/icupkg.cpp +++ b/icu4c/source/tools/icupkg/icupkg.cpp @@ -348,7 +348,6 @@ fprintf(stderr, "isPackage = %x\n", isPackage); return U_ILLEGAL_ARGUMENT_ERROR; } if(isModified) { -fprintf(stderr, "@@@@ Calling Package::extractItem\n"); pkg->extractItem(destPath, outFilename, 0, outType); }