ICU-3984 changes to the collation reordering - API works same as rules and enhanced testing

X-SVN-Rev: 28960
This commit is contained in:
Stuart Gill 2010-10-30 00:42:12 +00:00
parent e2b5a4245f
commit 0c21fdf068
15 changed files with 378 additions and 217 deletions

View File

@ -219,8 +219,6 @@ ucol_swapBinary(const UDataSwapper *ds,
/* swap the necessary pieces in the order of their occurrence in the data */
udata_printError(ds, "@@@@@ Here inside the collator data swapper\n");
/* read more of the UCATableHeader (the size field was read above) */
header.options= ds->readUInt32(inHeader->options);
header.UCAConsts= ds->readUInt32(inHeader->UCAConsts);

View File

@ -833,16 +833,16 @@ Collator::getFunctionalEquivalent(const char* keyword, const Locale& locale,
return Locale::createFromName(loc);
}
int32_t Collator::getScriptOrder(int32_t *dest,
const int32_t destCapacity,
uint32_t Collator::getScriptOrder(int32_t *dest,
const uint32_t destCapacity,
UErrorCode& status) const
{
status = U_UNSUPPORTED_ERROR;
return 0;
return 0;
}
void Collator::setScriptOrder(const int32_t *scriptOrder,
const int32_t scriptOrderLength,
const uint32_t scriptOrderLength,
UErrorCode& status)
{
status = U_UNSUPPORTED_ERROR;

View File

@ -587,15 +587,15 @@ void RuleBasedCollator::setStrength(ECollationStrength newStrength)
ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus);
}
int32_t RuleBasedCollator::getScriptOrder(int32_t *dest,
const int32_t destCapacity,
uint32_t RuleBasedCollator::getScriptOrder(int32_t *dest,
const uint32_t destCapacity,
UErrorCode& status) const
{
return ucol_getScriptOrder(ucollator, dest, destCapacity, &status);
}
void RuleBasedCollator::setScriptOrder(const int32_t *scriptOrder,
const int32_t scriptOrderLength,
const uint32_t scriptOrderLength,
UErrorCode& status)
{
ucol_setScriptOrder(ucollator, scriptOrder, scriptOrderLength);

View File

@ -869,6 +869,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
result->rules = NULL;
result->rulesLength = 0;
result->freeRulesOnClose = FALSE;
result->scriptReorderTable = NULL;
/* get the version info from UCATableHeader and populate the Collator struct*/
result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
@ -907,13 +908,6 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
result->latinOneFailed = FALSE;
result->UCA = UCA;
/* set attributes */
ucol_setOptionsFromHeader(
result,
(UColOptionSet*)((uint8_t*)result->image+result->image->options),
status);
result->freeOptionsOnClose = FALSE;
/* Normally these will be set correctly later. This is the default if you use UCA or the default. */
result->ucaRules = NULL;
result->actualLocale = NULL;
@ -921,7 +915,13 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
result->requestedLocale = NULL;
result->hasRealData = FALSE; // real data lives in .dat file...
result->freeImageOnClose = FALSE;
result->scriptReorderTable = NULL;
/* set attributes */
ucol_setOptionsFromHeader(
result,
(UColOptionSet*)((uint8_t*)result->image+result->image->options),
status);
result->freeOptionsOnClose = FALSE;
return result;
}
@ -1134,6 +1134,7 @@ uprv_uca_getImplicitFromRaw(UChar32 cp) {
static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp) {
//fprintf(stdout, "Incoming: %04x\n", cp);
//if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
cp = swapCJK(cp);
@ -1141,6 +1142,7 @@ uprv_uca_getImplicitPrimary(UChar32 cp) {
// we now have a range of numbers from 0 to 21FFFF.
//if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
//fprintf(stdout, "CJK swapped: %04x\n", cp);
return uprv_uca_getImplicitFromRaw(cp);
}
@ -2935,17 +2937,17 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
size = getExpansionCount(CE);
CE = *CEOffset++;
//source->offsetRepeatCount = -1;
//source->offsetRepeatCount = -1;
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
for(i = 1; i<size; i++) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
source->offsetRepeatCount += 1;
}
} else { /* else, we do */
while(*CEOffset != 0) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
source->offsetRepeatCount += 1;
}
}
@ -3565,14 +3567,14 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
return (uint32_t)UCOL_NULLORDER;
}
if (source->offsetRepeatValue != 0) {
if (source->offsetRepeatValue != 0) {
if (CECount > noChars) {
source->offsetRepeatCount += temp.offsetRepeatCount;
source->offsetRepeatCount += temp.offsetRepeatCount;
} else {
// **** does this really skip the right offsets? ****
source->offsetReturn -= (noChars - CECount);
}
}
}
if (offsetBias >= 0) {
source->offsetReturn = source->offsetStore - 1;
@ -5381,7 +5383,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
primary1 = (uint8_t)(order >> 8);
if(coll->scriptReorderTable != NULL && notIsContinuation){
if (coll->scriptReorderTable != NULL && notIsContinuation) {
primary1 = coll->scriptReorderTable[primary1];
}
@ -6584,7 +6586,8 @@ ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
{
uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
UBool reverseSecondary = FALSE;
if(!isContinuation(CE)) {
UBool continuation = isContinuation(CE);
if(!continuation) {
tertiary = (uint8_t)((CE & coll->tertiaryMask));
tertiary ^= coll->caseSwitch;
reverseSecondary = TRUE;
@ -6599,6 +6602,10 @@ ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
primary1 = (uint8_t)(CE >> 8);
if(primary1 != 0) {
if (coll->scriptReorderTable != NULL && !continuation) {
primary1 = coll->scriptReorderTable[primary1];
}
coll->latinOneCEs[ch] |= (primary1 << *primShift);
*primShift -= 8;
}
@ -7111,22 +7118,21 @@ ucol_getStrength(const UCollator *coll)
return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
}
U_INTERNAL int32_t U_EXPORT2
U_INTERNAL uint32_t U_EXPORT2
ucol_getScriptOrder(const UCollator *coll,
int32_t *dest,
const int32_t destCapacity,
const uint32_t destCapacity,
UErrorCode *pErrorCode){
int i;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if (pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return NULL;
}
if(coll->scriptOrder == NULL){
if (coll->scriptOrder == NULL) {
return 0;
}
if(coll->scriptOrderLength > destCapacity){
if (coll->scriptOrderLength > destCapacity) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
}
for(i = 0; (i < coll->scriptOrderLength) && (i < destCapacity); i++){
for (uint32_t i = 0; (i < coll->scriptOrderLength) && (i < destCapacity); i++) {
dest[i] = coll->scriptOrder[i];
}
return coll->scriptOrderLength;
@ -7135,17 +7141,18 @@ ucol_getScriptOrder(const UCollator *coll,
U_INTERNAL void U_EXPORT2
ucol_setScriptOrder(UCollator *coll,
const int32_t *scriptOrder,
const int32_t scriptOrderLength){
int i;
const uint32_t scriptOrderLength) {
UErrorCode status = U_ZERO_ERROR;
if (coll->scriptOrder != NULL) {
uprv_free(coll->scriptOrder);
}
coll->scriptOrder = (int32_t*) uprv_malloc(scriptOrderLength*sizeof(int32_t));
for (i = 0; i < scriptOrderLength; i++) {
for (uint32_t i = 0; i < scriptOrderLength; i++) {
coll->scriptOrder[i] = scriptOrder[i];
}
coll->scriptOrderLength = scriptOrderLength;
ucol_buildScriptReorderTable(coll);
ucol_buildScriptReorderTable(coll, &status);
// TODO: do something with the status if an error occurred
}
@ -7483,11 +7490,6 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
tOrder &= UCOL_PRIMARYMASK;
} while(tOrder == 0);
if(coll->scriptReorderTable != NULL){
sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
}
// if both primaries are the same
if(sOrder == tOrder) {
// and there are no more CEs, we advance to the next level
@ -7501,6 +7503,12 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
}
}
} else {
// Only one of the two orders needs to be checked for continuation: if one is a
// continuation then the other must be too, or the preceding CE would be a prefix of the other.
if (coll->scriptReorderTable != NULL && !isContinuation(sOrder)) {
sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
}
// if two primaries are different, we are done
result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
goto commonReturn;
@ -8083,10 +8091,6 @@ ucol_strcollUseLatin1( const UCollator *coll,
}
}
}
if(coll->scriptReorderTable != NULL){
sOrder = (coll->scriptReorderTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
tOrder = (coll->scriptReorderTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
}
if(endOfSource) { // source is finished, but target is not, say the result.
return UCOL_LESS;
}

View File

@ -1376,15 +1376,16 @@ ucol_initInverseUCA(UErrorCode *status)
return _staticInvUCA;
}
/* This is the data that is used for non-script reordering codes.
/* This is the data that is used for non-script reordering codes. These _must_ be kept
* in the order in which they are to be applied as defaults, and in sync with the UColReorderCode enum.
*/
const char* ReorderingTokenNames[] = {
"SPACE",
"PUNCT",
"SYMBOL",
"CURRENCY",
"DIGIT",
NULL
"SPACE",
"PUNCT",
"SYMBOL",
"CURRENCY",
"DIGIT",
NULL
};
void toUpper(const char* src, char* dst, uint32_t length) {
@ -1396,14 +1397,14 @@ void toUpper(const char* src, char* dst, uint32_t length) {
U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char* name) {
char buffer[32];
toUpper(name, buffer, 32);
for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
if (strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
return entry + UCOL_REORDERCODE_FIRST;
}
}
return USCRIPT_INVALID_CODE;
char buffer[32];
toUpper(name, buffer, 32);
for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
if (strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
return entry + UCOL_REORDERCODE_FIRST;
}
}
return USCRIPT_INVALID_CODE;
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -770,7 +770,7 @@ typedef struct {
/*UColAttributeValue*/ int32_t hiraganaQ; /* attribute for special Hiragana */
/*UColAttributeValue*/ int32_t numericCollation; /* attribute for numeric collation */
/* reorder code */ int32_t* scriptOrder;
int32_t scriptOrderLength;
uint32_t scriptOrderLength;
uint32_t reserved[15]; /* for future use */
} UColOptionSet;
@ -1019,7 +1019,7 @@ struct UCollator {
UVersionInfo dataVersion; /* Data info of UCA table */
int32_t* scriptOrder;
int32_t scriptOrderLength;
uint32_t scriptOrderLength;
uint8_t* scriptReorderTable;
};
@ -1073,7 +1073,7 @@ uprv_uca_getCodePointFromRaw(UChar32 i);
U_CAPI void ucol_buildScriptReorderTable(UCollator *coll);
U_CAPI void ucol_buildScriptReorderTable(UCollator *coll, UErrorCode *status);
#ifdef XP_CPLUSPLUS
/*

View File

@ -211,12 +211,11 @@ int ucol_getReorderCodesForLeadByte(UCollator *coll, int leadByte, int16_t* retu
return reorderCodeCount;
}
void ucol_buildScriptReorderTable(UCollator *coll) {
int32_t *next;
void ucol_buildScriptReorderTable(UCollator *coll, UErrorCode *status) {
uint16_t leadBytesSize = 256;
uint16_t leadBytes[256];
uint16_t reorderCodesSize = 256;
int16_t reorderCodes[256];
uint32_t internalScriptOrderLength = coll->scriptOrderLength + (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST);
int32_t* internalScriptOrder;
// The lowest byte that hasn't been assigned a mapping
int toBottom = 0x03;
@ -227,16 +226,17 @@ void ucol_buildScriptReorderTable(UCollator *coll) {
bool fromTheBottom = true;
// lead bytes that have already been assigned to the permutation table
bool leadByteUsed[256];
bool newLeadByteUsed[256];
// permutation table slots that have already been filled
bool permutationSlotFilled[256];
// nothing to do
if (coll->scriptOrderLength == 0) {
if(U_FAILURE(*status) || coll == NULL || coll->scriptOrderLength == 0) {
if (coll->scriptReorderTable != NULL) {
uprv_free(coll->scriptReorderTable);
coll->scriptReorderTable = NULL;
}
coll->scriptOrderLength = 0;
return;
}
@ -244,14 +244,27 @@ void ucol_buildScriptReorderTable(UCollator *coll) {
coll->scriptReorderTable = (uint8_t*)uprv_malloc(256*sizeof(uint8_t));
}
// prefill the reordering codes with the leading entries
internalScriptOrder = (int32_t*)uprv_malloc(internalScriptOrderLength * sizeof(int32_t));
for (uint32_t codeIndex = 0; codeIndex < (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST); codeIndex++) {
internalScriptOrder[codeIndex] = UCOL_REORDERCODE_FIRST + codeIndex;
}
for (uint32_t codeIndex = 0; codeIndex < coll->scriptOrderLength; codeIndex++) {
uint32_t scriptOrderCode = coll->scriptOrder[codeIndex];
internalScriptOrder[codeIndex + (UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST)] = scriptOrderCode;
if (scriptOrderCode >= UCOL_REORDERCODE_FIRST && scriptOrderCode < UCOL_REORDERCODE_LIMIT) {
internalScriptOrder[scriptOrderCode - UCOL_REORDERCODE_FIRST] = UCOL_REORDERCODE_IGNORE;
}
}
for (int i = 0; i < 256; i++) {
if (i < toBottom || i > toTop) {
permutationSlotFilled[i] = true;
leadByteUsed[i] = true;
newLeadByteUsed[i] = true;
coll->scriptReorderTable[i] = i;
} else {
permutationSlotFilled[i] = false;
leadByteUsed[i] = false;
newLeadByteUsed[i] = false;
coll->scriptReorderTable[i] = 0;
}
}
@ -262,62 +275,122 @@ void ucol_buildScriptReorderTable(UCollator *coll) {
* possible location. At each step, we also need to make sure that any scripts
* that need to not be moved are copied to their same location in the final table.
*/
next = coll->scriptOrder;
while (next < coll->scriptOrder + coll->scriptOrderLength) {
if (*next == UCOL_REORDERCODE_IGNORE) {
next++;
for (int scriptOrderIndex = 0; scriptOrderIndex < internalScriptOrderLength; scriptOrderIndex++) {
int32_t next = internalScriptOrder[scriptOrderIndex];
if (next == UCOL_REORDERCODE_IGNORE) {
continue;
}
if (*next == USCRIPT_UNKNOWN) {
if (next == USCRIPT_UNKNOWN) {
if (fromTheBottom == false) {
//TODO - error condition - bad script order
// double turnaround
*status = U_ILLEGAL_ARGUMENT_ERROR;
if (coll->scriptReorderTable != NULL) {
uprv_free(coll->scriptReorderTable);
coll->scriptReorderTable = NULL;
}
coll->scriptOrderLength = 0;
if (internalScriptOrder != NULL) {
uprv_free(internalScriptOrder);
}
fprintf(stdout, "\treturn - next == USCRIPT_UNKNOWN\n");
return;
}
fromTheBottom = false;
next++;
fromTheBottom = false;
continue;
}
uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll, *next, leadBytes, leadBytesSize);
uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll, next, leadBytes, leadBytesSize);
if (fromTheBottom) {
for (int leadByteIndex = 0; leadByteIndex < leadByteCount; leadByteIndex++) {
// don't place a lead byte twice in the permutation table
if (leadByteUsed[leadBytes[leadByteIndex]]) {
// TODO - or should this be an error condition?
continue;
if (permutationSlotFilled[leadBytes[leadByteIndex]]) {
// lead byte already used
*status = U_ILLEGAL_ARGUMENT_ERROR;
if (coll->scriptReorderTable != NULL) {
uprv_free(coll->scriptReorderTable);
coll->scriptReorderTable = NULL;
}
coll->scriptOrderLength = 0;
if (internalScriptOrder != NULL) {
uprv_free(internalScriptOrder);
}
fprintf(stdout, "\treturn - fromTheBottom reuse lead byte\n");
return;
}
coll->scriptReorderTable[leadBytes[leadByteIndex]] = toBottom;
leadByteUsed[toBottom] = true;
newLeadByteUsed[toBottom] = true;
permutationSlotFilled[leadBytes[leadByteIndex]] = true;
toBottom++;
}
} else {
for (int leadByteIndex = leadByteCount - 1; leadByteIndex >= 0; leadByteIndex--) {
// don't place a lead byte twice in the permutation table
if (leadByteUsed[leadBytes[leadByteIndex]]) {
// TODO - or should this be an error condition?
continue;
if (permutationSlotFilled[leadBytes[leadByteIndex]]) {
// lead byte already used
*status = U_ILLEGAL_ARGUMENT_ERROR;
if (coll->scriptReorderTable != NULL) {
uprv_free(coll->scriptReorderTable);
coll->scriptReorderTable = NULL;
}
coll->scriptOrderLength = 0;
if (internalScriptOrder != NULL) {
uprv_free(internalScriptOrder);
}
fprintf(stdout, "\treturn - fromTheTop reuse lead byte\n");
return;
}
coll->scriptReorderTable[leadBytes[leadByteIndex]] = toTop;
leadByteUsed[toTop] = true;
newLeadByteUsed[toTop] = true;
permutationSlotFilled[leadBytes[leadByteIndex]] = true;
toTop--;
}
}
next++;
}
#ifdef REORDER_DEBUG
fprintf(stdout, "\n@@@@ Partial Script Reordering Table\n");
for (int i = 0; i < 256; i++) {
fprintf(stdout, "\t%02x = %02x\n", i, coll->scriptReorderTable[i]);
}
fprintf(stdout, "\n@@@@ Lead Byte Used Table\n");
for (int i = 0; i < 256; i++) {
fprintf(stdout, "\t%02x = %02x\n", i, newLeadByteUsed[i]);
}
fprintf(stdout, "\n@@@@ Permutation Slot Filled Table\n");
for (int i = 0; i < 256; i++) {
fprintf(stdout, "\t%02x = %02x\n", i, permutationSlotFilled[i]);
}
#endif
/* Copy everything that's left over */
int reorderCode = 0;
for (int i = 0; i < 256; i++) {
if (!permutationSlotFilled[i]) {
while (reorderCode < 256 && leadByteUsed[reorderCode++]) {
;
while (reorderCode < 256 && newLeadByteUsed[reorderCode]) {
reorderCode++;
}
coll->scriptReorderTable[i] = reorderCode;
permutationSlotFilled[i] = true;
newLeadByteUsed[reorderCode] = true;
}
}
#ifdef REORDER_DEBUG
fprintf(stdout, "\n@@@@ Script Reordering Table\n");
for (int i = 0; i < 256; i++) {
fprintf(stdout, "\t%02x = %02x\n", i, coll->scriptReorderTable[i]);
}
#endif
if (internalScriptOrder != NULL) {
uprv_free(internalScriptOrder);
}
// force a regen of the latin one table since it is affected by the script reordering
coll->latinOneRegenTable = TRUE;
ucol_updateInternalState(coll, status);
}
// API in ucol_imp.h
@ -623,9 +696,9 @@ ucol_openRules( const UChar *rules,
result->actualLocale = NULL;
result->validLocale = NULL;
result->requestedLocale = NULL;
ucol_buildScriptReorderTable(result, status);
ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);
ucol_buildScriptReorderTable(result);
} else {
cleanup:
if(result != NULL) {

View File

@ -615,22 +615,22 @@ void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status){
int32_t codeCount = 0;
int32_t codeIndex = 0;
char conversion[64];
int32_t tokenLength = 0;
const UChar* space;
int32_t tokenLength = 0;
const UChar* space;
const UChar* current = src->current;
const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
// eat leading whitespace
while(current < end && u_isWhitespace(*current)) {
current++;
}
while(current < end && u_isWhitespace(*current)) {
current++;
}
while(current < end) {
space = u_memchr(current, 0x0020, end - current);
space = space == 0 ? end : space;
tokenLength = space - current;
if (tokenLength < 4) {
space = u_memchr(current, 0x0020, end - current);
space = space == 0 ? end : space;
tokenLength = space - current;
if (tokenLength < 4) {
*status = U_INVALID_FORMAT_ERROR;
return;
}
@ -642,44 +642,35 @@ void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status){
}
if (codeCount == 0) {
*status = U_INVALID_FORMAT_ERROR;
}
*status = U_INVALID_FORMAT_ERROR;
}
int32_t nonScriptReorderCodes = UCOL_REORDERCODE_LIMIT - UCOL_REORDERCODE_FIRST;
codeCount += nonScriptReorderCodes; // to account for the non-script codes
src->opts->scriptOrderLength = codeCount;
src->opts->scriptOrder = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
current = src->current;
current = src->current;
for (codeIndex = 0; codeIndex < nonScriptReorderCodes; codeIndex++) {
src->opts->scriptOrder[codeIndex] = UCOL_REORDERCODE_FIRST + codeIndex;
}
// eat leading whitespace
while(current < end && u_isWhitespace(*current)) {
current++;
}
// eat leading whitespace
while(current < end && u_isWhitespace(*current)) {
current++;
}
while(current < end) {
space = u_memchr(current, 0x0020, end - current);
space = space == 0 ? end : space;
tokenLength = space - current;
if (tokenLength < 4) {
*status = U_INVALID_FORMAT_ERROR;
space = u_memchr(current, 0x0020, end - current);
space = space == 0 ? end : space;
tokenLength = space - current;
if (tokenLength < 4) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
} else {
u_UCharsToChars(current, conversion, tokenLength);
conversion[tokenLength] = '\0';
src->opts->scriptOrder[codeIndex] = ucol_findReorderingEntry(conversion);
if (src->opts->scriptOrder[codeIndex] != USCRIPT_INVALID_CODE) {
// non-script reorder code used in rule so remove it from the leading slot
src->opts->scriptOrder[src->opts->scriptOrder[codeIndex] - UCOL_REORDERCODE_FIRST] = UCOL_REORDERCODE_IGNORE;
} else {
src->opts->scriptOrder[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
}
if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) {
*status = U_INVALID_FORMAT_ERROR;
}
conversion[tokenLength] = '\0';
src->opts->scriptOrder[codeIndex] = ucol_findReorderingEntry(conversion);
if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) {
src->opts->scriptOrder[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
}
if (src->opts->scriptOrder[codeIndex] == USCRIPT_INVALID_CODE) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
codeIndex++;
current += tokenLength;

View File

@ -606,9 +606,9 @@ public:
* @see ucol_setScriptOrder
* @internal
*/
virtual int32_t getScriptOrder(int32_t *dest,
const int32_t destCapacity,
UErrorCode& status) const;
virtual uint32_t getScriptOrder(int32_t *dest,
const uint32_t destCapacity,
UErrorCode& status) const;
/**
* Set the ordering of scripts for this collator.
@ -618,7 +618,7 @@ public:
* @internal
*/
virtual void setScriptOrder(const int32_t* scriptOrder,
const int32_t scriptOrderLength,
const uint32_t scriptOrderLength,
UErrorCode& status) ;
/**

View File

@ -675,9 +675,9 @@ public:
* @see ucol_setScriptOrder
* @internal
*/
virtual int32_t getScriptOrder(int32_t* dest,
const int32_t destCapacity,
UErrorCode& status) const;
virtual uint32_t getScriptOrder(int32_t* dest,
const uint32_t destCapacity,
UErrorCode& status) const;
/**
* Set the ordering of scripts for this collator.
@ -687,7 +687,7 @@ public:
* @internal
*/
virtual void setScriptOrder(const int32_t* scriptOrder,
const int32_t scriptOrderLength,
const uint32_t scriptOrderLength,
UErrorCode& status);

View File

@ -138,14 +138,14 @@ typedef enum {
* @internal
*/
typedef enum {
UCOL_REORDERCODE_FIRST = 0x1000,
UCOL_REORDERCODE_SPACE = 0x1000,
UCOL_REORDERCODE_PUNCTUATION = 0x1001,
UCOL_REORDERCODE_SYMBOL = 0x1002,
UCOL_REORDERCODE_CURRENCY = 0x1003,
UCOL_REORDERCODE_DIGIT = 0x1004,
UCOL_REORDERCODE_LIMIT = 0x1005,
UCOL_REORDERCODE_IGNORE = 0x7FFF
UCOL_REORDERCODE_FIRST = 0x1000,
UCOL_REORDERCODE_SPACE = 0x1000,
UCOL_REORDERCODE_PUNCTUATION = 0x1001,
UCOL_REORDERCODE_SYMBOL = 0x1002,
UCOL_REORDERCODE_CURRENCY = 0x1003,
UCOL_REORDERCODE_DIGIT = 0x1004,
UCOL_REORDERCODE_LIMIT = 0x1005,
UCOL_REORDERCODE_IGNORE = 0x7FFF
} UColReorderCode;
/**
@ -547,10 +547,10 @@ ucol_setStrength(UCollator *coll,
* @see ucol_setScriptOrder
* @internal
*/
U_INTERNAL int32_t U_EXPORT2
U_INTERNAL uint32_t U_EXPORT2
ucol_getScriptOrder(const UCollator* coll,
int32_t* dest,
const int32_t destCapacity,
const uint32_t destCapacity,
UErrorCode *pErrorCode);
/**
@ -564,7 +564,7 @@ ucol_getScriptOrder(const UCollator* coll,
U_INTERNAL void U_EXPORT2
ucol_setScriptOrder(UCollator* coll,
const int32_t* scriptOrder,
const int32_t scriptOrderLength);
const uint32_t scriptOrderLength);
/**
* Get the display name for a UCollator.

View File

@ -29,6 +29,7 @@
* equivalent to word 'one'.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

View File

@ -950,8 +950,8 @@ static void testAgainstUCA(UCollator *coll, UCollator *UCA, const char *refName,
src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
*first = *second = 0;
/* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
/* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,status)) != NULL) {
strength = src.parsedToken.strength;
chOffset = src.parsedToken.charsOffset;
@ -5829,6 +5829,12 @@ static void TestBeforeRuleWithScriptReordering(void)
UChar rules[500];
uint32_t rulesLength = 0;
UScriptCode scriptOrder[1] = {USCRIPT_GREEK};
UCollationResult collResult;
uint8_t baseKey[256];
uint32_t baseKeyLength;
uint8_t beforeKey[256];
uint32_t beforeKeyLength;
UChar base[] = { 0x03b1 }; /* base */
int32_t baseLen = sizeof(base)/sizeof(*base);
@ -5836,15 +5842,13 @@ static void TestBeforeRuleWithScriptReordering(void)
UChar before[] = { 0x0e01 }; /* ko kai */
int32_t beforeLen = sizeof(before)/sizeof(*before);
UCollationResult collResult;
uint8_t baseKey[256];
uint32_t baseKeyLength;
uint8_t beforeKey[256];
uint32_t beforeKeyLength;
/*UChar *data[] = { before, base };
genericRulesStarter(srules, data, 2);*/
log_verbose("Testing the &[before 1] rule with [scriptReorder grek]\n");
log_verbose("Testing the &[before 1] rule with [scriptReorder grek]\n");
/* build collator */
/* build collator */
rulesLength = u_unescape(srules, rules, LEN(rules));
myCollation = ucol_openRules(rules, rulesLength, UCOL_ON, UCOL_TERTIARY, &error, &status);
if(U_FAILURE(status)) {
@ -5852,85 +5856,174 @@ static void TestBeforeRuleWithScriptReordering(void)
return;
}
/* check collation results - before rule applied but not script reordering */
/* check collation results - before rule applied but not script reordering */
collResult = ucol_strcoll(myCollation, base, baseLen, before, beforeLen);
if (collResult != UCOL_GREATER) {
log_err("Collation result not correct before script reordering = %d\n", collResult);
}
if (collResult != UCOL_GREATER) {
log_err("Collation result not correct before script reordering = %d\n", collResult);
}
/* check the lead byte of the collation keys before script reordering */
/* check the lead byte of the collation keys before script reordering */
baseKeyLength = ucol_getSortKey(myCollation, base, baseLen, baseKey, 256);
beforeKeyLength = ucol_getSortKey(myCollation, before, beforeLen, beforeKey, 256);
if (baseKey[0] != beforeKey[0]) {
log_err("Different lead byte for sort keys using before rule and before script reordering. base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]);
}
/* reirder the scripts */
/* reorder the scripts */
ucol_setScriptOrder(myCollation, scriptOrder, 1);
/* check collation results - before rule applied and after script reordering */
/* check collation results - before rule applied and after script reordering */
collResult = ucol_strcoll(myCollation, base, baseLen, before, beforeLen);
if (collResult != UCOL_GREATER) {
log_err("Collation result not correct after script reordering = %d\n", collResult);
}
if (collResult != UCOL_GREATER) {
log_err("Collation result not correct after script reordering = %d\n", collResult);
}
/* check the lead byte of the collation keys after script reordering */
/* check the lead byte of the collation keys after script reordering */
ucol_getSortKey(myCollation, base, baseLen, baseKey, 256);
ucol_getSortKey(myCollation, before, beforeLen, beforeKey, 256);
if (baseKey[0] != beforeKey[0]) {
log_err("Different lead byte for sort keys using before fule and after script reordering. base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]);
log_err("Different lead byte for sort keys using before fule and after script reordering. base character lead byte = %02x, before character lead byte = %02x\n", baseKey[0], beforeKey[0]);
}
ucol_close(myCollation);
}
/*
 * Utility function to test collation reordering when the reordering is applied
 * through the API (ucol_setScriptOrder) instead of through a rule string.
 * A single collator is opened with ucol_open(""), the reorder codes are applied
 * to it, every test case is run against it, and the collator is closed.
 * @param testCases Array of test cases.
 * @param testCasesLen Number of entries in the array testCases.
 * @param reorderTokens Array of reorder codes handed to ucol_setScriptOrder.
 * @param reorderTokensLen Number of entries in the array reorderTokens.
 */
static void doTestOneReorderingAPITestCase(const OneTestCase testCases[], uint32_t testCasesLen, const int32_t reorderTokens[], uint32_t reorderTokensLen)
{
    /* unsigned to match testCasesLen and avoid a signed/unsigned comparison */
    uint32_t testCaseNum;
    UErrorCode status = U_ZERO_ERROR;
    UCollator *myCollation;

    /*
     * The collator is created exactly once. The previous version wrapped this in an
     * outer loop that reused testCaseNum as its index; the inner loop left the index
     * at testCasesLen, so the outer loop only ever ran a single iteration by accident.
     */
    myCollation = ucol_open("", &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "ERROR: in creation of collator: %s\n", myErrorName(status));
        return;
    }

    /*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
    ucol_setStrength(myCollation, UCOL_TERTIARY);*/
    ucol_setScriptOrder(myCollation, reorderTokens, reorderTokensLen);

    for (testCaseNum = 0; testCaseNum < testCasesLen; ++testCaseNum) {
        doTest(myCollation,
            testCases[testCaseNum].source,
            testCases[testCaseNum].target,
            testCases[testCaseNum].result
        );
    }

    ucol_close(myCollation);
}
static void TestGreekFirstReorder(void)
{
const char* strRules[] = {
"[scriptReorder Grek]"
};
const char* strRules[] = {
"[scriptReorder Grek]"
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0391}, UCOL_EQUAL },
{ {0x0041}, {0x0391}, UCOL_GREATER },
{ {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_GREATER },
{ {0x0060}, {0x0391}, UCOL_LESS },
{ {0x0391}, {0xe2dc}, UCOL_LESS },
{ {0x0391}, {0x0060}, UCOL_GREATER },
};
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
const int32_t apiRules[] = {
USCRIPT_GREEK
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0391}, UCOL_EQUAL },
{ {0x0041}, {0x0391}, UCOL_GREATER },
{ {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_GREATER },
{ {0x0060}, {0x0391}, UCOL_LESS },
{ {0x0391}, {0xe2dc}, UCOL_LESS },
{ {0x0391}, {0x0060}, UCOL_GREATER },
};
/* Test rules creation */
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
/* Test collation reordering API */
doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
}
static void TestGreekLastReorder(void)
{
const char* strRules[] = {
"[scriptReorder Zzzz Grek]"
};
const char* strRules[] = {
"[scriptReorder Zzzz Grek]"
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0391}, UCOL_EQUAL },
{ {0x0041}, {0x0391}, UCOL_LESS },
{ {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_LESS },
{ {0x0060}, {0x0391}, UCOL_LESS },
{ {0x0391}, {0xe2dc}, UCOL_GREATER },
};
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
const int32_t apiRules[] = {
USCRIPT_UNKNOWN, USCRIPT_GREEK
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0391}, UCOL_EQUAL },
{ {0x0041}, {0x0391}, UCOL_LESS },
{ {0x03B1, 0x0041}, {0x03B1, 0x0391}, UCOL_LESS },
{ {0x0060}, {0x0391}, UCOL_LESS },
{ {0x0391}, {0xe2dc}, UCOL_GREATER },
};
/* Test rules creation */
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
/* Test collation reordering API */
doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
}
static void TestNonScriptReorder(void)
{
const char* strRules[] = {
"[scriptReorder Grek Symbol DIGIT Latn Punct space Zzzz cURRENCy]"
};
const char* strRules[] = {
"[scriptReorder Grek Symbol DIGIT Latn Punct space Zzzz cURRENCy]"
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0041}, UCOL_LESS },
{ {0x0041}, {0x0391}, UCOL_GREATER },
{ {0x0060}, {0x0041}, UCOL_LESS },
{ {0x0060}, {0x0391}, UCOL_GREATER },
{ {0x0024}, {0x0041}, UCOL_GREATER },
};
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
const int32_t apiRules[] = {
USCRIPT_GREEK, UCOL_REORDERCODE_SYMBOL, UCOL_REORDERCODE_DIGIT, USCRIPT_LATIN,
UCOL_REORDERCODE_PUNCTUATION, UCOL_REORDERCODE_SPACE, USCRIPT_UNKNOWN,
UCOL_REORDERCODE_CURRENCY
};
const static OneTestCase privateUseCharacterStrings[] = {
{ {0x0391}, {0x0041}, UCOL_LESS },
{ {0x0041}, {0x0391}, UCOL_GREATER },
{ {0x0060}, {0x0041}, UCOL_LESS },
{ {0x0060}, {0x0391}, UCOL_GREATER },
{ {0x0024}, {0x0041}, UCOL_GREATER },
};
/* Test rules creation */
doTestOneTestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), strRules, LEN(strRules));
/* Test collation reordering API */
doTestOneReorderingAPITestCase(privateUseCharacterStrings, LEN(privateUseCharacterStrings), apiRules, LEN(apiRules));
}
/*
 * Checks the comparison results listed below with Han reordered, once with the
 * reordering given as a rule string and once with the same reordering given
 * through the reordering API.
 */
static void TestHaniReorder(void)
{
    /* rule-string form of the reordering */
    const char* strRules[] = {
        "[scriptReorder Hani]"
    };

    /* script-code form of the reordering, used for the API variant */
    const int32_t apiRules[] = {
        USCRIPT_HAN
    };

    /* { source, target, expected result of comparing source with target } */
    static const OneTestCase hanReorderCases[] = {
        { {0x4e00},         {0x0041},         UCOL_LESS },
        { {0x4e00},         {0x0060},         UCOL_GREATER },
        { {0xD86D, 0xDF40}, {0x0041},         UCOL_LESS },    /* surrogate pair for U+2B740 */
        { {0xD86D, 0xDF40}, {0x0060},         UCOL_GREATER },
        { {0x4e00},         {0xD86D, 0xDF40}, UCOL_LESS },
        { {0xfa27},         {0x0041},         UCOL_LESS },
        { {0xD869, 0xDF00}, {0x0041},         UCOL_LESS },    /* surrogate pair for U+2A700 */
    };

    /* Test rules creation */
    doTestOneTestCase(hanReorderCases, LEN(hanReorderCases), strRules, LEN(strRules));

    /* Test collation reordering API */
    doTestOneReorderingAPITestCase(hanReorderCases, LEN(hanReorderCases), apiRules, LEN(apiRules));
}
@ -6011,11 +6104,6 @@ void addMiscCollTest(TestNode** root)
TEST(TestOutOfBuffer5468);
TEST(TestSameStrengthList);
TEST(TestGreekFirstReorder);
TEST(TestGreekLastReorder);
TEST(TestBeforeRuleWithScriptReordering);
TEST(TestNonScriptReorder);
TEST(TestSameStrengthListQuoted);
TEST(TestSameStrengthListSupplemental);
TEST(TestSameStrengthListQwerty);
@ -6027,6 +6115,12 @@ void addMiscCollTest(TestNode** root)
TEST(TestPrivateUseCharactersInList);
TEST(TestPrivateUseCharactersInRange);
TEST(TestInvalidListsAndRanges);
TEST(TestGreekFirstReorder);
TEST(TestGreekLastReorder);
TEST(TestBeforeRuleWithScriptReordering);
TEST(TestNonScriptReorder);
TEST(TestHaniReorder);
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -348,7 +348,6 @@ fprintf(stderr, "isPackage = %x\n", isPackage);
return U_ILLEGAL_ARGUMENT_ERROR;
}
if(isModified) {
fprintf(stderr, "@@@@ Calling Package::extractItem\n");
pkg->extractItem(destPath, outFilename, 0, outType);
}