ICU-96 sortkey algorithm is now in... just compiles for now...
X-SVN-Rev: 3387
This commit is contained in:
parent
e8272cf29b
commit
8eb80185f6
@ -148,7 +148,7 @@ ucol_openNew( const char *loc,
|
|||||||
/* Do we need a name or other stuff? */
|
/* Do we need a name or other stuff? */
|
||||||
UCollatorNew *result = NULL;
|
UCollatorNew *result = NULL;
|
||||||
UResourceBundle *b = ures_open(NULL, loc, status);
|
UResourceBundle *b = ures_open(NULL, loc, status);
|
||||||
UResourceBundle *binary = ures_getByKey(b, "%%Collation", NULL, status);
|
UResourceBundle *binary = ures_getByKey(b, "%%CollationNew", NULL, status);
|
||||||
|
|
||||||
if(*status = U_MISSING_RESOURCE_ERROR) { /* if we don't find tailoring, we'll fallback to UCA */
|
if(*status = U_MISSING_RESOURCE_ERROR) { /* if we don't find tailoring, we'll fallback to UCA */
|
||||||
result = UCA;
|
result = UCA;
|
||||||
@ -409,6 +409,20 @@ uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *sec
|
|||||||
return newStart;
|
return newStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define MIN_VALUE 0x02
|
||||||
|
#define UNMARKED 0x03
|
||||||
|
#define UCOL_VARIABLE_MAX 0x20
|
||||||
|
|
||||||
|
void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { /* Reverses, in place, the bytes between start and end; end is inclusive (it points at the last byte, not one past it). */
|
||||||
|
uint8_t temp; /* holds one byte while the pair is swapped */
|
||||||
|
while(start<end) { /* walk inward from both ends; nothing happens when start >= end */
|
||||||
|
temp = *start;
|
||||||
|
*start++ = *end; /* front slot takes the back byte, then the front pointer advances */
|
||||||
|
*end-- = temp; /* back slot takes the saved front byte, then the back pointer retreats */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int32_t
|
int32_t
|
||||||
ucol_calcSortKeyNew(const UCollatorNew *coll,
|
ucol_calcSortKeyNew(const UCollatorNew *coll,
|
||||||
const UChar *source,
|
const UChar *source,
|
||||||
@ -453,7 +467,14 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
|
|||||||
UBool compareQuad = (strength >= UCOL_QUATERNARY);
|
UBool compareQuad = (strength >= UCOL_QUATERNARY);
|
||||||
UBool compareIdent = (strength == UCOL_IDENTICAL);
|
UBool compareIdent = (strength == UCOL_IDENTICAL);
|
||||||
UBool doCase = (ucol_getAttributeNew(coll, UCOL_CASE_LEVEL, status) == UCOL_ON);
|
UBool doCase = (ucol_getAttributeNew(coll, UCOL_CASE_LEVEL, status) == UCOL_ON);
|
||||||
UBool lowerFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_LOWER_FIRST);
|
UBool upperFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_UPPER_FIRST);
|
||||||
|
UBool shifted = (ucol_getAttributeNew(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED);
|
||||||
|
UBool isFrenchSec = (ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
|
||||||
|
|
||||||
|
/* support for special features like caselevel and funky secondaries */
|
||||||
|
uint8_t *frenchStartPtr = NULL;
|
||||||
|
uint8_t *frenchEndPtr = NULL;
|
||||||
|
uint32_t caseShift = 0;
|
||||||
|
|
||||||
sortKeySize += ((compareSec?1:0) + (compareTer?1:0) + (doCase?1:0) + (compareQuad?1:0) + (compareIdent?1:0));
|
sortKeySize += ((compareSec?1:0) + (compareTer?1:0) + (doCase?1:0) + (compareQuad?1:0) + (compareIdent?1:0));
|
||||||
|
|
||||||
@ -490,8 +511,12 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
|
|||||||
uint8_t *quadStart = quads;
|
uint8_t *quadStart = quads;
|
||||||
|
|
||||||
uint32_t order = 0;
|
uint32_t order = 0;
|
||||||
|
uint32_t ce = 0;
|
||||||
|
|
||||||
uint16_t primary = 0;
|
uint16_t primary = 0;
|
||||||
|
uint8_t primary1 = 0;
|
||||||
|
uint8_t primary2 = 0;
|
||||||
|
uint8_t primary3 = 0;
|
||||||
uint8_t secondary = 0;
|
uint8_t secondary = 0;
|
||||||
uint8_t tertiary = 0;
|
uint8_t tertiary = 0;
|
||||||
|
|
||||||
@ -510,10 +535,119 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
primary = ((order & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT);
|
/* We're saving order in ce, since we will destroy order in order to get primary, secondary, tertiary in order ;)*/
|
||||||
secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT);
|
ce = order;
|
||||||
tertiary = (order & UCOL_TERTIARYORDERMASK);
|
|
||||||
|
|
||||||
|
|
||||||
|
tertiary = (order & UCOL_TERTIARYORDERMASK);
|
||||||
|
secondary = (order >>= 8) & 0xFF;
|
||||||
|
primary3 = 0; /* the third primary */
|
||||||
|
primary2 = (order >>= 8) & 0xFF;;
|
||||||
|
primary1 = order >>= 8;
|
||||||
|
|
||||||
|
if((tertiary & 0xF0) == 0xF0) { /* This indicates a long primary (11110000) */
|
||||||
|
/* Note: long primary can appear both as a normal CE or as a continuation CE (not that it matters much) */
|
||||||
|
primary3 = secondary;
|
||||||
|
secondary = (tertiary & 0x0F) + MIN_VALUE;
|
||||||
|
tertiary = UNMARKED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(upperFirst && !(isContinuation(ce))) {
|
||||||
|
/* Upper cases have this bit turned on, so that they always come after the lower cases */
|
||||||
|
/* if we want to reverse this situation, we'll flip this bit */
|
||||||
|
tertiary ^= 0x80;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In the code below, every increase in any of buffers is followed by the increase to */
|
||||||
|
/* sortKeySize - this might look tedious, but it is needed so that we can find out if */
|
||||||
|
/* we're using too much space and need to reallocate the primary buffer or easily bail */
|
||||||
|
/* out to ucol_getSortKeySizeNew. */
|
||||||
|
|
||||||
|
if(shifted && primary1 < UCOL_VARIABLE_MAX && primary1 > 0) {
|
||||||
|
/* We are dealing with a variable and we're treating them as shifted */
|
||||||
|
/* This is a shifted ignorable */
|
||||||
|
*quads++ = primary1;
|
||||||
|
sortKeySize++;
|
||||||
|
if(primary2 != 0) {
|
||||||
|
*quads++ = primary2;
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
|
||||||
|
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
|
||||||
|
/* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
|
||||||
|
if(primary1 != 0) {
|
||||||
|
*primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
if(primary2 != 0) {
|
||||||
|
*primaries++ = primary2; /* second part */
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
if(primary3 != 0) {
|
||||||
|
*primaries++ = primary2; /* third part */
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(compareSec && secondary != 0) { /* I think that != 0 test should be != IGNORABLE */
|
||||||
|
/* This thing should also contain the compression logic, as in: */
|
||||||
|
/*
|
||||||
|
if (ws == COMMON2 && COMMON2 <= secondary[-1] && secondary[-1] < COMMON_MAX2)
|
||||||
|
++secondary[-1]; // simply increment!!
|
||||||
|
else *secondary++ = ws;
|
||||||
|
*/
|
||||||
|
|
||||||
|
*secondaries++ = secondary;
|
||||||
|
sortKeySize++;
|
||||||
|
if(isFrenchSec) {
|
||||||
|
/* Do the special handling for French secondaries */
|
||||||
|
/* We need to get continuation elements and do intermediate restore */
|
||||||
|
/* abc1c2c3de with french secondaries needs to be edc1c2c3ba NOT edc3c2c1ba */
|
||||||
|
if(isContinuation(ce)) {
|
||||||
|
if (frenchStartPtr == NULL) {
|
||||||
|
frenchStartPtr = secondaries - 2;
|
||||||
|
}
|
||||||
|
frenchEndPtr = secondaries-1;
|
||||||
|
} else if (frenchStartPtr != NULL) {
|
||||||
|
/* reverse secondaries from frenchStartPtr up to frenchEndPtr */
|
||||||
|
uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
|
||||||
|
frenchStartPtr = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(doCase) {
|
||||||
|
if (caseShift == 0) {
|
||||||
|
*cases++ = 0x80;
|
||||||
|
sortKeySize++;
|
||||||
|
caseShift = 7;
|
||||||
|
}
|
||||||
|
*(cases-1) |= (tertiary & 0x80) >> (8-caseShift--);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(compareTer && tertiary != 0) { /* I think that != 0 test should be != IGNORABLE */
|
||||||
|
/* This thing should also contain the compression logic, as in: */
|
||||||
|
/*
|
||||||
|
if (ws == COMMON2 && COMMON2 <= secondary[-1] && secondary[-1] < COMMON_MAX2)
|
||||||
|
++secondary[-1]; // simply increment!!
|
||||||
|
else *secondary++ = ws;
|
||||||
|
*/
|
||||||
|
|
||||||
|
*tertiaries++ = tertiary;
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(compareQuad && shifted && primary1 > 0) {
|
||||||
|
*quads++ = 0xFF;
|
||||||
|
sortKeySize++;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* This is an old piece of code... I'm leaving it here just for discussion regarding */
|
||||||
|
/* ignorables and situations with primary ignorable vs. variable top and ignorables */
|
||||||
|
#if 0
|
||||||
|
/*
|
||||||
if(primary != UCOL_PRIMIGNORABLE) {
|
if(primary != UCOL_PRIMIGNORABLE) {
|
||||||
*(primaries++) = (primary>>8);
|
*(primaries++) = (primary>>8);
|
||||||
*(primaries++) = (primary&0xFF);
|
*(primaries++) = (primary&0xFF);
|
||||||
@ -541,26 +675,27 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
|
|||||||
sortKeySize++;
|
sortKeySize++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(sortKeySize>resultLength) {
|
*/
|
||||||
if(allocatePrimary == FALSE) {
|
#endif
|
||||||
resultOverflow = TRUE;
|
if(sortKeySize>resultLength) { /* We have stepped over the primary buffer */
|
||||||
sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len);
|
if(allocatePrimary == FALSE) { /* need to save our butts if we cannot reallocate */
|
||||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
resultOverflow = TRUE;
|
||||||
finished = TRUE;
|
sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len);
|
||||||
break;
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||||
/*goto cleanup;*/
|
finished = TRUE;
|
||||||
} else {
|
break;
|
||||||
uint8_t *newStart;
|
} else { /* It's much nicer if we can actually reallocate */
|
||||||
newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize);
|
uint8_t *newStart;
|
||||||
if(primStart == NULL) {
|
newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize);
|
||||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
if(primStart == NULL) {
|
||||||
finished = TRUE;
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||||
break;
|
finished = TRUE;
|
||||||
}
|
break;
|
||||||
primaries=newStart+(primaries-primStart);
|
|
||||||
resultLength = 2*sortKeySize;
|
|
||||||
primStart = *result = newStart;
|
|
||||||
}
|
}
|
||||||
|
primaries=newStart+(primaries-primStart);
|
||||||
|
resultLength = 2*sortKeySize;
|
||||||
|
primStart = *result = newStart;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(finished) {
|
if(finished) {
|
||||||
@ -580,14 +715,18 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
|
|||||||
if(compareSec) {
|
if(compareSec) {
|
||||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||||
uint32_t secsize = secondaries-secStart;
|
uint32_t secsize = secondaries-secStart;
|
||||||
if(ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON) { // do the reverse copy
|
if(isFrenchSec) { /* do the reverse copy */
|
||||||
for(i = 0; i<secsize; i++) {
|
/* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
|
||||||
*(primaries++) = *(secondaries-i-1);
|
if(frenchStartPtr != NULL) {
|
||||||
}
|
uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
|
||||||
} else {
|
}
|
||||||
uprv_memcpy(primaries, secStart, secsize);
|
for(i = 0; i<secsize; i++) {
|
||||||
primaries += secsize;
|
*(primaries++) = *(secondaries-i-1);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
uprv_memcpy(primaries, secStart, secsize);
|
||||||
|
primaries += secsize;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -234,7 +234,9 @@ ucol_cloneRuleData(UCollator *coll, int32_t *length, UErrorCode *status);
|
|||||||
#define UCOL_THAI 0xF3000000
|
#define UCOL_THAI 0xF3000000
|
||||||
|
|
||||||
#define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF)
|
#define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF)
|
||||||
|
#define isContinuation(CE) isSpecial((CE)) /* alias: a CE counts as a continuation exactly when isSpecial(CE) is true */
|
||||||
#define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT)
|
#define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT)
|
||||||
|
#define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG)) /* true when CE is special and its tag field equals CONTRACTION_TAG */
|
||||||
#define constructContractCE(CE) (UCOL_SPECIAL_FLAG | (CONTRACTION_TAG<<UCOL_TAG_SHIFT) | ((CE))&0xFFFFFF)
|
#define constructContractCE(CE) (UCOL_SPECIAL_FLAG | (CONTRACTION_TAG<<UCOL_TAG_SHIFT) | ((CE))&0xFFFFFF)
|
||||||
#define getContractOffset(CE) ((CE)&0xFFFFFF)
|
#define getContractOffset(CE) ((CE)&0xFFFFFF)
|
||||||
#define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4)
|
#define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4)
|
||||||
|
@ -233,6 +233,10 @@ U_CAPI UCollator*
|
|||||||
ucol_open( const char *loc,
|
ucol_open( const char *loc,
|
||||||
UErrorCode *status);
|
UErrorCode *status);
|
||||||
|
|
||||||
|
U_CAPI UCollatorNew*
|
||||||
|
ucol_openNew( const char *loc,
|
||||||
|
UErrorCode *status);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open a UCollator for comparing strings.
|
* Open a UCollator for comparing strings.
|
||||||
* The UCollator may be used in calls to \Ref{ucol_strcoll}.
|
* The UCollator may be used in calls to \Ref{ucol_strcoll}.
|
||||||
@ -264,6 +268,9 @@ ucol_openRules( const UChar *rules,
|
|||||||
U_CAPI void
|
U_CAPI void
|
||||||
ucol_close(UCollator *coll);
|
ucol_close(UCollator *coll);
|
||||||
|
|
||||||
|
U_CAPI void
|
||||||
|
ucol_closeNew(UCollatorNew *coll);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compare two strings.
|
* Compare two strings.
|
||||||
* The strings will be compared using the normalization mode and options
|
* The strings will be compared using the normalization mode and options
|
||||||
|
Loading…
Reference in New Issue
Block a user