ICU-96 sortkey algorithm is now in... just compiles for now...

X-SVN-Rev: 3387
This commit is contained in:
Vladimir Weinstein 2001-01-08 06:51:18 +00:00
parent e8272cf29b
commit 8eb80185f6
3 changed files with 179 additions and 31 deletions

View File

@ -148,7 +148,7 @@ ucol_openNew( const char *loc,
/* Do we need a name or other stuff? */ /* Do we need a name or other stuff? */
UCollatorNew *result = NULL; UCollatorNew *result = NULL;
UResourceBundle *b = ures_open(NULL, loc, status); UResourceBundle *b = ures_open(NULL, loc, status);
UResourceBundle *binary = ures_getByKey(b, "%%Collation", NULL, status); UResourceBundle *binary = ures_getByKey(b, "%%CollationNew", NULL, status);
if(*status = U_MISSING_RESOURCE_ERROR) { /* if we don't find tailoring, we'll fallback to UCA */ if(*status = U_MISSING_RESOURCE_ERROR) { /* if we don't find tailoring, we'll fallback to UCA */
result = UCA; result = UCA;
@ -409,6 +409,20 @@ uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *sec
return newStart; return newStart;
} }
/* Added to the low nibble of the tertiary byte to form the secondary weight of a long-primary CE. */
#define MIN_VALUE 0x02
/* Tertiary weight assigned to a CE that was recognised as a long primary. */
#define UNMARKED 0x03
/* With shifted alternate handling, a CE whose first primary byte is non-zero and below this value */
/* has its primary bytes written to the quaternary buffer instead of the primary buffer. */
#define UCOL_VARIABLE_MAX 0x20
/*
 * Reverses, in place, the bytes of the inclusive range [start, end].
 * Both pointers address elements of the same buffer; `end` points AT the
 * last byte to be swapped, not one past it. When start >= end the buffer
 * is left untouched.
 */
void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
    uint8_t *lo = start;
    uint8_t *hi = end;

    /* Walk the two cursors towards each other, exchanging one pair per step. */
    for (; lo < hi; ++lo, --hi) {
        uint8_t swapped = *hi;
        *hi = *lo;
        *lo = swapped;
    }
}
int32_t int32_t
ucol_calcSortKeyNew(const UCollatorNew *coll, ucol_calcSortKeyNew(const UCollatorNew *coll,
const UChar *source, const UChar *source,
@ -453,7 +467,14 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
UBool compareQuad = (strength >= UCOL_QUATERNARY); UBool compareQuad = (strength >= UCOL_QUATERNARY);
UBool compareIdent = (strength == UCOL_IDENTICAL); UBool compareIdent = (strength == UCOL_IDENTICAL);
UBool doCase = (ucol_getAttributeNew(coll, UCOL_CASE_LEVEL, status) == UCOL_ON); UBool doCase = (ucol_getAttributeNew(coll, UCOL_CASE_LEVEL, status) == UCOL_ON);
UBool lowerFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_LOWER_FIRST); UBool upperFirst = (ucol_getAttributeNew(coll, UCOL_CASE_FIRST, status) == UCOL_UPPER_FIRST);
UBool shifted = (ucol_getAttributeNew(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED);
UBool isFrenchSec = (ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
/* support for special features like caselevel and funky secondaries */
uint8_t *frenchStartPtr = NULL;
uint8_t *frenchEndPtr = NULL;
uint32_t caseShift = 0;
sortKeySize += ((compareSec?1:0) + (compareTer?1:0) + (doCase?1:0) + (compareQuad?1:0) + (compareIdent?1:0)); sortKeySize += ((compareSec?1:0) + (compareTer?1:0) + (doCase?1:0) + (compareQuad?1:0) + (compareIdent?1:0));
@ -490,8 +511,12 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
uint8_t *quadStart = quads; uint8_t *quadStart = quads;
uint32_t order = 0; uint32_t order = 0;
uint32_t ce = 0;
uint16_t primary = 0; uint16_t primary = 0;
uint8_t primary1 = 0;
uint8_t primary2 = 0;
uint8_t primary3 = 0;
uint8_t secondary = 0; uint8_t secondary = 0;
uint8_t tertiary = 0; uint8_t tertiary = 0;
@ -510,10 +535,119 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
break; break;
} }
primary = ((order & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT); /* We're saving order in ce, since we will destroy order in order to get primary, secondary, tertiary in order ;)*/
secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT); ce = order;
tertiary = (order & UCOL_TERTIARYORDERMASK);
tertiary = (order & UCOL_TERTIARYORDERMASK);
secondary = (order >>= 8) & 0xFF;
primary3 = 0; /* the third primary */
primary2 = (order >>= 8) & 0xFF;;
primary1 = order >>= 8;
if((tertiary & 0xF0) == 0xF0) { /* This indicates a long primary (11110000) */
/* Note: long primary can appear both as a normal CE or as a continuation CE (not that it matters much) */
primary3 = secondary;
secondary = (tertiary & 0x0F) + MIN_VALUE;
tertiary = UNMARKED;
}
if(upperFirst && !(isContinuation(ce))) {
/* Upper cases have this bit turned on, so that they always come after the lower cases */
/* if we want to reverse this situation, we'll flip this bit */
tertiary ^= 0x80;
}
/* In the code below, every increase in any of buffers is followed by the increase to */
/* sortKeySize - this might look tedious, but it is needed so that we can find out if */
/* we're using too much space and need to reallocate the primary buffer or easily bail */
/* out to ucol_getSortKeySizeNew. */
if(shifted && primary1 < UCOL_VARIABLE_MAX && primary1 > 0) {
/* We are dealing with a variable and we're treating them as shifted */
/* This is a shifted ignorable */
*quads++ = primary1;
sortKeySize++;
if(primary2 != 0) {
*quads++ = primary2;
sortKeySize++;
}
} else {
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
/* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
if(primary1 != 0) {
*primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
sortKeySize++;
}
if(primary2 != 0) {
*primaries++ = primary2; /* second part */
sortKeySize++;
}
if(primary3 != 0) {
*primaries++ = primary2; /* third part */
sortKeySize++;
}
if(compareSec && secondary != 0) { /* I think that != 0 test should be != IGNORABLE */
/* This thing should also contain the compression logic, as in: */
/*
if (ws == COMMON2 && COMMON2 <= secondary[-1] && secondary[-1] < COMMON_MAX2)
++secondary[-1]; // simply increment!!
else *secondary++ = ws;
*/
*secondaries++ = secondary;
sortKeySize++;
if(isFrenchSec) {
/* Do the special handling for French secondaries */
/* We need to get continuation elements and do intermediate restore */
/* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
if(isContinuation(ce)) {
if (frenchStartPtr == NULL) {
frenchStartPtr = secondaries - 2;
}
frenchEndPtr = secondaries-1;
} else if (frenchStartPtr != NULL) {
/* reverse secondaries from frenchStartPtr up to frenchEndPtr */
uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
frenchStartPtr = NULL;
}
}
}
if(doCase) {
if (caseShift == 0) {
*cases++ = 0x80;
sortKeySize++;
caseShift = 7;
}
*(cases-1) |= (tertiary & 0x80) >> (8-caseShift--);
}
if(compareTer && tertiary != 0) { /* I think that != 0 test should be != IGNORABLE */
/* This thing should also contain the compression logic, as in: */
/*
if (ws == COMMON2 && COMMON2 <= secondary[-1] && secondary[-1] < COMMON_MAX2)
++secondary[-1]; // simply increment!!
else *secondary++ = ws;
*/
*tertiaries++ = tertiary;
sortKeySize++;
}
if(compareQuad && shifted && primary1 > 0) {
*quads++ = 0xFF;
sortKeySize++;
}
}
/* This is an old piece of code... I'm leaving it here just for discussion regarding */
/* ignorables and situations with primary ignorable vs. variable top and ignorables */
#if 0
/*
if(primary != UCOL_PRIMIGNORABLE) { if(primary != UCOL_PRIMIGNORABLE) {
*(primaries++) = (primary>>8); *(primaries++) = (primary>>8);
*(primaries++) = (primary&0xFF); *(primaries++) = (primary&0xFF);
@ -541,26 +675,27 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
sortKeySize++; sortKeySize++;
} }
} }
if(sortKeySize>resultLength) { */
if(allocatePrimary == FALSE) { #endif
resultOverflow = TRUE; if(sortKeySize>resultLength) { /* We have stepped over the primary buffer */
sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len); if(allocatePrimary == FALSE) { /* need to save our butts if we cannot reallocate */
*status = U_MEMORY_ALLOCATION_ERROR; resultOverflow = TRUE;
finished = TRUE; sortKeySize = ucol_getSortKeySizeNew(coll, &s, sortKeySize, strength, len);
break; *status = U_MEMORY_ALLOCATION_ERROR;
/*goto cleanup;*/ finished = TRUE;
} else { break;
uint8_t *newStart; } else { /* It's much nicer if we can actually reallocate */
newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize); uint8_t *newStart;
if(primStart == NULL) { newStart = (uint8_t *)uprv_realloc(primStart, 2*sortKeySize);
*status = U_MEMORY_ALLOCATION_ERROR; if(primStart == NULL) {
finished = TRUE; *status = U_MEMORY_ALLOCATION_ERROR;
break; finished = TRUE;
} break;
primaries=newStart+(primaries-primStart);
resultLength = 2*sortKeySize;
primStart = *result = newStart;
} }
primaries=newStart+(primaries-primStart);
resultLength = 2*sortKeySize;
primStart = *result = newStart;
}
} }
} }
if(finished) { if(finished) {
@ -580,14 +715,18 @@ ucol_calcSortKeyNew(const UCollatorNew *coll,
if(compareSec) { if(compareSec) {
*(primaries++) = UCOL_LEVELTERMINATOR; *(primaries++) = UCOL_LEVELTERMINATOR;
uint32_t secsize = secondaries-secStart; uint32_t secsize = secondaries-secStart;
if(ucol_getAttributeNew(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON) { // do the reverse copy if(isFrenchSec) { /* do the reverse copy */
for(i = 0; i<secsize; i++) { /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
*(primaries++) = *(secondaries-i-1); if(frenchStartPtr != NULL) {
} uprv_ucol_reverse_buffer(frenchStartPtr, frenchEndPtr);
} else { }
uprv_memcpy(primaries, secStart, secsize); for(i = 0; i<secsize; i++) {
primaries += secsize; *(primaries++) = *(secondaries-i-1);
} }
} else {
uprv_memcpy(primaries, secStart, secsize);
primaries += secsize;
}
} }

View File

@ -234,7 +234,9 @@ ucol_cloneRuleData(UCollator *coll, int32_t *length, UErrorCode *status);
#define UCOL_THAI 0xF3000000 #define UCOL_THAI 0xF3000000
#define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF) #define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF)
/* A continuation CE is currently detected with exactly the same test as a special CE. */
#define isContinuation(CE) isSpecial((CE))
#define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT) #define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT)
/* True when CE is a special CE whose tag field equals CONTRACTION_TAG. */
#define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG))
#define constructContractCE(CE) (UCOL_SPECIAL_FLAG | (CONTRACTION_TAG<<UCOL_TAG_SHIFT) | ((CE))&0xFFFFFF) #define constructContractCE(CE) (UCOL_SPECIAL_FLAG | (CONTRACTION_TAG<<UCOL_TAG_SHIFT) | ((CE))&0xFFFFFF)
#define getContractOffset(CE) ((CE)&0xFFFFFF) #define getContractOffset(CE) ((CE)&0xFFFFFF)
#define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4) #define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4)

View File

@ -233,6 +233,10 @@ U_CAPI UCollator*
ucol_open( const char *loc, ucol_open( const char *loc,
UErrorCode *status); UErrorCode *status);
/**
 * Open a UCollatorNew for the locale named by loc.
 * NOTE(review): the implementation appears to look up the "%%CollationNew"
 * resource for the locale and to fall back to the UCA collator when no
 * tailoring is found - confirm against the implementation.
 * @param loc The locale name handed to the resource bundle lookup.
 * @param status A pointer to a UErrorCode that receives any errors.
 * @return A pointer to a UCollatorNew.
 */
U_CAPI UCollatorNew*
ucol_openNew( const char *loc,
UErrorCode *status);
/** /**
* Open a UCollator for comparing strings. * Open a UCollator for comparing strings.
* The UCollator may be used in calls to \Ref{ucol_strcoll}. * The UCollator may be used in calls to \Ref{ucol_strcoll}.
@ -264,6 +268,9 @@ ucol_openRules( const UChar *rules,
U_CAPI void U_CAPI void
ucol_close(UCollator *coll); ucol_close(UCollator *coll);
/**
 * NOTE(review): presumably the counterpart of ucol_openNew that releases a
 * UCollatorNew - the implementation is not visible here, confirm before relying on it.
 * @param coll The UCollatorNew to close.
 */
U_CAPI void
ucol_closeNew(UCollatorNew *coll);
/** /**
* Compare two strings. * Compare two strings.
* The strings will be compared using the normalization mode and options * The strings will be compared using the normalization mode and options