ICU-96 collation update
X-SVN-Rev: 3582
This commit is contained in:
parent
df7bd557f7
commit
bf9f0b3ef3
@ -682,6 +682,21 @@ u_charMirror(UChar32 c) {
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC uint8_t
|
||||
u_internalGetCombiningClass(UChar32 c) {
|
||||
uint32_t props=GET_PROPS_UNSAFE(c);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_NON_SPACING_MARK) {
|
||||
return (uint8_t)GET_UNSIGNED_VALUE(props);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
/* the combining class is in bits 23..16 of the first exception value */
|
||||
return (uint8_t)(*GET_EXCEPTIONS(props)>>16);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI uint8_t U_EXPORT2
|
||||
u_getCombiningClass(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cpputils.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
// added by synwee
|
||||
#include "unicode/uchar.h"
|
||||
@ -6672,10 +6673,15 @@ u_quickCheck(const UChar* source,
|
||||
UChar32 codepoint;
|
||||
UQUICK_CHECK_VALUES result = UQUICK_CHECK_YES;
|
||||
|
||||
if(u_getCombiningClass(0x300) == 0) {
|
||||
return UQUICK_CHECK_NO;
|
||||
}
|
||||
|
||||
while (count != sourcelength)
|
||||
{
|
||||
UTF16_NEXT_CHAR_SAFE(source, count, sourcelength, codepoint, TRUE);
|
||||
combiningclass = u_getCombiningClass(codepoint);
|
||||
/*UTF16_NEXT_CHAR_SAFE(source, count, sourcelength, codepoint, TRUE);*/
|
||||
UTF_NEXT_CHAR(source, count, sourcelength, codepoint);
|
||||
combiningclass = u_internalGetCombiningClass(codepoint);
|
||||
|
||||
// not in canonical order
|
||||
if (oldcombiningclass > combiningclass && combiningclass != 0)
|
||||
|
@ -46,4 +46,12 @@ u_internalStrToUpper(UChar *dest, int32_t destCapacity,
|
||||
GrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Internal, somewhat faster version of u_getCombiningClass()
|
||||
* for use by normalization quick check etc.
|
||||
* First make sure that data is loaded by u_getCombiningClass(0x300)!=0
|
||||
*/
|
||||
U_CFUNC uint8_t
|
||||
u_internalGetCombiningClass(UChar32 c);
|
||||
|
||||
#endif
|
||||
|
@ -490,7 +490,7 @@ uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UEr
|
||||
collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
|
||||
}
|
||||
} else if(collationSource->pos < collationSource->len) { /* This is the real business now */
|
||||
UChar ch = *collationSource->pos;
|
||||
UChar ch = *collationSource->pos++;
|
||||
if(ch <= 0xFF) { /* if it's Latin One, we'll try to fast track it */
|
||||
order = coll->latinOneMapping[ch]; /* by looking in up in an array */
|
||||
} else { /* otherwise, */
|
||||
@ -503,7 +503,7 @@ uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UEr
|
||||
order = ucol_getNextUCA(ch, collationSource, status);
|
||||
}
|
||||
}
|
||||
collationSource->pos++; /* we're advancing to the next codepoint */
|
||||
//collationSource->pos++; /* we're advancing to the next codepoint */
|
||||
} else {
|
||||
order = UCOL_NO_MORE_CES; /* if so, we won't play any more */
|
||||
}
|
||||
@ -530,7 +530,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
|
||||
UChar nextChar;
|
||||
if(UTF_IS_FIRST_SURROGATE(ch)) {
|
||||
if( (collationSource->pos<collationSource->len) &&
|
||||
UTF_IS_SECOND_SURROGATE((nextChar=*(collationSource->pos+1)))) {
|
||||
UTF_IS_SECOND_SURROGATE((nextChar=*collationSource->pos))) {
|
||||
uint32_t cp = (((ch)<<10UL)+(nextChar)-((0xd800<<10UL)+0xdc00));
|
||||
collationSource->pos++;
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
@ -578,6 +578,7 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
/* Thai/Lao reordering */
|
||||
if(source->isThai == TRUE) { /* if we encountered Thai prevowel & the string is not yet touched */
|
||||
source->isThai = FALSE; /* We will touch the string */
|
||||
--source->pos;
|
||||
if((source->len - source->pos) > UCOL_WRITABLE_BUFFER_SIZE) {
|
||||
/* Problematic part - if the stack buffer is too small, we need to allocate */
|
||||
/* However, somebody needs to keep track of that allocated space */
|
||||
@ -598,7 +599,7 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
*(targetCopy++) = *(sourceCopy++);
|
||||
}
|
||||
}
|
||||
source->pos = source->writableBuffer-1;
|
||||
source->pos = source->writableBuffer;
|
||||
source->len = targetCopy;
|
||||
source->CEpos = source->toReturn = source->CEs;
|
||||
CE = UCOL_IGNORABLE;
|
||||
@ -616,12 +617,13 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
/* we need to convey the notion of having a backward search - most probably through the context object */
|
||||
/* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */
|
||||
UCharOffset++; /* skip the backward offset, see above */
|
||||
|
||||
if (source->pos>=source->len) { /* this is the end of string */
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); /* So we'll pick whatever we have at the point... */
|
||||
source->pos--; /* I think, since we'll advance in the getCE */
|
||||
break;
|
||||
}
|
||||
schar = *(++source->pos);
|
||||
|
||||
schar = *source->pos++;
|
||||
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
|
||||
UCharOffset++;
|
||||
}
|
||||
@ -633,8 +635,6 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
}
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||
if(!isContraction(CE)) {
|
||||
/* Maybe not */
|
||||
/*source->pos--;*/ /* I think, since we'll advance in the getCE */
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -655,7 +655,6 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
*(source->CEpos++) = *CEOffset++;
|
||||
}
|
||||
}
|
||||
/*source->toReturn++;*/
|
||||
return CE;
|
||||
break;
|
||||
case CHARSET_TAG:
|
||||
@ -740,6 +739,7 @@ ucol_getSortKey(const UCollator *coll,
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
return ucol_calcSortKey(coll, source, sourceLength, &result, resultLength, FALSE, &status);
|
||||
/*return ucol_calcSortKeySimpleTertiary(coll, source, sourceLength, &result, resultLength, FALSE, &status);*/
|
||||
}
|
||||
|
||||
/* this function is called by the C++ API for sortkey generation */
|
||||
@ -1045,8 +1045,8 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
for(;;) {
|
||||
for(i=prevBuffSize; i<minBufferSize; ++i) {
|
||||
|
||||
order = ucol_getNextCE(coll, &s, status);
|
||||
/*UCOL_GETNEXTCE(order, coll, s, status);*/
|
||||
/*order = ucol_getNextCE(coll, &s, status);*/
|
||||
UCOL_GETNEXTCE(order, coll, s, status);
|
||||
|
||||
if((order & 0xFFFFFFBF) == 0) {
|
||||
continue;
|
||||
@ -1072,6 +1072,7 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
if(notIsContinuation) {
|
||||
/* it appears tht something should be done with the case bit */
|
||||
/* however, it is not clear when */
|
||||
/* TODO : continuations also have case bits now, should this go out of the if */
|
||||
if(upperFirst) { /* if there is a case bit */
|
||||
/* Upper cases have this bit turned on, so that they always come after the lower cases */
|
||||
/* if we want to reverse this situation, we'll flip this bit */
|
||||
@ -1168,7 +1169,7 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
*cases++ = 0x80;
|
||||
caseShift = 7;
|
||||
}
|
||||
if(tertiary > 0) {
|
||||
if(tertiary != 0) {
|
||||
*(cases-1) |= caseBit << (caseShift--);
|
||||
}
|
||||
}
|
||||
@ -1201,6 +1202,7 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
*tertiaries++ = tertiary;
|
||||
}
|
||||
}
|
||||
|
||||
if(shifted && notIsContinuation) {
|
||||
count4++;
|
||||
}
|
||||
@ -1379,6 +1381,263 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
return sortKeySize;
|
||||
}
|
||||
|
||||
int32_t
|
||||
ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
||||
const UChar *source,
|
||||
int32_t sourceLength,
|
||||
uint8_t **result,
|
||||
int32_t resultLength,
|
||||
UBool allocatePrimary,
|
||||
UErrorCode *status)
|
||||
{
|
||||
uint32_t i = 0; /* general purpose counter */
|
||||
|
||||
/* Stack allocated buffers for buffers we use */
|
||||
uint8_t second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER];
|
||||
|
||||
uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(primaries == NULL && allocatePrimary == TRUE) {
|
||||
primaries = *result = (uint8_t *)uprv_malloc(2*UCOL_MAX_BUFFER);
|
||||
resultLength = 2*UCOL_MAX_BUFFER;
|
||||
}
|
||||
uint8_t *primarySafeEnd = primaries + resultLength - 2;
|
||||
|
||||
int32_t primSize = resultLength, secSize = UCOL_MAX_BUFFER, terSize = UCOL_MAX_BUFFER;
|
||||
|
||||
int32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
|
||||
|
||||
UChar normBuffer[UCOL_NORMALIZATION_GROWTH*UCOL_MAX_BUFFER];
|
||||
UChar *normSource = normBuffer;
|
||||
int32_t normSourceLen = UCOL_NORMALIZATION_GROWTH*UCOL_MAX_BUFFER;
|
||||
|
||||
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
|
||||
|
||||
uint8_t variableMax1 = coll->variableMax1;
|
||||
uint8_t variableMax2 = coll->variableMax2;
|
||||
|
||||
collIterate s;
|
||||
init_collIterate((UChar *)source, len, &s, FALSE);
|
||||
|
||||
/* If we need to normalize, we'll do it all at once at the beggining! */
|
||||
UColAttributeValue normMode = coll->normalizationMode;
|
||||
if((normMode != UCOL_OFF) && (u_quickCheck(source, len, UNORM_NFC, status) != UQUICK_CHECK_YES)) {
|
||||
normSourceLen = u_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
|
||||
if(U_FAILURE(*status)) {
|
||||
*status=U_ZERO_ERROR;
|
||||
normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar));
|
||||
normSourceLen = u_normalize(source, sourceLength, UNORM_NFD, 0, normSource, (normSourceLen+1), status);
|
||||
}
|
||||
normSource[normSourceLen] = 0;
|
||||
s.string = normSource;
|
||||
s.pos = normSource;
|
||||
s.len = normSource+normSourceLen;
|
||||
}
|
||||
|
||||
len = s.len-s.pos;
|
||||
|
||||
if(resultLength == 0) {
|
||||
return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
|
||||
}
|
||||
|
||||
int32_t minBufferSize = UCOL_MAX_BUFFER;
|
||||
|
||||
uint8_t *primStart = primaries;
|
||||
uint8_t *secStart = secondaries;
|
||||
uint8_t *terStart = tertiaries;
|
||||
|
||||
uint32_t order = 0;
|
||||
uint32_t ce = 0;
|
||||
|
||||
uint8_t primary1 = 0;
|
||||
uint8_t primary2 = 0;
|
||||
uint8_t secondary = 0;
|
||||
uint8_t tertiary = 0;
|
||||
|
||||
int32_t prevBuffSize = 0;
|
||||
|
||||
UBool finished = FALSE;
|
||||
UBool resultOverflow = FALSE;
|
||||
UBool notIsContinuation = FALSE;
|
||||
|
||||
uint32_t count2 = 0, count3 = 0;
|
||||
|
||||
for(;;) {
|
||||
for(i=prevBuffSize; i<minBufferSize; ++i) {
|
||||
|
||||
/*order = ucol_getNextCE(coll, &s, status);*/
|
||||
UCOL_GETNEXTCE(order, coll, s, status);
|
||||
|
||||
if((order & 0xFFFFFFBF) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(order == UCOL_NO_MORE_CES) {
|
||||
finished = TRUE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* We're saving order in ce, since we will destroy order in order to get primary, secondary, tertiary in order ;)*/
|
||||
ce = order;
|
||||
notIsContinuation = !isContinuation(ce);
|
||||
|
||||
tertiary = (order & 0x3f); /* this is temporary - removing case bit */
|
||||
secondary = (order >>= 8) & 0xFF;
|
||||
primary2 = (order >>= 8) & 0xFF;;
|
||||
primary1 = order >>= 8;
|
||||
|
||||
/* In the code below, every increase in any of buffers is followed by the increase to */
|
||||
/* sortKeySize - this might look tedious, but it is needed so that we can find out if */
|
||||
/* we're using too much space and need to reallocate the primary buffer or easily bail */
|
||||
/* out to ucol_getSortKeySizeNew. */
|
||||
|
||||
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
|
||||
/* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
|
||||
if(primary1 != UCOL_NEW_IGNORABLE) {
|
||||
*primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
|
||||
if(primary2 != UCOL_NEW_IGNORABLE) {
|
||||
*primaries++ = primary2; /* second part */
|
||||
}
|
||||
}
|
||||
|
||||
/* This is compression code. */
|
||||
if (secondary == UCOL_COMMON2 && notIsContinuation) {
|
||||
++count2;
|
||||
} else {
|
||||
if (count2 > 0) {
|
||||
if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
|
||||
while (count2 >= UCOL_TOP_COUNT2) {
|
||||
*secondaries++ = UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2;
|
||||
count2 -= UCOL_TOP_COUNT2;
|
||||
}
|
||||
*secondaries++ = UCOL_COMMON_TOP2 - count2;
|
||||
} else {
|
||||
while (count2 >= UCOL_BOT_COUNT2) {
|
||||
*secondaries++ = UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2;
|
||||
count2 -= UCOL_BOT_COUNT2;
|
||||
}
|
||||
*secondaries++ = UCOL_COMMON_BOT2 + count2;
|
||||
}
|
||||
count2 = 0;
|
||||
}
|
||||
*secondaries++ = secondary;
|
||||
}
|
||||
|
||||
|
||||
/* This is compression code. */
|
||||
/* sequence size check is included in the if clause */
|
||||
if (tertiary == UCOL_COMMON3 && notIsContinuation) {
|
||||
++count3;
|
||||
} else {
|
||||
if(tertiary > UCOL_COMMON3) {
|
||||
tertiary |= UCOL_FLAG_BIT_MASK;
|
||||
}
|
||||
if (count3 > 0) {
|
||||
if (tertiary > UCOL_COMMON3) {
|
||||
while (count3 >= UCOL_TOP_COUNT3) {
|
||||
*tertiaries++ = UCOL_COMMON_TOP3 - UCOL_TOP_COUNT3;
|
||||
count3 -= UCOL_TOP_COUNT3;
|
||||
}
|
||||
*tertiaries++ = UCOL_COMMON_TOP3 - count3;
|
||||
} else {
|
||||
while (count3 >= UCOL_BOT_COUNT3) {
|
||||
*tertiaries++ = UCOL_COMMON_BOT3 + UCOL_BOT_COUNT3;
|
||||
count3 -= UCOL_BOT_COUNT3;
|
||||
}
|
||||
*tertiaries++ = UCOL_COMMON_BOT3 + count3;
|
||||
}
|
||||
count3 = 0;
|
||||
}
|
||||
*tertiaries++ = tertiary;
|
||||
}
|
||||
|
||||
if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
|
||||
int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
|
||||
if(allocatePrimary == FALSE) { /* need to save our butts if we cannot reallocate */
|
||||
resultOverflow = TRUE;
|
||||
sortKeySize = ucol_getSortKeySize(coll, &s, sks, coll->strength, len);
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
finished = TRUE;
|
||||
break;
|
||||
} else { /* It's much nicer if we can actually reallocate */
|
||||
uint8_t *newStart;
|
||||
newStart = (uint8_t *)uprv_realloc(primStart, 2*sks);
|
||||
if(primStart == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
finished = TRUE;
|
||||
break;
|
||||
}
|
||||
primaries=newStart+(primaries-primStart);
|
||||
resultLength = 2*sks;
|
||||
primStart = *result = newStart;
|
||||
primarySafeEnd = primStart + resultLength - 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(finished) {
|
||||
break;
|
||||
} else {
|
||||
prevBuffSize = minBufferSize;
|
||||
secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, status);
|
||||
terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, status);
|
||||
minBufferSize *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*status)) {
|
||||
sortKeySize += (primaries - primStart);
|
||||
/* we have done all the CE's, now let's put them together to form a key */
|
||||
if (count2 > 0) {
|
||||
while (count2 >= UCOL_BOT_COUNT2) {
|
||||
*secondaries++ = UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2;
|
||||
count2 -= UCOL_BOT_COUNT2;
|
||||
}
|
||||
*secondaries++ = UCOL_COMMON_BOT2 + count2;
|
||||
}
|
||||
uint32_t secsize = secondaries-secStart;
|
||||
sortKeySize += secsize;
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
/* Need overflow test here */
|
||||
uprv_memcpy(primaries, secStart, secsize);
|
||||
primaries += secsize;
|
||||
|
||||
if (count3 > 0) {
|
||||
while (count3 >= UCOL_BOT_COUNT3) {
|
||||
*tertiaries++ = UCOL_COMMON_BOT3 + UCOL_BOT_COUNT3;
|
||||
count3 -= UCOL_BOT_COUNT3;
|
||||
}
|
||||
*tertiaries++ = UCOL_COMMON_BOT3 + count3;
|
||||
}
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
uint32_t tersize = tertiaries - terStart;
|
||||
sortKeySize += tersize;
|
||||
/* Need overflow test here */
|
||||
uprv_memcpy(primaries, terStart, tersize);
|
||||
primaries += tersize;
|
||||
|
||||
*(primaries++) = '\0';
|
||||
} else {
|
||||
/* This is wrong - we should return a key size - not set it to zero */
|
||||
sortKeySize = 0;
|
||||
}
|
||||
|
||||
if(terStart != tert) {
|
||||
uprv_free(terStart);
|
||||
uprv_free(secStart);
|
||||
}
|
||||
|
||||
if(normSource != normBuffer) {
|
||||
uprv_free(normSource);
|
||||
}
|
||||
|
||||
return sortKeySize;
|
||||
}
|
||||
|
||||
/* This is a trick string compare function that goes in and uses sortkeys to compare */
|
||||
/* It is used when compare gets in trouble and needs to bail out */
|
||||
UCollationResult ucol_compareUsingSortKeys(const UCollator *coll,
|
||||
|
@ -185,7 +185,7 @@ static uint8_t utf16fixup[32] = {
|
||||
(collationSource).CEpos = (collationSource).toReturn = (collationSource).CEs; \
|
||||
} \
|
||||
} else if((collationSource).pos < (collationSource).len) { \
|
||||
UChar ch = *(collationSource).pos; \
|
||||
UChar ch = *(collationSource).pos++; \
|
||||
if(ch <= 0xFF) { \
|
||||
(order) = (coll)->latinOneMapping[ch]; \
|
||||
} else { \
|
||||
@ -197,7 +197,6 @@ static uint8_t utf16fixup[32] = {
|
||||
(order) = ucol_getNextUCA(ch, &(collationSource), (status)); \
|
||||
} \
|
||||
} \
|
||||
(collationSource).pos++; \
|
||||
} else { \
|
||||
(order) = UCOL_NO_MORE_CES; \
|
||||
} \
|
||||
@ -227,6 +226,14 @@ ucol_calcSortKey(const UCollator *coll,
|
||||
UBool allocatePrimary,
|
||||
UErrorCode *status);
|
||||
|
||||
int32_t
|
||||
ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
||||
const UChar *source,
|
||||
int32_t sourceLength,
|
||||
uint8_t **result,
|
||||
int32_t resultLength,
|
||||
UBool allocatePrimary,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Makes a copy of the Collator's rule data. The format is
|
||||
|
Loading…
Reference in New Issue
Block a user