ICU-1007 call more efficient normalization, especially FCD instead of NFD when possible, also use growBuffer in some places

X-SVN-Rev: 5937
This commit is contained in:
Markus Scherer 2001-09-27 01:01:30 +00:00
parent aaf6902e7d
commit 1305c4495f
2 changed files with 110 additions and 134 deletions

View File

@ -450,7 +450,7 @@ ucol_openRules( const UChar *rules,
}
newRules[rulesLength]=0;
result->rules = newRules;
// ### TODO: should store rulesLength in result in case there are embedded NULs, right?!
result->rulesLength = rulesLength;
result->freeRulesOnClose = TRUE;
result->rb = 0;
ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
@ -632,9 +632,10 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
result->scriptOrder = NULL;
result->zero = 0;
result->rules = NULL;
/* get the version info form UCATableHeader and populate the Collator struct*/
result->rulesLength = 0;
/* get the version info from UCATableHeader and populate the Collator struct*/
result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
@ -745,43 +746,37 @@ void collIterNormalize(collIterate *collationSource)
UErrorCode status = U_ZERO_ERROR;
UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
uint32_t normLen;
int32_t normLen;
normLen = unorm_normalize(srcP, endP-srcP, UNORM_NFD, 0, collationSource->writableBuffer,
collationSource->writableBufSize, &status);
if (normLen == collationSource->writableBufSize) {
UChar *temp = (UChar *)uprv_malloc((normLen+1)*sizeof(UChar));
uprv_memcpy(temp, collationSource->writableBuffer, normLen * sizeof(UChar));
temp[normLen] = 0;
freeHeapWritableBuffer(collationSource);
collationSource->writableBuffer = temp;
}
if (U_FAILURE(status)) { /* This would be buffer overflow */
if (status == U_BUFFER_OVERFLOW_ERROR) {
freeHeapWritableBuffer(collationSource);
collationSource->writableBuffer = (UChar *)uprv_malloc((normLen+1)*sizeof(UChar));
collationSource->flags |= UCOL_ITER_ALLOCATED;
/* to enable null termination */
collationSource->writableBufSize = normLen + 1;
status = U_ZERO_ERROR;
unorm_normalize(srcP, endP-srcP, UNORM_NFD, 0, collationSource->writableBuffer,
collationSource->writableBufSize, &status);
if (status != U_ZERO_ERROR) {
normLen = unorm_decompose(&collationSource->writableBuffer, (int32_t *)&collationSource->writableBufSize,
srcP, (int32_t)(endP - srcP),
FALSE, FALSE,
u_growBufferFromStatic, collationSource->stackWritableBuffer,
&status);
if (U_FAILURE(status)) {
#ifdef UCOL_DEBUG
fprintf(stderr, "collIterNormalize(), normalize #2 failed, status = %d\n", status);
fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
#endif
return;
}
collationSource->writableBuffer[normLen] = 0;
}
else {
return;
}
if(status == U_STRING_NOT_TERMINATED_WARNING) {
// reallocate and terminate
if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
&collationSource->writableBuffer,
(int32_t *)&collationSource->writableBufSize, normLen + 1,
normLen)
) {
#ifdef UCOL_DEBUG
fprintf(stderr, "collIterNormalize(), normalize #1 failed, status = %d\n", status);
fprintf(stderr, "collIterNormalize(), out of memory\n");
#endif
return;
}
collationSource->writableBuffer[normLen] = 0;
}
if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
collationSource->flags |= UCOL_ITER_ALLOCATED;
}
collationSource->pos = collationSource->writableBuffer;
collationSource->origFlags = collationSource->flags;
collationSource->flags |= UCOL_ITER_INNORMBUF;
@ -1060,6 +1055,7 @@ void collPrevIterNormalize(collIterate *data)
freeHeapWritableBuffer(data);
data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
sizeof(UChar));
data->flags |= UCOL_ITER_ALLOCATED;
/* to handle the zero termination */
data->writableBufSize = normLen + 1;
}
@ -1467,6 +1463,7 @@ inline void normalizeNextContraction(collIterate *data)
freeHeapWritableBuffer(data);
data->writableBuffer = temp;
data->writableBufSize = size;
data->flags |= UCOL_ITER_ALLOCATED;
}
status = U_ZERO_ERROR;
@ -2801,14 +2798,13 @@ ucol_getSortKey(const UCollator *coll,
}
/* this function is called by the C++ API for sortkey generation */
U_CFUNC uint8_t *ucol_getSortKeyWithAllocation(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
int32_t *resultLen) {
uint8_t *result = NULL;
UErrorCode status = U_ZERO_ERROR;
*resultLen = coll->sortKeyGen(coll, source, sourceLength, &result, 0, TRUE, &status);
return result;
/*
 * Sort-key entry point used by the C++ API: a thin wrapper that forwards to
 * the collator's sortKeyGen function pointer.
 *
 * coll          collator whose sortKeyGen callback is invoked
 * source        text to generate the key for
 * sourceLength  length of source, passed through unchanged
 * pResult       out: reset to 0 here, then handed to sortKeyGen to fill in
 * pErrorCode    in/out error code, passed straight through to sortKeyGen
 *
 * Returns whatever sortKeyGen returns.
 * NOTE(review): presumably the length of the generated key - confirm against
 * the sortKeyGen implementations.
 * NOTE(review): the literal 0 and TRUE arguments look like "no caller-supplied
 * capacity" and "let sortKeyGen allocate the buffer"; if so, the caller owns
 * *pResult afterwards - confirm against sortKeyGen's signature.
 */
U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator *coll,
const UChar *source, int32_t sourceLength,
uint8_t **pResult,
UErrorCode *pErrorCode) {
/* Clear the out-pointer before delegating so sortKeyGen starts from a null result. */
*pResult = 0;
return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
}
#define UCOL_FSEC_BUF_SIZE 256
@ -3171,37 +3167,32 @@ ucol_calcSortKey(const UCollator *coll,
sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + (qShifted?1:0)/*(compareQuad?0:1)*/ + (compareIdent?1:0));
/* If we need to normalize, we'll do it all at once at the beginning! */
UNormalizationMode normMode;
if(compareIdent) {
normMode = UNORM_NFD;
} else if(coll->normalizationMode != UCOL_OFF) {
normMode = UNORM_FCD;
} else {
normMode = UNORM_NONE;
}
if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
len = unorm_internalNormalize(&normSource, &normSourceLen,
source, len,
normMode, FALSE,
u_growBufferFromStatic, normBuffer,
status);
if(U_FAILURE(*status)) {
return 0;
}
source = normSource;
}
collIterate s;
IInit_collIterate(coll, (UChar *)source, len, &s);
/* If we need to normalize, we'll do it all at once at the beggining! */
UColAttributeValue normMode = coll->normalizationMode;
if(compareIdent) {
if(unorm_quickCheck(source, len, UNORM_NFD, status) != UNORM_YES) {
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
*status=U_ZERO_ERROR;
normSource = (UChar *) uprv_malloc(normSourceLen*sizeof(UChar));
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
}
IInit_collIterate(coll, normSource, normSourceLen, &s);
if(source == normSource) {
s.flags &= ~UCOL_ITER_NORM;
len = normSourceLen;
}
} else if((normMode != UCOL_OFF)
/* changed by synwee */
&& UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
*status=U_ZERO_ERROR;
normSource = (UChar *) uprv_malloc(normSourceLen*sizeof(UChar));
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
}
IInit_collIterate(coll, normSource, normSourceLen, &s);
s.flags &= ~UCOL_ITER_NORM;
len = normSourceLen;
}
if(resultLength == 0 || primaries == NULL) {
@ -3666,28 +3657,25 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
int32_t len = sourceLength;
/* If we need to normalize, we'll do it all at once at the beginning! */
if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
len = unorm_internalNormalize(&normSource, &normSourceLen,
source, len,
UNORM_FCD, FALSE,
u_growBufferFromStatic, normBuffer,
status);
if(U_FAILURE(*status)) {
return 0;
}
source = normSource;
}
collIterate s;
IInit_collIterate(coll, (UChar *)source, len, &s);
/* If we need to normalize, we'll do it all at once at the beggining! */
UColAttributeValue normMode = coll->normalizationMode;
if(normMode != UCOL_OFF) {
if (UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
*status=U_ZERO_ERROR;
normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar));
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, (normSourceLen+1), status);
}
IInit_collIterate(coll, normSource, normSourceLen, &s);
s.flags &= ~(UCOL_ITER_NORM);
len = normSourceLen;
}
if(source == normSource) {
s.flags &= ~UCOL_ITER_NORM;
}
if(resultLength == 0 || primaries == NULL) {
int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
if(normSource != normBuffer) {
@ -4361,12 +4349,14 @@ ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int3
return len+UCAlen;
}
static const UChar _NUL = 0;
U_CAPI const UChar*
ucol_getRules( const UCollator *coll,
int32_t *length)
{
if(coll->rules != NULL) {
*length = u_strlen(coll->rules);
*length = coll->rulesLength;
return coll->rules;
} else {
UErrorCode status = U_ZERO_ERROR;
@ -4375,13 +4365,14 @@ ucol_getRules( const UCollator *coll,
if(U_SUCCESS(status)) {
/*Semantic const */
((UCollator *)coll)->rules = ures_getStringByKey(collElem, "Sequence", length, &status);
((UCollator *)coll)->rulesLength = *length;
((UCollator *)coll)->freeRulesOnClose = FALSE;
ures_close(collElem);
return coll->rules;
}
}
*length = 0;
return &coll->zero;
return &_NUL;
}
}
@ -4493,48 +4484,40 @@ UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo
int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
UChar *tBuf = tColl->string;
// uint32_t compLen = 0;
uint32_t normLength;
UErrorCode status = U_ZERO_ERROR;
UCollationResult result;
UBool sAlloc = FALSE;
UBool tAlloc = FALSE;
if (normalize) {
UErrorCode status;
status = U_ZERO_ERROR;
if (unorm_quickCheck(sColl->string, sLen, UNORM_NFD, &status) != UNORM_YES) {
sLen = unorm_decompose(&sColl->writableBuffer, (int32_t *)&sColl->writableBufSize,
sBuf, sLen,
FALSE, FALSE,
u_growBufferFromStatic, sColl->stackWritableBuffer,
&status);
sBuf = sColl->writableBuffer;
normLength = unorm_normalize(sColl->string, sLen, UNORM_NFD, 0,
sBuf, UCOL_WRITABLE_BUFFER_SIZE, &status);
if (U_FAILURE(status)) { /*this would be buffer overflow */
sBuf = (UChar *)uprv_malloc((normLength+1)*sizeof(UChar));
sAlloc = TRUE;
status = U_ZERO_ERROR;
normLength = unorm_normalize(sColl->string, sLen, UNORM_NFD, 0, sBuf, normLength+1, &status);
if (sBuf != sColl->stackWritableBuffer) {
sColl->flags |= UCOL_ITER_ALLOCATED;
}
sLen = normLength;
}
status = U_ZERO_ERROR;
if (unorm_quickCheck(tColl->string, tLen, UNORM_NFD, &status) != UNORM_YES) {
tLen = unorm_decompose(&tColl->writableBuffer, (int32_t *)&tColl->writableBufSize,
tBuf, tLen,
FALSE, FALSE,
u_growBufferFromStatic, tColl->stackWritableBuffer,
&status);
tBuf = tColl->writableBuffer;
normLength = unorm_normalize(tColl->string, tLen, UNORM_NFD, 0,
tBuf, UCOL_WRITABLE_BUFFER_SIZE, &status);
if (U_FAILURE(status)) { /*this would be buffer overflow */
tBuf = (UChar *)uprv_malloc((normLength+1)*sizeof(UChar));
tAlloc = TRUE;
status = U_ZERO_ERROR;
normLength = unorm_normalize(tColl->string, tLen, UNORM_NFD, 0, tBuf, normLength+1, &status);
if (tBuf != tColl->stackWritableBuffer) {
tColl->flags |= UCOL_ITER_ALLOCATED;
}
tLen = normLength;
}
}
if (sLen == -1 && tLen == -1) {
comparison = u_strcmpCodePointOrder(sBuf, tBuf);
}
else
{
} else {
if (sLen == -1) {
sLen = u_strlen(sBuf);
}
@ -4542,28 +4525,18 @@ UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo
tLen = u_strlen(tBuf);
}
comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
}
result = UCOL_LESS;
if (comparison > 0) {
result = UCOL_GREATER;
}
else if (comparison == 0) {
if(sLen > tLen) {
result = UCOL_GREATER;
} else if (sLen == tLen){
result = UCOL_EQUAL;
if (comparison == 0) {
comparison = sLen - tLen;
}
}
if (sAlloc) {
uprv_free(sBuf);
if (comparison < 0) {
return UCOL_LESS;
} else if (comparison == 0) {
return UCOL_EQUAL;
} else /* comparison > 0 */ {
return UCOL_GREATER;
}
if (tAlloc) {
uprv_free(tBuf);
}
return result;
}
/* CEBuf - A struct and some inline functions to handle the saving */

View File

@ -250,9 +250,11 @@ U_CAPI uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll,
collIterate *collationSource,
UErrorCode *status);
/* function used by C++ getCollationKey to prevent restarting the calculation */
U_CFUNC uint8_t *ucol_getSortKeyWithAllocation(const UCollator *coll,
const UChar *source,
int32_t sourceLength, int32_t *resultLen);
U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator *coll,
const UChar *source, int32_t sourceLength,
uint8_t **pResult,
UErrorCode *pErrorCode);
/* get some memory */
void *ucol_getABuffer(const UCollator *coll, uint32_t size);
@ -607,8 +609,9 @@ struct UCollator {
/* to speed up things, we use the UCA image, but we don't want it */
/* to run around */
const UChar *rules;
int32_t rulesLength;
UBool freeRulesOnClose;
UChar zero;
UDataInfo dataInfo; /* Data info of UCA table */
UErrorCode errorCode; /* internal error code */