Added normalization to contraction.
Note this is not totally working yet.

X-SVN-Rev: 4538
This commit is contained in:
Syn Wee Quek 2001-04-24 03:18:54 +00:00
parent 4c635dfeca
commit 662d4ab558

View File

@ -603,9 +603,6 @@ void ucol_initUCA(UErrorCode *status) {
} }
} }
/* collIterNormalize Incremental Normalization happens here. */ /* collIterNormalize Incremental Normalization happens here. */
/* pick up the range of chars identified by FCD, */ /* pick up the range of chars identified by FCD, */
/* normalize it into the collIterate's writable buffer, */ /* normalize it into the collIterate's writable buffer, */
@ -620,6 +617,15 @@ void collIterNormalize(collIterate *collationSource)
normLen = unorm_normalize(srcP, endP-srcP, UNORM_NFD, 0, collationSource->writableBuffer, normLen = unorm_normalize(srcP, endP-srcP, UNORM_NFD, 0, collationSource->writableBuffer,
collationSource->writableBufSize, &status); collationSource->writableBufSize, &status);
if (normLen == collationSource->writableBufSize) {
UChar *temp = (UChar *)uprv_malloc((normLen+1)*sizeof(UChar));
uprv_memcpy(temp, collationSource->writableBuffer, normLen * sizeof(UChar));
temp[normLen] = 0;
if (collationSource->writableBuffer != collationSource->stackWritableBuffer) {
uprv_free( collationSource->writableBuffer);
}
collationSource->writableBuffer = temp;
}
if (U_FAILURE(status)) { /* This would be buffer overflow */ if (U_FAILURE(status)) { /* This would be buffer overflow */
if (status == U_BUFFER_OVERFLOW_ERROR) { if (status == U_BUFFER_OVERFLOW_ERROR) {
if (collationSource->writableBuffer != collationSource->stackWritableBuffer) { if (collationSource->writableBuffer != collationSource->stackWritableBuffer) {
@ -636,6 +642,7 @@ void collIterNormalize(collIterate *collationSource)
return; return;
} }
} }
collationSource->pos = collationSource->writableBuffer; collationSource->pos = collationSource->writableBuffer;
collationSource->origFlags = collationSource->flags; collationSource->origFlags = collationSource->flags;
collationSource->flags |= UCOL_ITER_INNORMBUF; collationSource->flags |= UCOL_ITER_INNORMBUF;
@ -656,7 +663,7 @@ void collIterNormalize(collIterate *collationSource)
/* the trailing combining class of the previous char was zero. */ /* the trailing combining class of the previous char was zero. */
/* True because the previous call to this function will have always exited */ /* True because the previous call to this function will have always exited */
/* that way, and we get called for every char where cc might be non-zero. */ /* that way, and we get called for every char where cc might be non-zero. */
inline void collIterFCD(collIterate *collationSource) { inline UBool collIterFCD(collIterate *collationSource) {
UChar32 codepoint; UChar32 codepoint;
UChar *srcP; UChar *srcP;
int length; int length;
@ -710,22 +717,16 @@ inline void collIterFCD(collIterate *collationSource) {
} }
} }
collationSource->fcdPosition = srcP + count; collationSource->fcdPosition = srcP + (count - 1);
if (codepoint == 0 && (collationSource->flags & UCOL_ITER_HASLEN)==0) { if (codepoint == 0 && (collationSource->flags & UCOL_ITER_HASLEN)==0) {
// We checked the string's trailing null, which would advance fcdPosition past the null. // We checked the string's trailing null, which would advance fcdPosition past the null.
// back it up to point to the null. // back it up to point to the null.
collationSource->fcdPosition--; collationSource->fcdPosition--;
} }
if (needNormalize) { return needNormalize;
collIterNormalize(collationSource);
}
} }
/****************************************************************************/ /****************************************************************************/
/* Following are the CE retrieval functions */ /* Following are the CE retrieval functions */
/* */ /* */
@ -827,7 +828,9 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
} }
// Need a more complete FCD check and possible normalization. // Need a more complete FCD check and possible normalization.
collIterFCD(collationSource); if (collIterFCD(collationSource)) {
collIterNormalize(collationSource);
}
if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
// No normalization was needed. Go ahead and process the char we already had. // No normalization was needed. Go ahead and process the char we already had.
break; break;
@ -875,7 +878,7 @@ U_CAPI uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSour
void collPrevIterNormalize(collIterate *data) void collPrevIterNormalize(collIterate *data)
{ {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
UChar *pEnd = data->pos + 1; /* End normalize + 1 */ UChar *pEnd = data->pos; /* End normalize + 1 */
UChar *pStart; UChar *pStart;
uint32_t normLen; uint32_t normLen;
UChar *pStartNorm; UChar *pStartNorm;
@ -888,8 +891,9 @@ void collPrevIterNormalize(collIterate *data)
pStart = data->fcdPosition + 1; pStart = data->fcdPosition + 1;
} }
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
data->writableBuffer, 0, &status); data->writableBuffer, 0, &status);
if (data->writableBufSize <= normLen) { if (data->writableBufSize <= normLen) {
if (data->writableBuffer != data->stackWritableBuffer) { if (data->writableBuffer != data->stackWritableBuffer) {
uprv_free( data->writableBuffer); uprv_free( data->writableBuffer);
@ -906,8 +910,8 @@ void collPrevIterNormalize(collIterate *data)
*/ */
pStartNorm = data->writableBuffer + (data->writableBufSize - normLen); pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
*(pStartNorm - 1) = 0; *(pStartNorm - 1) = 0;
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
&status); normLen, &status);
data->pos = data->writableBuffer + data->writableBufSize; data->pos = data->writableBuffer + data->writableBufSize;
data->origFlags = data->flags; data->origFlags = data->flags;
@ -1326,13 +1330,14 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
//*(collationSource->CEpos++) = 0x04000080 | (ch & 0x001F) << 27; //*(collationSource->CEpos++) = 0x04000080 | (ch & 0x001F) << 27;
} }
// we must skip all 00, 01, 02 bytes, so most bytes have 253 values /*
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case) we must skip all 00, 01, 02 bytes, so most bytes have 253 values
// we shift so that HAN all has the same first primary, for compression. we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// for the 4 byte case, we make the gap as large as we can fit. we shift so that HAN all has the same first primary, for compression.
// Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1) for the 4 byte case, we make the gap as large as we can fit.
// Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14) Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
*/
int32_t last0 = cp - IMPLICIT_BOUNDARY_; int32_t last0 = cp - IMPLICIT_BOUNDARY_;
uint32_t r = 0; uint32_t r = 0;
@ -1527,12 +1532,200 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
return order; /* return the CE */ return order; /* return the CE */
} }
/**
 * Appends the argument character to the end of the null-terminated text held
 * in the writable buffer and writes a new null terminator after it.
 * If the character and the terminator do not both fit, a larger heap buffer
 * is allocated, the existing text is copied into it and it replaces
 * data->writableBuffer (the old buffer is freed unless it is the stack
 * buffer). data->writableBufSize is updated to the new capacity.
 * data->pos is not adjusted by this function.
 * If the allocation fails the buffer is left unchanged and the character is
 * not appended; there is no status parameter to report the failure through.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
inline void insertBufferEnd(collIterate *data, UChar ch)
{
    /* extra room added on growth, since more characters are likely to follow */
    const uint32_t incsize = 5;
    uint32_t size = data->writableBufSize;
    uint32_t length = u_strlen(data->writableBuffer);
    UChar *newbuffer;

    /*
    ch goes to index length and the terminator to index length + 1, so both
    indices have to be inside the buffer.
    */
    if (size > length + 1) {
        UChar *end = data->writableBuffer + length;
        *end = ch;
        *(end + 1) = 0;
        return;
    }

    /*
    buffer will always be null terminated at the end.
    giving extra space since it is likely that more characters will be added.
    */
    size += incsize;
    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
    if (newbuffer == NULL) {
        /* out of memory: keep the current buffer intact */
        return;
    }
    uprv_memcpy(newbuffer, data->writableBuffer, length * sizeof(UChar));
    /* append through an index so newbuffer keeps pointing at the allocation */
    newbuffer[length] = ch;
    newbuffer[length + 1] = 0;

    if (data->writableBuffer != data->stackWritableBuffer) {
        uprv_free(data->writableBuffer);
    }
    data->writableBufSize = size;
    data->writableBuffer = newbuffer;
}
/**
 * Special normalization function for contraction in the forwards iterator.
 * Normalizes (NFD) the source range [data->pos, data->fcdPosition) and
 * appends the result to the writable buffer, after the text that is already
 * kept there:
 * - with UCOL_ITER_HASLEN set, the buffer is restarted with the single
 *   character at data->pos - 1;
 * - otherwise the existing null-terminated buffer contents are kept.
 * The buffer is reallocated on the heap when the combined text plus its
 * terminator does not fit.
 * On return data->pos points at the first appended character inside the
 * buffer, data->origFlags holds the previous flags, UCOL_ITER_INNORMBUF is
 * set and UCOL_ITER_NORM / UCOL_ITER_HASLEN are cleared.
 * data->fcdPosition is only read.
 * If the buffer cannot be grown the function returns without switching the
 * iterator into the buffer (there is no status parameter to report through).
 * @param data collation iterator data
 */
inline void normalizeNextContraction(collIterate *data)
{
    UChar *buffer = data->writableBuffer;
    uint32_t buffersize = data->writableBufSize;
    uint32_t strsize;
    UErrorCode status = U_ZERO_ERROR;
    /* data->pos - 1 is already in buffer */
    UChar *pStart = data->pos;
    UChar *pEnd;
    uint32_t normLen;
    UChar *pStartNorm;

    if (data->flags & UCOL_ITER_HASLEN) {
        *data->writableBuffer = *(pStart - 1);
        strsize = 1;
    }
    else {
        strsize = u_strlen(data->writableBuffer);
    }

    pEnd = data->fcdPosition;
    /* zero-capacity call: only used to obtain the normalized length */
    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
                              &status);

    if (buffersize <= normLen + strsize) {
        /* kept text + normalized text + null terminator */
        uint32_t size = strsize + normLen + 1;
        UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
        if (temp == NULL) {
            /* out of memory: leave the iterator state as it was */
            return;
        }
        uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
        if (data->writableBuffer != data->stackWritableBuffer) {
            uprv_free(buffer);
        }
        data->writableBuffer = temp;
        data->writableBufSize = size;
    }

    /* the length-only call above reports an overflow; start clean */
    status = U_ZERO_ERROR;
    /*
    always write through the current buffer: after a reallocation the local
    'buffer' refers to the old (possibly freed) storage.
    */
    pStartNorm = data->writableBuffer + strsize;
    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
                    normLen + 1, &status);
    /* guarantee null-termination; room for it was reserved above */
    pStartNorm[normLen] = 0;

    data->pos = data->writableBuffer + strsize;
    data->origFlags = data->flags;
    data->flags |= UCOL_ITER_INNORMBUF;
    data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}
/**
* Contraction character management function that returns the next character
* for the forwards iterator and advances data->pos past it.
* Does nothing special if normalization is off, or if the next character is
* in the normalization buffer and is not the buffer's null terminator; the
* character at data->pos is returned directly.
* Else it checks the next character in the data string to see if it needs
* normalization (quick check against NFC_ZERO_CC_BLOCK_LIMIT_, then
* collIterFCD). If it does not, and the iterator is in the buffer, the
* character is simply appended to the buffer with insertBufferEnd; else the
* normalized substring is appended to the buffer by normalizeNextContraction.
* @param data collation element iterator data
* @return next character
*/
inline UChar getNextNormalizedChar(collIterate *data)
{
UChar nextch = 0;
UChar ch;
if ((data->flags & UCOL_ITER_NORM) == 0 ||
((data->flags & UCOL_ITER_INNORMBUF) && *data->pos != 0)) {
/*
if no normalization.
if next character is in normalized buffer, no further normalization
is required
*/
return *(data->pos ++);
}
/* candidate character; re-read below if pos gets repositioned */
ch = *(data->pos);
if (data->flags & UCOL_ITER_HASLEN) {
/* in data string */
if (data->pos + 1 == data->endp) {
/* last character of the string: there is no following character to check */
data->pos ++;
return ch;
}
nextch = *(data->pos + 1);
}
else {
if (data->flags & UCOL_ITER_INNORMBUF) {
/*
in writable buffer, at this point fcdPosition can not be
pointing to the end of the data string. see contracting tag.
Reaching here also means *data->pos == 0 (the first test above failed),
i.e. the buffer is exhausted and reading continues at fcdPosition.
*/
if (data->fcdPosition == data->endp - 1) {
/* at the end of the string, just dump it into the normalizer */
insertBufferEnd(data, *(data->fcdPosition));
data->pos = data->endp;
return *(data->fcdPosition);
}
/* continue reading at fcdPosition */
data->pos = data->fcdPosition;
}
else {
/* not in the buffer and no explicit length: a 0 after pos ends the text */
if (*(data->pos + 1) == 0) {
return *(data->pos ++);
}
}
/* pos may have moved above, so fetch the current and following characters */
ch = *(data->pos);
nextch = *(data->pos + 1);
}
/*
* if the current character is not fcd.
* Trailing combining class == 0.
* The full collIterFCD check is only made when pos is beyond fcdPosition and
* ch or nextch is at or above NFC_ZERO_CC_BLOCK_LIMIT_; it is evaluated last
* because of the short-circuit &&.
*/
if (data->fcdPosition < data->pos &&
(nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
ch >= NFC_ZERO_CC_BLOCK_LIMIT_) && collIterFCD(data)) {
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
*/
normalizeNextContraction(data);
/* normalizeNextContraction repointed pos into the buffer; step over one unit */
data->pos ++;
return ch;
}
if (data->flags & UCOL_ITER_INNORMBUF) {
/*
no normalization is to be done hence only one character will be
appended to the buffer.
*/
insertBufferEnd(data, ch);
}
/* points back to the pos in string */
data->pos ++;
return ch;
}
/* This function handles the special CEs like contractions, expansions, surrogates, Thai */ /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by both getNextCE and getNextUCA */ /* It is called by both getNextCE and getNextUCA */
uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) { uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) {
uint32_t i = 0; /* general counter */ uint32_t i = 0; /* general counter */
uint32_t firstCE = UCOL_NOT_FOUND; uint32_t firstCE = UCOL_NOT_FOUND;
UChar *firstUChar = source->pos; UChar *firstUChar = source->pos;
collIterateState state;
backupState(source, &state);
//uint32_t CE = *source->CEpos; //uint32_t CE = *source->CEpos;
for (;;) { for (;;) {
const uint32_t *CEOffset = NULL; const uint32_t *CEOffset = NULL;
@ -1581,16 +1774,19 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
/* First we position ourselves at the begining of contraction sequence */ /* First we position ourselves at the begining of contraction sequence */
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if ((source->flags & UCOL_ITER_HASLEN) && source->pos>=source->endp) { if (((source->flags & UCOL_ITER_HASLEN) && source->pos>=source->endp)
|| ((source->flags & UCOL_ITER_INNORMBUF) && *source->pos == 0 &&
source->fcdPosition >= source->endp)) {
/* this is the end of string. (Null terminated handled later, /* this is the end of string. (Null terminated handled later,
when the null doesn't match the contraction sequence.) */ when the null doesn't match the contraction sequence.) */
{ {
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); /* So we'll pick whatever we have at the point... */ CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); /* So we'll pick whatever we have at the point... */
if (CE == UCOL_NOT_FOUND) { if (CE == UCOL_NOT_FOUND) {
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */ // source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
if(firstCE != UCOL_NOT_FOUND) { if(firstCE != UCOL_NOT_FOUND) {
CE = firstCE; CE = firstCE;
} }
loadState(source, &state);
} }
} }
break; break;
@ -1601,7 +1797,7 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
UCharOffset++; /* skip the backward offset, see above */ UCharOffset++; /* skip the backward offset, see above */
schar = *source->pos++; schar = getNextNormalizedChar(source);
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++; UCharOffset++;
} }
@ -1613,7 +1809,8 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
if(CE == UCOL_NOT_FOUND) { if(CE == UCOL_NOT_FOUND) {
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */ // source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
loadState(source, &state);
if(firstCE != UCOL_NOT_FOUND) { if(firstCE != UCOL_NOT_FOUND) {
CE = firstCE; CE = firstCE;
} }
@ -1624,7 +1821,8 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
if(tempCE != UCOL_NOT_FOUND) { if(tempCE != UCOL_NOT_FOUND) {
firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
firstUChar = source->pos-1; /* firstUChar = source->pos-1; */
backupState(source, &state);
} }
} else { } else {
break; break;
@ -1664,8 +1862,6 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
/** /**
* Inserts the argument character into the front of the buffer replacing the * Inserts the argument character into the front of the buffer replacing the
* front null terminator. * front null terminator.
* Repoints the pos pointer to the next character in the writablebuffer.
* Changes the flags up to date.
* @param data collation element iterator data * @param data collation element iterator data
* @param ch character to be appended * @param ch character to be appended
*/ */
@ -1680,7 +1876,9 @@ inline void insertBufferFront(collIterate *data, UChar ch)
if (*end == 0) { if (*end == 0) {
*end = ch; *end = ch;
*(end - 1) = 0; *(end - 1) = 0;
return;
} }
end --;
} }
/* /*
@ -1798,7 +1996,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
UChar ch; UChar ch;
UChar *start; UChar *start;
if ((data->flags & UCOL_ITER_NORM) == 0 || if ((data->flags & UCOL_ITER_NORM) == 0 ||
(data->flags & UCOL_ITER_INNORMBUF) || *(data->pos - 1) != 0) { ((data->flags & UCOL_ITER_INNORMBUF) && *(data->pos - 1) != 0)) {
/* /*
if no normalization. if no normalization.
if previous character is in normalized buffer, no further normalization if previous character is in normalized buffer, no further normalization
@ -1959,8 +2157,10 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
/* this is the start of string */ /* this is the start of string */
CE = *(coll->contractionCEs + CE = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex)); (UCharOffset - coll->contractionIndex));
if (CE == UCOL_NOT_FOUND && firstCE != UCOL_NOT_FOUND) { if (CE == UCOL_NOT_FOUND) {
CE = firstCE; if (firstCE != UCOL_NOT_FOUND) {
CE = firstCE;
}
loadState(source, &state); loadState(source, &state);
} }