ICU-880 UnsafeCP and Contraction End hash tables, merge UCA data into all of them.

X-SVN-Rev: 4653
This commit is contained in:
Andy Heninger 2001-05-11 01:13:08 +00:00
parent 6db6b757bd
commit 22be543774
2 changed files with 179 additions and 191 deletions

View File

@ -90,14 +90,14 @@ static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
static const uint32_t IMPLICIT_LAST_COUNT2_ =
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
static const uint32_t IMPLICIT_LAST_COUNT2_ =
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
IMPLICIT_LAST_COUNT2_;
inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
@ -147,43 +147,43 @@ inline void backupState(const collIterate *data, collIterateState *backup)
* Loads the state into the collIterate struct data
* @param data collIterate to load the saved state into
* @param backup storage
* @param forwards boolean to indicate if forwards iteration is used,
* @param forwards boolean to indicate if forwards iteration is used,
* false indicates backwards iteration
*/
inline void loadState(collIterate *data, const collIterateState *backup,
inline void loadState(collIterate *data, const collIterateState *backup,
UBool forwards)
{
data->flags = backup->flags;
data->origFlags = backup->origFlags;
data->pos = backup->pos;
if ((data->flags & UCOL_ITER_INNORMBUF) &&
if ((data->flags & UCOL_ITER_INNORMBUF) &&
data->writableBuffer != backup->bufferaddress) {
/*
this is when a new buffer has been reallocated and we'll have to
/*
this is when a new buffer has been reallocated and we'll have to
calculate the new position.
note the new buffer has to contain the contents of the old buffer.
*/
if (forwards) {
data->pos = data->writableBuffer +
data->pos = data->writableBuffer +
(data->pos - backup->bufferaddress);
}
else {
/* backwards direction */
uint32_t temp = backup->buffersize -
uint32_t temp = backup->buffersize -
(data->pos - backup->bufferaddress);
data->pos = data->writableBuffer + (data->writableBufSize - temp);
}
}
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
/*
/*
this is a little tricky.
if we are initially not in the normalization buffer, even if we
if we are initially not in the normalization buffer, even if we
normalize in the later stage, the data in the buffer will be
ignored, since we skip back up to the data string.
however if we are already in the normalization buffer, any
further normalization will pull data into the normalization
further normalization will pull data into the normalization
buffer and modify the fcdPosition.
since we are keeping the data in the buffer for use, the
since we are keeping the data in the buffer for use, the
fcdPosition can not be reverted back.
arrgghh....
*/
@ -308,7 +308,7 @@ ucol_close(UCollator *coll)
}
uprv_free(coll);
}
U_CAPI UCollator*
ucol_openRules( const UChar *rules,
int32_t rulesLength,
@ -327,7 +327,7 @@ ucol_openRules( const UChar *rules,
case UNORM_NFD:
norm = UCOL_ON;
break;
case UCOL_DEFAULT_NORMALIZATION:
case UCOL_DEFAULT_NORMALIZATION:
case UCOL_DEFAULT:
norm = UCOL_DEFAULT;
break;
@ -483,7 +483,6 @@ static const uint16_t *FCD_STAGE_3_;
inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
if (c < coll->minUnsafeCP) {
return FALSE;
}
@ -500,18 +499,7 @@ inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
}
htbyte = coll->unsafeCP[hash>>3];
if (((htbyte >> (hash & 7)) & 1) == 1) {
return TRUE;
}
/* TODO: main UCA table data needs to be merged into tailoring tables, */
/* and this second level of test removed from here. */
if (coll == UCA || UCA == NULL) {
return FALSE;
}
htbyte = UCA->unsafeCP[hash>>3];
return ((htbyte >> (hash & 7)) & 1) == 1;
return (((htbyte >> (hash & 7)) & 1) == 1);
}
inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
@ -526,21 +514,11 @@ inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
}
htbyte = coll->contrEndCP[hash>>3];
if (((htbyte >> (hash & 7)) & 1) == 1) {
return TRUE;
}
/* TODO: main UCA table data needs to be merged into tailoring tables, */
/* and this second level of test removed from here. */
if (coll == UCA || UCA == NULL) {
return FALSE;
}
htbyte = UCA->contrEndCP[hash>>3];
return ((htbyte >> (hash & 7)) & 1) == 1;
return (((htbyte >> (hash & 7)) & 1) == 1);
}
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UErrorCode *status) {
UChar c;
UCollator *result = fillIn;
@ -791,17 +769,17 @@ inline UBool collIterFCD(collIterate *collationSource) {
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
}
}
collationSource->fcdPosition = srcP + count;
// if (codepoint == 0 && (collationSource->flags & UCOL_ITER_HASLEN)==0) {
/*
We checked the string's trailing null, which would advance
/*
We checked the string's trailing null, which would advance
fcdPosition past the null. back it up to point to the null.
*/
/*collationSource->fcdPosition--;
}*/
return needNormalize;
}
@ -860,7 +838,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
if (ch == 0) {
// Ran off end of buffer.
if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
// Ran off end of main string.
// Ran off end of main string.
return UCOL_NO_MORE_CES;
}
else
@ -971,7 +949,7 @@ void collPrevIterNormalize(collIterate *data)
normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
data->writableBuffer, 0, &status);
if (data->writableBufSize <= normLen) {
freeHeapWritableBuffer(data);
data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
@ -980,13 +958,13 @@ void collPrevIterNormalize(collIterate *data)
data->writableBufSize = normLen + 1;
}
status = U_ZERO_ERROR;
/*
/*
this puts the null termination in front of the normalized string instead
of the end
*/
pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
*(pStartNorm - 1) = 0;
unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
normLen, &status);
data->pos = data->writableBuffer + data->writableBufSize;
@ -1008,10 +986,10 @@ void collPrevIterNormalize(collIterate *data)
* True because the previous call to this function will have always exited
* that way, and we get called for every char where cc might be non-zero.
* @param data collation iterate struct
* @return normalization status, TRUE for normalization to be done, FALSE
* @return normalization status, TRUE for normalization to be done, FALSE
* otherwise
*/
inline UBool collPrevIterFCD(collIterate *data)
inline UBool collPrevIterFCD(collIterate *data)
{
UChar32 codepoint;
uint8_t leadingCC;
@ -1106,7 +1084,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
side buffer / original string, and we need to start again to get the
next character.
*/
for (;;) {
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
/*
@ -1409,7 +1387,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
//*(collationSource->CEpos++) = 0x04000080 | (ch & 0x001F) << 27;
}
/*
/*
we must skip all 00, 01, 02 bytes, so most bytes have 253 values
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
we shift so that HAN all has the same first primary, for compression.
@ -1554,7 +1532,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
prevChar = *prev;
/* Handles Han and Supplementary characters here.*/
if (UTF_IS_FIRST_SURROGATE(prevChar))
if (UTF_IS_FIRST_SURROGATE(prevChar))
{
cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
collationSource->pos = prev;
@ -1581,7 +1559,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
for the 4 byte case, we make the gap as large as we can fit.
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
*/
*/
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
uint32_t r = 0;
@ -1597,10 +1575,10 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
last0 %= IMPLICIT_LAST_COUNT2_;
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
last1 %= IMPLICIT_OTHER_COUNT_;
r = 0xEF030303 + (last2 << 16) + (last1 << 8) +
r = 0xEF030303 + (last2 << 16) + (last1 << 8) +
(last0 * IMPLICIT_LAST2_MULTIPLIER_);
}
/*
/*
order = (r & 0xFFFF0000) | 0x00000303;
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x00000080;
*/
@ -1612,14 +1590,14 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
}
/**
* Inserts the argument character into the end of the buffer pushing back the
* Inserts the argument character into the end of the buffer pushing back the
* null terminator.
* @param data collIterate struct data
* @param pNull pointer to the null termination
* @param ch character to be appended
* @return the position of the new addition
*/
inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
{
uint32_t size = data->writableBufSize;
UChar *newbuffer;
@ -1631,13 +1609,13 @@ inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
return pNull;
}
/*
/*
buffer will always be null terminated at the end.
giving extra space since it is likely that more characters will be added.
*/
size += incsize;
newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
uprv_memcpy(newbuffer, data->writableBuffer,
uprv_memcpy(newbuffer, data->writableBuffer,
data->writableBufSize * sizeof(UChar));
freeHeapWritableBuffer(data);
@ -1651,7 +1629,7 @@ inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
}
/**
* Inserts the argument string into the end of the buffer pushing back the
* Inserts the argument string into the end of the buffer pushing back the
* null terminator.
* @param data collIterate struct data
* @param pNull pointer to the null termination
@ -1660,18 +1638,18 @@ inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
* @return the position of the new addition
*/
inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
int length)
int length)
{
uint32_t size = pNull - data->writableBuffer;
UChar *newbuffer;
if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
uprv_memcpy(pNull, str, length * sizeof(UChar));
*(pNull + length) = 0;
return pNull;
}
/*
/*
buffer will always be null terminated at the end.
giving extra space since it is likely that more characters will be added.
*/
@ -1690,19 +1668,19 @@ inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
* Special normalization function for contraction in the forwards iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
* @param data collation iterator data
*/
inline void normalizeNextContraction(collIterate *data)
{
{
UChar *buffer = data->writableBuffer;
uint32_t buffersize = data->writableBufSize;
uint32_t strsize;
UErrorCode status = U_ZERO_ERROR;
/* because the pointer points to the next character */
UChar *pStart = data->pos - 1;
UChar *pStart = data->pos - 1;
UChar *pEnd;
uint32_t normLen;
UChar *pStartNorm;
@ -1715,9 +1693,9 @@ inline void normalizeNextContraction(collIterate *data)
strsize = u_strlen(data->writableBuffer);
}
pEnd = data->fcdPosition;
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
pEnd = data->fcdPosition;
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
&status);
if (buffersize <= normLen + strsize) {
@ -1732,9 +1710,9 @@ inline void normalizeNextContraction(collIterate *data)
status = U_ZERO_ERROR;
pStartNorm = buffer + strsize;
/* null-termination will be added here */
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
normLen + 1, &status);
data->pos = data->writableBuffer + strsize;
data->origFlags = data->flags;
data->flags |= UCOL_ITER_INNORMBUF;
@ -1744,16 +1722,16 @@ inline void normalizeNextContraction(collIterate *data)
/**
* Contraction character management function that returns the next character
* for the forwards iterator.
* Does nothing if the next character is in buffer and not the first character
* Does nothing if the next character is in buffer and not the first character
* in it.
* Else it checks next character in data string to see if it is normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @return next character
*/
inline UChar getNextNormalizedChar(collIterate *data)
inline UChar getNextNormalizedChar(collIterate *data)
{
UChar nextch;
UChar ch;
@ -1761,9 +1739,9 @@ inline UChar getNextNormalizedChar(collIterate *data)
UChar *pEndWritableBuffer = NULL;
if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
(innormbuf && *data->pos != 0) ||
(data->fcdPosition != NULL && !innormbuf &&
(data->fcdPosition != NULL && !innormbuf &&
data->pos < data->fcdPosition)) {
/*
/*
if no normalization and not in buffer.
if next character is in normalized buffer, no further normalization
is required
@ -1779,14 +1757,14 @@ inline UChar getNextNormalizedChar(collIterate *data)
}
else {
if (innormbuf) {
/*
in writable buffer, at this point fcdPosition can not be
/*
in writable buffer, at this point fcdPosition can not be
pointing to the end of the data string. see contracting tag.
*/
if (*(data->fcdPosition + 1) == 0 ||
data->fcdPosition + 1 == data->endp) {
/* at the end of the string, dump it into the normalizer */
data->pos = insertBufferEnd(data, data->pos,
data->pos = insertBufferEnd(data, data->pos,
*(data->fcdPosition)) + 1;
return *(data->fcdPosition ++);
}
@ -1802,41 +1780,41 @@ inline UChar getNextNormalizedChar(collIterate *data)
ch = *data->pos ++;
nextch = *data->pos;
/*
/*
* if the current character is not fcd.
* Trailing combining class == 0.
*/
if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
(nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
*/
if (collIterFCD(data)) {
normalizeNextContraction(data);
return *(data->pos ++);
}
else if (innormbuf) {
/* fcdposition shifted even when there's no normalization, if we
don't input the rest into this, we'll get the wrong position when
/* fcdposition shifted even when there's no normalization, if we
don't input the rest into this, we'll get the wrong position when
we reach the end of the writableBuffer */
int length = data->fcdPosition - data->pos + 1;
data->pos = insertBufferEnd(data, pEndWritableBuffer,
data->pos = insertBufferEnd(data, pEndWritableBuffer,
data->pos - 1, length);
return *(data->pos ++);
}
}
if (innormbuf) {
/*
no normalization is to be done hence only one character will be
/*
no normalization is to be done hence only one character will be
appended to the buffer.
*/
data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
}
/* points back to the pos in string */
return ch;
}
@ -1859,7 +1837,7 @@ inline uint32_t getDiscontiguos(const UCollator *coll, collIterate *source,
UChar *tempdb = buffer;
const UChar *tempconstart = constart;
uint8_t tempflags = source->flags;
*tempdb = *(source->pos - 1);
tempdb ++;
while (TRUE) {
@ -1868,15 +1846,15 @@ inline uint32_t getDiscontiguos(const UCollator *coll, collIterate *source,
tchar;
uint32_t result;
if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
|| (*source->pos == 0 &&
if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
|| (*source->pos == 0 &&
((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
source->fcdPosition == NULL ||
source->fcdPosition == NULL ||
source->fcdPosition == source->endp ||
*(source->fcdPosition) == 0 ||
u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
/* end of string in null terminated string or stopped by a
null character, note fcd does not always point to a base
/* end of string in null terminated string or stopped by a
null character, note fcd does not always point to a base
character after the discontiguos change */
u_getCombiningClass(*(source->pos)) == 0) {
break;
@ -1884,42 +1862,42 @@ inline uint32_t getDiscontiguos(const UCollator *coll, collIterate *source,
UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
schar = getNextNormalizedChar(source);
while (schar > (tchar = *UCharOffset)) {
while (schar > (tchar = *UCharOffset)) {
UCharOffset++;
}
if (schar != tchar) {
/* not the correct codepoint. we stuff the current codepoint into
if (schar != tchar) {
/* not the correct codepoint. we stuff the current codepoint into
the discontiguos buffer and try the next character */
*tempdb = schar;
tempdb ++;
continue;
}
else {
if (u_getCombiningClass(schar) ==
if (u_getCombiningClass(schar) ==
u_getCombiningClass(*(source->pos - 2))) {
*tempdb = schar;
tempdb ++;
continue;
}
result = *(coll->contractionCEs +
result = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
}
*tempdb = 0;
if (result == UCOL_NOT_FOUND) {
break;
} else if (isContraction(result)) {
} else if (isContraction(result)) {
/* this is a multi-contraction*/
tempconstart = (UChar *)coll->image + getContractOffset(result);
} else {
/* okay confusing part here. to ensure that the skipped characters
are considered later, we need to place it in the appropriate
are considered later, we need to place it in the appropriate
position in the normalization buffer and reassign the pos pointer.
simple case if pos reside in string, simply copy to normalization
simple case if pos reside in string, simply copy to normalization
buffer and fcdposition = pos, pos = start of normalization buffer.
if pos in normalization buffer, we'll insert the copy infront of
if pos in normalization buffer, we'll insert the copy infront of
pos and point pos to the start of the normalization buffer.
why am i doing these copies? well, so that the whole chunk of codes
in the getNextCE, getSpecialCE does not require any changes, i can
@ -1938,7 +1916,7 @@ inline uint32_t getDiscontiguos(const UCollator *coll, collIterate *source,
length = u_strlen(buffer);
if (length >= source->writableBufSize) {
freeHeapWritableBuffer(source);
source->writableBuffer =
source->writableBuffer =
(UChar *)uprv_malloc((length + 1) * sizeof(UChar));
source->writableBufSize = length;
}
@ -1949,13 +1927,13 @@ inline uint32_t getDiscontiguos(const UCollator *coll, collIterate *source,
return result;
}
}
/* no problems simply reverting just like that,
if we are in string before getting into this function, points back to
/* no problems simply reverting just like that,
if we are in string before getting into this function, points back to
string hence no problem.
if we are in normalization buffer before getting into this function,
since we'll never use another normalization within this function, we
know that fcdposition points to a base character. the normalization buffer
if we are in normalization buffer before getting into this function,
since we'll never use another normalization within this function, we
know that fcdposition points to a base character. the normalization buffer
never change, hence this revert works. */
source->pos = temppos - 1;
source->flags = tempflags;
@ -2018,18 +1996,18 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
/* First we position ourselves at the beginning of the contraction sequence */
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if (source->pos == source->endp ||
if (source->pos == source->endp ||
/* end of string in non-null terminated string */
(*source->pos == 0 && (source->flags & UCOL_ITER_HASLEN) == 0 &&
/* end of null-termination string or normalization buffer */
((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
/* end of null-terminated string */
source->fcdPosition == NULL ||
source->fcdPosition == source->endp ||
((source->origFlags & UCOL_ITER_HASLEN) == 0 &&
source->fcdPosition == source->endp ||
((source->origFlags & UCOL_ITER_HASLEN) == 0 &&
*source->fcdPosition == 0)))) {
/* fcd does not point to a valid character*/
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
/* fcd does not point to a valid character*/
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
// So we'll pick whatever we have at the point...
if (CE == UCOL_NOT_FOUND) {
// spit all the not found chars, which led us in this contraction
@ -2047,11 +2025,11 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
}
if (schar != tchar) {
if (schar != tchar) {
UChar tempchar = 0;
if (u_getCombiningClass(schar) != 0 &&
source->pos != source->endp &&
(*source->pos != 0 ||
source->pos != source->endp &&
(*source->pos != 0 ||
((source->flags & UCOL_ITER_INNORMBUF) &&
source->fcdPosition != NULL &&
source->fcdPosition != source->endp &&
@ -2062,9 +2040,9 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
source->pos --;
}
if (tempchar == 0 || u_getCombiningClass(tempchar) == 0) {
source->pos --;
source->pos --;
/* Spit out the last char of the string, wasn't tasty enough */
CE = *(coll->contractionCEs +
CE = *(coll->contractionCEs +
(ContractionStart - coll->contractionIndex));
}
else {
@ -2072,7 +2050,7 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
}
}
else {
CE = *(coll->contractionCEs +
CE = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
}
@ -2127,14 +2105,14 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
}
/**
* Inserts the argument character into the front of the buffer replacing the
* Inserts the argument character into the front of the buffer replacing the
* front null terminator.
* @param data collation element iterator data
* @param pNull pointer to the null terminator
* @param ch character to be appended
* @return position of added character
*/
inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
{
uint32_t size = data->writableBufSize;
UChar *end;
@ -2147,14 +2125,14 @@ inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
return pNull;
}
/*
/*
buffer will always be null terminated in front.
giving extra space since it is likely that more characters will be added.
*/
size += incsize;
newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
end = newbuffer + incsize;
uprv_memcpy(end, data->writableBuffer,
uprv_memcpy(end, data->writableBuffer,
data->writableBufSize * sizeof(UChar));
*end = ch;
*(end - 1) = 0;
@ -2170,13 +2148,13 @@ inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
* Special normalization function for contraction in the previous iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
* @param data collation iterator data
*/
inline void normalizePrevContraction(collIterate *data)
{
{
UChar *buffer = data->writableBuffer;
uint32_t buffersize = data->writableBufSize;
uint32_t nulltermsize;
@ -2187,8 +2165,8 @@ inline void normalizePrevContraction(collIterate *data)
UChar *pStartNorm;
if (data->flags & UCOL_ITER_HASLEN) {
/*
normalization buffer not used yet, we'll pull down the next
/*
normalization buffer not used yet, we'll pull down the next
character into the end of the buffer
*/
*(buffer + (buffersize - 1)) = *(data->pos + 1);
@ -2207,17 +2185,17 @@ inline void normalizePrevContraction(collIterate *data)
pStart = data->string;
}
else {
pStart = data->fcdPosition + 1;
pStart = data->fcdPosition + 1;
}
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
&status);
if (nulltermsize <= normLen) {
uint32_t size = buffersize - nulltermsize + normLen + 1;
UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
nulltermsize = normLen + 1;
uprv_memcpy(temp + normLen, buffer,
uprv_memcpy(temp + normLen, buffer,
sizeof(UChar) * (buffersize - nulltermsize));
freeHeapWritableBuffer(data);
data->writableBuffer = temp;
@ -2225,15 +2203,15 @@ inline void normalizePrevContraction(collIterate *data)
}
status = U_ZERO_ERROR;
/*
/*
this puts the null termination in front of the normalized string instead
of the end
*/
pStartNorm = buffer + (nulltermsize - normLen);
*(pStartNorm - 1) = 0;
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
&status);
data->pos = data->writableBuffer + nulltermsize;
data->origFlags = data->flags;
data->flags |= UCOL_ITER_INNORMBUF;
@ -2243,26 +2221,26 @@ inline void normalizePrevContraction(collIterate *data)
/**
* Contraction character management function that returns the previous character
* for the backwards iterator.
* Does nothing if the previous character is in buffer and not the first
* Does nothing if the previous character is in buffer and not the first
* character in it.
* Else it checks previous character in data string to see if it is
* Else it checks previous character in data string to see if it is
* normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @return previous character
*/
inline UChar getPrevNormalizedChar(collIterate *data)
inline UChar getPrevNormalizedChar(collIterate *data)
{
UChar prevch;
UChar ch;
UChar *start;
UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
UChar *pNull = NULL;
if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
(innormbuf && *(data->pos - 1) != 0)) {
/*
/*
if no normalization.
if previous character is in normalized buffer, no further normalization
is required
@ -2281,7 +2259,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
prevch = *(start - 1);
}
else {
/*
/*
in writable buffer, at this point fcdPosition can not be NULL.
see contracting tag.
*/
@ -2296,16 +2274,16 @@ inline UChar getPrevNormalizedChar(collIterate *data)
ch = *start;
prevch = *(start - 1);
}
/*
/*
* if the current character is not fcd.
* Trailing combining class == 0.
*/
if (data->fcdPosition > start &&
(ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
if (data->fcdPosition > start &&
(ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
{
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
*/
UChar *backuppos = data->pos;
data->pos = start;
@ -2316,20 +2294,20 @@ inline UChar getPrevNormalizedChar(collIterate *data)
data->pos = backuppos;
data->fcdPosition ++;
}
if (innormbuf) {
/*
no normalization is to be done hence only one character will be
/*
no normalization is to be done hence only one character will be
appended to the buffer.
*/
insertBufferFront(data, pNull, ch);
data->fcdPosition --;
}
return ch;
}
/**
/**
* This function handles the special CEs like contractions, expansions,
* surrogates, Thai.
* It is called by both getPrevCE and getPrevUCA
@ -2373,7 +2351,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
Move the prevowel and the following base Consonant into the
normalization buffer with their order swapped
*/
UChar *tempbuffer = source->writableBuffer +
UChar *tempbuffer = source->writableBuffer +
(source->writableBufSize - 1);
*(tempbuffer - 2) = 0;
*(tempbuffer - 1) = *source->pos;
@ -2399,16 +2377,16 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
}
break;
case CONTRACTION_TAG:
/* to ensure that the backwards and forwards iteration matches, we
/* to ensure that the backwards and forwards iteration matches, we
take the current region of most possible match and pass it through
the forward iteration. this will ensure that the obstinate problem of
overlapping contractions will not occur.
*/
schar = *(source->pos);
constart = (UChar *)coll->image + getContractOffset(CE);
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
*(source->pos - 1) == 0 && source->fcdPosition == NULL) ||
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
*(source->pos - 1) == 0 && source->fcdPosition == NULL) ||
!ucol_contractionEndCP(schar, coll)) {
/* start of string or this is not the end of any contraction */
CE = *(coll->contractionCEs + (constart - coll->contractionIndex));
@ -2422,8 +2400,8 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
UCharOffset --;
schar = getPrevNormalizedChar(source);
source->pos --;
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
break;
}
@ -2432,7 +2410,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
*(UCharOffset) = schar;
/* a new collIterate is used to simply things, since using the current
collIterate will mean that the forward and backwards iteration will
collIterate will mean that the forward and backwards iteration will
share and change the same buffers. we don't want to get into that. */
collIterate temp;
IInit_collIterate(coll, UCharOffset, -1, &temp);
@ -2440,7 +2418,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
CE = ucol_IGetNextCE(coll, &temp, status);
while (CE != UCOL_NO_MORE_CES) {
*(source->CEpos ++) = CE;
*(source->CEpos ++) = CE;
CE = ucol_IGetNextCE(coll, &temp, status);
}
freeHeapWritableBuffer(&temp);
@ -2673,14 +2651,14 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
currentSize++;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
/* not compressible */
leadPrimary = 0;
currentSize+=2;
} else { /* compress */
leadPrimary = primary1;
leadPrimary = primary1;
currentSize+=2;
}
}
}
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
currentSize++;
@ -2688,7 +2666,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
currentSize++;
}
}
}
}
#else
if(primary1 != UCOL_IGNORABLE) {
currentSize++;
@ -2731,7 +2709,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
caseShift = UCOL_CASE_SHIFT_START;
}
caseShift--;
}
}
}
} else {
if(notIsContinuation) {
@ -2745,7 +2723,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
c3++;
} else {
if(c3 > 0) {
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
|| (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
} else {
@ -3117,23 +3095,23 @@ ucol_calcSortKey(const UCollator *coll,
*primaries++ = primary1;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
/* not compressible */
leadPrimary = 0;
*primaries++ = primary1;
*primaries++ = primary2;
} else { /* compress */
*primaries++ = leadPrimary = primary1;
*primaries++ = leadPrimary = primary1;
*primaries++ = primary2;
}
}
}
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
*primaries++ = primary1;
*primaries++ = primary1;
if(primary2 != UCOL_IGNORABLE) {
*primaries++ = primary2; /* second part */
}
}
}
}
#else
if(primary1 != UCOL_IGNORABLE) {
*primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
@ -3232,7 +3210,7 @@ ucol_calcSortKey(const UCollator *coll,
if (tertiary == tertiaryCommon && notIsContinuation) {
++count3;
} else {
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
|| (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
tertiary += tertiaryAddition;
}
@ -3538,7 +3516,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
len = normSourceLen;
}
}
if(resultLength == 0 || primaries == NULL) {
return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
@ -3618,23 +3596,23 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
*primaries++ = primary1;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
/* not compressible */
leadPrimary = 0;
*primaries++ = primary1;
*primaries++ = primary2;
} else { /* compress */
*primaries++ = leadPrimary = primary1;
*primaries++ = leadPrimary = primary1;
*primaries++ = primary2;
}
}
}
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
*primaries++ = primary1;
*primaries++ = primary1;
if(primary2 != UCOL_IGNORABLE) {
*primaries++ = primary2; /* second part */
}
}
}
}
#else
if(primary1 != UCOL_IGNORABLE) {
*primaries++ = primary1; /* scriptOrder[primary1]; */ /* This is the script ordering thingie */
@ -3679,7 +3657,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
if (tertiary == tertiaryCommon && notIsContinuation) {
++count3;
} else {
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
|| (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
tertiary += tertiaryAddition;
}

View File

@ -670,12 +670,22 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
/* Unsafe chars table. Finish it off, then copy it. */
uprv_uca_unsafeCPAddCCNZ(t);
if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */
for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
t->unsafeCP[i] |= t->UCA->unsafeCP[i];
}
}
myData->unsafeCP = tableOffset;
uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
/* Contraction Ending chars hash table. Copy it out. */
/* Finish building Contraction Ending chars hash table and then copy it out. */
if (t->UCA != 0) { /* Or in contraction-end bits from UCA, making a combined table. */
for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
t->contrEndCP[i] |= t->UCA->contrEndCP[i];
}
}
myData->contrEndCP = tableOffset;
uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);