ICU-861
Implemented new incremental normalization in backwards iteration. X-SVN-Rev: 4524
This commit is contained in:
parent
2b0da7cddd
commit
34372f991b
@ -45,6 +45,8 @@
|
||||
#define LAST_BYTE_MASK_ 0xFF
|
||||
#define SECOND_LAST_BYTE_SHIFT_ 8
|
||||
|
||||
#define ZERO_CC_LIMIT_ 0xC0
|
||||
|
||||
static UCollator* UCA = NULL;
|
||||
|
||||
extern "C" UBool checkFCD(const UChar*, int32_t, UErrorCode*);
|
||||
@ -81,6 +83,22 @@ isAcceptableUCA(void *context,
|
||||
}
|
||||
}
|
||||
|
||||
/* added for Han implicit CE */
/*
 * Constants used to build implicit (algorithmically generated) CEs.
 * The values in the trailing comments are obtained by evaluating the
 * integer expressions below.
 */
|
||||
static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
|
||||
static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
|
||||
static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
|
||||
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3; /* byte values 00, 01 and 02 are skipped */
|
||||
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_; /* 253 values for most bytes */
|
||||
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2; /* 126 values for the last byte of the 3 byte form */
|
||||
static const uint32_t IMPLICIT_LAST_COUNT2_ =
|
||||
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
|
||||
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1; /* 1048575 / 64009 + 1 = 17 */
|
||||
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
|
||||
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_; /* 126 * 253 - 0x3400 = 18566 */
|
||||
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
|
||||
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_; /* 2 * 253 * 126 + 0x3400 = 77068 */
|
||||
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
|
||||
IMPLICIT_LAST_COUNT2_; /* 253 / 17 = 14, the gap used in the 4 byte form */
|
||||
|
||||
inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
|
||||
int32_t sourceLen, collIterate *s) {
|
||||
@ -116,6 +134,8 @@ inline void backupState(const collIterate *data, collIterateState *backup)
|
||||
backup->flags = data->flags;
|
||||
backup->origFlags = data->origFlags;
|
||||
backup->pos = data->pos;
|
||||
backup->bufferaddress = (long)(data->writableBuffer);
|
||||
backup->buffersize = data->writableBufSize;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -129,6 +149,17 @@ inline void loadState(collIterate *data, const collIterateState *backup)
|
||||
data->flags = backup->flags;
|
||||
data->origFlags = backup->origFlags;
|
||||
data->pos = backup->pos;
|
||||
if ((data->flags & UCOL_ITER_INNORMBUF) &&
|
||||
(long)(data->writableBuffer) != backup->bufferaddress) {
|
||||
/*
|
||||
this is when a new buffer has been reallocated and we'll have to
|
||||
calculate the new position.
|
||||
note the new buffer has to contain the contents of the old buffer.
|
||||
*/
|
||||
uint32_t temp = backup->buffersize -
|
||||
((long)(data->pos) - backup->bufferaddress);
|
||||
data->pos = data->writableBuffer + (data->writableBufSize - temp);
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
@ -504,8 +535,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
|
||||
|
||||
result->zero = 0;
|
||||
result->rules = NULL;
|
||||
|
||||
/* get the version info from UCATableHeader and populate the Collator struct*/
|
||||
/* get the version info from UCATableHeader and populate the Collator struct*/
|
||||
result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
|
||||
result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
|
||||
|
||||
@ -652,7 +682,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
|
||||
break;
|
||||
}
|
||||
|
||||
if (ch < 0xC0 ) {
|
||||
if (ch < ZERO_CC_LIMIT_ ) {
|
||||
// Fast fcd safe path. Trailing combining class == 0. This char is OK.
|
||||
break;
|
||||
}
|
||||
@ -723,6 +753,7 @@ void collPrevIterNormalize(collIterate *data)
|
||||
UChar *pEnd = data->pos + 1; /* End normalize + 1 */
|
||||
UChar *pStart;
|
||||
uint32_t normLen;
|
||||
UChar *pStartNorm;
|
||||
|
||||
/* Start normalize */
|
||||
if (data->fcdPosition == NULL) {
|
||||
@ -733,11 +764,8 @@ void collPrevIterNormalize(collIterate *data)
|
||||
}
|
||||
|
||||
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0,
|
||||
data->writableBuffer, data->writableBufSize,
|
||||
&status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
if (status == U_BUFFER_OVERFLOW_ERROR) { /* This would be buffer overflow */
|
||||
data->writableBuffer, 0, &status);
|
||||
if (data->writableBufSize <= normLen) {
|
||||
if (data->writableBuffer != data->stackWritableBuffer) {
|
||||
uprv_free( data->writableBuffer);
|
||||
}
|
||||
@ -745,16 +773,18 @@ void collPrevIterNormalize(collIterate *data)
|
||||
sizeof(UChar));
|
||||
/* to handle the zero termination */
|
||||
data->writableBufSize = normLen + 1;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0,
|
||||
data->writableBuffer, data->writableBufSize, &status);
|
||||
}
|
||||
else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
/*
|
||||
this puts the null termination infront of the normalized string instead
|
||||
of the end
|
||||
*/
|
||||
pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
|
||||
*(pStartNorm - 1) = 0;
|
||||
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
|
||||
&status);
|
||||
|
||||
data->pos = data->writableBuffer + normLen;
|
||||
data->pos = data->writableBuffer + data->writableBufSize;
|
||||
data->origFlags = data->flags;
|
||||
data->flags |= UCOL_ITER_INNORMBUF;
|
||||
data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
|
||||
@ -773,14 +803,16 @@ void collPrevIterNormalize(collIterate *data)
|
||||
* True because the previous call to this function will have always exited
|
||||
* that way, and we get called for every char where cc might be non-zero.
|
||||
* @param data collation iterate struct
|
||||
* @return normalization status, TRUE for normalization to be done, FALSE
|
||||
* otherwise
|
||||
*/
|
||||
inline void collPrevIterFCD(collIterate *data)
|
||||
inline UBool collPrevIterFCD(collIterate *data)
|
||||
{
|
||||
UChar32 codepoint;
|
||||
uint8_t leadingCC;
|
||||
uint8_t trailingCC = 0;
|
||||
uint16_t fcd;
|
||||
UBool needNormalize = FALSE;
|
||||
UBool result = FALSE;
|
||||
int length;
|
||||
|
||||
length = (data->pos + 1) - data->string;
|
||||
@ -822,7 +854,7 @@ inline void collPrevIterFCD(collIterate *data)
|
||||
}
|
||||
|
||||
if (leadingCC < trailingCC) {
|
||||
needNormalize = TRUE;
|
||||
result = TRUE;
|
||||
}
|
||||
|
||||
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
@ -836,9 +868,7 @@ inline void collPrevIterFCD(collIterate *data)
|
||||
data->fcdPosition = data->string + length;
|
||||
}
|
||||
|
||||
if (needNormalize) {
|
||||
collPrevIterNormalize(data);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -870,6 +900,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
||||
side buffer / original string, and we need to start again to get the
|
||||
next character.
|
||||
*/
|
||||
|
||||
while (TRUE) {
|
||||
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
|
||||
/*
|
||||
@ -883,7 +914,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
||||
}
|
||||
else {
|
||||
/* we are in the side buffer. */
|
||||
if (data->pos <= data->writableBuffer) {
|
||||
if (*(data->pos - 1) == 0) {
|
||||
/*
|
||||
At the start of the normalize side buffer.
|
||||
Go back to string.
|
||||
@ -912,7 +943,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
||||
*/
|
||||
if ((data->flags & UCOL_ITER_NORM) == 0 ||
|
||||
data->fcdPosition <= data->pos ||
|
||||
ch < 0xC0) {
|
||||
ch < ZERO_CC_LIMIT_) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -930,7 +961,10 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
||||
}
|
||||
|
||||
/* Need a more complete FCD check and possible normalization. */
|
||||
collPrevIterFCD(data);
|
||||
if (collPrevIterFCD(data)) {
|
||||
collPrevIterNormalize(data);
|
||||
}
|
||||
|
||||
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
|
||||
/* No normalization. Go ahead and process the char. */
|
||||
break;
|
||||
@ -1342,35 +1376,28 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
|
||||
order = ucmp32_get(UCA->mapping, ch);
|
||||
}
|
||||
|
||||
if (order >= UCOL_NOT_FOUND) {
|
||||
if (order > UCOL_NOT_FOUND) {
|
||||
order = getSpecialPrevCE(UCA, order, collationSource, status);
|
||||
}
|
||||
|
||||
if (order == UCOL_NOT_FOUND)
|
||||
{
|
||||
uint32_t cp = 0;
|
||||
/*
|
||||
This is where we have to resort to algorithmical generation.
|
||||
We have to check if ch is possibly a first surrogate - then we need to
|
||||
take the next code unit and make a bigger CE
|
||||
*/
|
||||
UChar prevChar;
|
||||
uint32_t
|
||||
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
|
||||
LCount = 19, VCount = 21, TCount = 28,
|
||||
NCount = VCount * TCount, /* 588 */
|
||||
SCount = LCount * NCount; /* 11172 */
|
||||
/*
|
||||
LLimit = LBase + LCount, // 1113
|
||||
VLimit = VBase + VCount, // 1176
|
||||
TLimit = TBase + TCount, // 11C3
|
||||
SLimit = SBase + SCount; // D7A4
|
||||
*/
|
||||
|
||||
/*
|
||||
once we have failed to find a match for codepoint cp, and are in the
|
||||
implicit code.
|
||||
*/
|
||||
|
||||
uint32_t L = ch - SBase;
|
||||
if (L < SCount)
|
||||
{ /* since it is unsigned, catchs zero case too */
|
||||
@ -1424,61 +1451,35 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
|
||||
}
|
||||
collationSource->toReturn = collationSource->CEpos - 1;
|
||||
return *(collationSource->toReturn);
|
||||
|
||||
/*return *(collationSource->toReturn++);*/
|
||||
/*
|
||||
ucol_getJamoCEs(collationSource->coll, L, &collationSource->CEpos);
|
||||
ucol_getJamoCEs(collationSource->coll, V, &collationSource->CEpos);
|
||||
if (T != TBase) {
|
||||
ucol_getJamoCEs(collationSource->coll, T, &collationSource->CEpos);
|
||||
}
|
||||
collationSource->toReturn = collationSource->CEpos - 1;
|
||||
return *(collationSource->toReturn);
|
||||
*/
|
||||
/*
|
||||
Jamo is Special
|
||||
do recursive processing of L, V, and T with fetchCE (but T only if not
|
||||
equal to TBase!!)
|
||||
Since fetchCE returns a CE, and (potentially) stuffs items into the ce
|
||||
buffer,
|
||||
this is how it is done.
|
||||
*/
|
||||
/*
|
||||
int firstCE = fetchCE(L, ...);
|
||||
// set pointer, leave gap!
|
||||
int* lastExpansion = expansionBufferEnd++;
|
||||
*lastExpansion = fetchCE(V,...);
|
||||
if (T != TBase) {
|
||||
lastExpansion = expansionBufferEnd++; // set pointer, leave gap!
|
||||
*lastExpansion = fetchCE(T,...);
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
if (UTF_IS_SECOND_SURROGATE(ch))
|
||||
{
|
||||
/* This is where the s***t hits the fan */
|
||||
/* it turns out, the first part of the if can be satisfied even if we're */
|
||||
/* at the beggining of the string */
|
||||
/* we have to make sure we know what is the situation we're in */
|
||||
/* quick fix is by using isUsingWritable, as shown below */
|
||||
if ((collationSource->string < collationSource->pos) &&
|
||||
(UTF_IS_FIRST_SURROGATE(prevChar = *(collationSource->pos - 1))))
|
||||
UChar prevChar;
|
||||
UChar *prev;
|
||||
if ((collationSource->string == collationSource->pos) ||
|
||||
(collationSource->pos == collationSource->writableBuffer &&
|
||||
collationSource->fcdPosition == NULL)) {
|
||||
/* we are at the start of the string, wrong place to be at */
|
||||
return 0;
|
||||
}
|
||||
if (collationSource->pos != collationSource->writableBuffer) {
|
||||
prev = collationSource->pos - 1;
|
||||
}
|
||||
else {
|
||||
prev = collationSource->fcdPosition;
|
||||
}
|
||||
prevChar = *prev;
|
||||
|
||||
/* Handles Han and Supplementary characters here.*/
|
||||
if (UTF_IS_FIRST_SURROGATE(prevChar))
|
||||
{
|
||||
uint32_t cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
|
||||
collationSource->pos --;
|
||||
cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
|
||||
collationSource->pos = prev;
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return 0; /* illegal code value, use completely ignoreable! */
|
||||
}
|
||||
|
||||
/*
|
||||
This is a code point minus 0x10000, that's what algorithm requires
|
||||
*/
|
||||
*(collationSource->CEpos ++) = 0xE0010303 | (cp & 0xFFE00) << 8;
|
||||
order = 0x80200080 | (cp & 0x001FF) << 22;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
*(collationSource->CEpos ++) = order;
|
||||
}
|
||||
else {
|
||||
return 0; /* completely ignorable */
|
||||
@ -1490,14 +1491,41 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
|
||||
if (UTF_IS_FIRST_SURROGATE(ch) || (ch & 0xFFFE) == 0xFFFE) {
|
||||
return 0; /* completely ignorable */
|
||||
}
|
||||
|
||||
/* Make up an artifical CE from code point as per UCA */
|
||||
*(collationSource->CEpos ++) = 0xD0800303 | (ch & 0xF000) << 12 |
|
||||
(ch & 0x0FE0) << 11;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
order = 0x04000080 | (ch & 0x001F) << 27;
|
||||
*(collationSource->CEpos ++) = order;
|
||||
cp = ch;
|
||||
}
|
||||
|
||||
/* we must skip all 00, 01, 02 bytes, so most bytes have 253 values
|
||||
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
|
||||
we shift so that HAN all has the same first primary, for compression.
|
||||
for the 4 byte case, we make the gap as large as we can fit.
|
||||
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
|
||||
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
|
||||
*/
|
||||
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
|
||||
uint32_t r = 0;
|
||||
|
||||
if (last0 < 0) {
|
||||
cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
|
||||
int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
|
||||
last0 = cp % IMPLICIT_LAST_COUNT_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEC030300 + (last2 << 24) + (last1 << 16) + (last0 << 9);
|
||||
} else {
|
||||
int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
|
||||
last0 %= IMPLICIT_LAST_COUNT2_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEF030303 + (last2 << 16) + (last1 << 8) +
|
||||
(last0 * IMPLICIT_LAST2_MULTIPLIER_);
|
||||
}
|
||||
/*
|
||||
order = (r & 0xFFFF0000) | 0x00000303;
|
||||
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x00000080;
|
||||
*/
|
||||
*(collationSource->CEpos++) = (r & 0xFFFF0000) | 0x00000303;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
order = ((r & 0x0000FFFF)<<16) | 0x00000080;
|
||||
}
|
||||
return order; /* return the CE */
|
||||
}
|
||||
@ -1636,6 +1664,204 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
||||
return CE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts the argument character into the front of the buffer replacing the
|
||||
* front null terminator.
|
||||
* Repoints the pos pointer to the next character in the writablebuffer.
|
||||
* Changes the flags up to date.
|
||||
* @param data collation element iterator data
|
||||
* @param ch character to be appended
|
||||
*/
|
||||
inline void insertBufferFront(collIterate *data, UChar ch)
|
||||
{
|
||||
uint32_t size = data->writableBufSize;
|
||||
UChar *end = data->writableBuffer + (size - 1);
|
||||
UChar *newbuffer;
|
||||
const uint32_t incsize = 5;
|
||||
|
||||
while (end > data->writableBuffer) {
|
||||
if (*end == 0) {
|
||||
*end = ch;
|
||||
*(end - 1) = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
buffer will always be null terminated infront.
|
||||
giving extra space since it is likely that more characters will be added.
|
||||
*/
|
||||
size += incsize;
|
||||
newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
|
||||
end = newbuffer + incsize;
|
||||
uprv_memcpy(end, data->writableBuffer,
|
||||
data->writableBufSize * sizeof(UChar));
|
||||
*end = ch;
|
||||
*(end - 1) = 0;
|
||||
|
||||
if (data->writableBuffer != data->stackWritableBuffer) {
|
||||
uprv_free(data->writableBuffer);
|
||||
}
|
||||
|
||||
data->writableBufSize = size;
|
||||
data->writableBuffer = newbuffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Special normalization function for contraction in the previous iterator.
|
||||
* This normalization sequence will place the current character at source->pos
|
||||
* and its following normalized sequence into the buffer.
|
||||
* The fcd position, pos will be changed.
|
||||
* pos will now point to positions in the buffer.
|
||||
* Flags will be changed accordingly.
|
||||
* @param data collation iterator data
|
||||
*/
|
||||
inline void normalizePrevContraction(collIterate *data)
|
||||
{
|
||||
UChar *buffer = data->writableBuffer;
|
||||
uint32_t buffersize = data->writableBufSize;
|
||||
uint32_t nulltermsize;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UChar *pEnd = data->pos + 1; /* End normalize + 1 */
|
||||
UChar *pStart;
|
||||
uint32_t normLen;
|
||||
UChar *pStartNorm;
|
||||
|
||||
if (data->flags & UCOL_ITER_HASLEN) {
|
||||
/*
|
||||
normalization buffer not used yet, we'll pull down the next
|
||||
character into the end of the buffer
|
||||
*/
|
||||
*(buffer + (buffersize - 1)) = *(data->pos - 1);
|
||||
nulltermsize = buffersize - 1;
|
||||
}
|
||||
else {
|
||||
nulltermsize = buffersize;
|
||||
UChar *temp = buffer + (nulltermsize - 1);
|
||||
while (*(temp --) != 0) {
|
||||
nulltermsize --;
|
||||
}
|
||||
}
|
||||
|
||||
/* Start normalize */
|
||||
if (data->fcdPosition == NULL) {
|
||||
pStart = data->string;
|
||||
}
|
||||
else {
|
||||
pStart = data->fcdPosition + 1;
|
||||
}
|
||||
|
||||
normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
|
||||
&status);
|
||||
|
||||
if (nulltermsize <= normLen) {
|
||||
uint32_t size = buffersize - nulltermsize + normLen + 1;
|
||||
UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
|
||||
nulltermsize = normLen + 1;
|
||||
uprv_memcpy(temp + normLen, buffer,
|
||||
sizeof(UChar) * (buffersize - nulltermsize));
|
||||
if (data->writableBuffer != data->stackWritableBuffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
data->writableBuffer = temp;
|
||||
data->writableBufSize = size;
|
||||
}
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
/*
|
||||
this puts the null termination infront of the normalized string instead
|
||||
of the end
|
||||
*/
|
||||
pStartNorm = buffer + (nulltermsize - normLen);
|
||||
*(pStartNorm - 1) = 0;
|
||||
unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
|
||||
&status);
|
||||
|
||||
data->pos = data->writableBuffer + nulltermsize;
|
||||
data->origFlags = data->flags;
|
||||
data->flags |= UCOL_ITER_INNORMBUF;
|
||||
data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Contraction character management function that returns the previous character
|
||||
* for the backwards iterator.
|
||||
* Does nothing if the previous character is in buffer and not the first
|
||||
* character in it.
|
||||
* Else it checks previous character in data string to see if it is
|
||||
* normalizable.
|
||||
* If it is not, the character is simply copied into the buffer, else
|
||||
* the whole normalized substring is copied into the buffer, including the
|
||||
* current character.
|
||||
* @param data collation element iterator data
|
||||
* @return previous character
|
||||
*/
|
||||
inline UChar getPrevNormalizedChar(collIterate *data)
|
||||
{
|
||||
UChar prevch;
|
||||
UChar ch;
|
||||
UChar *start;
|
||||
if ((data->flags & UCOL_ITER_NORM) == 0 ||
|
||||
(data->flags & UCOL_ITER_INNORMBUF) || *(data->pos - 1) != 0) {
|
||||
/*
|
||||
if no normalization.
|
||||
if previous character is in normalized buffer, no further normalization
|
||||
is required
|
||||
*/
|
||||
return *(data->pos - 1);
|
||||
}
|
||||
|
||||
start = data->pos;
|
||||
if (data->flags & UCOL_ITER_HASLEN) {
|
||||
/* in data string */
|
||||
if ((start - 1) == data->string) {
|
||||
return *(start - 1);
|
||||
}
|
||||
data->pos = start - 1;
|
||||
}
|
||||
else {
|
||||
/*
|
||||
in writable buffer, at this point fcdPosition can not be NULL.
|
||||
see contracting tag.
|
||||
*/
|
||||
if (data->fcdPosition == data->string) {
|
||||
/* at the start of the string, just dump it into the normalizer */
|
||||
insertBufferFront(data, *(data->fcdPosition));
|
||||
return *(data->fcdPosition);
|
||||
}
|
||||
data->pos = data->fcdPosition;
|
||||
}
|
||||
ch = *(data->pos);
|
||||
prevch = *(data->pos - 1);
|
||||
/*
|
||||
* if the current character is not fcd.
|
||||
* Trailing combining class == 0.
|
||||
*/
|
||||
if (data->fcdPosition > data->pos &&
|
||||
(ch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
|
||||
prevch >= NFC_ZERO_CC_BLOCK_LIMIT_) && collPrevIterFCD(data)) {
|
||||
/*
|
||||
Need a more complete FCD check and possible normalization.
|
||||
normalize substring will be appended to buffer
|
||||
*/
|
||||
normalizePrevContraction(data);
|
||||
return ch;
|
||||
}
|
||||
|
||||
if (data->flags & UCOL_ITER_INNORMBUF) {
|
||||
/*
|
||||
no normalization is to be done hence only one character will be
|
||||
appended to the buffer.
|
||||
*/
|
||||
insertBufferFront(data, ch);
|
||||
}
|
||||
else {
|
||||
/* points back to the pos in string */
|
||||
data->pos = start;
|
||||
}
|
||||
|
||||
return ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function handles the special CEs like contractions, expansions,
|
||||
* surrogates, Thai.
|
||||
@ -1659,8 +1885,8 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
uint8_t firstflags = source->flags;
|
||||
*/
|
||||
collIterateState state;
|
||||
|
||||
backupState(source, &state);
|
||||
|
||||
for(;;)
|
||||
{
|
||||
/* the only ces that loops are thai and contractions */
|
||||
@ -1689,9 +1915,11 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
Move the prevowel and the following base Consonant into the
|
||||
normalization buffer with their order swapped
|
||||
*/
|
||||
source->writableBuffer[0] = *source->pos;
|
||||
source->writableBuffer[1] = *(source->pos - 1);
|
||||
source->writableBuffer[2] = 0;
|
||||
UChar *tempbuffer = source->writableBuffer +
|
||||
(source->writableBufSize - 1);
|
||||
*(tempbuffer - 2) = 0;
|
||||
*(tempbuffer - 1) = *source->pos;
|
||||
*(tempbuffer) = *(source->pos - 1);
|
||||
|
||||
/*
|
||||
Indicate where to continue in main input string after exhausting
|
||||
@ -1703,7 +1931,8 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
else {
|
||||
source->fcdPosition = source->pos - 2;
|
||||
}
|
||||
source->pos = source->writableBuffer + 2;
|
||||
|
||||
source->pos = tempbuffer;
|
||||
source->origFlags = source->flags;
|
||||
source->flags |= UCOL_ITER_INNORMBUF;
|
||||
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
|
||||
@ -1715,28 +1944,26 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
/* This should handle contractions */
|
||||
for(;;)
|
||||
{
|
||||
uint32_t tempfirstCE;
|
||||
/* First we position at the begining of contraction sequence */
|
||||
constart = UCharOffset = (UChar *)coll->image +
|
||||
getContractOffset(CE);
|
||||
strend = source->endp;
|
||||
|
||||
if (firstCE == UCOL_NOT_FOUND) {
|
||||
firstCE = *(coll->contractionCEs +
|
||||
tempfirstCE = *(coll->contractionCEs +
|
||||
(UCharOffset - coll->contractionIndex));
|
||||
if (tempfirstCE != UCOL_NOT_FOUND) {
|
||||
firstCE = tempfirstCE;
|
||||
backupState(source, &state);
|
||||
}
|
||||
|
||||
if ((source->pos == source->string) ||
|
||||
(source->pos == source->writableBuffer &&
|
||||
if ((source->pos == source->string) || (*(source->pos - 1) == 0 &&
|
||||
source->fcdPosition == NULL)) {
|
||||
/* this is the start of string */
|
||||
CE = *(coll->contractionCEs +
|
||||
(UCharOffset - coll->contractionIndex));
|
||||
if (CE == UCOL_NOT_FOUND && firstCE != UCOL_NOT_FOUND) {
|
||||
CE = firstCE;
|
||||
/*
|
||||
source->pos = firstUChar;
|
||||
source->flags = firstflags;
|
||||
*/
|
||||
loadState(source, &state);
|
||||
}
|
||||
|
||||
@ -1747,15 +1974,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
Progressing to backwards block
|
||||
*/
|
||||
UCharOffset += *UCharOffset;
|
||||
|
||||
/* not at the border of the writable buffer */
|
||||
if ((source->flags & UCOL_ITER_HASLEN) ||
|
||||
(source->pos != source->writableBuffer)) {
|
||||
schar = *(source->pos - 1);
|
||||
}
|
||||
else {
|
||||
schar = *(source->fcdPosition);
|
||||
}
|
||||
schar = getPrevNormalizedChar(source);
|
||||
|
||||
while (schar > (tchar = *UCharOffset)) {
|
||||
UCharOffset ++;
|
||||
@ -1765,24 +1984,14 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
|
||||
UCharOffset = constart;
|
||||
}
|
||||
else {
|
||||
if ((source->flags & UCOL_ITER_HASLEN) ||
|
||||
(source->pos != source->writableBuffer)) {
|
||||
source->pos --;
|
||||
}
|
||||
else {
|
||||
source->pos = source->fcdPosition;
|
||||
source->flags = source->origFlags;
|
||||
}
|
||||
}
|
||||
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||
CE = *(coll->contractionCEs +
|
||||
(UCharOffset - coll->contractionIndex));
|
||||
if (!isContraction(CE)) {
|
||||
if (CE == UCOL_NOT_FOUND) {
|
||||
CE = firstCE;
|
||||
/*
|
||||
source->pos = firstUChar;
|
||||
source->flags = firstflags;
|
||||
*/
|
||||
loadState(source, &state);
|
||||
}
|
||||
firstCE = UCOL_NOT_FOUND;
|
||||
|
@ -118,6 +118,8 @@ data similar to collIterate.
|
||||
/* Snapshot of the iterator fields that backupState saves and loadState restores. */
struct collIterateState {
|
||||
UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
long bufferaddress; /* address of the normalization buffer */
/* NOTE(review): a pointer is stored in a long here; where long is narrower than a pointer the address is truncated */
|
||||
uint32_t buffersize; /* size of the normalization buffer (writableBufSize) when the state was saved */
|
||||
uint8_t flags; /* copy of the iterator flags */
|
||||
uint8_t origFlags; /* copy of the iterator origFlags */
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user