Updated the backwards collation element iterator code.

X-SVN-Rev: 3760
This commit is contained in:
Syn Wee Quek 2001-02-23 23:36:42 +00:00
parent 6d5b35e584
commit ec4c07eeb0
3 changed files with 123 additions and 147 deletions

View File

@ -7,7 +7,6 @@
* Date Name Comments
* 02/16/2001 synwee Added internal method getPrevSpecialCE
*/
#include "ucolimp.h"
#include "ucoltok.h"
@ -1068,12 +1067,13 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
T += TBase;
// return the first CE, but first put the rest into the expansion buffer
if (!collationSource->JamoSpecial) { // FAST PATH
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V);
if (T != TBase) {
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
}
return ucmp32_get(UCA->mapping, L); // return first one
} else { // Jamo is Special
@ -1103,6 +1103,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
}
/* This is a code point minus 0x10000, that's what algorithm requires */
order = 0xE0010303 | (cp & 0xFFE00) << 8;
*(collationSource->CEpos++) = 0x80200080 | (cp & 0x001FF) << 22;
} else {
return 0; /* completely ignorable */
@ -1144,7 +1145,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
We have to check if ch is possibly a first surrogate - then we need to
take the next code unit and make a bigger CE
*/
UChar nextChar;
UChar prevChar;
const int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
@ -1184,11 +1185,13 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
*/
if (!collationSource->JamoSpecial)
{
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L);
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
if (T != TBase)
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
/* return first one */
return ucmp32_get(UCA->mapping, L);
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
} else {
/*
Jamo is Special
@ -1213,28 +1216,23 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
if (UTF_IS_SECOND_SURROGATE(ch))
{
if ((collationSource->len - collationSource->pos != length) &&
(UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos)))
UChar *temp = collationSource->pos;
if (((collationSource->string < temp) ||
(collationSource->writableBuffer < temp)) &&
(UTF_IS_FIRST_SURROGATE(prevChar = *(collationSource->pos - 1))))
{
uint32_t cp = ((ch << 10UL) + nextChar - ((0xd800 << 10UL) + 0xdc00));
if (collationSource->pos != collationSource->writableBuffer)
collationSource->pos --;
else
{
collationSource->pos = collationSource->string +
(length - (collationSource->len - collationSource->writableBuffer));
collationSource->len = collationSource->string + length;
collationSource->isThai = TRUE;
}
uint32_t cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
collationSource->pos --;
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00))
return 0; /* illegal code value, use completely ignoreable! */
/*
This is a code point minus 0x10000, that's what algorithm requires
*/
order = 0xE0010303 | (cp & 0xFFE00) << 8;
*(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22;
collationSource->toReturn ++;
*(collationSource->CEpos ++) = 0xE0010303 | (cp & 0xFFE00) << 8;
order = 0x80200080 | (cp & 0x001FF) << 22;
collationSource->toReturn = collationSource->CEpos;
*(collationSource->CEpos ++) = order;
}
else
return 0; /* completely ignorable */
@ -1246,9 +1244,11 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
return 0; /* completely ignorable */
/* Make up an artificial CE from code point as per UCA */
order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11;
*(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27;
collationSource->toReturn ++;
*(collationSource->CEpos ++) = 0xD08003C3 | (ch & 0xF000) << 12 |
(ch & 0x0FE0) << 11;
collationSource->toReturn = collationSource->CEpos;
order = 0x04000080 | (ch & 0x001F) << 27;
*(collationSource->CEpos ++) = order;
}
}
return order; /* return the CE */
@ -1397,12 +1397,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
if (source->isThai == TRUE)
{ /* if we encountered Thai prevowel & the string is not yet touched */
source->isThai = FALSE;
/*
sigh... to cater for getNextCE, we'll have to modify and store the
whole string instead of a substring as in getSpecialCE
*/
UCharOffset = source->pos;
strend = source->len;
strend = source->pos;
size = strend - source->string;
if (size > UCOL_WRITABLE_BUFFER_SIZE)
{
@ -1417,22 +1412,21 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
}
UChar *sourceCopy = source->string;
UChar *targetCopy = source->writableBuffer;
while (sourceCopy < strend)
while (sourceCopy <= strend)
{
if (UCOL_ISTHAIPREVOWEL(*sourceCopy) &&
/* This is the combination that needs to be swapped */
UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1)))
{
*(targetCopy) = *(sourceCopy + count + 1);
*(targetCopy+1) = *(sourceCopy + count);
targetCopy+=2;
sourceCopy+=2;
*(targetCopy) = *(sourceCopy + 1);
*(targetCopy + 1) = *(sourceCopy);
targetCopy += 2;
sourceCopy += 2;
}
else
*(targetCopy++) = *(sourceCopy++);
*(targetCopy ++) = *(sourceCopy ++);
}
source->pos = source->writableBuffer +
(UCharOffset - source->string);
source->pos = targetCopy;
source->len = targetCopy;
source->CEpos = source->toReturn = source->CEs;
CE = UCOL_IGNORABLE;
@ -1470,32 +1464,22 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
*/
UCharOffset += *UCharOffset;
schar = *source->pos;
schar = *(source->pos - 1);
while (schar > (tchar = *UCharOffset))
UCharOffset ++;
if (schar != tchar)
{
{
/*
we didn't find the correct codepoint. We can use either the first or
the last CE
*/
if (tchar != 0xFFFF)
UCharOffset = constart;
/* testing if (tchar != 0xFFFF) */
UCharOffset = constart;
}
else
{
/* Move up one character */
if (source->pos != source->writableBuffer)
source->pos --;
else
{
source->pos = source->string +
(length - (source->len - source->writableBuffer));
source->len = source->string + length;
source->isThai = TRUE;
}
}
source->pos --;
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
if (!isContraction(CE))
break;
@ -1521,7 +1505,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
while (*CEOffset != 0)
*(source->CEpos ++) = *CEOffset ++;
source->toReturn = source->CEpos - 1;
return *(source->toReturn --);
return *(source->toReturn);
case CHARSET_TAG:
/* probably after 1.8 */
return UCOL_NOT_FOUND;

View File

@ -57,6 +57,7 @@ ucol_openElements(const UCollator *coll,
textLength = u_strlen(text);
result->length_ = textLength;
result->reset_ = TRUE;
init_collIterate(text, textLength, &result->iteratordata_, FALSE);
return result;
@ -77,6 +78,7 @@ U_CAPI void
ucol_reset(UCollationElements *elems)
{
collIterate *ci = &(elems->iteratordata_);
elems->reset_ = TRUE;
ci->pos = ci->string;
ci->len = ci->string + elems->length_;
ci->CEpos = ci->toReturn = ci->CEs;
@ -97,40 +99,42 @@ U_CAPI int32_t
ucol_next(UCollationElements *elems,
UErrorCode *status)
{
int32_t result;
if (U_FAILURE(*status))
return UCOL_NULLORDER;
int32_t result;
elems->reset_ = FALSE;
UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status);
/*
if ((elems->iteratordata_).CEpos > (elems->iteratordata_).toReturn)
{
result = *((elems->iteratordata_).toReturn++);
if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn)
(elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn =
(elems->iteratordata_).CEs;
}
else
if ((elems->iteratordata_).pos < (elems->iteratordata_).len)
{
UChar ch = *(elems->iteratordata_).pos++;
if (ch <= 0xFF)
(result) = (elems->collator_)->latinOneMapping[ch];
else
(result) = ucmp32_get((elems->collator_)->mapping, ch);
if((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialCE((elems->collator_), (result),
&(elems->iteratordata_), (status));
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));
}
{
result = *((elems->iteratordata_).toReturn++);
if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn)
(elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn =
(elems->iteratordata_).CEs;
}
else
(result) = UCOL_NO_MORE_CES;
else
if ((elems->iteratordata_).pos < (elems->iteratordata_).len)
{
UChar ch = *(elems->iteratordata_).pos++;
if (ch <= 0xFF)
(result) = (elems->collator_)->latinOneMapping[ch];
else
(result) = ucmp32_get((elems->collator_)->mapping, ch);
if((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialCE((elems->collator_), (result),
&(elems->iteratordata_), (status));
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));
}
}
else
(result) = UCOL_NO_MORE_CES;
*/
if (result == UCOL_NO_MORE_CES)
result = UCOL_NULLORDER;
return result;
@ -142,62 +146,61 @@ ucol_previous(UCollationElements *elems,
{
if(U_FAILURE(*status))
return UCOL_NULLORDER;
else
{
int32_t result;
int32_t result;
UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_,
elems->length_, status);
if (elems->reset_ &&
(elems->iteratordata_.pos == elems->iteratordata_.string))
elems->iteratordata_.pos = elems->iteratordata_.len;
/* synwee : to be removed, only for testing
const UCollator *coll = elems->collator_;
collIterate *data = &(elems->iteratordata_);
int32_t length = elems->length_;
elems->reset_ = FALSE;
if (data->CEpos > data->CEs)
{
data->toReturn --;
(result) = *(data->toReturn);
if (data->CEs == data->toReturn)
data->CEpos = data->toReturn = data->CEs;
}
else
{
/* pointers are always at the next position to be retrieved for getnextce
for every first previous step after a next, the value returned will be the
same as the last next value
*/
/*if (data->len - data->pos == length)
(result) = UCOL_NO_MORE_CES;
UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_,
elems->length_, status);
/* synwee : to be removed, only for testing */
/*
const UCollator *coll = elems->collator_;
collIterate *data = &(elems->iteratordata_);
int32_t length = elems->length_;
if (data->CEpos > data->CEs)
{
data->toReturn --;
(result) = *(data->toReturn);
if (data->CEs == data->toReturn)
data->CEpos = data->toReturn = data->CEs;
}
else
{
if (data->pos != data->writableBuffer)
data->pos --;
{
if (data->pos == data->string || data->pos == data->writableBuffer)
(result) = UCOL_NO_MORE_CES;
else
{
data->pos = data->string +
(length - (data->len - data->writableBuffer));
data->len = data->string + length;
data->isThai = TRUE;
}
UChar ch = *(data->pos);
if (ch <= 0xFF)
(result) = (coll)->latinOneMapping[ch];
else
(result) = ucmp32_get((coll)->mapping, ch);
{
data->pos --;
UChar ch = *(data->pos);
if (ch <= 0xFF)
(result) = (coll)->latinOneMapping[ch];
else
(result) = ucmp32_get((coll)->mapping, ch);
if ((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialPrevCE(coll, result, data, length, status);
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getPrevUCA(ch, data, length, status);
}
}
} */
if ((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialPrevCE(coll, result, data, length, status);
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getPrevUCA(ch, data, length, status);
}
}
}
*/
if (result == UCOL_NO_MORE_CES)
result = UCOL_NULLORDER;
if (result == UCOL_NO_MORE_CES)
result = UCOL_NULLORDER;
return result;
return result;
}
}
U_CAPI int32_t
@ -240,14 +243,7 @@ ucol_getOffset(const UCollationElements *elems)
if (ci->isThai == TRUE)
return ci->pos - ci->string;
/*
if it is a thai string with reversed elements, since getNextCE does not
store only a substring in writeablebuffer, we'll have to do some calculation
to get the offset out.
need discussion to see if it is a better idea to store the whole string
instead.
*/
return elems->length_ - (ci->len - ci->pos);
return ci->pos - ci->writableBuffer;
}
U_CAPI void

View File

@ -87,6 +87,10 @@ struct UCollationElements
* Source text length
*/
int32_t length_;
/**
* Indicates if this data has been reset.
*/
UBool reset_;
};
struct incrementalContext {
@ -240,20 +244,12 @@ struct incrementalContext {
} \
} \
else { \
if ((data).len - (data).pos == length) { \
if ((data).pos == (data).string || (data).pos == (data).writableBuffer) {\
(order) = UCOL_NO_MORE_CES; \
} \
else { \
UChar ch; \
if ((data).pos != (data).writableBuffer) { \
(data).pos --; \
} \
else { \
(data).pos = (data).string + \
(length - ((data).len - (data).writableBuffer)); \
(data).len = (data).string + length; \
(data).isThai = TRUE; \
} \
(data).pos --; \
ch = *((data).pos); \
if (ch <= 0xFF) { \
(order) = (coll)->latinOneMapping[ch]; \