Updated the backwards collation element iterator code.

X-SVN-Rev: 3760
This commit is contained in:
Syn Wee Quek 2001-02-23 23:36:42 +00:00
parent 6d5b35e584
commit ec4c07eeb0
3 changed files with 123 additions and 147 deletions

View File

@ -7,7 +7,6 @@
* Date Name Comments * Date Name Comments
* 02/16/2001 synwee Added internal method getPrevSpecialCE * 02/16/2001 synwee Added internal method getPrevSpecialCE
*/ */
#include "ucolimp.h" #include "ucolimp.h"
#include "ucoltok.h" #include "ucoltok.h"
@ -1068,12 +1067,13 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
T += TBase; T += TBase;
// return the first CE, but first put the rest into the expansion buffer // return the first CE, but first put the rest into the expansion buffer
if (!collationSource->JamoSpecial) { // FAST PATH if (!collationSource->JamoSpecial) { // FAST PATH
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V); *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V);
if (T != TBase) { if (T != TBase) {
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T); *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
} }
return ucmp32_get(UCA->mapping, L); // return first one return ucmp32_get(UCA->mapping, L); // return first one
} else { // Jamo is Special } else { // Jamo is Special
@ -1103,6 +1103,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
} }
/* This is a code point minus 0x10000, that's what algorithm requires */ /* This is a code point minus 0x10000, that's what algorithm requires */
order = 0xE0010303 | (cp & 0xFFE00) << 8; order = 0xE0010303 | (cp & 0xFFE00) << 8;
*(collationSource->CEpos++) = 0x80200080 | (cp & 0x001FF) << 22; *(collationSource->CEpos++) = 0x80200080 | (cp & 0x001FF) << 22;
} else { } else {
return 0; /* completely ignorable */ return 0; /* completely ignorable */
@ -1144,7 +1145,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
We have to check if ch is possibly a first surrogate - then we need to We have to check if ch is possibly a first surrogate - then we need to
take the next code unit and make a bigger CE take the next code unit and make a bigger CE
*/ */
UChar nextChar; UChar prevChar;
const int const int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28, LCount = 19, VCount = 21, TCount = 28,
@ -1184,11 +1185,13 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
*/ */
if (!collationSource->JamoSpecial) if (!collationSource->JamoSpecial)
{ {
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L);
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V); *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
if (T != TBase) if (T != TBase)
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T); *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
/* return first one */
return ucmp32_get(UCA->mapping, L); collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
} else { } else {
/* /*
Jamo is Special Jamo is Special
@ -1213,28 +1216,23 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
if (UTF_IS_SECOND_SURROGATE(ch)) if (UTF_IS_SECOND_SURROGATE(ch))
{ {
if ((collationSource->len - collationSource->pos != length) && UChar *temp = collationSource->pos;
(UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos))) if (((collationSource->string < temp) ||
(collationSource->writableBuffer < temp)) &&
(UTF_IS_FIRST_SURROGATE(prevChar = *(collationSource->pos - 1))))
{ {
uint32_t cp = ((ch << 10UL) + nextChar - ((0xd800 << 10UL) + 0xdc00)); uint32_t cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
if (collationSource->pos != collationSource->writableBuffer) collationSource->pos --;
collationSource->pos --;
else
{
collationSource->pos = collationSource->string +
(length - (collationSource->len - collationSource->writableBuffer));
collationSource->len = collationSource->string + length;
collationSource->isThai = TRUE;
}
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00))
return 0; /* illegal code value, use completely ignorable! */ return 0; /* illegal code value, use completely ignorable! */
/* /*
This is a code point minus 0x10000, that's what algorithm requires This is a code point minus 0x10000, that's what algorithm requires
*/ */
order = 0xE0010303 | (cp & 0xFFE00) << 8; *(collationSource->CEpos ++) = 0xE0010303 | (cp & 0xFFE00) << 8;
*(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22; order = 0x80200080 | (cp & 0x001FF) << 22;
collationSource->toReturn ++; collationSource->toReturn = collationSource->CEpos;
*(collationSource->CEpos ++) = order;
} }
else else
return 0; /* completely ignorable */ return 0; /* completely ignorable */
@ -1246,9 +1244,11 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
return 0; /* completely ignorable */ return 0; /* completely ignorable */
/* Make up an artificial CE from code point as per UCA */ /* Make up an artificial CE from code point as per UCA */
order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11; *(collationSource->CEpos ++) = 0xD08003C3 | (ch & 0xF000) << 12 |
*(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27; (ch & 0x0FE0) << 11;
collationSource->toReturn ++; collationSource->toReturn = collationSource->CEpos;
order = 0x04000080 | (ch & 0x001F) << 27;
*(collationSource->CEpos ++) = order;
} }
} }
return order; /* return the CE */ return order; /* return the CE */
@ -1397,12 +1397,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
if (source->isThai == TRUE) if (source->isThai == TRUE)
{ /* if we encountered Thai prevowel & the string is not yet touched */ { /* if we encountered Thai prevowel & the string is not yet touched */
source->isThai = FALSE; source->isThai = FALSE;
/* strend = source->pos;
sigh... to cater for getNextCE, we'll have to modify and store the
whole string instead of a substring as in getSpecialCE
*/
UCharOffset = source->pos;
strend = source->len;
size = strend - source->string; size = strend - source->string;
if (size > UCOL_WRITABLE_BUFFER_SIZE) if (size > UCOL_WRITABLE_BUFFER_SIZE)
{ {
@ -1417,22 +1412,21 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
} }
UChar *sourceCopy = source->string; UChar *sourceCopy = source->string;
UChar *targetCopy = source->writableBuffer; UChar *targetCopy = source->writableBuffer;
while (sourceCopy < strend) while (sourceCopy <= strend)
{ {
if (UCOL_ISTHAIPREVOWEL(*sourceCopy) && if (UCOL_ISTHAIPREVOWEL(*sourceCopy) &&
/* This is the combination that needs to be swapped */ /* This is the combination that needs to be swapped */
UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1))) UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1)))
{ {
*(targetCopy) = *(sourceCopy + count + 1); *(targetCopy) = *(sourceCopy + 1);
*(targetCopy+1) = *(sourceCopy + count); *(targetCopy + 1) = *(sourceCopy);
targetCopy+=2; targetCopy += 2;
sourceCopy+=2; sourceCopy += 2;
} }
else else
*(targetCopy++) = *(sourceCopy++); *(targetCopy ++) = *(sourceCopy ++);
} }
source->pos = source->writableBuffer + source->pos = targetCopy;
(UCharOffset - source->string);
source->len = targetCopy; source->len = targetCopy;
source->CEpos = source->toReturn = source->CEs; source->CEpos = source->toReturn = source->CEs;
CE = UCOL_IGNORABLE; CE = UCOL_IGNORABLE;
@ -1470,32 +1464,22 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
*/ */
UCharOffset += *UCharOffset; UCharOffset += *UCharOffset;
schar = *source->pos; schar = *(source->pos - 1);
while (schar > (tchar = *UCharOffset)) while (schar > (tchar = *UCharOffset))
UCharOffset ++; UCharOffset ++;
if (schar != tchar) if (schar != tchar)
{ {
/* /*
we didn't find the correct codepoint. We can use either the first or we didn't find the correct codepoint. We can use either the first or
the last CE the last CE
*/ */
if (tchar != 0xFFFF) /* testing if (tchar != 0xFFFF) */
UCharOffset = constart; UCharOffset = constart;
} }
else else
{
/* Move up one character */ /* Move up one character */
if (source->pos != source->writableBuffer) source->pos --;
source->pos --;
else
{
source->pos = source->string +
(length - (source->len - source->writableBuffer));
source->len = source->string + length;
source->isThai = TRUE;
}
}
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
if (!isContraction(CE)) if (!isContraction(CE))
break; break;
@ -1521,7 +1505,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
while (*CEOffset != 0) while (*CEOffset != 0)
*(source->CEpos ++) = *CEOffset ++; *(source->CEpos ++) = *CEOffset ++;
source->toReturn = source->CEpos - 1; source->toReturn = source->CEpos - 1;
return *(source->toReturn --); return *(source->toReturn);
case CHARSET_TAG: case CHARSET_TAG:
/* probably after 1.8 */ /* probably after 1.8 */
return UCOL_NOT_FOUND; return UCOL_NOT_FOUND;

View File

@ -57,6 +57,7 @@ ucol_openElements(const UCollator *coll,
textLength = u_strlen(text); textLength = u_strlen(text);
result->length_ = textLength; result->length_ = textLength;
result->reset_ = TRUE;
init_collIterate(text, textLength, &result->iteratordata_, FALSE); init_collIterate(text, textLength, &result->iteratordata_, FALSE);
return result; return result;
@ -77,6 +78,7 @@ U_CAPI void
ucol_reset(UCollationElements *elems) ucol_reset(UCollationElements *elems)
{ {
collIterate *ci = &(elems->iteratordata_); collIterate *ci = &(elems->iteratordata_);
elems->reset_ = TRUE;
ci->pos = ci->string; ci->pos = ci->string;
ci->len = ci->string + elems->length_; ci->len = ci->string + elems->length_;
ci->CEpos = ci->toReturn = ci->CEs; ci->CEpos = ci->toReturn = ci->CEs;
@ -97,40 +99,42 @@ U_CAPI int32_t
ucol_next(UCollationElements *elems, ucol_next(UCollationElements *elems,
UErrorCode *status) UErrorCode *status)
{ {
int32_t result;
if (U_FAILURE(*status)) if (U_FAILURE(*status))
return UCOL_NULLORDER; return UCOL_NULLORDER;
int32_t result; elems->reset_ = FALSE;
UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status); UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status);
/* /*
if ((elems->iteratordata_).CEpos > (elems->iteratordata_).toReturn) if ((elems->iteratordata_).CEpos > (elems->iteratordata_).toReturn)
{ {
result = *((elems->iteratordata_).toReturn++); result = *((elems->iteratordata_).toReturn++);
if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn) if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn)
(elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn = (elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn =
(elems->iteratordata_).CEs; (elems->iteratordata_).CEs;
}
else
if ((elems->iteratordata_).pos < (elems->iteratordata_).len)
{
UChar ch = *(elems->iteratordata_).pos++;
if (ch <= 0xFF)
(result) = (elems->collator_)->latinOneMapping[ch];
else
(result) = ucmp32_get((elems->collator_)->mapping, ch);
if((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialCE((elems->collator_), (result),
&(elems->iteratordata_), (status));
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));
}
} }
else else
(result) = UCOL_NO_MORE_CES; if ((elems->iteratordata_).pos < (elems->iteratordata_).len)
{
UChar ch = *(elems->iteratordata_).pos++;
if (ch <= 0xFF)
(result) = (elems->collator_)->latinOneMapping[ch];
else
(result) = ucmp32_get((elems->collator_)->mapping, ch);
if((result) >= UCOL_NOT_FOUND)
{
(result) = getSpecialCE((elems->collator_), (result),
&(elems->iteratordata_), (status));
if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));
}
}
else
(result) = UCOL_NO_MORE_CES;
*/ */
if (result == UCOL_NO_MORE_CES) if (result == UCOL_NO_MORE_CES)
result = UCOL_NULLORDER; result = UCOL_NULLORDER;
return result; return result;
@ -142,62 +146,61 @@ ucol_previous(UCollationElements *elems,
{ {
if(U_FAILURE(*status)) if(U_FAILURE(*status))
return UCOL_NULLORDER; return UCOL_NULLORDER;
else
{
int32_t result;
int32_t result; if (elems->reset_ &&
UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_, (elems->iteratordata_.pos == elems->iteratordata_.string))
elems->length_, status); elems->iteratordata_.pos = elems->iteratordata_.len;
/* synwee : to be removed, only for testing elems->reset_ = FALSE;
const UCollator *coll = elems->collator_;
collIterate *data = &(elems->iteratordata_);
int32_t length = elems->length_;
if (data->CEpos > data->CEs) UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_,
{ elems->length_, status);
data->toReturn --;
(result) = *(data->toReturn); /* synwee : to be removed, only for testing */
if (data->CEs == data->toReturn) /*
data->CEpos = data->toReturn = data->CEs; const UCollator *coll = elems->collator_;
} collIterate *data = &(elems->iteratordata_);
else int32_t length = elems->length_;
{
/* pointers are always at the next position to be retrieved for getnextce if (data->CEpos > data->CEs)
for every first previous step after a next, value returned will the same {
as the last next value data->toReturn --;
*/ (result) = *(data->toReturn);
/*if (data->len - data->pos == length) if (data->CEs == data->toReturn)
(result) = UCOL_NO_MORE_CES; data->CEpos = data->toReturn = data->CEs;
}
else else
{ {
if (data->pos != data->writableBuffer) if (data->pos == data->string || data->pos == data->writableBuffer)
data->pos --; (result) = UCOL_NO_MORE_CES;
else else
{ {
data->pos = data->string + data->pos --;
(length - (data->len - data->writableBuffer));
data->len = data->string + length; UChar ch = *(data->pos);
data->isThai = TRUE; if (ch <= 0xFF)
} (result) = (coll)->latinOneMapping[ch];
else
UChar ch = *(data->pos); (result) = ucmp32_get((coll)->mapping, ch);
if (ch <= 0xFF)
(result) = (coll)->latinOneMapping[ch];
else
(result) = ucmp32_get((coll)->mapping, ch);
if ((result) >= UCOL_NOT_FOUND) if ((result) >= UCOL_NOT_FOUND)
{ {
(result) = getSpecialPrevCE(coll, result, data, length, status); (result) = getSpecialPrevCE(coll, result, data, length, status);
if ((result) == UCOL_NOT_FOUND) if ((result) == UCOL_NOT_FOUND)
(result) = ucol_getPrevUCA(ch, data, length, status); (result) = ucol_getPrevUCA(ch, data, length, status);
} }
} }
} */ }
*/
if (result == UCOL_NO_MORE_CES)
result = UCOL_NULLORDER;
if (result == UCOL_NO_MORE_CES) return result;
result = UCOL_NULLORDER; }
return result;
} }
U_CAPI int32_t U_CAPI int32_t
@ -240,14 +243,7 @@ ucol_getOffset(const UCollationElements *elems)
if (ci->isThai == TRUE) if (ci->isThai == TRUE)
return ci->pos - ci->string; return ci->pos - ci->string;
/* return ci->pos - ci->writableBuffer;
if it is a thai string with reversed elements, since getNextCE does not
store only a substring in writeablebuffer, we'll have to do some calculation
to get the offset out.
need discussion to see if it is a better idea to store the whole string
instead.
*/
return elems->length_ - (ci->len - ci->pos);
} }
U_CAPI void U_CAPI void

View File

@ -87,6 +87,10 @@ struct UCollationElements
* Source text length * Source text length
*/ */
int32_t length_; int32_t length_;
/**
* Indicates if this data has been reset.
*/
UBool reset_;
}; };
struct incrementalContext { struct incrementalContext {
@ -240,20 +244,12 @@ struct incrementalContext {
} \ } \
} \ } \
else { \ else { \
if ((data).len - (data).pos == length) { \ if ((data).pos == (data).string || (data).pos == (data).writableBuffer) {\
(order) = UCOL_NO_MORE_CES; \ (order) = UCOL_NO_MORE_CES; \
} \ } \
else { \ else { \
UChar ch; \ UChar ch; \
if ((data).pos != (data).writableBuffer) { \ (data).pos --; \
(data).pos --; \
} \
else { \
(data).pos = (data).string + \
(length - ((data).len - (data).writableBuffer)); \
(data).len = (data).string + length; \
(data).isThai = TRUE; \
} \
ch = *((data).pos); \ ch = *((data).pos); \
if (ch <= 0xFF) { \ if (ch <= 0xFF) { \
(order) = (coll)->latinOneMapping[ch]; \ (order) = (coll)->latinOneMapping[ch]; \