ICU-871 revert syn wee's changes to contraction handling

X-SVN-Rev: 13839
This commit is contained in:
Vladimir Weinstein 2003-11-24 19:40:10 +00:00
parent 7e210c5402
commit 6be38162d2
3 changed files with 70 additions and 34 deletions

View File

@ -2896,7 +2896,6 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
uint32_t firstCE = UCOL_NOT_FOUND;
const UChar *UCharOffset;
UChar schar, tchar;
UBool wasIgnorable = FALSE;
for (;;) {
/* This loop will run once per source string character, for as long as we */
@ -2917,28 +2916,9 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
source->flags = source->origFlags;
}
}
else if (wasIgnorable) {
// move back to last non-ignorable position
// this is to synch with the reverse direction
loadState(source, &state, TRUE);
if(source->origFlags & UCOL_USE_ITERATOR) {
source->flags = source->origFlags;
}
}
break;
}
uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
if(tempCE != UCOL_NOT_FOUND && wasIgnorable == FALSE) {
// We have scanned a a section of source string for which there is a
// non-ignorable CE from the contraction table.
// Remember the CE and scan position, so
// that we can return to this point if further scanning fails to
// match a longer contraction sequence.
firstCE = tempCE;
backupState(source, &state);
}
uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
@ -2952,24 +2932,22 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
// Pick up the corresponding CE from the table.
CE = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
wasIgnorable = FALSE;
}
else
{
// if there is a completely ignorable code point in the middle of
// contraction, we need to act as if it's not there
uint32_t nextCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
// it's easy for BMP code points
if(nextCE == 0) {
wasIgnorable = TRUE;
if(isZeroCE == 0) {
continue;
} else if(UTF_IS_LEAD(schar)) {
if(!collIter_eos(source)) {
backupState(source, &state);
UChar trail = getNextNormalizedChar(source);
if(UTF_IS_TRAIL(trail)) { // do stuff with trail
if(getCETag(nextCE) == SURROGATE_TAG) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, nextCE&0xFFFFFF, trail);
if(getCETag(isZeroCE) == SURROGATE_TAG) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
if(finalCE == 0) {
continue;
}
@ -3038,11 +3016,6 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
// out of loop, this CE will end up being returned. This is the normal
// way out of contraction handling when the source actually contained
// the contraction.
if (wasIgnorable) {
// move back to last non-ignorable position
// this is to synch with the reverse direction
loadState(source, &state, TRUE);
}
break;
}
@ -3050,6 +3023,31 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
// The source string char was in the contraction table, and the corresponding
// CE is IS a contraction CE. We will continue looping to check the source
// string for the remaining chars in the contraction.
uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
if(tempCE != UCOL_NOT_FOUND) {
// We have scanned a a section of source string for which there is a
// CE from the contraction table. Remember the CE and scan position, so
// that we can return to this point if further scanning fails to
// match a longer contraction sequence.
firstCE = tempCE;
goBackOne(source);
backupState(source, &state);
getNextNormalizedChar(source);
// Another way to do this is:
//collIterateState tempState;
//backupState(source, &tempState);
//goBackOne(source);
//backupState(source, &state);
//loadState(source, &tempState, TRUE);
// The problem is that for incomplete contractions we have to remember the previous
// position. Before, the only thing I needed to do was state.pos--;
// After iterator introduction and especially after introduction of normalizing
// iterators, it became much more difficult to decrease the saved state.
// I'm not yet sure which of the two methods above is faster.
}
} // for(;;)
break;
} // case CONTRACTION_TAG:
@ -3402,7 +3400,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
source->flags |= UCOL_ITER_INNORMBUF;
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
return ucol_IGetNextCE(coll, source, status); // *** (UCOL_IGNORABLE);
return(UCOL_IGNORABLE);
}
}
case CHARSET_TAG:
@ -3533,7 +3531,8 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
source->flags |= UCOL_ITER_INNORMBUF;
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
return ucol_IGetPrevCE(coll, source, status); // return UCOL_IGNORABLE;
//CE = UCOL_IGNORABLE;
return(UCOL_IGNORABLE);
}
break;
case SPEC_PROC_TAG:
@ -4070,7 +4069,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
source->flags |= UCOL_ITER_INNORMBUF;
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
return ucol_IGetPrevCE(coll, source, status);;
return(UCOL_IGNORABLE);
}
}
case LEAD_SURROGATE_TAG: /* D800-DBFF*/

View File

@ -415,9 +415,25 @@ backAndForth(UCollationElements *iter)
{
if (o != orders[-- index])
{
if (o == 0)
index ++;
else
{
while (index > 0 && orders[-- index] == 0)
{
}
if (o != orders[index])
{
log_err("Mismatch at index : 0x%x\n", index);
return;
}
}
}
}
while (index != 0 && orders[index - 1] == 0) {
index --;
}
if (index != 0)

View File

@ -637,17 +637,38 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
while ((o = iter.previous(status)) != CollationElementIterator::NULLORDER)
{
if (index == 0) {
if(o == 0) {
continue;
} else { // this is an error, orders exhausted but there are non-ignorable CEs from
// going backwards
errln("Backward iteration returned a non ignorable after orders are exhausted");
break;
}
}
if (o != orders[--index])
{
if (o == 0)
index ++;
else
{
while (index > 0 && orders[--index] == 0)
{
}
if (o != orders[index])
{
errln("Mismatch at index %d: 0x%X vs 0x%X", index,
orders[index], o);
break;
}
}
}
}
while (index != 0 && orders[index - 1] == 0)
{
index --;
}
if (index != 0)
{
UnicodeString msg("Didn't get back to beginning - index is ");