ICU-2397 add UCharIterator getState() and setState()

X-SVN-Rev: 10868
This commit is contained in:
Markus Scherer 2003-01-18 01:03:46 +00:00
parent c456ab0bbe
commit 7ec4d2f3e9

View File

@ -27,6 +27,8 @@
#include "unicode/utypes.h"
#include "unicode/uiter.h"
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
#define log_err printf
/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
@ -226,14 +228,15 @@ lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32
* context pointer to UTF-8 string
* length UTF-16 length of the string; -1 until lazy evaluation
* start current UTF-8 index
* index current UTF-16 index
* index current UTF-16 index; may be -1="unknown" after setState()
* limit UTF-8 length of the string
* reservedField supplementary code point
*
* Since UCharIterator delivers 16-bit code units, the iteration can be
* currently in the middle of the byte sequence for a supplementary code point.
* In this case, reservedField will contain that code point and start will
* point to after the corresponding byte sequence.
* point to after the corresponding byte sequence. The UTF-16 index will be
* one less than what it would otherwise be corresponding to the UTF-8 index.
* Otherwise, reservedField will be 0.
*/
@ -249,6 +252,33 @@ lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
case UITER_START:
return 0;
case UITER_CURRENT:
if(iter->index<0) {
/* the current UTF-16 index is unknown after setState(), count from the beginning */
const uint8_t *s;
UChar32 c;
int32_t i, limit, index;
s=(const uint8_t *)iter->context;
i=index=0;
limit=iter->start; /* count up to the UTF-8 index */
while(i<limit) {
L8_NEXT(s, i, limit, c);
if(c<=0xffff) {
++index;
} else {
index+=2;
}
}
iter->start=i; /* just in case setState() did not get us to a code point boundary */
if(i==iter->limit) {
iter->length=index; /* in case it was <0 or wrong */
}
if(iter->reservedField!=0) {
--index; /* we are in the middle of a supplementary code point */
}
iter->index=index;
}
return iter->index;
case UITER_LIMIT:
case UITER_LENGTH:
@ -258,13 +288,37 @@ lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
int32_t i, limit, length;
s=(const uint8_t *)iter->context;
i=iter->start;
limit=iter->limit;
length=iter->index;
if(iter->reservedField!=0) {
iter->reservedField=0;
++length;
if(iter->index<0) {
/*
* the current UTF-16 index is unknown after setState(),
* we must first count from the beginning to here
*/
i=length=0;
limit=iter->start;
/* count from the beginning to the current index */
while(i<limit) {
L8_NEXT(s, i, limit, c);
if(c<=0xffff) {
++length;
} else {
length+=2;
}
}
/* assume i==limit==iter->start, set the UTF-16 index */
iter->start=i; /* just in case setState() did not get us to a code point boundary */
iter->index= iter->reservedField!=0 ? length-1 : length;
} else {
i=iter->start;
length=iter->index;
if(iter->reservedField!=0) {
++length;
}
}
/* count from the current index to the end */
limit=iter->limit;
while(i<limit) {
L8_NEXT(s, i, limit, c);
if(c<=0xffff) {
@ -288,63 +342,94 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
const uint8_t *s;
UChar32 c;
int32_t pos; /* requested UTF-16 index */
int32_t i, limit; /* UTF-8 index & length */
int32_t i; /* UTF-8 index */
UBool havePos;
/* calculate the requested UTF-16 position */
/* calculate the requested UTF-16 index */
switch(origin) {
case UITER_ZERO:
case UITER_START:
pos=delta;
havePos=TRUE;
/* iter->index<0 (unknown) is possible */
break;
case UITER_CURRENT:
pos=iter->index+delta;
if(iter->index>=0) {
pos=iter->index+delta;
havePos=TRUE;
} else {
/* the current UTF-16 index is unknown after setState(), use only delta */
pos=0;
havePos=FALSE;
}
break;
case UITER_LIMIT:
case UITER_LENGTH:
pos=lenient8IteratorGetIndex(iter, UITER_LENGTH)+delta;
havePos=TRUE;
/* even if the UTF-16 index was unknown, we know it now: iter->index>=0 here */
break;
default:
return -1; /* Error */
}
/* shortcuts: pinning to the edges of the string */
if(pos<=0) {
iter->index=iter->start=iter->reservedField=0;
return 0;
} else if(iter->length>=0 && pos>=iter->length) {
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
return iter->index;
if(havePos) {
/* shortcuts: pinning to the edges of the string */
if(pos<=0) {
iter->index=iter->start=iter->reservedField=0;
return 0;
} else if(iter->length>=0 && pos>=iter->length) {
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
return iter->index;
}
/* minimize the number of L8_NEXT/PREV operations */
if(iter->index<0 || pos<iter->index/2) {
/* go forward from the start instead of backward from the current index */
iter->index=iter->start=iter->reservedField=0;
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
/*
* if we have the UTF-16 index and length and the new position is
* closer to the end than the current index,
* then go backward from the end instead of forward from the current index
*/
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
}
delta=pos-iter->index;
if(delta==0) {
return iter->index; /* nothing to do */
}
} else {
/* move relative to unknown UTF-16 index */
if(delta==0) {
return UITER_MOVE_UNKNOWN_INDEX; /* nothing to do */
} else if(-delta>=iter->start) {
/* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
iter->index=iter->start=iter->reservedField=0;
return 0;
} else if(delta>=(iter->limit-iter->start)) {
/* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
iter->index=iter->length; /* may or may not be <0 (unknown) */
iter->start=iter->limit;
iter->reservedField=0;
return iter->index>=0 ? iter->index : UITER_MOVE_UNKNOWN_INDEX;
}
}
/* minimize the number of L8_NEXT/PREV operations */
if(pos<iter->index/2) {
/* go forward from the start instead of backward from the current index */
iter->index=iter->start=iter->reservedField=0;
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
/*
* if we have the UTF-16 length and the new position is
* closer to the end than the current index,
* then go backward from the end instead of forward from the current index
*/
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
}
/* delta!=0 */
delta=pos-iter->index;
if(delta==0) {
return iter->index; /* nothing to do */
}
/* move towards the requested position if possible */
/* move towards the requested position, pin to the edges of the string */
s=(const uint8_t *)iter->context;
pos=iter->index;
pos=iter->index; /* could be <0 (unknown) */
i=iter->start;
limit=iter->limit;
if(delta>0) {
/* go forward */
int32_t limit=iter->limit;
if(iter->reservedField!=0) {
iter->reservedField=0;
++pos;
@ -365,13 +450,18 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
break; /* delta=0; */
}
}
if(i==limit && iter->length<0) {
iter->length=pos;
if(i==limit) {
if(iter->length<0 && iter->index>=0) {
iter->length= iter->reservedField==0 ? pos : pos+1;
} else if(iter->index<0 && iter->length>=0) {
iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
}
}
} else /* delta<0 */ {
/* go backward */
if(iter->reservedField!=0) {
iter->reservedField=0;
i-=4; /* we stayed behind the supplementary code point; go before it now */
--pos;
++delta;
}
@ -385,6 +475,7 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
delta+=2;
} else /* delta==-1 */ {
/* stop in the middle of a supplementary code point */
i+=4; /* back to behind this supplementary code point for consistent state */
iter->reservedField=c;
--pos;
break; /* delta=0; */
@ -393,7 +484,17 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
}
iter->start=i;
return iter->index=pos;
if(iter->index>=0) {
return iter->index=pos;
} else {
/* we started with index<0 (unknown) so pos is bogus */
if(i<=1) {
return iter->index=i; /* reached the beginning */
} else {
/* we still don't know the UTF-16 index */
return UITER_MOVE_UNKNOWN_INDEX;
}
}
}
static UBool U_CALLCONV
@ -403,7 +504,7 @@ lenient8IteratorHasNext(UCharIterator *iter) {
static UBool U_CALLCONV
lenient8IteratorHasPrevious(UCharIterator *iter) {
return iter->index>0;
return iter->start>0;
}
static UChar32 U_CALLCONV
@ -430,19 +531,27 @@ lenient8IteratorCurrent(UCharIterator *iter) {
static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator *iter) {
int32_t index;
if(iter->reservedField!=0) {
UChar trail=U16_TRAIL(iter->reservedField);
iter->reservedField=0;
++iter->index;
if((index=iter->index)>=0) {
iter->index=index+1;
}
return trail;
} else if(iter->start<iter->limit) {
const uint8_t *s=(const uint8_t *)iter->context;
UChar32 c;
L8_NEXT(s, iter->start, iter->limit, c);
++iter->index;
if(iter->length<0 && iter->start==iter->limit) {
iter->length= c<=0xffff ? iter->index : iter->index+1;
if((index=iter->index)>=0) {
iter->index=++index;
if(iter->length<0 && iter->start==iter->limit) {
iter->length= c<=0xffff ? index : index+1;
}
} else if(iter->start==iter->limit && iter->length>=0) {
iter->index= c<=0xffff ? iter->length : iter->length-1;
}
if(c<0) {
return 0xfffd;
@ -459,18 +568,26 @@ lenient8IteratorNext(UCharIterator *iter) {
static UChar32 U_CALLCONV
lenient8IteratorPrevious(UCharIterator *iter) {
int32_t index;
if(iter->reservedField!=0) {
UChar lead=U16_LEAD(iter->reservedField);
iter->reservedField=0;
iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
--iter->index;
if((index=iter->index)>0) {
iter->index=index-1;
}
return lead;
} else if(iter->start>0) {
const uint8_t *s=(const uint8_t *)iter->context;
UChar32 c;
L8_PREV(s, 0, iter->start, c);
--iter->index;
if((index=iter->index)>0) {
iter->index=index-1;
} else if(iter->start<=1) {
iter->index= c<=0xffff ? iter->start : iter->start+1;
}
if(c<0) {
return 0xfffd;
} else if(c<=0xffff) {
@ -485,6 +602,54 @@ lenient8IteratorPrevious(UCharIterator *iter) {
}
}
/*
 * Pack the iterator's current position into a single 32-bit state word.
 *
 * Layout of the returned value:
 *   bits 31..1  iter->start, the UTF-8 byte index
 *   bit  0      1 if iter->reservedField!=0, i.e. the iteration is in the
 *               middle of a supplementary code point; 0 otherwise
 *
 * iter   the iterator to query; may be NULL
 * return the state word, or 1 if iter==NULL. The value 1 (byte index 0 with
 *        the "middle of a supplementary code point" bit set) is rejected by
 *        lenient8IteratorSetState(), which requires a byte index >=4 when
 *        bit 0 is set.
 */
static uint32_t U_CALLCONV
lenient8IteratorGetState(const UCharIterator *iter) {
    uint32_t state;

    if(iter==NULL) {
        return 1; /* invalid */
    }

    /*
     * Convert to unsigned before shifting: left-shifting a signed value
     * whose result does not fit is undefined behavior in C, which would
     * happen here for byte indexes >=0x40000000.
     */
    state=(uint32_t)iter->start<<1;
    if(iter->reservedField!=0) {
        state|=1;
    }
    return state;
}
/*
 * Restore an iterator position from a state word as produced by
 * lenient8IteratorGetState(): the upper 31 bits are the UTF-8 byte index,
 * bit 0 says whether the position is in the middle of a supplementary
 * code point.
 *
 * iter        the iterator to modify
 * state       the state word to restore
 * pErrorCode  in/out error code; nothing is done if it is NULL or already
 *             indicates a failure. Set to U_ILLEGAL_ARGUMENT_ERROR if
 *             iter==NULL, and to U_INDEX_OUTOFBOUNDS_ERROR if the state
 *             does not describe a usable position.
 *
 * The UTF-16 index is set to -1 (unknown) unless the byte index is 0 or 1.
 */
static void U_CALLCONV
lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    int32_t bytePos;   /* UTF-8 index taken from the state word */
    int32_t minPos;    /* smallest acceptable UTF-8 index */
    uint32_t midPair;  /* 1 if the state is inside a surrogate pair */

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return; /* do nothing */
    }
    if(iter==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    bytePos=(int32_t)(state>>1);
    midPair=state&1;

    /* inside a surrogate pair we must be behind a 4-byte sequence */
    minPos= midPair==0 ? 0 : 4;
    if(bytePos<minPos || iter->limit<bytePos) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }

    /* restore the UTF-8 byte index; the UTF-16 index is known only near the start */
    iter->start=bytePos;
    iter->index= bytePos<=1 ? bytePos : -1;

    if(midPair==0) {
        iter->reservedField=0;
    } else {
        /*
         * bytePos>=4 was verified above.
         * L8_PREV is defined elsewhere in this file; presumably it reads the
         * code point that ends just before bytePos (and moves bytePos back).
         * Note that start and index have already been updated at this point,
         * so they stay modified even if the check below fails.
         */
        UChar32 cp;
        L8_PREV((const uint8_t *)iter->context, 0, bytePos, cp);
        if(cp>0xffff) {
            iter->reservedField=cp;
        } else {
            /* not a supplementary code point: the state bit was bogus */
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        }
    }
}
static const UCharIterator lenient8Iterator={
0, 0, 0, 0, 0, 0,
lenient8IteratorGetIndex,
@ -494,10 +659,12 @@ static const UCharIterator lenient8Iterator={
lenient8IteratorCurrent,
lenient8IteratorNext,
lenient8IteratorPrevious,
0
NULL,
lenient8IteratorGetState,
lenient8IteratorSetState
};
static void
U_CAPI void U_EXPORT2
uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
if(iter!=0) {
if(s!=0 && length>=-1) {
@ -508,10 +675,10 @@ uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
} else {
iter->limit=strlen(s);
}
iter->length= iter->limit==0 ? 0 : -1;
iter->length= iter->limit<=1 ? iter->limit : -1;
} else {
/* set no-op iterator */
uiter_setUTF8(iter, NULL, 0);
uiter_setString(iter, NULL, 0);
}
}
}
@ -691,6 +858,92 @@ compareIterators(UCharIterator *iter1, const char *n1,
}
}
/*
 * Test the iterator's getState() and setState() functions.
 *
 * The state of iter1, positioned at UTF-16 index "middle", is transferred to
 * iter2 via uiter_getState()/uiter_setState(). iter2 is then moved around
 * that position and must deliver the same code units that iter1 delivered
 * for indexes middle-2..middle+1, and must finally report the same current
 * index and the same length as iter1.
 *
 * iter1 and iter2 must be set up for the same iterator type and the same string
 * but may be physically different structs (different addresses).
 *
 * Assume that the text is not empty and that
 * iteration start==0 and iteration limit==length.
 * It must be 2<=middle<=length-2.
 *
 * iter1   iterator from which the state is taken
 * iter2   iterator into which the state is set and which is then checked
 * n       name used as a prefix in the error messages
 * middle  UTF-16 index at which the state is taken
 *
 * Mismatches are reported with log_err(). The function returns early if
 * iter1 cannot deliver the four code units or if uiter_setState() fails.
 */
static void
testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
UChar32 u[4]; /* expected code units at indexes middle-2..middle+1 */
UErrorCode errorCode;
UChar32 c;
uint32_t state;
int32_t i, j;
/* get four UChars from the middle of the string: u[i] is at index middle-2+i */
iter1->move(iter1, middle-2, UITER_ZERO);
for(i=0; i<4; ++i) {
c=iter1->next(iter1);
if(c<0) {
/* the test violates the assumptions, see comment above */
log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
return;
}
u[i]=c;
}
/* iter1 is now at middle+2; move back to the middle and get the state */
iter1->move(iter1, -2, UITER_CURRENT);
state=uiter_getState(iter1);
/* set the state into the second iterator and compare the results */
errorCode=U_ZERO_ERROR;
uiter_setState(iter2, state, &errorCode);
if(U_FAILURE(errorCode)) {
log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
return;
}
/* iter2 should be at middle: current() must be the unit at middle */
c=iter2->current(iter2);
if(c!=u[2]) {
log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
}
/* step back once: expect the unit at middle-1, leaving iter2 at middle-1 */
c=iter2->previous(iter2);
if(c!=u[1]) {
log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
}
/* relative move forward to middle+1, then read the unit there (ends at middle+2) */
iter2->move(iter2, 2, UITER_CURRENT);
c=iter2->next(iter2);
if(c!=u[3]) {
log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
}
/* relative move backward to middle-1, then read the unit before it (ends at middle-2) */
iter2->move(iter2, -3, UITER_CURRENT);
c=iter2->previous(iter2);
if(c!=u[0]) {
log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
}
/* move the second iterator back to the middle: middle-2 -> middle-1 -> middle */
iter2->move(iter2, 1, UITER_CURRENT);
iter2->next(iter2);
/* check that both are in the middle */
i=iter1->getIndex(iter1, UITER_CURRENT);
j=iter2->getIndex(iter2, UITER_CURRENT);
if(i!=middle) {
log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
}
if(i!=j) {
log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
}
/* compare lengths */
i=iter1->getIndex(iter1, UITER_LENGTH);
j=iter2->getIndex(iter2, UITER_LENGTH);
if(i!=j) {
log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
}
}
static void
TestLenient8Iterator() {
static const UChar text[]={
@ -708,6 +961,7 @@ TestLenient8Iterator() {
UCharIterator iter1, iter2;
UChar32 c1, c2;
int32_t length;
puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
@ -720,9 +974,18 @@ TestLenient8Iterator() {
uiter_setLenient8(&iter2, (const char *)bytes, -1);
compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
/* test get/set state */
length=LENGTHOF(text)-1;
uiter_setLenient8(&iter1, bytes, -1);
testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
/* ---------------------------------------------------------------------- */
puts("no output so far means that the lenient-8 iterator works fine");
puts("iterate forward:\nUTF-16\tlenient-8");
uiter_setString(&iter1, text, -1);
iter1.move(&iter1, 0, UITER_START);
iter2.move(&iter2, 0, UITER_START);
for(;;) {