ICU-2397 add UCharIterator getState() and setState()
X-SVN-Rev: 10868
This commit is contained in:
parent
c456ab0bbe
commit
7ec4d2f3e9
@ -27,6 +27,8 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uiter.h"
|
||||
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
#define log_err printf
|
||||
|
||||
/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
|
||||
@ -226,14 +228,15 @@ lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32
|
||||
* context pointer to UTF-8 string
|
||||
* length UTF-16 length of the string; -1 until lazy evaluation
|
||||
* start current UTF-8 index
|
||||
* index current UTF-16 index
|
||||
* index current UTF-16 index; may be -1="unknown" after setState()
|
||||
* limit UTF-8 length of the string
|
||||
* reservedField supplementary code point
|
||||
*
|
||||
* Since UCharIterator delivers 16-bit code units, the iteration can be
|
||||
* currently in the middle of the byte sequence for a supplementary code point.
|
||||
* In this case, reservedField will contain that code point and start will
|
||||
* point to after the corresponding byte sequence.
|
||||
* point to after the corresponding byte sequence. The UTF-16 index will be
|
||||
* one less than what it would otherwise be corresponding to the UTF-8 index.
|
||||
* Otherwise, reservedField will be 0.
|
||||
*/
|
||||
|
||||
@ -249,6 +252,33 @@ lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
|
||||
case UITER_START:
|
||||
return 0;
|
||||
case UITER_CURRENT:
|
||||
if(iter->index<0) {
|
||||
/* the current UTF-16 index is unknown after setState(), count from the beginning */
|
||||
const uint8_t *s;
|
||||
UChar32 c;
|
||||
int32_t i, limit, index;
|
||||
|
||||
s=(const uint8_t *)iter->context;
|
||||
i=index=0;
|
||||
limit=iter->start; /* count up to the UTF-8 index */
|
||||
while(i<limit) {
|
||||
L8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++index;
|
||||
} else {
|
||||
index+=2;
|
||||
}
|
||||
}
|
||||
|
||||
iter->start=i; /* just in case setState() did not get us to a code point boundary */
|
||||
if(i==iter->limit) {
|
||||
iter->length=index; /* in case it was <0 or wrong */
|
||||
}
|
||||
if(iter->reservedField!=0) {
|
||||
--index; /* we are in the middle of a supplementary code point */
|
||||
}
|
||||
iter->index=index;
|
||||
}
|
||||
return iter->index;
|
||||
case UITER_LIMIT:
|
||||
case UITER_LENGTH:
|
||||
@ -258,13 +288,37 @@ lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
|
||||
int32_t i, limit, length;
|
||||
|
||||
s=(const uint8_t *)iter->context;
|
||||
i=iter->start;
|
||||
limit=iter->limit;
|
||||
length=iter->index;
|
||||
if(iter->reservedField!=0) {
|
||||
iter->reservedField=0;
|
||||
++length;
|
||||
if(iter->index<0) {
|
||||
/*
|
||||
* the current UTF-16 index is unknown after setState(),
|
||||
* we must first count from the beginning to here
|
||||
*/
|
||||
i=length=0;
|
||||
limit=iter->start;
|
||||
|
||||
/* count from the beginning to the current index */
|
||||
while(i<limit) {
|
||||
L8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++length;
|
||||
} else {
|
||||
length+=2;
|
||||
}
|
||||
}
|
||||
|
||||
/* assume i==limit==iter->start, set the UTF-16 index */
|
||||
iter->start=i; /* just in case setState() did not get us to a code point boundary */
|
||||
iter->index= iter->reservedField!=0 ? length-1 : length;
|
||||
} else {
|
||||
i=iter->start;
|
||||
length=iter->index;
|
||||
if(iter->reservedField!=0) {
|
||||
++length;
|
||||
}
|
||||
}
|
||||
|
||||
/* count from the current index to the end */
|
||||
limit=iter->limit;
|
||||
while(i<limit) {
|
||||
L8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
@ -288,63 +342,94 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
|
||||
const uint8_t *s;
|
||||
UChar32 c;
|
||||
int32_t pos; /* requested UTF-16 index */
|
||||
int32_t i, limit; /* UTF-8 index & length */
|
||||
int32_t i; /* UTF-8 index */
|
||||
UBool havePos;
|
||||
|
||||
/* calculate the requested UTF-16 position */
|
||||
/* calculate the requested UTF-16 index */
|
||||
switch(origin) {
|
||||
case UITER_ZERO:
|
||||
case UITER_START:
|
||||
pos=delta;
|
||||
havePos=TRUE;
|
||||
/* iter->index<0 (unknown) is possible */
|
||||
break;
|
||||
case UITER_CURRENT:
|
||||
pos=iter->index+delta;
|
||||
if(iter->index>=0) {
|
||||
pos=iter->index+delta;
|
||||
havePos=TRUE;
|
||||
} else {
|
||||
/* the current UTF-16 index is unknown after setState(), use only delta */
|
||||
pos=0;
|
||||
havePos=FALSE;
|
||||
}
|
||||
break;
|
||||
case UITER_LIMIT:
|
||||
case UITER_LENGTH:
|
||||
pos=lenient8IteratorGetIndex(iter, UITER_LENGTH)+delta;
|
||||
havePos=TRUE;
|
||||
/* even if the UTF-16 index was unknown, we know it now: iter->index>=0 here */
|
||||
break;
|
||||
default:
|
||||
return -1; /* Error */
|
||||
}
|
||||
|
||||
/* shortcuts: pinning to the edges of the string */
|
||||
if(pos<=0) {
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
return 0;
|
||||
} else if(iter->length>=0 && pos>=iter->length) {
|
||||
iter->index=iter->length;
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
return iter->index;
|
||||
if(havePos) {
|
||||
/* shortcuts: pinning to the edges of the string */
|
||||
if(pos<=0) {
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
return 0;
|
||||
} else if(iter->length>=0 && pos>=iter->length) {
|
||||
iter->index=iter->length;
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
return iter->index;
|
||||
}
|
||||
|
||||
/* minimize the number of L8_NEXT/PREV operations */
|
||||
if(iter->index<0 || pos<iter->index/2) {
|
||||
/* go forward from the start instead of backward from the current index */
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
|
||||
/*
|
||||
* if we have the UTF-16 index and length and the new position is
|
||||
* closer to the end than the current index,
|
||||
* then go backward from the end instead of forward from the current index
|
||||
*/
|
||||
iter->index=iter->length;
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
}
|
||||
|
||||
delta=pos-iter->index;
|
||||
if(delta==0) {
|
||||
return iter->index; /* nothing to do */
|
||||
}
|
||||
} else {
|
||||
/* move relative to unknown UTF-16 index */
|
||||
if(delta==0) {
|
||||
return UITER_MOVE_UNKNOWN_INDEX; /* nothing to do */
|
||||
} else if(-delta>=iter->start) {
|
||||
/* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
return 0;
|
||||
} else if(delta>=(iter->limit-iter->start)) {
|
||||
/* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
|
||||
iter->index=iter->length; /* may or may not be <0 (unknown) */
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
return iter->index>=0 ? iter->index : UITER_MOVE_UNKNOWN_INDEX;
|
||||
}
|
||||
}
|
||||
|
||||
/* minimize the number of L8_NEXT/PREV operations */
|
||||
if(pos<iter->index/2) {
|
||||
/* go forward from the start instead of backward from the current index */
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
|
||||
/*
|
||||
* if we have the UTF-16 length and the new position is
|
||||
* closer to the end than the current index,
|
||||
* then go backward from the end instead of forward from the current index
|
||||
*/
|
||||
iter->index=iter->length;
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
}
|
||||
/* delta!=0 */
|
||||
|
||||
delta=pos-iter->index;
|
||||
if(delta==0) {
|
||||
return iter->index; /* nothing to do */
|
||||
}
|
||||
|
||||
/* move towards the requested position if possible */
|
||||
/* move towards the requested position, pin to the edges of the string */
|
||||
s=(const uint8_t *)iter->context;
|
||||
pos=iter->index;
|
||||
pos=iter->index; /* could be <0 (unknown) */
|
||||
i=iter->start;
|
||||
limit=iter->limit;
|
||||
if(delta>0) {
|
||||
/* go forward */
|
||||
int32_t limit=iter->limit;
|
||||
if(iter->reservedField!=0) {
|
||||
iter->reservedField=0;
|
||||
++pos;
|
||||
@ -365,13 +450,18 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
|
||||
break; /* delta=0; */
|
||||
}
|
||||
}
|
||||
if(i==limit && iter->length<0) {
|
||||
iter->length=pos;
|
||||
if(i==limit) {
|
||||
if(iter->length<0 && iter->index>=0) {
|
||||
iter->length= iter->reservedField==0 ? pos : pos+1;
|
||||
} else if(iter->index<0 && iter->length>=0) {
|
||||
iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
|
||||
}
|
||||
}
|
||||
} else /* delta<0 */ {
|
||||
/* go backward */
|
||||
if(iter->reservedField!=0) {
|
||||
iter->reservedField=0;
|
||||
i-=4; /* we stayed behind the supplementary code point; go before it now */
|
||||
--pos;
|
||||
++delta;
|
||||
}
|
||||
@ -385,6 +475,7 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
|
||||
delta+=2;
|
||||
} else /* delta==-1 */ {
|
||||
/* stop in the middle of a supplementary code point */
|
||||
i+=4; /* back to behind this supplementary code point for consistent state */
|
||||
iter->reservedField=c;
|
||||
--pos;
|
||||
break; /* delta=0; */
|
||||
@ -393,7 +484,17 @@ lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin ori
|
||||
}
|
||||
|
||||
iter->start=i;
|
||||
return iter->index=pos;
|
||||
if(iter->index>=0) {
|
||||
return iter->index=pos;
|
||||
} else {
|
||||
/* we started with index<0 (unknown) so pos is bogus */
|
||||
if(i<=1) {
|
||||
return iter->index=i; /* reached the beginning */
|
||||
} else {
|
||||
/* we still don't know the UTF-16 index */
|
||||
return UITER_MOVE_UNKNOWN_INDEX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static UBool U_CALLCONV
|
||||
@ -403,7 +504,7 @@ lenient8IteratorHasNext(UCharIterator *iter) {
|
||||
|
||||
static UBool U_CALLCONV
|
||||
lenient8IteratorHasPrevious(UCharIterator *iter) {
|
||||
return iter->index>0;
|
||||
return iter->start>0;
|
||||
}
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
@ -430,19 +531,27 @@ lenient8IteratorCurrent(UCharIterator *iter) {
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
lenient8IteratorNext(UCharIterator *iter) {
|
||||
int32_t index;
|
||||
|
||||
if(iter->reservedField!=0) {
|
||||
UChar trail=U16_TRAIL(iter->reservedField);
|
||||
iter->reservedField=0;
|
||||
++iter->index;
|
||||
if((index=iter->index)>=0) {
|
||||
iter->index=index+1;
|
||||
}
|
||||
return trail;
|
||||
} else if(iter->start<iter->limit) {
|
||||
const uint8_t *s=(const uint8_t *)iter->context;
|
||||
UChar32 c;
|
||||
|
||||
L8_NEXT(s, iter->start, iter->limit, c);
|
||||
++iter->index;
|
||||
if(iter->length<0 && iter->start==iter->limit) {
|
||||
iter->length= c<=0xffff ? iter->index : iter->index+1;
|
||||
if((index=iter->index)>=0) {
|
||||
iter->index=++index;
|
||||
if(iter->length<0 && iter->start==iter->limit) {
|
||||
iter->length= c<=0xffff ? index : index+1;
|
||||
}
|
||||
} else if(iter->start==iter->limit && iter->length>=0) {
|
||||
iter->index= c<=0xffff ? iter->length : iter->length-1;
|
||||
}
|
||||
if(c<0) {
|
||||
return 0xfffd;
|
||||
@ -459,18 +568,26 @@ lenient8IteratorNext(UCharIterator *iter) {
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
lenient8IteratorPrevious(UCharIterator *iter) {
|
||||
int32_t index;
|
||||
|
||||
if(iter->reservedField!=0) {
|
||||
UChar lead=U16_LEAD(iter->reservedField);
|
||||
iter->reservedField=0;
|
||||
iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
|
||||
--iter->index;
|
||||
if((index=iter->index)>0) {
|
||||
iter->index=index-1;
|
||||
}
|
||||
return lead;
|
||||
} else if(iter->start>0) {
|
||||
const uint8_t *s=(const uint8_t *)iter->context;
|
||||
UChar32 c;
|
||||
|
||||
L8_PREV(s, 0, iter->start, c);
|
||||
--iter->index;
|
||||
if((index=iter->index)>0) {
|
||||
iter->index=index-1;
|
||||
} else if(iter->start<=1) {
|
||||
iter->index= c<=0xffff ? iter->start : iter->start+1;
|
||||
}
|
||||
if(c<0) {
|
||||
return 0xfffd;
|
||||
} else if(c<=0xffff) {
|
||||
@ -485,6 +602,54 @@ lenient8IteratorPrevious(UCharIterator *iter) {
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t U_CALLCONV
|
||||
lenient8IteratorGetState(const UCharIterator *iter) {
|
||||
if(iter==NULL) {
|
||||
return 1; /* invalid */
|
||||
} else {
|
||||
uint32_t state=(uint32_t)(iter->start<<1);
|
||||
if(iter->reservedField!=0) {
|
||||
state|=1;
|
||||
}
|
||||
return state;
|
||||
}
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
/* do nothing */
|
||||
} else if(iter==NULL) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
int32_t index=(int32_t)(state>>1); /* UTF-8 index */
|
||||
state&=1; /* 1 if in surrogate pair, must be index>=4 */
|
||||
|
||||
if((state==0 ? index<0 : index<4) || iter->limit<index) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
} else {
|
||||
iter->start=index; /* restore UTF-8 byte index */
|
||||
if(index<=1) {
|
||||
iter->index=index;
|
||||
} else {
|
||||
iter->index=-1; /* unknown UTF-16 index */
|
||||
}
|
||||
if(state==0) {
|
||||
iter->reservedField=0;
|
||||
} else {
|
||||
/* verified index>=4 above */
|
||||
UChar32 c;
|
||||
L8_PREV((const uint8_t *)iter->context, 0, index, c);
|
||||
if(c<=0xffff) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
} else {
|
||||
iter->reservedField=c;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const UCharIterator lenient8Iterator={
|
||||
0, 0, 0, 0, 0, 0,
|
||||
lenient8IteratorGetIndex,
|
||||
@ -494,10 +659,12 @@ static const UCharIterator lenient8Iterator={
|
||||
lenient8IteratorCurrent,
|
||||
lenient8IteratorNext,
|
||||
lenient8IteratorPrevious,
|
||||
0
|
||||
NULL,
|
||||
lenient8IteratorGetState,
|
||||
lenient8IteratorSetState
|
||||
};
|
||||
|
||||
static void
|
||||
U_CAPI void U_EXPORT2
|
||||
uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
|
||||
if(iter!=0) {
|
||||
if(s!=0 && length>=-1) {
|
||||
@ -508,10 +675,10 @@ uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
|
||||
} else {
|
||||
iter->limit=strlen(s);
|
||||
}
|
||||
iter->length= iter->limit==0 ? 0 : -1;
|
||||
iter->length= iter->limit<=1 ? iter->limit : -1;
|
||||
} else {
|
||||
/* set no-op iterator */
|
||||
uiter_setUTF8(iter, NULL, 0);
|
||||
uiter_setString(iter, NULL, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -691,6 +858,92 @@ compareIterators(UCharIterator *iter1, const char *n1,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Test the iterator's getState() and setState() functions.
|
||||
* iter1 and iter2 must be set up for the same iterator type and the same string
|
||||
* but may be physically different structs (different addresses).
|
||||
*
|
||||
* Assume that the text is not empty and that
|
||||
* iteration start==0 and iteration limit==length.
|
||||
* It must be 2<=middle<=length-2.
|
||||
*/
|
||||
static void
|
||||
testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
|
||||
UChar32 u[4];
|
||||
|
||||
UErrorCode errorCode;
|
||||
UChar32 c;
|
||||
uint32_t state;
|
||||
int32_t i, j;
|
||||
|
||||
/* get four UChars from the middle of the string */
|
||||
iter1->move(iter1, middle-2, UITER_ZERO);
|
||||
for(i=0; i<4; ++i) {
|
||||
c=iter1->next(iter1);
|
||||
if(c<0) {
|
||||
/* the test violates the assumptions, see comment above */
|
||||
log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
|
||||
return;
|
||||
}
|
||||
u[i]=c;
|
||||
}
|
||||
|
||||
/* move to the middle and get the state */
|
||||
iter1->move(iter1, -2, UITER_CURRENT);
|
||||
state=uiter_getState(iter1);
|
||||
|
||||
/* set the state into the second iterator and compare the results */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
uiter_setState(iter2, state, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
c=iter2->current(iter2);
|
||||
if(c!=u[2]) {
|
||||
log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
|
||||
}
|
||||
|
||||
c=iter2->previous(iter2);
|
||||
if(c!=u[1]) {
|
||||
log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
|
||||
}
|
||||
|
||||
iter2->move(iter2, 2, UITER_CURRENT);
|
||||
c=iter2->next(iter2);
|
||||
if(c!=u[3]) {
|
||||
log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
|
||||
}
|
||||
|
||||
iter2->move(iter2, -3, UITER_CURRENT);
|
||||
c=iter2->previous(iter2);
|
||||
if(c!=u[0]) {
|
||||
log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
|
||||
}
|
||||
|
||||
/* move the second iterator back to the middle */
|
||||
iter2->move(iter2, 1, UITER_CURRENT);
|
||||
iter2->next(iter2);
|
||||
|
||||
/* check that both are in the middle */
|
||||
i=iter1->getIndex(iter1, UITER_CURRENT);
|
||||
j=iter2->getIndex(iter2, UITER_CURRENT);
|
||||
if(i!=middle) {
|
||||
log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
|
||||
}
|
||||
if(i!=j) {
|
||||
log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
|
||||
}
|
||||
|
||||
/* compare lengths */
|
||||
i=iter1->getIndex(iter1, UITER_LENGTH);
|
||||
j=iter2->getIndex(iter2, UITER_LENGTH);
|
||||
if(i!=j) {
|
||||
log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
TestLenient8Iterator() {
|
||||
static const UChar text[]={
|
||||
@ -708,6 +961,7 @@ TestLenient8Iterator() {
|
||||
|
||||
UCharIterator iter1, iter2;
|
||||
UChar32 c1, c2;
|
||||
int32_t length;
|
||||
|
||||
puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
|
||||
|
||||
@ -720,9 +974,18 @@ TestLenient8Iterator() {
|
||||
uiter_setLenient8(&iter2, (const char *)bytes, -1);
|
||||
compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
|
||||
|
||||
/* test get/set state */
|
||||
length=LENGTHOF(text)-1;
|
||||
uiter_setLenient8(&iter1, bytes, -1);
|
||||
testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
|
||||
testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
puts("no output so far means that the lenient-8 iterator works fine");
|
||||
|
||||
puts("iterate forward:\nUTF-16\tlenient-8");
|
||||
uiter_setString(&iter1, text, -1);
|
||||
iter1.move(&iter1, 0, UITER_START);
|
||||
iter2.move(&iter2, 0, UITER_START);
|
||||
for(;;) {
|
||||
|
Loading…
Reference in New Issue
Block a user