ICU-2397 add UTF-16BE UCharIterator

X-SVN-Rev: 10833
This commit is contained in:
Markus Scherer 2003-01-10 19:42:28 +00:00
parent 84d1432306
commit 7daa35bdd9
2 changed files with 143 additions and 1 deletions

View File

@ -186,6 +186,118 @@ uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
}
}
/* UCharIterator implementation for UTF-16BE strings ------------------------ */
/*
* This is an implementation of a code unit (UChar) iterator
* for UTF-16BE strings, i.e., strings in byte-vectors where
* each UChar is stored as a big-endian pair of bytes.
*
* The UCharIterator.context field holds a pointer to the string.
* Everything works just like with a normal UChar iterator (uiter_setString),
* except that UChars are assembled from byte pairs.
*/
static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator *iter) {
int32_t index;
if((index=iter->index)<iter->limit) {
const uint8_t *p=(const uint8_t *)iter->context;
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
} else {
return U_SENTINEL;
}
}
static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator *iter) {
int32_t index;
if((index=iter->index)<iter->limit) {
const uint8_t *p=(const uint8_t *)iter->context;
iter->index=index+1;
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
} else {
return U_SENTINEL;
}
}
static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator *iter) {
int32_t index;
if((index=iter->index)>iter->start) {
const uint8_t *p=(const uint8_t *)iter->context;
iter->index=--index;
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
} else {
return U_SENTINEL;
}
}
static const UCharIterator utf16BEIterator={
0, 0, 0, 0, 0, 0,
stringIteratorGetIndex,
stringIteratorMove,
stringIteratorHasNext,
stringIteratorHasPrevious,
utf16BEIteratorCurrent,
utf16BEIteratorNext,
utf16BEIteratorPrevious,
0
};
/*
* Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
* i.e., before a pair of 0 bytes where the first 0 byte is at an even
* offset from s.
*/
static int32_t
utf16BE_strlen(const char *s) {
if(((int32_t)s&1)==0) {
/*
* even-aligned, call u_strlen(s)
* we are probably on a little-endian machine, but searching for UChar NUL
* does not care about endianness
*/
return u_strlen((const UChar *)s);
} else {
/* odd-aligned, search for pair of 0 bytes */
const char *p=s;
while(!(*p==0 && p[1]==0)) {
p+=2;
}
return (int32_t)((p-s)/2);
}
}
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
if(iter!=0) {
/* allow only even-length strings (the input length counts bytes) */
if(s!=0 && length==-1 || (length>=0 && (length&1)==0)) {
if(U_IS_BIG_ENDIAN && ((int32_t)s&1)==0) {
/* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
uiter_setString(iter, (const UChar *)s, length/2);
return;
}
*iter=utf16BEIterator;
iter->context=s;
if(length>=0) {
iter->length=length/2;
} else {
iter->length=utf16BE_strlen(s);
}
iter->limit=iter->length;
} else {
*iter=noopIterator;
}
}
}
/* UCharIterator wrapper around CharacterIterator --------------------------- */
/*
@ -475,11 +587,20 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
return iter->index;
}
/* minimize the number of U8_NEXT/PREV operations */
if(pos<iter->index/2) {
/* go forward from the start instead of backward from the current index */
iter->index=iter->start=iter->reservedField=0;
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
/*
* if we have the UTF-16 length and the new position is
* closer to the end than the current index,
* then go backward from the end instead of forward from the current index
*/
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
}
/* ### TODO: consider going backward from the end in some cases! */
delta=pos-iter->index;
if(delta==0) {

View File

@ -395,6 +395,26 @@ uiter_previous32(UCharIterator *iter);
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-16BE string
* (byte vector with a big-endian pair of bytes per UChar).
*
* Everything works just like with a normal UChar iterator (uiter_setString),
* except that UChars are assembled from byte pairs,
* and that the length argument here indicates an even number of bytes.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-16BE string to iterate over
* @param length Length of s as an even number of bytes, or -1 if NUL-terminated
* (NUL means pair of 0 bytes at even index from s)
*
* @see UCharIterator
* @see uiter_setString
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-8 string.
*
@ -402,6 +422,7 @@ uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
* with UTF-8 iteration boundaries 0 and length.
* The implementation counts the UTF-16 index on the fly and
* lazily evaluates the UTF-16 length of the text.
*
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
* When the reservedField is not 0, then it contains a supplementary code point
* and the UTF-16 index is between the two corresponding surrogates.