ICU-2397 add UTF-16BE UCharIterator
X-SVN-Rev: 10833
This commit is contained in:
parent
84d1432306
commit
7daa35bdd9
@ -186,6 +186,118 @@ uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
|
||||
}
|
||||
}
|
||||
|
||||
/* UCharIterator implementation for UTF-16BE strings ------------------------ */
|
||||
|
||||
/*
|
||||
* This is an implementation of a code unit (UChar) iterator
|
||||
* for UTF-16BE strings, i.e., strings in byte-vectors where
|
||||
* each UChar is stored as a big-endian pair of bytes.
|
||||
*
|
||||
* The UCharIterator.context field holds a pointer to the string.
|
||||
* Everything works just like with a normal UChar iterator (uiter_setString),
|
||||
* except that UChars are assembled from byte pairs.
|
||||
*/
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
utf16BEIteratorCurrent(UCharIterator *iter) {
|
||||
int32_t index;
|
||||
|
||||
if((index=iter->index)<iter->limit) {
|
||||
const uint8_t *p=(const uint8_t *)iter->context;
|
||||
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
}
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
utf16BEIteratorNext(UCharIterator *iter) {
|
||||
int32_t index;
|
||||
|
||||
if((index=iter->index)<iter->limit) {
|
||||
const uint8_t *p=(const uint8_t *)iter->context;
|
||||
iter->index=index+1;
|
||||
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
}
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
utf16BEIteratorPrevious(UCharIterator *iter) {
|
||||
int32_t index;
|
||||
|
||||
if((index=iter->index)>iter->start) {
|
||||
const uint8_t *p=(const uint8_t *)iter->context;
|
||||
iter->index=--index;
|
||||
return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
}
|
||||
|
||||
static const UCharIterator utf16BEIterator={
|
||||
0, 0, 0, 0, 0, 0,
|
||||
stringIteratorGetIndex,
|
||||
stringIteratorMove,
|
||||
stringIteratorHasNext,
|
||||
stringIteratorHasPrevious,
|
||||
utf16BEIteratorCurrent,
|
||||
utf16BEIteratorNext,
|
||||
utf16BEIteratorPrevious,
|
||||
0
|
||||
};
|
||||
|
||||
/*
 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
 * offset from s.
 *
 * The string is always scanned as byte pairs, whatever the alignment of s.
 * Searching for a UChar NUL does not depend on endianness, so this gives the
 * same result as a UChar-wise search while avoiding both a pointer-to-integer
 * cast for the alignment test and reading the byte vector through a UChar
 * pointer.
 *
 * @param s NUL-terminated UTF-16BE string; must not be NULL
 * @return number of UChars (byte pairs) before the terminating NUL pair
 */
static int32_t
utf16BE_strlen(const char *s) {
    const char *p=s;

    /* advance pair by pair so that a 0,0 sequence straddling two UChars is not a match */
    while(p[0]!=0 || p[1]!=0) {
        p+=2;
    }
    return (int32_t)((p-s)/2);
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
|
||||
if(iter!=0) {
|
||||
/* allow only even-length strings (the input length counts bytes) */
|
||||
if(s!=0 && length==-1 || (length>=0 && (length&1)==0)) {
|
||||
if(U_IS_BIG_ENDIAN && ((int32_t)s&1)==0) {
|
||||
/* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
|
||||
uiter_setString(iter, (const UChar *)s, length/2);
|
||||
return;
|
||||
}
|
||||
|
||||
*iter=utf16BEIterator;
|
||||
iter->context=s;
|
||||
if(length>=0) {
|
||||
iter->length=length/2;
|
||||
} else {
|
||||
iter->length=utf16BE_strlen(s);
|
||||
}
|
||||
iter->limit=iter->length;
|
||||
} else {
|
||||
*iter=noopIterator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* UCharIterator wrapper around CharacterIterator --------------------------- */
|
||||
|
||||
/*
|
||||
@ -475,11 +587,20 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
|
||||
return iter->index;
|
||||
}
|
||||
|
||||
/* minimize the number of U8_NEXT/PREV operations */
|
||||
if(pos<iter->index/2) {
|
||||
/* go forward from the start instead of backward from the current index */
|
||||
iter->index=iter->start=iter->reservedField=0;
|
||||
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
|
||||
/*
|
||||
* if we have the UTF-16 length and the new position is
|
||||
* closer to the end than the current index,
|
||||
* then go backward from the end instead of forward from the current index
|
||||
*/
|
||||
iter->index=iter->length;
|
||||
iter->start=iter->limit;
|
||||
iter->reservedField=0;
|
||||
}
|
||||
/* ### TODO: consider going backward from the end in some cases! */
|
||||
|
||||
delta=pos-iter->index;
|
||||
if(delta==0) {
|
||||
|
@ -395,6 +395,26 @@ uiter_previous32(UCharIterator *iter);
|
||||
U_CAPI void U_EXPORT2
|
||||
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
|
||||
|
||||
/**
 * Set up a UCharIterator to iterate over a UTF-16BE string
 * (byte vector with a big-endian pair of bytes per UChar).
 *
 * Everything works just like with a normal UChar iterator (uiter_setString),
 * except that UChars are assembled from byte pairs,
 * and that the length argument here counts bytes rather than UChars
 * and must therefore be an even number.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-16BE string to iterate over; must not be NULL
 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
 *               (NUL means pair of 0 bytes at even index from s).
 *               If length is odd or less than -1, iter is set to a no-op iterator.
 *
 * @see UCharIterator
 * @see uiter_setString
 * @draft ICU 2.6
 */
|
||||
U_CAPI void U_EXPORT2
|
||||
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
|
||||
|
||||
/**
|
||||
* Set up a UCharIterator to iterate over a UTF-8 string.
|
||||
*
|
||||
@ -402,6 +422,7 @@ uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
|
||||
* with UTF-8 iteration boundaries 0 and length.
|
||||
* The implementation counts the UTF-16 index on the fly and
|
||||
* lazily evaluates the UTF-16 length of the text.
|
||||
*
|
||||
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
|
||||
* When the reservedField is not 0, then it contains a supplementary code point
|
||||
* and the UTF-16 index is between the two corresponding surrogates.
|
||||
|
Loading…
Reference in New Issue
Block a user