ICU-9662 add & test & use U8_GET_OR_FFFD(), U8_NEXT_OR_FFFD(), U8_PREV_OR_FFFD()
X-SVN-Rev: 32796
This commit is contained in:
parent
cb4157921b
commit
bc31ae8173
@ -690,16 +690,9 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
|
||||
|
||||
int32_t prev=length;
|
||||
UChar32 c;
|
||||
if(b<0xc0) {
|
||||
// trail byte: collect a multi-byte character
|
||||
c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
} else {
|
||||
// lead byte in last-trail position
|
||||
c=0xfffd;
|
||||
}
|
||||
// trail byte: collect a multi-byte character
|
||||
// (or lead byte in last-trail position)
|
||||
c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
|
||||
// c is a valid code point, not ASCII, not a surrogate
|
||||
if(c<=0x7ff) {
|
||||
if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2011, International Business Machines
|
||||
* Copyright (C) 2002-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -600,12 +600,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
|
||||
i=index=0;
|
||||
limit=iter->start; /* count up to the UTF-8 index */
|
||||
while(i<limit) {
|
||||
U8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++index;
|
||||
} else {
|
||||
index+=2;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, i, limit, c);
|
||||
index+=U16_LENGTH(c);
|
||||
}
|
||||
|
||||
iter->start=i; /* just in case setState() did not get us to a code point boundary */
|
||||
@ -636,12 +632,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
|
||||
|
||||
/* count from the beginning to the current index */
|
||||
while(i<limit) {
|
||||
U8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++length;
|
||||
} else {
|
||||
length+=2;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, i, limit, c);
|
||||
length+=U16_LENGTH(c);
|
||||
}
|
||||
|
||||
/* assume i==limit==iter->start, set the UTF-16 index */
|
||||
@ -658,12 +650,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
|
||||
/* count from the current index to the end */
|
||||
limit=iter->limit;
|
||||
while(i<limit) {
|
||||
U8_NEXT(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++length;
|
||||
} else {
|
||||
length+=2;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, i, limit, c);
|
||||
length+=U16_LENGTH(c);
|
||||
}
|
||||
iter->length=length;
|
||||
}
|
||||
@ -787,8 +775,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
|
||||
--delta;
|
||||
}
|
||||
while(delta>0 && i<limit) {
|
||||
U8_NEXT(s, i, limit, c);
|
||||
if(c<0xffff) {
|
||||
U8_NEXT_OR_FFFD(s, i, limit, c);
|
||||
if(c<=0xffff) {
|
||||
++pos;
|
||||
--delta;
|
||||
} else if(delta>=2) {
|
||||
@ -817,8 +805,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
|
||||
++delta;
|
||||
}
|
||||
while(delta<0 && i>0) {
|
||||
U8_PREV(s, 0, i, c);
|
||||
if(c<0xffff) {
|
||||
U8_PREV_OR_FFFD(s, 0, i, c);
|
||||
if(c<=0xffff) {
|
||||
--pos;
|
||||
++delta;
|
||||
} else if(delta<=-2) {
|
||||
@ -867,10 +855,8 @@ utf8IteratorCurrent(UCharIterator *iter) {
|
||||
UChar32 c;
|
||||
int32_t i=iter->start;
|
||||
|
||||
U8_NEXT(s, i, iter->limit, c);
|
||||
if(c<0) {
|
||||
return 0xfffd;
|
||||
} else if(c<=0xffff) {
|
||||
U8_NEXT_OR_FFFD(s, i, iter->limit, c);
|
||||
if(c<=0xffff) {
|
||||
return c;
|
||||
} else {
|
||||
return U16_LEAD(c);
|
||||
@ -895,7 +881,7 @@ utf8IteratorNext(UCharIterator *iter) {
|
||||
const uint8_t *s=(const uint8_t *)iter->context;
|
||||
UChar32 c;
|
||||
|
||||
U8_NEXT(s, iter->start, iter->limit, c);
|
||||
U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
|
||||
if((index=iter->index)>=0) {
|
||||
iter->index=++index;
|
||||
if(iter->length<0 && iter->start==iter->limit) {
|
||||
@ -904,9 +890,7 @@ utf8IteratorNext(UCharIterator *iter) {
|
||||
} else if(iter->start==iter->limit && iter->length>=0) {
|
||||
iter->index= c<=0xffff ? iter->length : iter->length-1;
|
||||
}
|
||||
if(c<0) {
|
||||
return 0xfffd;
|
||||
} else if(c<=0xffff) {
|
||||
if(c<=0xffff) {
|
||||
return c;
|
||||
} else {
|
||||
iter->reservedField=c;
|
||||
@ -933,15 +917,13 @@ utf8IteratorPrevious(UCharIterator *iter) {
|
||||
const uint8_t *s=(const uint8_t *)iter->context;
|
||||
UChar32 c;
|
||||
|
||||
U8_PREV(s, 0, iter->start, c);
|
||||
U8_PREV_OR_FFFD(s, 0, iter->start, c);
|
||||
if((index=iter->index)>0) {
|
||||
iter->index=index-1;
|
||||
} else if(iter->start<=1) {
|
||||
iter->index= c<=0xffff ? iter->start : iter->start+1;
|
||||
}
|
||||
if(c<0) {
|
||||
return 0xfffd;
|
||||
} else if(c<=0xffff) {
|
||||
if(c<=0xffff) {
|
||||
return c;
|
||||
} else {
|
||||
iter->start+=4; /* back to behind this supplementary code point for consistent state */
|
||||
@ -991,7 +973,7 @@ utf8IteratorSetState(UCharIterator *iter,
|
||||
} else {
|
||||
/* verified index>=4 above */
|
||||
UChar32 c;
|
||||
U8_PREV((const uint8_t *)iter->context, 0, index, c);
|
||||
U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
|
||||
if(c<=0xffff) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
} else {
|
||||
|
@ -253,6 +253,37 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
U8_NEXT(s, _u8_get_index, length, c); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* The offset may point to either the lead byte or one of the trail bytes
|
||||
* for a code point, in which case the macro will read all of the bytes
|
||||
* for the code point.
|
||||
*
|
||||
* The length can be negative for a NUL-terminated string.
|
||||
*
|
||||
* If the offset points to an illegal UTF-8 byte sequence, then
|
||||
* c is set to U+FFFD.
|
||||
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
|
||||
*
|
||||
* This macro does not distinguish between a real U+FFFD in the text
|
||||
* and U+FFFD returned for an ill-formed sequence.
|
||||
* Use U8_GET() if that distinction is important.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start int32_t starting string offset
|
||||
* @param i int32_t string offset, must be start<=i<length
|
||||
* @param length int32_t string length
|
||||
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||
* @see U8_GET
|
||||
* @draft ICU 51
|
||||
*/
|
||||
/* Implementation note: the offset i is copied into the local _u8_get_index; only that copy is passed to U8_SET_CP_START() and then to U8_NEXT_OR_FFFD(), so the caller's i is never written. */
#define U8_GET_OR_FFFD(s, start, i, length, c) { \
|
||||
int32_t _u8_get_index=(i); \
|
||||
U8_SET_CP_START(s, start, _u8_get_index); \
|
||||
U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
|
||||
}
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
/**
|
||||
@ -328,11 +359,60 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
) { \
|
||||
(c)=(UChar)((((c)&0x1f)<<6)|__t1); \
|
||||
++(i); \
|
||||
} else if(U8_IS_LEAD(c)) { \
|
||||
} else { \
|
||||
/* function call for "complicated" and error cases */ \
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a code point from a string at a code point boundary offset,
|
||||
* and advance the offset to the next code point boundary.
|
||||
* (Post-incrementing forward iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* The length can be negative for a NUL-terminated string.
|
||||
*
|
||||
* The offset may point to the lead byte of a multi-byte sequence,
|
||||
* in which case the macro will read the whole sequence.
|
||||
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
|
||||
* c is set to U+FFFD.
|
||||
*
|
||||
* This macro does not distinguish between a real U+FFFD in the text
|
||||
* and U+FFFD returned for an ill-formed sequence.
|
||||
* Use U8_NEXT() if that distinction is important.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i int32_t string offset, must be i<length
|
||||
* @param length int32_t string length
|
||||
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||
* @see U8_NEXT
|
||||
* @draft ICU 51
|
||||
*/
|
||||
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
    /* read the lead (or single) byte and step past it */ \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length) || (length)<0) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)!=(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases; */ \
            /* c must still hold the byte that was just read, so it is not reset before the call; */ \
            /* strict=-3 requests U+FFFD for illegal sequences */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \
        } \
    } \
}
|
||||
@ -588,11 +668,38 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
#define U8_PREV(s, start, i, c) { \
    /* step back one byte and read it */ \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        /* exactly one call: utf8_prevCharSafeBody() collects the sequence for a trail byte */ \
        /* and itself returns the error value when c is not a trail byte */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
    } \
}
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one
|
||||
* and get the code point between them.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* The input offset may be the same as the string length.
|
||||
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||
* the whole sequence.
|
||||
* If the offset is behind a lead byte, then that byte is treated
|
||||
* as an illegal sequence and c is set to U+FFFD.
|
||||
* If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
|
||||
*
|
||||
* This macro does not distinguish between a real U+FFFD in the text
|
||||
* and U+FFFD returned for an ill-formed sequence.
|
||||
* Use U8_PREV() if that distinction is important.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start int32_t starting string offset (usually 0)
|
||||
* @param i int32_t string offset, must be start<i
|
||||
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||
* @see U8_PREV
|
||||
* @draft ICU 51
|
||||
*/
|
||||
/* Implementation note: i is pre-decremented and the byte at the new offset is read; a byte below 0x80 is the result as is, any other byte is passed to utf8_prevCharSafeBody() with strict=-3, the mode in which illegal sequences yield U+FFFD. */
#define U8_PREV_OR_FFFD(s, start, i, c) { \
|
||||
(c)=(uint8_t)(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2011, International Business Machines
|
||||
* Copyright (C) 1999-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -2234,10 +2234,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
|
||||
UChar32 c;
|
||||
int32_t start=0, prev=0;
|
||||
do {
|
||||
U8_NEXT(s, start, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, start, length, c);
|
||||
if(spanCondition!=contains(c)) {
|
||||
break;
|
||||
}
|
||||
@ -2275,10 +2272,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
|
||||
UChar32 c;
|
||||
int32_t prev=length;
|
||||
do {
|
||||
U8_PREV(s, 0, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_PREV_OR_FFFD(s, 0, length, c);
|
||||
if(spanCondition!=contains(c)) {
|
||||
break;
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007-2011, International Business Machines
|
||||
* Copyright (C) 2007-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
@ -503,9 +503,9 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
if((int8_t)c>=0) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
// Take advantage of non-ASCII fastpaths in U8_NEXT().
|
||||
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
|
||||
int32_t i=0;
|
||||
U8_NEXT(s, i, length, c);
|
||||
U8_NEXT_OR_FFFD(s, i, length, c);
|
||||
return set.contains(c) ? i : -i;
|
||||
}
|
||||
|
||||
@ -516,7 +516,7 @@ spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
int32_t i=length-1;
|
||||
c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
|
||||
c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
|
||||
length-=i;
|
||||
return set.contains(c) ? length : -length;
|
||||
}
|
||||
|
@ -1217,15 +1217,11 @@ fillForward:
|
||||
int32_t cIx = srcIx;
|
||||
int32_t dIx = destIx;
|
||||
int32_t dIxSaved = destIx;
|
||||
U8_NEXT(s8, srcIx, strLen, c);
|
||||
U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
|
||||
if (c==0 && nulTerminated) {
|
||||
srcIx--;
|
||||
break;
|
||||
}
|
||||
if (c<0) {
|
||||
// Illegal UTF-8. Replace with sub character.
|
||||
c = 0x0fffd;
|
||||
}
|
||||
|
||||
U16_APPEND_UNSAFE(buf, destIx, c);
|
||||
do {
|
||||
@ -1334,15 +1330,11 @@ fillReverse:
|
||||
int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
|
||||
|
||||
// Get the full character from the UTF8 string.
|
||||
// use code derived from tbe macros in utf.8
|
||||
// use code derived from the macros in utf8.h
|
||||
// Leaves srcIx pointing at the first byte of the UTF-8 char.
|
||||
//
|
||||
if (c<=0xbf) {
|
||||
c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
|
||||
// leaves srcIx at first byte of the multi-byte char.
|
||||
} else {
|
||||
c=0x0fffd;
|
||||
}
|
||||
c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
|
||||
// leaves srcIx at first byte of the multi-byte char.
|
||||
|
||||
// Store the character in UTF-16 buffer.
|
||||
if (c<0x10000) {
|
||||
@ -1415,10 +1407,7 @@ utext_strFromUTF8(UChar *dest,
|
||||
if(ch <=0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
|
||||
if(U_IS_BMP(ch)){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
@ -1438,10 +1427,7 @@ utext_strFromUTF8(UChar *dest,
|
||||
if(ch <= 0x7f){
|
||||
reqLength++;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
|
@ -86,15 +86,31 @@ utf8_errorValue[6]={
|
||||
0x3ffffff, 0x7fffffff
|
||||
};
|
||||
|
||||
static UChar32
|
||||
errorValue(int32_t count, int8_t strict) {
|
||||
if(strict>=0) {
|
||||
return utf8_errorValue[count];
|
||||
} else if(strict==-3) {
|
||||
return 0xfffd;
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
|
||||
* UTF8_NEXT_CHAR_SAFE().
|
||||
* Handle the non-inline part of the U8_NEXT() and U8_NEXT_OR_FFFD() macros
|
||||
* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
|
||||
*
|
||||
* U8_NEXT() supports NUL-terminated strings indicated via length<0.
|
||||
*
|
||||
* The "strict" parameter controls the error behavior:
|
||||
* <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
|
||||
* code point result.
|
||||
* <0 "Safe" behavior of U8_NEXT():
|
||||
* -1: All illegal byte sequences yield U_SENTINEL=-1.
|
||||
* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
|
||||
* Some implementations use this for roundtripping of
|
||||
* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
|
||||
* contain unpaired surrogates.
|
||||
* -3: All illegal byte sequences yield U+FFFD.
|
||||
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
|
||||
* All illegal byte sequences yield a positive code point such that this
|
||||
* result code point would be encoded with the same number of bytes as
|
||||
@ -103,11 +119,6 @@ utf8_errorValue[6]={
|
||||
* Same as the obsolete "safe" behavior, but non-characters are also treated
|
||||
* like illegal sequences.
|
||||
*
|
||||
* The special negative (<0) value -2 is used for lenient treatment of surrogate
|
||||
* code points as legal. Some implementations use this for roundtripping of
|
||||
* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
|
||||
* contain unpaired surrogates.
|
||||
*
|
||||
* Note that a UBool is the same as an int8_t.
|
||||
*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
@ -165,11 +176,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
||||
++i;
|
||||
--count;
|
||||
}
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[i-*pi];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(i-*pi, strict);
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
@ -224,18 +231,15 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
||||
int32_t i=*pi;
|
||||
uint8_t b, count=1, shift=6;
|
||||
|
||||
if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
|
||||
|
||||
/* extract value bits from the last trail byte */
|
||||
c&=0x3f;
|
||||
|
||||
for(;;) {
|
||||
if(i<=start) {
|
||||
/* no lead byte at all */
|
||||
if(strict>=0) {
|
||||
return UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
/*break;*/
|
||||
return errorValue(0, strict);
|
||||
}
|
||||
|
||||
/* read another previous byte */
|
||||
@ -255,11 +259,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
||||
if(count>=4) {
|
||||
count=3;
|
||||
}
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
/* exit with correct c */
|
||||
}
|
||||
@ -269,17 +269,9 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
||||
include the trail byte that we started with */
|
||||
if(count<shouldCount) {
|
||||
*pi=i;
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(0, strict);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -290,20 +282,12 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
||||
shift+=6;
|
||||
} else {
|
||||
/* more than 5 trail bytes is illegal */
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* single-byte character precedes trailing bytes */
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -8038,163 +8038,9 @@ endOfSecLoop:
|
||||
}
|
||||
|
||||
/*
|
||||
Slightly modified version of U8_NEXT macro defined in utf8.h. U8_NEXT requires
|
||||
the length of UTF-8 string. This version assumes that the UTF-8 string is null
|
||||
terminated and does not require the length as input.
|
||||
|
||||
Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
|
||||
null terminated input string takes extra amount of CPU cycles.
|
||||
*/
|
||||
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
#define UTF8_ERROR_VALUE_1 0x15
|
||||
#define UTF8_ERROR_VALUE_2 0x9f
|
||||
#define UTF_ERROR_VALUE 0xffff
|
||||
|
||||
static const UChar32
|
||||
utf8_errorValue[6]={
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
|
||||
0x3ffffff, 0x7fffffff
|
||||
};
|
||||
|
||||
static
|
||||
UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
|
||||
int32_t i=*pi;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
|
||||
|
||||
if (c) {
|
||||
uint8_t trail, illegal=0;
|
||||
|
||||
U8_MASK_LEAD_BYTE((c), count);
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
illegal=1;
|
||||
break;
|
||||
case 3:
|
||||
trail=s[(i)];
|
||||
if (trail==0) {
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
(c)=((c)<<6)|(trail&0x3f);
|
||||
if(c<0x110) {
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
} else {
|
||||
/* code point>0x10ffff, outside Unicode */
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
++(i);
|
||||
case 2:
|
||||
trail=s[(i)];
|
||||
if (trail==0) {
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
(c)=((c)<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
++(i);
|
||||
case 1:
|
||||
trail=s[(i)];
|
||||
if (trail==0) {
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
(c)=((c)<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
++(i);
|
||||
break;
|
||||
case 0:
|
||||
if(strict>=0) {
|
||||
return UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
|
||||
/*
|
||||
* All the error handling should return a value
|
||||
* that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
|
||||
*
|
||||
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
|
||||
* Starting with Unicode 3.2, surrogate code points must not be
|
||||
* encoded in UTF-8, and there are no irregular sequences any more.
|
||||
*
|
||||
* U8_ macros (new in ICU 2.4) return negative values for error conditions.
|
||||
*/
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
|
||||
/* error handling */
|
||||
uint8_t errorCount=count;
|
||||
/* don't go beyond this sequence */
|
||||
i=*pi;
|
||||
while(count>0 && U8_IS_TRAIL(s[i])) {
|
||||
++(i);
|
||||
--count;
|
||||
}
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[errorCount-count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
|
||||
/* strict: forbid non-characters like U+fffe */
|
||||
c=utf8_errorValue[count];
|
||||
}
|
||||
}
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
|
||||
#define U8_NEXT_NULLTERM(s, i, c) { \
|
||||
(c)=(uint8_t)(s)[(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+1000..U+CFFF inline */ \
|
||||
(0xe0<(c) && (c)<=0xec) && \
|
||||
(__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \
|
||||
(__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \
|
||||
) { \
|
||||
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
|
||||
(i)+=3; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
(__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \
|
||||
) { \
|
||||
(c)=(UChar)((((c)&0x1f)<<6)|__t1); \
|
||||
(i)+=2; \
|
||||
} else if(U8_IS_LEAD(c)) { \
|
||||
/* function call for "complicated" and error cases */ \
|
||||
++(i); \
|
||||
(c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)s, &(i), c, -1); \
|
||||
} else { \
|
||||
(c)=U_SENTINEL; \
|
||||
++(i); \
|
||||
} \
|
||||
} else { \
|
||||
if ((c)) { \
|
||||
++(i); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define U8_GET_NULLTERM(s, start, i, c) { \
|
||||
int32_t _u8_get_index=(int32_t)(i); \
|
||||
U8_SET_CP_START(s, start, _u8_get_index); \
|
||||
U8_NEXT_NULLTERM(s, _u8_get_index, c); \
|
||||
}
|
||||
|
||||
|
||||
static UCollationResult
|
||||
ucol_strcollRegularUTF8(
|
||||
const UCollator *coll,
|
||||
@ -8253,19 +8099,12 @@ ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
|
||||
UChar32 schar = 0, tchar = 0;
|
||||
|
||||
for(;;) {
|
||||
if (len == -1) {
|
||||
U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar);
|
||||
if (schar == 0) {
|
||||
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
|
||||
}
|
||||
} else {
|
||||
if (*index == len) {
|
||||
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
|
||||
}
|
||||
U8_GET((const uint8_t*)s, 0, *index, len, schar);
|
||||
if (*index == len) {
|
||||
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
|
||||
}
|
||||
if (schar == -1) {
|
||||
schar = 0xfffd;
|
||||
U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
|
||||
if (len < 0 && schar == 0) {
|
||||
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
|
||||
}
|
||||
|
||||
while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
|
||||
@ -8320,22 +8159,15 @@ ucol_strcollUseLatin1UTF8(
|
||||
for(;;) {
|
||||
while(sOrder==0) { // this loop skips primary ignorables
|
||||
// sOrder=getNextlatinOneCE(source);
|
||||
if (sLen==-1) {
|
||||
U8_NEXT_NULLTERM(source, sIndex, sChar);
|
||||
if (sChar == 0) {
|
||||
endOfSource = TRUE;
|
||||
sLen = sIndex;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (sIndex == sLen) {
|
||||
endOfSource = TRUE;
|
||||
break;
|
||||
}
|
||||
U8_NEXT(source, sIndex, sLen ,sChar);
|
||||
if (sIndex == sLen) {
|
||||
endOfSource = TRUE;
|
||||
break;
|
||||
}
|
||||
if (sChar == -1) {
|
||||
sChar = 0xfffd; // fallback for the bad code
|
||||
U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
|
||||
if (sLen < 0 && sChar == 0) {
|
||||
endOfSource = TRUE;
|
||||
sLen = sIndex;
|
||||
break;
|
||||
}
|
||||
if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
|
||||
//fprintf(stderr, "R");
|
||||
@ -8360,28 +8192,21 @@ ucol_strcollUseLatin1UTF8(
|
||||
|
||||
while(tOrder==0) { // this loop skips primary ignorables
|
||||
// tOrder=getNextlatinOneCE(target);
|
||||
if (tLen == -1) {
|
||||
U8_NEXT_NULLTERM(target, tIndex, tChar);
|
||||
if (tChar == 0) {
|
||||
if(endOfSource) {
|
||||
tLen = tIndex;
|
||||
goto endOfPrimLoopU8;
|
||||
} else {
|
||||
return UCOL_GREATER;
|
||||
}
|
||||
if (tIndex == tLen) {
|
||||
if(endOfSource) {
|
||||
goto endOfPrimLoopU8;
|
||||
} else {
|
||||
return UCOL_GREATER;
|
||||
}
|
||||
} else {
|
||||
if (tIndex == tLen) {
|
||||
if(endOfSource) {
|
||||
goto endOfPrimLoopU8;
|
||||
} else {
|
||||
return UCOL_GREATER;
|
||||
}
|
||||
}
|
||||
U8_NEXT(target, tIndex, tLen, tChar);
|
||||
}
|
||||
if (tChar == -1) {
|
||||
tChar = 0xfffd;
|
||||
U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
|
||||
if (tLen < 0 && tChar == 0) {
|
||||
if(endOfSource) {
|
||||
tLen = tIndex;
|
||||
goto endOfPrimLoopU8;
|
||||
} else {
|
||||
return UCOL_GREATER;
|
||||
}
|
||||
}
|
||||
if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
|
||||
//fprintf(stderr, "R");
|
||||
@ -8448,7 +8273,7 @@ endOfPrimLoopU8:
|
||||
break;
|
||||
}
|
||||
U_ASSERT(sLen >= 0);
|
||||
U8_NEXT(source, sIndex, sLen, sChar);
|
||||
U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
|
||||
U_ASSERT(sChar >= 0 && sChar <= 0xFF);
|
||||
sOrder = elements[sChar];
|
||||
if(sOrder > UCOL_NOT_FOUND) {
|
||||
@ -8465,7 +8290,7 @@ endOfPrimLoopU8:
|
||||
}
|
||||
}
|
||||
U_ASSERT(tLen >= 0);
|
||||
U8_NEXT(target, tIndex, tLen, tChar);
|
||||
U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
|
||||
U_ASSERT(tChar >= 0 && tChar <= 0xFF);
|
||||
tOrder = elements[tChar];
|
||||
if(tOrder > UCOL_NOT_FOUND) {
|
||||
@ -8505,7 +8330,7 @@ endOfPrimLoopU8:
|
||||
endOfSource = TRUE;
|
||||
break;
|
||||
}
|
||||
U8_PREV(source, 0, sIndex, sChar);
|
||||
U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
|
||||
U_ASSERT(sChar >= 0 && sChar <= 0xFF);
|
||||
sOrder = elements[sChar];
|
||||
// don't even look for contractions
|
||||
@ -8519,7 +8344,7 @@ endOfPrimLoopU8:
|
||||
return UCOL_GREATER;
|
||||
}
|
||||
}
|
||||
U8_PREV(target, 0, tIndex, tChar);
|
||||
U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
|
||||
U_ASSERT(tChar >= 0 && tChar <= 0xFF);
|
||||
tOrder = elements[tChar];
|
||||
// don't even look for contractions
|
||||
@ -8560,7 +8385,7 @@ endOfSecLoopU8:
|
||||
break;
|
||||
}
|
||||
U_ASSERT(sLen >= 0);
|
||||
U8_NEXT(source, sIndex, sLen, sChar);
|
||||
U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
|
||||
U_ASSERT(sChar >= 0 && sChar <= 0xFF);
|
||||
sOrder = elements[sChar];
|
||||
if(sOrder > UCOL_NOT_FOUND) {
|
||||
@ -8576,7 +8401,7 @@ endOfSecLoopU8:
|
||||
}
|
||||
}
|
||||
U_ASSERT(tLen >= 0);
|
||||
U8_NEXT(target, tIndex, tLen, tChar);
|
||||
U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
|
||||
U_ASSERT(tChar >= 0 && tChar <= 0xFF);
|
||||
tOrder = elements[tChar];
|
||||
if(tOrder > UCOL_NOT_FOUND) {
|
||||
@ -8963,36 +8788,18 @@ ucol_strcollUTF8(
|
||||
UChar32 uc32 = -1;
|
||||
|
||||
if (!bSrcLimit) {
|
||||
if (sourceLength >= 0) {
|
||||
U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
|
||||
} else {
|
||||
U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
|
||||
}
|
||||
if (uc32 == -1) {
|
||||
uc32 = 0xfffd;
|
||||
bSawNonLatin1 |= TRUE;
|
||||
} else {
|
||||
if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
|
||||
bUnsafeCP = TRUE;
|
||||
}
|
||||
bSawNonLatin1 |= (uc32 > 0xff);
|
||||
U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
|
||||
if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
|
||||
bUnsafeCP = TRUE;
|
||||
}
|
||||
bSawNonLatin1 |= (uc32 > 0xff);
|
||||
}
|
||||
if (!bTargLimit) {
|
||||
if (targetLength >= 0) {
|
||||
U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
|
||||
} else {
|
||||
U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
|
||||
}
|
||||
if (uc32 == -1) {
|
||||
uc32 = 0xfffd;
|
||||
bSawNonLatin1 |= TRUE;
|
||||
} else {
|
||||
if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
|
||||
bUnsafeCP = TRUE;
|
||||
}
|
||||
bSawNonLatin1 |= (uc32 > 0xff);
|
||||
U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
|
||||
if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
|
||||
bUnsafeCP = TRUE;
|
||||
}
|
||||
bSawNonLatin1 |= (uc32 > 0xff);
|
||||
}
|
||||
|
||||
if (bUnsafeCP) {
|
||||
@ -9000,7 +8807,7 @@ ucol_strcollUTF8(
|
||||
// We are stopped in the middle of a contraction.
|
||||
// Scan backwards through the == part of the string looking for the start of the contraction.
|
||||
// It doesn't matter which string we scan, since they are the same in this region.
|
||||
U8_PREV((uint8_t*)source, 0, equalLength, uc32);
|
||||
U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
|
||||
bSawNonLatin1 |= (uc32 > 0xff);
|
||||
if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
|
||||
break;
|
||||
|
@ -195,7 +195,7 @@ static void TestGetChar()
|
||||
0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
|
||||
};
|
||||
uint16_t i=0;
|
||||
UChar32 c;
|
||||
UChar32 c, expected;
|
||||
uint32_t offset=0;
|
||||
|
||||
for(offset=0; offset<sizeof(input); offset++) {
|
||||
@ -213,14 +213,22 @@ static void TestGetChar()
|
||||
}
|
||||
}
|
||||
|
||||
U8_GET(input, 0, offset, sizeof(input), c);
|
||||
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
|
||||
expected=result[i+1];
|
||||
if(c != expected){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
U8_GET(input, 0, offset, sizeof(input), c);
|
||||
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
|
||||
@ -228,7 +236,7 @@ static void TestGetChar()
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
|
||||
}
|
||||
|
||||
i=(uint16_t)(i+3);
|
||||
i=(uint16_t)(i+3);
|
||||
}
|
||||
}
|
||||
|
||||
@ -274,7 +282,7 @@ static void TestNextPrevChar() {
|
||||
};
|
||||
/* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
|
||||
|
||||
UChar32 c=0x0000;
|
||||
UChar32 c, expected;
|
||||
uint32_t i=0;
|
||||
uint32_t offset=0;
|
||||
int32_t setOffset=0;
|
||||
@ -285,9 +293,10 @@ static void TestNextPrevChar() {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
expected=result[i+1];
|
||||
if(c != expected){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_NEXT(input, setOffset, sizeof(input), c);
|
||||
@ -295,9 +304,21 @@ static void TestNextPrevChar() {
|
||||
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
|
||||
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
|
||||
@ -320,9 +341,10 @@ static void TestNextPrevChar() {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(c != result[i+4]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
expected=result[i+4];
|
||||
if(c != expected){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_PREV(input, 0, setOffset, c);
|
||||
@ -330,9 +352,21 @@ static void TestNextPrevChar() {
|
||||
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
|
||||
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_PREV_OR_FFFD(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
@ -378,14 +412,24 @@ static void TestNulTerminated() {
|
||||
0
|
||||
};
|
||||
|
||||
UChar32 c, c2;
|
||||
UChar32 c, c2, expected;
|
||||
int32_t i0, i=0, j, k, expectedIndex;
|
||||
int32_t cpIndex=0;
|
||||
do {
|
||||
i0=i;
|
||||
U8_NEXT(input, i, -1, c);
|
||||
if(c!=result[cpIndex]) {
|
||||
log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
|
||||
expected=result[cpIndex];
|
||||
if(c!=expected) {
|
||||
log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
|
||||
}
|
||||
j=i0;
|
||||
U8_NEXT_OR_FFFD(input, j, -1, c);
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c!=expected) {
|
||||
log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
|
||||
}
|
||||
if(j!=i) {
|
||||
log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
|
||||
}
|
||||
j=i0;
|
||||
U8_FWD_1(input, j, -1);
|
||||
@ -414,6 +458,11 @@ static void TestNulTerminated() {
|
||||
if(c2!=c) {
|
||||
log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
|
||||
}
|
||||
U8_GET_OR_FFFD(input, 0, j, -1, c2);
|
||||
expected= (c>=0) ? c : 0xfffd;
|
||||
if(c2!=expected) {
|
||||
log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
|
||||
}
|
||||
/* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
|
||||
k=j+1;
|
||||
U8_SET_CP_LIMIT(input, 0, k, -1);
|
||||
|
@ -2626,10 +2626,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
|
||||
UChar32 c;
|
||||
int32_t start=0, prev;
|
||||
while((prev=start)<length) {
|
||||
U8_NEXT(s, start, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, start, length, c);
|
||||
if(realSet.contains(c)!=spanCondition) {
|
||||
break;
|
||||
}
|
||||
@ -2640,10 +2637,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
|
||||
UChar32 c;
|
||||
int32_t start, next;
|
||||
for(start=next=0; start<length;) {
|
||||
U8_NEXT(s, next, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, next, length, c);
|
||||
if(realSet.contains(c)) {
|
||||
break;
|
||||
}
|
||||
@ -2664,10 +2658,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
|
||||
UChar32 c;
|
||||
int32_t start, next, maxSpanLimit=0;
|
||||
for(start=next=0; start<length;) {
|
||||
U8_NEXT(s, next, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_NEXT_OR_FFFD(s, next, length, c);
|
||||
if(!realSet.contains(c)) {
|
||||
next=start; // Do not span this single, not-contained code point.
|
||||
}
|
||||
@ -2738,10 +2729,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
|
||||
UChar32 c;
|
||||
int32_t prev=length;
|
||||
do {
|
||||
U8_PREV(s, 0, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_PREV_OR_FFFD(s, 0, length, c);
|
||||
if(realSet.contains(c)!=spanCondition) {
|
||||
break;
|
||||
}
|
||||
@ -2752,10 +2740,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
|
||||
UChar32 c;
|
||||
int32_t prev=length;
|
||||
do {
|
||||
U8_PREV(s, 0, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_PREV_OR_FFFD(s, 0, length, c);
|
||||
if(realSet.contains(c)) {
|
||||
break;
|
||||
}
|
||||
@ -2775,10 +2760,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
|
||||
UChar32 c;
|
||||
int32_t prev=length, minSpanStart=length;
|
||||
do {
|
||||
U8_PREV(s, 0, length, c);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
U8_PREV_OR_FFFD(s, 0, length, c);
|
||||
if(!realSet.contains(c)) {
|
||||
length=prev; // Do not span this single, not-contained code point.
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user