ICU-13311 change illegal-UTF-8 handling in non-converter code

X-SVN-Rev: 40445
This commit is contained in:
Markus Scherer 2017-09-21 23:45:08 +00:00
parent 119d75dc46
commit 27c08578ac
19 changed files with 913 additions and 1057 deletions

View File

@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
initBits();
overrideIllegal();
}
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@ -120,7 +122,7 @@ void BMPSet::initBits() {
UChar32 start, limit;
int32_t listIndex=0;
// Set asciiBytes[].
// Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
@ -128,13 +130,30 @@ void BMPSet::initBits() {
} else {
limit=0x110000;
}
if(start>=0x80) {
if(start>=0x100) {
break;
}
do {
asciiBytes[start++]=1;
} while(start<limit && start<0x80);
} while(limit<=0x80);
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);
// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(limit>0x80) {
if(start<0x80) {
start=0x80;
}
break;
}
}
// Set table7FF[].
while(start<0x800) {
@ -204,19 +223,14 @@ void BMPSet::initBits() {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
// contains(FFFD)==TRUE
for(i=0x80; i<0xc0; ++i) {
asciiBytes[i]=1;
}
if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
// contains(FFFD)==FALSE
mask=~(0x10001<<0xd); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
UBool
BMPSet::contains(UChar32 c) const {
if((uint32_t)c<=0x7f) {
return (UBool)asciiBytes[c];
if((uint32_t)c<=0xff) {
return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span
do {
c=*s;
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span not
do {
c=*s;
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span not
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -497,22 +510,22 @@ const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!asciiBytes[b] || ++s==limit) {
if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b] || ++s==limit) {
if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
while(s<limit) {
b=*s;
if(b<0xc0) {
// ASCII; or trail bytes with the result of contains(FFFD).
if(U8_IS_SINGLE(b)) {
// ASCII
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
asciiBytes[0x80]
containsFFFD
) != spanCondition
) {
return s-1;
@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
s+=3;
continue;
}
} else /* 0xc0<=b<0xe0 */ {
} else {
if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
return s-1;
}
}
@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
do {
b=s[--length];
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
}

View File

@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
* ASCII: Look up bytes.
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*/
class BMPSet : public UMemory {
public:
@ -96,12 +97,12 @@ private:
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
/*
* One byte per ASCII character, or trail byte in lead position.
* 0 or 1 for ASCII characters.
* The value for trail bytes is the result of contains(FFFD)
* for faster validity checking at runtime.
* One byte 0 or 1 per Latin-1 character.
*/
UBool asciiBytes[0xc0];
UBool latin1Contains[0x100];
/* TRUE if contains(U+FFFD). */
UBool containsFFFD;
/*
* One bit per code point from U+0000..U+07FF.

View File

@ -23,9 +23,6 @@
* This file defines macros for checking whether a code point is
* a surrogate or a non-character etc.
*
* The UChar and UChar32 data types for Unicode code units and code points
* are defined in umachine.h because they can be machine-dependent.
*
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
* and itself includes utf8.h and utf16.h after some
* common definitions.
@ -50,11 +47,11 @@
* but are optimized for the much more frequently occurring BMP code points.
*
* umachine.h defines UChar to be an unsigned 16-bit integer.
* Where available, UChar is defined to be a char16_t
* or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
* Since ICU 59, ICU uses char16_t in C++, UChar only in C,
* and defines UChar=char16_t by default. See the UChar API docs for details.
*
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
* Unicode code point (Unicode scalar value, 0..0x10ffff).
* Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
* Before ICU 2.4, the definition of UChar32 was platform-dependent, similar to
* the definition of UChar. For details see the documentation for UChar32 itself.
*
@ -63,11 +60,20 @@
* For actual Unicode character properties see uchar.h.
*
* By default, string operations must be done with error checking in case
* a string is not well-formed UTF-16.
* The macros will detect if a surrogate code unit is unpaired
* a string is not well-formed UTF-16 or UTF-8.
*
* The U16_ macros detect if a surrogate code unit is unpaired
* (lead unit without trail unit or vice versa) and just return the unit itself
* as the code point.
*
* The U8_ macros detect illegal byte sequences and return a negative value.
* Starting with ICU 60, the observable length of a single illegal byte sequence
* skipped by one of these macros follows the Unicode 6+ recommendation
* which is consistent with the W3C Encoding Standard.
*
* There are ..._OR_FFFD versions of both U16_ and U8_ macros
* that return U+FFFD for illegal code unit sequences.
*
* The regular "safe" macros require that the initial, passed-in string index
* is within bounds. They only check the index when they read more than one
* code unit. This is usually done with code similar to the following loop:
@ -91,10 +97,7 @@
* The performance differences are much larger here because UTF-8 provides so
* many opportunities for malformed sequences.
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
* and are fast, while the safe UTF-8 macros call functions for all but the
* trivial (ASCII) cases.
* (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
* characters inline as well.)
* and are fast, while the safe UTF-8 macros call functions for some complicated cases.
*
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
* code point values (0..U+10ffff). They are indicated with negative values instead.
@ -126,8 +129,7 @@
*/
#define U_IS_UNICODE_NONCHAR(c) \
((c)>=0xfdd0 && \
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
(uint32_t)(c)<=0x10ffff)
((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
/**
* Is c a Unicode code point value (0..U+10ffff)
@ -148,9 +150,7 @@
*/
#define U_IS_UNICODE_CHAR(c) \
((uint32_t)(c)<0xd800 || \
((uint32_t)(c)>0xdfff && \
(uint32_t)(c)<=0x10ffff && \
!U_IS_UNICODE_NONCHAR(c)))
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
/**
* Is this code point a BMP code point (U+0000..U+ffff)?

View File

@ -41,34 +41,24 @@
/* internal definitions ----------------------------------------------------- */
/**
* Counts the trail bytes for a UTF-8 lead byte.
* Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* Note: Beginning with ICU 50, the implementation uses a multi-condition expression
* which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
* leadByte is evaluated multiple times.
*
* The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
* #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
* leadByte was evaluated exactly once.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
((uint8_t)(leadByte)<0xf0 ? \
((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
(uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
((uint8_t)(leadByte)<=0xf4 ? \
((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)
/**
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
* The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
@ -78,7 +68,7 @@
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
(((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
@ -89,6 +79,34 @@
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Internal bit vector for 3-byte UTF-8 validity check.
* Lead byte E0..EF bits 3..0 as byte index,
* first trail byte bits 7..5 as bit index into that byte.
*
* Only bits 4 and 5 are ever set. Bit 4 stands for a first trail byte in 80..9F,
* bit 5 for a first trail byte in A0..BF:
* - Index 0 (lead byte E0) is 0x20: only A0..BF is accepted,
*   because E0 80..9F would be a non-shortest form.
* - Index 0xD (lead byte ED) is 0x10: only 80..9F is accepted,
*   because ED A0..BF would encode the surrogates U+D800..U+DFFF.
* - Every other index is 0x30: any trail byte 80..BF is accepted.
* Byte values outside 80..BF select bits 0..3 or 6..7, which are never set.
* @internal
*/
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
* Internal 3-byte UTF-8 validity check.
* Looks up the pair (lead, t1) in U8_LEAD3_T1_BITS:
* the low 4 bits of lead select the byte, the top 3 bits of t1 select the bit.
*
* Only the low 4 bits of lead are used, so the caller must already have
* established that lead is in E0..EF; other values alias into the same table.
* t1 is cast to uint8_t before it is shifted.
* The second trail byte is not examined here.
*
* @param lead 3-byte lead byte (E0..EF)
* @param t1 candidate for the first trail byte
* @return non-zero (the selected bit, not necessarily 1) if t1 is allowed after lead,
*         0 otherwise
* @internal
*/
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
/**
* Internal bit vector for 4-byte UTF-8 validity check.
* First trail byte bits 7..4 as byte index,
* lead byte F0..F4 bits 2..0 as bit index into that byte.
*
* The 16 entries cover every possible high nibble of the first trail byte:
* - Index 8 (trail byte 80..8F) is 0x1E: lead bytes F1..F4 are accepted;
*   F0 80..8F would be a non-shortest form.
* - Indexes 9..0xB (trail byte 90..BF) are 0x0F: lead bytes F0..F3 are accepted;
*   F4 90..BF would encode a value above U+10FFFF.
* - All other indexes are 0: the byte is not a trail byte.
* Bits 5..7 are never set, so lead bytes F5..F7 never pass the check.
* @internal
*/
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
* Internal 4-byte UTF-8 validity check.
* Looks up the pair (lead, t1) in U8_LEAD4_T1_BITS:
* the high nibble of t1 selects the byte, the low 3 bits of lead select the bit.
*
* Only the low 3 bits of lead are used, so the caller must already have
* established that lead is in F0..F7; F8..FF would alias onto F0..F7.
* t1 is cast to uint8_t before it is shifted.
* The second and third trail bytes are not examined here.
*
* @param lead 4-byte lead byte (F0..F4 for a valid sequence)
* @param t1 candidate for the first trail byte
* @return non-zero (the selected bit, not necessarily 1) if t1 is allowed after lead,
*         0 otherwise
* @internal
*/
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
/**
* Function for handling "next code point" with error-checking.
*
@ -153,7 +171,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
// 0x32=0xf4-0xc2
/**
* Is this code unit (byte) a UTF-8 trail byte?
@ -161,7 +180,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
/**
* How many code units (bytes) are used for the UTF-8 encoding
@ -289,7 +308,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT_UNSAFE(s, i, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
@ -325,22 +344,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
if( /* handle U+1000..U+CFFF inline */ \
(0xe0<(c) && (c)<=0xec) && \
(((i)+1)<(length) || (length)<0) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
if( /* handle U+0800..U+FFFF inline */ \
(0xe0<=(c) && (c)<0xf0) && \
(((i)+1)<(length) || (length)<0) && \
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
) { \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
@ -376,22 +392,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
if( /* handle U+1000..U+CFFF inline */ \
(0xe0<(c) && (c)<=0xec) && \
(((i)+1)<(length) || (length)<0) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
if( /* handle U+0800..U+FFFF inline */ \
(0xe0<=(c) && (c)<0xf0) && \
(((i)+1)<(length) || (length)<0) && \
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
) { \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
@ -476,7 +489,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) { \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
}
/**
@ -493,15 +506,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) { \
uint8_t __b=(uint8_t)(s)[(i)++]; \
if(U8_IS_LEAD(__b)) { \
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
if((i)+__count>(length) && (length)>=0) { \
__count=(uint8_t)((length)-(i)); \
} \
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
++(i); \
--__count; \
uint8_t __b=(s)[(i)++]; \
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
uint8_t __t1=(s)[i]; \
if((0xe0<=__b && __b<0xf0)) { \
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} else if(__b<0xe0) { \
if(U8_IS_TRAIL(__t1)) { \
++(i); \
} \
} else /* c>=0xf0 */ { \
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} \
} \
}
@ -615,7 +637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
__b=(uint8_t)(s)[--(i)]; \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
@ -651,7 +673,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_PREV(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
} \
}
@ -682,7 +704,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_PREV_OR_FFFD(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
} \
}

View File

@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
static inline int32_t
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=*s;
if((int8_t)c>=0) {
if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
static inline int32_t
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=s[length-1];
if((int8_t)c>=0) {
if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
int32_t i=length-1;
@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
// Try to match if the increment is not listed already.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
if( !U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)
) {
if(!U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)) {
if(inc==rest) {
return length; // Reached the end of the string.
}
@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
// Try to match if the string is longer or starts earlier.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
if( !U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)
) {
if(!U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap ||
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)) {
maxInc=inc; // Longest match from earliest start.
maxOverlap=overlap;
break;

View File

@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
pErrorCode);
}
/*
* Smallest code point that may be encoded with 0, 1, 2 or 3 trail bytes,
* indexed by the trail byte count. A smaller value decoded from a sequence
* of that length is rejected as a non-shortest form.
* Used by utf8_nextCharSafeBodyTerminated() and utf8_nextCharSafeBodyPointer().
*/
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
/*
* Version of utf8_nextCharSafeBody() with the following differences:
* - checks for NUL termination instead of length
* - works with pointers instead of indexes
* - always strict (strict==-1)
*
* *ps points to after the lead byte and will be moved to after the last trail byte.
* c is the lead byte.
*
* No length is passed in. Every failed trail byte check stops the decoding
* before the next byte is read, and a NUL byte never passes the trail byte check,
* so the function does not read beyond a terminating NUL.
*
* On error, *ps is moved past at most as many trail bytes as the lead byte
* called for, stopping at the first byte that is not a trail byte.
* For count==0 the function returns early and *ps is not modified.
*
* @param ps in/out pointer to the position after the lead byte
* @param c the lead byte
* @return the code point, or U_SENTINEL
*/
static UChar32
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
const uint8_t *s=*ps;
uint8_t trail, illegal=0;
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
U_ASSERT(count<6);
/* modifies c in place: keep only the lead byte bits that are part of the code point */
U8_MASK_LEAD_BYTE((c), count);
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
switch(count) {
/*
* Cases 3 and 2 fall through to read the remaining trail bytes
* unless they detect an error; all other cases leave the switch.
*/
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
illegal=1;
break;
case 3:
/* (uint8_t)(byte-0x80) is 0..0x3f exactly for trail bytes 0x80..0xbf; a NUL yields 0x80 */
trail=(uint8_t)(*s++ - 0x80);
c=(c<<6)|trail;
/* two more 6-bit shifts follow, so c>=0x110 here means a final value >=0x110000 */
if(trail>0x3f || c>=0x110) {
/* not a trail byte, or code point>0x10ffff (outside Unicode) */
illegal=1;
break;
}
U_FALLTHROUGH;
case 2:
trail=(uint8_t)(*s++ - 0x80);
if(trail>0x3f) {
/* not a trail byte */
illegal=1;
break;
}
c=(c<<6)|trail;
U_FALLTHROUGH;
case 1:
trail=(uint8_t)(*s++ - 0x80);
if(trail>0x3f) {
/* not a trail byte */
illegal=1;
}
/* c is garbage if illegal was just set; it is replaced with U_SENTINEL below */
c=(c<<6)|trail;
break;
case 0:
return U_SENTINEL;
/* no default branch to optimize switch() - all values are covered */
}
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
/* illegal is also set if count>=4 */
/* (so the || short-circuits before utf8_minLegal[], which has only 4 entries, is indexed) */
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
/* error handling */
/* don't go beyond this sequence */
/* restart right after the lead byte and skip only bytes that really are trail bytes */
s=*ps;
while(count>0 && U8_IS_TRAIL(*s)) {
++s;
--count;
}
c=U_SENTINEL;
}
*ps=s;
return c;
}
/*
* Version of utf8_nextCharSafeBody() with the following differences:
* - works with pointers instead of indexes
* - always strict (strict==-1)
*
* *ps points to after the lead byte and will be moved to after the last trail byte.
* c is the lead byte.
*
* No byte at or after limit is read: the trail bytes are decoded only if
* all of them lie before limit, and the error recovery loop checks s<limit.
*
* On error, *ps is moved past at most as many trail bytes as the lead byte
* called for, stopping at limit or at the first byte that is not a trail byte.
* For count==0 the function returns early and *ps is not modified.
*
* @param ps in/out pointer to the position after the lead byte
* @param limit pointer to the end of the input
* @param c the lead byte
* @return the code point, or U_SENTINEL
*/
static UChar32
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
const uint8_t *s=*ps;
uint8_t trail, illegal=0;
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
/* one up-front bounds check lets the switch read count bytes without further checks */
if((limit-s)>=count) {
/* modifies c in place: keep only the lead byte bits that are part of the code point */
U8_MASK_LEAD_BYTE((c), count);
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
switch(count) {
/*
* Cases 3 and 2 fall through to read the remaining trail bytes
* (case 3 only if the value is still in range); all other cases leave the switch.
*/
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
illegal=1;
break;
case 3:
trail=*s++;
c=(c<<6)|(trail&0x3f);
/* two more 6-bit shifts follow, so c<0x110 here keeps the final value below 0x110000 */
if(c<0x110) {
/* (trail&0xc0)^0x80 is 0 only if the top two bits are 10, i.e. for a trail byte */
illegal|=(trail&0xc0)^0x80;
} else {
/* code point>0x10ffff, outside Unicode */
illegal=1;
break;
}
U_FALLTHROUGH;
case 2:
trail=*s++;
c=(c<<6)|(trail&0x3f);
/* errors are accumulated rather than branched on; all bytes are known to be in bounds */
illegal|=(trail&0xc0)^0x80;
U_FALLTHROUGH;
case 1:
trail=*s++;
c=(c<<6)|(trail&0x3f);
illegal|=(trail&0xc0)^0x80;
break;
case 0:
return U_SENTINEL;
/* no default branch to optimize switch() - all values are covered */
}
} else {
illegal=1; /* too few bytes left */
}
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
/* illegal is also set if count>=4 */
/* (so the || short-circuits before utf8_minLegal[] is indexed out of bounds) */
U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
/* error handling */
/* don't go beyond this sequence */
/* restart right after the lead byte and skip only bytes that really are trail bytes */
s=*ps;
while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
++s;
--count;
}
c=U_SENTINEL;
}
*ps=s;
return c;
}
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
int32_t destCapacity,
@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
UChar32 ch;
int32_t reqLength = 0;
const uint8_t* pSrc = (const uint8_t*) src;
uint8_t t1, t2; /* trail bytes */
int32_t numSubstitutions;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
numSubstitutions=0;
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
int32_t reqLength = 0;
int32_t numSubstitutions=0;
/*
* Inline processing of UTF-8 byte sequences:
@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
* The code explicitly checks for NULs only in the lead byte position.
* A NUL byte in the trail byte position fails the trail byte range check anyway.
*/
while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
int32_t i;
UChar32 c;
for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
// modified copy of U8_NEXT()
++i;
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(ch<=0xFFFF) {
*(pDest++)=(UChar)ch;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
*(pDest++)=U16_LEAD(ch);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(ch);
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
reqLength++;
break;
*(pDest++)=U16_LEAD(c);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(c);
} else {
reqLength++;
break;
}
}
}
}
}
/* Pre-flight the rest of the string. */
while((ch = *pSrc) != 0) {
if(ch <= 0x7f){
while((c = (uint8_t)src[i]) != 0) {
// modified copy of U8_NEXT()
++i;
if(U8_IS_SINGLE(c)) {
++reqLength;
++pSrc;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
) {
++reqLength;
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
) {
++reqLength;
pSrc += 2;
continue;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
++reqLength;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
(__t1=src[i]-0x80)<=0x3f) {
++reqLength;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(c);
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(ch);
}
}
} else /* srcLength >= 0 */ {
const uint8_t *pSrcLimit = pSrc + srcLength;
int32_t count;
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
int32_t i = 0;
UChar32 c;
for(;;) {
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
* For supplementary code points (4 & 2), which are rare,
* there is an additional adjustment.
*/
count = (int32_t)(pDestLimit - pDest);
srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
if(count > srcLength) {
count = srcLength; /* min(remaining dest, remaining src/3) */
int32_t count = (int32_t)(pDestLimit - pDest);
int32_t count2 = (srcLength - i) / 3;
if(count > count2) {
count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
}
do {
ch = *pSrc;
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
if(c >= 0xf0 || subchar > 0xffff) {
// We may read up to four bytes and write up to two UChars,
// which we didn't account for with computing count,
// so we adjust it here.
if(--count == 0) {
--i; // back out byte c
break;
}
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
if(ch >= 0xf0 || subchar > 0xffff) {
/*
* We may read up to six bytes and write up to two UChars,
* which we didn't account for with computing count,
* so we adjust it here.
*/
if(--count == 0) {
break;
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
*(pDest++)=U16_LEAD(c);
*(pDest++)=U16_TRAIL(c);
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=U16_LEAD(ch);
*(pDest++)=U16_TRAIL(ch);
}
}
} while(--count > 0);
}
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
ch = *pSrc;
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
while(i < srcLength && (pDest < pDestLimit)) {
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
((pSrcLimit - pSrc) >= 3) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
((pSrcLimit - pSrc) >= 2) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=U16_LEAD(ch);
if(pDest<pDestLimit){
*(pDest++)=U16_TRAIL(ch);
}else{
reqLength++;
break;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
*(pDest++)=U16_LEAD(c);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(c);
} else {
reqLength++;
break;
}
}
}
}
}
/* do not fill the dest buffer just count the UChars needed */
while(pSrc < pSrcLimit){
ch = *pSrc;
if(ch <= 0x7f){
reqLength++;
++pSrc;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
((pSrcLimit - pSrc) >= 3) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
((pSrcLimit - pSrc) >= 2) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
/* Pre-flight the rest of the string. */
while(i < srcLength) {
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
++reqLength;
} else {
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
++reqLength;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
++reqLength;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(c);
}
reqLength+=U16_LENGTH(ch);
}
}
}
@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
uint8_t* pSrc = (uint8_t*) src;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)){
return NULL;
}
@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
int32_t numSubstitutions;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)){
return NULL;
}
@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode) {
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
UChar32 ch;
int32_t reqLength = 0;
const uint8_t* pSrc = (const uint8_t*) src;
const uint8_t *pSrcLimit;
int32_t count;
uint8_t t1, t2; /* trail bytes */
int32_t numSubstitutions;
/* args check */
if(U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
numSubstitutions=0;
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
int32_t reqLength = 0;
int32_t numSubstitutions=0;
if(srcLength < 0) {
/*
* Transform a NUL-terminated ASCII string.
* Handle non-ASCII strings with slower code.
*/
while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
*pDest++=(UChar)ch;
++pSrc;
UChar32 c;
while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
*pDest++=(UChar)c;
++src;
}
if(ch == 0) {
if(c == 0) {
reqLength=(int32_t)(pDest - dest);
if(pDestLength) {
*pDestLength = reqLength;
@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
return dest;
}
srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
srcLength = static_cast<int32_t>(uprv_strlen(src));
}
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
UChar32 ch;
uint8_t t1, t2;
int32_t i = 0;
for(;;) {
count = (int32_t)(pDestLimit - pDest);
srcLength = (int32_t)(pSrcLimit - pSrc);
if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
int32_t count = (int32_t)(pDestLimit - pDest);
int32_t count2 = srcLength - i;
if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
/* fast ASCII loop */
const uint8_t *prevSrc = pSrc;
int32_t delta;
while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
*pDest++=(UChar)ch;
++pSrc;
int32_t start = i;
uint8_t b;
while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
*pDest++=b;
++i;
}
delta = (int32_t)(pSrc - prevSrc);
int32_t delta = i - start;
count -= delta;
srcLength -= delta;
count2 -= delta;
}
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* bytes and one UChar.
*/
srcLength /= 3;
if(count > srcLength) {
count = srcLength; /* min(remaining dest, remaining src/3) */
if(subchar > 0xFFFF) {
break;
}
count2 /= 3;
if(count > count2) {
count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
break;
}
do {
ch = *pSrc;
if(ch <= 0x7f){
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)) {
*pDest++=(UChar)ch;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
++i;
continue;
}
}
@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
* We need to write two UChars, adjusted count for that,
* and ran out of space.
*/
--i; // back out byte ch
break;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
} else {
*(pDest++)=U16_LEAD(subchar);
*(pDest++)=U16_TRAIL(subchar);
}
*(pDest++)=(UChar)subchar;
}
}
} while(--count > 0);
}
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
ch = *pSrc;
if(ch <= 0x7f){
while(i < srcLength && (pDest < pDestLimit)) {
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)){
*pDest++=(UChar)ch;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
((pSrcLimit - pSrc) >= 3) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
(i+1) < srcLength &&
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
((pSrcLimit - pSrc) >= 2) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
i < srcLength &&
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
++i;
continue;
}
}
@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
return NULL;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
}
}
/* do not fill the dest buffer just count the UChars needed */
while(pSrc < pSrcLimit){
ch = *pSrc;
if(ch <= 0x7f) {
/* Pre-flight the rest of the string. */
while(i < srcLength) {
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)) {
reqLength++;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
((pSrcLimit - pSrc) >= 3) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
(i+1) < srcLength &&
(uint8_t)(src[i] - 0x80) <= 0x3f &&
(uint8_t)(src[i+1] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
((pSrcLimit - pSrc) >= 2) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
i < srcLength &&
(uint8_t)(src[i] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 2;
++i;
continue;
}
}
@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
return NULL;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
// Pre-flight must count what the filling loops would write for this
// error: the substitution character, which takes two UChars when it is
// supplementary. ch still holds the lead byte here because the return
// value of utf8_nextCharSafeBody() is discarded, so U16_LENGTH(ch)
// would always count just one unit.
reqLength+=U16_LENGTH(subchar);
}

View File

@ -847,15 +847,11 @@ U_CDECL_END
//------------------------------------------------------------------------------
// Chunk size.
// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes.
// Worst case there are six UTF-8 bytes per UChar.
// obsolete 6 byte form fd + 5 trails maps to fffd
// obsolete 5 byte form fc + 4 trails maps to fffd
// non-shortest 4 byte forms maps to fffd
// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
// mapToUChars array size must allow for the worst case, 6.
// This could be brought down to 4, by treating fd and fc as pure illegal,
// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
// to two UChars.)
// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
// is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
@ -895,7 +891,7 @@ struct UTF8Buf {
// Requires two extra slots,
// one for a supplementary starting in the last normal position,
// and one for an entry for the buffer limit position.
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
// corresponding offset in filled part of buf.
int32_t align;
};

View File

@ -7,7 +7,7 @@
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utf_impl.c
* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
@ -54,10 +54,6 @@
* - SUB AX, BX (result)
* -finish:
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
*
* In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
* lead bytes above 0xf4 are illegal.
* We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
*/
extern "C" U_EXPORT const uint8_t
utf8_countTrailBytes[256]={
@ -76,27 +72,24 @@ utf8_countTrailBytes[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// illegal C0 & C1
// 2-byte lead bytes C2..DF
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// 3-byte lead bytes E0..EF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3,
3, 3, 3, /* illegal in Unicode */
4, 4, 4, 4, /* illegal in Unicode */
5, 5, /* illegal in Unicode */
0, 0 /* illegal bytes 0xfe and 0xff */
// 4-byte lead bytes F0..F4
// illegal F5..FF
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
static const UChar32
utf8_errorValue[6]={
// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
// but without relying on the obsolete unicode/utf_old.h.
0x15, 0x9f, 0xffff,
0x10ffff,
0x3ffffff, 0x7fffffff
0x10ffff
};
static UChar32
@ -136,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
// *pi is one after byte c.
int32_t i=*pi;
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
if(i+count<=length || length<0) {
uint8_t trail;
U8_MASK_LEAD_BYTE(c, count);
/* support NUL-terminated strings: do not read beyond the first non-trail byte */
switch(count) {
/* each branch falls through to the next one */
case 0:
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
break;
case 3:
trail=s[i++]-0x80;
c=(c<<6)|trail;
/* c>=0x110 would result in code point>0x10ffff, outside Unicode */
if(c>=0x110 || trail>0x3f) { break; }
U_FALLTHROUGH;
case 2:
trail=s[i++]-0x80;
c=(c<<6)|trail;
/*
* test for a surrogate d800..dfff unless we are lenient:
* before the last (c<<6), a surrogate is c=360..37f
*/
if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
U_FALLTHROUGH;
case 1:
trail=s[i++]-0x80;
c=(c<<6)|trail;
if(trail>0x3f) { break; }
/* correct sequence - all trail bytes have (b7..b6)==(10) */
if(c>=utf8_minLegal[count] &&
/* strict: forbid non-characters like U+fffe */
(strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
if(i==length || c>0xf4) {
// end of string, or not a lead byte
} else if(c>=0xf0) {
// Test for 4-byte sequences first because
// U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
c&=7;
if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
/* no default branch to optimize switch() - all values are covered */
}
} else {
/* too few bytes left */
count=length-i;
}
} else if(c>=0xe0) {
c&=0xf;
if(strict!=-2) {
uint8_t t1=s[i], t2;
if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
uint8_t t1=s[i]-0x80, t2;
if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1;
return (c<<12)|(t1<<6)|t2;
}
}
} else if(c>=0xc2) {
uint8_t t1=s[i]-0x80;
if(t1<=0x3f) {
*pi=i+1;
return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
i=*pi;
while(count>0 && U8_IS_TRAIL(s[i])) {
++i;
--count;
}
c=errorValue(i-*pi, strict);
*pi=i;
return c;
@ -243,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
// *pi is the index of byte c.
int32_t i=*pi;
uint8_t b, count=1, shift=6;
if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
/* extract value bits from the last trail byte */
c&=0x3f;
for(;;) {
if(i<=start) {
/* no lead byte at all */
return errorValue(0, strict);
}
/* read another previous byte */
b=s[--i];
if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
if(b&0x40) {
/* lead byte, this will always end the loop */
uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
if(count==shouldCount) {
/* set the new position */
*pi=i;
U8_MASK_LEAD_BYTE(b, count);
c|=(UChar32)b<<shift;
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
/* illegal sequence or (strict and non-character) */
if(count>=4) {
count=3;
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
*pi=i;
return ((b1-0xc0)<<6)|(c&0x3f);
} else if(U8_IS_TRAIL(b1) && i>start) {
// Extract the value bits from the last trail byte.
c&=0x3f;
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
b2&=0xf;
if(strict!=-2) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
*pi=i;
c=(b2<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(2, strict);
}
c=errorValue(count, strict);
} else {
/* exit with correct c */
}
} else {
/* the lead byte does not match the number of trail bytes */
/* only set the position to the lead byte if it would
include the trail byte that we started with */
if(count<shouldCount) {
// strict=-2 -> lenient: allow surrogates
b1-=0x80;
if((b2>0 || b1>=0x20)) {
*pi=i;
c=errorValue(count, strict);
} else {
c=errorValue(0, strict);
return (b2<<12)|(b1<<6)|c;
}
}
break;
} else if(count<5) {
/* trail byte */
c|=(UChar32)(b&0x3f)<<shift;
++count;
shift+=6;
} else {
/* more than 5 trail bytes is illegal */
c=errorValue(0, strict);
break;
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
if(0xf0<=b3 && b3<=0xf4) {
b3&=7;
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
*pi=i;
c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(3, strict);
}
}
}
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
*pi=i;
return errorValue(2, strict);
}
} else {
/* single-byte character precedes trailing bytes */
c=errorValue(0, strict);
break;
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
*pi=i;
return errorValue(1, strict);
}
}
return c;
return errorValue(0, strict);
}
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
/* i had been decremented once before the function call */
int32_t I=i, Z;
uint8_t b;
/* read at most the 6 bytes s[Z] to s[i], inclusively */
if(I-5>start) {
Z=I-5;
} else {
Z=start;
}
/* return I if the sequence starting there is long enough to include i */
do {
b=s[I];
if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
break;
} else if(b>=0xc0) {
if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
return I;
} else {
break;
// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
int32_t orig_i=i;
uint8_t c=s[i];
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
return i;
} else if(U8_IS_TRAIL(b1) && i>start) {
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
return i;
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
if(0xf0<=b3 && b3<=0xf4) {
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
return i;
}
}
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
return i;
}
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
return i;
}
} while(Z<=--I);
/* return i itself to be consistent with the FWD_1 macro */
return i;
}
return orig_i;
}

View File

@ -20,6 +20,7 @@
#define __UTRIE2_H__
#include "unicode/utypes.h"
#include "unicode/utf8.h"
#include "putilimp.h"
#include "udataswp.h"
@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
* is truncated, omitting both the BMP portion and the high range.
* - There is a special small index for 2-byte UTF-8, and the initial data
* entries are designed for fast 1/2-byte UTF-8 lookup.
* Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
* and the associated 2-byte indexes are unused.
*/
/**
@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
/** Internal UTF-8 next-post-increment: get the next code point's data. */
#define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
uint8_t __lead=(uint8_t)*(src)++; \
if(__lead<0xc0) { \
if(U8_IS_SINGLE(__lead)) { \
(result)=(trie)->ascii[__lead]; \
} else { \
uint8_t __t1, __t2; \
if( /* handle U+0000..U+07FF inline */ \
__lead<0xe0 && (src)<(limit) && \
if( /* handle U+0800..U+FFFF inline */ \
0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
) { \
(src)+=2; \
(result)=(trie)->data[ \
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
<<UTRIE2_INDEX_SHIFT)+ \
(__t2&UTRIE2_DATA_MASK)]; \
} else if( /* handle U+0080..U+07FF inline */ \
__lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
(__t1=(uint8_t)(*(src)-0x80))<=0x3f \
) { \
++(src); \
(result)=(trie)->data[ \
(trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
__t1]; \
} else if( /* handle U+0000..U+CFFF inline */ \
__lead<0xed && ((src)+1)<(limit) && \
(__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
) { \
(src)+=2; \
(result)=(trie)->data[ \
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
(__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
<<UTRIE2_INDEX_SHIFT)+ \
(__t2&UTRIE2_DATA_MASK)]; \
} else { \
int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
(const uint8_t *)(limit)); \
@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
/** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
#define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
uint8_t __b=(uint8_t)*--(src); \
if(__b<0x80) { \
if(U8_IS_SINGLE(__b)) { \
(result)=(trie)->ascii[__b]; \
} else { \
int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \

View File

@ -49,26 +49,25 @@ UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
}
// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
c = u8[pos++];
if(c < 0xc0) {
// ASCII 00..7F; trail bytes 80..BF map to error values.
if(U8_IS_SINGLE(c)) {
// ASCII 00..7F
return trie->data32[c];
}
uint8_t t1, t2;
if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
// U+0080..U+07FF; 00..7F map to error values.
if(0xe0 <= c && c < 0xf0 &&
((pos + 1) < length || length < 0) &&
U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
// U+0800..U+FFFF except surrogates
c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
pos += 2;
return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
// U+0080..U+07FF
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
c = ((c & 0x1f) << 6) | t1;
++pos;
return ce32;
} else if(c <= 0xef &&
((pos + 1) < length || length < 0) &&
(t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f
) {
// U+0800..U+FFFF; caller maps surrogates to error values.
c = (UChar)((c << 12) | (t1 << 6) | t2);
pos += 2;
return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
} else {
// Function call for supplementary code points and error cases.
// Illegal byte sequences yield U+FFFD.
@ -158,28 +157,17 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
return Collation::FALLBACK_CE32;
}
c = u8[pos++];
if(c < 0xc0) {
// ASCII 00..7F; trail bytes 80..BF map to error values.
if(U8_IS_SINGLE(c)) {
// ASCII 00..7F
return trie->data32[c];
}
uint8_t t1, t2;
if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
// U+0080..U+07FF; 00..7F map to error values.
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
c = ((c & 0x1f) << 6) | t1;
++pos;
if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
pos -= 2;
} else {
return ce32;
}
} else if(c <= 0xef &&
((pos + 1) < length || length < 0) &&
(t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f
) {
// U+0800..U+FFFF; caller maps surrogates to error values.
c = (UChar)((c << 12) | (t1 << 6) | t2);
if(0xe0 <= c && c < 0xf0 &&
((pos + 1) < length || length < 0) &&
U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
// U+0800..U+FFFF except surrogates
c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
pos += 2;
if(CollationFCD::hasTccc(c) &&
(CollationFCD::maybeTibetanCompositeVowel(c) ||
@ -188,6 +176,16 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
} else {
break; // return CE32(BMP)
}
} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
// U+0080..U+07FF
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
c = ((c & 0x1f) << 6) | t1;
++pos;
if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
pos -= 2;
} else {
return ce32;
}
} else {
// Function call for supplementary code points and error cases.
// Illegal byte sequences yield U+FFFD.
@ -237,7 +235,7 @@ UBool
FCDUTF8CollationIterator::previousHasTccc() const {
U_ASSERT(state == CHECK_BWD && pos != 0);
UChar32 c = u8[pos - 1];
if(c < 0x80) { return FALSE; }
if(U8_IS_SINGLE(c)) { return FALSE; }
int32_t i = pos;
U8_PREV_OR_FFFD(u8, 0, i, c);
if(c > 0xffff) { c = U16_LEAD(c); }
@ -271,7 +269,7 @@ FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
return U_SENTINEL;
}
if(c < 0x80) {
if(U8_IS_SINGLE(c)) {
++pos;
return c;
}
@ -309,7 +307,7 @@ FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
if(pos == 0) {
return U_SENTINEL;
}
if((c = u8[pos - 1]) < 0x80) {
if(U8_IS_SINGLE(c = u8[pos - 1])) {
--pos;
return c;
}

View File

@ -670,12 +670,13 @@ static void Test_UChar_UTF8_API(void){
}
/* test UTF-8 with single surrogates - illegal in Unicode 3.2 */
// Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors.
{
static const UChar
withLead16[]={ 0x1800, 0xd89a, 0x0061 },
withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
static const uint8_t
withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
@ -706,7 +707,7 @@ static void Test_UChar_UTF8_API(void){
&err);
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
numSubstitutions!=1) {
numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(length) failed\n");
}
@ -721,7 +722,7 @@ static void Test_UChar_UTF8_API(void){
&err);
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
numSubstitutions!=1) {
numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
}
@ -734,7 +735,7 @@ static void Test_UChar_UTF8_API(void){
(const char *)withTrail8, -1,
0x50005, &numSubstitutions,
&err);
if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) {
log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
}
@ -1015,14 +1016,6 @@ Test_FromUTF8Lenient(void) {
log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
}
dest[0]=0x1234;
destLength=-1;
errorCode=U_MEMORY_ALLOCATION_ERROR;
pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
if(dest[0]!=0x1234) {
log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
}
/* test normal behavior */
number=0; /* string number for log_err() */

View File

@ -350,6 +350,11 @@ static void
testTrieUTF8(const char *testName,
const UTrie2 *trie, UTrie2ValueBits valueBits,
const CheckRange checkRanges[], int32_t countCheckRanges) {
// Note: The byte sequence comments refer to the original UTF-8 definition.
// Starting with ICU 60, any sequence that is not a prefix of a valid one
// is treated as multiple single-byte errors.
// For testing, we only rely on U8_... and UTrie2 UTF-8 macros
// iterating consistently.
static const uint8_t illegal[]={
0xc0, 0x80, /* non-shortest U+0000 */
0xc1, 0xbf, /* non-shortest U+007f */
@ -394,15 +399,36 @@ testTrieUTF8(const char *testName,
value=checkRanges[i].value;
/* write three legal (or surrogate) code points */
U8_APPEND_UNSAFE(s, length, prevCP); /* start of the range */
values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
if(U_IS_SURROGATE(prevCP)) {
// A surrogate byte sequence counts as 3 single-byte errors.
values[countValues++]=errorValue;
values[countValues++]=errorValue;
values[countValues++]=errorValue;
} else {
values[countValues++]=value;
}
c=checkRanges[i].limit;
prevCP=(prevCP+c)/2; /* middle of the range */
U8_APPEND_UNSAFE(s, length, prevCP);
values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
if(U_IS_SURROGATE(prevCP)) {
// A surrogate byte sequence counts as 3 single-byte errors.
values[countValues++]=errorValue;
values[countValues++]=errorValue;
values[countValues++]=errorValue;
} else {
values[countValues++]=value;
}
prevCP=c;
--c; /* end of the range */
U8_APPEND_UNSAFE(s, length, c);
values[countValues++]=U_IS_SURROGATE(c) ? errorValue : value;
// The code point just appended is c (the last one in the range);
// prevCP was set to the range limit above, so it must not be tested here.
if(U_IS_SURROGATE(c)) {
    // A surrogate byte sequence counts as 3 single-byte errors.
    values[countValues++]=errorValue;
    values[countValues++]=errorValue;
    values[countValues++]=errorValue;
} else {
    values[countValues++]=value;
}
/* write an illegal byte sequence */
if(i8<sizeof(illegal)) {
U8_FWD_1(illegal, i8, sizeof(illegal));
@ -435,17 +461,20 @@ testTrieUTF8(const char *testName,
}
bytes=0;
if(value!=values[i] || i8!=(p-s)) {
while(prev8<i8) {
bytes=(bytes<<8)|s[prev8++];
int32_t k=prev8;
while(k<i8) {
bytes=(bytes<<8)|s[k++];
}
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): "
"0x%lx instead of 0x%lx\n",
testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8),
(long)value, (long)values[i]);
}
if(i8!=(p-s)) {
log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n",
testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
continue;
}
++i;
@ -471,12 +500,14 @@ testTrieUTF8(const char *testName,
}
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
// Same message layout as the UTRIE2_U8_NEXT report; the second literal must not
// start with another ": " because the first literal already ends with one.
log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): "
        "0x%lx instead of 0x%lx\n",
        testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)),
        (long)value, (long)values[i]);
}
if(i8!=(p-s)) {
log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n",
testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
continue;
}
}

View File

@ -121,7 +121,7 @@ addUTF8Test(TestNode** root)
static void TestCodeUnitValues()
{
static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
int16_t i;
for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
@ -231,28 +231,31 @@ static void TestGetChar()
0x10401, 0x10401, 0x10401 ,
0x10401, 0x10401, 0x10401 ,
0x10401, 0x10401, 0x10401,
0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x65, 0x65, 0x65,
0x31, 0x31, 0x31,
0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
};
uint16_t i=0;
UChar32 c, expected;
uint32_t offset=0;
for(offset=0; offset<sizeof(input); offset++) {
if (offset < sizeof(input) - 1) {
expected = result[i];
if (expected >= 0 && offset < sizeof(input) - 1) {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
UTF8_GET_CHAR_UNSAFE(input, offset, c);
if(c != result[i]){
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
if(c != expected) {
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
#endif
U8_GET_UNSAFE(input, offset, c);
if(c != result[i]){
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
if(c != expected) {
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
}
@ -285,146 +288,160 @@ static void TestGetChar()
}
static void TestNextPrevChar() {
static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
static const uint8_t input[]={
0x61,
0xf0, 0x90, 0x90, 0x81,
0xc0, 0x80, // non-shortest form
0xf3, 0xbe, // truncated
0xc2, // truncated
0x61,
0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
0x00
};
static const UChar32 result[]={
/* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
/* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
0x0061, 0x0061, 0x0000, 0x0000,
0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
0x0000, 0x0000, 0x0061, 0x0061
};
static const int32_t movedOffset[]={
/* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
1, 1, 1, 15, 15, 15,
5, 5, 5, 14, 14 , 14,
3, 3, 3, 9, 13, 13,
4, 4, 4, 9, 12, 12,
5, 5, 5, 9, 11, 11,
7, 7, 7, 10, 10, 10,
7, 7, 7, 9, 9, 9,
8, 9, 9, 7, 7, 7,
9, 9, 9, 7, 7, 7,
11, 10, 10, 5, 5, 5,
11, 11, 11, 5, 5, 5,
12, 12, 12, 1, 1, 1,
13, 13, 13, 1, 1, 1,
14, 14, 14, 1, 1, 1,
14, 15, 15, 1, 1, 1,
14, 16, 16, 0, 0, 0,
/* next_safe prev_safe (each column applies to all NEXT / PREV variants) */
1, 15,
5, 14,
3, 13,
4, 12,
5, 11,
6, 10,
7, 9,
9, 7,
9, 7,
10, 6,
11, 5,
12, 1,
13, 1,
14, 1,
15, 1,
16, 0,
};
/* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
UChar32 c, expected;
uint32_t i=0;
uint32_t i=0, j=0;
uint32_t offset=0;
int32_t setOffset=0;
for(offset=0; offset<sizeof(input); offset++){
expected=result[i+1];
expected=result[i]; // next_safe_ns
#if !U_HIDE_OBSOLETE_UTF_OLD_H
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(c != expected){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
if(setOffset != movedOffset[j]) {
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j], setOffset);
}
if(c != expected) {
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#endif
setOffset=offset;
U8_NEXT(input, setOffset, sizeof(input), c);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
setOffset=offset;
U8_NEXT(input, setOffset, sizeof(input), c);
if(setOffset != movedOffset[j]) {
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j], setOffset);
}
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
if(c != expected){
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
if(c != expected) {
log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
if(setOffset != movedOffset[i+1]){
if(setOffset != movedOffset[j]) {
log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
offset, movedOffset[j], setOffset);
}
if(expected<0) { expected=0xfffd; }
if(c != expected){
log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
if(c != expected) {
log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+2], setOffset);
}
if(c != result[i+2]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
}
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
if(setOffset != movedOffset[j]) {
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j], setOffset);
}
expected=result[i+1]; // next_safe_s
if(c != expected) {
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
#endif
i=i+6;
i=i+4;
j=j+2;
}
i=0;
i=j=0;
for(offset=sizeof(input); offset > 0; --offset){
expected=result[i+4];
expected=result[i+2]; // prev_safe_ns
#if !U_HIDE_OBSOLETE_UTF_OLD_H
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
if(setOffset != movedOffset[i+4]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
if(c != expected){
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
if(setOffset != movedOffset[j+1]) {
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j+1], setOffset);
}
if(c != expected) {
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#endif
setOffset=offset;
U8_PREV(input, 0, setOffset, c);
if(setOffset != movedOffset[i+4]){
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
setOffset=offset;
U8_PREV(input, 0, setOffset, c);
if(setOffset != movedOffset[j+1]) {
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j+1], setOffset);
}
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
if(c != expected){
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
if(c != expected) {
log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U8_PREV_OR_FFFD(input, 0, setOffset, c);
if(setOffset != movedOffset[i+4]){
if(setOffset != movedOffset[j+1]) {
log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
offset, movedOffset[j+1], setOffset);
}
if(expected<0) { expected=0xfffd; }
if(c != expected){
log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
if(c != expected) {
log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
if(setOffset != movedOffset[i+5]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+5], setOffset);
}
if(c != result[i+5]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
}
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
if(setOffset != movedOffset[j+1]) {
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[j+1], setOffset);
}
expected=result[i+3]; // prev_safe_s
if(c != expected) {
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
#endif
i=i+6;
i=i+4;
j=j+2;
}
}
@ -433,11 +450,13 @@ static void TestNulTerminated() {
static const uint8_t input[]={
/* 0 */ 0x61,
/* 1 */ 0xf0, 0x90, 0x90, 0x81,
/* 5 */ 0xc0, 0x80,
/* 5 */ 0xc0,
/* 6 */ 0x80,
/* 7 */ 0xdf, 0x80,
/* 9 */ 0xc2,
/* 10 */ 0x62,
/* 11 */ 0xfd, 0xbe,
/* 11 */ 0xfd,
/* 12 */ 0xbe,
/* 13 */ 0xe0, 0xa0, 0x80,
/* 16 */ 0xe2, 0x82, 0xac,
/* 19 */ 0xf0, 0x90, 0x90,
@ -447,14 +466,16 @@ static void TestNulTerminated() {
static const UChar32 result[]={
0x61,
0x10401,
U_SENTINEL,
U_SENTINEL, // C0 not a lead byte
U_SENTINEL, // 80
0x7c0,
U_SENTINEL,
U_SENTINEL, // C2
0x62,
U_SENTINEL,
U_SENTINEL, // FD not a lead byte
U_SENTINEL, // BE
0x800,
0x20ac,
U_SENTINEL,
U_SENTINEL, // truncated F0 90 90
0
};
@ -544,6 +565,22 @@ static void TestNextPrevNonCharacters() {
log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(idx=0; idx<(int32_t)sizeof(nonChars);) {
UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
if(ch!=expected) {
log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
}
}
for(idx=(int32_t)sizeof(nonChars); idx>0;) {
UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
if(ch!=expected) {
log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
}
}
#endif
}
static void TestNextPrevCharUnsafe() {
@ -563,58 +600,83 @@ static void TestNextPrevCharUnsafe() {
static const UChar32 codePoints[]={
0x61,
0x10401,
0,
-1,
0x20ac,
0xa1,
0x10ffff,
0
};
UChar32 c;
UChar32 c, expected;
int32_t i;
uint32_t offset;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=0, offset=0; offset<sizeof(input); ++i) {
UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
if(c != codePoints[i]){
expected = codePoints[i];
if(expected >= 0 && c != expected) {
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, codePoints[i], c);
offset, expected, c);
}
if(offset==6) {
// The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
// while the new one skips C0 80 together.
++offset;
}
}
#endif
for(i=0, offset=0; offset<sizeof(input); ++i) {
U8_NEXT_UNSAFE(input, offset, c);
if(c != codePoints[i]){
expected = codePoints[i];
if(expected >= 0 && c != expected) {
log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, codePoints[i], c);
offset, expected, c);
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
UTF8_PREV_CHAR_UNSAFE(input, offset, c);
if(c != codePoints[i]){
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, codePoints[i], c);
}
UTF8_PREV_CHAR_UNSAFE(input, offset, c);
expected = codePoints[i];
if(expected >= 0 && c != expected) {
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
}
#endif
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
U8_PREV_UNSAFE(input, offset, c);
if(c != codePoints[i]){
log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, codePoints[i], c);
}
U8_PREV_UNSAFE(input, offset, c);
expected = codePoints[i];
if(expected >= 0 && c != expected) {
log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
offset, expected, c);
}
}
}
static void TestFwdBack() {
static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
static const uint8_t input[]={
0x61,
0xF0, 0x90, 0x90, 0x81,
0xff,
0x62,
0xc0,
0x80,
0x7f,
0x8f,
0xc0,
0x63,
0x81,
0x90,
0x90,
0xF0,
0x00
};
static const uint16_t fwd_safe[] ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0};
static const uint16_t back_N_safe[] ={18, 17, 15, 11, 10, 8, 7, 0};
uint32_t offsafe=0;
@ -707,7 +769,10 @@ static void TestFwdBackUnsafe() {
0xf4, 0x8f, 0xbf, 0xbf,
0x00
};
static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
// forward unsafe skips only C0
static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
// backward unsafe skips C0 80 together
static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
int32_t offset;
int32_t i;
@ -726,17 +791,17 @@ static void TestFwdBackUnsafe() {
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
UTF8_BACK_1_UNSAFE(input, offset);
if(offset != boundaries[i]){
log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
if(offset != backBoundaries[i]){
log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
}
}
#endif
for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
U8_BACK_1_UNSAFE(input, offset);
if(offset != boundaries[i]){
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
if(offset != backBoundaries[i]){
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
@ -756,21 +821,21 @@ static void TestFwdBackUnsafe() {
}
}
#if !U_HIDE_OBSOLETE_UTF_OLD_H
for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
offset=UPRV_LENGTHOF(input);
UTF8_BACK_N_UNSAFE(input, offset, i);
if(offset != boundaries[j]) {
log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
if(offset != backBoundaries[j]) {
log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
}
}
#endif
for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
offset=UPRV_LENGTHOF(input);
U8_BACK_N_UNSAFE(input, offset, i);
if(offset != boundaries[j]) {
log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
if(offset != backBoundaries[j]) {
log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
}
}
}
@ -1138,8 +1203,12 @@ TestSurrogates() {
log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
}
if(is!=iu || il!=iu) {
log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
// U8_NEXT() skips only the first byte of a surrogate byte sequence.
// For a surrogate code point the expected U8_NEXT() index is i+1,
// otherwise it must match the unsafe index iu.
if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
    // One %ld conversion, one argument.
    log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
}
// L8_NEXT() must end at the same index as the unsafe macro.
if(il!=iu) {
    log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
}
++k; /* next code point */
@ -1175,8 +1244,12 @@ TestSurrogates() {
log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
}
if(is!=iu || il !=iu) {
log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
// U8_PREV() skips only the last byte of a surrogate byte sequence.
// For a surrogate code point the expected U8_PREV() index is i-1,
// otherwise it must match the unsafe index iu.
if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
    // One %ld conversion, one argument.
    log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
}
// L8_PREV() must end at the same index as the unsafe macro.
if(il !=iu) {
    log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
}
i=iu; /* go back by one UTF-8 sequence */

View File

@ -294,24 +294,22 @@ void CollationTest::TestIllegalUTF8() {
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
static const char *strings[] = {
// U+FFFD
"a\xef\xbf\xbdz",
// illegal byte sequences
"a\x80z", // trail byte
"a\xc1\x81z", // non-shortest form
"a\xe0\x82\x83z", // non-shortest form
"a\xed\xa0\x80z", // lead surrogate: would be U+D800
"a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
"a\xf0\x8f\xbf\xbfz", // non-shortest form
"a\xf4\x90\x80\x80z" // out of range: would be U+110000
// string with U+FFFD == illegal byte sequence
u8"a\uFFFDz", "a\x80z", // trail byte
u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
};
StringPiece fffd(strings[0]);
for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
StringPiece illegal(strings[i]);
for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
StringPiece fffd(strings[i]);
StringPiece illegal(strings[i + 1]);
UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
if(order != UCOL_EQUAL) {
errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
(int)i, order);
}
}

View File

@ -146,7 +146,7 @@ void
StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0
|| UTF8_COUNT_TRAIL_BYTES(0xC0) != 1
|| UTF8_COUNT_TRAIL_BYTES(0xC2) != 1
|| UTF8_COUNT_TRAIL_BYTES(0xE0) != 2
|| UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) {
errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h.");
@ -155,7 +155,7 @@ StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
// Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated)
// have completely different implementations.
if (U8_COUNT_TRAIL_BYTES(0x7F) != 0
|| U8_COUNT_TRAIL_BYTES(0xC0) != 1
|| U8_COUNT_TRAIL_BYTES(0xC2) != 1
|| U8_COUNT_TRAIL_BYTES(0xE0) != 2
|| U8_COUNT_TRAIL_BYTES(0xF0) != 3) {
errln("U8_COUNT_TRAIL_BYTES does not work right! See utf8.h.");

View File

@ -1881,9 +1881,9 @@ UnicodeStringTest::TestUTF8() {
0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
};
static const UChar expected_utf16[] = {
0x41, 0xfffd,
0x61, 0xfffd,
0xfffd, 0x5a,
0x41, 0xfffd, 0xfffd, 0xfffd,
0x61, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd,0x5a,
0xd900, 0xdc00, 0x7a,
0xd800, 0xdc00, 0xd840, 0xdc00,
0xdb40, 0xdc00, 0xdbff, 0xdfff

View File

@ -60,7 +60,6 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(Ticket10562);
TESTCASE_AUTO(Ticket10983);
TESTCASE_AUTO(Ticket12130);
TESTCASE_AUTO(Ticket12888);
TESTCASE_AUTO(Ticket13344);
TESTCASE_AUTO_END;
}
@ -951,10 +950,14 @@ void UTextTest::ErrorTest()
UChar buf[10];
int n = utext_extract(ut, 0, 9, buf, 10, &status);
TEST_SUCCESS(status);
TEST_ASSERT(n==5);
TEST_ASSERT(n==7);
TEST_ASSERT(buf[0] == 0x41);
TEST_ASSERT(buf[1] == 0xfffd);
TEST_ASSERT(buf[3] == 0xfffd);
TEST_ASSERT(buf[2] == 0x42);
TEST_ASSERT(buf[3] == 0xfffd);
TEST_ASSERT(buf[4] == 0xfffd);
TEST_ASSERT(buf[5] == 0xfffd);
TEST_ASSERT(buf[6] == 0x43);
utext_close(ut);
}
@ -1578,66 +1581,6 @@ void UTextTest::Ticket12130() {
utext_close(&ut);
}
// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
// six byte utf-8 forms. Original implementation had an assumption that
// there would be at most three utf-8 bytes per UTF-16 code unit.
// The five and six byte sequences map to a single replacement character.
void UTextTest::Ticket12888() {
const char *badString =
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
UErrorCode status = U_ZERO_ERROR;
LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
TEST_SUCCESS(status);
for (;;) {
UChar32 c = utext_next32(ut.getAlias());
if (c == U_SENTINEL) {
break;
}
}
int32_t endIdx = utext_getNativeIndex(ut.getAlias());
if (endIdx != (int32_t)strlen(badString)) {
errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
return;
}
for (int32_t prevIndex = endIdx; prevIndex>0;) {
UChar32 c = utext_previous32(ut.getAlias());
int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
if (c != 0xfffd) {
errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
__FILE__, __LINE__, 0xfffd, c, currentIndex);
break;
}
if (currentIndex != prevIndex - 6) {
errln("%s:%d: wrong index. Expected, actual = %d, %d",
__FILE__, __LINE__, prevIndex - 6, currentIndex);
break;
}
prevIndex = currentIndex;
}
}
// Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
// of a supplementary character.

View File

@ -38,7 +38,6 @@ public:
void Ticket10562();
void Ticket10983();
void Ticket12130();
void Ticket12888();
void Ticket13344();
private:

View File

@ -16,11 +16,12 @@ import com.ibm.icu.util.OutputInt;
/**
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
*
*
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*/
public final class BMPSet {
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
@ -34,9 +35,8 @@ public final class BMPSet {
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
*
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
* runtime.
*
* Bits for 0..FF are unused (0).
*/
private int[] table7FF;
@ -46,9 +46,8 @@ public final class BMPSet {
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
* and set.contains(c) must be called.
*
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
* validity checking at runtime.
*
* Bits for 0..7FF are unused (0).
*/
private int[] bmpBlockBits;
@ -127,7 +126,7 @@ public final class BMPSet {
/**
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
* spanCondition==0 or 1.
*
*
* @param start The start index
* @param outCount If not null: Receives the number of code points in the span.
* @return the limit (exclusive end) of the span
@ -232,7 +231,7 @@ public final class BMPSet {
* Symmetrical with span().
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
* limit and spanCondition==0 or 1.
*
*
* @return The string index which starts the span (i.e. inclusive).
*/
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
@ -462,10 +461,10 @@ public final class BMPSet {
/**
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
* points in a certain range.
*
*
* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
* hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
*
*
* @param c
* a character in a subrange of MIN_VALUE..MAX_VALUE
* @param lo
@ -512,4 +511,3 @@ public final class BMPSet {
return (0 != (findCodePoint(c, lo, hi) & 1));
}
}