ICU-13311 change illegal-UTF-8 handling in non-converter code
X-SVN-Rev: 40445
This commit is contained in:
parent
119d75dc46
commit
27c08578ac
@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
|
||||
|
||||
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
|
||||
list(parentList), listLength(parentListLength) {
|
||||
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
|
||||
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
|
||||
uprv_memset(table7FF, 0, sizeof(table7FF));
|
||||
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
|
||||
|
||||
@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
|
||||
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
|
||||
}
|
||||
list4kStarts[0x11]=listLength-1;
|
||||
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
|
||||
|
||||
initBits();
|
||||
overrideIllegal();
|
||||
}
|
||||
|
||||
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
|
||||
containsFFFD(otherBMPSet.containsFFFD),
|
||||
list(newParentList), listLength(newParentListLength) {
|
||||
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
|
||||
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
|
||||
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
|
||||
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
|
||||
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
|
||||
@ -120,7 +122,7 @@ void BMPSet::initBits() {
|
||||
UChar32 start, limit;
|
||||
int32_t listIndex=0;
|
||||
|
||||
// Set asciiBytes[].
|
||||
// Set latin1Contains[].
|
||||
do {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
@ -128,13 +130,30 @@ void BMPSet::initBits() {
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(start>=0x80) {
|
||||
if(start>=0x100) {
|
||||
break;
|
||||
}
|
||||
do {
|
||||
asciiBytes[start++]=1;
|
||||
} while(start<limit && start<0x80);
|
||||
} while(limit<=0x80);
|
||||
latin1Contains[start++]=1;
|
||||
} while(start<limit && start<0x100);
|
||||
} while(limit<=0x100);
|
||||
|
||||
// Find the first range overlapping with (or after) 80..FF again,
|
||||
// to include them in table7FF as well.
|
||||
for(listIndex=0;;) {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(limit>0x80) {
|
||||
if(start<0x80) {
|
||||
start=0x80;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Set table7FF[].
|
||||
while(start<0x800) {
|
||||
@ -204,19 +223,14 @@ void BMPSet::initBits() {
|
||||
* for faster validity checking at runtime.
|
||||
* No need to set 0 values where they were reset to 0 in the constructor
|
||||
* and not modified by initBits().
|
||||
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* Need to set 0 values for surrogates D800..DFFF.
|
||||
*/
|
||||
void BMPSet::overrideIllegal() {
|
||||
uint32_t bits, mask;
|
||||
int32_t i;
|
||||
|
||||
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
|
||||
// contains(FFFD)==TRUE
|
||||
for(i=0x80; i<0xc0; ++i) {
|
||||
asciiBytes[i]=1;
|
||||
}
|
||||
|
||||
if(containsFFFD) {
|
||||
bits=3; // Lead bytes 0xC0 and 0xC1.
|
||||
for(i=0; i<64; ++i) {
|
||||
table7FF[i]|=bits;
|
||||
@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
|
||||
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
|
||||
}
|
||||
} else {
|
||||
// contains(FFFD)==FALSE
|
||||
mask=~(0x10001<<0xd); // Lead byte 0xED.
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]&=mask;
|
||||
@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
|
||||
|
||||
UBool
|
||||
BMPSet::contains(UChar32 c) const {
|
||||
if((uint32_t)c<=0x7f) {
|
||||
return (UBool)asciiBytes[c];
|
||||
if((uint32_t)c<=0xff) {
|
||||
return (UBool)latin1Contains[c];
|
||||
} else if((uint32_t)c<=0x7ff) {
|
||||
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
|
||||
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
|
||||
@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
|
||||
// span
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
|
||||
// span not
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
|
||||
// span
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
|
||||
// span not
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
@ -497,22 +510,22 @@ const uint8_t *
|
||||
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
const uint8_t *limit=s+length;
|
||||
uint8_t b=*s;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// Initial all-ASCII span.
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b] || ++s==limit) {
|
||||
if(!latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b] || ++s==limit) {
|
||||
if(latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
length=(int32_t)(limit-s);
|
||||
}
|
||||
@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
||||
// single trail byte, check for preceding 3- or 4-byte lead byte
|
||||
if(length>=2 && (b=*(limit-2))>=0xe0) {
|
||||
limit-=2;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
|
||||
// 4-byte lead byte with only two trail bytes
|
||||
limit-=3;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lead byte with no trail bytes
|
||||
--limit;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
||||
|
||||
while(s<limit) {
|
||||
b=*s;
|
||||
if(b<0xc0) {
|
||||
// ASCII; or trail bytes with the result of contains(FFFD).
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
if(!latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
if(latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
++s; // Advance past the lead byte.
|
||||
@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
||||
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
|
||||
if( ( (0x10000<=c && c<=0x10ffff) ?
|
||||
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
|
||||
asciiBytes[0x80]
|
||||
containsFFFD
|
||||
) != spanCondition
|
||||
) {
|
||||
return s-1;
|
||||
@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
||||
s+=3;
|
||||
continue;
|
||||
}
|
||||
} else /* 0xc0<=b<0xe0 */ {
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
b>=0xc0 &&
|
||||
(t1=(uint8_t)(*s-0x80)) <= 0x3f
|
||||
) {
|
||||
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
|
||||
@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
||||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
// Handle each byte of an illegal sequence separately to simplify the code;
|
||||
// no need to optimize error handling.
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
|
||||
|
||||
do {
|
||||
b=s[--length];
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII sub-span
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
if(!latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
if(latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span()
|
||||
* optimized for BMP code points. Structured to be UTF-8-friendly.
|
||||
*
|
||||
* ASCII: Look up bytes.
|
||||
* Latin-1: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
|
||||
* with mixed for illegal ranges.
|
||||
* Supplementary characters: Call contains() on the parent set.
|
||||
* Supplementary characters: Binary search over
|
||||
* the supplementary part of the parent set's inversion list.
|
||||
*/
|
||||
class BMPSet : public UMemory {
|
||||
public:
|
||||
@ -96,12 +97,12 @@ private:
|
||||
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
/*
|
||||
* One byte per ASCII character, or trail byte in lead position.
|
||||
* 0 or 1 for ASCII characters.
|
||||
* The value for trail bytes is the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
* One byte 0 or 1 per Latin-1 character.
|
||||
*/
|
||||
UBool asciiBytes[0xc0];
|
||||
UBool latin1Contains[0x100];
|
||||
|
||||
/* TRUE if contains(U+FFFD). */
|
||||
UBool containsFFFD;
|
||||
|
||||
/*
|
||||
* One bit per code point from U+0000..U+07FF.
|
||||
|
@ -23,9 +23,6 @@
|
||||
* This file defines macros for checking whether a code point is
|
||||
* a surrogate or a non-character etc.
|
||||
*
|
||||
* The UChar and UChar32 data types for Unicode code units and code points
|
||||
* are defined in umachine.h because they can be machine-dependent.
|
||||
*
|
||||
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
|
||||
* and itself includes utf8.h and utf16.h after some
|
||||
* common definitions.
|
||||
@ -50,11 +47,11 @@
|
||||
* but are optimized for the much more frequently occurring BMP code points.
|
||||
*
|
||||
* umachine.h defines UChar to be an unsigned 16-bit integer.
|
||||
* Where available, UChar is defined to be a char16_t
|
||||
* or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
|
||||
* Since ICU 59, ICU uses char16_t in C++, UChar only in C,
|
||||
* and defines UChar=char16_t by default. See the UChar API docs for details.
|
||||
*
|
||||
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff).
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
|
||||
* Before ICU 2.4, the definition of UChar32 was as platform-dependent as
|
||||
* the definition of UChar. For details see the documentation for UChar32 itself.
|
||||
*
|
||||
@ -63,11 +60,20 @@
|
||||
* For actual Unicode character properties see uchar.h.
|
||||
*
|
||||
* By default, string operations must be done with error checking in case
|
||||
* a string is not well-formed UTF-16.
|
||||
* The macros will detect if a surrogate code unit is unpaired
|
||||
* a string is not well-formed UTF-16 or UTF-8.
|
||||
*
|
||||
* The U16_ macros detect if a surrogate code unit is unpaired
|
||||
* (lead unit without trail unit or vice versa) and just return the unit itself
|
||||
* as the code point.
|
||||
*
|
||||
* The U8_ macros detect illegal byte sequences and return a negative value.
|
||||
* Starting with ICU 60, the observable length of a single illegal byte sequence
|
||||
* skipped by one of these macros follows the Unicode 6+ recommendation
|
||||
* which is consistent with the W3C Encoding Standard.
|
||||
*
|
||||
* There are ..._OR_FFFD versions of both U16_ and U8_ macros
|
||||
* that return U+FFFD for illegal code unit sequences.
|
||||
*
|
||||
* The regular "safe" macros require that the initial, passed-in string index
|
||||
* is within bounds. They only check the index when they read more than one
|
||||
* code unit. This is usually done with code similar to the following loop:
|
||||
@ -91,10 +97,7 @@
|
||||
* The performance differences are much larger here because UTF-8 provides so
|
||||
* many opportunities for malformed sequences.
|
||||
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
|
||||
* and are fast, while the safe UTF-8 macros call functions for all but the
|
||||
* trivial (ASCII) cases.
|
||||
* (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
|
||||
* characters inline as well.)
|
||||
* and are fast, while the safe UTF-8 macros call functions for some complicated cases.
|
||||
*
|
||||
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
|
||||
* code point values (0..U+10ffff). They are indicated with negative values instead.
|
||||
@ -126,8 +129,7 @@
|
||||
*/
|
||||
#define U_IS_UNICODE_NONCHAR(c) \
|
||||
((c)>=0xfdd0 && \
|
||||
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
|
||||
(uint32_t)(c)<=0x10ffff)
|
||||
((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
|
||||
|
||||
/**
|
||||
* Is c a Unicode code point value (0..U+10ffff)
|
||||
@ -148,9 +150,7 @@
|
||||
*/
|
||||
#define U_IS_UNICODE_CHAR(c) \
|
||||
((uint32_t)(c)<0xd800 || \
|
||||
((uint32_t)(c)>0xdfff && \
|
||||
(uint32_t)(c)<=0x10ffff && \
|
||||
!U_IS_UNICODE_NONCHAR(c)))
|
||||
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
|
||||
|
||||
/**
|
||||
* Is this code point a BMP code point (U+0000..U+ffff)?
|
||||
|
@ -41,34 +41,24 @@
|
||||
|
||||
/* internal definitions ----------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Counts the trail bytes for a UTF-8 lead byte.
|
||||
* Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
|
||||
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
|
||||
* leadByte might be evaluated multiple times.
|
||||
*
|
||||
* This is internal since it is not meant to be called directly by external clients;
|
||||
* however it is called by public macros in this file and thus must remain stable.
|
||||
*
|
||||
* Note: Beginning with ICU 50, the implementation uses a multi-condition expression
|
||||
* which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
|
||||
* leadByte is evaluated multiple times.
|
||||
*
|
||||
* The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
|
||||
* #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
|
||||
* leadByte was evaluated exactly once.
|
||||
*
|
||||
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
|
||||
* @internal
|
||||
*/
|
||||
#define U8_COUNT_TRAIL_BYTES(leadByte) \
|
||||
((uint8_t)(leadByte)<0xf0 ? \
|
||||
((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
|
||||
(uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
|
||||
((uint8_t)(leadByte)<=0xf4 ? \
|
||||
((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)
|
||||
|
||||
/**
|
||||
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
|
||||
* The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
|
||||
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
|
||||
* leadByte might be evaluated multiple times.
|
||||
*
|
||||
* This is internal since it is not meant to be called directly by external clients;
|
||||
@ -78,7 +68,7 @@
|
||||
* @internal
|
||||
*/
|
||||
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
|
||||
(((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
|
||||
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
|
||||
|
||||
/**
|
||||
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
|
||||
@ -89,6 +79,34 @@
|
||||
*/
|
||||
/* Modifies leadByte in place (&=), so leadByte must be a modifiable lvalue. */
/* Keeps only its low (6-countTrailBytes) bits: 5, 4 or 3 bits for 1, 2 or 3 trail bytes. */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||
|
||||
/**
|
||||
* Internal bit vector for 3-byte UTF-8 validity check.
|
||||
* Lead byte E0..EF bits 3..0 as byte index,
|
||||
* first trail byte bits 7..5 as bit index into that byte.
|
||||
* @internal
|
||||
*/
|
||||
/*
 * 16 one-byte entries, one per lead byte E0..EF. Bit n of an entry is set when a
 * first trail byte whose top three bits equal n may follow that lead byte:
 *   E0     -> 0x20: bit 5 only,   first trail byte A0..BF (80..9F would decode below U+0800)
 *   ED     -> 0x10: bit 4 only,   first trail byte 80..9F (A0..BF would decode to D800..DFFF)
 *   others -> 0x30: bits 4 and 5, first trail byte 80..BF
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
|
||||
|
||||
/**
|
||||
* Internal 3-byte UTF-8 validity check.
|
||||
* @internal
|
||||
*/
|
||||
/*
 * Evaluates to nonzero (not necessarily 1) if t1 is an acceptable first trail byte
 * after the 3-byte lead byte, otherwise to 0.
 * Only the low 4 bits of lead select the table entry, so the caller must already
 * know that lead is in E0..EF.
 * The (uint8_t) cast keeps the shifted value in 0..7 whatever the type of t1.
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
|
||||
|
||||
/**
|
||||
* Internal bit vector for 4-byte UTF-8 validity check.
|
||||
* First trail byte bits 7..4 as byte index,
|
||||
* lead byte F0..F4 bits 2..0 as bit index into that byte.
|
||||
* @internal
|
||||
*/
|
||||
/*
 * 16 one-byte entries indexed by the high nibble of the first trail byte.
 * Bit n of an entry is set when lead byte F0+n may precede such a trail byte:
 *   nibble 8    -> 0x1E: leads F1..F4 (first trail byte 80..8F)
 *   nibble 9..B -> 0x0F: leads F0..F3 (first trail byte 90..BF)
 *   all others  -> 0x00: the byte is not a trail byte
 * Bits 5..7 are never set, so lead bytes F5..F7 are rejected for every trail byte.
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
|
||||
|
||||
/**
|
||||
* Internal 4-byte UTF-8 validity check.
|
||||
* @internal
|
||||
*/
|
||||
/*
 * Evaluates to nonzero (not necessarily 1) if t1 is an acceptable first trail byte
 * after the 4-byte lead byte, otherwise to 0.
 * Only the low 3 bits of lead select the bit, so the caller must already know
 * that lead is in F0..F7; the table yields 0 for F5..F7.
 * The (uint8_t) cast keeps the table index in 0..15 whatever the type of t1.
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
|
||||
|
||||
/**
|
||||
* Function for handling "next code point" with error-checking.
|
||||
*
|
||||
@ -153,7 +171,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
|
||||
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
|
||||
// 0x32=0xf4-0xc2
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
@ -161,7 +180,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
|
||||
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
|
||||
|
||||
/**
|
||||
* How many code units (bytes) are used for the UTF-8 encoding
|
||||
@ -289,7 +308,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
*/
|
||||
#define U8_NEXT_UNSAFE(s, i, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
if((c)<0xe0) { \
|
||||
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
|
||||
} else if((c)<0xf0) { \
|
||||
@ -325,22 +344,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
*/
|
||||
#define U8_NEXT(s, i, length, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+1000..U+CFFF inline */ \
|
||||
(0xe0<(c) && (c)<=0xec) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
|
||||
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
|
||||
) { \
|
||||
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
(0xe0<=(c) && (c)<0xf0) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
|
||||
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
|
||||
(i)+=2; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
|
||||
) { \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(s)[i]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0x1f)<<6)|__t1; \
|
||||
++(i); \
|
||||
} else { \
|
||||
@ -376,22 +392,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
*/
|
||||
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+1000..U+CFFF inline */ \
|
||||
(0xe0<(c) && (c)<=0xec) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
|
||||
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
|
||||
) { \
|
||||
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
(0xe0<=(c) && (c)<0xf0) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
|
||||
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
|
||||
(i)+=2; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
|
||||
) { \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(s)[i]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0x1f)<<6)|__t1; \
|
||||
++(i); \
|
||||
} else { \
|
||||
@ -476,7 +489,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1_UNSAFE(s, i) { \
|
||||
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
|
||||
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
|
||||
}
|
||||
|
||||
/**
|
||||
@ -493,15 +506,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1(s, i, length) { \
|
||||
uint8_t __b=(uint8_t)(s)[(i)++]; \
|
||||
if(U8_IS_LEAD(__b)) { \
|
||||
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
|
||||
if((i)+__count>(length) && (length)>=0) { \
|
||||
__count=(uint8_t)((length)-(i)); \
|
||||
} \
|
||||
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
--__count; \
|
||||
uint8_t __b=(s)[(i)++]; \
|
||||
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
|
||||
uint8_t __t1=(s)[i]; \
|
||||
if((0xe0<=__b && __b<0xf0)) { \
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
} else if(__b<0xe0) { \
|
||||
if(U8_IS_TRAIL(__t1)) { \
|
||||
++(i); \
|
||||
} \
|
||||
} else /* c>=0xf0 */ { \
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
@ -615,7 +637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
/* c is a trail byte */ \
|
||||
(c)&=0x3f; \
|
||||
for(;;) { \
|
||||
__b=(uint8_t)(s)[--(i)]; \
|
||||
__b=(s)[--(i)]; \
|
||||
if(__b>=0xc0) { \
|
||||
U8_MASK_LEAD_BYTE(__b, __count); \
|
||||
(c)|=(UChar32)__b<<__shift; \
|
||||
@ -651,7 +673,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
*/
|
||||
#define U8_PREV(s, start, i, c) { \
|
||||
(c)=(uint8_t)(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
|
||||
} \
|
||||
}
|
||||
@ -682,7 +704,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
*/
|
||||
#define U8_PREV_OR_FFFD(s, start, i, c) { \
|
||||
(c)=(uint8_t)(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
|
||||
} \
|
||||
}
|
||||
|
@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
|
||||
static inline int32_t
|
||||
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
UChar32 c=*s;
|
||||
if((int8_t)c>=0) {
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
|
||||
@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
static inline int32_t
|
||||
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
UChar32 c=s[length-1];
|
||||
if((int8_t)c>=0) {
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
int32_t i=length-1;
|
||||
@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
|
||||
// Try to match if the increment is not listed already.
|
||||
// Match at code point boundaries. (The UTF-8 strings were converted
|
||||
// from UTF-16 and are guaranteed to be well-formed.)
|
||||
if( !U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
!offsets.containsOffset(inc) &&
|
||||
matches8(s+pos-overlap, s8, length8)
|
||||
|
||||
) {
|
||||
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
!offsets.containsOffset(inc) &&
|
||||
matches8(s+pos-overlap, s8, length8)) {
|
||||
if(inc==rest) {
|
||||
return length; // Reached the end of the string.
|
||||
}
|
||||
@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
|
||||
// Try to match if the string is longer or starts earlier.
|
||||
// Match at code point boundaries. (The UTF-8 strings were converted
|
||||
// from UTF-16 and are guaranteed to be well-formed.)
|
||||
if( !U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||||
matches8(s+pos-overlap, s8, length8)
|
||||
|
||||
) {
|
||||
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
(overlap>maxOverlap ||
|
||||
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||||
matches8(s+pos-overlap, s8, length8)) {
|
||||
maxInc=inc; // Longest match from earliest start.
|
||||
maxOverlap=overlap;
|
||||
break;
|
||||
|
@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
/* for utf8_nextCharSafeBodyTerminated() */
|
||||
/*
 * Indexed by the number of trail bytes (0..3): the smallest code point value
 * accepted for a sequence of that length. The decoders compare the decoded value
 * against this entry and treat anything smaller as an illegal sequence.
 */
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/*
|
||||
* Version of utf8_nextCharSafeBody() with the following differences:
|
||||
* - checks for NUL termination instead of length
|
||||
* - works with pointers instead of indexes
|
||||
* - always strict (strict==-1)
|
||||
*
|
||||
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
||||
* c is the lead byte.
|
||||
* @return the code point, or U_SENTINEL
|
||||
*/
|
||||
/*
 * Decodes the trail bytes that follow lead byte c, reading from *ps.
 * Returns the code point, or U_SENTINEL for an illegal sequence, and stores the
 * position after the consumed bytes back into *ps.
 * There is no length argument: a byte that is not a trail byte (which includes
 * the terminating NUL) stops decoding before any later byte is read.
 */
static UChar32
|
||||
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
|
||||
const uint8_t *s=*ps;
|
||||
uint8_t trail, illegal=0;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
/* The switch below has cases for 0..5 trail bytes only. */
U_ASSERT(count<6);
|
||||
/* Drop the length-marker bits of the lead byte; c keeps only code point bits. */
U8_MASK_LEAD_BYTE((c), count);
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
illegal=1;
|
||||
break;
|
||||
case 3:
|
||||
/*
 * Subtracting 0x80 maps a trail byte 80..BF to 0..3F. Every other byte,
 * including a NUL (which becomes 0x80), ends up above 0x3f.
 */
trail=(uint8_t)(*s++ - 0x80);
|
||||
c=(c<<6)|trail;
|
||||
/* Two more 6-bit shifts follow, so c>=0x110 here means a final value >=0x110000. */
if(trail>0x3f || c>=0x110) {
|
||||
/* not a trail byte, or code point>0x10ffff (outside Unicode) */
|
||||
illegal=1;
|
||||
/* Leave the switch right away so that no byte after this one is read. */
break;
|
||||
}
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=(uint8_t)(*s++ - 0x80);
|
||||
if(trail>0x3f) {
|
||||
/* not a trail byte */
|
||||
illegal=1;
|
||||
/* Leave the switch right away so that no byte after this one is read. */
break;
|
||||
}
|
||||
c=(c<<6)|trail;
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=(uint8_t)(*s++ - 0x80);
|
||||
if(trail>0x3f) {
|
||||
/* not a trail byte */
|
||||
illegal=1;
|
||||
/* No early exit needed: this is the last byte the switch reads. */
}
|
||||
c=(c<<6)|trail;
|
||||
break;
|
||||
case 0:
|
||||
return U_SENTINEL;
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
/*
 * illegal is tested first, so utf8_minLegal[] (4 entries) is not indexed
 * when count is 4 or 5.
 */
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
|
||||
/* error handling */
|
||||
/* don't go beyond this sequence */
|
||||
/* Restart just after the lead byte and skip at most count trail bytes. */
s=*ps;
|
||||
while(count>0 && U8_IS_TRAIL(*s)) {
|
||||
++s;
|
||||
--count;
|
||||
}
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
*ps=s;
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* Version of utf8_nextCharSafeBody() with the following differences:
|
||||
* - works with pointers instead of indexes
|
||||
* - always strict (strict==-1)
|
||||
*
|
||||
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
||||
* c is the lead byte.
|
||||
* @return the code point, or U_SENTINEL
|
||||
*/
|
||||
/*
 * Decodes the trail bytes that follow lead byte c, reading from *ps and never
 * at or beyond limit.
 * Returns the code point, or U_SENTINEL for an illegal or truncated sequence,
 * and stores the position after the consumed bytes back into *ps.
 */
static UChar32
|
||||
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
|
||||
const uint8_t *s=*ps;
|
||||
uint8_t trail, illegal=0;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
/*
 * If all count trail bytes lie before limit, the switch can read them
 * without any further bounds checks.
 */
if((limit-s)>=count) {
|
||||
/* Drop the length-marker bits of the lead byte; c keeps only code point bits. */
U8_MASK_LEAD_BYTE((c), count);
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
illegal=1;
|
||||
break;
|
||||
case 3:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
/* Two more 6-bit shifts follow, so c<0x110 here means a final value below 0x110000. */
if(c<0x110) {
|
||||
/*
 * (trail&0xc0)^0x80 is 0 only for a byte of the form 10xxxxxx.
 * Errors are accumulated in illegal instead of branching on each byte.
 */
illegal|=(trail&0xc0)^0x80;
|
||||
} else {
|
||||
/* code point>0x10ffff, outside Unicode */
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
break;
|
||||
case 0:
|
||||
return U_SENTINEL;
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
} else {
|
||||
/* c is still the unmasked lead byte here, but illegal short-circuits the checks below. */
illegal=1; /* too few bytes left */
|
||||
}
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
|
||||
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
|
||||
/* error handling */
|
||||
/* don't go beyond this sequence */
|
||||
/*
 * Restart just after the lead byte and skip at most count trail bytes,
 * stopping at limit.
 */
s=*ps;
|
||||
while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
|
||||
++s;
|
||||
--count;
|
||||
}
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
*ps=s;
|
||||
return c;
|
||||
}
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strFromUTF8WithSub(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode){
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
UChar32 ch;
|
||||
int32_t reqLength = 0;
|
||||
const uint8_t* pSrc = (const uint8_t*) src;
|
||||
uint8_t t1, t2; /* trail bytes */
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
|
||||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
|
||||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
|
||||
@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
||||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
int32_t reqLength = 0;
|
||||
int32_t numSubstitutions=0;
|
||||
|
||||
/*
|
||||
* Inline processing of UTF-8 byte sequences:
|
||||
@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
|
||||
* The code explicitly checks for NULs only in the lead byte position.
|
||||
* A NUL byte in the trail byte position fails the trail byte range check anyway.
|
||||
*/
|
||||
while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
|
||||
// modified copy of U8_NEXT()
|
||||
++i;
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(ch<=0xFFFF) {
|
||||
*(pDest++)=(UChar)ch;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Pre-flight the rest of the string. */
|
||||
while((ch = *pSrc) != 0) {
|
||||
if(ch <= 0x7f){
|
||||
while((c = (uint8_t)src[i]) != 0) {
|
||||
// modified copy of U8_NEXT()
|
||||
++i;
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
++reqLength;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
) {
|
||||
++reqLength;
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
) {
|
||||
++reqLength;
|
||||
pSrc += 2;
|
||||
continue;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(c);
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
} else /* srcLength >= 0 */ {
|
||||
const uint8_t *pSrcLimit = pSrc + srcLength;
|
||||
int32_t count;
|
||||
|
||||
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|
||||
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
|
||||
int32_t i = 0;
|
||||
UChar32 c;
|
||||
for(;;) {
|
||||
/*
|
||||
* Each iteration of the inner loop progresses by at most 3 UTF-8
|
||||
@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
||||
* For supplementary code points (4 & 2), which are rare,
|
||||
* there is an additional adjustment.
|
||||
*/
|
||||
count = (int32_t)(pDestLimit - pDest);
|
||||
srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
|
||||
if(count > srcLength) {
|
||||
count = srcLength; /* min(remaining dest, remaining src/3) */
|
||||
int32_t count = (int32_t)(pDestLimit - pDest);
|
||||
int32_t count2 = (srcLength - i) / 3;
|
||||
if(count > count2) {
|
||||
count = count2; /* min(remaining dest, remaining src/3) */
|
||||
}
|
||||
if(count < 3) {
|
||||
/*
|
||||
@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
|
||||
}
|
||||
|
||||
do {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
if(c >= 0xf0 || subchar > 0xffff) {
|
||||
// We may read up to four bytes and write up to two UChars,
|
||||
// which we didn't account for with computing count,
|
||||
// so we adjust it here.
|
||||
if(--count == 0) {
|
||||
--i; // back out byte c
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if(ch >= 0xf0 || subchar > 0xffff) {
|
||||
/*
|
||||
* We may read up to six bytes and write up to two UChars,
|
||||
* which we didn't account for with computing count,
|
||||
* so we adjust it here.
|
||||
*/
|
||||
if(--count == 0) {
|
||||
break;
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}else if(ch<=0xFFFF){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
}
|
||||
}
|
||||
} while(--count > 0);
|
||||
}
|
||||
|
||||
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
while(i < srcLength && (pDest < pDestLimit)) {
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}else if(ch<=0xFFFF){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
if(pDest<pDestLimit){
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
}else{
|
||||
reqLength++;
|
||||
break;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* do not fill the dest buffer just count the UChars needed */
|
||||
while(pSrc < pSrcLimit){
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
reqLength++;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
/* Pre-flight the rest of the string. */
|
||||
while(i < srcLength) {
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
++reqLength;
|
||||
} else {
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(c);
|
||||
}
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
|
||||
uint8_t* pSrc = (uint8_t*) src;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
UChar32 ch;
|
||||
int32_t reqLength = 0;
|
||||
const uint8_t* pSrc = (const uint8_t*) src;
|
||||
const uint8_t *pSrcLimit;
|
||||
int32_t count;
|
||||
uint8_t t1, t2; /* trail bytes */
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
|
||||
@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
int32_t reqLength = 0;
|
||||
int32_t numSubstitutions=0;
|
||||
|
||||
if(srcLength < 0) {
|
||||
/*
|
||||
* Transform a NUL-terminated ASCII string.
|
||||
* Handle non-ASCII strings with slower code.
|
||||
*/
|
||||
while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
UChar32 c;
|
||||
while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
|
||||
*pDest++=(UChar)c;
|
||||
++src;
|
||||
}
|
||||
if(ch == 0) {
|
||||
if(c == 0) {
|
||||
reqLength=(int32_t)(pDest - dest);
|
||||
if(pDestLength) {
|
||||
*pDestLength = reqLength;
|
||||
@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
|
||||
return dest;
|
||||
}
|
||||
srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
|
||||
srcLength = static_cast<int32_t>(uprv_strlen(src));
|
||||
}
|
||||
|
||||
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|
||||
pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
|
||||
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
|
||||
UChar32 ch;
|
||||
uint8_t t1, t2;
|
||||
int32_t i = 0;
|
||||
for(;;) {
|
||||
count = (int32_t)(pDestLimit - pDest);
|
||||
srcLength = (int32_t)(pSrcLimit - pSrc);
|
||||
if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
|
||||
int32_t count = (int32_t)(pDestLimit - pDest);
|
||||
int32_t count2 = srcLength - i;
|
||||
if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
|
||||
/* fast ASCII loop */
|
||||
const uint8_t *prevSrc = pSrc;
|
||||
int32_t delta;
|
||||
while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
int32_t start = i;
|
||||
uint8_t b;
|
||||
while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
|
||||
*pDest++=b;
|
||||
++i;
|
||||
}
|
||||
delta = (int32_t)(pSrc - prevSrc);
|
||||
int32_t delta = i - start;
|
||||
count -= delta;
|
||||
srcLength -= delta;
|
||||
count2 -= delta;
|
||||
}
|
||||
/*
|
||||
* Each iteration of the inner loop progresses by at most 3 UTF-8
|
||||
* bytes and one UChar.
|
||||
*/
|
||||
srcLength /= 3;
|
||||
if(count > srcLength) {
|
||||
count = srcLength; /* min(remaining dest, remaining src/3) */
|
||||
if(subchar > 0xFFFF) {
|
||||
break;
|
||||
}
|
||||
count2 /= 3;
|
||||
if(count > count2) {
|
||||
count = count2; /* min(remaining dest, remaining src/3) */
|
||||
}
|
||||
if(count < 3) {
|
||||
/*
|
||||
@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
break;
|
||||
}
|
||||
do {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
* We need to write two UChars, adjusted count for that,
|
||||
* and ran out of space.
|
||||
*/
|
||||
--i; // back out byte ch
|
||||
break;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
if(subchar<=0xFFFF) {
|
||||
*(pDest++)=(UChar)subchar;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(subchar);
|
||||
*(pDest++)=U16_TRAIL(subchar);
|
||||
}
|
||||
*(pDest++)=(UChar)subchar;
|
||||
}
|
||||
}
|
||||
} while(--count > 0);
|
||||
}
|
||||
|
||||
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
while(i < srcLength && (pDest < pDestLimit)) {
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
(i+1) < srcLength &&
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
i < srcLength &&
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
return NULL;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
if(subchar<=0xFFFF) {
|
||||
*(pDest++)=(UChar)subchar;
|
||||
@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
}
|
||||
}
|
||||
|
||||
/* do not fill the dest buffer just count the UChars needed */
|
||||
while(pSrc < pSrcLimit){
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f) {
|
||||
/* Pre-flight the rest of the string. */
|
||||
while(i < srcLength) {
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)) {
|
||||
reqLength++;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
(i+1) < srcLength &&
|
||||
(uint8_t)(src[i] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(src[i+1] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
i < srcLength &&
|
||||
(uint8_t)(src[i] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
|
||||
return NULL;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
|
@ -847,15 +847,11 @@ U_CDECL_END
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Chunk size.
|
||||
// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes.
|
||||
// Worst case there are six UTF-8 bytes per UChar.
|
||||
// obsolete 6 byte form fd + 5 trails maps to fffd
|
||||
// obsolete 5 byte form fc + 4 trails maps to fffd
|
||||
// non-shortest 4 byte forms maps to fffd
|
||||
// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
|
||||
// mapToUChars array size must allow for the worst case, 6.
|
||||
// This could be brought down to 4, by treating fd and fc as pure illegal,
|
||||
// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
|
||||
// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
|
||||
// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
|
||||
// to two UChars.)
|
||||
// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
|
||||
// is a three-byte sequence (truncated four-byte sequence).
|
||||
//
|
||||
enum { UTF8_TEXT_CHUNK_SIZE=32 };
|
||||
|
||||
@ -895,7 +891,7 @@ struct UTF8Buf {
|
||||
// Requires two extra slots,
|
||||
// one for a supplementary starting in the last normal position,
|
||||
// and one for an entry for the buffer limit position.
|
||||
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
|
||||
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
|
||||
// corresponding offset in filled part of buf.
|
||||
int32_t align;
|
||||
};
|
||||
|
@ -7,7 +7,7 @@
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: utf_impl.c
|
||||
* file name: utf_impl.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
@ -54,10 +54,6 @@
|
||||
* - SUB AX, BX (result)
|
||||
* -finish:
|
||||
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
|
||||
*
|
||||
* In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
|
||||
* lead bytes above 0xf4 are illegal.
|
||||
* We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
|
||||
*/
|
||||
extern "C" U_EXPORT const uint8_t
|
||||
utf8_countTrailBytes[256]={
|
||||
@ -76,27 +72,24 @@ utf8_countTrailBytes[256]={
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
// illegal C0 & C1
|
||||
// 2-byte lead bytes C2..DF
|
||||
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
|
||||
// 3-byte lead bytes E0..EF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3,
|
||||
3, 3, 3, /* illegal in Unicode */
|
||||
4, 4, 4, 4, /* illegal in Unicode */
|
||||
5, 5, /* illegal in Unicode */
|
||||
0, 0 /* illegal bytes 0xfe and 0xff */
|
||||
// 4-byte lead bytes F0..F4
|
||||
// illegal F5..FF
|
||||
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
static const UChar32
|
||||
utf8_errorValue[6]={
|
||||
// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
|
||||
// but without relying on the obsolete unicode/utf_old.h.
|
||||
0x15, 0x9f, 0xffff,
|
||||
0x10ffff,
|
||||
0x3ffffff, 0x7fffffff
|
||||
0x10ffff
|
||||
};
|
||||
|
||||
static UChar32
|
||||
@ -136,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
|
||||
*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
|
||||
// *pi is one after byte c.
|
||||
int32_t i=*pi;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
|
||||
if(i+count<=length || length<0) {
|
||||
uint8_t trail;
|
||||
|
||||
U8_MASK_LEAD_BYTE(c, count);
|
||||
/* support NUL-terminated strings: do not read beyond the first non-trail byte */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 0:
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
break;
|
||||
case 3:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
/* c>=0x110 would result in code point>0x10ffff, outside Unicode */
|
||||
if(c>=0x110 || trail>0x3f) { break; }
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
/*
|
||||
* test for a surrogate d800..dfff unless we are lenient:
|
||||
* before the last (c<<6), a surrogate is c=360..37f
|
||||
*/
|
||||
if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
if(trail>0x3f) { break; }
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10) */
|
||||
if(c>=utf8_minLegal[count] &&
|
||||
/* strict: forbid non-characters like U+fffe */
|
||||
(strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
|
||||
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
|
||||
if(i==length || c>0xf4) {
|
||||
// end of string, or not a lead byte
|
||||
} else if(c>=0xf0) {
|
||||
// Test for 4-byte sequences first because
|
||||
// U8_NEXT() handles shorter valid sequences inline.
|
||||
uint8_t t1=s[i], t2, t3;
|
||||
c&=7;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f &&
|
||||
++i!=length && (t3=s[i]-0x80)<=0x3f) {
|
||||
++i;
|
||||
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
|
||||
// strict: forbid non-characters like U+fffe
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
} else {
|
||||
/* too few bytes left */
|
||||
count=length-i;
|
||||
}
|
||||
} else if(c>=0xe0) {
|
||||
c&=0xf;
|
||||
if(strict!=-2) {
|
||||
uint8_t t1=s[i], t2;
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f) {
|
||||
++i;
|
||||
c=(c<<12)|((t1&0x3f)<<6)|t2;
|
||||
// strict: forbid non-characters like U+fffe
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
uint8_t t1=s[i]-0x80, t2;
|
||||
if(t1<=0x3f && (c>0 || t1>=0x20) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f) {
|
||||
*pi=i+1;
|
||||
return (c<<12)|(t1<<6)|t2;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xc2) {
|
||||
uint8_t t1=s[i]-0x80;
|
||||
if(t1<=0x3f) {
|
||||
*pi=i+1;
|
||||
return ((c-0xc0)<<6)|t1;
|
||||
}
|
||||
} // else 0x80<=c<0xc2 is not a lead byte
|
||||
|
||||
/* error handling */
|
||||
i=*pi;
|
||||
while(count>0 && U8_IS_TRAIL(s[i])) {
|
||||
++i;
|
||||
--count;
|
||||
}
|
||||
c=errorValue(i-*pi, strict);
|
||||
*pi=i;
|
||||
return c;
|
||||
@ -243,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
|
||||
// *pi is the index of byte c.
|
||||
int32_t i=*pi;
|
||||
uint8_t b, count=1, shift=6;
|
||||
|
||||
if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
|
||||
|
||||
/* extract value bits from the last trail byte */
|
||||
c&=0x3f;
|
||||
|
||||
for(;;) {
|
||||
if(i<=start) {
|
||||
/* no lead byte at all */
|
||||
return errorValue(0, strict);
|
||||
}
|
||||
|
||||
/* read another previous byte */
|
||||
b=s[--i];
|
||||
if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
|
||||
if(b&0x40) {
|
||||
/* lead byte, this will always end the loop */
|
||||
uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
|
||||
|
||||
if(count==shouldCount) {
|
||||
/* set the new position */
|
||||
*pi=i;
|
||||
U8_MASK_LEAD_BYTE(b, count);
|
||||
c|=(UChar32)b<<shift;
|
||||
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
|
||||
/* illegal sequence or (strict and non-character) */
|
||||
if(count>=4) {
|
||||
count=3;
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
*pi=i;
|
||||
return ((b1-0xc0)<<6)|(c&0x3f);
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
// Extract the value bits from the last trail byte.
|
||||
c&=0x3f;
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
b2&=0xf;
|
||||
if(strict!=-2) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
*pi=i;
|
||||
c=(b2<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
/* exit with correct c */
|
||||
}
|
||||
} else {
|
||||
/* the lead byte does not match the number of trail bytes */
|
||||
/* only set the position to the lead byte if it would
|
||||
include the trail byte that we started with */
|
||||
if(count<shouldCount) {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
b1-=0x80;
|
||||
if((b2>0 || b1>=0x20)) {
|
||||
*pi=i;
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
c=errorValue(0, strict);
|
||||
return (b2<<12)|(b1<<6)|c;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} else if(count<5) {
|
||||
/* trail byte */
|
||||
c|=(UChar32)(b&0x3f)<<shift;
|
||||
++count;
|
||||
shift+=6;
|
||||
} else {
|
||||
/* more than 5 trail bytes is illegal */
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
if(0xf0<=b3 && b3<=0xf4) {
|
||||
b3&=7;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
*pi=i;
|
||||
c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(3, strict);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
} else {
|
||||
/* single-byte character precedes trailing bytes */
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(1, strict);
|
||||
}
|
||||
}
|
||||
return c;
|
||||
return errorValue(0, strict);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
|
||||
/* i had been decremented once before the function call */
|
||||
int32_t I=i, Z;
|
||||
uint8_t b;
|
||||
|
||||
/* read at most the 6 bytes s[Z] to s[i], inclusively */
|
||||
if(I-5>start) {
|
||||
Z=I-5;
|
||||
} else {
|
||||
Z=start;
|
||||
}
|
||||
|
||||
/* return I if the sequence starting there is long enough to include i */
|
||||
do {
|
||||
b=s[I];
|
||||
if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
|
||||
break;
|
||||
} else if(b>=0xc0) {
|
||||
if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
|
||||
return I;
|
||||
} else {
|
||||
break;
|
||||
// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
|
||||
int32_t orig_i=i;
|
||||
uint8_t c=s[i];
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
return i;
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
return i;
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
if(0xf0<=b3 && b3<=0xf4) {
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} while(Z<=--I);
|
||||
|
||||
/* return i itself to be consistent with the FWD_1 macro */
|
||||
return i;
|
||||
}
|
||||
return orig_i;
|
||||
}
|
||||
|
@ -20,6 +20,7 @@
|
||||
#define __UTRIE2_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "putilimp.h"
|
||||
#include "udataswp.h"
|
||||
|
||||
@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
|
||||
* is truncated, omitting both the BMP portion and the high range.
|
||||
* - There is a special small index for 2-byte UTF-8, and the initial data
|
||||
* entries are designed for fast 1/2-byte UTF-8 lookup.
|
||||
* Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
|
||||
* and the associated 2-byte indexes are unused.
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
|
||||
/** Internal UTF-8 next-post-increment: get the next code point's data. */
|
||||
#define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
|
||||
uint8_t __lead=(uint8_t)*(src)++; \
|
||||
if(__lead<0xc0) { \
|
||||
if(U8_IS_SINGLE(__lead)) { \
|
||||
(result)=(trie)->ascii[__lead]; \
|
||||
} else { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+0000..U+07FF inline */ \
|
||||
__lead<0xe0 && (src)<(limit) && \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
|
||||
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
|
||||
) { \
|
||||
(src)+=2; \
|
||||
(result)=(trie)->data[ \
|
||||
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
|
||||
((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
|
||||
<<UTRIE2_INDEX_SHIFT)+ \
|
||||
(__t2&UTRIE2_DATA_MASK)]; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
__lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
|
||||
(__t1=(uint8_t)(*(src)-0x80))<=0x3f \
|
||||
) { \
|
||||
++(src); \
|
||||
(result)=(trie)->data[ \
|
||||
(trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
|
||||
__t1]; \
|
||||
} else if( /* handle U+0000..U+CFFF inline */ \
|
||||
__lead<0xed && ((src)+1)<(limit) && \
|
||||
(__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
|
||||
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
|
||||
) { \
|
||||
(src)+=2; \
|
||||
(result)=(trie)->data[ \
|
||||
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
|
||||
(__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
|
||||
<<UTRIE2_INDEX_SHIFT)+ \
|
||||
(__t2&UTRIE2_DATA_MASK)]; \
|
||||
} else { \
|
||||
int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
|
||||
(const uint8_t *)(limit)); \
|
||||
@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
|
||||
/** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
|
||||
#define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
|
||||
uint8_t __b=(uint8_t)*--(src); \
|
||||
if(__b<0x80) { \
|
||||
if(U8_IS_SINGLE(__b)) { \
|
||||
(result)=(trie)->ascii[__b]; \
|
||||
} else { \
|
||||
int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
|
||||
|
@ -49,26 +49,25 @@ UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
|
||||
}
|
||||
// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
|
||||
c = u8[pos++];
|
||||
if(c < 0xc0) {
|
||||
// ASCII 00..7F; trail bytes 80..BF map to error values.
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
// ASCII 00..7F
|
||||
return trie->data32[c];
|
||||
}
|
||||
uint8_t t1, t2;
|
||||
if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
|
||||
// U+0080..U+07FF; 00..7F map to error values.
|
||||
if(0xe0 <= c && c < 0xf0 &&
|
||||
((pos + 1) < length || length < 0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
|
||||
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
|
||||
// U+0800..U+FFFF except surrogates
|
||||
c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
|
||||
pos += 2;
|
||||
return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
|
||||
} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
|
||||
// U+0080..U+07FF
|
||||
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
|
||||
c = ((c & 0x1f) << 6) | t1;
|
||||
++pos;
|
||||
return ce32;
|
||||
} else if(c <= 0xef &&
|
||||
((pos + 1) < length || length < 0) &&
|
||||
(t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
|
||||
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
// U+0800..U+FFFF; caller maps surrogates to error values.
|
||||
c = (UChar)((c << 12) | (t1 << 6) | t2);
|
||||
pos += 2;
|
||||
return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
|
||||
} else {
|
||||
// Function call for supplementary code points and error cases.
|
||||
// Illegal byte sequences yield U+FFFD.
|
||||
@ -158,28 +157,17 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
|
||||
return Collation::FALLBACK_CE32;
|
||||
}
|
||||
c = u8[pos++];
|
||||
if(c < 0xc0) {
|
||||
// ASCII 00..7F; trail bytes 80..BF map to error values.
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
// ASCII 00..7F
|
||||
return trie->data32[c];
|
||||
}
|
||||
uint8_t t1, t2;
|
||||
if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
|
||||
// U+0080..U+07FF; 00..7F map to error values.
|
||||
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
|
||||
c = ((c & 0x1f) << 6) | t1;
|
||||
++pos;
|
||||
if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
|
||||
pos -= 2;
|
||||
} else {
|
||||
return ce32;
|
||||
}
|
||||
} else if(c <= 0xef &&
|
||||
((pos + 1) < length || length < 0) &&
|
||||
(t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
|
||||
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
// U+0800..U+FFFF; caller maps surrogates to error values.
|
||||
c = (UChar)((c << 12) | (t1 << 6) | t2);
|
||||
if(0xe0 <= c && c < 0xf0 &&
|
||||
((pos + 1) < length || length < 0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
|
||||
(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
|
||||
// U+0800..U+FFFF except surrogates
|
||||
c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
|
||||
pos += 2;
|
||||
if(CollationFCD::hasTccc(c) &&
|
||||
(CollationFCD::maybeTibetanCompositeVowel(c) ||
|
||||
@ -188,6 +176,16 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
|
||||
} else {
|
||||
break; // return CE32(BMP)
|
||||
}
|
||||
} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
|
||||
// U+0080..U+07FF
|
||||
uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
|
||||
c = ((c & 0x1f) << 6) | t1;
|
||||
++pos;
|
||||
if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
|
||||
pos -= 2;
|
||||
} else {
|
||||
return ce32;
|
||||
}
|
||||
} else {
|
||||
// Function call for supplementary code points and error cases.
|
||||
// Illegal byte sequences yield U+FFFD.
|
||||
@ -237,7 +235,7 @@ UBool
|
||||
FCDUTF8CollationIterator::previousHasTccc() const {
|
||||
U_ASSERT(state == CHECK_BWD && pos != 0);
|
||||
UChar32 c = u8[pos - 1];
|
||||
if(c < 0x80) { return FALSE; }
|
||||
if(U8_IS_SINGLE(c)) { return FALSE; }
|
||||
int32_t i = pos;
|
||||
U8_PREV_OR_FFFD(u8, 0, i, c);
|
||||
if(c > 0xffff) { c = U16_LEAD(c); }
|
||||
@ -271,7 +269,7 @@ FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
|
||||
if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
if(c < 0x80) {
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
++pos;
|
||||
return c;
|
||||
}
|
||||
@ -309,7 +307,7 @@ FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
|
||||
if(pos == 0) {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
if((c = u8[pos - 1]) < 0x80) {
|
||||
if(U8_IS_SINGLE(c = u8[pos - 1])) {
|
||||
--pos;
|
||||
return c;
|
||||
}
|
||||
|
@ -670,12 +670,13 @@ static void Test_UChar_UTF8_API(void){
|
||||
}
|
||||
|
||||
/* test UTF-8 with single surrogates - illegal in Unicode 3.2 */
|
||||
// Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors.
|
||||
{
|
||||
static const UChar
|
||||
withLead16[]={ 0x1800, 0xd89a, 0x0061 },
|
||||
withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
|
||||
withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
|
||||
withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
|
||||
withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
|
||||
withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
|
||||
static const uint8_t
|
||||
withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
|
||||
withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
|
||||
@ -706,7 +707,7 @@ static void Test_UChar_UTF8_API(void){
|
||||
&err);
|
||||
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
|
||||
0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
|
||||
numSubstitutions!=1) {
|
||||
numSubstitutions!=3) {
|
||||
log_err("error: u_strFromUTF8WithSub(length) failed\n");
|
||||
}
|
||||
|
||||
@ -721,7 +722,7 @@ static void Test_UChar_UTF8_API(void){
|
||||
&err);
|
||||
if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
|
||||
0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
|
||||
numSubstitutions!=1) {
|
||||
numSubstitutions!=3) {
|
||||
log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
|
||||
}
|
||||
|
||||
@ -734,7 +735,7 @@ static void Test_UChar_UTF8_API(void){
|
||||
(const char *)withTrail8, -1,
|
||||
0x50005, &numSubstitutions,
|
||||
&err);
|
||||
if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
|
||||
if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) {
|
||||
log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
|
||||
}
|
||||
|
||||
@ -1015,14 +1016,6 @@ Test_FromUTF8Lenient(void) {
|
||||
log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
|
||||
}
|
||||
|
||||
dest[0]=0x1234;
|
||||
destLength=-1;
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
|
||||
if(dest[0]!=0x1234) {
|
||||
log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
|
||||
}
|
||||
|
||||
/* test normal behavior */
|
||||
number=0; /* string number for log_err() */
|
||||
|
||||
|
@ -350,6 +350,11 @@ static void
|
||||
testTrieUTF8(const char *testName,
|
||||
const UTrie2 *trie, UTrie2ValueBits valueBits,
|
||||
const CheckRange checkRanges[], int32_t countCheckRanges) {
|
||||
// Note: The byte sequence comments refer to the original UTF-8 definition.
|
||||
// Starting with ICU 60, any sequence that is not a prefix of a valid one
|
||||
// is treated as multiple single-byte errors.
|
||||
// For testing, we only rely on U8_... and UTrie2 UTF-8 macros
|
||||
// iterating consistently.
|
||||
static const uint8_t illegal[]={
|
||||
0xc0, 0x80, /* non-shortest U+0000 */
|
||||
0xc1, 0xbf, /* non-shortest U+007f */
|
||||
@ -394,15 +399,36 @@ testTrieUTF8(const char *testName,
|
||||
value=checkRanges[i].value;
|
||||
/* write three legal (or surrogate) code points */
|
||||
U8_APPEND_UNSAFE(s, length, prevCP); /* start of the range */
|
||||
values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
|
||||
if(U_IS_SURROGATE(prevCP)) {
|
||||
// A surrogate byte sequence counts as 3 single-byte errors.
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
} else {
|
||||
values[countValues++]=value;
|
||||
}
|
||||
c=checkRanges[i].limit;
|
||||
prevCP=(prevCP+c)/2; /* middle of the range */
|
||||
U8_APPEND_UNSAFE(s, length, prevCP);
|
||||
values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
|
||||
if(U_IS_SURROGATE(prevCP)) {
|
||||
// A surrogate byte sequence counts as 3 single-byte errors.
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
} else {
|
||||
values[countValues++]=value;
|
||||
}
|
||||
prevCP=c;
|
||||
--c; /* end of the range */
|
||||
U8_APPEND_UNSAFE(s, length, c);
|
||||
values[countValues++]=U_IS_SURROGATE(c) ? errorValue : value;
|
||||
if(U_IS_SURROGATE(prevCP)) {
|
||||
// A surrogate byte sequence counts as 3 single-byte errors.
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
values[countValues++]=errorValue;
|
||||
} else {
|
||||
values[countValues++]=value;
|
||||
}
|
||||
/* write an illegal byte sequence */
|
||||
if(i8<sizeof(illegal)) {
|
||||
U8_FWD_1(illegal, i8, sizeof(illegal));
|
||||
@ -435,17 +461,20 @@ testTrieUTF8(const char *testName,
|
||||
}
|
||||
bytes=0;
|
||||
if(value!=values[i] || i8!=(p-s)) {
|
||||
while(prev8<i8) {
|
||||
bytes=(bytes<<8)|s[prev8++];
|
||||
int32_t k=prev8;
|
||||
while(k<i8) {
|
||||
bytes=(bytes<<8)|s[k++];
|
||||
}
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
|
||||
log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): "
|
||||
"0x%lx instead of 0x%lx\n",
|
||||
testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8),
|
||||
(long)value, (long)values[i]);
|
||||
}
|
||||
if(i8!=(p-s)) {
|
||||
log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n",
|
||||
testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
|
||||
log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
|
||||
testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
|
||||
continue;
|
||||
}
|
||||
++i;
|
||||
@ -471,12 +500,14 @@ testTrieUTF8(const char *testName,
|
||||
}
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
|
||||
log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): "
|
||||
": 0x%lx instead of 0x%lx\n",
|
||||
testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)),
|
||||
(long)value, (long)values[i]);
|
||||
}
|
||||
if(i8!=(p-s)) {
|
||||
log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n",
|
||||
testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
|
||||
log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
|
||||
testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -121,7 +121,7 @@ addUTF8Test(TestNode** root)
|
||||
|
||||
static void TestCodeUnitValues()
|
||||
{
|
||||
static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
|
||||
static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
|
||||
|
||||
int16_t i;
|
||||
for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
|
||||
@ -231,28 +231,31 @@ static void TestGetChar()
|
||||
0x10401, 0x10401, 0x10401 ,
|
||||
0x10401, 0x10401, 0x10401 ,
|
||||
0x10401, 0x10401, 0x10401,
|
||||
0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x65, 0x65, 0x65,
|
||||
0x31, 0x31, 0x31,
|
||||
0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
|
||||
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
-1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
|
||||
};
|
||||
uint16_t i=0;
|
||||
UChar32 c, expected;
|
||||
uint32_t offset=0;
|
||||
|
||||
for(offset=0; offset<sizeof(input); offset++) {
|
||||
if (offset < sizeof(input) - 1) {
|
||||
expected = result[i];
|
||||
if (expected >= 0 && offset < sizeof(input) - 1) {
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
UTF8_GET_CHAR_UNSAFE(input, offset, c);
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
|
||||
}
|
||||
#endif
|
||||
U8_GET_UNSAFE(input, offset, c);
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
|
||||
}
|
||||
}
|
||||
@ -285,146 +288,160 @@ static void TestGetChar()
|
||||
}
|
||||
|
||||
static void TestNextPrevChar() {
|
||||
static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
|
||||
static const uint8_t input[]={
|
||||
0x61,
|
||||
0xf0, 0x90, 0x90, 0x81,
|
||||
0xc0, 0x80, // non-shortest form
|
||||
0xf3, 0xbe, // truncated
|
||||
0xc2, // truncated
|
||||
0x61,
|
||||
0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
|
||||
0x00
|
||||
};
|
||||
static const UChar32 result[]={
|
||||
/* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
|
||||
0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
|
||||
0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
|
||||
0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
|
||||
0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
|
||||
0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
|
||||
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
|
||||
0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
|
||||
0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
|
||||
/* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
|
||||
0x0061, 0x0061, 0x0000, 0x0000,
|
||||
0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
|
||||
0x0000, 0x0000, 0x0061, 0x0061
|
||||
};
|
||||
static const int32_t movedOffset[]={
|
||||
/* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
|
||||
1, 1, 1, 15, 15, 15,
|
||||
5, 5, 5, 14, 14 , 14,
|
||||
3, 3, 3, 9, 13, 13,
|
||||
4, 4, 4, 9, 12, 12,
|
||||
5, 5, 5, 9, 11, 11,
|
||||
7, 7, 7, 10, 10, 10,
|
||||
7, 7, 7, 9, 9, 9,
|
||||
8, 9, 9, 7, 7, 7,
|
||||
9, 9, 9, 7, 7, 7,
|
||||
11, 10, 10, 5, 5, 5,
|
||||
11, 11, 11, 5, 5, 5,
|
||||
12, 12, 12, 1, 1, 1,
|
||||
13, 13, 13, 1, 1, 1,
|
||||
14, 14, 14, 1, 1, 1,
|
||||
14, 15, 15, 1, 1, 1,
|
||||
14, 16, 16, 0, 0, 0,
|
||||
/* next_safe prev_safe_s */
|
||||
1, 15,
|
||||
5, 14,
|
||||
3, 13,
|
||||
4, 12,
|
||||
5, 11,
|
||||
6, 10,
|
||||
7, 9,
|
||||
9, 7,
|
||||
9, 7,
|
||||
10, 6,
|
||||
11, 5,
|
||||
12, 1,
|
||||
13, 1,
|
||||
14, 1,
|
||||
15, 1,
|
||||
16, 0,
|
||||
};
|
||||
/* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
|
||||
|
||||
UChar32 c, expected;
|
||||
uint32_t i=0;
|
||||
uint32_t i=0, j=0;
|
||||
uint32_t offset=0;
|
||||
int32_t setOffset=0;
|
||||
for(offset=0; offset<sizeof(input); offset++){
|
||||
expected=result[i+1];
|
||||
expected=result[i]; // next_safe_ns
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(c != expected){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
|
||||
if(setOffset != movedOffset[j]) {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j], setOffset);
|
||||
}
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
#endif
|
||||
setOffset=offset;
|
||||
U8_NEXT(input, setOffset, sizeof(input), c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
setOffset=offset;
|
||||
U8_NEXT(input, setOffset, sizeof(input), c);
|
||||
if(setOffset != movedOffset[j]) {
|
||||
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
if(setOffset != movedOffset[j]) {
|
||||
log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
offset, movedOffset[j], setOffset);
|
||||
}
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+2], setOffset);
|
||||
}
|
||||
if(c != result[i+2]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
|
||||
}
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
|
||||
if(setOffset != movedOffset[j]) {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j], setOffset);
|
||||
}
|
||||
expected=result[i+1]; // next_safe_s
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
}
|
||||
#endif
|
||||
i=i+6;
|
||||
i=i+4;
|
||||
j=j+2;
|
||||
}
|
||||
|
||||
i=0;
|
||||
i=j=0;
|
||||
for(offset=sizeof(input); offset > 0; --offset){
|
||||
expected=result[i+4];
|
||||
expected=result[i+2]; // prev_safe_ns
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(c != expected){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
|
||||
if(setOffset != movedOffset[j+1]) {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j+1], setOffset);
|
||||
}
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
#endif
|
||||
setOffset=offset;
|
||||
U8_PREV(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
setOffset=offset;
|
||||
U8_PREV(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[j+1]) {
|
||||
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j+1], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_PREV_OR_FFFD(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
if(setOffset != movedOffset[j+1]) {
|
||||
log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
offset, movedOffset[j+1], setOffset);
|
||||
}
|
||||
if(expected<0) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
if(setOffset != movedOffset[i+5]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+5], setOffset);
|
||||
}
|
||||
if(c != result[i+5]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
|
||||
}
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
if(setOffset != movedOffset[j+1]) {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[j+1], setOffset);
|
||||
}
|
||||
expected=result[i+3]; // prev_safe_s
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
}
|
||||
#endif
|
||||
i=i+6;
|
||||
i=i+4;
|
||||
j=j+2;
|
||||
}
|
||||
}
|
||||
|
||||
@ -433,11 +450,13 @@ static void TestNulTerminated() {
|
||||
static const uint8_t input[]={
|
||||
/* 0 */ 0x61,
|
||||
/* 1 */ 0xf0, 0x90, 0x90, 0x81,
|
||||
/* 5 */ 0xc0, 0x80,
|
||||
/* 5 */ 0xc0,
|
||||
/* 6 */ 0x80,
|
||||
/* 7 */ 0xdf, 0x80,
|
||||
/* 9 */ 0xc2,
|
||||
/* 10 */ 0x62,
|
||||
/* 11 */ 0xfd, 0xbe,
|
||||
/* 11 */ 0xfd,
|
||||
/* 12 */ 0xbe,
|
||||
/* 13 */ 0xe0, 0xa0, 0x80,
|
||||
/* 16 */ 0xe2, 0x82, 0xac,
|
||||
/* 19 */ 0xf0, 0x90, 0x90,
|
||||
@ -447,14 +466,16 @@ static void TestNulTerminated() {
|
||||
static const UChar32 result[]={
|
||||
0x61,
|
||||
0x10401,
|
||||
U_SENTINEL,
|
||||
U_SENTINEL, // C0 not a lead byte
|
||||
U_SENTINEL, // 80
|
||||
0x7c0,
|
||||
U_SENTINEL,
|
||||
U_SENTINEL, // C2
|
||||
0x62,
|
||||
U_SENTINEL,
|
||||
U_SENTINEL, // FD not a lead byte
|
||||
U_SENTINEL, // BE
|
||||
0x800,
|
||||
0x20ac,
|
||||
U_SENTINEL,
|
||||
U_SENTINEL, // truncated F0 90 90
|
||||
0
|
||||
};
|
||||
|
||||
@ -544,6 +565,22 @@ static void TestNextPrevNonCharacters() {
|
||||
log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
|
||||
}
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
for(idx=0; idx<(int32_t)sizeof(nonChars);) {
|
||||
UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
|
||||
UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
|
||||
if(ch!=expected) {
|
||||
log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
|
||||
}
|
||||
}
|
||||
for(idx=(int32_t)sizeof(nonChars); idx>0;) {
|
||||
UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
|
||||
UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
|
||||
if(ch!=expected) {
|
||||
log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void TestNextPrevCharUnsafe() {
|
||||
@ -563,58 +600,83 @@ static void TestNextPrevCharUnsafe() {
|
||||
static const UChar32 codePoints[]={
|
||||
0x61,
|
||||
0x10401,
|
||||
0,
|
||||
-1,
|
||||
0x20ac,
|
||||
0xa1,
|
||||
0x10ffff,
|
||||
0
|
||||
};
|
||||
|
||||
UChar32 c;
|
||||
UChar32 c, expected;
|
||||
int32_t i;
|
||||
uint32_t offset;
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
for(i=0, offset=0; offset<sizeof(input); ++i) {
|
||||
UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
|
||||
if(c != codePoints[i]){
|
||||
expected = codePoints[i];
|
||||
if(expected >= 0 && c != expected) {
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, codePoints[i], c);
|
||||
offset, expected, c);
|
||||
}
|
||||
if(offset==6) {
|
||||
// The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
|
||||
// while the new one skips C0 80 together.
|
||||
++offset;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for(i=0, offset=0; offset<sizeof(input); ++i) {
|
||||
U8_NEXT_UNSAFE(input, offset, c);
|
||||
if(c != codePoints[i]){
|
||||
expected = codePoints[i];
|
||||
if(expected >= 0 && c != expected) {
|
||||
log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, codePoints[i], c);
|
||||
offset, expected, c);
|
||||
}
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
|
||||
UTF8_PREV_CHAR_UNSAFE(input, offset, c);
|
||||
if(c != codePoints[i]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, codePoints[i], c);
|
||||
}
|
||||
UTF8_PREV_CHAR_UNSAFE(input, offset, c);
|
||||
expected = codePoints[i];
|
||||
if(expected >= 0 && c != expected) {
|
||||
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
|
||||
U8_PREV_UNSAFE(input, offset, c);
|
||||
if(c != codePoints[i]){
|
||||
log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, codePoints[i], c);
|
||||
}
|
||||
U8_PREV_UNSAFE(input, offset, c);
|
||||
expected = codePoints[i];
|
||||
if(expected >= 0 && c != expected) {
|
||||
log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
|
||||
offset, expected, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void TestFwdBack() {
|
||||
static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
|
||||
static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
|
||||
static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
|
||||
static const uint8_t input[]={
|
||||
0x61,
|
||||
0xF0, 0x90, 0x90, 0x81,
|
||||
0xff,
|
||||
0x62,
|
||||
0xc0,
|
||||
0x80,
|
||||
0x7f,
|
||||
0x8f,
|
||||
0xc0,
|
||||
0x63,
|
||||
0x81,
|
||||
0x90,
|
||||
0x90,
|
||||
0xF0,
|
||||
0x00
|
||||
};
|
||||
static const uint16_t fwd_safe[] ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
|
||||
static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
|
||||
|
||||
static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
|
||||
static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
|
||||
static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
|
||||
static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0};
|
||||
static const uint16_t back_N_safe[] ={18, 17, 15, 11, 10, 8, 7, 0};
|
||||
|
||||
uint32_t offsafe=0;
|
||||
|
||||
@ -707,7 +769,10 @@ static void TestFwdBackUnsafe() {
|
||||
0xf4, 0x8f, 0xbf, 0xbf,
|
||||
0x00
|
||||
};
|
||||
static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
|
||||
// forward unsafe skips only C0
|
||||
static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
|
||||
// backward unsafe skips C0 80 together
|
||||
static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
|
||||
|
||||
int32_t offset;
|
||||
int32_t i;
|
||||
@ -726,17 +791,17 @@ static void TestFwdBackUnsafe() {
|
||||
}
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
|
||||
for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
|
||||
UTF8_BACK_1_UNSAFE(input, offset);
|
||||
if(offset != boundaries[i]){
|
||||
log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
|
||||
if(offset != backBoundaries[i]){
|
||||
log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
|
||||
for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
|
||||
U8_BACK_1_UNSAFE(input, offset);
|
||||
if(offset != boundaries[i]){
|
||||
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
|
||||
if(offset != backBoundaries[i]){
|
||||
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
|
||||
}
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
@ -756,21 +821,21 @@ static void TestFwdBackUnsafe() {
|
||||
}
|
||||
}
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
|
||||
int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
|
||||
for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
|
||||
int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
|
||||
offset=UPRV_LENGTHOF(input);
|
||||
UTF8_BACK_N_UNSAFE(input, offset, i);
|
||||
if(offset != boundaries[j]) {
|
||||
log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
|
||||
if(offset != backBoundaries[j]) {
|
||||
log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
|
||||
int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
|
||||
for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
|
||||
int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
|
||||
offset=UPRV_LENGTHOF(input);
|
||||
U8_BACK_N_UNSAFE(input, offset, i);
|
||||
if(offset != boundaries[j]) {
|
||||
log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
|
||||
if(offset != backBoundaries[j]) {
|
||||
log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1138,8 +1203,12 @@ TestSurrogates() {
|
||||
log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
|
||||
}
|
||||
|
||||
if(is!=iu || il!=iu) {
|
||||
log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
// U8_NEXT() skips only the first byte of a surrogate byte sequence.
|
||||
if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
|
||||
log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
}
|
||||
if(il!=iu) {
|
||||
log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
}
|
||||
|
||||
++k; /* next code point */
|
||||
@ -1175,8 +1244,12 @@ TestSurrogates() {
|
||||
log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
|
||||
}
|
||||
|
||||
if(is!=iu || il !=iu) {
|
||||
log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
// U8_PREV() skips only the last byte of a surrogate byte sequence.
|
||||
if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
|
||||
log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
}
|
||||
if(il !=iu) {
|
||||
log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
|
||||
}
|
||||
|
||||
i=iu; /* go back by one UTF-8 sequence */
|
||||
|
@ -294,24 +294,22 @@ void CollationTest::TestIllegalUTF8() {
|
||||
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
|
||||
|
||||
static const char *strings[] = {
|
||||
// U+FFFD
|
||||
"a\xef\xbf\xbdz",
|
||||
// illegal byte sequences
|
||||
"a\x80z", // trail byte
|
||||
"a\xc1\x81z", // non-shortest form
|
||||
"a\xe0\x82\x83z", // non-shortest form
|
||||
"a\xed\xa0\x80z", // lead surrogate: would be U+D800
|
||||
"a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
|
||||
"a\xf0\x8f\xbf\xbfz", // non-shortest form
|
||||
"a\xf4\x90\x80\x80z" // out of range: would be U+110000
|
||||
// string with U+FFFD == illegal byte sequence
|
||||
u8"a\uFFFDz", "a\x80z", // trail byte
|
||||
u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
|
||||
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
|
||||
};
|
||||
|
||||
StringPiece fffd(strings[0]);
|
||||
for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
|
||||
StringPiece illegal(strings[i]);
|
||||
for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
|
||||
StringPiece fffd(strings[i]);
|
||||
StringPiece illegal(strings[i + 1]);
|
||||
UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
|
||||
if(order != UCOL_EQUAL) {
|
||||
errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
|
||||
errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
|
||||
(int)i, order);
|
||||
}
|
||||
}
|
||||
|
@ -146,7 +146,7 @@ void
|
||||
StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0
|
||||
|| UTF8_COUNT_TRAIL_BYTES(0xC0) != 1
|
||||
|| UTF8_COUNT_TRAIL_BYTES(0xC2) != 1
|
||||
|| UTF8_COUNT_TRAIL_BYTES(0xE0) != 2
|
||||
|| UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) {
|
||||
errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h.");
|
||||
@ -155,7 +155,7 @@ StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
|
||||
// Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated)
|
||||
// have completely different implementations.
|
||||
if (U8_COUNT_TRAIL_BYTES(0x7F) != 0
|
||||
|| U8_COUNT_TRAIL_BYTES(0xC0) != 1
|
||||
|| U8_COUNT_TRAIL_BYTES(0xC2) != 1
|
||||
|| U8_COUNT_TRAIL_BYTES(0xE0) != 2
|
||||
|| U8_COUNT_TRAIL_BYTES(0xF0) != 3) {
|
||||
errln("U8_COUNT_TRAIL_BYTES does not work right! See utf8.h.");
|
||||
|
@ -1881,9 +1881,9 @@ UnicodeStringTest::TestUTF8() {
|
||||
0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
|
||||
};
|
||||
static const UChar expected_utf16[] = {
|
||||
0x41, 0xfffd,
|
||||
0x61, 0xfffd,
|
||||
0xfffd, 0x5a,
|
||||
0x41, 0xfffd, 0xfffd, 0xfffd,
|
||||
0x61, 0xfffd, 0xfffd, 0xfffd,
|
||||
0xfffd, 0xfffd, 0xfffd, 0xfffd,0x5a,
|
||||
0xd900, 0xdc00, 0x7a,
|
||||
0xd800, 0xdc00, 0xd840, 0xdc00,
|
||||
0xdb40, 0xdc00, 0xdbff, 0xdfff
|
||||
|
@ -60,7 +60,6 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
|
||||
TESTCASE_AUTO(Ticket10562);
|
||||
TESTCASE_AUTO(Ticket10983);
|
||||
TESTCASE_AUTO(Ticket12130);
|
||||
TESTCASE_AUTO(Ticket12888);
|
||||
TESTCASE_AUTO(Ticket13344);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
@ -951,10 +950,14 @@ void UTextTest::ErrorTest()
|
||||
UChar buf[10];
|
||||
int n = utext_extract(ut, 0, 9, buf, 10, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(n==5);
|
||||
TEST_ASSERT(n==7);
|
||||
TEST_ASSERT(buf[0] == 0x41);
|
||||
TEST_ASSERT(buf[1] == 0xfffd);
|
||||
TEST_ASSERT(buf[3] == 0xfffd);
|
||||
TEST_ASSERT(buf[2] == 0x42);
|
||||
TEST_ASSERT(buf[3] == 0xfffd);
|
||||
TEST_ASSERT(buf[4] == 0xfffd);
|
||||
TEST_ASSERT(buf[5] == 0xfffd);
|
||||
TEST_ASSERT(buf[6] == 0x43);
|
||||
utext_close(ut);
|
||||
}
|
||||
|
||||
@ -1578,66 +1581,6 @@ void UTextTest::Ticket12130() {
|
||||
utext_close(&ut);
|
||||
}
|
||||
|
||||
// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
|
||||
// six byte utf-8 forms. Original implementation had an assumption that
|
||||
// there would be at most three utf-8 bytes per UTF-16 code unit.
|
||||
// The five and six byte sequences map to a single replacement character.
|
||||
|
||||
void UTextTest::Ticket12888() {
|
||||
const char *badString =
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
|
||||
"\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
|
||||
TEST_SUCCESS(status);
|
||||
for (;;) {
|
||||
UChar32 c = utext_next32(ut.getAlias());
|
||||
if (c == U_SENTINEL) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
int32_t endIdx = utext_getNativeIndex(ut.getAlias());
|
||||
if (endIdx != (int32_t)strlen(badString)) {
|
||||
errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
|
||||
return;
|
||||
}
|
||||
|
||||
for (int32_t prevIndex = endIdx; prevIndex>0;) {
|
||||
UChar32 c = utext_previous32(ut.getAlias());
|
||||
int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
|
||||
if (c != 0xfffd) {
|
||||
errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
|
||||
__FILE__, __LINE__, 0xfffd, c, currentIndex);
|
||||
break;
|
||||
}
|
||||
if (currentIndex != prevIndex - 6) {
|
||||
errln("%s:%d: wrong index. Expected, actual = %d, %d",
|
||||
__FILE__, __LINE__, prevIndex - 6, currentIndex);
|
||||
break;
|
||||
}
|
||||
prevIndex = currentIndex;
|
||||
}
|
||||
}
|
||||
|
||||
// Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
|
||||
// of a supplementary character.
|
||||
|
||||
|
@ -38,7 +38,6 @@ public:
|
||||
void Ticket10562();
|
||||
void Ticket10983();
|
||||
void Ticket12130();
|
||||
void Ticket12888();
|
||||
void Ticket13344();
|
||||
|
||||
private:
|
||||
|
@ -16,11 +16,12 @@ import com.ibm.icu.util.OutputInt;
|
||||
|
||||
/**
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
|
||||
*
|
||||
*
|
||||
* Latin-1: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
|
||||
* Supplementary characters: Call contains() on the parent set.
|
||||
* Supplementary characters: Binary search over
|
||||
* the supplementary part of the parent set's inversion list.
|
||||
*/
|
||||
public final class BMPSet {
|
||||
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
|
||||
@ -34,9 +35,8 @@ public final class BMPSet {
|
||||
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
|
||||
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
|
||||
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
|
||||
*
|
||||
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
|
||||
* runtime.
|
||||
*
|
||||
* Bits for 0..FF are unused (0).
|
||||
*/
|
||||
private int[] table7FF;
|
||||
|
||||
@ -46,9 +46,8 @@ public final class BMPSet {
|
||||
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
|
||||
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
|
||||
* and set.contains(c) must be called.
|
||||
*
|
||||
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
|
||||
* validity checking at runtime.
|
||||
*
|
||||
* Bits for 0..7FF are unused (0).
|
||||
*/
|
||||
private int[] bmpBlockBits;
|
||||
|
||||
@ -127,7 +126,7 @@ public final class BMPSet {
|
||||
/**
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
|
||||
* spanCondition==0 or 1.
|
||||
*
|
||||
*
|
||||
* @param start The start index
|
||||
* @param outCount If not null: Receives the number of code points in the span.
|
||||
* @return the limit (exclusive end) of the span
|
||||
@ -232,7 +231,7 @@ public final class BMPSet {
|
||||
* Symmetrical with span().
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
|
||||
* limit and spanCondition==0 or 1.
|
||||
*
|
||||
*
|
||||
* @return The string index which starts the span (i.e. inclusive).
|
||||
*/
|
||||
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
|
||||
@ -462,10 +461,10 @@ public final class BMPSet {
|
||||
/**
|
||||
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
|
||||
* points in a certain range.
|
||||
*
|
||||
*
|
||||
* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
|
||||
* hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
|
||||
*
|
||||
*
|
||||
* @param c
|
||||
* a character in a subrange of MIN_VALUE..MAX_VALUE
|
||||
* @param lo
|
||||
@ -512,4 +511,3 @@ public final class BMPSet {
|
||||
return (0 != (findCodePoint(c, lo, hi) & 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user