ICU-13311 change illegal-UTF-8 handling in converter code
X-SVN-Rev: 40455
This commit is contained in:
parent
6793988c7b
commit
fa2ddc86c7
@ -31,6 +31,7 @@
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* Prototypes --------------------------------------------------------------- */
|
||||
|
||||
@ -44,51 +45,13 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args
|
||||
|
||||
/* UTF-8 -------------------------------------------------------------------- */
|
||||
|
||||
/* UTF-8 Conversion DATA
|
||||
* for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
|
||||
*/
|
||||
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
|
||||
#define MAXIMUM_UCS2 0x0000FFFF
|
||||
#define MAXIMUM_UTF 0x0010FFFF
|
||||
#define MAXIMUM_UCS4 0x7FFFFFFF
|
||||
#define HALF_SHIFT 10
|
||||
#define HALF_BASE 0x0010000
|
||||
#define HALF_MASK 0x3FF
|
||||
#define SURROGATE_HIGH_START 0xD800
|
||||
#define SURROGATE_HIGH_END 0xDBFF
|
||||
#define SURROGATE_LOW_START 0xDC00
|
||||
#define SURROGATE_LOW_END 0xDFFF
|
||||
|
||||
/* -SURROGATE_LOW_START + HALF_BASE */
|
||||
#define SURROGATE_LOW_BASE 9216
|
||||
|
||||
static const uint32_t offsetsFromUTF8[7] = {0,
|
||||
static const uint32_t offsetsFromUTF8[5] = {0,
|
||||
(uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
|
||||
(uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
|
||||
(uint32_t) 0x03C82080
|
||||
};
|
||||
|
||||
/* END OF UTF-8 Conversion DATA */
|
||||
|
||||
static const int8_t bytesFromUTF8[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
|
||||
};
|
||||
|
||||
/*
|
||||
* Starting with Unicode 3.0.1:
|
||||
* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
|
||||
* byte sequences with more than 4 bytes are illegal in UTF-8,
|
||||
* which is tested with impossible values for them
|
||||
*/
|
||||
static const uint32_t
|
||||
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
|
||||
|
||||
static UBool hasCESU8Data(const UConverter *cnv)
|
||||
{
|
||||
#if UCONFIG_ONLY_HTML_CONVERSION
|
||||
@ -127,7 +90,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
||||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
{
|
||||
ch = *(mySource++);
|
||||
if (ch < 0x80) /* Simple case */
|
||||
if (U8_IS_SINGLE(ch)) /* Simple case */
|
||||
{
|
||||
*(myTarget++) = (UChar) ch;
|
||||
}
|
||||
@ -135,7 +98,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
||||
{
|
||||
/* store the first char */
|
||||
toUBytes[0] = (char)ch;
|
||||
inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
|
||||
inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
|
||||
i = 1;
|
||||
|
||||
morebytes:
|
||||
@ -144,7 +107,8 @@ morebytes:
|
||||
if (mySource < sourceLimit)
|
||||
{
|
||||
toUBytes[i] = (char) (ch2 = *mySource);
|
||||
if (!U8_IS_TRAIL(ch2))
|
||||
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
|
||||
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
@ -162,24 +126,12 @@ morebytes:
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
|
||||
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
|
||||
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
if (i == inBytes && (!isCESU8 || i <= 3))
|
||||
{
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= MAXIMUM_UCS2)
|
||||
{
|
||||
@ -189,9 +141,8 @@ morebytes:
|
||||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= HALF_BASE;
|
||||
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
|
||||
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
|
||||
*(myTarget++) = U16_LEAD(ch);
|
||||
ch = U16_TRAIL(ch);
|
||||
if (myTarget < targetLimit)
|
||||
{
|
||||
*(myTarget++) = (UChar)ch;
|
||||
@ -256,7 +207,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
|
||||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
{
|
||||
ch = *(mySource++);
|
||||
if (ch < 0x80) /* Simple case */
|
||||
if (U8_IS_SINGLE(ch)) /* Simple case */
|
||||
{
|
||||
*(myTarget++) = (UChar) ch;
|
||||
*(myOffsets++) = offsetNum++;
|
||||
@ -264,7 +215,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
|
||||
else
|
||||
{
|
||||
toUBytes[0] = (char)ch;
|
||||
inBytes = bytesFromUTF8[ch];
|
||||
inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
|
||||
i = 1;
|
||||
|
||||
morebytes:
|
||||
@ -273,7 +224,8 @@ morebytes:
|
||||
if (mySource < sourceLimit)
|
||||
{
|
||||
toUBytes[i] = (char) (ch2 = *mySource);
|
||||
if (!U8_IS_TRAIL(ch2))
|
||||
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
|
||||
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
@ -290,24 +242,12 @@ morebytes:
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
|
||||
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
|
||||
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
if (i == inBytes && (!isCESU8 || i <= 3))
|
||||
{
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= MAXIMUM_UCS2)
|
||||
{
|
||||
@ -318,10 +258,9 @@ morebytes:
|
||||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= HALF_BASE;
|
||||
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
|
||||
*(myTarget++) = U16_LEAD(ch);
|
||||
*(myOffsets++) = offsetNum;
|
||||
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
|
||||
ch = U16_TRAIL(ch);
|
||||
if (myTarget < targetLimit)
|
||||
{
|
||||
*(myTarget++) = (UChar)ch;
|
||||
@ -616,10 +555,9 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
||||
UConverter *cnv;
|
||||
const uint8_t *sourceInitial;
|
||||
const uint8_t *source;
|
||||
uint16_t extraBytesToWrite;
|
||||
uint8_t myByte;
|
||||
UChar32 ch;
|
||||
int8_t i, isLegalSequence;
|
||||
int8_t i;
|
||||
|
||||
/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
|
||||
|
||||
@ -633,14 +571,14 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
||||
}
|
||||
|
||||
myByte = (uint8_t)*(source++);
|
||||
if (myByte < 0x80)
|
||||
if (U8_IS_SINGLE(myByte))
|
||||
{
|
||||
args->source = (const char *)source;
|
||||
return (UChar32)myByte;
|
||||
}
|
||||
|
||||
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
|
||||
if (extraBytesToWrite == 0) {
|
||||
uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
|
||||
if (countTrailBytes == 0) {
|
||||
cnv->toUBytes[0] = myByte;
|
||||
cnv->toULength = 1;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
@ -649,15 +587,17 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
||||
}
|
||||
|
||||
/*The byte sequence is longer than the buffer area passed*/
|
||||
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
|
||||
if (((const char *)source + countTrailBytes) > args->sourceLimit)
|
||||
{
|
||||
/* check if all of the remaining bytes are trail bytes */
|
||||
uint16_t extraBytesToWrite = countTrailBytes + 1;
|
||||
cnv->toUBytes[0] = myByte;
|
||||
i = 1;
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
while(source < (const uint8_t *)args->sourceLimit) {
|
||||
if(U8_IS_TRAIL(myByte = *source)) {
|
||||
cnv->toUBytes[i++] = myByte;
|
||||
uint8_t b = *source;
|
||||
if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
|
||||
cnv->toUBytes[i++] = b;
|
||||
++source;
|
||||
} else {
|
||||
/* error even before we run out of input */
|
||||
@ -670,81 +610,28 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
isLegalSequence = 1;
|
||||
ch = myByte << 6;
|
||||
switch(extraBytesToWrite)
|
||||
{
|
||||
/* note: code falls through cases! (sic)*/
|
||||
case 6:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
if(countTrailBytes == 2) {
|
||||
uint8_t t1 = *source, t2;
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 5:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
} else if(countTrailBytes == 1) {
|
||||
uint8_t t1 = *source;
|
||||
if(U8_IS_TRAIL(t1)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (ch + t1) - offsetsFromUTF8[2];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 4:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
} else { // countTrailBytes == 3
|
||||
uint8_t t1 = *source, t2, t3;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
|
||||
U8_IS_TRAIL(t3 = *++source)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 3:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
ch += (myByte = *source);
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
};
|
||||
ch -= offsetsFromUTF8[extraBytesToWrite];
|
||||
args->source = (const char *)source;
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (isLegalSequence &&
|
||||
(uint32_t)ch <= MAXIMUM_UTF &&
|
||||
(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
|
||||
!U_IS_SURROGATE(ch)
|
||||
) {
|
||||
return ch; /* return the code point */
|
||||
}
|
||||
args->source = (const char *)source;
|
||||
|
||||
for(i = 0; sourceInitial < source; ++i) {
|
||||
cnv->toUBytes[i] = *sourceInitial++;
|
||||
@ -757,14 +644,6 @@ U_CDECL_END
|
||||
|
||||
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
|
||||
|
||||
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
|
||||
static const UChar32
|
||||
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
|
||||
static const UChar32
|
||||
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
|
||||
U_CDECL_BEGIN
|
||||
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
|
||||
static void U_CALLCONV
|
||||
@ -812,39 +691,35 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
} else {
|
||||
/*
|
||||
* Use a single counter for source and target, counting the minimum of
|
||||
* the source length and the target capacity.
|
||||
* As a result, the source length is checked only once per multi-byte
|
||||
* character instead of twice.
|
||||
*
|
||||
* Make sure that the last byte sequence is complete, or else
|
||||
* stop just before it.
|
||||
* (The longest legal byte sequence has 3 trail bytes.)
|
||||
* Count oldToULength (number of source bytes from a previous buffer)
|
||||
* into the source length but reduce the source index by toULimit
|
||||
* while going back over trail bytes in order to not go back into
|
||||
* the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* Let the standard converter handle edge cases.
|
||||
*/
|
||||
int32_t i;
|
||||
|
||||
// Use a single counter for source and target, counting the minimum of
|
||||
// the source length and the target capacity.
|
||||
// Let the standard converter handle edge cases.
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(i<3 && i<(count-toULimit)) {
|
||||
b=source[count-oldToULength-i-1];
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* stop converting before the lead byte if there are not enough trail bytes for it */
|
||||
count-=i+1;
|
||||
// The conversion loop checks count>0 only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the count to stop before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=count-toULimit;
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
count-=2;
|
||||
}
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--count;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -859,17 +734,17 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
/* conversion loop */
|
||||
while(count>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
*target++=b;
|
||||
--count;
|
||||
continue;
|
||||
} else {
|
||||
if(b>0xe0) {
|
||||
if( /* handle U+1000..U+D7FF inline */
|
||||
(t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
|
||||
(b==0xed && (t1 <= 0x9f))) &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
if(b>=0xe0) {
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
b<0xf0 &&
|
||||
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
|
||||
U8_IS_TRAIL(t2=source[1])
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
@ -878,10 +753,10 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
count-=3;
|
||||
continue;
|
||||
}
|
||||
} else if(b<0xe0) {
|
||||
} else {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
b>=0xc2 &&
|
||||
(t1=*source) >= 0x80 && t1 <= 0xbf
|
||||
U8_IS_TRAIL(t1=*source)
|
||||
) {
|
||||
++source;
|
||||
*target++=b;
|
||||
@ -889,30 +764,18 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
count-=2;
|
||||
continue;
|
||||
}
|
||||
} else if(b==0xe0) {
|
||||
if( /* handle U+0800..U+0FFF inline */
|
||||
(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
*target++=t1;
|
||||
*target++=t2;
|
||||
count-=3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
if(source<sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
@ -934,18 +797,7 @@ moreBytes:
|
||||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
/* legal byte sequence for BMP code point */
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* legal byte sequence for supplementary code point */
|
||||
} else {
|
||||
if(toULength!=toULimit) {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
source-=(toULength-oldToULength);
|
||||
while(oldToULength<toULength) {
|
||||
@ -979,7 +831,7 @@ moreBytes:
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
} else {
|
||||
b=*source;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
if(toULimit>(sourceLimit-source)) {
|
||||
/* collect a truncated byte sequence */
|
||||
toULength=0;
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "unicode/utf8.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* control optimizations according to the platform */
|
||||
#define LATIN1_UNROLL_FROM_UNICODE 1
|
||||
@ -374,7 +375,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
*target++=(uint8_t)b;
|
||||
--targetCapacity;
|
||||
@ -409,7 +410,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
|
||||
utf8->toULength=1;
|
||||
utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
utf8->mode=U8_COUNT_BYTES(b);
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
|
@ -59,6 +59,7 @@
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* control optimizations according to the platform */
|
||||
#define MBCS_UNROLL_SINGLE_TO_BMP 1
|
||||
@ -5011,13 +5012,9 @@ ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
|
||||
|
||||
/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
|
||||
|
||||
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
|
||||
static const UChar32
|
||||
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
|
||||
static const UChar32
|
||||
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
|
||||
static void U_CALLCONV
|
||||
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
@ -5075,28 +5072,27 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
toULength=oldToULength=toULimit=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the last byte sequence before sourceLimit is complete
|
||||
* or runs into a lead byte.
|
||||
* Do not go back into the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* In the conversion loop compare source with sourceLimit only once
|
||||
* per multi-byte character.
|
||||
*/
|
||||
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the sourceLimit to before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
int32_t i, length;
|
||||
|
||||
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
for(i=0; i<3 && i<length;) {
|
||||
b=*(sourceLimit-i-1);
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
|
||||
sourceLimit-=i+1;
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
sourceLimit-=2;
|
||||
}
|
||||
break;
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--sourceLimit;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5130,7 +5126,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
|
||||
*target++=(uint8_t)b;
|
||||
@ -5185,7 +5181,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
@ -5198,7 +5194,7 @@ moreBytes:
|
||||
*/
|
||||
if(source<(uint8_t *)pToUArgs->sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
@ -5220,22 +5216,18 @@ moreBytes:
|
||||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
value=0;
|
||||
} else {
|
||||
if(toULength==toULimit) {
|
||||
c-=utf8_offsets[toULength];
|
||||
if(toULength<=3) { /* BMP */
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
} else {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
value=0;
|
||||
} else {
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
@ -5310,7 +5302,7 @@ moreBytes:
|
||||
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
c=utf8->toUBytes[0]=b=*source++;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
while(source<sourceLimit) {
|
||||
utf8->toUBytes[toULength++]=b=*source++;
|
||||
c=(c<<6)+b;
|
||||
@ -5375,28 +5367,27 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
toULength=oldToULength=toULimit=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the last byte sequence before sourceLimit is complete
|
||||
* or runs into a lead byte.
|
||||
* Do not go back into the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* In the conversion loop compare source with sourceLimit only once
|
||||
* per multi-byte character.
|
||||
*/
|
||||
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the sourceLimit to before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
int32_t i, length;
|
||||
|
||||
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
for(i=0; i<3 && i<length;) {
|
||||
b=*(sourceLimit-i-1);
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
|
||||
sourceLimit-=i+1;
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
sourceLimit-=2;
|
||||
}
|
||||
break;
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--sourceLimit;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5412,7 +5403,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
|
||||
*target++=b;
|
||||
@ -5426,13 +5417,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(b>0xe0) {
|
||||
if( /* handle U+1000..U+D7FF inline */
|
||||
(((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
|
||||
(b==0xed && (t1 <= 0x1f))) &&
|
||||
if(b>=0xe0) {
|
||||
if( /* handle U+0800..U+D7FF inline */
|
||||
b<=0xed && // do not assume maxFastUChar>0xd7ff
|
||||
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
|
||||
(t2=(uint8_t)(source[1]-0x80)) <= 0x3f
|
||||
) {
|
||||
c=((b&0xf)<<6)|t1;
|
||||
c=((b&0xf)<<6)|(t1&0x3f);
|
||||
source+=2;
|
||||
value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
|
||||
if(value==0) {
|
||||
@ -5442,7 +5433,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
} else {
|
||||
c=-1;
|
||||
}
|
||||
} else if(b<0xe0) {
|
||||
} else {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
b>=0xc2 &&
|
||||
(t1=(uint8_t)(*source-0x80)) <= 0x3f
|
||||
@ -5457,15 +5448,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
} else {
|
||||
c=-1;
|
||||
}
|
||||
} else {
|
||||
c=-1;
|
||||
}
|
||||
|
||||
if(c<0) {
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
@ -5478,7 +5467,7 @@ moreBytes:
|
||||
*/
|
||||
if(source<(uint8_t *)pToUArgs->sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
@ -5500,22 +5489,18 @@ moreBytes:
|
||||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
stage2Entry=0;
|
||||
} else {
|
||||
if(toULength==toULimit) {
|
||||
c-=utf8_offsets[toULength];
|
||||
if(toULength<=3) { /* BMP */
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
} else {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
stage2Entry=0;
|
||||
} else {
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
@ -5620,7 +5605,7 @@ unassigned:
|
||||
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
c=utf8->toUBytes[0]=b=*source++;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
while(source<sourceLimit) {
|
||||
utf8->toUBytes[toULength++]=b=*source++;
|
||||
c=(c<<6)+b;
|
||||
|
@ -53,8 +53,8 @@
|
||||
* @internal
|
||||
*/
|
||||
#define U8_COUNT_TRAIL_BYTES(leadByte) \
|
||||
((uint8_t)(leadByte)<=0xf4 ? \
|
||||
((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)
|
||||
(U8_IS_LEAD(leadByte) ? \
|
||||
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
|
||||
|
||||
/**
|
||||
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
|
||||
@ -80,29 +80,35 @@
|
||||
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||
|
||||
/**
|
||||
* Internal bit vector for 3-byte UTF-8 validity check.
|
||||
* Lead byte E0..EF bits 3..0 as byte index,
|
||||
* first trail byte bits 7..5 as bit index into that byte.
|
||||
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
|
||||
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
|
||||
* Lead byte E0..EF bits 3..0 are used as byte index,
|
||||
* first trail byte bits 7..5 are used as bit index into that byte.
|
||||
* @see U8_IS_VALID_LEAD3_AND_T1
|
||||
* @internal
|
||||
*/
|
||||
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
|
||||
|
||||
/**
|
||||
* Internal 3-byte UTF-8 validity check.
|
||||
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
|
||||
* @internal
|
||||
*/
|
||||
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
|
||||
|
||||
/**
|
||||
* Internal bit vector for 4-byte UTF-8 validity check.
|
||||
* First trail byte bits 7..4 as byte index,
|
||||
* lead byte F0..F4 bits 2..0 as bit index into that byte.
|
||||
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
|
||||
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
|
||||
* First trail byte bits 7..4 are used as byte index,
|
||||
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
|
||||
* @see U8_IS_VALID_LEAD4_AND_T1
|
||||
* @internal
|
||||
*/
|
||||
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
|
||||
|
||||
/**
|
||||
* Internal 4-byte UTF-8 validity check.
|
||||
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
|
||||
* @internal
|
||||
*/
|
||||
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
|
||||
@ -166,7 +172,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 lead byte?
|
||||
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
@ -175,7 +181,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
// 0x32=0xf4-0xc2
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
|
@ -18,6 +18,7 @@
|
||||
#define __USTR_IMP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/utf8.h"
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for strncmp style.
|
||||
@ -81,4 +82,62 @@ u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorC
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
|
||||
|
||||
/**
 * Number of bytes in a complete, valid UTF-8 sequence that starts with leadByte.
 * ASCII bytes (0..0x7f) yield 1; every other byte is delegated to
 * U8_COUNT_BYTES_NON_ASCII, which yields 0 for 0x80..0xc1 and 0xf5..0xff.
 * The argument may be evaluated more than once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0..4
 */
#define U8_COUNT_BYTES(leadByte) \
    (!U8_IS_SINGLE(leadByte) ? U8_COUNT_BYTES_NON_ASCII(leadByte) : 1)
|
||||
|
||||
/**
 * Number of bytes in a complete, valid UTF-8 sequence that starts with leadByte,
 * for non-ASCII lead bytes only.
 * Anything that is not a lead byte according to U8_IS_LEAD yields 0
 * (0x00..0xc1 as well as 0xf5..0xff).
 * The argument may be evaluated more than once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0 or 2..4
 */
#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : 2+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UTF8 {
|
||||
public:
|
||||
UTF8() = delete; // all static
|
||||
|
||||
/**
|
||||
* Is t a valid UTF-8 trail byte?
|
||||
*
|
||||
* @param prev Must be the preceding lead byte if i==1 and length>=3;
|
||||
* otherwise ignored.
|
||||
* @param t The i-th byte following the lead byte.
|
||||
* @param i The index (1..3) of byte t in the byte sequence. 0<i<length
|
||||
* @param length The length (2..4) of the byte sequence according to the lead byte.
|
||||
* @return TRUE if t is a valid trail byte in this context.
|
||||
*/
|
||||
static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
|
||||
// The first trail byte after a 3- or 4-byte lead byte
|
||||
// needs to be validated together with its lead byte.
|
||||
if (length <= 2 || i > 1) {
|
||||
return U8_IS_TRAIL(t);
|
||||
} else if (length == 3) {
|
||||
return U8_IS_VALID_LEAD3_AND_T1(prev, t);
|
||||
} else { // length == 4
|
||||
return U8_IS_VALID_LEAD4_AND_T1(prev, t);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif
|
||||
|
@ -281,13 +281,13 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(1, strict);
|
||||
@ -318,12 +318,12 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
|
@ -2495,6 +2495,26 @@ static UBool getTestChar(UConverter *cnv, const char *converterName,
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static UBool isOneTruncatedUTF8(const char *s, int32_t length) {
|
||||
if(length==0) {
|
||||
return FALSE;
|
||||
} else if(length==1) {
|
||||
return U8_IS_LEAD(s[0]);
|
||||
} else {
|
||||
int32_t count=U8_COUNT_TRAIL_BYTES(s[0]);
|
||||
if(length<=count) {
|
||||
// 2 or more bytes, but fewer than the lead byte indicates.
|
||||
int32_t oneLength=0;
|
||||
U8_FWD_1(s, oneLength, length);
|
||||
// Truncated if we reach the end of the string.
|
||||
// Not true if the lead byte and first trail byte do not start a valid sequence,
|
||||
// e.g., E0 80 -> oneLength=1.
|
||||
return oneLength==length;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
|
||||
char charUTF8[4], int32_t charUTF8Length,
|
||||
char char0[8], int32_t char0Length,
|
||||
@ -2526,7 +2546,7 @@ static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const ch
|
||||
for(i=0; i<UPRV_LENGTHOF(badUTF8); ++i) {
|
||||
/* truncated sequence? */
|
||||
int32_t length=strlen(badUTF8[i]);
|
||||
if(length>=(1+U8_COUNT_TRAIL_BYTES(badUTF8[i][0]))) {
|
||||
if(!isOneTruncatedUTF8(badUTF8[i], length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1482,7 +1482,7 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
|
||||
if(!testConvertFromUnicode(testinput, UPRV_LENGTHOF(testinput),
|
||||
expectedUTF8, UPRV_LENGTHOF(expectedUTF8), "utf8",
|
||||
UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) {
|
||||
log_err("u-> utf8 with stop did not match.\n");
|
||||
log_err("u-> utf8 with substitute did not match.\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1614,8 +1614,8 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
|
||||
{
|
||||
const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c,
|
||||
0xe0, 0x80, 0x61,};
|
||||
UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0x0061};
|
||||
int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0006};
|
||||
UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061};
|
||||
int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0006};
|
||||
|
||||
if(!testConvertToUnicode(sampleText1, UPRV_LENGTHOF(sampleText1),
|
||||
expected1, UPRV_LENGTHOF(expected1),"utf8",
|
||||
|
@ -963,8 +963,8 @@ static void TestWithBufferSize(int32_t insize, int32_t outsize){
|
||||
{
|
||||
const uint8_t sampleText1[] = { 0x31, 0xe4, 0xba, 0x8c,
|
||||
0xe0, 0x80, 0x61};
|
||||
UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0x0061};
|
||||
int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0006};
|
||||
UChar expected1[] = { 0x0031, 0x4e8c, 0xfffd, 0xfffd, 0x0061};
|
||||
int32_t offsets1[] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0006};
|
||||
|
||||
if(!testConvertToU(sampleText1, sizeof(sampleText1),
|
||||
expected1, UPRV_LENGTHOF(expected1),"utf8", UCNV_TO_U_CALLBACK_SUBSTITUTE, offsets1,FALSE))
|
||||
|
@ -1113,26 +1113,36 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
|
||||
0xf4, 0x8f, 0xbf, 0xbf, /* 10FFFF */
|
||||
0xdf, 0xbf, /* 7ff */
|
||||
0xbf, /* truncated tail */
|
||||
0xf4, 0x90, 0x80, 0x80, /* 11FFFF */
|
||||
0xf4, 0x90, 0x80, 0x80, /* 110000 */
|
||||
0x02
|
||||
};
|
||||
|
||||
static const uint16_t utf8Expected[]={
|
||||
0x0061,
|
||||
0xfffd,
|
||||
0xfffd, 0xfffd, 0xfffd, 0xfffd,
|
||||
0x0000,
|
||||
0x0062,
|
||||
0xfffd,
|
||||
0xfffd,
|
||||
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
|
||||
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
|
||||
0xdbff, 0xdfff,
|
||||
0x07ff,
|
||||
0xfffd,
|
||||
0xfffd,
|
||||
0xfffd, 0xfffd, 0xfffd, 0xfffd,
|
||||
0x0002
|
||||
};
|
||||
|
||||
static const int32_t utf8Offsets[]={
|
||||
0, 1, 5, 6, 7, 12, 17, 17, 21, 23, 24, 28
|
||||
0,
|
||||
1, 2, 3, 4,
|
||||
5,
|
||||
6,
|
||||
7, 8, 9, 10, 11,
|
||||
12, 13, 14, 15, 16,
|
||||
17, 17,
|
||||
21,
|
||||
23,
|
||||
24, 25, 26, 27,
|
||||
28
|
||||
};
|
||||
testConvertToU(utf8, sizeof(utf8),
|
||||
utf8Expected, UPRV_LENGTHOF(utf8Expected), "utf-8", utf8Offsets ,FALSE);
|
||||
|
4
icu4c/source/test/testdata/conversion.txt
vendored
4
icu4c/source/test/testdata/conversion.txt
vendored
@ -763,9 +763,9 @@ conversion:table(nofallback) {
|
||||
// surrogates in CESU-8
|
||||
{ "CESU-8", :bin{ eda080eda081edb081 }, "\ud800\U00010401", :intvector{ 0, 3, 6 }, :int{1}, :int{0}, "", "", :bin{""} }
|
||||
// e080 is a partial sequence
|
||||
{ "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffda", :intvector{ 0, 1, 2, 5, 7 }, :int{0}, :int{0}, "", "", :bin{ e080 } }
|
||||
{ "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffd\ufffda", :intvector{ 0, 1, 2, 5, 6, 7 }, :int{0}, :int{0}, "", "", :bin{ 80 } }
|
||||
// fbbfbfbfbf exceeds U+10ffff
|
||||
{ "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffda", :intvector{ 0, 1, 6 }, :int{0}, :int{0}, "", "", :bin{ fbbfbfbfbf } }
|
||||
{ "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffd\ufffd\ufffd\ufffd\ufffda", :intvector{ 0, 1, 2, 3, 4, 5, 6 }, :int{0}, :int{0}, "", "", :bin{ bf } }
|
||||
|
||||
// lead byte a2 without trail byte
|
||||
{ "ibm-1363", :bin{ a2aea2 }, "\u00a1", :intvector{ 0 }, :int{1}, :int{0}, "truncated", ".", :bin{ a2 } }
|
||||
|
@ -36,26 +36,7 @@ class CharsetUTF8 extends CharsetICU {
|
||||
maxCharsPerByte = 1;
|
||||
}
|
||||
|
||||
private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 };
|
||||
|
||||
private static final byte BYTES_FROM_UTF8[] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
|
||||
};
|
||||
|
||||
/*
|
||||
* Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or
|
||||
* above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is
|
||||
* tested with impossible values for them
|
||||
*/
|
||||
private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000,
|
||||
Integer.MAX_VALUE, Integer.MAX_VALUE };
|
||||
private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7 };
|
||||
|
||||
private final boolean isCESU8 = this instanceof CharsetCESU8;
|
||||
|
||||
@ -92,9 +73,9 @@ class CharsetUTF8 extends CharsetICU {
|
||||
|
||||
if (mode == 0) {
|
||||
/* nothing is stored in toUnicodeStatus, read a byte as input */
|
||||
char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff;
|
||||
bytesExpected = BYTES_FROM_UTF8[char32];
|
||||
char32 &= BITMASK_FROM_UTF8[bytesExpected];
|
||||
toUBytesArray[0] = ch = sourceArray[sourceIndex++];
|
||||
bytesExpected = UTF8.countBytes(ch);
|
||||
char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesSoFar = 1;
|
||||
} else {
|
||||
/* a partially or fully built code point is stored in toUnicodeStatus */
|
||||
@ -118,8 +99,9 @@ class CharsetUTF8 extends CharsetICU {
|
||||
cr = CoderResult.UNDERFLOW;
|
||||
break;
|
||||
}
|
||||
if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {
|
||||
/* not a trail byte (is not of the form 10xxxxxx) */
|
||||
toUBytesArray[bytesSoFar] = ch = sourceArray[sourceIndex++];
|
||||
if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected)
|
||||
&& !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) {
|
||||
sourceIndex--;
|
||||
toULength = bytesSoFar;
|
||||
cr = CoderResult.malformedForLength(bytesSoFar);
|
||||
@ -127,8 +109,7 @@ class CharsetUTF8 extends CharsetICU {
|
||||
}
|
||||
char32 = (char32 << 6) | (ch & 0x3f);
|
||||
bytesSoFar++;
|
||||
} else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
|
||||
&& (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {
|
||||
} else if (bytesSoFar == bytesExpected && (!isCESU8 || bytesSoFar <= 3)) {
|
||||
/*
|
||||
* char32 is a valid code point and is composed of the correct number of
|
||||
* bytes ... we now need to output it in UTF-16
|
||||
@ -168,8 +149,8 @@ class CharsetUTF8 extends CharsetICU {
|
||||
}
|
||||
|
||||
/* keep reading the next input (and writing it) while bytes == 1 */
|
||||
while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) {
|
||||
targetArray[targetIndex++] = (char) char32;
|
||||
while (UTF8.isSingle(ch = sourceArray[sourceIndex++])) {
|
||||
targetArray[targetIndex++] = (char) ch;
|
||||
if (sourceIndex >= sourceLimit) {
|
||||
cr = CoderResult.UNDERFLOW;
|
||||
break outer;
|
||||
@ -179,9 +160,11 @@ class CharsetUTF8 extends CharsetICU {
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
toUBytesArray[0] = ch;
|
||||
|
||||
/* remove the bits that indicate the number of bytes */
|
||||
char32 &= BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesExpected = UTF8.countBytes(ch);
|
||||
char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesSoFar = 1;
|
||||
} else {
|
||||
/*
|
||||
@ -212,9 +195,9 @@ class CharsetUTF8 extends CharsetICU {
|
||||
|
||||
if (mode == 0) {
|
||||
/* nothing is stored in toUnicodeStatus, read a byte as input */
|
||||
char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff;
|
||||
bytesExpected = BYTES_FROM_UTF8[char32];
|
||||
char32 &= BITMASK_FROM_UTF8[bytesExpected];
|
||||
toUBytesArray[0] = ch = source.get(sourceIndex++);
|
||||
bytesExpected = UTF8.countBytes(ch);
|
||||
char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesSoFar = 1;
|
||||
} else {
|
||||
/* a partially or fully built code point is stored in toUnicodeStatus */
|
||||
@ -238,8 +221,9 @@ class CharsetUTF8 extends CharsetICU {
|
||||
cr = CoderResult.UNDERFLOW;
|
||||
break;
|
||||
}
|
||||
if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) {
|
||||
/* not a trail byte (is not of the form 10xxxxxx) */
|
||||
toUBytesArray[bytesSoFar] = ch = source.get(sourceIndex++);
|
||||
if (!UTF8.isValidTrail(char32, ch, bytesSoFar, bytesExpected)
|
||||
&& !(isCESU8 && bytesSoFar == 1 && char32 == 0xd && UTF8.isTrail(ch))) {
|
||||
sourceIndex--;
|
||||
toULength = bytesSoFar;
|
||||
cr = CoderResult.malformedForLength(bytesSoFar);
|
||||
@ -247,21 +231,7 @@ class CharsetUTF8 extends CharsetICU {
|
||||
}
|
||||
char32 = (char32 << 6) | (ch & 0x3f);
|
||||
bytesSoFar++;
|
||||
}
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff
|
||||
&& (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {
|
||||
} else if (bytesSoFar == bytesExpected && (!isCESU8 || bytesSoFar <= 3)) {
|
||||
/*
|
||||
* char32 is a valid code point and is composed of the correct number of
|
||||
* bytes ... we now need to output it in UTF-16
|
||||
@ -305,8 +275,8 @@ class CharsetUTF8 extends CharsetICU {
|
||||
}
|
||||
|
||||
/* keep reading the next input (and writing it) while bytes == 1 */
|
||||
while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff]) == 1) {
|
||||
target.put(targetIndex++, (char) char32);
|
||||
while (UTF8.isSingle(ch = source.get(sourceIndex++))) {
|
||||
target.put(targetIndex++, (char) ch);
|
||||
if (sourceIndex >= sourceLimit) {
|
||||
cr = CoderResult.UNDERFLOW;
|
||||
break outer;
|
||||
@ -316,9 +286,11 @@ class CharsetUTF8 extends CharsetICU {
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
toUBytesArray[0] = ch;
|
||||
|
||||
/* remove the bits that indicate the number of bytes */
|
||||
char32 &= BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesExpected = UTF8.countBytes(ch);
|
||||
char32 = ch & BITMASK_FROM_UTF8[bytesExpected];
|
||||
bytesSoFar = 1;
|
||||
} else {
|
||||
/*
|
||||
@ -658,32 +630,6 @@ class CharsetUTF8 extends CharsetICU {
|
||||
return (byte) (0x80 | (char32 & 0x3f));
|
||||
}
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
*/
|
||||
// static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
|
||||
/*
|
||||
* Is this code unit (byte) a UTF-8 lead byte?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
*/
|
||||
// static final boolean isLead(byte c) {return ((((c)-0xc0) &
|
||||
// UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
|
||||
/*
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
*
|
||||
* @param c
|
||||
* 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
*/
|
||||
/*private static final boolean isTrail(byte c) {
|
||||
return (((c) & 0xc0) == 0x80);
|
||||
}*/
|
||||
|
||||
@Override
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF8(this);
|
||||
|
172
icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java
Normal file
172
icu4j/main/classes/charset/src/com/ibm/icu/charset/UTF8.java
Normal file
@ -0,0 +1,172 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
/**
 * Partial Java port of ICU4C unicode/utf8.h and ustr_imp.h.
 *
 * <p>All members are static. Byte parameters are Java's signed {@code byte};
 * the methods treat them as the unsigned values 0..0xff.
 */
class UTF8 {
    /**
     * Counts the trail bytes for a UTF-8 lead byte.
     * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
     *
     * @param leadByte The first byte of a UTF-8 sequence, interpreted as unsigned 0..0xff.
     * @return 0..3
     */
    static int countTrailBytes(byte leadByte) {
        // Signed comparisons: ASCII bytes (>= 0) fall through to the last branch
        // and yield 0 there because they are greater than (byte)0xf4.
        if (leadByte < (byte)0xe0) {
            return leadByte < (byte)0xc2 ? 0 : 1;
        } else if (leadByte < (byte)0xf0) {
            return 2;
        } else {
            return leadByte <= (byte)0xf4 ? 3 : 0;
        }
    }

    /**
     * Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
     * Returns 1 for ASCII 0..0x7f.
     * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
     *
     * @param leadByte The first byte of a UTF-8 sequence, interpreted as unsigned 0..0xff.
     * @return 0..4
     */
    static int countBytes(byte leadByte) {
        if (leadByte >= 0) {
            return 1;
        } else if (leadByte < (byte)0xe0) {
            return leadByte < (byte)0xc2 ? 0 : 2;
        } else if (leadByte < (byte)0xf0) {
            return 3;
        } else {
            return leadByte <= (byte)0xf4 ? 4 : 0;
        }
    }

    /**
     * Internal bit vector for 3-byte UTF-8 validity check, for use in {@link #isValidLead3AndT1}.
     * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
     * Lead byte E0..EF bits 3..0 are used as data int index,
     * first trail byte bits 7..5 are used as bit index into that int.
     *
     * @see #isValidLead3AndT1
     */
    private static final int[] U8_LEAD3_T1_BITS = {
        0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30
    };

    /**
     * Internal 3-byte UTF-8 validity check.
     * Only bits 3..0 of lead are used.
     *
     * @param lead E0..EF
     * @param t1 00..FF
     * @return true if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
     */
    static boolean isValidLead3AndT1(int lead, byte t1) {
        return (U8_LEAD3_T1_BITS[lead & 0xf] & (1 << ((t1 & 0xff) >> 5))) != 0;
    }

    /**
     * Internal bit vector for 4-byte UTF-8 validity check, for use in {@link #isValidLead4AndT1}.
     * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
     * Lead byte F0..F4 bits 2..0 are used as data int index,
     * first trail byte bits 7..4 are used as bit index into that int.
     *
     * @see #isValidLead4AndT1
     */
    private static final int[] U8_LEAD4_T1_BITS = {
        0x0e00, 0x0f00, 0x0f00, 0x0f00, 0x0100
    };

    /**
     * Internal 4-byte UTF-8 validity check.
     * Only bits 2..0 of lead are used; they must be 0..4 because the table has five entries.
     *
     * @param lead F0..F4
     * @param t1 00..FF
     * @return true if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
     */
    static boolean isValidLead4AndT1(int lead, byte t1) {
        return (U8_LEAD4_T1_BITS[lead & 7] & (1 << ((t1 & 0xff) >> 4))) != 0;
    }

    /**
     * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
     *
     * @param c 8-bit code unit (byte)
     * @return true if c is an ASCII byte
     */
    static boolean isSingle(byte c) {
        return c >= 0;
    }

    /**
     * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
     *
     * @param c 8-bit code unit (byte)
     * @return true if c is a lead byte
     */
    static boolean isLead(byte c) {
        return ((c - 0xc2) & 0xff) <= 0x32; // 0x32=0xf4-0xc2
    }

    /**
     * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
     *
     * @param c 8-bit code unit (byte)
     * @return true if c is a trail byte
     */
    static boolean isTrail(byte c) {
        // Signed comparison: 0x80..0xBF are the byte values -128..-65.
        return c < (byte)0xc0;
    }

    /**
     * How many code units (bytes) are used for the UTF-8 encoding
     * of this Unicode code point?
     *
     * @param c 32-bit code point
     * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
     */
    static int length(int c) {
        if (c >= 0) {
            if (c <= 0x7f) {
                return 1;
            } else if (c <= 0x7ff) {
                return 2;
            } else if (c <= 0xd7ff) {
                return 3;
            } else if (c <= 0xffff) {
                return c >= 0xe000 ? 3 : 0;
            } else if (c <= 0x10ffff) {
                return 4;
            }
        }
        return 0;
    }

    /**
     * 4: The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
     */
    static final int MAX_LENGTH = 4;

    /**
     * Is t a valid UTF-8 trail byte?
     *
     * @param prev Must be the preceding lead byte if i==1 and length>=3;
     *             otherwise ignored.
     * @param t The i-th byte following the lead byte.
     * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
     * @param length The length (2..4) of the byte sequence according to the lead byte.
     * @return true if t is a valid trail byte in this context.
     */
    static boolean isValidTrail(int prev, byte t, int i, int length) {
        // The first trail byte after a 3- or 4-byte lead byte
        // needs to be validated together with its lead byte.
        if (length <= 2 || i > 1) {
            return isTrail(t);
        } else if (length == 3) {
            return isValidLead3AndT1(prev, t);
        } else {  // length == 4
            return isValidLead4AndT1(prev, t);
        }
    }
}
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:fd856769e94b963fb8a0b63148c63349198ef0c0ec3729173170ccbfd94c4999
|
||||
size 812769
|
||||
oid sha256:a99e848a9249a672092d5fc14d8fe02dc5728ad1f3548c287a9d1c5b12088013
|
||||
size 812760
|
||||
|
@ -562,7 +562,7 @@ public class TestCharset extends TestFmwk {
|
||||
cr = decoder.decode(bs, us, true);
|
||||
bs.rewind();
|
||||
us.rewind();
|
||||
if (!cr.isMalformed() || cr.length() != 3) {
|
||||
if (!cr.isMalformed() || cr.length() != 1) {
|
||||
errln("Incorrect result in " + converter + " decoder for 0x"
|
||||
+ Integer.toHexString(i) + " received " + cr);
|
||||
break;
|
||||
@ -584,7 +584,7 @@ public class TestCharset extends TestFmwk {
|
||||
cr = decoder.decode(bs, us, true);
|
||||
bs.rewind();
|
||||
us.rewind();
|
||||
if (!cr.isMalformed() || cr.length() != 3) {
|
||||
if (!cr.isMalformed() || cr.length() != 1) {
|
||||
errln("Incorrect result in " + converter + " decoder for 0x"
|
||||
+ Integer.toHexString(i) + " received " + cr);
|
||||
break;
|
||||
@ -4653,7 +4653,7 @@ public class TestCharset extends TestFmwk {
|
||||
//decoding code coverage
|
||||
//test malform error
|
||||
decoder.reset();
|
||||
bs.put((byte)0xC0); bs.put((byte)0xC0);
|
||||
bs.put((byte)0xC2); bs.put((byte)0xC2);
|
||||
us.put((char)0x0000);
|
||||
bs2 = bs.asReadOnlyBuffer();
|
||||
|
||||
|
@ -39,18 +39,18 @@ import junitparams.JUnitParamsRunner;
|
||||
import junitparams.Parameters;
|
||||
|
||||
/**
|
||||
* This maps to convtest.c which tests the test file for data-driven conversion tests.
|
||||
*
|
||||
* This maps to convtest.c which tests the test file for data-driven conversion tests.
|
||||
*
|
||||
*/
|
||||
@RunWith(JUnitParamsRunner.class)
|
||||
public class TestConversion extends TestFmwk {
|
||||
/**
|
||||
* This maps to the C struct of conversion case in convtest.h that stores the
|
||||
* data for a conversion test
|
||||
*
|
||||
*
|
||||
*/
|
||||
private class ConversionCase {
|
||||
int caseNr; // testcase index
|
||||
int caseNr; // testcase index
|
||||
String option = null; // callback options
|
||||
CodingErrorAction cbErrorAction = null; // callback action type
|
||||
CharBuffer toUnicodeResult = null;
|
||||
@ -64,7 +64,7 @@ public class TestConversion extends TestFmwk {
|
||||
boolean finalFlush; // flush
|
||||
boolean fallbacks; // fallback
|
||||
String outErrorCode; // errorCode
|
||||
String cbopt; // callback
|
||||
String cbopt; // callback
|
||||
|
||||
// TestGetUnicodeSet variables
|
||||
String map;
|
||||
@ -91,7 +91,7 @@ public class TestConversion extends TestFmwk {
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private List<TestDataPair> getTestData() throws Exception {
|
||||
private List<TestDataPair> getTestData() throws Exception {
|
||||
return ModuleTest.getTestData("com/ibm/icu/dev/data/testdata/", "conversion");
|
||||
}
|
||||
|
||||
@ -132,7 +132,7 @@ public class TestConversion extends TestFmwk {
|
||||
// private methods -------------------------------------------------------
|
||||
|
||||
|
||||
// fromUnicode test worker functions ---------------------------------------
|
||||
// fromUnicode test worker functions ---------------------------------------
|
||||
private void TestFromUnicode(DataMap testcase, int caseNr) {
|
||||
|
||||
ConversionCase cc = new ConversionCase();
|
||||
@ -154,7 +154,7 @@ public class TestConversion extends TestFmwk {
|
||||
errln("error parsing conversion/toUnicode test case " + cc.caseNr);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Skip the following data driven converter tests.
|
||||
* These tests were added to the data driven conversion test in ICU
|
||||
@ -215,7 +215,7 @@ public class TestConversion extends TestFmwk {
|
||||
break;
|
||||
}
|
||||
|
||||
// check for any options for the callback value --
|
||||
// check for any options for the callback value --
|
||||
cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt
|
||||
.substring(1);
|
||||
if (cc.option == null) {
|
||||
@ -225,7 +225,7 @@ public class TestConversion extends TestFmwk {
|
||||
FromUnicodeCase(cc);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void FromUnicodeCase(ConversionCase cc) {
|
||||
// create charset encoder for conversion test
|
||||
CharsetProviderICU provider = new CharsetProviderICU();
|
||||
@ -238,7 +238,7 @@ public class TestConversion extends TestFmwk {
|
||||
"com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
|
||||
: (Charset) provider.charsetForName(cc.charset);
|
||||
if (charset != null) {
|
||||
encoder = (CharsetEncoder) charset.newEncoder();
|
||||
encoder = charset.newEncoder();
|
||||
encoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||
encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
|
||||
if (encoder instanceof CharsetEncoderICU) {
|
||||
@ -260,7 +260,7 @@ public class TestConversion extends TestFmwk {
|
||||
return;
|
||||
}
|
||||
|
||||
// set the callback for the encoder
|
||||
// set the callback for the encoder
|
||||
if (cc.cbErrorAction != null) {
|
||||
if (cc.cbEncoder != null) {
|
||||
((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option);
|
||||
@ -364,7 +364,7 @@ public class TestConversion extends TestFmwk {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) {
|
||||
if (step < 0) {
|
||||
errln("Negative step size, test internal error.");
|
||||
@ -387,7 +387,7 @@ public class TestConversion extends TestFmwk {
|
||||
currentSourceLimit = sourceLen;
|
||||
currentTargetLimit = targetLen;
|
||||
}
|
||||
|
||||
|
||||
CoderResult cr = null;
|
||||
|
||||
for (;;) {
|
||||
@ -529,7 +529,7 @@ public class TestConversion extends TestFmwk {
|
||||
"com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
|
||||
: (Charset) provider.charsetForName(cc.charset);
|
||||
if (charset != null) {
|
||||
decoder = (CharsetDecoder) charset.newDecoder();
|
||||
decoder = charset.newDecoder();
|
||||
decoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||
decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
|
||||
}
|
||||
@ -588,13 +588,13 @@ public class TestConversion extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
// Check the step to unicode
|
||||
// Check the step to unicode
|
||||
boolean ok;
|
||||
int resultLength;
|
||||
|
||||
String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
|
||||
{ "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
|
||||
/* TODO: currently not supported test steps, getNext API is not supported for now
|
||||
/* TODO: currently not supported test steps, getNext API is not supported for now
|
||||
{ "-1", "getNext" },
|
||||
{ "-2", "toU(bulk)+getNext" },
|
||||
{ "-3", "getNext+toU(bulk)" },
|
||||
@ -702,14 +702,14 @@ public class TestConversion extends TestFmwk {
|
||||
target.limit(target.capacity());
|
||||
flush = cc.finalFlush;
|
||||
}
|
||||
// convert
|
||||
// convert
|
||||
CoderResult cr = null;
|
||||
if (source.hasRemaining()) {
|
||||
|
||||
cr = decoder.decode(source, target, flush);
|
||||
// check pointers and errors
|
||||
if (cr.isOverflow()) {
|
||||
// the partial target is filled, set a new limit,
|
||||
// the partial target is filled, set a new limit,
|
||||
oStep = (target.position() + step);
|
||||
target.limit((oStep < target.capacity()) ? oStep
|
||||
: target.capacity());
|
||||
@ -733,7 +733,7 @@ public class TestConversion extends TestFmwk {
|
||||
|
||||
cr = decoder.decode(source, target, true);
|
||||
|
||||
//due to limitation of the API we need to check for target limit for expected
|
||||
//due to limitation of the API we need to check for target limit for expected
|
||||
if (target.position() != cc.unicode.length()) {
|
||||
if (target.limit() != cc.unicode.length()) {
|
||||
target.limit(cc.unicode.length());
|
||||
@ -781,7 +781,7 @@ public class TestConversion extends TestFmwk {
|
||||
if (cr.isOverflow()) {
|
||||
|
||||
if (target.limit() >= target.capacity()) {
|
||||
// target has reached its limit, an error occurred
|
||||
// target has reached its limit, an error occurred
|
||||
logln("UnExpected error: Target Buffer is larger than capacity");
|
||||
break;
|
||||
} else {
|
||||
@ -841,7 +841,7 @@ public class TestConversion extends TestFmwk {
|
||||
}
|
||||
CoderResult cr = decoder.decode(source, target, source
|
||||
.limit() == sourceLen);
|
||||
// check pointers and errors
|
||||
// check pointers and errors
|
||||
if (cr.isOverflow()) {
|
||||
// one character has been consumed
|
||||
if (target.limit() >= target.capacity()) {
|
||||
@ -915,12 +915,12 @@ public class TestConversion extends TestFmwk {
|
||||
"com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
|
||||
: (CharsetICU) provider.charsetForName(cc.charset);
|
||||
|
||||
//checking for converter that are not supported at this point
|
||||
//checking for converter that are not supported at this point
|
||||
try{
|
||||
if(charset==null ||
|
||||
charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" ||
|
||||
charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
|
||||
charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" ||
|
||||
charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" ||
|
||||
charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){
|
||||
logln("Converter not supported at this point :" + cc.charset);
|
||||
return;
|
||||
@ -944,7 +944,7 @@ public class TestConversion extends TestFmwk {
|
||||
charset.getUnicodeSet(unicodeset, cc.which);
|
||||
UnicodeSet diffset = new UnicodeSet();
|
||||
|
||||
//are there items that must be in unicodeset but are not?
|
||||
//are there items that must be in unicodeset but are not?
|
||||
(diffset = mapset).removeAll(unicodeset);
|
||||
if(!diffset.isEmpty()){
|
||||
StringBuffer s = new StringBuffer(diffset.toPattern(true));
|
||||
@ -975,11 +975,11 @@ public class TestConversion extends TestFmwk {
|
||||
* This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
|
||||
* start of the stream for example U+FEFF (the Unicode BOM/signature
|
||||
* character) that can be ignored.
|
||||
*
|
||||
*
|
||||
* Detects Unicode signature byte sequences at the start of the byte stream
|
||||
* and returns number of bytes of the BOM of the indicated Unicode charset.
|
||||
* 0 is returned when no Unicode signature is recognized.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
private String detectUnicodeSignature(ByteBuffer source) {
|
||||
|
Loading…
Reference in New Issue
Block a user