ICU-2449 refactor conversion - call toUnicode and getNextUChar callbacks only from ucnv.c framework

X-SVN-Rev: 12731
This commit is contained in:
Markus Scherer 2003-08-01 14:58:43 +00:00
parent d65fa8f193
commit e8b985f363
9 changed files with 430 additions and 995 deletions

View File

@ -518,7 +518,7 @@ static const UConverterImpl _LMBCSImpl##n={\
_LMBCSToUnicodeWithOffsets,\
_LMBCSFromUnicode,\
_LMBCSFromUnicode,\
_LMBCSGetNextUChar,\
NULL,\
NULL,\
NULL,\
NULL,\
@ -930,16 +930,6 @@ _LMBCSFromUnicode(UConverterFromUnicodeArgs* args,
/* Now, the Unicode from LMBCS section */
/*
Special codes for the getNextUnicodeWorker -- usually as the result of
special error-callback behavior:
ULMBCS_SKIP To control skipping over LMBCS sequences
ULMBCS_MULTI To indicate that a single LMBCS char translates to
multiple uniChars
*/
#define ULMBCS_SKIP U_ERROR_LIMIT
#define ULMBCS_MULTI ULMBCS_SKIP+1
/* A function to call when we are looking at the Unicode group byte in LMBCS */
static UChar
GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte stream */
@ -958,26 +948,22 @@ GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode
/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least 'index'
bytes left in source up to sourceLimit. Errors appropriately if not.
If we reach the limit, then update the source pointer to there to consume
all input as required by ICU converter semantics.
*/
#define CHECK_SOURCE_LIMIT(index) \
if (args->source+index > args->sourceLimit){\
*err = U_TRUNCATED_CHAR_FOUND;\
args->source = saveSource;\
args->source = args->sourceLimit;\
return 0xffff;}
/* Return the Unicode representation for the current LMBCS character
This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode().
The last parameter says whether the return value should be treated as UTF-16 or
UTF-32. The only difference is in surrogate handling
*/
/* Return the Unicode representation for the current LMBCS character */
static UChar32
_LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
UErrorCode* err,
UBool returnUTF32)
UErrorCode* err)
{
UChar32 uniChar = 0; /* an output UNICODE char */
ulmbcs_byte_t CurByte; /* A byte from the input stream */
@ -1027,20 +1013,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
else
if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
{
UChar second;
CHECK_SOURCE_LIMIT(2);
uniChar = GetUniFromLMBCSUni(&(args->source));
/* at this point we are usually done, but we need to make sure we are not in
a situation where we can successfully put together a surrogate pair */
if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit)
&& *(args->source)++ == ULMBCS_GRP_UNICODE
&& UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source))))
{
uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
}
/* don't check for error indicators fffe/ffff below */
return GetUniFromLMBCSUni(&(args->source));
}
else if (CurByte <= ULMBCS_CTRLOFFSET)
{
@ -1126,69 +1102,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
}
}
}
if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */
{
UConverterToUnicodeArgs cbArgs = *args;
UConverterCallbackReason reason;
UChar UCh;
if (uniChar == 0xfffe)
{
reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
}
else
{
reason = UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
cbArgs.target = &UCh;
cbArgs.targetLimit = &UCh + 1;
cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext,
&cbArgs,
saveSource,
args->source - saveSource,
reason,
err);
if (cbArgs.target != &UCh)
{
uniChar = (UChar32) UCh;
}
/* Did error functor skip */
if (U_SUCCESS(*err) && cbArgs.target == &UCh)
{
*err = ULMBCS_SKIP;
}
/* Did error functor try to write multiple UChars? */
else if (*err == U_BUFFER_OVERFLOW_ERROR)
{
*err = ULMBCS_MULTI;
}
}
return uniChar;
}
/*
 * Exported getNextUChar entry point for LMBCS: delivers one UTF-32 code point
 * from the LMBCS byte stream described by args.
 *
 * The worker reports two private pseudo error codes in *err:
 *   ULMBCS_SKIP  - the error callback consumed the sequence without producing
 *                  output, so the worker is simply called again;
 *   ULMBCS_MULTI - the error callback produced more than one UChar; for this
 *                  single-code-point API that is not an error, so *err is
 *                  reset to U_ZERO_ERROR.
 * Any other value left in *err is passed through to the caller unchanged.
 *
 * Returns the value produced by the last worker call.
 */
static UChar32
_LMBCSGetNextUChar(UConverterToUnicodeArgs* args,
                   UErrorCode* err)
{
    UChar32 codePoint;

    /* keep pulling characters for as long as the callback asks us to skip */
    for (;;) {
        codePoint = _LMBCSGetNextUCharWorker(args, err, TRUE);
        if (*err != ULMBCS_SKIP) {
            break;
        }
    }

    if (*err == ULMBCS_MULTI) {
        *err = U_ZERO_ERROR;
    }
    return codePoint;
}
/* The exported function that converts lmbcs to one or more
UChars - currently UTF-16
*/
@ -1196,28 +1113,24 @@ static void
_LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
char LMBCS [ULMBCS_CHARSIZE_MAX];
UChar uniChar; /* one output UNICODE char */
const char * saveSource = args->source; /* beginning of current code point */
const char * saveSource; /* beginning of current code point */
const char * pStartLMBCS = args->source; /* beginning of whole string */
const char * errSource = NULL; /* pointer to actual input in case an error occurs */
int8_t savebytes = 0;
if (args->targetLimit == args->target) /* error check may belong in common code */
{
*err = U_BUFFER_OVERFLOW_ERROR;
return;
}
/* Process from source to limit, or until error */
while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target)
while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
{
saveSource = args->source; /* beginning of current code point */
if (args->converter->toULength) /* reassemble char from previous call */
{
char LMBCS [ULMBCS_CHARSIZE_MAX];
const char *pLMBCS = LMBCS, *saveSourceLimit;
const char *saveSourceLimit;
size_t size_old = args->converter->toULength;
/* limit from source is either reminder of temp buffer, or user limit on source */
/* limit from source is either remainder of temp buffer, or user limit on source */
size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
size_t size_new_maybe_2 = args->sourceLimit - args->source;
size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
@ -1226,18 +1139,16 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
uprv_memcpy(LMBCS + size_old, args->source, size_new);
saveSourceLimit = args->sourceLimit;
args->source = pLMBCS;
args->sourceLimit = pLMBCS+size_old+size_new;
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
pLMBCS = args->source;
args->source =saveSource;
args->source = errSource = LMBCS;
args->sourceLimit = LMBCS+size_old+size_new;
savebytes = (int8_t)(size_old+size_new);
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
args->source = saveSource + ((args->source - LMBCS) - size_old);
args->sourceLimit = saveSourceLimit;
args->source += (pLMBCS - LMBCS - size_old);
if (*err == U_TRUNCATED_CHAR_FOUND)
{
/* evil special case: source buffers so small a char spans more than 2 buffers */
int8_t savebytes = (int8_t)(size_old+size_new);
args->converter->toULength = savebytes;
uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
args->source = args->sourceLimit;
@ -1252,7 +1163,9 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
}
else
{
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
errSource = saveSource;
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
savebytes = (int8_t)(args->source - saveSource);
}
if (U_SUCCESS(*err))
{
@ -1273,52 +1186,22 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
*err = U_ILLEGAL_CHAR_FOUND;
}
}
else if (*err == ULMBCS_MULTI)
{
UChar * pUChar = args->converter->UCharErrorBuffer;
int8_t BufferLength = args->converter->UCharErrorBufferLength;
*err = U_ZERO_ERROR;
do
{ /* error functor wants to write multiple UniChars */
*(args->target)++ = uniChar;
if(args->offsets)
{
*(args->offsets)++ = saveSource - pStartLMBCS;
}
uniChar = *pUChar++;
}
while(BufferLength-- && args->targetLimit > args->target);
if (++BufferLength > 0)
{ /* fix up remaining UChars that can't fit in caller's buffer */
uprv_memmove( args->converter->UCharErrorBuffer,
args->converter->UCharErrorBuffer + args->converter->UCharErrorBufferLength - BufferLength,
sizeof(UChar) * BufferLength);
}
args->converter->UCharErrorBufferLength = BufferLength;
}
else if (*err == ULMBCS_SKIP)
{
*err = U_ZERO_ERROR; /* and just go around again..*/
}
}
/* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
/* If character incomplete, store away partial char if more to come */
if (*err == U_TRUNCATED_CHAR_FOUND)
else if (U_FAILURE(*err))
{
args->source = args->sourceLimit;
{
int8_t savebytes = (int8_t)(args->sourceLimit - saveSource);
args->converter->toULength = (int8_t)savebytes;
uprv_memcpy(args->converter->toUBytes, saveSource, savebytes);
*err = U_ZERO_ERROR;
}
/* If character incomplete or unmappable/illegal, store it in toUBytes[] */
args->converter->toULength = savebytes;
if (savebytes > 0) {
uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
}
if (*err == U_TRUNCATED_CHAR_FOUND) {
*err = U_ZERO_ERROR;
}
}
}

View File

@ -359,52 +359,51 @@ _UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
# define _UTF16LEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
#endif
static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
UChar32 myUChar;
uint16_t first;
/*Checks boundaries and set appropriate error codes*/
if (args->source+2 > args->sourceLimit)
{
if (args->source >= args->sourceLimit)
{
/*Either caller has reached the end of the byte stream*/
*err = U_INDEX_OUTOFBOUNDS_ERROR;
}
else
{
/* a character was cut in half*/
*err = U_TRUNCATED_CHAR_FOUND;
}
static UChar32
_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
const uint8_t *s, *sourceLimit;
UChar32 c;
s=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
if(s>=sourceLimit) {
/* no input */
*err=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/*Gets the corresponding codepoint*/
first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1)));
myUChar = first;
args->source += 2;
if(s+2>sourceLimit) {
/* only one byte: truncated UChar */
pArgs->converter->toUBytes[0]=*s++;
pArgs->converter->toULength=1;
pArgs->source=(const char *)s;
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
if(UTF_IS_FIRST_SURROGATE(first)) {
uint16_t second;
/* get one UChar */
c=((UChar32)*s<<8)|s[1];
s+=2;
if (args->source+2 > args->sourceLimit) {
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/*
* check for surrogate pairs
* surrogate code points are not currently considered an error
* TODO see Jitterbug 1838
*/
if(U16_IS_LEAD(c) && s+2<=sourceLimit) {
UChar trail;
/* get the second surrogate and assemble the code point */
second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1)));
/* ignore unmatched surrogates and just deliver the first one in such a case */
if(UTF_IS_SECOND_SURROGATE(second)) {
/* matched pair, get pair value */
myUChar = UTF16_GET_PAIR_VALUE(first, second);
args->source += 2;
/* get a second UChar and see if it is a trail surrogate */
trail=((UChar)*s<<8)|s[1];
if(U16_IS_TRAIL(trail)) {
c=U16_GET_SUPPLEMENTARY(c, trail);
s+=2;
}
}
return myUChar;
pArgs->source=(const char *)s;
return c;
}
static const UConverterImpl _UTF16BEImpl={
@ -421,7 +420,7 @@ static const UConverterImpl _UTF16BEImpl={
_UTF16BEToUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
T_UConverter_getNextUChar_UTF16_BE,
_UTF16BEGetNextUChar,
NULL,
NULL,
@ -450,57 +449,51 @@ const UConverterSharedData _UTF16BEData={
/* UTF-16LE ----------------------------------------------------------------- */
static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
UChar32 myUChar;
uint16_t first;
/*Checks boundaries and set appropriate error codes*/
if (args->source+2 > args->sourceLimit)
{
if (args->source >= args->sourceLimit)
{
/*Either caller has reached the end of the byte stream*/
*err = U_INDEX_OUTOFBOUNDS_ERROR;
}
else
{
/* a character was cut in half*/
*err = U_TRUNCATED_CHAR_FOUND;
}
static UChar32
_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
const uint8_t *s, *sourceLimit;
UChar32 c;
s=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
if(s>=sourceLimit) {
/* no input */
*err=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/*Gets the corresponding codepoint*/
first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source))));
myUChar=first;
/*updates the source*/
args->source += 2;
if(s+2>sourceLimit) {
/* only one byte: truncated UChar */
pArgs->converter->toUBytes[0]=*s++;
pArgs->converter->toULength=1;
pArgs->source=(const char *)s;
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
if (UTF_IS_FIRST_SURROGATE(first))
{
uint16_t second;
/* get one UChar */
c=((UChar32)s[1]<<8)|*s;
s+=2;
if (args->source+2 > args->sourceLimit)
{
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/*
* check for surrogate pairs
* surrogate code points are not currently considered an error
* TODO see Jitterbug 1838
*/
if(U16_IS_LEAD(c) && s+2<=sourceLimit) {
UChar trail;
/* get the second surrogate and assemble the code point */
second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source))));
/* ignore unmatched surrogates and just deliver the first one in such a case */
if(UTF_IS_SECOND_SURROGATE(second))
{
/* matched pair, get pair value */
myUChar = UTF16_GET_PAIR_VALUE(first, second);
args->source += 2;
/* get a second UChar and see if it is a trail surrogate */
trail=((UChar)s[1]<<8)|*s;
if(U16_IS_TRAIL(trail)) {
c=U16_GET_SUPPLEMENTARY(c, trail);
s+=2;
}
}
return myUChar;
pArgs->source=(const char *)s;
return c;
}
static const UConverterImpl _UTF16LEImpl={
@ -517,7 +510,7 @@ static const UConverterImpl _UTF16LEImpl={
_UTF16LEToUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
T_UConverter_getNextUChar_UTF16_LE,
_UTF16LEGetNextUChar,
NULL,
NULL,
@ -725,9 +718,9 @@ _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
switch(pArgs->converter->mode) {
case 8:
return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode);
return _UTF16BEGetNextUChar(pArgs, pErrorCode);
case 9:
return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode);
return _UTF16LEGetNextUChar(pArgs, pErrorCode);
default:
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}

View File

@ -35,62 +35,6 @@
/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE 9216
/**
 * Reports an invalid input sequence to the converter's to-Unicode error
 * callback. The converter is presumed to have a callback installed.
 *
 * If *err does not already indicate a failure, it is set according to reason:
 * U_ILLEGAL_CHAR_FOUND for UCNV_ILLEGAL, U_INVALID_CHAR_FOUND otherwise.
 * The first invalidCharLength bytes of toUBytes are copied into
 * invalidCharBuffer and that buffer is handed to the callback.
 *
 * @param args   conversion arguments; args->converter supplies the buffers,
 *               the callback and its context
 * @param reason why the sequence is being reported
 * @param err    in/out error code, possibly modified by the callback
 * @returns true when *err still indicates a failure after the callback ran
 */
static UBool
T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
                                            UConverterCallbackReason reason,
                                            UErrorCode *err)
{
    UConverter *cnv = args->converter;

    /* only set an error code if the caller has not already provided one */
    if (U_SUCCESS(*err)) {
        *err = (reason == UCNV_ILLEGAL) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
    }

    /* stage the offending bytes where the callback expects to find them */
    uprv_memcpy(cnv->invalidCharBuffer, cnv->toUBytes, cnv->invalidCharLength);

    /* let the installed error function decide how to proceed */
    cnv->fromCharErrorBehaviour(cnv->toUContext,
                                args,
                                cnv->invalidCharBuffer,
                                cnv->invalidCharLength,
                                reason,
                                err);

    return (UBool)U_FAILURE(*err);
}
/*
 * Same as T_UConverter_toUnicode_InvalidChar_Callback(), but additionally
 * fixes up the offsets array: every offsets slot that was advanced past
 * during the callback is overwritten with currentOffset.
 *
 * Returns the result of the underlying callback helper (true on failure).
 */
static UBool
T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
                                                  int32_t currentOffset,
                                                  UConverterCallbackReason reason,
                                                  UErrorCode *err)
{
    /* remember where the offsets pointer stood before the callback moved it */
    int32_t *slot = args->offsets;
    UBool callbackFailed = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);

    for (; slot < args->offsets; ++slot) {
        *slot = currentOffset;
    }
    return callbackFailed;
}
/* UTF-32BE ----------------------------------------------------------------- */
static void
@ -166,17 +110,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->invalidCharLength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
}
args->converter->invalidCharLength = 0;
mySource = (unsigned char *) args->source;
myTarget = args->target;
args->converter->toULength = (int8_t)i;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@ -268,19 +204,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->invalidCharLength = (int8_t)i;
args->offsets = myOffsets;
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
}
args->converter->invalidCharLength = 0;
mySource = (unsigned char *) args->source;
myTarget = args->target;
myOffsets = args->offsets;
args->converter->toULength = (int8_t)i;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
offsetNum += i;
}
@ -464,65 +390,44 @@ static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
UChar myUCharBuf[2];
UChar *myUCharPtr;
const unsigned char *mySource;
const uint8_t *mySource;
UChar32 myUChar;
int32_t length;
while (args->source < args->sourceLimit)
mySource = (const uint8_t *)args->source;
if (mySource >= (const uint8_t *)args->sourceLimit)
{
if (args->source + 4 > args->sourceLimit)
{
/* got a partial character */
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/* Don't even try to do a direct cast because the value may be on an odd address. */
mySource = (unsigned char *) args->source;
myUChar = (mySource[0] << 24)
| (mySource[1] << 16)
| (mySource[2] << 8)
| (mySource[3]);
args->source = (const char *)(mySource + 4);
if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
return myUChar;
}
uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
args->converter->invalidCharLength = 4;
myUCharPtr = myUCharBuf;
*err = U_ILLEGAL_CHAR_FOUND;
args->target = myUCharPtr;
args->targetLimit = myUCharBuf + 2;
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
args,
(const char *)mySource,
4,
UCNV_ILLEGAL,
err);
if(U_SUCCESS(*err)) {
length = (uint16_t)(args->target - myUCharBuf);
if(length > 0) {
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
}
/* else (callback did not write anything) continue */
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
*err = U_ZERO_ERROR;
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
} else {
/* break on error */
/* ### what if a callback set an error but _also_ generated output?! */
return 0xffff;
}
/* no input */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/* no input or only skipping callbacks */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
if (length < 4)
{
/* got a partial character */
uprv_memcpy(args->converter->toUBytes, mySource, length);
args->converter->toULength = (int8_t)length;
args->source = (const char *)(mySource + length);
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/* Don't even try to do a direct cast because the value may be on an odd address. */
myUChar = ((UChar32)mySource[0] << 24)
| ((UChar32)mySource[1] << 16)
| ((UChar32)mySource[2] << 8)
| ((UChar32)mySource[3]);
args->source = (const char *)(mySource + 4);
if ((uint32_t)myUChar <= MAXIMUM_UTF) {
return myUChar;
}
uprv_memcpy(args->converter->toUBytes, mySource, 4);
args->converter->toULength = 4;
*err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
@ -643,17 +548,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->invalidCharLength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
}
args->converter->invalidCharLength = 0;
mySource = (unsigned char *) args->source;
myTarget = args->target;
args->converter->toULength = (int8_t)i;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@ -747,19 +644,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->invalidCharLength = (int8_t)i;
args->offsets = myOffsets;
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
}
args->converter->invalidCharLength = 0;
mySource = (unsigned char *) args->source;
myTarget = args->target;
myOffsets = args->offsets;
args->converter->toULength = (int8_t)i;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
offsetNum += i;
}
@ -935,65 +822,44 @@ static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
UChar myUCharBuf[2];
UChar *myUCharPtr;
const unsigned char *mySource;
const uint8_t *mySource;
UChar32 myUChar;
int32_t length;
while (args->source < args->sourceLimit)
mySource = (const uint8_t *)args->source;
if (mySource >= (const uint8_t *)args->sourceLimit)
{
if (args->source + 4 > args->sourceLimit)
{
/* got a partial character */
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/* Don't even try to do a direct cast because the value may be on an odd address. */
mySource = (unsigned char *) args->source;
myUChar = (mySource[0])
| (mySource[1] << 8)
| (mySource[2] << 16)
| (mySource[3] << 24);
args->source = (const char *)(mySource + 4);
if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
return myUChar;
}
uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
args->converter->invalidCharLength = 4;
myUCharPtr = myUCharBuf;
*err = U_ILLEGAL_CHAR_FOUND;
args->target = myUCharPtr;
args->targetLimit = myUCharBuf + 2;
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
args,
(const char *)mySource,
4,
UCNV_ILLEGAL,
err);
if(U_SUCCESS(*err)) {
length = (uint16_t)(args->target - myUCharBuf);
if(length > 0) {
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
}
/* else (callback did not write anything) continue */
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
*err = U_ZERO_ERROR;
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
} else {
/* break on error */
/* ### what if a callback set an error but _also_ generated output?! */
return 0xffff;
}
/* no input */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/* no input or only skipping callbacks */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
if (length < 4)
{
/* got a partial character */
uprv_memcpy(args->converter->toUBytes, mySource, length);
args->converter->toULength = (int8_t)length;
args->source = (const char *)(mySource + length);
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
/* Don't even try to do a direct cast because the value may be on an odd address. */
myUChar = ((UChar32)mySource[3] << 24)
| ((UChar32)mySource[2] << 16)
| ((UChar32)mySource[1] << 8)
| ((UChar32)mySource[0]);
args->source = (const char *)(mySource + 4);
if ((uint32_t)myUChar <= MAXIMUM_UTF) {
return myUChar;
}
uprv_memcpy(args->converter->toUBytes, mySource, 4);
args->converter->toULength = 4;
*err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}

View File

@ -22,7 +22,6 @@
/* UTF-7 -------------------------------------------------------------------- */
/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */
/*
* UTF-7 is a stateful encoding of Unicode.
* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
@ -247,7 +246,6 @@ _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
loop:
if(inDirectMode) {
directMode:
/*
@ -270,8 +268,8 @@ directMode:
/* illegal */
bytes[0]=b;
byteIndex=1;
nextSourceIndex=sourceIndex+1;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
} else if(b!=PLUS) {
/* write directly encoded character */
*target++=b;
@ -312,7 +310,8 @@ unicodeMode:
if(b>=126) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
} else if((base64Value=fromBase64[b])>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
@ -377,7 +376,8 @@ unicodeMode:
/* absorb the minus and leave the Unicode Mode */
if(bits!=0) {
/* bits are illegally left over, a UChar is incomplete */
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
}
sourceIndex=nextSourceIndex;
@ -392,7 +392,8 @@ unicodeMode:
bytes[0]=PLUS;
bytes[1]=b;
byteIndex=2;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
} else if(bits==0) {
/* un-read the character in case it is a plus sign */
--source;
@ -400,12 +401,14 @@ unicodeMode:
goto directMode;
} else {
/* bits are illegally left over, a UChar is incomplete */
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
} else /* base64Value==-3 for illegal characters */ {
/* illegal */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
} else {
/* target is full */
@ -414,7 +417,6 @@ unicodeMode:
}
}
}
endloop:
if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
/*
@ -430,69 +432,11 @@ endloop:
cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
cnv->toULength=byteIndex;
finish:
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
/* copy the current bytes to invalidCharBuffer */
for(b=0; b<(uint8_t)byteIndex; ++b) {
cnv->invalidCharBuffer[b]=(char)bytes[b];
}
cnv->invalidCharLength=byteIndex;
/* set the converter state in UConverter to deal with the next character */
cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
cnv->toULength=0;
/* call the callback function */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
/* get the converter state from UConverter */
{
uint32_t status=cnv->toUnicodeStatus;
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint16_t)status;
}
byteIndex=cnv->toULength;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
target=pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
source=(const uint8_t *)pArgs->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
goto endloop;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
goto endloop;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
cnv->toULength=0;
goto finish;
} else {
goto loop;
}
}
static void
@ -961,7 +905,6 @@ _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
loop:
if(inDirectMode) {
directMode:
/*
@ -983,8 +926,8 @@ directMode:
/* illegal */
bytes[0]=b;
byteIndex=1;
nextSourceIndex=sourceIndex+1;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
} else if(b!=AMPERSAND) {
/* write directly encoded character */
*target++=b;
@ -995,8 +938,7 @@ directMode:
/* switch to Unicode mode */
nextSourceIndex=++sourceIndex;
inDirectMode=FALSE;
bytes[0]=b;
byteIndex=1;
byteIndex=0;
bits=0;
base64Counter=-1;
goto unicodeMode;
@ -1027,7 +969,8 @@ unicodeMode:
if(b>0x7e) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
@ -1048,7 +991,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@ -1065,7 +1009,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@ -1082,7 +1027,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@ -1111,7 +1057,8 @@ unicodeMode:
if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
/* bits are illegally left over, a UChar is incomplete */
/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
}
sourceIndex=nextSourceIndex;
@ -1129,7 +1076,8 @@ unicodeMode:
/* base64Value==-3 for illegal characters */
/* illegal */
inDirectMode=TRUE;
goto callback;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
} else {
/* target is full */
@ -1140,73 +1088,41 @@ unicodeMode:
}
endloop:
/*
* the end of the input stream and detection of truncated input
* are handled by the framework, but here we must check if we are in Unicode
* mode and byteIndex==0 because we must end in direct mode
*
* conditions:
* successful
* in Unicode mode and byteIndex==0
* end of input and no truncated input
*/
if( U_SUCCESS(*pErrorCode) &&
!inDirectMode && byteIndex==0 &&
pArgs->flush && source>=sourceLimit
) {
if(base64Counter==-1) {
/* & at the very end of the input */
/* make the ampersand the reported sequence */
bytes[0]=AMPERSAND;
byteIndex=1;
}
/* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
inDirectMode=TRUE; /* avoid looping */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
/* set the converter state back into UConverter */
cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
cnv->toULength=byteIndex;
finish:
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
/* copy the current bytes to invalidCharBuffer */
for(b=0; b<(uint8_t)byteIndex; ++b) {
cnv->invalidCharBuffer[b]=(char)bytes[b];
}
cnv->invalidCharLength=byteIndex;
/* set the converter state in UConverter to deal with the next character */
cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
cnv->toULength=0;
/* call the callback function */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
/* get the converter state from UConverter */
{
uint32_t status=cnv->toUnicodeStatus;
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint16_t)status;
}
byteIndex=cnv->toULength;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
target=pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
source=(const uint8_t *)pArgs->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
goto endloop;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
goto endloop;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
cnv->toULength=0;
goto finish;
} else {
goto loop;
}
}
static void
@ -1522,7 +1438,7 @@ static const UConverterImpl _IMAPImpl={
static const UConverterStaticData _IMAPStaticData={
sizeof(UConverterStaticData),
"IMAP-mailbox-name",
0, /* TODO CCSID for UTF-7 */
0, /* TODO CCSID for IMAP-mailbox-name */
UCNV_IBM, UCNV_IMAP_MAILBOX,
1, 4,
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */

View File

@ -88,64 +88,6 @@ static const int8_t bytesFromUTF8[256] = {
/*
 * Lower bound for the code point decoded from a UTF-8 sequence, indexed by the
 * byte count taken from bytesFromUTF8[] for the lead byte.
 * getNextUChar rejects a decoded value that is below the entry for its
 * sequence length.
 * The entries for 5 and 6 are 0xffffffff, so together with the upper-bound
 * check against MAXIMUM_UTF no 5- or 6-byte sequence can be accepted.
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/**
 * Reports an invalid input byte sequence to the converter's toUnicode
 * error callback. The offending bytes are taken from converter->toUBytes
 * (length converter->toULength) and moved into invalidCharBuffer before
 * the callback runs. A callback is presumed to be set.
 *
 * @param args   conversion arguments; handed through to the callback
 * @param reason UCNV_ILLEGAL selects U_ILLEGAL_CHAR_FOUND,
 *               any other reason selects U_INVALID_CHAR_FOUND
 * @param err    in/out error code; it is only replaced by the code above
 *               if it indicates success on entry
 * @returns true when *err is a failure after the callback returned
 */
static UBool
T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
                                            UConverterCallbackReason reason,
                                            UErrorCode *err)
{
    UConverter *cnv = args->converter;

    if (U_SUCCESS(*err)) {
        *err = (reason == UCNV_ILLEGAL) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
    }

    /* move the pending input bytes into the error buffer and clear the pending state */
    uprv_memcpy(cnv->invalidCharBuffer, cnv->toUBytes, cnv->toULength);
    cnv->invalidCharLength = cnv->toULength;
    cnv->toULength = 0;

    /* let the error callback decide what to do with the bad sequence */
    cnv->fromCharErrorBehaviour(cnv->toUContext,
                                args,
                                cnv->invalidCharBuffer,
                                cnv->invalidCharLength,
                                reason,
                                err);

    return (UBool)U_FAILURE(*err);
}
/*
 * Offsets-aware variant of T_UConverter_toUnicode_InvalidChar_Callback():
 * runs the callback, then writes currentOffset into every offsets[] slot
 * between the old and the new value of args->offsets.
 * Returns the result of the plain callback helper.
 */
static UBool
T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
                                                  int32_t currentOffset,
                                                  UConverterCallbackReason reason,
                                                  UErrorCode *err)
{
    /* remember where the offsets pointer stood before the callback ran */
    int32_t *offsetCursor = args->offsets;
    UBool failed = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);

    for (; offsetCursor < args->offsets; ++offsetCursor) {
        *offsetCursor = currentOffset;
    }
    return failed;
}
U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
@ -159,7 +101,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
int32_t i, inBytes;
/* Restore size of current sequence */
start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
@ -256,22 +197,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->toULength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
/* args and err should already be set properly */
return;
}
mySource = (unsigned char *) args->source;
myTarget = args->target;
/* goto the start to handle state left behind by the callback */
goto start;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
}
@ -302,7 +230,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
int32_t i, inBytes;
/* Restore size of current sequence */
start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
@ -399,26 +326,9 @@ morebytes:
}
else
{
args->source = (const char *) mySource;
args->target = myTarget;
args->offsets = myOffsets;
args->converter->toULength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
offsetNum, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
/* args and err should already be set properly */
return;
}
offsetNum += i + ((unsigned char *) args->source - mySource);
mySource = (unsigned char *) args->source;
myTarget = args->target;
myOffsets = args->offsets;
/* goto the start to handle state left behind by the callback */
goto start;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
}
@ -683,159 +593,140 @@ lowsurrogate:
/*
 * Reads one code point from the UTF-8 input at args->source.
 * Returns the code point on success. On a problem it returns 0xffff with *err set:
 * U_INDEX_OUTOFBOUNDS_ERROR for no input, U_TRUNCATED_CHAR_FOUND or
 * U_ILLEGAL_CHAR_FOUND for a bad byte sequence.
 *
 * NOTE(review): this listing declares sourceInitial and isLegalSequence twice and
 * contains two differing bodies -- one that invokes fromCharErrorBehaviour directly
 * and handles CESU-8, and one that stores the bad bytes in cnv->toUBytes/toULength
 * and returns. It looks like two revisions interleaved; confirm which lines are
 * current before relying on any of it.
 */
U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
UErrorCode *err) {
UChar buffer[2];
const char *sourceInitial;
UConverter *cnv;
const uint8_t *sourceInitial;
const uint8_t *source;
UChar* myUCharPtr;
uint16_t extraBytesToWrite;
uint8_t myByte;
UChar32 ch;
int8_t isLegalSequence;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
int8_t i, isLegalSequence;
while (args->source < args->sourceLimit)
/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
cnv = args->converter;
sourceInitial = source = (const uint8_t *)args->source;
if (source >= (const uint8_t *)args->sourceLimit)
{
sourceInitial = args->source;
myByte = (uint8_t)*(args->source++);
if (myByte < 0x80)
{
return (UChar32)myByte;
}
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
if (extraBytesToWrite == 0) {
isLegalSequence = FALSE;
ch = 0;
goto CALL_ERROR_FUNCTION;
}
/*The byte sequence is longer than the buffer area passed*/
source = (const uint8_t *)args->source;
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
{
*err = U_TRUNCATED_CHAR_FOUND;
return 0xffff;
}
else
{
isLegalSequence = 1;
ch = myByte << 6;
switch(extraBytesToWrite)
{
/* note: code falls through cases! (sic)*/
case 6:
ch += (myByte = *source++);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
case 5:
ch += (myByte = *source++);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
case 4:
ch += (myByte = *source++);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
case 3:
ch += (myByte = *source++);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
case 2:
ch += (myByte = *source++);
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
}
};
}
ch -= offsetsFromUTF8[extraBytesToWrite];
args->source = (const char *)source;
/*
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
* - use only trail bytes after a lead byte (checked above)
* - use the right number of trail bytes for a given lead byte
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
*/
if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite]) {
if(isCESU8) {
if(extraBytesToWrite <= 3) {
if( UTF_IS_FIRST_SURROGATE(ch) &&
(const char *)(source + 3) <= args->sourceLimit &&
source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80
) {
/* ch is a lead surrogate followed by a trail surrogate */
ch = (ch << 10) +
((source[1] & 0xf) << 6) + (source[2] & 0x3f) -
((0xd800 << 10) - 0x10000);
args->source = (const char *)(source + 3);
}
return ch; /* return the code point */
}
/* illegal CESU-8 */
} else {
if(!UTF_IS_SURROGATE(ch)) {
return ch; /* return the code point */
}
/* illegal UTF-8 */
}
}
CALL_ERROR_FUNCTION:
extraBytesToWrite = (uint16_t)(args->source - sourceInitial);
args->converter->invalidCharLength = (uint8_t)extraBytesToWrite;
uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
myUCharPtr = buffer;
*err = U_ILLEGAL_CHAR_FOUND;
args->target = myUCharPtr;
args->targetLimit = buffer + 2;
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
args,
sourceInitial,
extraBytesToWrite,
UCNV_ILLEGAL,
err);
if(U_SUCCESS(*err)) {
extraBytesToWrite = (uint16_t)(args->target - buffer);
if(extraBytesToWrite > 0) {
return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite);
}
/* else (callback did not write anything) continue */
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
*err = U_ZERO_ERROR;
return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2);
} else {
/* break on error */
/* ### what if a callback set an error but _also_ generated output?! */
return 0xffff;
}
/* no input */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/* no input or only skipping callback calls */
*err = U_INDEX_OUTOFBOUNDS_ERROR;
/* read the first byte; a value below 0x80 is returned as is */
myByte = (uint8_t)*(source++);
if (myByte < 0x80)
{
args->source = (const char *)source;
return (UChar32)myByte;
}
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
/* bytesFromUTF8[] gave 0 for this byte: report the single byte as illegal */
if (extraBytesToWrite == 0) {
cnv->toUBytes[0] = myByte;
cnv->toULength = 1;
*err = U_ILLEGAL_CHAR_FOUND;
args->source = (const char *)source;
return 0xffff;
}
/*The byte sequence is longer than the buffer area passed*/
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
{
/* check if all of the remaining bytes are trail bytes */
cnv->toUBytes[0] = myByte;
i = 1;
*err = U_TRUNCATED_CHAR_FOUND;
while(source < (const uint8_t *)args->sourceLimit) {
if(U8_IS_TRAIL(myByte = *source)) {
cnv->toUBytes[i++] = myByte;
++source;
} else {
/* error even before we run out of input */
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
cnv->toULength = i;
args->source = (const char *)source;
return 0xffff;
}
isLegalSequence = 1;
ch = myByte << 6;
/* in this switch, source is advanced only past bytes that passed UTF8_IS_TRAIL */
switch(extraBytesToWrite)
{
/* note: code falls through cases! (sic)*/
case 6:
ch += (myByte = *source);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
case 5:
ch += (myByte = *source);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
case 4:
ch += (myByte = *source);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
case 3:
ch += (myByte = *source);
ch <<= 6;
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
case 2:
ch += (myByte = *source);
if (!UTF8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
};
ch -= offsetsFromUTF8[extraBytesToWrite];
args->source = (const char *)source;
/*
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
* - use only trail bytes after a lead byte (checked above)
* - use the right number of trail bytes for a given lead byte
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
*/
if (isLegalSequence &&
(uint32_t)ch <= MAXIMUM_UTF &&
(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
!U_IS_SURROGATE(ch)
) {
return ch; /* return the code point */
}
/* illegal sequence: copy the bytes from the start up to source into toUBytes[] */
for(i = 0; sourceInitial < source; ++i) {
cnv->toUBytes[i] = *sourceInitial++;
}
cnv->toULength = i;
*err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
@ -884,6 +775,29 @@ const UConverterSharedData _UTF8Data={
/* CESU-8 converter data ---------------------------------------------------- */
/*
 * Implementation table for CESU-8. It plugs in the UTF-8 toUnicode and
 * fromUnicode functions (each with and without offsets) and
 * ucnv_getCompleteUnicodeSet as the last entry; every other slot is NULL.
 * NOTE(review): the NULL directly after the fromUnicode pair is presumably the
 * getNextUChar slot, i.e. CESU-8 supplies no getNextUChar of its own --
 * confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _CESU8Impl={
UCNV_CESU8,
NULL,
NULL,
NULL,
NULL,
NULL,
T_UConverter_toUnicode_UTF8,
T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,
T_UConverter_fromUnicode_UTF8,
T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
NULL,
NULL,
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _CESU8StaticData={
sizeof(UConverterStaticData),
"CESU-8",
@ -897,6 +811,6 @@ static const UConverterStaticData _CESU8StaticData={
/*
 * Shared data record for the CESU-8 converter; it references _CESU8StaticData.
 * NOTE(review): two versions of the second initializer row appear below. They
 * differ only in the implementation table referenced (&_UTF8Impl vs. &_CESU8Impl).
 * Only one of them can belong in the initializer -- confirm which is current.
 */
const UConverterSharedData _CESU8Data={
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
0
};

View File

@ -262,62 +262,22 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
}
else if(targetUniChar>=0xfffe){
SAVE_STATE:
{
const char *saveSource = args->source;
UChar *saveTarget = args->target;
int32_t *saveOffsets = args->offsets;
UConverterCallbackReason reason;
int32_t currentOffset ;
int32_t saveIndex = (int32_t)(myTarget - args->target);
args->converter->invalidCharLength=0;
if(targetUniChar == 0xfffe){
reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
}
else{
reason = UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
if(myData->isStateDBCS){
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[0]-0x80);
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[1]-0x80);
currentOffset= (int32_t)(mySource - args->source -2);
}
else{
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)mySourceChar;
currentOffset= (int32_t)(mySource - args->source -1);
}
args->offsets = args->offsets?args->offsets+(myTarget - args->target):0;
args->target = myTarget;
args->source = mySource;
myTarget = saveTarget;
args->converter->fromCharErrorBehaviour (
args->converter->toUContext,
args,
args->converter->invalidCharBuffer,
args->converter->invalidCharLength,
reason,
err);
if(args->offsets){
args->offsets = saveOffsets;
for (;saveIndex < (args->target - myTarget);saveIndex++) {
args->offsets[saveIndex] += currentOffset;
}
}
args->source = saveSource;
myTarget = args->target;
args->target = saveTarget;
args->offsets = saveOffsets;
if(U_FAILURE(*err))
break;
if(targetUniChar == 0xfffe){
*err = U_INVALID_CHAR_FOUND;
}
else{
*err = U_ILLEGAL_CHAR_FOUND;
}
if(myData->isStateDBCS){
args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
args->converter->toULength=2;
}
else{
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength=1;
}
break;
}
}
else{

View File

@ -1069,7 +1069,6 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
uint32_t targetUniChar = 0x0000;
uint8_t sourceChar = 0x0000;
UConverterDataISCII* data;
UConverterCallbackReason reason;
UChar32* toUnicodeStatus=NULL;
UChar* contextCharToUnicode = NULL;
@ -1108,17 +1107,14 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
data->currentDeltaToUnicode = data->defDeltaToUnicode;
data->currentMaskToUnicode = data->defMaskToUnicode;
}else{
if((sourceChar >= 0x21 && sourceChar <= 0x3F)){
/* these are display codes consume and continue */
}else{
*err =U_ILLEGAL_CHAR_FOUND;
/* reset */
*contextCharToUnicode=NO_CHAR_MARKER;
reason = UCNV_ILLEGAL;
goto CALLBACK;
}
}
/* reset */
@ -1148,11 +1144,9 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
/* byte unit is unassigned */
targetUniChar = missingCharMarker;
*err= U_INVALID_CHAR_FOUND;
reason = UCNV_UNASSIGNED;
}else{
/* only 0xA1 - 0xEE are legal after EXT char */
*contextCharToUnicode= NO_CHAR_MARKER;
reason= UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
goto CALLBACK;
@ -1260,49 +1254,11 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
/* we reach here only if targetUniChar == missingCharMarker
* so assign codes to reason and err
*/
reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
CALLBACK:
{
const char *saveSource = args->source;
UChar *saveTarget = args->target;
int32_t *saveOffsets = NULL;
int32_t currentOffset = (int32_t)(source - args->source -1);
int32_t saveIndex = (int32_t)(target - args->target);
args->converter->invalidCharLength=0;
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] =
(char) sourceChar;
if(args->offsets){
saveOffsets=args->offsets;
args->offsets = args->offsets+(target - args->target);
}
args->target =target;
target =saveTarget;
args->source = source;
args->converter->fromCharErrorBehaviour (
args->converter->toUContext,
args,
args->converter->invalidCharBuffer,
args->converter->invalidCharLength,
reason,
err);
if(args->offsets){
args->offsets = saveOffsets;
for (;saveIndex < (args->target - target);saveIndex++) {
*(args->offsets)++ = currentOffset;
}
}
target=args->target;
args->source = saveSource;
args->target = saveTarget;
}
args->converter->toUBytes[0] = (uint8_t) sourceChar;
args->converter->toULength = 1;
break;
}
}
@ -1312,7 +1268,7 @@ CALLBACK:
}
}
if(args->flush && source == sourceLimit) {
if(U_SUCCESS(*err) && args->flush && source == sourceLimit) {
/* end of the input stream */
UConverter *cnv = args->converter;

View File

@ -482,10 +482,10 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
}
if(c>0x7f) {
/* callback(illegal); copy the current bytes to invalidCharBuffer */
/* callback(illegal); copy the current bytes to toUBytes[] */
UConverter *cnv=pArgs->converter;
cnv->invalidCharBuffer[0]=c;
cnv->invalidCharLength=1;
cnv->toUBytes[0]=c;
cnv->toULength=1;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
} else if(source<sourceLimit && target>=pArgs->targetLimit) {
/* target is full */
@ -511,62 +511,25 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
/*
 * Reads one byte from pArgs->source and returns it as the code point if it is <=0x7f.
 * When source has reached sourceLimit, sets U_INDEX_OUTOFBOUNDS_ERROR and returns 0xffff.
 *
 * NOTE(review): this listing contains both a while and an if loop header, and two
 * differing paths for bytes >0x7f -- one that records the byte in cnv->toUBytes[],
 * sets U_ILLEGAL_CHAR_FOUND and returns 0xffff, and one that calls
 * fromCharErrorBehaviour directly with a local buffer. It looks like two revisions
 * interleaved; confirm which lines are current.
 */
static UChar32
_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UChar buffer[UTF_MAX_CHAR_LENGTH];
const uint8_t *source;
uint8_t b;
/* set up the local pointers */
source=(const uint8_t *)pArgs->source;
/* conversion loop */
while(source<(const uint8_t *)pArgs->sourceLimit) {
if(source<(const uint8_t *)pArgs->sourceLimit) {
b=*source++;
/* the byte counts as consumed whether or not it turns out to be legal */
pArgs->source=(const char *)source;
if(b<=0x7f) {
return b;
} else {
/* call the callback function with all the preparations and post-processing */
UConverter *cnv=pArgs->converter;
/* callback(illegal) */
cnv->toUBytes[0]=b;
cnv->toULength=1;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
/* update the arguments structure */
pArgs->target=buffer;
pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
/* copy the current byte to invalidCharBuffer */
cnv->invalidCharBuffer[0]=(char)b;
cnv->invalidCharLength=1;
/* call the callback function */
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
/* update the source pointer */
source=(const uint8_t *)pArgs->source;
/*
* return the first character if the callback wrote some
* we do not need to goto finish because the converter state is already set
*/
if(U_SUCCESS(*pErrorCode)) {
int32_t length=pArgs->target-buffer;
if(length>0) {
return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
}
/* else (callback did not write anything) continue */
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
*pErrorCode=U_ZERO_ERROR;
return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
} else {
/* break on error */
/* ### what if a callback set an error but _also_ generated output?! */
return 0xffff;
}
return 0xffff;
}
}
/* no output because of empty input or only skipping callbacks */
/* no output because of empty input */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}

View File

@ -542,21 +542,11 @@ fastUnicode:
}
endloop:
/* set the converter state back into UConverter */
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
/* copy the input sequence into the error buffer */
int8_t i;
for(i=0; i<cnv->toULength; ++i) {
cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i];
}
cnv->invalidCharLength=i;
/* reset to deal with the next character */
state=readCommand;
}
/* set the converter state back into UConverter */
if(state==readCommand) {
} else if(state==readCommand) {
/* not in a multi-byte sequence, reset toULength */
cnv->toULength=0;
}
@ -845,21 +835,11 @@ fastUnicode:
}
endloop:
/* set the converter state back into UConverter */
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
/* copy the input sequence into the error buffer */
int8_t i;
for(i=0; i<cnv->toULength; ++i) {
cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i];
}
cnv->invalidCharLength=i;
/* reset to deal with the next character */
state=readCommand;
}
/* set the converter state back into UConverter */
if(state==readCommand) {
} else if(state==readCommand) {
/* not in a multi-byte sequence, reset toULength */
cnv->toULength=0;
}
@ -2032,7 +2012,13 @@ static const UConverterStaticData _SCSUStaticData={
0, /* CCSID for SCSU */
UCNV_IBM, UCNV_SCSU,
1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
{ 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
/*
* ### TODO the subchar really must be written by an SCSU function
* however, currently SCSU's fromUnicode() never causes errors, therefore
* no callbacks will be called and no subchars written
* See Jitterbug 2837 - RFE: forbid converting surrogate code points in all charsets
*/
{ 0x0e, 0xff, 0xfd, 0 }, 3,
FALSE, FALSE,
0,
0,
@ -2044,5 +2030,3 @@ const UConverterSharedData _SCSUData={
NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
0
};
/* ### clarify: if an error occurs, does a converter reset itself? or is it in a defined or undefined state? */