ICU-2449 refactor conversion - call toUnicode and getNextUChar callbacks only from ucnv.c framework
X-SVN-Rev: 12731
This commit is contained in:
parent
d65fa8f193
commit
e8b985f363
@ -518,7 +518,7 @@ static const UConverterImpl _LMBCSImpl##n={\
|
||||
_LMBCSToUnicodeWithOffsets,\
|
||||
_LMBCSFromUnicode,\
|
||||
_LMBCSFromUnicode,\
|
||||
_LMBCSGetNextUChar,\
|
||||
NULL,\
|
||||
NULL,\
|
||||
NULL,\
|
||||
NULL,\
|
||||
@ -930,16 +930,6 @@ _LMBCSFromUnicode(UConverterFromUnicodeArgs* args,
|
||||
/* Now, the Unicode from LMBCS section */
|
||||
|
||||
|
||||
/*
|
||||
Special codes for the getNextUnicodeWorker -- usually as the result of
|
||||
special error-callback behavior:
|
||||
ULMBCS_SKIP To control skipping over LMBCS sequences
|
||||
ULMBCS_MULTI To indicate that a single LMBCS char translates to
|
||||
multiple uniChars
|
||||
*/
|
||||
#define ULMBCS_SKIP U_ERROR_LIMIT
|
||||
#define ULMBCS_MULTI ULMBCS_SKIP+1
|
||||
|
||||
/* A function to call when we are looking at the Unicode group byte in LMBCS */
|
||||
static UChar
|
||||
GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte stream */
|
||||
@ -958,26 +948,22 @@ GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode
|
||||
|
||||
|
||||
/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index'
|
||||
bytes left in source up to sourceLimit.Errors appropriately if not
|
||||
bytes left in source up to sourceLimit.Errors appropriately if not.
|
||||
If we reach the limit, then update the source pointer to there to consume
|
||||
all input as required by ICU converter semantics.
|
||||
*/
|
||||
|
||||
#define CHECK_SOURCE_LIMIT(index) \
|
||||
if (args->source+index > args->sourceLimit){\
|
||||
*err = U_TRUNCATED_CHAR_FOUND;\
|
||||
args->source = saveSource;\
|
||||
args->source = args->sourceLimit;\
|
||||
return 0xffff;}
|
||||
|
||||
/* Return the Unicode representation for the current LMBCS character
|
||||
|
||||
This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode().
|
||||
The last parameter says whether the return value should be treated as UTF-16 or
|
||||
UTF-32. The only difference is in surrogate handling
|
||||
*/
|
||||
/* Return the Unicode representation for the current LMBCS character */
|
||||
|
||||
static UChar32
|
||||
_LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err,
|
||||
UBool returnUTF32)
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar32 uniChar = 0; /* an output UNICODE char */
|
||||
ulmbcs_byte_t CurByte; /* A byte from the input stream */
|
||||
@ -1027,20 +1013,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
||||
else
|
||||
if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
|
||||
{
|
||||
UChar second;
|
||||
CHECK_SOURCE_LIMIT(2);
|
||||
|
||||
uniChar = GetUniFromLMBCSUni(&(args->source));
|
||||
|
||||
/* at this point we are usually done, but we need to make sure we are not in
|
||||
a situation where we can successfully put together a surrogate pair */
|
||||
|
||||
if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit)
|
||||
&& *(args->source)++ == ULMBCS_GRP_UNICODE
|
||||
&& UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source))))
|
||||
{
|
||||
uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
|
||||
}
|
||||
/* don't check for error indicators fffe/ffff below */
|
||||
return GetUniFromLMBCSUni(&(args->source));
|
||||
}
|
||||
else if (CurByte <= ULMBCS_CTRLOFFSET)
|
||||
{
|
||||
@ -1126,69 +1102,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
||||
}
|
||||
}
|
||||
}
|
||||
if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */
|
||||
{
|
||||
UConverterToUnicodeArgs cbArgs = *args;
|
||||
UConverterCallbackReason reason;
|
||||
UChar UCh;
|
||||
|
||||
if (uniChar == 0xfffe)
|
||||
{
|
||||
reason = UCNV_UNASSIGNED;
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
else
|
||||
{
|
||||
reason = UCNV_ILLEGAL;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
|
||||
cbArgs.target = &UCh;
|
||||
cbArgs.targetLimit = &UCh + 1;
|
||||
cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext,
|
||||
&cbArgs,
|
||||
saveSource,
|
||||
args->source - saveSource,
|
||||
reason,
|
||||
err);
|
||||
|
||||
if (cbArgs.target != &UCh)
|
||||
{
|
||||
uniChar = (UChar32) UCh;
|
||||
}
|
||||
/* Did error functor skip */
|
||||
if (U_SUCCESS(*err) && cbArgs.target == &UCh)
|
||||
{
|
||||
*err = ULMBCS_SKIP;
|
||||
}
|
||||
/* Did error functor try to write multiple UChars? */
|
||||
else if (*err == U_BUFFER_OVERFLOW_ERROR)
|
||||
{
|
||||
*err = ULMBCS_MULTI;
|
||||
}
|
||||
}
|
||||
return uniChar;
|
||||
}
|
||||
|
||||
|
||||
/* The exported function that gets one UTF32 character from a LMBCS stream
|
||||
*/
|
||||
static UChar32
|
||||
_LMBCSGetNextUChar(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar32 nextUChar;
|
||||
do {
|
||||
nextUChar = _LMBCSGetNextUCharWorker(args, err, TRUE);
|
||||
} while (*err == ULMBCS_SKIP);
|
||||
|
||||
if (*err == ULMBCS_MULTI)
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
}
|
||||
return nextUChar;
|
||||
}
|
||||
|
||||
/* The exported function that converts lmbcs to one or more
|
||||
UChars - currently UTF-16
|
||||
*/
|
||||
@ -1196,28 +1113,24 @@ static void
|
||||
_LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
char LMBCS [ULMBCS_CHARSIZE_MAX];
|
||||
UChar uniChar; /* one output UNICODE char */
|
||||
const char * saveSource = args->source; /* beginning of current code point */
|
||||
const char * saveSource; /* beginning of current code point */
|
||||
const char * pStartLMBCS = args->source; /* beginning of whole string */
|
||||
const char * errSource = NULL; /* pointer to actual input in case an error occurs */
|
||||
int8_t savebytes = 0;
|
||||
|
||||
if (args->targetLimit == args->target) /* error check may belong in common code */
|
||||
{
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Process from source to limit, or until error */
|
||||
while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target)
|
||||
while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
|
||||
{
|
||||
saveSource = args->source; /* beginning of current code point */
|
||||
|
||||
if (args->converter->toULength) /* reassemble char from previous call */
|
||||
{
|
||||
char LMBCS [ULMBCS_CHARSIZE_MAX];
|
||||
const char *pLMBCS = LMBCS, *saveSourceLimit;
|
||||
const char *saveSourceLimit;
|
||||
size_t size_old = args->converter->toULength;
|
||||
|
||||
/* limit from source is either reminder of temp buffer, or user limit on source */
|
||||
/* limit from source is either remainder of temp buffer, or user limit on source */
|
||||
size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
|
||||
size_t size_new_maybe_2 = args->sourceLimit - args->source;
|
||||
size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
|
||||
@ -1226,18 +1139,16 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
|
||||
uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
|
||||
uprv_memcpy(LMBCS + size_old, args->source, size_new);
|
||||
saveSourceLimit = args->sourceLimit;
|
||||
args->source = pLMBCS;
|
||||
args->sourceLimit = pLMBCS+size_old+size_new;
|
||||
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
|
||||
pLMBCS = args->source;
|
||||
args->source =saveSource;
|
||||
args->source = errSource = LMBCS;
|
||||
args->sourceLimit = LMBCS+size_old+size_new;
|
||||
savebytes = (int8_t)(size_old+size_new);
|
||||
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
|
||||
args->source = saveSource + ((args->source - LMBCS) - size_old);
|
||||
args->sourceLimit = saveSourceLimit;
|
||||
args->source += (pLMBCS - LMBCS - size_old);
|
||||
|
||||
if (*err == U_TRUNCATED_CHAR_FOUND)
|
||||
{
|
||||
/* evil special case: source buffers so small a char spans more than 2 buffers */
|
||||
int8_t savebytes = (int8_t)(size_old+size_new);
|
||||
args->converter->toULength = savebytes;
|
||||
uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
|
||||
args->source = args->sourceLimit;
|
||||
@ -1252,7 +1163,9 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
|
||||
}
|
||||
else
|
||||
{
|
||||
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
|
||||
errSource = saveSource;
|
||||
uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
|
||||
savebytes = (int8_t)(args->source - saveSource);
|
||||
}
|
||||
if (U_SUCCESS(*err))
|
||||
{
|
||||
@ -1273,52 +1186,22 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
else if (*err == ULMBCS_MULTI)
|
||||
{
|
||||
UChar * pUChar = args->converter->UCharErrorBuffer;
|
||||
int8_t BufferLength = args->converter->UCharErrorBufferLength;
|
||||
|
||||
*err = U_ZERO_ERROR;
|
||||
do
|
||||
{ /* error functor wants to write multiple UniChars */
|
||||
*(args->target)++ = uniChar;
|
||||
if(args->offsets)
|
||||
{
|
||||
*(args->offsets)++ = saveSource - pStartLMBCS;
|
||||
}
|
||||
uniChar = *pUChar++;
|
||||
}
|
||||
while(BufferLength-- && args->targetLimit > args->target);
|
||||
|
||||
if (++BufferLength > 0)
|
||||
{ /* fix up remaining UChars that can't fit in caller's buffer */
|
||||
uprv_memmove( args->converter->UCharErrorBuffer,
|
||||
args->converter->UCharErrorBuffer + args->converter->UCharErrorBufferLength - BufferLength,
|
||||
sizeof(UChar) * BufferLength);
|
||||
}
|
||||
args->converter->UCharErrorBufferLength = BufferLength;
|
||||
}
|
||||
else if (*err == ULMBCS_SKIP)
|
||||
{
|
||||
*err = U_ZERO_ERROR; /* and just go around again..*/
|
||||
}
|
||||
}
|
||||
/* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
|
||||
if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
|
||||
{
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
/* If character incomplete, store away partial char if more to come */
|
||||
if (*err == U_TRUNCATED_CHAR_FOUND)
|
||||
else if (U_FAILURE(*err))
|
||||
{
|
||||
args->source = args->sourceLimit;
|
||||
{
|
||||
int8_t savebytes = (int8_t)(args->sourceLimit - saveSource);
|
||||
args->converter->toULength = (int8_t)savebytes;
|
||||
uprv_memcpy(args->converter->toUBytes, saveSource, savebytes);
|
||||
*err = U_ZERO_ERROR;
|
||||
}
|
||||
/* If character incomplete or unmappable/illegal, store it in toUBytes[] */
|
||||
args->converter->toULength = savebytes;
|
||||
if (savebytes > 0) {
|
||||
uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
|
||||
}
|
||||
if (*err == U_TRUNCATED_CHAR_FOUND) {
|
||||
*err = U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -359,52 +359,51 @@ _UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
# define _UTF16LEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
|
||||
#endif
|
||||
|
||||
static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar32 myUChar;
|
||||
uint16_t first;
|
||||
/*Checks boundaries and set appropriate error codes*/
|
||||
if (args->source+2 > args->sourceLimit)
|
||||
{
|
||||
if (args->source >= args->sourceLimit)
|
||||
{
|
||||
/*Either caller has reached the end of the byte stream*/
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* a character was cut in half*/
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
static UChar32
|
||||
_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
||||
const uint8_t *s, *sourceLimit;
|
||||
UChar32 c;
|
||||
|
||||
s=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
|
||||
if(s>=sourceLimit) {
|
||||
/* no input */
|
||||
*err=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/*Gets the corresponding codepoint*/
|
||||
first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1)));
|
||||
myUChar = first;
|
||||
args->source += 2;
|
||||
if(s+2>sourceLimit) {
|
||||
/* only one byte: truncated UChar */
|
||||
pArgs->converter->toUBytes[0]=*s++;
|
||||
pArgs->converter->toULength=1;
|
||||
pArgs->source=(const char *)s;
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
if(UTF_IS_FIRST_SURROGATE(first)) {
|
||||
uint16_t second;
|
||||
/* get one UChar */
|
||||
c=((UChar32)*s<<8)|s[1];
|
||||
s+=2;
|
||||
|
||||
if (args->source+2 > args->sourceLimit) {
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
/*
|
||||
* check for surrogate pairs
|
||||
* surrogate code points are not currently considered an error
|
||||
* TODO see Jitterbug 1838
|
||||
*/
|
||||
if(U16_IS_LEAD(c) && s+2<=sourceLimit) {
|
||||
UChar trail;
|
||||
|
||||
/* get the second surrogate and assemble the code point */
|
||||
second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1)));
|
||||
|
||||
/* ignore unmatched surrogates and just deliver the first one in such a case */
|
||||
if(UTF_IS_SECOND_SURROGATE(second)) {
|
||||
/* matched pair, get pair value */
|
||||
myUChar = UTF16_GET_PAIR_VALUE(first, second);
|
||||
args->source += 2;
|
||||
/* get a second UChar and see if it is a trail surrogate */
|
||||
trail=((UChar)*s<<8)|s[1];
|
||||
if(U16_IS_TRAIL(trail)) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, trail);
|
||||
s+=2;
|
||||
}
|
||||
}
|
||||
|
||||
return myUChar;
|
||||
pArgs->source=(const char *)s;
|
||||
return c;
|
||||
}
|
||||
|
||||
static const UConverterImpl _UTF16BEImpl={
|
||||
@ -421,7 +420,7 @@ static const UConverterImpl _UTF16BEImpl={
|
||||
_UTF16BEToUnicodeWithOffsets,
|
||||
_UTF16BEFromUnicodeWithOffsets,
|
||||
_UTF16BEFromUnicodeWithOffsets,
|
||||
T_UConverter_getNextUChar_UTF16_BE,
|
||||
_UTF16BEGetNextUChar,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
@ -450,57 +449,51 @@ const UConverterSharedData _UTF16BEData={
|
||||
|
||||
/* UTF-16LE ----------------------------------------------------------------- */
|
||||
|
||||
static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar32 myUChar;
|
||||
uint16_t first;
|
||||
/*Checks boundaries and set appropriate error codes*/
|
||||
if (args->source+2 > args->sourceLimit)
|
||||
{
|
||||
if (args->source >= args->sourceLimit)
|
||||
{
|
||||
/*Either caller has reached the end of the byte stream*/
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* a character was cut in half*/
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
static UChar32
|
||||
_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
||||
const uint8_t *s, *sourceLimit;
|
||||
UChar32 c;
|
||||
|
||||
s=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
|
||||
if(s>=sourceLimit) {
|
||||
/* no input */
|
||||
*err=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/*Gets the corresponding codepoint*/
|
||||
first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source))));
|
||||
myUChar=first;
|
||||
/*updates the source*/
|
||||
args->source += 2;
|
||||
if(s+2>sourceLimit) {
|
||||
/* only one byte: truncated UChar */
|
||||
pArgs->converter->toUBytes[0]=*s++;
|
||||
pArgs->converter->toULength=1;
|
||||
pArgs->source=(const char *)s;
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
if (UTF_IS_FIRST_SURROGATE(first))
|
||||
{
|
||||
uint16_t second;
|
||||
/* get one UChar */
|
||||
c=((UChar32)s[1]<<8)|*s;
|
||||
s+=2;
|
||||
|
||||
if (args->source+2 > args->sourceLimit)
|
||||
{
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
/*
|
||||
* check for surrogate pairs
|
||||
* surrogate code points are not currently considered an error
|
||||
* TODO see Jitterbug 1838
|
||||
*/
|
||||
if(U16_IS_LEAD(c) && s+2<=sourceLimit) {
|
||||
UChar trail;
|
||||
|
||||
/* get the second surrogate and assemble the code point */
|
||||
second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source))));
|
||||
|
||||
/* ignore unmatched surrogates and just deliver the first one in such a case */
|
||||
if(UTF_IS_SECOND_SURROGATE(second))
|
||||
{
|
||||
/* matched pair, get pair value */
|
||||
myUChar = UTF16_GET_PAIR_VALUE(first, second);
|
||||
args->source += 2;
|
||||
/* get a second UChar and see if it is a trail surrogate */
|
||||
trail=((UChar)s[1]<<8)|*s;
|
||||
if(U16_IS_TRAIL(trail)) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, trail);
|
||||
s+=2;
|
||||
}
|
||||
}
|
||||
|
||||
return myUChar;
|
||||
pArgs->source=(const char *)s;
|
||||
return c;
|
||||
}
|
||||
|
||||
static const UConverterImpl _UTF16LEImpl={
|
||||
@ -517,7 +510,7 @@ static const UConverterImpl _UTF16LEImpl={
|
||||
_UTF16LEToUnicodeWithOffsets,
|
||||
_UTF16LEFromUnicodeWithOffsets,
|
||||
_UTF16LEFromUnicodeWithOffsets,
|
||||
T_UConverter_getNextUChar_UTF16_LE,
|
||||
_UTF16LEGetNextUChar,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
@ -725,9 +718,9 @@ _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
switch(pArgs->converter->mode) {
|
||||
case 8:
|
||||
return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode);
|
||||
return _UTF16BEGetNextUChar(pArgs, pErrorCode);
|
||||
case 9:
|
||||
return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode);
|
||||
return _UTF16LEGetNextUChar(pArgs, pErrorCode);
|
||||
default:
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
|
@ -35,62 +35,6 @@
|
||||
/* -SURROGATE_LOW_START + HALF_BASE */
|
||||
#define SURROGATE_LOW_BASE 9216
|
||||
|
||||
/**
|
||||
* Calls invalid char callback when an invalid character sequence is encountered.
|
||||
* It presumes that the converter has a callback to call.
|
||||
*
|
||||
* @returns true when callback fails
|
||||
*/
|
||||
static UBool
|
||||
T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *err)
|
||||
{
|
||||
UConverter *converter = args->converter;
|
||||
|
||||
if (U_SUCCESS(*err))
|
||||
{
|
||||
if (reason == UCNV_ILLEGAL) {
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/* copy the toUBytes to the invalidCharBuffer */
|
||||
uprv_memcpy(converter->invalidCharBuffer,
|
||||
converter->toUBytes,
|
||||
converter->invalidCharLength);
|
||||
|
||||
/* Call the ErrorFunction */
|
||||
args->converter->fromCharErrorBehaviour(converter->toUContext,
|
||||
args,
|
||||
converter->invalidCharBuffer,
|
||||
converter->invalidCharLength,
|
||||
reason,
|
||||
err);
|
||||
|
||||
return (UBool)U_FAILURE(*err);
|
||||
}
|
||||
|
||||
static UBool
|
||||
T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
|
||||
int32_t currentOffset,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *err)
|
||||
{
|
||||
int32_t *saveOffsets = args->offsets;
|
||||
UBool result;
|
||||
|
||||
result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
|
||||
|
||||
while (saveOffsets < args->offsets)
|
||||
{
|
||||
*(saveOffsets++) = currentOffset;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* UTF-32BE ----------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
@ -166,17 +110,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
}
|
||||
args->converter->invalidCharLength = 0;
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
args->converter->toULength = (int8_t)i;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -268,19 +204,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
args->offsets = myOffsets;
|
||||
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
}
|
||||
args->converter->invalidCharLength = 0;
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
myOffsets = args->offsets;
|
||||
args->converter->toULength = (int8_t)i;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
offsetNum += i;
|
||||
}
|
||||
@ -464,65 +390,44 @@ static UChar32
|
||||
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar myUCharBuf[2];
|
||||
UChar *myUCharPtr;
|
||||
const unsigned char *mySource;
|
||||
const uint8_t *mySource;
|
||||
UChar32 myUChar;
|
||||
int32_t length;
|
||||
|
||||
while (args->source < args->sourceLimit)
|
||||
mySource = (const uint8_t *)args->source;
|
||||
if (mySource >= (const uint8_t *)args->sourceLimit)
|
||||
{
|
||||
if (args->source + 4 > args->sourceLimit)
|
||||
{
|
||||
/* got a partial character */
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* Don't even try to do a direct cast because the value may be on an odd address. */
|
||||
mySource = (unsigned char *) args->source;
|
||||
myUChar = (mySource[0] << 24)
|
||||
| (mySource[1] << 16)
|
||||
| (mySource[2] << 8)
|
||||
| (mySource[3]);
|
||||
|
||||
args->source = (const char *)(mySource + 4);
|
||||
if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
|
||||
return myUChar;
|
||||
}
|
||||
|
||||
uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
|
||||
args->converter->invalidCharLength = 4;
|
||||
|
||||
myUCharPtr = myUCharBuf;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
args->target = myUCharPtr;
|
||||
args->targetLimit = myUCharBuf + 2;
|
||||
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
|
||||
args,
|
||||
(const char *)mySource,
|
||||
4,
|
||||
UCNV_ILLEGAL,
|
||||
err);
|
||||
|
||||
if(U_SUCCESS(*err)) {
|
||||
length = (uint16_t)(args->target - myUCharBuf);
|
||||
if(length > 0) {
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
|
||||
}
|
||||
/* else (callback did not write anything) continue */
|
||||
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*err = U_ZERO_ERROR;
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
|
||||
} else {
|
||||
/* break on error */
|
||||
/* ### what if a callback set an error but _also_ generated output?! */
|
||||
return 0xffff;
|
||||
}
|
||||
/* no input */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* no input or only skipping callbacks */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
|
||||
if (length < 4)
|
||||
{
|
||||
/* got a partial character */
|
||||
uprv_memcpy(args->converter->toUBytes, mySource, length);
|
||||
args->converter->toULength = (int8_t)length;
|
||||
args->source = (const char *)(mySource + length);
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* Don't even try to do a direct cast because the value may be on an odd address. */
|
||||
myUChar = ((UChar32)mySource[0] << 24)
|
||||
| ((UChar32)mySource[1] << 16)
|
||||
| ((UChar32)mySource[2] << 8)
|
||||
| ((UChar32)mySource[3]);
|
||||
|
||||
args->source = (const char *)(mySource + 4);
|
||||
if ((uint32_t)myUChar <= MAXIMUM_UTF) {
|
||||
return myUChar;
|
||||
}
|
||||
|
||||
uprv_memcpy(args->converter->toUBytes, mySource, 4);
|
||||
args->converter->toULength = 4;
|
||||
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
@ -643,17 +548,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
}
|
||||
args->converter->invalidCharLength = 0;
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
args->converter->toULength = (int8_t)i;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -747,19 +644,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
args->offsets = myOffsets;
|
||||
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
}
|
||||
args->converter->invalidCharLength = 0;
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
myOffsets = args->offsets;
|
||||
args->converter->toULength = (int8_t)i;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
offsetNum += i;
|
||||
}
|
||||
@ -935,65 +822,44 @@ static UChar32
|
||||
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
|
||||
UErrorCode* err)
|
||||
{
|
||||
UChar myUCharBuf[2];
|
||||
UChar *myUCharPtr;
|
||||
const unsigned char *mySource;
|
||||
const uint8_t *mySource;
|
||||
UChar32 myUChar;
|
||||
int32_t length;
|
||||
|
||||
while (args->source < args->sourceLimit)
|
||||
mySource = (const uint8_t *)args->source;
|
||||
if (mySource >= (const uint8_t *)args->sourceLimit)
|
||||
{
|
||||
if (args->source + 4 > args->sourceLimit)
|
||||
{
|
||||
/* got a partial character */
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* Don't even try to do a direct cast because the value may be on an odd address. */
|
||||
mySource = (unsigned char *) args->source;
|
||||
myUChar = (mySource[0])
|
||||
| (mySource[1] << 8)
|
||||
| (mySource[2] << 16)
|
||||
| (mySource[3] << 24);
|
||||
|
||||
args->source = (const char *)(mySource + 4);
|
||||
if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
|
||||
return myUChar;
|
||||
}
|
||||
|
||||
uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
|
||||
args->converter->invalidCharLength = 4;
|
||||
|
||||
myUCharPtr = myUCharBuf;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
args->target = myUCharPtr;
|
||||
args->targetLimit = myUCharBuf + 2;
|
||||
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
|
||||
args,
|
||||
(const char *)mySource,
|
||||
4,
|
||||
UCNV_ILLEGAL,
|
||||
err);
|
||||
|
||||
if(U_SUCCESS(*err)) {
|
||||
length = (uint16_t)(args->target - myUCharBuf);
|
||||
if(length > 0) {
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
|
||||
}
|
||||
/* else (callback did not write anything) continue */
|
||||
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*err = U_ZERO_ERROR;
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
|
||||
} else {
|
||||
/* break on error */
|
||||
/* ### what if a callback set an error but _also_ generated output?! */
|
||||
return 0xffff;
|
||||
}
|
||||
/* no input */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* no input or only skipping callbacks */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
|
||||
if (length < 4)
|
||||
{
|
||||
/* got a partial character */
|
||||
uprv_memcpy(args->converter->toUBytes, mySource, length);
|
||||
args->converter->toULength = (int8_t)length;
|
||||
args->source = (const char *)(mySource + length);
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* Don't even try to do a direct cast because the value may be on an odd address. */
|
||||
myUChar = ((UChar32)mySource[3] << 24)
|
||||
| ((UChar32)mySource[2] << 16)
|
||||
| ((UChar32)mySource[1] << 8)
|
||||
| ((UChar32)mySource[0]);
|
||||
|
||||
args->source = (const char *)(mySource + 4);
|
||||
if ((uint32_t)myUChar <= MAXIMUM_UTF) {
|
||||
return myUChar;
|
||||
}
|
||||
|
||||
uprv_memcpy(args->converter->toUBytes, mySource, 4);
|
||||
args->converter->toULength = 4;
|
||||
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,6 @@
|
||||
|
||||
/* UTF-7 -------------------------------------------------------------------- */
|
||||
|
||||
/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */
|
||||
/*
|
||||
* UTF-7 is a stateful encoding of Unicode.
|
||||
* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
|
||||
@ -247,7 +246,6 @@ _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
sourceIndex=byteIndex==0 ? 0 : -1;
|
||||
nextSourceIndex=0;
|
||||
|
||||
loop:
|
||||
if(inDirectMode) {
|
||||
directMode:
|
||||
/*
|
||||
@ -270,8 +268,8 @@ directMode:
|
||||
/* illegal */
|
||||
bytes[0]=b;
|
||||
byteIndex=1;
|
||||
nextSourceIndex=sourceIndex+1;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
} else if(b!=PLUS) {
|
||||
/* write directly encoded character */
|
||||
*target++=b;
|
||||
@ -312,7 +310,8 @@ unicodeMode:
|
||||
if(b>=126) {
|
||||
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
} else if((base64Value=fromBase64[b])>=0) {
|
||||
/* collect base64 bytes into UChars */
|
||||
switch(base64Counter) {
|
||||
@ -377,7 +376,8 @@ unicodeMode:
|
||||
/* absorb the minus and leave the Unicode Mode */
|
||||
if(bits!=0) {
|
||||
/* bits are illegally left over, a UChar is incomplete */
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
sourceIndex=nextSourceIndex;
|
||||
@ -392,7 +392,8 @@ unicodeMode:
|
||||
bytes[0]=PLUS;
|
||||
bytes[1]=b;
|
||||
byteIndex=2;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
} else if(bits==0) {
|
||||
/* un-read the character in case it is a plus sign */
|
||||
--source;
|
||||
@ -400,12 +401,14 @@ unicodeMode:
|
||||
goto directMode;
|
||||
} else {
|
||||
/* bits are illegally left over, a UChar is incomplete */
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
} else /* base64Value==-3 for illegal characters */ {
|
||||
/* illegal */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* target is full */
|
||||
@ -414,7 +417,6 @@ unicodeMode:
|
||||
}
|
||||
}
|
||||
}
|
||||
endloop:
|
||||
|
||||
if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
|
||||
/*
|
||||
@ -430,69 +432,11 @@ endloop:
|
||||
cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
|
||||
cnv->toULength=byteIndex;
|
||||
|
||||
finish:
|
||||
/* write back the updated pointers */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
return;
|
||||
|
||||
callback:
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
/* update the arguments structure */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
|
||||
/* copy the current bytes to invalidCharBuffer */
|
||||
for(b=0; b<(uint8_t)byteIndex; ++b) {
|
||||
cnv->invalidCharBuffer[b]=(char)bytes[b];
|
||||
}
|
||||
cnv->invalidCharLength=byteIndex;
|
||||
|
||||
/* set the converter state in UConverter to deal with the next character */
|
||||
cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
|
||||
cnv->toULength=0;
|
||||
|
||||
/* call the callback function */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
{
|
||||
uint32_t status=cnv->toUnicodeStatus;
|
||||
inDirectMode=(UBool)((status>>24)&1);
|
||||
base64Counter=(int8_t)(status>>16);
|
||||
bits=(uint16_t)status;
|
||||
}
|
||||
byteIndex=cnv->toULength;
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
|
||||
target=pArgs->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto endloop;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
goto endloop;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
|
||||
cnv->toULength=0;
|
||||
goto finish;
|
||||
} else {
|
||||
goto loop;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@ -961,7 +905,6 @@ _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
sourceIndex=byteIndex==0 ? 0 : -1;
|
||||
nextSourceIndex=0;
|
||||
|
||||
loop:
|
||||
if(inDirectMode) {
|
||||
directMode:
|
||||
/*
|
||||
@ -983,8 +926,8 @@ directMode:
|
||||
/* illegal */
|
||||
bytes[0]=b;
|
||||
byteIndex=1;
|
||||
nextSourceIndex=sourceIndex+1;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
} else if(b!=AMPERSAND) {
|
||||
/* write directly encoded character */
|
||||
*target++=b;
|
||||
@ -995,8 +938,7 @@ directMode:
|
||||
/* switch to Unicode mode */
|
||||
nextSourceIndex=++sourceIndex;
|
||||
inDirectMode=FALSE;
|
||||
bytes[0]=b;
|
||||
byteIndex=1;
|
||||
byteIndex=0;
|
||||
bits=0;
|
||||
base64Counter=-1;
|
||||
goto unicodeMode;
|
||||
@ -1027,7 +969,8 @@ unicodeMode:
|
||||
if(b>0x7e) {
|
||||
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
|
||||
/* collect base64 bytes into UChars */
|
||||
switch(base64Counter) {
|
||||
@ -1048,7 +991,8 @@ unicodeMode:
|
||||
if(isLegalIMAP(c)) {
|
||||
/* illegal */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
goto endloop;
|
||||
}
|
||||
*target++=c;
|
||||
if(offsets!=NULL) {
|
||||
@ -1065,7 +1009,8 @@ unicodeMode:
|
||||
if(isLegalIMAP(c)) {
|
||||
/* illegal */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
goto endloop;
|
||||
}
|
||||
*target++=c;
|
||||
if(offsets!=NULL) {
|
||||
@ -1082,7 +1027,8 @@ unicodeMode:
|
||||
if(isLegalIMAP(c)) {
|
||||
/* illegal */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
goto endloop;
|
||||
}
|
||||
*target++=c;
|
||||
if(offsets!=NULL) {
|
||||
@ -1111,7 +1057,8 @@ unicodeMode:
|
||||
if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
|
||||
/* bits are illegally left over, a UChar is incomplete */
|
||||
/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
sourceIndex=nextSourceIndex;
|
||||
@ -1129,7 +1076,8 @@ unicodeMode:
|
||||
/* base64Value==-3 for illegal characters */
|
||||
/* illegal */
|
||||
inDirectMode=TRUE;
|
||||
goto callback;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* target is full */
|
||||
@ -1140,73 +1088,41 @@ unicodeMode:
|
||||
}
|
||||
endloop:
|
||||
|
||||
/*
|
||||
* the end of the input stream and detection of truncated input
|
||||
* are handled by the framework, but here we must check if we are in Unicode
|
||||
* mode and byteIndex==0 because we must end in direct mode
|
||||
*
|
||||
* conditions:
|
||||
* successful
|
||||
* in Unicode mode and byteIndex==0
|
||||
* end of input and no truncated input
|
||||
*/
|
||||
if( U_SUCCESS(*pErrorCode) &&
|
||||
!inDirectMode && byteIndex==0 &&
|
||||
pArgs->flush && source>=sourceLimit
|
||||
) {
|
||||
if(base64Counter==-1) {
|
||||
/* & at the very end of the input */
|
||||
/* make the ampersand the reported sequence */
|
||||
bytes[0]=AMPERSAND;
|
||||
byteIndex=1;
|
||||
}
|
||||
/* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
|
||||
|
||||
inDirectMode=TRUE; /* avoid looping */
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
|
||||
/* set the converter state back into UConverter */
|
||||
cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
|
||||
cnv->toULength=byteIndex;
|
||||
|
||||
finish:
|
||||
/* write back the updated pointers */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
return;
|
||||
|
||||
callback:
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
/* update the arguments structure */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
|
||||
/* copy the current bytes to invalidCharBuffer */
|
||||
for(b=0; b<(uint8_t)byteIndex; ++b) {
|
||||
cnv->invalidCharBuffer[b]=(char)bytes[b];
|
||||
}
|
||||
cnv->invalidCharLength=byteIndex;
|
||||
|
||||
/* set the converter state in UConverter to deal with the next character */
|
||||
cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
|
||||
cnv->toULength=0;
|
||||
|
||||
/* call the callback function */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
{
|
||||
uint32_t status=cnv->toUnicodeStatus;
|
||||
inDirectMode=(UBool)((status>>24)&1);
|
||||
base64Counter=(int8_t)(status>>16);
|
||||
bits=(uint16_t)status;
|
||||
}
|
||||
byteIndex=cnv->toULength;
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
|
||||
target=pArgs->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto endloop;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
goto endloop;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
|
||||
cnv->toULength=0;
|
||||
goto finish;
|
||||
} else {
|
||||
goto loop;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1522,7 +1438,7 @@ static const UConverterImpl _IMAPImpl={
|
||||
static const UConverterStaticData _IMAPStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"IMAP-mailbox-name",
|
||||
0, /* TODO CCSID for UTF-7 */
|
||||
0, /* TODO CCSID for IMAP-mailbox-name */
|
||||
UCNV_IBM, UCNV_IMAP_MAILBOX,
|
||||
1, 4,
|
||||
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
|
||||
|
@ -88,64 +88,6 @@ static const int8_t bytesFromUTF8[256] = {
|
||||
static const uint32_t
|
||||
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
|
||||
|
||||
/**
|
||||
* Calls invalid char callback when an invalid character sequence is encountered.
|
||||
* It presumes that the converter has a callback to call.
|
||||
*
|
||||
* @returns true when callback fails
|
||||
*/
|
||||
static UBool
|
||||
T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *err)
|
||||
{
|
||||
UConverter *converter = args->converter;
|
||||
|
||||
if (U_SUCCESS(*err))
|
||||
{
|
||||
if (reason == UCNV_ILLEGAL) {
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/* copy the toUBytes to the invalidCharBuffer */
|
||||
uprv_memcpy(converter->invalidCharBuffer,
|
||||
converter->toUBytes,
|
||||
converter->toULength);
|
||||
converter->invalidCharLength = converter->toULength;
|
||||
converter->toULength = 0;
|
||||
|
||||
/* Call the ErrorFunction */
|
||||
args->converter->fromCharErrorBehaviour(converter->toUContext,
|
||||
args,
|
||||
converter->invalidCharBuffer,
|
||||
converter->invalidCharLength,
|
||||
reason,
|
||||
err);
|
||||
|
||||
return (UBool)U_FAILURE(*err);
|
||||
}
|
||||
|
||||
static UBool
|
||||
T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
|
||||
int32_t currentOffset,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *err)
|
||||
{
|
||||
int32_t *saveOffsets = args->offsets;
|
||||
UBool result;
|
||||
|
||||
result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
|
||||
|
||||
while (saveOffsets < args->offsets)
|
||||
{
|
||||
*(saveOffsets++) = currentOffset;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
@ -159,7 +101,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
||||
int32_t i, inBytes;
|
||||
|
||||
/* Restore size of current sequence */
|
||||
start:
|
||||
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
|
||||
{
|
||||
inBytes = args->converter->mode; /* restore # of bytes to consume */
|
||||
@ -256,22 +197,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
|
||||
args->converter->toULength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
/* args and err should already be set properly */
|
||||
return;
|
||||
}
|
||||
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
|
||||
/* goto the start to handle state left behind by the callback */
|
||||
goto start;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -302,7 +230,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
|
||||
int32_t i, inBytes;
|
||||
|
||||
/* Restore size of current sequence */
|
||||
start:
|
||||
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
|
||||
{
|
||||
inBytes = args->converter->mode; /* restore # of bytes to consume */
|
||||
@ -399,26 +326,9 @@ morebytes:
|
||||
}
|
||||
else
|
||||
{
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->offsets = myOffsets;
|
||||
|
||||
args->converter->toULength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
|
||||
offsetNum, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
/* args and err should already be set properly */
|
||||
return;
|
||||
}
|
||||
|
||||
offsetNum += i + ((unsigned char *) args->source - mySource);
|
||||
mySource = (unsigned char *) args->source;
|
||||
myTarget = args->target;
|
||||
myOffsets = args->offsets;
|
||||
|
||||
/* goto the start to handle state left behind by the callback */
|
||||
goto start;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -683,159 +593,140 @@ lowsurrogate:
|
||||
|
||||
U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
||||
UErrorCode *err) {
|
||||
UChar buffer[2];
|
||||
const char *sourceInitial;
|
||||
UConverter *cnv;
|
||||
const uint8_t *sourceInitial;
|
||||
const uint8_t *source;
|
||||
UChar* myUCharPtr;
|
||||
uint16_t extraBytesToWrite;
|
||||
uint8_t myByte;
|
||||
UChar32 ch;
|
||||
int8_t isLegalSequence;
|
||||
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
|
||||
int8_t i, isLegalSequence;
|
||||
|
||||
while (args->source < args->sourceLimit)
|
||||
/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
|
||||
|
||||
cnv = args->converter;
|
||||
sourceInitial = source = (const uint8_t *)args->source;
|
||||
if (source >= (const uint8_t *)args->sourceLimit)
|
||||
{
|
||||
sourceInitial = args->source;
|
||||
myByte = (uint8_t)*(args->source++);
|
||||
if (myByte < 0x80)
|
||||
{
|
||||
return (UChar32)myByte;
|
||||
}
|
||||
|
||||
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
|
||||
if (extraBytesToWrite == 0) {
|
||||
isLegalSequence = FALSE;
|
||||
ch = 0;
|
||||
goto CALL_ERROR_FUNCTION;
|
||||
}
|
||||
|
||||
/*The byte sequence is longer than the buffer area passed*/
|
||||
source = (const uint8_t *)args->source;
|
||||
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
|
||||
{
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
else
|
||||
{
|
||||
isLegalSequence = 1;
|
||||
ch = myByte << 6;
|
||||
switch(extraBytesToWrite)
|
||||
{
|
||||
/* note: code falls through cases! (sic)*/
|
||||
case 6:
|
||||
ch += (myByte = *source++);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
case 5:
|
||||
ch += (myByte = *source++);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
ch += (myByte = *source++);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
ch += (myByte = *source++);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
ch += (myByte = *source++);
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
ch -= offsetsFromUTF8[extraBytesToWrite];
|
||||
args->source = (const char *)source;
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite]) {
|
||||
if(isCESU8) {
|
||||
if(extraBytesToWrite <= 3) {
|
||||
if( UTF_IS_FIRST_SURROGATE(ch) &&
|
||||
(const char *)(source + 3) <= args->sourceLimit &&
|
||||
source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80
|
||||
) {
|
||||
/* ch is a lead surrogate followed by a trail surrogate */
|
||||
ch = (ch << 10) +
|
||||
((source[1] & 0xf) << 6) + (source[2] & 0x3f) -
|
||||
((0xd800 << 10) - 0x10000);
|
||||
args->source = (const char *)(source + 3);
|
||||
}
|
||||
return ch; /* return the code point */
|
||||
}
|
||||
/* illegal CESU-8 */
|
||||
} else {
|
||||
if(!UTF_IS_SURROGATE(ch)) {
|
||||
return ch; /* return the code point */
|
||||
}
|
||||
/* illegal UTF-8 */
|
||||
}
|
||||
}
|
||||
|
||||
CALL_ERROR_FUNCTION:
|
||||
extraBytesToWrite = (uint16_t)(args->source - sourceInitial);
|
||||
args->converter->invalidCharLength = (uint8_t)extraBytesToWrite;
|
||||
uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
|
||||
|
||||
myUCharPtr = buffer;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
args->target = myUCharPtr;
|
||||
args->targetLimit = buffer + 2;
|
||||
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
|
||||
args,
|
||||
sourceInitial,
|
||||
extraBytesToWrite,
|
||||
UCNV_ILLEGAL,
|
||||
err);
|
||||
|
||||
if(U_SUCCESS(*err)) {
|
||||
extraBytesToWrite = (uint16_t)(args->target - buffer);
|
||||
if(extraBytesToWrite > 0) {
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite);
|
||||
}
|
||||
/* else (callback did not write anything) continue */
|
||||
} else if(*err == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*err = U_ZERO_ERROR;
|
||||
return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2);
|
||||
} else {
|
||||
/* break on error */
|
||||
/* ### what if a callback set an error but _also_ generated output?! */
|
||||
return 0xffff;
|
||||
}
|
||||
/* no input */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/* no input or only skipping callback calls */
|
||||
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
myByte = (uint8_t)*(source++);
|
||||
if (myByte < 0x80)
|
||||
{
|
||||
args->source = (const char *)source;
|
||||
return (UChar32)myByte;
|
||||
}
|
||||
|
||||
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
|
||||
if (extraBytesToWrite == 0) {
|
||||
cnv->toUBytes[0] = myByte;
|
||||
cnv->toULength = 1;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
args->source = (const char *)source;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/*The byte sequence is longer than the buffer area passed*/
|
||||
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
|
||||
{
|
||||
/* check if all of the remaining bytes are trail bytes */
|
||||
cnv->toUBytes[0] = myByte;
|
||||
i = 1;
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
while(source < (const uint8_t *)args->sourceLimit) {
|
||||
if(U8_IS_TRAIL(myByte = *source)) {
|
||||
cnv->toUBytes[i++] = myByte;
|
||||
++source;
|
||||
} else {
|
||||
/* error even before we run out of input */
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
}
|
||||
cnv->toULength = i;
|
||||
args->source = (const char *)source;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
isLegalSequence = 1;
|
||||
ch = myByte << 6;
|
||||
switch(extraBytesToWrite)
|
||||
{
|
||||
/* note: code falls through cases! (sic)*/
|
||||
case 6:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
case 5:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
case 4:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
case 3:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
case 2:
|
||||
ch += (myByte = *source);
|
||||
if (!UTF8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
};
|
||||
ch -= offsetsFromUTF8[extraBytesToWrite];
|
||||
args->source = (const char *)source;
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (isLegalSequence &&
|
||||
(uint32_t)ch <= MAXIMUM_UTF &&
|
||||
(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
|
||||
!U_IS_SURROGATE(ch)
|
||||
) {
|
||||
return ch; /* return the code point */
|
||||
}
|
||||
|
||||
for(i = 0; sourceInitial < source; ++i) {
|
||||
cnv->toUBytes[i] = *sourceInitial++;
|
||||
}
|
||||
cnv->toULength = i;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
@ -884,6 +775,29 @@ const UConverterSharedData _UTF8Data={
|
||||
|
||||
/* CESU-8 converter data ---------------------------------------------------- */
|
||||
|
||||
static const UConverterImpl _CESU8Impl={
|
||||
UCNV_CESU8,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
|
||||
T_UConverter_toUnicode_UTF8,
|
||||
T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,
|
||||
T_UConverter_fromUnicode_UTF8,
|
||||
T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
|
||||
NULL,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
ucnv_getCompleteUnicodeSet
|
||||
};
|
||||
|
||||
static const UConverterStaticData _CESU8StaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"CESU-8",
|
||||
@ -897,6 +811,6 @@ static const UConverterStaticData _CESU8StaticData={
|
||||
|
||||
const UConverterSharedData _CESU8Data={
|
||||
sizeof(UConverterSharedData), ~((uint32_t) 0),
|
||||
NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
|
||||
NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
|
||||
0
|
||||
};
|
||||
|
@ -262,62 +262,22 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
}
|
||||
else if(targetUniChar>=0xfffe){
|
||||
SAVE_STATE:
|
||||
{
|
||||
const char *saveSource = args->source;
|
||||
UChar *saveTarget = args->target;
|
||||
int32_t *saveOffsets = args->offsets;
|
||||
|
||||
UConverterCallbackReason reason;
|
||||
int32_t currentOffset ;
|
||||
int32_t saveIndex = (int32_t)(myTarget - args->target);
|
||||
|
||||
args->converter->invalidCharLength=0;
|
||||
|
||||
if(targetUniChar == 0xfffe){
|
||||
reason = UCNV_UNASSIGNED;
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
else{
|
||||
reason = UCNV_ILLEGAL;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
if(myData->isStateDBCS){
|
||||
|
||||
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[0]-0x80);
|
||||
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[1]-0x80);
|
||||
currentOffset= (int32_t)(mySource - args->source -2);
|
||||
|
||||
}
|
||||
else{
|
||||
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)mySourceChar;
|
||||
currentOffset= (int32_t)(mySource - args->source -1);
|
||||
}
|
||||
args->offsets = args->offsets?args->offsets+(myTarget - args->target):0;
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
myTarget = saveTarget;
|
||||
args->converter->fromCharErrorBehaviour (
|
||||
args->converter->toUContext,
|
||||
args,
|
||||
args->converter->invalidCharBuffer,
|
||||
args->converter->invalidCharLength,
|
||||
reason,
|
||||
err);
|
||||
|
||||
if(args->offsets){
|
||||
args->offsets = saveOffsets;
|
||||
|
||||
for (;saveIndex < (args->target - myTarget);saveIndex++) {
|
||||
args->offsets[saveIndex] += currentOffset;
|
||||
}
|
||||
}
|
||||
args->source = saveSource;
|
||||
myTarget = args->target;
|
||||
args->target = saveTarget;
|
||||
args->offsets = saveOffsets;
|
||||
if(U_FAILURE(*err))
|
||||
break;
|
||||
if(targetUniChar == 0xfffe){
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
else{
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
if(myData->isStateDBCS){
|
||||
args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
|
||||
args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
|
||||
args->converter->toULength=2;
|
||||
}
|
||||
else{
|
||||
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
||||
args->converter->toULength=1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
else{
|
||||
|
@ -1069,7 +1069,6 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
uint32_t targetUniChar = 0x0000;
|
||||
uint8_t sourceChar = 0x0000;
|
||||
UConverterDataISCII* data;
|
||||
UConverterCallbackReason reason;
|
||||
UChar32* toUnicodeStatus=NULL;
|
||||
UChar* contextCharToUnicode = NULL;
|
||||
|
||||
@ -1108,17 +1107,14 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
data->currentDeltaToUnicode = data->defDeltaToUnicode;
|
||||
data->currentMaskToUnicode = data->defMaskToUnicode;
|
||||
}else{
|
||||
|
||||
if((sourceChar >= 0x21 && sourceChar <= 0x3F)){
|
||||
/* these are display codes consume and continue */
|
||||
}else{
|
||||
*err =U_ILLEGAL_CHAR_FOUND;
|
||||
/* reset */
|
||||
*contextCharToUnicode=NO_CHAR_MARKER;
|
||||
reason = UCNV_ILLEGAL;
|
||||
goto CALLBACK;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* reset */
|
||||
@ -1148,11 +1144,9 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
/* byte unit is unassigned */
|
||||
targetUniChar = missingCharMarker;
|
||||
*err= U_INVALID_CHAR_FOUND;
|
||||
reason = UCNV_UNASSIGNED;
|
||||
}else{
|
||||
/* only 0xA1 - 0xEE are legal after EXT char */
|
||||
*contextCharToUnicode= NO_CHAR_MARKER;
|
||||
reason= UCNV_ILLEGAL;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
goto CALLBACK;
|
||||
@ -1260,49 +1254,11 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
/* we reach here only if targetUniChar == missingCharMarker
|
||||
* so assign codes to reason and err
|
||||
*/
|
||||
reason = UCNV_UNASSIGNED;
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
CALLBACK:
|
||||
{
|
||||
const char *saveSource = args->source;
|
||||
UChar *saveTarget = args->target;
|
||||
int32_t *saveOffsets = NULL;
|
||||
int32_t currentOffset = (int32_t)(source - args->source -1);
|
||||
int32_t saveIndex = (int32_t)(target - args->target);
|
||||
|
||||
args->converter->invalidCharLength=0;
|
||||
|
||||
args->converter->invalidCharBuffer[args->converter->invalidCharLength++] =
|
||||
(char) sourceChar;
|
||||
|
||||
if(args->offsets){
|
||||
saveOffsets=args->offsets;
|
||||
args->offsets = args->offsets+(target - args->target);
|
||||
}
|
||||
|
||||
args->target =target;
|
||||
target =saveTarget;
|
||||
args->source = source;
|
||||
|
||||
args->converter->fromCharErrorBehaviour (
|
||||
args->converter->toUContext,
|
||||
args,
|
||||
args->converter->invalidCharBuffer,
|
||||
args->converter->invalidCharLength,
|
||||
reason,
|
||||
err);
|
||||
|
||||
if(args->offsets){
|
||||
args->offsets = saveOffsets;
|
||||
|
||||
for (;saveIndex < (args->target - target);saveIndex++) {
|
||||
*(args->offsets)++ = currentOffset;
|
||||
}
|
||||
}
|
||||
target=args->target;
|
||||
args->source = saveSource;
|
||||
args->target = saveTarget;
|
||||
}
|
||||
args->converter->toUBytes[0] = (uint8_t) sourceChar;
|
||||
args->converter->toULength = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1312,7 +1268,7 @@ CALLBACK:
|
||||
}
|
||||
}
|
||||
|
||||
if(args->flush && source == sourceLimit) {
|
||||
if(U_SUCCESS(*err) && args->flush && source == sourceLimit) {
|
||||
/* end of the input stream */
|
||||
UConverter *cnv = args->converter;
|
||||
|
||||
|
@ -482,10 +482,10 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
}
|
||||
|
||||
if(c>0x7f) {
|
||||
/* callback(illegal); copy the current bytes to invalidCharBuffer */
|
||||
/* callback(illegal); copy the current bytes to toUBytes[] */
|
||||
UConverter *cnv=pArgs->converter;
|
||||
cnv->invalidCharBuffer[0]=c;
|
||||
cnv->invalidCharLength=1;
|
||||
cnv->toUBytes[0]=c;
|
||||
cnv->toULength=1;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
} else if(source<sourceLimit && target>=pArgs->targetLimit) {
|
||||
/* target is full */
|
||||
@ -511,62 +511,25 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
static UChar32
|
||||
_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
const uint8_t *source;
|
||||
uint8_t b;
|
||||
|
||||
/* set up the local pointers */
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/* conversion loop */
|
||||
while(source<(const uint8_t *)pArgs->sourceLimit) {
|
||||
if(source<(const uint8_t *)pArgs->sourceLimit) {
|
||||
b=*source++;
|
||||
pArgs->source=(const char *)source;
|
||||
if(b<=0x7f) {
|
||||
return b;
|
||||
} else {
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
UConverter *cnv=pArgs->converter;
|
||||
|
||||
/* callback(illegal) */
|
||||
cnv->toUBytes[0]=b;
|
||||
cnv->toULength=1;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
|
||||
/* update the arguments structure */
|
||||
pArgs->target=buffer;
|
||||
pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
|
||||
|
||||
/* copy the current byte to invalidCharBuffer */
|
||||
cnv->invalidCharBuffer[0]=(char)b;
|
||||
cnv->invalidCharLength=1;
|
||||
|
||||
/* call the callback function */
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
|
||||
|
||||
/* update the source pointer */
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/*
|
||||
* return the first character if the callback wrote some
|
||||
* we do not need to goto finish because the converter state is already set
|
||||
*/
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
int32_t length=pArgs->target-buffer;
|
||||
if(length>0) {
|
||||
return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
|
||||
}
|
||||
/* else (callback did not write anything) continue */
|
||||
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
|
||||
} else {
|
||||
/* break on error */
|
||||
/* ### what if a callback set an error but _also_ generated output?! */
|
||||
return 0xffff;
|
||||
}
|
||||
return 0xffff;
|
||||
}
|
||||
}
|
||||
|
||||
/* no output because of empty input or only skipping callbacks */
|
||||
/* no output because of empty input */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
@ -542,21 +542,11 @@ fastUnicode:
|
||||
}
|
||||
endloop:
|
||||
|
||||
/* set the converter state back into UConverter */
|
||||
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* copy the input sequence into the error buffer */
|
||||
int8_t i;
|
||||
|
||||
for(i=0; i<cnv->toULength; ++i) {
|
||||
cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i];
|
||||
}
|
||||
cnv->invalidCharLength=i;
|
||||
|
||||
/* reset to deal with the next character */
|
||||
state=readCommand;
|
||||
}
|
||||
|
||||
/* set the converter state back into UConverter */
|
||||
if(state==readCommand) {
|
||||
} else if(state==readCommand) {
|
||||
/* not in a multi-byte sequence, reset toULength */
|
||||
cnv->toULength=0;
|
||||
}
|
||||
@ -845,21 +835,11 @@ fastUnicode:
|
||||
}
|
||||
endloop:
|
||||
|
||||
/* set the converter state back into UConverter */
|
||||
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* copy the input sequence into the error buffer */
|
||||
int8_t i;
|
||||
|
||||
for(i=0; i<cnv->toULength; ++i) {
|
||||
cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i];
|
||||
}
|
||||
cnv->invalidCharLength=i;
|
||||
|
||||
/* reset to deal with the next character */
|
||||
state=readCommand;
|
||||
}
|
||||
|
||||
/* set the converter state back into UConverter */
|
||||
if(state==readCommand) {
|
||||
} else if(state==readCommand) {
|
||||
/* not in a multi-byte sequence, reset toULength */
|
||||
cnv->toULength=0;
|
||||
}
|
||||
@ -2032,7 +2012,13 @@ static const UConverterStaticData _SCSUStaticData={
|
||||
0, /* CCSID for SCSU */
|
||||
UCNV_IBM, UCNV_SCSU,
|
||||
1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
|
||||
{ 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
|
||||
/*
|
||||
* ### TODO the subchar really must be written by an SCSU function
|
||||
* however, currently SCSU's fromUnicode() never causes errors, therefore
|
||||
* no callbacks will be called and no subchars written
|
||||
* See Jitterbug 2837 - RFE: forbid converting surrogate code points in all charsets
|
||||
*/
|
||||
{ 0x0e, 0xff, 0xfd, 0 }, 3,
|
||||
FALSE, FALSE,
|
||||
0,
|
||||
0,
|
||||
@ -2044,5 +2030,3 @@ const UConverterSharedData _SCSUData={
|
||||
NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
|
||||
0
|
||||
};
|
||||
|
||||
/* ### clarify: if an error occurs, does a converter reset itself? or is it in a defined or undefined state? */
|
||||
|
Loading…
Reference in New Issue
Block a user