diff --git a/icu4c/source/common/ucnv_lmb.c b/icu4c/source/common/ucnv_lmb.c index e06351c5ac..24153b406c 100644 --- a/icu4c/source/common/ucnv_lmb.c +++ b/icu4c/source/common/ucnv_lmb.c @@ -518,7 +518,7 @@ static const UConverterImpl _LMBCSImpl##n={\ _LMBCSToUnicodeWithOffsets,\ _LMBCSFromUnicode,\ _LMBCSFromUnicode,\ - _LMBCSGetNextUChar,\ + NULL,\ NULL,\ NULL,\ NULL,\ @@ -930,16 +930,6 @@ _LMBCSFromUnicode(UConverterFromUnicodeArgs* args, /* Now, the Unicode from LMBCS section */ -/* - Special codes for the getNextUnicodeWorker -- usually as the result of - special error-callback behavior: - ULMBCS_SKIP To control skipping over LMBCS sequences - ULMBCS_MULTI To indicate that a single LMBCS char translates to - multiple uniChars -*/ -#define ULMBCS_SKIP U_ERROR_LIMIT -#define ULMBCS_MULTI ULMBCS_SKIP+1 - /* A function to call when we are looking at the Unicode group byte in LMBCS */ static UChar GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte stream */ @@ -958,26 +948,22 @@ GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode /* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index' - bytes left in source up to sourceLimit.Errors appropriately if not + bytes left in source up to sourceLimit.Errors appropriately if not. + If we reach the limit, then update the source pointer to there to consume + all input as required by ICU converter semantics. */ #define CHECK_SOURCE_LIMIT(index) \ if (args->source+index > args->sourceLimit){\ *err = U_TRUNCATED_CHAR_FOUND;\ - args->source = saveSource;\ + args->source = args->sourceLimit;\ return 0xffff;} -/* Return the Unicode representation for the current LMBCS character - - This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode(). - The last parameter says whether the return value should be treated as UTF-16 or - UTF-32. The only difference is in surrogate handling -*/ +/* Return the Unicode representation for the current LMBCS character */ static UChar32 _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args, - UErrorCode* err, - UBool returnUTF32) + UErrorCode* err) { UChar32 uniChar = 0; /* an output UNICODE char */ ulmbcs_byte_t CurByte; /* A byte from the input stream */ @@ -1027,20 +1013,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args, else if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */ { - UChar second; CHECK_SOURCE_LIMIT(2); - uniChar = GetUniFromLMBCSUni(&(args->source)); - - /* at this point we are usually done, but we need to make sure we are not in - a situation where we can successfully put together a surrogate pair */ - - if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit) - && *(args->source)++ == ULMBCS_GRP_UNICODE - && UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source)))) - { - uniChar = UTF16_GET_PAIR_VALUE(uniChar, second); - } + /* don't check for error indicators fffe/ffff below */ + return GetUniFromLMBCSUni(&(args->source)); } else if (CurByte <= ULMBCS_CTRLOFFSET) { @@ -1126,69 +1102,10 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args, } } } - if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */ - { - UConverterToUnicodeArgs cbArgs = *args; - UConverterCallbackReason reason; - UChar UCh; - - if (uniChar == 0xfffe) - { - reason = UCNV_UNASSIGNED; - *err = U_INVALID_CHAR_FOUND; - } - else - { - reason = UCNV_ILLEGAL; - *err = U_ILLEGAL_CHAR_FOUND; - } - - cbArgs.target = &UCh; - cbArgs.targetLimit = &UCh + 1; - cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext, - &cbArgs, - saveSource, - args->source - saveSource, - reason, - err); - - if (cbArgs.target != &UCh) - { - uniChar = (UChar32) UCh; - } - /* Did error functor skip */ - if (U_SUCCESS(*err) && cbArgs.target == &UCh) - { - *err = ULMBCS_SKIP; - } - /* Did error functor try to write multiple UChars? */ - else if (*err == U_BUFFER_OVERFLOW_ERROR) - { - *err = ULMBCS_MULTI; - } - } return uniChar; } -/* The exported function that gets one UTF32 character from a LMBCS stream -*/ -static UChar32 -_LMBCSGetNextUChar(UConverterToUnicodeArgs* args, - UErrorCode* err) -{ - UChar32 nextUChar; - do { - nextUChar = _LMBCSGetNextUCharWorker(args, err, TRUE); - } while (*err == ULMBCS_SKIP); - - if (*err == ULMBCS_MULTI) - { - *err = U_ZERO_ERROR; - } - return nextUChar; -} - /* The exported function that converts lmbcs to one or more UChars - currently UTF-16 */ @@ -1196,28 +1113,24 @@ static void _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args, UErrorCode* err) { + char LMBCS [ULMBCS_CHARSIZE_MAX]; UChar uniChar; /* one output UNICODE char */ - const char * saveSource = args->source; /* beginning of current code point */ + const char * saveSource; /* beginning of current code point */ const char * pStartLMBCS = args->source; /* beginning of whole string */ + const char * errSource = NULL; /* pointer to actual input in case an error occurs */ + int8_t savebytes = 0; - if (args->targetLimit == args->target) /* error check may belong in common code */ - { - *err = U_BUFFER_OVERFLOW_ERROR; - return; - } - /* Process from source to limit, or until error */ - while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target) + while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target) { saveSource = args->source; /* beginning of current code point */ if (args->converter->toULength) /* reassemble char from previous call */ { - char LMBCS [ULMBCS_CHARSIZE_MAX]; - const char *pLMBCS = LMBCS, *saveSourceLimit; + const char *saveSourceLimit; size_t size_old = args->converter->toULength; - /* limit from source is either reminder of temp buffer, or user limit on source */ + /* limit from source is either remainder of temp buffer, or user limit on source */ size_t size_new_maybe_1 = sizeof(LMBCS) - size_old; size_t size_new_maybe_2 = args->sourceLimit - args->source; size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2; @@ -1226,18 +1139,16 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args, uprv_memcpy(LMBCS, args->converter->toUBytes, size_old); uprv_memcpy(LMBCS + size_old, args->source, size_new); saveSourceLimit = args->sourceLimit; - args->source = pLMBCS; - args->sourceLimit = pLMBCS+size_old+size_new; - uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE); - pLMBCS = args->source; - args->source =saveSource; + args->source = errSource = LMBCS; + args->sourceLimit = LMBCS+size_old+size_new; + savebytes = (int8_t)(size_old+size_new); + uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err); + args->source = saveSource + ((args->source - LMBCS) - size_old); args->sourceLimit = saveSourceLimit; - args->source += (pLMBCS - LMBCS - size_old); if (*err == U_TRUNCATED_CHAR_FOUND) { /* evil special case: source buffers so small a char spans more than 2 buffers */ - int8_t savebytes = (int8_t)(size_old+size_new); args->converter->toULength = savebytes; uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes); args->source = args->sourceLimit; @@ -1252,7 +1163,9 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args, } else { - uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE); + errSource = saveSource; + uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err); + savebytes = (int8_t)(args->source - saveSource); } if (U_SUCCESS(*err)) { @@ -1273,52 +1186,22 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args, *err = U_ILLEGAL_CHAR_FOUND; } } - else if (*err == ULMBCS_MULTI) - { - UChar * pUChar = args->converter->UCharErrorBuffer; - int8_t BufferLength = args->converter->UCharErrorBufferLength; - - *err = U_ZERO_ERROR; - do - { /* error functor wants to write multiple UniChars */ - *(args->target)++ = uniChar; - if(args->offsets) - { - *(args->offsets)++ = saveSource - pStartLMBCS; - } - uniChar = *pUChar++; - } - while(BufferLength-- && args->targetLimit > args->target); - - if (++BufferLength > 0) - { /* fix up remaining UChars that can't fit in caller's buffer */ - uprv_memmove( args->converter->UCharErrorBuffer, - args->converter->UCharErrorBuffer + args->converter->UCharErrorBufferLength - BufferLength, - sizeof(UChar) * BufferLength); - } - args->converter->UCharErrorBufferLength = BufferLength; - } - else if (*err == ULMBCS_SKIP) - { - *err = U_ZERO_ERROR; /* and just go around again..*/ - } } /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */ if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target) { *err = U_BUFFER_OVERFLOW_ERROR; } - - /* If character incomplete, store away partial char if more to come */ - if (*err == U_TRUNCATED_CHAR_FOUND) + else if (U_FAILURE(*err)) { - args->source = args->sourceLimit; - { - int8_t savebytes = (int8_t)(args->sourceLimit - saveSource); - args->converter->toULength = (int8_t)savebytes; - uprv_memcpy(args->converter->toUBytes, saveSource, savebytes); - *err = U_ZERO_ERROR; - } + /* If character incomplete or unmappable/illegal, store it in toUBytes[] */ + args->converter->toULength = savebytes; + if (savebytes > 0) { + uprv_memcpy(args->converter->toUBytes, errSource, savebytes); + } + if (*err == U_TRUNCATED_CHAR_FOUND) { + *err = U_ZERO_ERROR; + } } } diff --git a/icu4c/source/common/ucnv_u16.c b/icu4c/source/common/ucnv_u16.c index 04519aebc6..7727546673 100644 --- a/icu4c/source/common/ucnv_u16.c +++ b/icu4c/source/common/ucnv_u16.c @@ -359,52 +359,51 @@ _UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, # define _UTF16LEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets #endif -static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args, - UErrorCode* err) -{ - UChar32 myUChar; - uint16_t first; - /*Checks boundaries and set appropriate error codes*/ - if (args->source+2 > args->sourceLimit) - { - if (args->source >= args->sourceLimit) - { - /*Either caller has reached the end of the byte stream*/ - *err = U_INDEX_OUTOFBOUNDS_ERROR; - } - else - { - /* a character was cut in half*/ - *err = U_TRUNCATED_CHAR_FOUND; - } +static UChar32 +_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { + const uint8_t *s, *sourceLimit; + UChar32 c; + + s=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + + if(s>=sourceLimit) { + /* no input */ + *err=U_INDEX_OUTOFBOUNDS_ERROR; return 0xffff; } - /*Gets the corresponding codepoint*/ - first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1))); - myUChar = first; - args->source += 2; + if(s+2>sourceLimit) { + /* only one byte: truncated UChar */ + pArgs->converter->toUBytes[0]=*s++; + pArgs->converter->toULength=1; + pArgs->source=(const char *)s; + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } - if(UTF_IS_FIRST_SURROGATE(first)) { - uint16_t second; + /* get one UChar */ + c=((UChar32)*s<<8)|s[1]; + s+=2; - if (args->source+2 > args->sourceLimit) { - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } + /* + * check for surrogate pairs + * surrogate code points are not currently considered an error + * TODO see Jitterbug 1838 + */ + if(U16_IS_LEAD(c) && s+2<=sourceLimit) { + UChar trail; - /* get the second surrogate and assemble the code point */ - second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1))); - - /* ignore unmatched surrogates and just deliver the first one in such a case */ - if(UTF_IS_SECOND_SURROGATE(second)) { - /* matched pair, get pair value */ - myUChar = UTF16_GET_PAIR_VALUE(first, second); - args->source += 2; + /* get a second UChar and see if it is a trail surrogate */ + trail=((UChar)*s<<8)|s[1]; + if(U16_IS_TRAIL(trail)) { + c=U16_GET_SUPPLEMENTARY(c, trail); + s+=2; } } - return myUChar; + pArgs->source=(const char *)s; + return c; } static const UConverterImpl _UTF16BEImpl={ @@ -421,7 +420,7 @@ static const UConverterImpl _UTF16BEImpl={ _UTF16BEToUnicodeWithOffsets, _UTF16BEFromUnicodeWithOffsets, _UTF16BEFromUnicodeWithOffsets, - T_UConverter_getNextUChar_UTF16_BE, + _UTF16BEGetNextUChar, NULL, NULL, @@ -450,57 +449,51 @@ const UConverterSharedData _UTF16BEData={ /* UTF-16LE ----------------------------------------------------------------- */ -static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args, - UErrorCode* err) -{ - UChar32 myUChar; - uint16_t first; - /*Checks boundaries and set appropriate error codes*/ - if (args->source+2 > args->sourceLimit) - { - if (args->source >= args->sourceLimit) - { - /*Either caller has reached the end of the byte stream*/ - *err = U_INDEX_OUTOFBOUNDS_ERROR; - } - else - { - /* a character was cut in half*/ - *err = U_TRUNCATED_CHAR_FOUND; - } +static UChar32 +_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { + const uint8_t *s, *sourceLimit; + UChar32 c; + s=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + + if(s>=sourceLimit) { + /* no input */ + *err=U_INDEX_OUTOFBOUNDS_ERROR; return 0xffff; } - /*Gets the corresponding codepoint*/ - first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source)))); - myUChar=first; - /*updates the source*/ - args->source += 2; + if(s+2>sourceLimit) { + /* only one byte: truncated UChar */ + pArgs->converter->toUBytes[0]=*s++; + pArgs->converter->toULength=1; + pArgs->source=(const char *)s; + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } - if (UTF_IS_FIRST_SURROGATE(first)) - { - uint16_t second; + /* get one UChar */ + c=((UChar32)s[1]<<8)|*s; + s+=2; - if (args->source+2 > args->sourceLimit) - { - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } + /* + * check for surrogate pairs + * surrogate code points are not currently considered an error + * TODO see Jitterbug 1838 + */ + if(U16_IS_LEAD(c) && s+2<=sourceLimit) { + UChar trail; - /* get the second surrogate and assemble the code point */ - second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source)))); - - /* ignore unmatched surrogates and just deliver the first one in such a case */ - if(UTF_IS_SECOND_SURROGATE(second)) - { - /* matched pair, get pair value */ - myUChar = UTF16_GET_PAIR_VALUE(first, second); - args->source += 2; + /* get a second UChar and see if it is a trail surrogate */ + trail=((UChar)s[1]<<8)|*s; + if(U16_IS_TRAIL(trail)) { + c=U16_GET_SUPPLEMENTARY(c, trail); + s+=2; } } - return myUChar; + pArgs->source=(const char *)s; + return c; } static const UConverterImpl _UTF16LEImpl={ @@ -517,7 +510,7 @@ static const UConverterImpl _UTF16LEImpl={ _UTF16LEToUnicodeWithOffsets, _UTF16LEFromUnicodeWithOffsets, _UTF16LEFromUnicodeWithOffsets, - T_UConverter_getNextUChar_UTF16_LE, + _UTF16LEGetNextUChar, NULL, NULL, @@ -725,9 +718,9 @@ _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { switch(pArgs->converter->mode) { case 8: - return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode); + return _UTF16BEGetNextUChar(pArgs, pErrorCode); case 9: - return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode); + return _UTF16LEGetNextUChar(pArgs, pErrorCode); default: return UCNV_GET_NEXT_UCHAR_USE_TO_U; } diff --git a/icu4c/source/common/ucnv_u32.c b/icu4c/source/common/ucnv_u32.c index f845e3084f..f6590c178c 100644 --- a/icu4c/source/common/ucnv_u32.c +++ b/icu4c/source/common/ucnv_u32.c @@ -35,62 +35,6 @@ /* -SURROGATE_LOW_START + HALF_BASE */ #define SURROGATE_LOW_BASE 9216 -/** - * Calls invalid char callback when an invalid character sequence is encountered. - * It presumes that the converter has a callback to call. - * - * @returns true when callback fails - */ -static UBool -T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args, - UConverterCallbackReason reason, - UErrorCode *err) -{ - UConverter *converter = args->converter; - - if (U_SUCCESS(*err)) - { - if (reason == UCNV_ILLEGAL) { - *err = U_ILLEGAL_CHAR_FOUND; - } else { - *err = U_INVALID_CHAR_FOUND; - } - } - - /* copy the toUBytes to the invalidCharBuffer */ - uprv_memcpy(converter->invalidCharBuffer, - converter->toUBytes, - converter->invalidCharLength); - - /* Call the ErrorFunction */ - args->converter->fromCharErrorBehaviour(converter->toUContext, - args, - converter->invalidCharBuffer, - converter->invalidCharLength, - reason, - err); - - return (UBool)U_FAILURE(*err); -} - -static UBool -T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args, - int32_t currentOffset, - UConverterCallbackReason reason, - UErrorCode *err) -{ - int32_t *saveOffsets = args->offsets; - UBool result; - - result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err); - - while (saveOffsets < args->offsets) - { - *(saveOffsets++) = currentOffset; - } - return result; -} - /* UTF-32BE ----------------------------------------------------------------- */ static void @@ -166,17 +110,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->converter->invalidCharLength = (int8_t)i; - if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - break; - } - args->converter->invalidCharLength = 0; - mySource = (unsigned char *) args->source; - myTarget = args->target; + args->converter->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } @@ -268,19 +204,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->converter->invalidCharLength = (int8_t)i; - args->offsets = myOffsets; - if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - break; - } - args->converter->invalidCharLength = 0; - mySource = (unsigned char *) args->source; - myTarget = args->target; - myOffsets = args->offsets; + args->converter->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; } offsetNum += i; } @@ -464,65 +390,44 @@ static UChar32 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, UErrorCode* err) { - UChar myUCharBuf[2]; - UChar *myUCharPtr; - const unsigned char *mySource; + const uint8_t *mySource; UChar32 myUChar; int32_t length; - while (args->source < args->sourceLimit) + mySource = (const uint8_t *)args->source; + if (mySource >= (const uint8_t *)args->sourceLimit) { - if (args->source + 4 > args->sourceLimit) - { - /* got a partial character */ - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } - - /* Don't even try to do a direct cast because the value may be on an odd address. */ - mySource = (unsigned char *) args->source; - myUChar = (mySource[0] << 24) - | (mySource[1] << 16) - | (mySource[2] << 8) - | (mySource[3]); - - args->source = (const char *)(mySource + 4); - if (myUChar <= MAXIMUM_UTF && myUChar >= 0) { - return myUChar; - } - - uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4); - args->converter->invalidCharLength = 4; - - myUCharPtr = myUCharBuf; - *err = U_ILLEGAL_CHAR_FOUND; - args->target = myUCharPtr; - args->targetLimit = myUCharBuf + 2; - args->converter->fromCharErrorBehaviour(args->converter->toUContext, - args, - (const char *)mySource, - 4, - UCNV_ILLEGAL, - err); - - if(U_SUCCESS(*err)) { - length = (uint16_t)(args->target - myUCharBuf); - if(length > 0) { - return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length); - } - /* else (callback did not write anything) continue */ - } else if(*err == U_BUFFER_OVERFLOW_ERROR) { - *err = U_ZERO_ERROR; - return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2); - } else { - /* break on error */ - /* ### what if a callback set an error but _also_ generated output?! */ - return 0xffff; - } + /* no input */ + *err = U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; } - /* no input or only skipping callbacks */ - *err = U_INDEX_OUTOFBOUNDS_ERROR; + length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); + if (length < 4) + { + /* got a partial character */ + uprv_memcpy(args->converter->toUBytes, mySource, length); + args->converter->toULength = (int8_t)length; + args->source = (const char *)(mySource + length); + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } + + /* Don't even try to do a direct cast because the value may be on an odd address. */ + myUChar = ((UChar32)mySource[0] << 24) + | ((UChar32)mySource[1] << 16) + | ((UChar32)mySource[2] << 8) + | ((UChar32)mySource[3]); + + args->source = (const char *)(mySource + 4); + if ((uint32_t)myUChar <= MAXIMUM_UTF) { + return myUChar; + } + + uprv_memcpy(args->converter->toUBytes, mySource, 4); + args->converter->toULength = 4; + + *err = U_ILLEGAL_CHAR_FOUND; return 0xffff; } @@ -643,17 +548,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->converter->invalidCharLength = (int8_t)i; - if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - break; - } - args->converter->invalidCharLength = 0; - mySource = (unsigned char *) args->source; - myTarget = args->target; + args->converter->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } @@ -747,19 +644,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->converter->invalidCharLength = (int8_t)i; - args->offsets = myOffsets; - if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - break; - } - args->converter->invalidCharLength = 0; - mySource = (unsigned char *) args->source; - myTarget = args->target; - myOffsets = args->offsets; + args->converter->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; } offsetNum += i; } @@ -935,65 +822,44 @@ static UChar32 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, UErrorCode* err) { - UChar myUCharBuf[2]; - UChar *myUCharPtr; - const unsigned char *mySource; + const uint8_t *mySource; UChar32 myUChar; int32_t length; - while (args->source < args->sourceLimit) + mySource = (const uint8_t *)args->source; + if (mySource >= (const uint8_t *)args->sourceLimit) { - if (args->source + 4 > args->sourceLimit) - { - /* got a partial character */ - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } - - /* Don't even try to do a direct cast because the value may be on an odd address. */ - mySource = (unsigned char *) args->source; - myUChar = (mySource[0]) - | (mySource[1] << 8) - | (mySource[2] << 16) - | (mySource[3] << 24); - - args->source = (const char *)(mySource + 4); - if (myUChar <= MAXIMUM_UTF && myUChar >= 0) { - return myUChar; - } - - uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4); - args->converter->invalidCharLength = 4; - - myUCharPtr = myUCharBuf; - *err = U_ILLEGAL_CHAR_FOUND; - args->target = myUCharPtr; - args->targetLimit = myUCharBuf + 2; - args->converter->fromCharErrorBehaviour(args->converter->toUContext, - args, - (const char *)mySource, - 4, - UCNV_ILLEGAL, - err); - - if(U_SUCCESS(*err)) { - length = (uint16_t)(args->target - myUCharBuf); - if(length > 0) { - return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length); - } - /* else (callback did not write anything) continue */ - } else if(*err == U_BUFFER_OVERFLOW_ERROR) { - *err = U_ZERO_ERROR; - return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2); - } else { - /* break on error */ - /* ### what if a callback set an error but _also_ generated output?! */ - return 0xffff; - } + /* no input */ + *err = U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; } - /* no input or only skipping callbacks */ - *err = U_INDEX_OUTOFBOUNDS_ERROR; + length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); + if (length < 4) + { + /* got a partial character */ + uprv_memcpy(args->converter->toUBytes, mySource, length); + args->converter->toULength = (int8_t)length; + args->source = (const char *)(mySource + length); + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } + + /* Don't even try to do a direct cast because the value may be on an odd address. */ + myUChar = ((UChar32)mySource[3] << 24) + | ((UChar32)mySource[2] << 16) + | ((UChar32)mySource[1] << 8) + | ((UChar32)mySource[0]); + + args->source = (const char *)(mySource + 4); + if ((uint32_t)myUChar <= MAXIMUM_UTF) { + return myUChar; + } + + uprv_memcpy(args->converter->toUBytes, mySource, 4); + args->converter->toULength = 4; + + *err = U_ILLEGAL_CHAR_FOUND; return 0xffff; } diff --git a/icu4c/source/common/ucnv_u7.c b/icu4c/source/common/ucnv_u7.c index 50121c985a..3db83393f1 100644 --- a/icu4c/source/common/ucnv_u7.c +++ b/icu4c/source/common/ucnv_u7.c @@ -22,7 +22,6 @@ /* UTF-7 -------------------------------------------------------------------- */ -/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */ /* * UTF-7 is a stateful encoding of Unicode. * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt) @@ -247,7 +246,6 @@ _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, sourceIndex=byteIndex==0 ? 0 : -1; nextSourceIndex=0; -loop: if(inDirectMode) { directMode: /* @@ -270,8 +268,8 @@ directMode: /* illegal */ bytes[0]=b; byteIndex=1; - nextSourceIndex=sourceIndex+1; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } else if(b!=PLUS) { /* write directly encoded character */ *target++=b; @@ -312,7 +310,8 @@ unicodeMode: if(b>=126) { /* illegal - test other illegal US-ASCII values by base64Value==-3 */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } else if((base64Value=fromBase64[b])>=0) { /* collect base64 bytes into UChars */ switch(base64Counter) { @@ -377,7 +376,8 @@ unicodeMode: /* absorb the minus and leave the Unicode Mode */ if(bits!=0) { /* bits are illegally left over, a UChar is incomplete */ - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } } sourceIndex=nextSourceIndex; @@ -392,7 +392,8 @@ unicodeMode: bytes[0]=PLUS; bytes[1]=b; byteIndex=2; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } else if(bits==0) { /* un-read the character in case it is a plus sign */ --source; @@ -400,12 +401,14 @@ unicodeMode: goto directMode; } else { /* bits are illegally left over, a UChar is incomplete */ - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } } else /* base64Value==-3 for illegal characters */ { /* illegal */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } } else { /* target is full */ @@ -414,7 +417,6 @@ unicodeMode: } } } -endloop: if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) { /* @@ -430,69 +432,11 @@ endloop: cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; cnv->toULength=byteIndex; -finish: /* write back the updated pointers */ pArgs->source=(const char *)source; pArgs->target=target; pArgs->offsets=offsets; return; - -callback: - /* call the callback function with all the preparations and post-processing */ - /* update the arguments structure */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - - /* copy the current bytes to invalidCharBuffer */ - for(b=0; b<(uint8_t)byteIndex; ++b) { - cnv->invalidCharBuffer[b]=(char)bytes[b]; - } - cnv->invalidCharLength=byteIndex; - - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24; - cnv->toULength=0; - - /* call the callback function */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode); - - /* get the converter state from UConverter */ - { - uint32_t status=cnv->toUnicodeStatus; - inDirectMode=(UBool)((status>>24)&1); - base64Counter=(int8_t)(status>>16); - bits=(uint16_t)status; - } - byteIndex=cnv->toULength; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex); - target=pArgs->target; - - /* update the source pointer and index */ - sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source); - source=(const uint8_t *)pArgs->source; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - goto endloop; - } else if(cnv->UCharErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } else if(U_FAILURE(*pErrorCode)) { - /* break on error */ - cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ - cnv->toULength=0; - goto finish; - } else { - goto loop; - } } static void @@ -961,7 +905,6 @@ _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, sourceIndex=byteIndex==0 ? 0 : -1; nextSourceIndex=0; -loop: if(inDirectMode) { directMode: /* @@ -983,8 +926,8 @@ directMode: /* illegal */ bytes[0]=b; byteIndex=1; - nextSourceIndex=sourceIndex+1; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } else if(b!=AMPERSAND) { /* write directly encoded character */ *target++=b; @@ -995,8 +938,7 @@ directMode: /* switch to Unicode mode */ nextSourceIndex=++sourceIndex; inDirectMode=FALSE; - bytes[0]=b; - byteIndex=1; + byteIndex=0; bits=0; base64Counter=-1; goto unicodeMode; @@ -1027,7 +969,8 @@ unicodeMode: if(b>0x7e) { /* illegal - test other illegal US-ASCII values by base64Value==-3 */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } else if((base64Value=FROM_BASE64_IMAP(b))>=0) { /* collect base64 bytes into UChars */ switch(base64Counter) { @@ -1048,7 +991,8 @@ unicodeMode: if(isLegalIMAP(c)) { /* illegal */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; } *target++=c; if(offsets!=NULL) { @@ -1065,7 +1009,8 @@ unicodeMode: if(isLegalIMAP(c)) { /* illegal */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; } *target++=c; if(offsets!=NULL) { @@ -1082,7 +1027,8 @@ unicodeMode: if(isLegalIMAP(c)) { /* illegal */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; } *target++=c; if(offsets!=NULL) { @@ -1111,7 +1057,8 @@ unicodeMode: if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) { /* bits are illegally left over, a UChar is incomplete */ /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */ - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } } sourceIndex=nextSourceIndex; @@ -1129,7 +1076,8 @@ unicodeMode: /* base64Value==-3 for illegal characters */ /* illegal */ inDirectMode=TRUE; - goto callback; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; } } else { /* target is full */ @@ -1140,73 +1088,41 @@ unicodeMode: } endloop: + /* + * the end of the input stream and detection of truncated input + * are handled by the framework, but here we must check if we are in Unicode + * mode and byteIndex==0 because we must end in direct mode + * + * conditions: + * successful + * in Unicode mode and byteIndex==0 + * end of input and no truncated input + */ + if( U_SUCCESS(*pErrorCode) && + !inDirectMode && byteIndex==0 && + pArgs->flush && source>=sourceLimit + ) { + if(base64Counter==-1) { + /* & at the very end of the input */ + /* make the ampersand the reported sequence */ + bytes[0]=AMPERSAND; + byteIndex=1; + } + /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */ + + inDirectMode=TRUE; /* avoid looping */ + *pErrorCode=U_TRUNCATED_CHAR_FOUND; + } + /* set the converter state back into UConverter */ cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; cnv->toULength=byteIndex; -finish: /* write back the updated pointers */ pArgs->source=(const char *)source; pArgs->target=target; pArgs->offsets=offsets; return; - -callback: - /* call the callback function with all the preparations and post-processing */ - /* update the arguments structure */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - - /* copy the current bytes to invalidCharBuffer */ - for(b=0; b<(uint8_t)byteIndex; ++b) { - cnv->invalidCharBuffer[b]=(char)bytes[b]; - } - cnv->invalidCharLength=byteIndex; - - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24; - cnv->toULength=0; - - /* call the callback function */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode); - - /* get the converter state from UConverter */ - { - uint32_t status=cnv->toUnicodeStatus; - inDirectMode=(UBool)((status>>24)&1); - base64Counter=(int8_t)(status>>16); - bits=(uint16_t)status; - } - byteIndex=cnv->toULength; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex); - target=pArgs->target; - - /* update the source pointer and index */ - sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source); - source=(const uint8_t *)pArgs->source; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - goto endloop; - } else if(cnv->UCharErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } else if(U_FAILURE(*pErrorCode)) { - /* break on error */ - cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ - cnv->toULength=0; - goto finish; - } else { - goto loop; - } } static void @@ -1522,7 +1438,7 @@ static const UConverterImpl _IMAPImpl={ static const UConverterStaticData _IMAPStaticData={ sizeof(UConverterStaticData), "IMAP-mailbox-name", - 0, /* TODO CCSID for UTF-7 */ + 0, /* TODO CCSID for IMAP-mailbox-name */ UCNV_IBM, UCNV_IMAP_MAILBOX, 1, 4, { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ diff --git a/icu4c/source/common/ucnv_u8.c b/icu4c/source/common/ucnv_u8.c index b7471c2759..65be9a64b2 100644 --- a/icu4c/source/common/ucnv_u8.c +++ b/icu4c/source/common/ucnv_u8.c @@ -88,64 +88,6 @@ static const int8_t bytesFromUTF8[256] = { static const uint32_t utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; -/** - * Calls invalid char callback when an invalid character sequence is encountered. - * It presumes that the converter has a callback to call. - * - * @returns true when callback fails - */ -static UBool -T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args, - UConverterCallbackReason reason, - UErrorCode *err) -{ - UConverter *converter = args->converter; - - if (U_SUCCESS(*err)) - { - if (reason == UCNV_ILLEGAL) { - *err = U_ILLEGAL_CHAR_FOUND; - } else { - *err = U_INVALID_CHAR_FOUND; - } - } - - /* copy the toUBytes to the invalidCharBuffer */ - uprv_memcpy(converter->invalidCharBuffer, - converter->toUBytes, - converter->toULength); - converter->invalidCharLength = converter->toULength; - converter->toULength = 0; - - /* Call the ErrorFunction */ - args->converter->fromCharErrorBehaviour(converter->toUContext, - args, - converter->invalidCharBuffer, - converter->invalidCharLength, - reason, - err); - - return (UBool)U_FAILURE(*err); -} - -static UBool -T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args, - int32_t currentOffset, - UConverterCallbackReason reason, - UErrorCode *err) -{ - int32_t *saveOffsets = args->offsets; - UBool result; - - result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err); - - while (saveOffsets < args->offsets) - { - *(saveOffsets++) = currentOffset; - } - return result; -} - U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args, UErrorCode * err) { @@ -159,7 +101,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args, int32_t i, inBytes; /* Restore size of current sequence */ -start: if (args->converter->toUnicodeStatus && myTarget < targetLimit) { inBytes = args->converter->mode; /* restore # of bytes to consume */ @@ -256,22 +197,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->converter->toULength = (int8_t)i; - if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - /* args and err should already be set properly */ - return; - } - - mySource = (unsigned char *) args->source; - myTarget = args->target; - - /* goto the start to handle state left behind by the callback */ - goto start; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } } @@ -302,7 +230,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs int32_t i, inBytes; /* Restore size of current sequence */ -start: if (args->converter->toUnicodeStatus && myTarget < targetLimit) { inBytes = args->converter->mode; /* restore # of bytes to consume */ @@ -399,26 +326,9 @@ morebytes: } else { - args->source = (const char *) mySource; - args->target = myTarget; - args->offsets = myOffsets; - args->converter->toULength = (int8_t)i; - if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, - offsetNum, UCNV_ILLEGAL, err)) - { - /* Stop if the error wasn't handled */ - /* args and err should already be set properly */ - return; - } - - offsetNum += i + ((unsigned char *) args->source - mySource); - mySource = (unsigned char *) args->source; - myTarget = args->target; - myOffsets = args->offsets; - - /* goto the start to handle state left behind by the callback */ - goto start; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } } @@ -683,159 +593,140 @@ lowsurrogate: U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args, UErrorCode *err) { - UChar buffer[2]; - const char *sourceInitial; + UConverter *cnv; + const uint8_t *sourceInitial; const uint8_t *source; - UChar* myUCharPtr; uint16_t extraBytesToWrite; uint8_t myByte; UChar32 ch; - int8_t isLegalSequence; - UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data); + int8_t i, isLegalSequence; - while (args->source < args->sourceLimit) + /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ + + cnv = args->converter; + sourceInitial = source = (const uint8_t *)args->source; + if (source >= (const uint8_t *)args->sourceLimit) { - sourceInitial = args->source; - myByte = (uint8_t)*(args->source++); - if (myByte < 0x80) - { - return (UChar32)myByte; - } - - extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; - if (extraBytesToWrite == 0) { - isLegalSequence = FALSE; - ch = 0; - goto CALL_ERROR_FUNCTION; - } - - /*The byte sequence is longer than the buffer area passed*/ - source = (const uint8_t *)args->source; - if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) - { - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } - else - { - isLegalSequence = 1; - ch = myByte << 6; - switch(extraBytesToWrite) - { - /* note: code falls through cases! (sic)*/ - case 6: - ch += (myByte = *source++); - ch <<= 6; - if (!UTF8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - case 5: - ch += (myByte = *source++); - ch <<= 6; - if (!UTF8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - case 4: - ch += (myByte = *source++); - ch <<= 6; - if (!UTF8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - case 3: - ch += (myByte = *source++); - ch <<= 6; - if (!UTF8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - break; - } - case 2: - ch += (myByte = *source++); - if (!UTF8_IS_TRAIL(myByte)) - { - isLegalSequence = 0; - } - }; - } - ch -= offsetsFromUTF8[extraBytesToWrite]; - args->source = (const char *)source; - - /* - * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: - * - use only trail bytes after a lead byte (checked above) - * - use the right number of trail bytes for a given lead byte - * - encode a code point <= U+10ffff - * - use the fewest possible number of bytes for their code points - * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[extraBytesToWrite]) { - if(isCESU8) { - if(extraBytesToWrite <= 3) { - if( UTF_IS_FIRST_SURROGATE(ch) && - (const char *)(source + 3) <= args->sourceLimit && - source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80 - ) { - /* ch is a lead surrogate followed by a trail surrogate */ - ch = (ch << 10) + - ((source[1] & 0xf) << 6) + (source[2] & 0x3f) - - ((0xd800 << 10) - 0x10000); - args->source = (const char *)(source + 3); - } - return ch; /* return the code point */ - } - /* illegal CESU-8 */ - } else { - if(!UTF_IS_SURROGATE(ch)) { - return ch; /* return the code point */ - } - /* illegal UTF-8 */ - } - } - -CALL_ERROR_FUNCTION: - extraBytesToWrite = (uint16_t)(args->source - sourceInitial); - args->converter->invalidCharLength = (uint8_t)extraBytesToWrite; - uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite); - - myUCharPtr = buffer; - *err = U_ILLEGAL_CHAR_FOUND; - args->target = myUCharPtr; - args->targetLimit = buffer + 2; - args->converter->fromCharErrorBehaviour(args->converter->toUContext, - args, - sourceInitial, - extraBytesToWrite, - UCNV_ILLEGAL, - err); - - if(U_SUCCESS(*err)) { - extraBytesToWrite = (uint16_t)(args->target - buffer); - if(extraBytesToWrite > 0) { - return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite); - } - /* else (callback did not write anything) continue */ - } else if(*err == U_BUFFER_OVERFLOW_ERROR) { - *err = U_ZERO_ERROR; - return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2); - } else { - /* break on error */ - /* ### what if a callback set an error but _also_ generated output?! */ - return 0xffff; - } + /* no input */ + *err = U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; } - /* no input or only skipping callback calls */ - *err = U_INDEX_OUTOFBOUNDS_ERROR; + myByte = (uint8_t)*(source++); + if (myByte < 0x80) + { + args->source = (const char *)source; + return (UChar32)myByte; + } + + extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; + if (extraBytesToWrite == 0) { + cnv->toUBytes[0] = myByte; + cnv->toULength = 1; + *err = U_ILLEGAL_CHAR_FOUND; + args->source = (const char *)source; + return 0xffff; + } + + /*The byte sequence is longer than the buffer area passed*/ + if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) + { + /* check if all of the remaining bytes are trail bytes */ + cnv->toUBytes[0] = myByte; + i = 1; + *err = U_TRUNCATED_CHAR_FOUND; + while(source < (const uint8_t *)args->sourceLimit) { + if(U8_IS_TRAIL(myByte = *source)) { + cnv->toUBytes[i++] = myByte; + ++source; + } else { + /* error even before we run out of input */ + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + cnv->toULength = i; + args->source = (const char *)source; + return 0xffff; + } + + isLegalSequence = 1; + ch = myByte << 6; + switch(extraBytesToWrite) + { + /* note: code falls through cases! (sic)*/ + case 6: + ch += (myByte = *source); + ch <<= 6; + if (!UTF8_IS_TRAIL(myByte)) + { + isLegalSequence = 0; + break; + } + ++source; + case 5: + ch += (myByte = *source); + ch <<= 6; + if (!UTF8_IS_TRAIL(myByte)) + { + isLegalSequence = 0; + break; + } + ++source; + case 4: + ch += (myByte = *source); + ch <<= 6; + if (!UTF8_IS_TRAIL(myByte)) + { + isLegalSequence = 0; + break; + } + ++source; + case 3: + ch += (myByte = *source); + ch <<= 6; + if (!UTF8_IS_TRAIL(myByte)) + { + isLegalSequence = 0; + break; + } + ++source; + case 2: + ch += (myByte = *source); + if (!UTF8_IS_TRAIL(myByte)) + { + isLegalSequence = 0; + break; + } + ++source; + }; + ch -= offsetsFromUTF8[extraBytesToWrite]; + args->source = (const char *)source; + + /* + * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: + * - use only trail bytes after a lead byte (checked above) + * - use the right number of trail bytes for a given lead byte + * - encode a code point <= U+10ffff + * - use the fewest possible number of bytes for their code points + * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[extraBytesToWrite] && + !U_IS_SURROGATE(ch) + ) { + return ch; /* return the code point */ + } + + for(i = 0; sourceInitial < source; ++i) { + cnv->toUBytes[i] = *sourceInitial++; + } + cnv->toULength = i; + *err = U_ILLEGAL_CHAR_FOUND; return 0xffff; } @@ -884,6 +775,29 @@ const UConverterSharedData _UTF8Data={ /* CESU-8 converter data ---------------------------------------------------- */ +static const UConverterImpl _CESU8Impl={ + UCNV_CESU8, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + T_UConverter_toUnicode_UTF8, + T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC, + T_UConverter_fromUnicode_UTF8, + T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC, + NULL, + + NULL, + NULL, + NULL, + NULL, + ucnv_getCompleteUnicodeSet +}; + static const UConverterStaticData _CESU8StaticData={ sizeof(UConverterStaticData), "CESU-8", @@ -897,6 +811,6 @@ static const UConverterStaticData _CESU8StaticData={ const UConverterSharedData _CESU8Data={ sizeof(UConverterSharedData), ~((uint32_t) 0), - NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl, + NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, 0 }; diff --git a/icu4c/source/common/ucnvhz.c b/icu4c/source/common/ucnvhz.c index 669edaadc8..5c19f86fb9 100644 --- a/icu4c/source/common/ucnvhz.c +++ b/icu4c/source/common/ucnvhz.c @@ -262,62 +262,22 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, } else if(targetUniChar>=0xfffe){ SAVE_STATE: - { - const char *saveSource = args->source; - UChar *saveTarget = args->target; - int32_t *saveOffsets = args->offsets; - - UConverterCallbackReason reason; - int32_t currentOffset ; - int32_t saveIndex = (int32_t)(myTarget - args->target); - - args->converter->invalidCharLength=0; - - if(targetUniChar == 0xfffe){ - reason = UCNV_UNASSIGNED; - *err = U_INVALID_CHAR_FOUND; - } - else{ - reason = UCNV_ILLEGAL; - *err = U_ILLEGAL_CHAR_FOUND; - } - if(myData->isStateDBCS){ - - args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[0]-0x80); - args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[1]-0x80); - currentOffset= (int32_t)(mySource - args->source -2); - - } - else{ - args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)mySourceChar; - currentOffset= (int32_t)(mySource - args->source -1); - } - args->offsets = args->offsets?args->offsets+(myTarget - args->target):0; - args->target = myTarget; - args->source = mySource; - myTarget = saveTarget; - args->converter->fromCharErrorBehaviour ( - args->converter->toUContext, - args, - args->converter->invalidCharBuffer, - args->converter->invalidCharLength, - reason, - err); - - if(args->offsets){ - args->offsets = saveOffsets; - - for (;saveIndex < (args->target - myTarget);saveIndex++) { - args->offsets[saveIndex] += currentOffset; - } - } - args->source = saveSource; - myTarget = args->target; - args->target = saveTarget; - args->offsets = saveOffsets; - if(U_FAILURE(*err)) - break; + if(targetUniChar == 0xfffe){ + *err = U_INVALID_CHAR_FOUND; } + else{ + *err = U_ILLEGAL_CHAR_FOUND; + } + if(myData->isStateDBCS){ + args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80); + args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80); + args->converter->toULength=2; + } + else{ + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength=1; + } + break; } } else{ diff --git a/icu4c/source/common/ucnvisci.c b/icu4c/source/common/ucnvisci.c index 6745444685..6039a3d579 100644 --- a/icu4c/source/common/ucnvisci.c +++ b/icu4c/source/common/ucnvisci.c @@ -1069,7 +1069,6 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, uint32_t targetUniChar = 0x0000; uint8_t sourceChar = 0x0000; UConverterDataISCII* data; - UConverterCallbackReason reason; UChar32* toUnicodeStatus=NULL; UChar* contextCharToUnicode = NULL; @@ -1108,17 +1107,14 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, data->currentDeltaToUnicode = data->defDeltaToUnicode; data->currentMaskToUnicode = data->defMaskToUnicode; }else{ - if((sourceChar >= 0x21 && sourceChar <= 0x3F)){ /* these are display codes consume and continue */ }else{ *err =U_ILLEGAL_CHAR_FOUND; /* reset */ *contextCharToUnicode=NO_CHAR_MARKER; - reason = UCNV_ILLEGAL; goto CALLBACK; } - } /* reset */ @@ -1148,11 +1144,9 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, /* byte unit is unassigned */ targetUniChar = missingCharMarker; *err= U_INVALID_CHAR_FOUND; - reason = UCNV_UNASSIGNED; }else{ /* only 0xA1 - 0xEE are legal after EXT char */ *contextCharToUnicode= NO_CHAR_MARKER; - reason= UCNV_ILLEGAL; *err = U_ILLEGAL_CHAR_FOUND; } goto CALLBACK; @@ -1260,49 +1254,11 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, /* we reach here only if targetUniChar == missingCharMarker * so assign codes to reason and err */ - reason = UCNV_UNASSIGNED; *err = U_INVALID_CHAR_FOUND; CALLBACK: - { - const char *saveSource = args->source; - UChar *saveTarget = args->target; - int32_t *saveOffsets = NULL; - int32_t currentOffset = (int32_t)(source - args->source -1); - int32_t saveIndex = (int32_t)(target - args->target); - - args->converter->invalidCharLength=0; - - args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = - (char) sourceChar; - - if(args->offsets){ - saveOffsets=args->offsets; - args->offsets = args->offsets+(target - args->target); - } - - args->target =target; - target =saveTarget; - args->source = source; - - args->converter->fromCharErrorBehaviour ( - args->converter->toUContext, - args, - args->converter->invalidCharBuffer, - args->converter->invalidCharLength, - reason, - err); - - if(args->offsets){ - args->offsets = saveOffsets; - - for (;saveIndex < (args->target - target);saveIndex++) { - *(args->offsets)++ = currentOffset; - } - } - target=args->target; - args->source = saveSource; - args->target = saveTarget; - } + args->converter->toUBytes[0] = (uint8_t) sourceChar; + args->converter->toULength = 1; + break; } } @@ -1312,7 +1268,7 @@ CALLBACK: } } - if(args->flush && source == sourceLimit) { + if(U_SUCCESS(*err) && args->flush && source == sourceLimit) { /* end of the input stream */ UConverter *cnv = args->converter; diff --git a/icu4c/source/common/ucnvlat1.c b/icu4c/source/common/ucnvlat1.c index 37e443ebfb..db2ca2417d 100644 --- a/icu4c/source/common/ucnvlat1.c +++ b/icu4c/source/common/ucnvlat1.c @@ -482,10 +482,10 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, } if(c>0x7f) { - /* callback(illegal); copy the current bytes to invalidCharBuffer */ + /* callback(illegal); copy the current bytes to toUBytes[] */ UConverter *cnv=pArgs->converter; - cnv->invalidCharBuffer[0]=c; - cnv->invalidCharLength=1; + cnv->toUBytes[0]=c; + cnv->toULength=1; *pErrorCode=U_ILLEGAL_CHAR_FOUND; } else if(source=pArgs->targetLimit) { /* target is full */ @@ -511,62 +511,25 @@ _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, static UChar32 _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { - UChar buffer[UTF_MAX_CHAR_LENGTH]; const uint8_t *source; uint8_t b; - /* set up the local pointers */ source=(const uint8_t *)pArgs->source; - - /* conversion loop */ - while(source<(const uint8_t *)pArgs->sourceLimit) { + if(source<(const uint8_t *)pArgs->sourceLimit) { b=*source++; pArgs->source=(const char *)source; if(b<=0x7f) { return b; } else { - /* call the callback function with all the preparations and post-processing */ UConverter *cnv=pArgs->converter; - - /* callback(illegal) */ + cnv->toUBytes[0]=b; + cnv->toULength=1; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - - /* update the arguments structure */ - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - /* copy the current byte to invalidCharBuffer */ - cnv->invalidCharBuffer[0]=(char)b; - cnv->invalidCharLength=1; - - /* call the callback function */ - cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode); - - /* update the source pointer */ - source=(const uint8_t *)pArgs->source; - - /* - * return the first character if the callback wrote some - * we do not need to goto finish because the converter state is already set - */ - if(U_SUCCESS(*pErrorCode)) { - int32_t length=pArgs->target-buffer; - if(length>0) { - return ucnv_getUChar32KeepOverflow(cnv, buffer, length); - } - /* else (callback did not write anything) continue */ - } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH); - } else { - /* break on error */ - /* ### what if a callback set an error but _also_ generated output?! */ - return 0xffff; - } + return 0xffff; } } - /* no output because of empty input or only skipping callbacks */ + /* no output because of empty input */ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0xffff; } diff --git a/icu4c/source/common/ucnvscsu.c b/icu4c/source/common/ucnvscsu.c index 6f8bf9d307..7b3901af19 100644 --- a/icu4c/source/common/ucnvscsu.c +++ b/icu4c/source/common/ucnvscsu.c @@ -542,21 +542,11 @@ fastUnicode: } endloop: + /* set the converter state back into UConverter */ if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* copy the input sequence into the error buffer */ - int8_t i; - - for(i=0; itoULength; ++i) { - cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i]; - } - cnv->invalidCharLength=i; - /* reset to deal with the next character */ state=readCommand; - } - - /* set the converter state back into UConverter */ - if(state==readCommand) { + } else if(state==readCommand) { /* not in a multi-byte sequence, reset toULength */ cnv->toULength=0; } @@ -845,21 +835,11 @@ fastUnicode: } endloop: + /* set the converter state back into UConverter */ if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* copy the input sequence into the error buffer */ - int8_t i; - - for(i=0; itoULength; ++i) { - cnv->invalidCharBuffer[i]=(char)cnv->toUBytes[i]; - } - cnv->invalidCharLength=i; - /* reset to deal with the next character */ state=readCommand; - } - - /* set the converter state back into UConverter */ - if(state==readCommand) { + } else if(state==readCommand) { /* not in a multi-byte sequence, reset toULength */ cnv->toULength=0; } @@ -2032,7 +2012,13 @@ static const UConverterStaticData _SCSUStaticData={ 0, /* CCSID for SCSU */ UCNV_IBM, UCNV_SCSU, 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ - { 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */ + /* + * ### TODO the subchar really must be written by an SCSU function + * however, currently SCSU's fromUnicode() never causes errors, therefore + * no callbacks will be called and no subchars written + * See Jitterbug 2837 - RFE: forbid converting surrogate code points in all charsets + */ + { 0x0e, 0xff, 0xfd, 0 }, 3, FALSE, FALSE, 0, 0, @@ -2044,5 +2030,3 @@ const UConverterSharedData _SCSUData={ NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, 0 }; - -/* ### clarify: if an error occurs, does a converter reset itself? or is it in a defined or undefined state? */