From b7c791ad7543fdd25217c24694a98514ed6247c0 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 19 Dec 2000 00:29:27 +0000 Subject: [PATCH] ICU-507 use common implementation for getNextUChar() where appropriate X-SVN-Rev: 3261 --- icu4c/source/common/ucnv.c | 12 +++- icu4c/source/common/ucnv2022.c | 126 +-------------------------------- icu4c/source/common/ucnv_cnv.c | 48 +++++++++++++ icu4c/source/common/ucnv_cnv.h | 26 +++++++ icu4c/source/common/ucnvhz.c | 33 +-------- icu4c/source/common/ucnvmbcs.c | 63 +---------------- 6 files changed, 88 insertions(+), 220 deletions(-) diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index ab9a4e348f..b205a230b0 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -986,8 +986,8 @@ UChar32 ucnv_getNextUChar(UConverter * converter, UTF_NEXT_CHAR(converter->UCharErrorBuffer, i, sizeof(converter->UCharErrorBuffer), myUChar); /*In this memmove we update the internal buffer by *popping the first character. 
- *Note that in the call itself we decrement - *UCharErrorBufferLength + *Note that in the call itself we decrement + *UCharErrorBufferLength */ uprv_memmove (converter->UCharErrorBuffer, converter->UCharErrorBuffer + i, @@ -1005,7 +1005,13 @@ UChar32 ucnv_getNextUChar(UConverter * converter, args.target = NULL; args.targetLimit = NULL; args.size = sizeof(args); - ch = converter->sharedData->impl->getNextUChar(&args, err); + if (converter->sharedData->impl->getNextUChar != NULL) + { + ch = converter->sharedData->impl->getNextUChar(&args, err); + } else { + /* default implementation */ + ch = ucnv_getNextUCharFromToUImpl(&args, converter->sharedData->impl->toUnicode, FALSE, err); + } *source = args.source; return ch; } diff --git a/icu4c/source/common/ucnv2022.c b/icu4c/source/common/ucnv2022.c index 2d552f1e2c..196da651d4 100644 --- a/icu4c/source/common/ucnv2022.c +++ b/icu4c/source/common/ucnv2022.c @@ -152,9 +152,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_JP(UConverterToUnicodeArgs* args, U_CFUNC void UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); -U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_JP (UConverterToUnicodeArgs * args, - UErrorCode * err); - /***************** ISO-2022-KR ********************************/ U_CFUNC void UConverter_fromUnicode_ISO_2022_KR(UConverterFromUnicodeArgs* args, UErrorCode* err); @@ -168,9 +165,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_KR(UConverterToUnicodeArgs* args, U_CFUNC void UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); -U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_KR (UConverterToUnicodeArgs * args, - UErrorCode * err); - /***************** ISO-2022-CN ********************************/ U_CFUNC void UConverter_fromUnicode_ISO_2022_CN(UConverterFromUnicodeArgs* args, UErrorCode* err); @@ -184,9 +178,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_CN(UConverterToUnicodeArgs* args, U_CFUNC void 
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); -U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_CN (UConverterToUnicodeArgs * args, - UErrorCode * err); - #define ESC_2022 0x1B /*ESC*/ typedef enum @@ -416,7 +407,7 @@ static const UConverterImpl _ISO2022JPImpl={ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, UConverter_fromUnicode_ISO_2022_JP, UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, - UConverter_getNextUChar_ISO_2022_JP, + NULL, NULL, _ISO2022getName @@ -447,7 +438,7 @@ static const UConverterImpl _ISO2022KRImpl={ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, UConverter_fromUnicode_ISO_2022_KR, UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, - UConverter_getNextUChar_ISO_2022_KR, + NULL, NULL, _ISO2022getName @@ -479,7 +470,7 @@ static const UConverterImpl _ISO2022CNImpl={ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, UConverter_fromUnicode_ISO_2022_CN, UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, - UConverter_getNextUChar_ISO_2022_CN, + NULL, NULL, _ISO2022getName @@ -1899,43 +1890,6 @@ static void concatChar(UConverterFromUnicodeArgs* args, int32_t *targetIndex, in /*************** to unicode *******************/ -/* -* This is a simple, interim implementation of GetNextUChar() -* that allows to concentrate on testing one single implementation -* of the ToUnicode conversion before it gets copied to -* multiple version that are then optimized for their needs -* (with vs. without offsets and getNextUChar). 
-*/ - -U_CFUNC UChar32 -UConverter_getNextUChar_ISO_2022_JP(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UChar buffer[UTF_MAX_CHAR_LENGTH]; - const char *realLimit=pArgs->sourceLimit; - - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - while(pArgs->source<realLimit) { - pArgs->sourceLimit=pArgs->source+1; - pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); - UConverter_toUnicode_ISO_2022_JP(pArgs, pErrorCode); - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - return 0xffff; - } else if(pArgs->target!=buffer) { - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer); - } - } - - /* no output because of empty input or only state changes and skipping callbacks */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; -} - /**************************************************************************** * Recognized escape sequences are * (B ASCII @@ -3243,43 +3197,6 @@ END_LOOP: args->source = mySource; } -/* -* This is a simple, interim implementation of GetNextUChar() -* that allows to concentrate on testing one single implementation -* of the ToUnicode conversion before it gets copied to -* multiple version that are then optimized for their needs -* (with vs. without offsets and getNextUChar). 
-*/ - -U_CFUNC UChar32 -UConverter_getNextUChar_ISO_2022_KR(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UChar buffer[UTF_MAX_CHAR_LENGTH]; - const char *realLimit=pArgs->sourceLimit; - - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - while(pArgs->source<realLimit) { - pArgs->sourceLimit=pArgs->source+1; - pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); - UConverter_toUnicode_ISO_2022_KR(pArgs, pErrorCode); - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - return 0xffff; - } else if(pArgs->target!=buffer) { - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer); - } - } - - /* no output because of empty input or only state changes and skipping callbacks */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; -} - /*************************** END ISO2022-KR *********************************/ @@ -4779,40 +4696,3 @@ END_LOOP: args->target = myTarget; args->source = mySource; } - -/* -* This is a simple, interim implementation of GetNextUChar() -* that allows to concentrate on testing one single implementation -* of the ToUnicode conversion before it gets copied to -* multiple version that are then optimized for their needs -* (with vs. without offsets and getNextUChar). 
-*/ - -U_CFUNC UChar32 -UConverter_getNextUChar_ISO_2022_CN(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UChar buffer[UTF_MAX_CHAR_LENGTH]; - const char *realLimit=pArgs->sourceLimit; - - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - while(pArgs->source<realLimit) { - pArgs->sourceLimit=pArgs->source+1; - pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); - UConverter_toUnicode_ISO_2022_CN(pArgs, pErrorCode); - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - return 0xffff; - } else if(pArgs->target!=buffer) { - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer); - } - } - - /* no output because of empty input or only state changes and skipping callbacks */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; -} diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c index 35c445738d..adda252c41 100644 --- a/icu4c/source/common/ucnv_cnv.c +++ b/icu4c/source/common/ucnv_cnv.c @@ -193,3 +193,51 @@ ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex return NULL; } } + +/* + * This is a simple implementation of ucnv_getNextUChar() that uses the + * converter's toUnicode() function. See ucnv_cnv.h for details. 
+ */ +U_CFUNC UChar32 +ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs, + T_ToUnicodeFunction toU, + UBool collectPairs, + UErrorCode *pErrorCode) { + UChar buffer[UTF_MAX_CHAR_LENGTH]; + const char *realLimit=pArgs->sourceLimit; + + pArgs->target=buffer; + pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; + + while(pArgs->source<realLimit) { + pArgs->sourceLimit=pArgs->source+1; + pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); + + /* convert this byte and check the result */ + toU(pArgs, pErrorCode); + if(U_SUCCESS(*pErrorCode)) { + int32_t length=pArgs->target-buffer; + + /* this test is UTF-16 specific */ + if(/* some output and + (source consumed or don't collect surrogate pairs or not a surrogate or a surrogate pair) */ + length>0 && + (pArgs->flush || !collectPairs || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2) + ) { + return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length); + } + /* else continue with the loop */ + } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + *pErrorCode=U_ZERO_ERROR; + return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, UTF_MAX_CHAR_LENGTH); + } else { + /* U_FAILURE() */ + return 0xffff; + } + } + + /* no output because of empty input or only state changes and skipping callbacks */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; +} diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index 40df55fa56..b55663fa41 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -260,4 +260,30 @@ ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex #define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || (uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000) #define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c) +/** + * This is a simple implementation of ucnv_getNextUChar() that uses the + * converter's toUnicode() function. 
+ * + * \par + * A surrogate pair from a single byte sequence is always + * combined to a supplementary code point. + * A surrogate pair from consecutive byte sequences is only combined + * if collectPairs is set. This is necessary for SCSU + * but not allowed for most legacy codepages. + * + * @param pArgs The argument structure supplied by ucnv_getNextUChar() + * @param toU A function pointer to the converter's toUnicode() function + * @param collectPairs indicates whether separate surrogate results from + * consecutive byte sequences should be combined into + * a single code point + * @param pErrorCode An ICU error code parameter + * @return The Unicode code point as a result of a conversion of a minimal + * number of input bytes + */ +U_CFUNC UChar32 +ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs, + T_ToUnicodeFunction toU, + UBool collectPairs, + UErrorCode *pErrorCode); + #endif /* UCNV_CNV */ diff --git a/icu4c/source/common/ucnvhz.c b/icu4c/source/common/ucnvhz.c index f631e0313c..9679bc4c51 100644 --- a/icu4c/source/common/ucnvhz.c +++ b/icu4c/source/common/ucnvhz.c @@ -60,9 +60,6 @@ U_CFUNC void UConverter_fromUnicode_HZ(UConverterFromUnicodeArgs *args, U_CFUNC void UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs *args, UErrorCode *err); -U_CFUNC UChar32 UConverter_getNextUChar_HZ (UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode); - static UConverterImpl _HZImpl={ UCNV_HZ, @@ -77,7 +74,7 @@ static UConverterImpl _HZImpl={ UConverter_toUnicode_HZ_OFFSETS_LOGIC, UConverter_fromUnicode_HZ, UConverter_fromUnicode_HZ_OFFSETS_LOGIC, - UConverter_getNextUChar_HZ, + NULL, NULL, NULL @@ -998,31 +995,3 @@ CALLBACK: return; } - -U_CFUNC UChar32 UConverter_getNextUChar_HZ (UConverterToUnicodeArgs * pArgs, - UErrorCode *pErrorCode){ - UChar buffer[UTF_MAX_CHAR_LENGTH]; - const char *realLimit=pArgs->sourceLimit; - - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - 
while(pArgs->source<realLimit) { - pArgs->sourceLimit=pArgs->source+1; - pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); - UConverter_toUnicode_HZ(pArgs, pErrorCode); - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - return 0xffff; - } else if(pArgs->target!=buffer) { - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer); - } - } - - /* no output because of empty input or only state changes and skipping callbacks */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; -} diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 0011fb6ce3..103d489ce2 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -908,73 +908,12 @@ endloop: } /* - * This is a simple, interim implementation of GetNextUChar() - * that allows to concentrate on testing one single implementation - * of the ToUnicode conversion before it gets copied to - * multiple version that are then optimized for their needs - * (with vs. without offsets and getNextUChar). 
* ### TODO: implement this directly similar to ToUnicode() */ U_CFUNC UChar32 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { - UChar buffer[UTF_MAX_CHAR_LENGTH]; - const char *realLimit=pArgs->sourceLimit; - - pArgs->target=buffer; - pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; - - while(pArgs->source<realLimit) { - pArgs->sourceLimit=pArgs->source+1; - pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit); - _MBCSToUnicode(pArgs, pErrorCode); - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - return 0xffff; - } else { - int32_t length=pArgs->target-buffer; -#if 0 - /* - * markus 2000-oct-26 - * - * This version of the exit condition is commented out because of - * a clarification of the semantics of ucnv_getNextUChar() (see updated javadoc): - * - * Codepages that provide direct encodings of supplementary Unicode code points (U+10000 and up) - * should return single surrogates without combining them into pairs if single surrogates - * are encoded. This group of codepages includes UTF-8, UTF-32, and GB 18030. - * - * Codepages that provide direct encodings only of single surrogates - * must attempt to match pairs of them into supplementary code points. - * Single surrogates are returned only if they are not part of matched pairs. - * This group of codepages includes SCSU, LMBCS, and UTF-16. - * - * Currently, there is no MBCS codepage in the second group. SCSU, LMBCS, and UTF-16 - * are implemented with separate code. - * - * Therefore, this feature is removed here. - * It might need to be added back in later when some MBCS codepages are created that - * fall into the second group. In this case, a flag in the .cnv file will be necessary - * to indicate this. makeconv would need to set this flag based on whether the codepage - * contains only mappings for single surrogates but - * not directly for any supplementary code points. 
- */ - if(/* some output and (source consumed or not a surrogate or a surrogate pair [UTF-16 specific]) */ - length>0 && - (pArgs->flush || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2) -#endif - if(length>0) { - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length); - } - } - } - - /* no output because of empty input or only state changes and skipping callbacks */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; + return ucnv_getNextUCharFromToUImpl(pArgs, _MBCSToUnicode, FALSE, pErrorCode); } /*