From 4f2c865b527766728396643d30747149e6fd745e Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 18 Apr 2003 23:11:39 +0000 Subject: [PATCH] ICU-2144 add ucnv_convertEx() X-SVN-Rev: 11589 --- icu4c/source/common/ucnv.c | 221 +++++++++++++++++++---------- icu4c/source/common/unicode/ucnv.h | 137 ++++++++++++++++++ 2 files changed, 286 insertions(+), 72 deletions(-) diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index 9c35aaab5d..a3b8bfc57b 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -82,7 +82,7 @@ static void UCNV_DEBUG_CNV(UConverter *c, int line) /* size of intermediate and preflighting buffers in ucnv_convert() */ -#define CHUNK_SIZE 5*1024 +#define CHUNK_SIZE 1024 typedef struct UAmbiguousConverter { const char *name; @@ -808,7 +808,12 @@ ucnv_fromUnicode (UConverter * _this, if (U_FAILURE (*err)) return; } - + + if(!flush && *source == sourceLimit) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return; + } + args.converter = _this; args.flush = flush; args.offsets = offsets; @@ -903,6 +908,11 @@ ucnv_toUnicode (UConverter * _this, return; } + if(!flush && *source == sourceLimit) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return; + } + args.converter = _this; args.flush = flush; args.offsets = offsets; @@ -1116,6 +1126,120 @@ ucnv_getNextUChar(UConverter * converter, return ch; } +U_CAPI void U_EXPORT2 +ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv, + char **target, const char *targetLimit, + const char **source, const char *sourceLimit, + UChar *pivotStart, UChar **pivotSource, + UChar **pivotTarget, const UChar *pivotLimit, + UBool reset, UBool flush, + UErrorCode *pErrorCode) { + UChar pivotBuffer[CHUNK_SIZE]; + UChar *myPivotSource, *myPivotTarget; + + /* error checking */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + if( targetCnv==NULL || sourceCnv==NULL || + source==NULL || *source==NULL 
|| + target==NULL || *target==NULL || targetLimit==NULL + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(pivotStart==NULL) { + /* use the stack pivot buffer */ + pivotStart=myPivotSource=myPivotTarget=pivotBuffer; + pivotSource=&myPivotSource; + pivotTarget=&myPivotTarget; + pivotLimit=pivotBuffer+CHUNK_SIZE; + } else if( pivotStart>=pivotLimit || + pivotSource==NULL || *pivotSource==NULL || + pivotTarget==NULL || *pivotTarget==NULL || + pivotLimit==NULL + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(sourceLimit==NULL) { + /* get limit of single-byte-NUL-terminated source string */ + sourceLimit=uprv_strchr(*source, 0); + } + + if(reset) { + ucnv_resetToUnicode(sourceCnv); + ucnv_resetFromUnicode(targetCnv); + *pivotTarget=*pivotSource=pivotStart; + } + + /* conversion loop */ + for(;;) { + if(reset) { + /* + * if we did a reset in this function, we know that there is nothing + * to convert to the target yet, so we save a function call + */ + reset=FALSE; + } else { + /* + * convert to the target first in case the pivot is filled at entry + * or the targetCnv has some output bytes in its state + */ + ucnv_fromUnicode(targetCnv, + target, targetLimit, + pivotSource, *pivotTarget, + NULL, + (UBool)(flush && *source==sourceLimit), + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + break; + } + + /* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */ + *pivotSource=*pivotTarget=pivotStart; + } + + /* convert from the source to the pivot */ + ucnv_toUnicode(sourceCnv, + pivotTarget, pivotLimit, + source, sourceLimit, + NULL, + flush, + pErrorCode); + if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + /* pivot overflow: continue with the conversion loop */ + *pErrorCode=U_ZERO_ERROR; + } else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) { + /* conversion error, or there was nothing left to convert */ + break; + } + /* else ucnv_toUnicode() wrote into the pivot buffer: continue */ + } + + /* 
+ * The conversion loop is exited when one of the following is true: + * - the entire source text has been converted successfully to the target buffer + * - a target buffer overflow occurred + * - a conversion error occurred + */ + + /* terminate the target buffer if possible */ + if(flush && U_SUCCESS(*pErrorCode)) { + if(*target!=targetLimit) { + **target=0; + if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { + *pErrorCode=U_ZERO_ERROR; + } + } else { + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; + } + } +} + U_CAPI int32_t U_EXPORT2 ucnv_convert(const char *toConverterName, const char *fromConverterName, char *target, int32_t targetSize, @@ -1166,40 +1290,14 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName, if(targetSize>0) { /* perform real conversion */ - - /* - * loops until the input buffer is completely consumed - * or an error is encountered; - * first we convert from inConverter codepage to Unicode - * then from Unicode to outConverter codepage - */ targetLimit=target+targetSize; - do { - pivot=pivotBuffer; - ucnv_toUnicode(inConverter, - &pivot, pivotBuffer+CHUNK_SIZE, - &source, sourceLimit, - NULL, - TRUE, - pErrorCode); - - /* U_BUFFER_OVERFLOW_ERROR only means that the pivot buffer is full */ - if(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - pivot2=pivotBuffer; - ucnv_fromUnicode(outConverter, - &myTarget, targetLimit, - (const UChar **)&pivot2, pivot, - NULL, - (UBool)(source==sourceLimit), - pErrorCode); - /* - * If this overflows the real target, then we must stop - * converting and preflight with the loop below. 
- */ - } - } while(U_SUCCESS(*pErrorCode) && source!=sourceLimit); - + ucnv_convertEx(outConverter, inConverter, + &myTarget, targetLimit, + &source, sourceLimit, + pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE, + FALSE, + TRUE, + pErrorCode); targetCapacity=myTarget-target; } @@ -1214,53 +1312,32 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName, targetLimit=targetBuffer+CHUNK_SIZE; do { - /* since the pivot buffer may still contain some characters, start with emptying it */ *pErrorCode=U_ZERO_ERROR; - while(pivot2!=pivot && U_SUCCESS(*pErrorCode)) { - myTarget=targetBuffer; - ucnv_fromUnicode(outConverter, - &myTarget, targetLimit, - (const UChar **)&pivot2, pivot, - NULL, - (UBool)(source==sourceLimit), - pErrorCode); - targetCapacity+=(myTarget-targetBuffer); - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - } - } - - if(U_FAILURE(*pErrorCode)) { - /* an error occurred: done */ - break; - } - - if(source==sourceLimit) { - /* - * source is consumed: - * done, and set the buffer overflow error as - * the result for the entire function - */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - /* now convert from the source into the pivot buffer again */ - pivot=pivot2=pivotBuffer; - ucnv_toUnicode(inConverter, - &pivot, pivotBuffer+CHUNK_SIZE, + myTarget=targetBuffer; + ucnv_convertEx(outConverter, inConverter, + &myTarget, targetLimit, &source, sourceLimit, - NULL, + pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE, + FALSE, TRUE, pErrorCode); + targetCapacity+=(myTarget-targetBuffer); + } while(*pErrorCode==U_BUFFER_OVERFLOW_ERROR); + + if(U_SUCCESS(*pErrorCode)) { + /* + * done with preflighting, set the buffer overflow error as + * the result for the entire function + */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } - while(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR); } ucnv_close (inConverter); ucnv_close (outConverter); - return u_terminateChars(target, targetSize, targetCapacity, 
pErrorCode); + /* no need to call u_terminateChars() because ucnv_convertEx() took care of that */ + return targetCapacity; } U_CAPI UConverterType U_EXPORT2 diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 3181b81a66..29a9a667a5 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -1060,6 +1060,141 @@ ucnv_getNextUChar(UConverter * converter, const char * sourceLimit, UErrorCode * err); +/** + * Convert from one external charset to another using two existing UConverters. + * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - + * are used, "pivoting" through 16-bit Unicode. + * + * There is a similar function, ucnv_convert(), + * which has the following limitations: + * - it takes charset names, not converter objects, so that + * - two converters are opened for each call + * - only single-string conversion is possible, not streaming operation + * - it does not provide enough information to find out, + * in case of failure, whether the toUnicode or + * the fromUnicode conversion failed + * + * By contrast, ucnv_convertEx() + * - takes UConverter parameters instead of charset names + * - fully exposes the pivot buffer for complete error handling + * + * ucnv_convertEx() also provides further convenience: + * - an option to reset the converters at the beginning + * (if reset==TRUE, see parameters; + * also sets *pivotTarget=*pivotSource=pivotStart) + * - allow NUL-terminated input + * (only a single NUL byte, will not work for charsets with multi-byte NULs) + * (if sourceLimit==NULL, see parameters) + * - terminate with a NUL on output + * (only a single NUL byte, not useful for charsets with multi-byte NULs), + * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills + * the target buffer + * - the pivot buffer can be provided internally; + * in this case, the caller will not be able to get details about where an + * error occurred + * (if 
pivotStart==NULL, see below) + * + * The function returns when one of the following is true: + * - the entire source text has been converted successfully to the target buffer + * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR) + * - a conversion error occurred + * (other U_FAILURE(), see description of pErrorCode) + * + * Limitation compared to the direct use of + * ucnv_fromUnicode() and ucnv_toUnicode(): + * ucnv_convertEx() does not provide offset information. + * + * Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): + * ucnv_convertEx() does not support preflighting directly. + * + * Sample code for converting a single string from + * one external charset to UTF-8, ignoring the location of errors: + * + * \code + * int32_t + * myToUTF8(UConverter *cnv, + * const char *s, int32_t length, + * char *u8, int32_t capacity, + * UErrorCode *pErrorCode) { + * UConverter *utf8Cnv; + * char *target; + * + * if(U_FAILURE(*pErrorCode)) { + * return 0; + * } + * + * utf8Cnv=myGetCachedUTF8Converter(pErrorCode); + * if(U_FAILURE(*pErrorCode)) { + * return 0; + * } + * + * target=u8; + * ucnv_convertEx(utf8Cnv, cnv, + * &target, u8+capacity, + * &s, length>=0 ? s+length : NULL, + * NULL, NULL, NULL, NULL, + * TRUE, TRUE, + * pErrorCode); + * + * myReleaseCachedUTF8Converter(utf8Cnv); + * + * // return the output string length, but without preflighting + * return (int32_t)(target-u8); + * } + * \endcode + * + * @param targetCnv Output converter, used to convert from the UTF-16 pivot + * to the target using ucnv_fromUnicode(). + * @param sourceCnv Input converter, used to convert from the source to + * the UTF-16 pivot using ucnv_toUnicode(). + * @param target I/O parameter, same as for ucnv_fromUChars(). + * Input: *target points to the beginning of the target buffer. + * Output: *target points to the first unit after the last char written. + * @param targetLimit Pointer to the first unit after the target buffer. 
+ * @param source I/O parameter, same as for ucnv_toUChars(). + * Input: *source points to the beginning of the source buffer. + * Output: *source points to the first unit after the last char read. + * @param sourceLimit Pointer to the first unit after the source buffer. + * @param pivotStart Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, + * then an internal buffer is used and the other pivot + * arguments are ignored and can be NULL as well. + * @param pivotSource I/O parameter, same as source in ucnv_fromUChars() for + * conversion from the pivot buffer to the target buffer. + * @param pivotTarget I/O parameter, same as target in ucnv_toUChars() for + * conversion from the source buffer to the pivot buffer. + * It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit + * and pivotStart<pivotLimit. [... text lost in extraction ...] NOT AN EFFICIENT way to transcode. @@ -1075,6 +1210,8 @@ ucnv_getNextUChar(UConverter * converter, * @param err error status. * U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is still input left in the source. * @return will be filled in with the number of bytes needed in target + * + * @see ucnv_convertEx * @see ucnv_fromUnicode * @see ucnv_toUnicode * @see ucnv_fromUChars