ICU-2144 add ucnv_convertEx()

X-SVN-Rev: 11589
This commit is contained in:
Markus Scherer 2003-04-18 23:11:39 +00:00
parent e76f98cf2d
commit 4f2c865b52
2 changed files with 286 additions and 72 deletions

View File

@ -82,7 +82,7 @@ static void UCNV_DEBUG_CNV(UConverter *c, int line)
/* size of intermediate and preflighting buffers in ucnv_convert() */ /* size of intermediate and preflighting buffers in ucnv_convert() */
#define CHUNK_SIZE 5*1024 #define CHUNK_SIZE 1024
typedef struct UAmbiguousConverter { typedef struct UAmbiguousConverter {
const char *name; const char *name;
@ -809,6 +809,11 @@ ucnv_fromUnicode (UConverter * _this,
return; return;
} }
if(!flush && *source == sourceLimit) {
/* the overflow buffer is emptied and there is no new input: we are done */
return;
}
args.converter = _this; args.converter = _this;
args.flush = flush; args.flush = flush;
args.offsets = offsets; args.offsets = offsets;
@ -903,6 +908,11 @@ ucnv_toUnicode (UConverter * _this,
return; return;
} }
if(!flush && *source == sourceLimit) {
/* the overflow buffer is emptied and there is no new input: we are done */
return;
}
args.converter = _this; args.converter = _this;
args.flush = flush; args.flush = flush;
args.offsets = offsets; args.offsets = offsets;
@ -1116,6 +1126,120 @@ ucnv_getNextUChar(UConverter * converter,
return ch; return ch;
} }
U_CAPI void U_EXPORT2
ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
char **target, const char *targetLimit,
const char **source, const char *sourceLimit,
UChar *pivotStart, UChar **pivotSource,
UChar **pivotTarget, const UChar *pivotLimit,
UBool reset, UBool flush,
UErrorCode *pErrorCode) {
UChar pivotBuffer[CHUNK_SIZE];
UChar *myPivotSource, *myPivotTarget;
/* error checking */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
if( targetCnv==NULL || sourceCnv==NULL ||
source==NULL || *source==NULL ||
target==NULL || *target==NULL || targetLimit==NULL
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(pivotStart==NULL) {
/* use the stack pivot buffer */
pivotStart=myPivotSource=myPivotTarget=pivotBuffer;
pivotSource=&myPivotSource;
pivotTarget=&myPivotTarget;
pivotLimit=pivotBuffer+CHUNK_SIZE;
} else if( pivotStart>=pivotLimit ||
pivotSource==NULL || *pivotSource==NULL ||
pivotTarget==NULL || *pivotTarget==NULL ||
pivotLimit==NULL
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(sourceLimit==NULL) {
/* get limit of single-byte-NUL-terminated source string */
sourceLimit=uprv_strchr(*source, 0);
}
if(reset) {
ucnv_resetToUnicode(sourceCnv);
ucnv_resetFromUnicode(targetCnv);
*pivotTarget=*pivotSource=pivotStart;
}
/* conversion loop */
for(;;) {
if(reset) {
/*
* if we did a reset in this function, we know that there is nothing
* to convert to the target yet, so we save a function call
*/
reset=FALSE;
} else {
/*
* convert to the target first in case the pivot is filled at entry
* or the targetCnv has some output bytes in its state
*/
ucnv_fromUnicode(targetCnv,
target, targetLimit,
pivotSource, *pivotTarget,
NULL,
(UBool)(flush && *source==sourceLimit),
pErrorCode);
if(U_FAILURE(*pErrorCode)) {
break;
}
/* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */
*pivotSource=*pivotTarget=pivotStart;
}
/* convert from the source to the pivot */
ucnv_toUnicode(sourceCnv,
pivotTarget, pivotLimit,
source, sourceLimit,
NULL,
flush,
pErrorCode);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
/* pivot overflow: continue with the conversion loop */
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) {
/* conversion error, or there was nothing left to convert */
break;
}
/* else ucnv_toUnicode() wrote into the pivot buffer: continue */
}
/*
* The conversion loop is exited when one of the following is true:
* - the entire source text has been converted successfully to the target buffer
* - a target buffer overflow occurred
* - a conversion error occurred
*/
/* terminate the target buffer if possible */
if(flush && U_SUCCESS(*pErrorCode)) {
if(*target!=targetLimit) {
**target=0;
if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
*pErrorCode=U_ZERO_ERROR;
}
} else {
*pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
}
}
}
U_CAPI int32_t U_EXPORT2 U_CAPI int32_t U_EXPORT2
ucnv_convert(const char *toConverterName, const char *fromConverterName, ucnv_convert(const char *toConverterName, const char *fromConverterName,
char *target, int32_t targetSize, char *target, int32_t targetSize,
@ -1166,40 +1290,14 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName,
if(targetSize>0) { if(targetSize>0) {
/* perform real conversion */ /* perform real conversion */
/*
* loops until the input buffer is completely consumed
* or an error is encountered;
* first we convert from inConverter codepage to Unicode
* then from Unicode to outConverter codepage
*/
targetLimit=target+targetSize; targetLimit=target+targetSize;
do { ucnv_convertEx(outConverter, inConverter,
pivot=pivotBuffer; &myTarget, targetLimit,
ucnv_toUnicode(inConverter,
&pivot, pivotBuffer+CHUNK_SIZE,
&source, sourceLimit, &source, sourceLimit,
NULL, pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE,
FALSE,
TRUE, TRUE,
pErrorCode); pErrorCode);
/* U_BUFFER_OVERFLOW_ERROR only means that the pivot buffer is full */
if(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
*pErrorCode=U_ZERO_ERROR;
pivot2=pivotBuffer;
ucnv_fromUnicode(outConverter,
&myTarget, targetLimit,
(const UChar **)&pivot2, pivot,
NULL,
(UBool)(source==sourceLimit),
pErrorCode);
/*
* If this overflows the real target, then we must stop
* converting and preflight with the loop below.
*/
}
} while(U_SUCCESS(*pErrorCode) && source!=sourceLimit);
targetCapacity=myTarget-target; targetCapacity=myTarget-target;
} }
@ -1214,53 +1312,32 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName,
targetLimit=targetBuffer+CHUNK_SIZE; targetLimit=targetBuffer+CHUNK_SIZE;
do { do {
/* since the pivot buffer may still contain some characters, start with emptying it */
*pErrorCode=U_ZERO_ERROR; *pErrorCode=U_ZERO_ERROR;
while(pivot2!=pivot && U_SUCCESS(*pErrorCode)) {
myTarget=targetBuffer; myTarget=targetBuffer;
ucnv_fromUnicode(outConverter, ucnv_convertEx(outConverter, inConverter,
&myTarget, targetLimit, &myTarget, targetLimit,
(const UChar **)&pivot2, pivot, &source, sourceLimit,
NULL, pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE,
(UBool)(source==sourceLimit), FALSE,
TRUE,
pErrorCode); pErrorCode);
targetCapacity+=(myTarget-targetBuffer); targetCapacity+=(myTarget-targetBuffer);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { } while(*pErrorCode==U_BUFFER_OVERFLOW_ERROR);
*pErrorCode=U_ZERO_ERROR;
}
}
if(U_FAILURE(*pErrorCode)) { if(U_SUCCESS(*pErrorCode)) {
/* an error occurred: done */
break;
}
if(source==sourceLimit) {
/* /*
* source is consumed: * done with preflighting, set the buffer overflow error as
* done, and set the buffer overflow error as
* the result for the entire function * the result for the entire function
*/ */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
} }
/* now convert from the source into the pivot buffer again */
pivot=pivot2=pivotBuffer;
ucnv_toUnicode(inConverter,
&pivot, pivotBuffer+CHUNK_SIZE,
&source, sourceLimit,
NULL,
TRUE,
pErrorCode);
}
while(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR);
} }
ucnv_close (inConverter); ucnv_close (inConverter);
ucnv_close (outConverter); ucnv_close (outConverter);
return u_terminateChars(target, targetSize, targetCapacity, pErrorCode); /* no need to call u_terminateChars() because ucnv_convertEx() took care of that */
return targetCapacity;
} }
U_CAPI UConverterType U_EXPORT2 U_CAPI UConverterType U_EXPORT2

View File

@ -1060,6 +1060,141 @@ ucnv_getNextUChar(UConverter * converter,
const char * sourceLimit, const char * sourceLimit,
UErrorCode * err); UErrorCode * err);
/**
* Convert from one external charset to another using two existing UConverters.
* Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() -
* are used, "pivoting" through 16-bit Unicode.
*
* There is a similar function, ucnv_convert(),
* which has the following limitations:
* - it takes charset names, not converter objects, so that
* - two converters are opened for each call
* - only single-string conversion is possible, not streaming operation
* - it does not provide enough information to find out,
* in case of failure, whether the toUnicode or
* the fromUnicode conversion failed
*
* By contrast, ucnv_convertEx()
* - takes UConverter parameters instead of charset names
* - fully exposes the pivot buffer for complete error handling
*
* ucnv_convertEx() also provides further convenience:
* - an option to reset the converters at the beginning
* (if reset==TRUE, see parameters;
* also sets *pivotTarget=*pivotSource=pivotStart)
* - allow NUL-terminated input
* (only a single NUL byte, will not work for charsets with multi-byte NULs)
* (if sourceLimit==NULL, see parameters)
* - terminate with a NUL on output
* (only a single NUL byte, not useful for charsets with multi-byte NULs),
* or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills
* the target buffer
* - the pivot buffer can be provided internally;
* in this case, the caller will not be able to get details about where an
* error occurred
* (if pivotStart==NULL, see below)
*
* The function returns when one of the following is true:
* - the entire source text has been converted successfully to the target buffer
* - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR)
* - a conversion error occurred
* (other U_FAILURE(), see description of pErrorCode)
*
* Limitation compared to the direct use of
* ucnv_fromUnicode() and ucnv_toUnicode():
* ucnv_convertEx() does not provide offset information.
*
* Limitation compared to ucnv_fromUChars() and ucnv_toUChars():
* ucnv_convertEx() does not support preflighting directly.
*
* Sample code for converting a single string from
* one external charset to UTF-8, ignoring the location of errors:
*
* \code
* int32_t
* myToUTF8(UConverter *cnv,
* const char *s, int32_t length,
* char *u8, int32_t capacity,
* UErrorCode *pErrorCode) {
* UConverter *utf8Cnv;
* char *target;
*
* if(U_FAILURE(*pErrorCode)) {
* return 0;
* }
*
* utf8Cnv=myGetCachedUTF8Converter(pErrorCode);
* if(U_FAILURE(*pErrorCode)) {
* return 0;
* }
*
* target=u8;
* ucnv_convertEx(cnv, utf8Cnv,
* &target, u8+capacity,
* &s, length>=0 ? s+length : NULL,
* NULL, NULL, NULL, NULL,
* TRUE, TRUE,
* pErrorCode);
*
* myReleaseCachedUTF8Converter(utf8Cnv);
*
* // return the output string length, but without preflighting
* return (int32_t)(target-u8);
* }
* \endcode
*
* @param targetCnv Output converter, used to convert from the UTF-16 pivot
* to the target using ucnv_fromUnicode().
* @param sourceCnv Input converter, used to convert from the source to
* the UTF-16 pivot using ucnv_toUnicode().
* @param target I/O parameter, same as for ucnv_fromUChars().
* Input: *target points to the beginning of the target buffer.
* Output: *target points to the first unit after the last char written.
* @param targetLimit Pointer to the first unit after the target buffer.
* @param source I/O parameter, same as for ucnv_toUChars().
* Input: *source points to the beginning of the source buffer.
* Output: *source points to the first unit after the last char read.
* @param sourceLimit Pointer to the first unit after the source buffer.
* @param pivotStart Pointer to the UTF-16 pivot buffer. If pivotStart==NULL,
* then an internal buffer is used and the other pivot
* arguments are ignored and can be NULL as well.
* @param pivotSource I/O parameter, same as source in ucnv_fromUChars() for
* conversion from the pivot buffer to the target buffer.
* @param pivotTarget I/O parameter, same as target in ucnv_toUChars() for
* conversion from the source buffer to the pivot buffer.
* It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit
* and pivotStart<pivotLimit (unless pivotStart==NULL).
* @param pivotLimit Pointer to the first unit after the pivot buffer.
* @param reset If TRUE, then ucnv_resetToUnicode(sourceCnv) and
* ucnv_resetFromUnicode(targetCnv) are called, and the
* pivot pointers are reset (*pivotTarget=*pivotSource=pivotStart).
* @param flush If true, indicates the end of the input.
* Passed directly to ucnv_toUnicode(), and carried over to
* ucnv_fromUnicode() when the source is empty as well.
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* U_BUFFER_OVERFLOW_ERROR always refers to the target buffer
* because overflows into the pivot buffer are handled internally.
* Other conversion errors are from the source-to-pivot
* conversion if *pivotSource==pivotStart, otherwise from
* the pivot-to-target conversion.
*
* @see ucnv_convert
* @see ucnv_fromUnicode
* @see ucnv_toUnicode
* @see ucnv_fromUChars
* @see ucnv_toUChars
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
char **target, const char *targetLimit,
const char **source, const char *sourceLimit,
UChar *pivotStart, UChar **pivotSource,
UChar **pivotTarget, const UChar *pivotLimit,
UBool reset, UBool flush,
UErrorCode *pErrorCode);
/** /**
* Will convert a sequence of bytes from one codepage to another. * Will convert a sequence of bytes from one codepage to another.
* This is <STRONG>NOT AN EFFICIENT</STRONG> way to transcode. * This is <STRONG>NOT AN EFFICIENT</STRONG> way to transcode.
@ -1075,6 +1210,8 @@ ucnv_getNextUChar(UConverter * converter,
* @param err error status. * @param err error status.
* <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is still input left in the source. * <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is still input left in the source.
* @return will be filled in with the number of bytes needed in target * @return will be filled in with the number of bytes needed in target
*
* @see ucnv_convertEx
* @see ucnv_fromUnicode * @see ucnv_fromUnicode
* @see ucnv_toUnicode * @see ucnv_toUnicode
* @see ucnv_fromUChars * @see ucnv_fromUChars