ICU-2144 add ucnv_convertEx()

X-SVN-Rev: 11589
2003-04-18 23:11:39 +00:00 · 2003-04-18 23:11:39 +00:00 · 4f2c865b52
commit 4f2c865b52
parent e76f98cf2d
2 changed files with 286 additions and 72 deletions
--- a/icu4c/source/common/ucnv.c
+++ b/icu4c/source/common/ucnv.c
@ -82,7 +82,7 @@ static void UCNV_DEBUG_CNV(UConverter *c, int line)


 /* size of intermediate and preflighting buffers in ucnv_convert() */
-#define CHUNK_SIZE 5*1024
+#define CHUNK_SIZE 1024

 typedef struct UAmbiguousConverter {
    const char *name;
@ -809,6 +809,11 @@ ucnv_fromUnicode (UConverter * _this,
            return;
    }

+    if(!flush && *source == sourceLimit) {
+        /* the overflow buffer is emptied and there is no new input: we are done */
+        return;
+    }
+
    args.converter = _this;
    args.flush = flush;
    args.offsets = offsets;
@ -903,6 +908,11 @@ ucnv_toUnicode (UConverter * _this,
            return;
    }

+    if(!flush && *source == sourceLimit) {
+        /* the overflow buffer is emptied and there is no new input: we are done */
+        return;
+    }
+
    args.converter = _this;
    args.flush = flush;
    args.offsets = offsets;
@ -1116,6 +1126,120 @@ ucnv_getNextUChar(UConverter * converter,
    return ch;
 }

+/*
+ * Convert from one charset to another with two existing converters,
+ * pivoting through a UTF-16 buffer: sourceCnv fills the pivot with
+ * ucnv_toUnicode(), and targetCnv drains it with ucnv_fromUnicode().
+ *
+ * - pivotStart==NULL: a stack buffer of CHUNK_SIZE UChars is used and the
+ *   other pivot arguments are ignored; that buffer does not outlive the call.
+ * - sourceLimit==NULL: the source is taken to end at its first NUL byte.
+ * - reset: both converters and the pivot pointers are reset before converting.
+ * - flush: passed to ucnv_toUnicode() as is, and to ucnv_fromUnicode() only
+ *   once the source is exhausted. On success the output is NUL-terminated if
+ *   there is room, otherwise U_STRING_NOT_TERMINATED_WARNING is set.
+ *
+ * Does nothing if pErrorCode is NULL or already holds a failure.
+ * Sets U_ILLEGAL_ARGUMENT_ERROR for NULL or inconsistent arguments.
+ */
+U_CAPI void U_EXPORT2
+ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
+               char **target, const char *targetLimit,
+               const char **source, const char *sourceLimit,
+               UChar *pivotStart, UChar **pivotSource,
+               UChar **pivotTarget, const UChar *pivotLimit,
+               UBool reset, UBool flush,
+               UErrorCode *pErrorCode) {
+    UChar pivotBuffer[CHUNK_SIZE];
+    UChar *myPivotSource, *myPivotTarget;
+
+    /* error checking */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    if( targetCnv==NULL || sourceCnv==NULL ||
+        source==NULL || *source==NULL ||
+        target==NULL || *target==NULL || targetLimit==NULL
+    ) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if(pivotStart==NULL) {
+        /* use the stack pivot buffer; the caller's pivot arguments are ignored */
+        pivotStart=myPivotSource=myPivotTarget=pivotBuffer;
+        pivotSource=&myPivotSource;
+        pivotTarget=&myPivotTarget;
+        pivotLimit=pivotBuffer+CHUNK_SIZE;
+    } else if(  pivotLimit==NULL ||     /* test for NULL before comparing the pointers */
+                pivotStart>=pivotLimit ||
+                pivotSource==NULL || *pivotSource==NULL ||
+                pivotTarget==NULL || *pivotTarget==NULL
+    ) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if(sourceLimit==NULL) {
+        /* get limit of single-byte-NUL-terminated source string */
+        sourceLimit=uprv_strchr(*source, 0);
+    }
+
+    if(reset) {
+        ucnv_resetToUnicode(sourceCnv);
+        ucnv_resetFromUnicode(targetCnv);
+        *pivotTarget=*pivotSource=pivotStart;
+    }
+
+    /* conversion loop: alternately drain the pivot to the target and refill it from the source */
+    for(;;) {
+        if(reset) {
+            /*
+             * if we did a reset in this function, we know that there is nothing
+             * to convert to the target yet, so we save a function call
+             */
+            reset=FALSE;
+        } else {
+            /*
+             * convert to the target first in case the pivot is filled at entry
+             * or the targetCnv has some output bytes in its state
+             *
+             * ucnv_fromUnicode() takes a const UChar **source; UChar ** does not
+             * convert to that implicitly in C, so cast explicitly.
+             * The target converter is flushed only once the source is exhausted.
+             */
+            ucnv_fromUnicode(targetCnv,
+                             target, targetLimit,
+                             (const UChar **)pivotSource, *pivotTarget,
+                             NULL,
+                             (UBool)(flush && *source==sourceLimit),
+                             pErrorCode);
+            if(U_FAILURE(*pErrorCode)) {
+                /* includes U_BUFFER_OVERFLOW_ERROR for a full target buffer */
+                break;
+            }
+
+            /* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */
+            *pivotSource=*pivotTarget=pivotStart;
+        }
+
+        /* convert from the source to the pivot */
+        ucnv_toUnicode(sourceCnv,
+                       pivotTarget, pivotLimit,
+                       source, sourceLimit,
+                       NULL,
+                       flush,
+                       pErrorCode);
+        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+            /* pivot overflow: continue with the conversion loop */
+            *pErrorCode=U_ZERO_ERROR;
+        } else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) {
+            /* conversion error, or there was nothing left to convert */
+            break;
+        }
+        /* else ucnv_toUnicode() wrote into the pivot buffer: continue */
+    }
+
+    /*
+     * The conversion loop is exited when one of the following is true:
+     * - the entire source text has been converted successfully to the target buffer
+     * - a target buffer overflow occurred
+     * - a conversion error occurred
+     */
+
+    /* terminate the target buffer if possible */
+    if(flush && U_SUCCESS(*pErrorCode)) {
+        if(*target!=targetLimit) {
+            /* room for one NUL byte; *target is not advanced past it */
+            **target=0;
+            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
+                *pErrorCode=U_ZERO_ERROR;
+            }
+        } else {
+            /* the output exactly fills the target buffer */
+            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
+        }
+    }
+}
+
 U_CAPI int32_t U_EXPORT2
 ucnv_convert(const char *toConverterName, const char *fromConverterName,
             char *target, int32_t targetSize,
@ -1166,40 +1290,14 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName,

    if(targetSize>0) {
        /* perform real conversion */
-
-        /*
-         * loops until the input buffer is completely consumed
-         * or an error is encountered;
-         * first we convert from inConverter codepage to Unicode
-         * then from Unicode to outConverter codepage
-         */
        targetLimit=target+targetSize;
-        do {
-            pivot=pivotBuffer;
-            ucnv_toUnicode(inConverter,
-                           &pivot, pivotBuffer+CHUNK_SIZE,
+        ucnv_convertEx(outConverter, inConverter,
+                       &myTarget, targetLimit,
                       &source, sourceLimit,
-                           NULL,
+                       pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE,
+                       FALSE,
                       TRUE,
                       pErrorCode);
-
-            /* U_BUFFER_OVERFLOW_ERROR only means that the pivot buffer is full */
-            if(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-                pivot2=pivotBuffer;
-                ucnv_fromUnicode(outConverter,
-                                 &myTarget, targetLimit,
-                                 (const UChar **)&pivot2, pivot,
-                                 NULL,
-                                 (UBool)(source==sourceLimit),
-                                 pErrorCode);
-                /*
-                 * If this overflows the real target, then we must stop
-                 * converting and preflight with the loop below.
-                 */
-            }
-        } while(U_SUCCESS(*pErrorCode) && source!=sourceLimit);
-
        targetCapacity=myTarget-target;
    }

@ -1214,53 +1312,32 @@ ucnv_convert(const char *toConverterName, const char *fromConverterName,

        targetLimit=targetBuffer+CHUNK_SIZE;
        do {
-            /* since the pivot buffer may still contain some characters, start with emptying it */
            *pErrorCode=U_ZERO_ERROR;
-            while(pivot2!=pivot && U_SUCCESS(*pErrorCode)) {
            myTarget=targetBuffer;
-                ucnv_fromUnicode(outConverter,
+            ucnv_convertEx(outConverter, inConverter,
                           &myTarget, targetLimit,
-                                 (const UChar **)&pivot2, pivot,
-                                 NULL,
-                                 (UBool)(source==sourceLimit),
+                           &source, sourceLimit,
+                           pivotBuffer, &pivot, &pivot2, pivotBuffer+CHUNK_SIZE,
+                           FALSE,
+                           TRUE,
                           pErrorCode);
            targetCapacity+=(myTarget-targetBuffer);
-                if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                    *pErrorCode=U_ZERO_ERROR;
-                }
-            }
+        } while(*pErrorCode==U_BUFFER_OVERFLOW_ERROR);

-            if(U_FAILURE(*pErrorCode)) {
-                /* an error occurred: done */
-                break;
-            }
-
-            if(source==sourceLimit) {
+        if(U_SUCCESS(*pErrorCode)) {
            /*
-                 * source is consumed:
-                 * done, and set the buffer overflow error as
+             * done with preflighting, set the buffer overflow error as
             * the result for the entire function
             */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                break;
        }
-
-            /* now convert from the source into the pivot buffer again */
-            pivot=pivot2=pivotBuffer;
-            ucnv_toUnicode(inConverter,
-                           &pivot, pivotBuffer+CHUNK_SIZE,
-                           &source, sourceLimit,
-                           NULL,
-                           TRUE,
-                           pErrorCode);
-        }
-        while(U_SUCCESS(*pErrorCode) || *pErrorCode==U_BUFFER_OVERFLOW_ERROR);
    }

    ucnv_close (inConverter);
    ucnv_close (outConverter);

-    return u_terminateChars(target, targetSize, targetCapacity, pErrorCode);
+    /* no need to call u_terminateChars() because ucnv_convertEx() took care of that */
+    return targetCapacity;
 }

 U_CAPI UConverterType  U_EXPORT2
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@ -1060,6 +1060,141 @@ ucnv_getNextUChar(UConverter * converter,
                  const char * sourceLimit,
                  UErrorCode * err);

+/**
+ * Convert from one external charset to another using two existing UConverters.
+ * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() -
+ * are used, "pivoting" through 16-bit Unicode.
+ *
+ * There is a similar function, ucnv_convert(),
+ * which has the following limitations:
+ * - it takes charset names, not converter objects, so that
+ *   - two converters are opened for each call
+ *   - only single-string conversion is possible, not streaming operation
+ * - it does not provide enough information to find out,
+ *   in case of failure, whether the toUnicode or
+ *   the fromUnicode conversion failed
+ *
+ * By contrast, ucnv_convertEx()
+ * - takes UConverter parameters instead of charset names
+ * - fully exposes the pivot buffer for complete error handling
+ *
+ * ucnv_convertEx() also provides further convenience:
+ * - an option to reset the converters at the beginning
+ *   (if reset==TRUE, see parameters;
+ *    also sets *pivotTarget=*pivotSource=pivotStart)
+ * - allow NUL-terminated input
+ *   (only a single NUL byte, will not work for charsets with multi-byte NULs)
+ *   (if sourceLimit==NULL, see parameters)
+ * - terminate with a NUL on output
+ *   (only a single NUL byte, not useful for charsets with multi-byte NULs),
+ *   or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills
+ *   the target buffer
+ * - the pivot buffer can be provided internally;
+ *   in this case, the caller will not be able to get details about where an
+ *   error occurred
+ *   (if pivotStart==NULL, see below)
+ *
+ * The function returns when one of the following is true:
+ * - the entire source text has been converted successfully to the target buffer
+ * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR)
+ * - a conversion error occurred
+ *   (other U_FAILURE(), see description of pErrorCode)
+ *
+ * Limitation compared to the direct use of
+ * ucnv_fromUnicode() and ucnv_toUnicode():
+ * ucnv_convertEx() does not provide offset information.
+ *
+ * Limitation compared to ucnv_fromUChars() and ucnv_toUChars():
+ * ucnv_convertEx() does not support preflighting directly.
+ *
+ * Sample code for converting a single string from
+ * one external charset to UTF-8, ignoring the location of errors:
+ *
+ * \code
+ * int32_t
+ * myToUTF8(UConverter *cnv,
+ *          const char *s, int32_t length,
+ *          char *u8, int32_t capacity,
+ *          UErrorCode *pErrorCode) {
+ *     UConverter *utf8Cnv;
+ *     char *target;
+ *
+ *     if(U_FAILURE(*pErrorCode)) {
+ *         return 0;
+ *     }
+ *
+ *     utf8Cnv=myGetCachedUTF8Converter(pErrorCode);
+ *     if(U_FAILURE(*pErrorCode)) {
+ *         return 0;
+ *     }
+ *
+ *     target=u8;
+ *     ucnv_convertEx(utf8Cnv, cnv,
+ *                    &target, u8+capacity,
+ *                    &s, length>=0 ? s+length : NULL,
+ *                    NULL, NULL, NULL, NULL,
+ *                    TRUE, TRUE,
+ *                    pErrorCode);
+ * 
+ *     myReleaseCachedUTF8Converter(utf8Cnv);
+ *
+ *     // return the output string length, but without preflighting
+ *     return (int32_t)(target-u8);
+ * }
+ * \endcode
+ *
+ * @param targetCnv     Output converter, used to convert from the UTF-16 pivot
+ *                      to the target using ucnv_fromUnicode().
+ * @param sourceCnv     Input converter, used to convert from the source to
+ *                      the UTF-16 pivot using ucnv_toUnicode().
+ * @param target        I/O parameter, same as for ucnv_fromUnicode().
+ *                      Input: *target points to the beginning of the target buffer.
+ *                      Output: *target points to the first unit after the last char written.
+ * @param targetLimit   Pointer to the first unit after the target buffer.
+ * @param source        I/O parameter, same as for ucnv_toUnicode().
+ *                      Input: *source points to the beginning of the source buffer.
+ *                      Output: *source points to the first unit after the last char read.
+ * @param sourceLimit   Pointer to the first unit after the source buffer.
+ * @param pivotStart    Pointer to the UTF-16 pivot buffer. If pivotStart==NULL,
+ *                      then an internal buffer is used and the other pivot
+ *                      arguments are ignored and can be NULL as well.
+ * @param pivotSource   I/O parameter, same as source in ucnv_fromUnicode() for
+ *                      conversion from the pivot buffer to the target buffer.
+ * @param pivotTarget   I/O parameter, same as target in ucnv_toUnicode() for
+ *                      conversion from the source buffer to the pivot buffer.
+ *                      It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit
+ *                      and pivotStart<pivotLimit (unless pivotStart==NULL).
+ * @param pivotLimit    Pointer to the first unit after the pivot buffer.
+ * @param reset         If TRUE, then ucnv_resetToUnicode(sourceCnv) and
+ *                      ucnv_resetFromUnicode(targetCnv) are called, and the
+ *                      pivot pointers are reset (*pivotTarget=*pivotSource=pivotStart).
+ * @param flush         If true, indicates the end of the input.
+ *                      Passed directly to ucnv_toUnicode(), and carried over to
+ *                      ucnv_fromUnicode() when the source is empty as well.
+ * @param pErrorCode    ICU error code in/out parameter.
+ *                      Must fulfill U_SUCCESS before the function call.
+ *                      U_BUFFER_OVERFLOW_ERROR always refers to the target buffer
+ *                      because overflows into the pivot buffer are handled internally.
+ *                      Other conversion errors are from the source-to-pivot
+ *                      conversion if *pivotSource==pivotStart, otherwise from
+ *                      the pivot-to-target conversion.
+ *
+ * @see ucnv_convert
+ * @see ucnv_fromUnicode
+ * @see ucnv_toUnicode
+ * @see ucnv_fromUChars
+ * @see ucnv_toUChars
+ * @draft ICU 2.6
+ */
+U_CAPI void U_EXPORT2
+ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
+               char **target, const char *targetLimit,
+               const char **source, const char *sourceLimit,
+               UChar *pivotStart, UChar **pivotSource,
+               UChar **pivotTarget, const UChar *pivotLimit,
+               UBool reset, UBool flush,
+               UErrorCode *pErrorCode);
+
 /**
 * Will convert a sequence of bytes from one codepage to another.
 * This is <STRONG>NOT AN EFFICIENT</STRONG> way to transcode.
@ -1075,6 +1210,8 @@ ucnv_getNextUChar(UConverter * converter,
 * @param err error status. 
 * <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is still input left in the source.
 * @return  will be filled in with the number of bytes needed in target
+ *
+ * @see ucnv_convertEx
 * @see ucnv_fromUnicode
 * @see ucnv_toUnicode
 * @see ucnv_fromUChars