ICU-5230 faster u_strToUTF8() and u_strFromUTF8() functions, added ...WithSub() variants, added u_strFromUTF8Lenient()

X-SVN-Rev: 19723
2006-06-15 19:22:04 +00:00 · 2006-06-15 19:22:04 +00:00 · 1d687cde69
commit 1d687cde69
parent 154314acd1
3 changed files with 1397 additions and 86 deletions
--- a/icu4c/source/common/unicode/ustring.h
+++ b/icu4c/source/common/unicode/ustring.h
@ -1230,6 +1230,8 @@ u_strFromWCS(UChar   *dest,
 *                      which must not indicate a failure before the function call.
 * @return The pointer to destination buffer.
 * @stable ICU 2.0
+ * @see u_strToUTF8WithSub
+ * @see u_strFromUTF8
 */
 U_STABLE char* U_EXPORT2 
 u_strToUTF8(char *dest,           
@ -1257,6 +1259,8 @@ u_strToUTF8(char *dest,
 *                      which must not indicate a failure before the function call.
 * @return The pointer to destination buffer.
 * @stable ICU 2.0
+ * @see u_strFromUTF8WithSub
+ * @see u_strFromUTF8Lenient
 */
 U_STABLE UChar* U_EXPORT2
 u_strFromUTF8(UChar *dest,             
@ -1266,6 +1270,148 @@ u_strFromUTF8(UChar *dest,
              int32_t srcLength,
              UErrorCode *pErrorCode);

+/**
+ * Converts a sequence of UChars (UTF-16) to UTF-8 bytes.
+ * Same as u_strToUTF8() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of chars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the 
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If 
+ *                      pDestLength!=NULL then *pDestLength is always set to the 
+ *                      number of output units corresponding to the transformation of 
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strToUTF8
+ * @see u_strFromUTF8WithSub
+ * @draft ICU 3.6
+ */
+U_DRAFT char* U_EXPORT2
+u_strToUTF8WithSub(char *dest,
+            int32_t destCapacity,
+            int32_t *pDestLength,
+            const UChar *src,
+            int32_t srcLength,
+            UChar32 subchar, int32_t *pNumSubstitutions,
+            UErrorCode *pErrorCode);
+
+/**
+ * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Same as u_strFromUTF8() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the 
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If 
+ *                      pDestLength!=NULL then *pDestLength is always set to the 
+ *                      number of output units corresponding to the transformation of 
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strFromUTF8
+ * @see u_strFromUTF8Lenient
+ * @see u_strToUTF8WithSub
+ * @draft ICU 3.6
+ */
+U_DRAFT UChar* U_EXPORT2
+u_strFromUTF8WithSub(UChar *dest,
+              int32_t destCapacity,
+              int32_t *pDestLength,
+              const char *src,
+              int32_t srcLength,
+              UChar32 subchar, int32_t *pNumSubstitutions,
+              UErrorCode *pErrorCode);
+
+/**
+ * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Same as u_strFromUTF8() except that this function is designed to be very fast,
+ * which it achieves by being lenient about malformed UTF-8 sequences.
+ * This function is intended for use in environments where UTF-8 text is
+ * expected to be well-formed.
+ *
+ * Its semantics are:
+ * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
+ * - The function will not read beyond the input string, nor write beyond
+ *   the destCapacity.
+ * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
+ *   be well-formed UTF-16.
+ *   The function will resynchronize to valid code point boundaries
+ *   within a small number of code points after an illegal sequence.
+ * - Non-shortest forms are not detected and will result in "spoofing" output.
+ *
+ * For further performance improvement, if srcLength is given (>=0),
+ * then the caller must provide destCapacity>=srcLength.
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the
+ *                      result without writing any of the result string (pre-flighting).
+ *                      Unlike for other ICU functions, if srcLength>=0 then
+ *                      the caller must provide destCapacity>=srcLength.
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If
+ *                      pDestLength!=NULL then *pDestLength is always set to the
+ *                      number of output units corresponding to the transformation of
+ *                      all the input units, even in case of a buffer overflow.
+ *                      Unlike for other ICU functions, if srcLength>=0 but
+ *                      destCapacity<srcLength, then *pDestLength will be set to srcLength
+ *                      (and U_BUFFER_OVERFLOW_ERROR will be set)
+ *                      regardless of the actual result length.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strFromUTF8
+ * @see u_strFromUTF8WithSub
+ * @see u_strToUTF8WithSub
+ * @draft ICU 3.6
+ */
+U_DRAFT UChar * U_EXPORT2
+u_strFromUTF8Lenient(UChar *dest,
+                     int32_t destCapacity,
+                     int32_t *pDestLength,
+                     const char *src,
+                     int32_t srcLength,
+                     UErrorCode *pErrorCode);
+
 /**
 * Converts a sequence of UChars (UTF-16) to UTF32 units.
 *
--- a/icu4c/source/common/ustrtrns.c
+++ b/icu4c/source/common/ustrtrns.c
--- a/icu4c/source/test/cintltst/custrtrn.c
+++ b/icu4c/source/test/cintltst/custrtrn.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (c) 2001-2005, International Business Machines Corporation and
+ * Copyright (c) 2001-2006, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /********************************************************************************
@ -22,6 +22,8 @@
 #include "unicode/ures.h"
 #include "ustr_imp.h"
 #include "cintltst.h"
+#include "cmemory.h"
+#include "cstring.h"
 #include "cwchar.h"

 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -30,6 +32,7 @@ void addUCharTransformTest(TestNode** root);

 static void Test_UChar_UTF32_API(void);
 static void Test_UChar_UTF8_API(void);
+static void Test_FromUTF8Lenient(void);
 static void Test_UChar_WCHART_API(void);
 static void Test_widestrs(void);
 static void Test_WCHART_LongString(void);
@ -39,6 +42,7 @@ addUCharTransformTest(TestNode** root)
 {
   addTest(root, &Test_UChar_UTF32_API, "custrtrn/Test_UChar_UTF32_API");
   addTest(root, &Test_UChar_UTF8_API, "custrtrn/Test_UChar_UTF8_API");
+   addTest(root, &Test_FromUTF8Lenient, "custrtrn/Test_FromUTF8Lenient");
   addTest(root, &Test_UChar_WCHART_API,  "custrtrn/Test_UChar_WCHART_API");
   addTest(root, &Test_widestrs,  "custrtrn/Test_widestrs");
   addTest(root, &Test_WCHART_LongString, "custrtrn/Test_WCHART_LongString");
@ -270,16 +274,21 @@ static void Test_UChar_UTF8_API(void){
    int32_t u8DestLen =0;
    UBool failed = FALSE;
    int i= 0;
+    int32_t numSubstitutions;
+
    {
        /* preflight */
+        u8Temp[0] = 0x12;
        u_strToUTF8(u8Target,u8TargetLength, &u8DestLen, uSrc, uSrcLen,&err);
-        if(err == U_BUFFER_OVERFLOW_ERROR){
+        if(err == U_BUFFER_OVERFLOW_ERROR && u8Temp[0] == 0x12){
            err = U_ZERO_ERROR;       
            u8Target = (char*) malloc (sizeof(uint8_t) * (u8DestLen+1));
            u8TargetLength = u8DestLen;

+            u8Target[u8TargetLength] = 0xfe;
+            u8DestLen = -1;
            u_strToUTF8(u8Target,u8TargetLength, &u8DestLen, uSrc, uSrcLen,&err);
-            if(U_FAILURE(err)){
+            if(U_FAILURE(err) || u8DestLen != u8TargetLength || u8Target[u8TargetLength] != (char)0xfe){
                log_err("u_strToUTF8 failed after preflight. Error: %s\n", u_errorName(err));
                return;
            }
@ -308,16 +317,19 @@ static void Test_UChar_UTF8_API(void){
        u8SrcLen = u8DestLen;

        /* preflight */
+        uTemp[0] = 0x1234;
        u_strFromUTF8(uTarget,uTargetLength,&uDestLen,u8Src,u8SrcLen,&err);
-        if(err == U_BUFFER_OVERFLOW_ERROR){
+        if(err == U_BUFFER_OVERFLOW_ERROR && uTemp[0] == 0x1234){
            err = U_ZERO_ERROR;
            uTarget = (UChar*) malloc( sizeof(UChar) * (uDestLen+1));
            uTargetLength =  uDestLen;

+            uTarget[uTargetLength] = 0xfff0;
+            uDestLen = -1;
            u_strFromUTF8(uTarget,uTargetLength,&uDestLen,u8Src,u8SrcLen,&err);
        }
        else {
-            log_err("Should have gotten U_BUFFER_OVERFLOW_ERROR");
+            log_err("error: u_strFromUTF8(preflight) should have gotten U_BUFFER_OVERFLOW_ERROR\n");
        }
        /*for(i=0; i< uDestLen; i++){
            printf("0x%04X, ",uTarget[i]);
@ -326,6 +338,9 @@ static void Test_UChar_UTF8_API(void){
            }
        }*/

+        if(U_FAILURE(err) || uDestLen != uTargetLength || uTarget[uTargetLength] != 0xfff0) {
+            failed = TRUE;
+        }
        for(i=0; i< uSrcLen; i++){
            if(uTarget[i] != src16[i]){
                log_verbose("u_strFromUTF8() failed expected: \\u%04X got: \\u%04X at index: %i \n", src16[i] ,uTarget[i],i);
@ -333,7 +348,7 @@ static void Test_UChar_UTF8_API(void){
            }
        }
        if(failed){
-            log_err("u_strToUTF8() failed \n");
+            log_err("error: u_strFromUTF8(after preflighting) failed\n");
        }

        free(u8Target);
@ -414,10 +429,14 @@ static void Test_UChar_UTF8_API(void){
    {
        static const UChar
            withLead16[]={ 0x1800, 0xd89a, 0x0061 },
-            withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 };
+            withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
+            withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
+            withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
        static const uint8_t
            withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
-            withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61 };
+            withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
+            withTrail8Sub1A[]={ 0xe1, 0xa0, 0x80, 0x1a, 0x61, 0 }, /* sub==U+001A */
+            withTrail8SubFFFD[]={ 0xe1, 0xa0, 0x80, 0xef, 0xbf, 0xbd, 0x61, 0 }; /* sub==U+FFFD */
        UChar out16[10];
        char out8[10];

@ -429,8 +448,384 @@ static void Test_UChar_UTF8_API(void){
        ) {
            log_err("error: u_strTo/FromUTF8(string with single surrogate) fails to report error\n");
        }
+
+        /* test error handling with substitution characters */
+
+        /* from UTF-8 with length */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out16[0]=0x55aa;
+        uDestLen=0;
+        u_strFromUTF8WithSub(out16, LENGTHOF(out16), &uDestLen,
+                             (const char *)withTrail8, uprv_strlen((const char *)withTrail8),
+                             0x50005, &numSubstitutions,
+                             &err);
+        if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
+                             0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
+                             numSubstitutions!=1) {
+            log_err("error: u_strFromUTF8WithSub(length) failed\n");
+        }
+
+        /* from UTF-8 with NUL termination */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out16[0]=0x55aa;
+        uDestLen=0;
+        u_strFromUTF8WithSub(out16, LENGTHOF(out16), &uDestLen,
+                             (const char *)withTrail8, -1,
+                             0xfffd, &numSubstitutions,
+                             &err);
+        if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
+                             0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
+                             numSubstitutions!=1) {
+            log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
+        }
+
+        /* preflight from UTF-8 with NUL termination */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out16[0]=0x55aa;
+        uDestLen=0;
+        u_strFromUTF8WithSub(out16, 1, &uDestLen,
+                             (const char *)withTrail8, -1,
+                             0x50005, &numSubstitutions,
+                             &err);
+        if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
+            log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
+        }
+
+        /* to UTF-8 with length */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out8[0]=0xf5;
+        u8DestLen=0;
+        u_strToUTF8WithSub(out8, LENGTHOF(out8), &u8DestLen,
+                           withTrail16, u_strlen(withTrail16),
+                           0xfffd, &numSubstitutions,
+                           &err);
+        if(U_FAILURE(err) || u8DestLen!=uprv_strlen((const char *)withTrail8SubFFFD) ||
+                             0!=uprv_memcmp((const char *)withTrail8SubFFFD, out8, u8DestLen+1) ||
+                             numSubstitutions!=1) {
+            log_err("error: u_strToUTF8WithSub(length) failed\n");
+        }
+
+        /* to UTF-8 with NUL termination */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out8[0]=0xf5;
+        u8DestLen=0;
+        u_strToUTF8WithSub(out8, LENGTHOF(out8), &u8DestLen,
+                           withTrail16, -1,
+                           0x1a, &numSubstitutions,
+                           &err);
+        if(U_FAILURE(err) || u8DestLen!=uprv_strlen((const char *)withTrail8Sub1A) ||
+                             0!=uprv_memcmp((const char *)withTrail8Sub1A, out8, u8DestLen+1) ||
+                             numSubstitutions!=1) {
+            log_err("error: u_strToUTF8WithSub(NUL termination) failed\n");
+        }
+
+        /* preflight to UTF-8 with NUL termination */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out8[0]=0xf5;
+        u8DestLen=0;
+        u_strToUTF8WithSub(out8, 1, &u8DestLen,
+                           withTrail16, -1,
+                           0xfffd, &numSubstitutions,
+                           &err);
+        if(err!=U_BUFFER_OVERFLOW_ERROR || u8DestLen!=uprv_strlen((const char *)withTrail8SubFFFD) ||
+                                           numSubstitutions!=1) {
+            log_err("error: u_strToUTF8WithSub(preflight/NUL termination) failed\n");
+        }
+
+        /* test that numSubstitutions==0 if there are no substitutions */
+
+        /* from UTF-8 with length (just first 3 bytes which are valid) */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out16[0]=0x55aa;
+        uDestLen=0;
+        u_strFromUTF8WithSub(out16, LENGTHOF(out16), &uDestLen,
+                             (const char *)withTrail8, 3,
+                             0x50005, &numSubstitutions,
+                             &err);
+        if(U_FAILURE(err) || uDestLen!=1 ||
+                             0!=u_memcmp(withTrail16Sub50005, out16, uDestLen) ||
+                             numSubstitutions!=0) {
+            log_err("error: u_strFromUTF8WithSub(no subs) failed\n");
+        }
+
+        /* to UTF-8 with length (just first UChar which is valid) */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out8[0]=0xf5;
+        u8DestLen=0;
+        u_strToUTF8WithSub(out8, LENGTHOF(out8), &u8DestLen,
+                           withTrail16, 1,
+                           0xfffd, &numSubstitutions,
+                           &err);
+        if(U_FAILURE(err) || u8DestLen!=3 ||
+                             0!=uprv_memcmp((const char *)withTrail8SubFFFD, out8, u8DestLen) ||
+                             numSubstitutions!=0) {
+            log_err("error: u_strToUTF8WithSub(no subs) failed\n");
+        }
+
+        /* test that numSubstitutions==0 if subchar==U_SENTINEL (no subchar) */
+
+        /* from UTF-8 with length (just first 3 bytes which are valid) */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out16[0]=0x55aa;
+        uDestLen=0;
+        u_strFromUTF8WithSub(out16, LENGTHOF(out16), &uDestLen,
+                             (const char *)withTrail8, 3,
+                             U_SENTINEL, &numSubstitutions,
+                             &err);
+        if(U_FAILURE(err) || uDestLen!=1 ||
+                             0!=u_memcmp(withTrail16Sub50005, out16, uDestLen) ||
+                             numSubstitutions!=0) {
+            log_err("error: u_strFromUTF8WithSub(no subchar) failed\n");
+        }
+
+        /* to UTF-8 with length (just first UChar which is valid) */
+        err=U_ZERO_ERROR;
+        numSubstitutions=-1;
+        out8[0]=0xf5;
+        u8DestLen=0;
+        u_strToUTF8WithSub(out8, LENGTHOF(out8), &u8DestLen,
+                           withTrail16, 1,
+                           U_SENTINEL, &numSubstitutions,
+                           &err);
+        if(U_FAILURE(err) || u8DestLen!=3 ||
+                             0!=uprv_memcmp((const char *)withTrail8SubFFFD, out8, u8DestLen) ||
+                             numSubstitutions!=0) {
+            log_err("error: u_strToUTF8WithSub(no subchar) failed\n");
        }
    }
+}
+
+/*
+ * Compare the first length UChars of s and t.
+ * A 0xfffd in t (the second string) acts as a wildcard: it matches
+ * whatever UChar is at the same index in s (the first string).
+ * Returns TRUE if every position matches (also when length<=0), otherwise FALSE.
+ */
+static UBool
+equalAnyFFFD(const UChar *s, const UChar *t, int32_t length) {
+    int32_t i;
+
+    for(i=0; i<length; ++i) {
+        if(t[i]==0xfffd) {
+            continue; /* wildcard position, accept anything in s */
+        }
+        if(s[i]!=t[i]) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+/*
+ * Test u_strFromUTF8Lenient().
+ * First checks the handling of illegal arguments and of an incoming failure code,
+ * then converts each string from a table of well-formed, malformed, truncated
+ * and empty UTF-8 inputs in several ways: NUL-terminated or with an explicit
+ * length, each with preflighting, too little, ample and tight destination capacity.
+ * Sentinel values (0x1234) are planted in dest before each call so that
+ * unexpected writes and missing/unexpected NUL termination can be detected.
+ */
+static void
+Test_FromUTF8Lenient(void) {
+    /*
+     * Multiple input strings, each NUL-terminated.
+     * Terminate with a string starting with 0xff.
+     */
+    static const uint8_t bytes[]={
+        /* well-formed UTF-8 */
+        0x61,  0xc3, 0x9f,  0xe0, 0xa0, 0x80,  0xf0, 0xa0, 0x80, 0x80,
+        0x62,  0xc3, 0xa0,  0xe0, 0xa0, 0x81,  0xf0, 0xa0, 0x80, 0x81, 0,
+
+        /* various malformed sequences */
+        0xc3, 0xc3, 0x9f,  0xc3, 0xa0,  0xe0, 0x80, 0x8a,  0xf0, 0x41, 0x42, 0x43, 0,
+
+        /* truncated input */
+        0xc3, 0,
+        0xe0, 0,
+        0xe0, 0xa0, 0,
+        0xf0, 0,
+        0xf0, 0x90, 0,
+        0xf0, 0x90, 0x80, 0,
+
+        /* empty string */
+        0,
+
+        /* finish */
+        0xff, 0
+    };
+
+    /*
+     * Multiple output strings, each NUL-terminated. 0xfffd matches anything.
+     * The strings are in the same order as the input strings in bytes[].
+     */
+    static const UChar uchars[]={
+        0x61, 0xdf, 0x800,  0xd840, 0xdc00,
+        0x62, 0xe0, 0x801,  0xd840, 0xdc01,  0,
+
+        0xfffd, 0x9f, 0xe0, 0xa,  0xfffd, 0xfffd,  0,
+
+        0xfffd, 0,
+        0xfffd, 0,
+        0xfffd, 0,
+        0xfffd, 0,
+        0xfffd, 0,
+        0xfffd, 0,
+
+        0,
+
+        0
+    };
+
+    UChar dest[64];
+    const char *pb;
+    const UChar *pu, *pDest;
+    int32_t srcLength, destLength0, destLength;
+    int number;
+    UErrorCode errorCode;
+
+    /* verify checking for some illegal arguments */
+
+    /* src==NULL must be rejected without touching dest */
+    dest[0]=0x1234;
+    destLength=-1;
+    errorCode=U_ZERO_ERROR;
+    pDest=u_strFromUTF8Lenient(dest, 1, &destLength, NULL, -1, &errorCode);
+    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || dest[0]!=0x1234) {
+        log_err("u_strFromUTF8Lenient(src=NULL) failed\n");
+    }
+
+    /* dest==NULL with a non-zero destCapacity must be rejected */
+    dest[0]=0x1234;
+    destLength=-1;
+    errorCode=U_ZERO_ERROR;
+    pDest=u_strFromUTF8Lenient(NULL, 1, &destLength, (const char *)bytes, -1, &errorCode);
+    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
+        log_err("u_strFromUTF8Lenient(dest=NULL[1]) failed\n");
+    }
+
+    /* an incoming failure code must be preserved and dest left untouched */
+    dest[0]=0x1234;
+    destLength=-1;
+    errorCode=U_MEMORY_ALLOCATION_ERROR;
+    pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, &errorCode);
+    if(errorCode!=U_MEMORY_ALLOCATION_ERROR || dest[0]!=0x1234) {
+        log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
+    }
+
+    /*
+     * pErrorCode==NULL: errorCode is not passed to this call,
+     * so only the untouched dest can be checked
+     */
+    dest[0]=0x1234;
+    destLength=-1;
+    errorCode=U_MEMORY_ALLOCATION_ERROR;
+    pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
+    if(dest[0]!=0x1234) {
+        log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
+    }
+
+    /* test normal behavior */
+    number=0; /* string number for log_err() */
+
+    /*
+     * Walk bytes[] and uchars[] in parallel, one NUL-terminated string at a time:
+     * pb and pu each skip over the current string and its NUL.
+     * The loop stops at the input string that starts with 0xff.
+     */
+    for(pb=(const char *)bytes, pu=uchars;
+        *pb!=(char)0xff;
+        pb+=srcLength+1, pu+=destLength0+1, ++number
+    ) {
+        srcLength=uprv_strlen(pb);      /* input length in bytes */
+        destLength0=u_strlen(pu);       /* expected output length in UChars */
+
+        /* preflighting with NUL-termination */
+        dest[0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(NULL, 0, &destLength, pb, -1, &errorCode);
+        /* for an empty expected result, a warning is expected rather than an overflow error */
+        if (errorCode!= (destLength0==0 ? U_STRING_NOT_TERMINATED_WARNING : U_BUFFER_OVERFLOW_ERROR) ||
+            pDest!=NULL || dest[0]!=0x1234 || destLength!=destLength0
+        ) {
+            log_err("u_strFromUTF8Lenient(%d preflighting with NUL-termination) failed\n", number);
+        }
+
+        /* preflighting/some capacity with NUL-termination */
+        if(srcLength>0) {
+            /* capacity is one UChar too small; the UChar just past the capacity must stay untouched */
+            dest[destLength0-1]=0x1234;
+            destLength=-1;
+            errorCode=U_ZERO_ERROR;
+            pDest=u_strFromUTF8Lenient(dest, destLength0-1, &destLength, pb, -1, &errorCode);
+            if (errorCode!=U_BUFFER_OVERFLOW_ERROR ||
+                dest[destLength0-1]!=0x1234 || destLength!=destLength0
+            ) {
+                log_err("u_strFromUTF8Lenient(%d preflighting/some capacity with NUL-termination) failed\n", number);
+            }
+        }
+
+        /* conversion with NUL-termination, much capacity */
+        dest[0]=dest[destLength0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(dest, LENGTHOF(dest), &destLength, pb, -1, &errorCode);
+        /* the sentinel at dest[destLength0] must have been replaced by the terminating NUL */
+        if (errorCode!=U_ZERO_ERROR ||
+            pDest!=dest || dest[destLength0]!=0 ||
+            destLength!=destLength0 || !equalAnyFFFD(dest, pu, destLength)
+        ) {
+            log_err("u_strFromUTF8Lenient(%d conversion with NUL-termination, much capacity) failed\n", number);
+        }
+
+        /* conversion with NUL-termination, exact capacity */
+        dest[0]=dest[destLength0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(dest, destLength0, &destLength, pb, -1, &errorCode);
+        /* no room for the NUL: the sentinel just past the capacity must survive */
+        if (errorCode!=U_STRING_NOT_TERMINATED_WARNING ||
+            pDest!=dest || dest[destLength0]!=0x1234 ||
+            destLength!=destLength0 || !equalAnyFFFD(dest, pu, destLength)
+        ) {
+            log_err("u_strFromUTF8Lenient(%d conversion with NUL-termination, exact capacity) failed\n", number);
+        }
+
+        /* preflighting with length */
+        dest[0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(NULL, 0, &destLength, pb, srcLength, &errorCode);
+        /*
+         * with an explicit srcLength and too little capacity, the reported length
+         * is expected to be srcLength, not the actual output length
+         */
+        if (errorCode!= (destLength0==0 ? U_STRING_NOT_TERMINATED_WARNING : U_BUFFER_OVERFLOW_ERROR) ||
+            pDest!=NULL || dest[0]!=0x1234 || destLength!=srcLength
+        ) {
+            log_err("u_strFromUTF8Lenient(%d preflighting with length) failed\n", number);
+        }
+
+        /* preflighting/some capacity with length */
+        if(srcLength>0) {
+            /* capacity is one less than srcLength; again srcLength is the expected reported length */
+            dest[srcLength-1]=0x1234;
+            destLength=-1;
+            errorCode=U_ZERO_ERROR;
+            pDest=u_strFromUTF8Lenient(dest, srcLength-1, &destLength, pb, srcLength, &errorCode);
+            if (errorCode!=U_BUFFER_OVERFLOW_ERROR ||
+                dest[srcLength-1]!=0x1234 || destLength!=srcLength
+            ) {
+                log_err("u_strFromUTF8Lenient(%d preflighting/some capacity with length) failed\n", number);
+            }
+        }
+
+        /* conversion with length, much capacity */
+        dest[0]=dest[destLength0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(dest, LENGTHOF(dest), &destLength, pb, srcLength, &errorCode);
+        if (errorCode!=U_ZERO_ERROR ||
+            pDest!=dest || dest[destLength0]!=0 ||
+            destLength!=destLength0 || !equalAnyFFFD(dest, pu, destLength)
+        ) {
+            log_err("u_strFromUTF8Lenient(%d conversion with length, much capacity) failed\n", number);
+        }
+
+        /* conversion with length, srcLength capacity */
+        dest[0]=dest[srcLength]=dest[destLength0]=0x1234;
+        destLength=-1;
+        errorCode=U_ZERO_ERROR;
+        pDest=u_strFromUTF8Lenient(dest, srcLength, &destLength, pb, srcLength, &errorCode);
+        if(srcLength==destLength0) {
+            /* output as long as the input: the capacity is used up, no terminating NUL expected */
+            if (errorCode!=U_STRING_NOT_TERMINATED_WARNING ||
+                pDest!=dest || dest[destLength0]!=0x1234 ||
+                destLength!=destLength0 || !equalAnyFFFD(dest, pu, destLength)
+            ) {
+                log_err("u_strFromUTF8Lenient(%d conversion with length, srcLength capacity/not terminated) failed\n", number);
+            }
+        } else {
+            /* output differs in length from the input: a terminating NUL is expected at dest[destLength0] */
+            if (errorCode!=U_ZERO_ERROR ||
+                pDest!=dest || dest[destLength0]!=0 ||
+                destLength!=destLength0 || !equalAnyFFFD(dest, pu, destLength)
+            ) {
+                log_err("u_strFromUTF8Lenient(%d conversion with length, srcLength capacity/terminated) failed\n", number);
+            }
+        }
+    }
+}
+
 static const uint16_t src16j[] = {
    0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A,
    0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,