ICU-13560 make some toUnicode converters check toULength, not toUnicodeStatus, for continuing after partial sequences; the ucnv.cpp framework code only resets toULength after an error

X-SVN-Rev: 40793
2018-01-23 21:32:36 +00:00 · 2018-01-23 21:32:36 +00:00 · 0dc85d2408
commit 0dc85d2408
parent ac0972f12c
5 changed files with 69 additions and 15 deletions
--- a/icu4c/source/common/ucnv_u32.cpp
+++ b/icu4c/source/common/ucnv_u32.cpp
@ -55,7 +55,7 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
    uint32_t ch, i;

    /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+    if (args->converter->toULength > 0 && myTarget < targetLimit) {
        i = args->converter->toULength;       /* restore # of bytes consumed */
        args->converter->toULength = 0;

@ -136,7 +136,7 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    int32_t offsetNum = 0;

    /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+    if (args->converter->toULength > 0 && myTarget < targetLimit) {
        i = args->converter->toULength;       /* restore # of bytes consumed */
        args->converter->toULength = 0;

@ -517,7 +517,7 @@ T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
    uint32_t ch, i;

    /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+    if (args->converter->toULength > 0 && myTarget < targetLimit)
    {
        i = args->converter->toULength;       /* restore # of bytes consumed */
        args->converter->toULength = 0;
@ -604,7 +604,7 @@ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    int32_t offsetNum = 0;

    /* Restore state of current sequence */
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+    if (args->converter->toULength > 0 && myTarget < targetLimit)
    {
        i = args->converter->toULength;       /* restore # of bytes consumed */
        args->converter->toULength = 0;
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@ -76,7 +76,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
    int32_t i, inBytes;

    /* Restore size of current sequence */
-    if (cnv->toUnicodeStatus && myTarget < targetLimit)
+    if (cnv->toULength > 0 && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
@ -194,7 +194,7 @@ static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
    int32_t i, inBytes;

    /* Restore size of current sequence */
-    if (cnv->toUnicodeStatus && myTarget < targetLimit)
+    if (cnv->toULength > 0 && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
@ -670,12 +670,13 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
    } else {
        toULength=oldToULength=toULimit=0;
+        c = 0;
    }

    count=(int32_t)(sourceLimit-source)+oldToULength;
--- a/icu4c/source/common/ucnvlat1.cpp
+++ b/icu4c/source/common/ucnvlat1.cpp
@ -340,7 +340,11 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
+    if (utf8->toULength > 0) {
+        c=(UChar32)utf8->toUnicodeStatus;
+    } else {
+        c = 0;
+    }
    if(c!=0 && source<sourceLimit) {
        if(targetCapacity==0) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@ -620,7 +624,7 @@ ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

    uint8_t c;

-    if(pToUArgs->converter->toUnicodeStatus!=0) {
+    if(pToUArgs->converter->toULength > 0) {
        /* no handling of partial UTF-8 characters here, fall back to pivoting */
        *pErrorCode=U_USING_DEFAULT_WARNING;
        return;
--- a/icu4c/source/common/ucnvmbcs.cpp
+++ b/icu4c/source/common/ucnvmbcs.cpp
@ -5064,12 +5064,13 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
    } else {
        toULength=oldToULength=toULimit=0;
+        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
@ -5359,12 +5360,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
-    c=(UChar32)utf8->toUnicodeStatus;
-    if(c!=0) {
+    if(utf8->toULength > 0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
+        c=(UChar32)utf8->toUnicodeStatus;
    } else {
        toULength=oldToULength=toULimit=0;
+        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@ -733,6 +733,7 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
    UChar *pivotSource = buffer16;
    UChar *pivotTarget = buffer16;
    const UChar *pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
+    int32_t length;

    // Convert with insufficient target capacity.
    result[2] = 5;
@ -741,7 +742,7 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
                   FALSE, FALSE, errorCode);
    assertEquals("overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
-    int32_t length = (int32_t)(target - result);
+    length = (int32_t)(target - result);
    assertEquals("number of bytes written", 2, length);
    assertEquals("next byte not clobbered", 5, result[2]);

@ -790,6 +791,52 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
    if (length == 5) {
        assertTrue("text2 result same as input", memcmp(text2, result, length) == 0);
    }
+
+    ucnv_reset(cnv1.getAlias());
+    ucnv_reset(cnv2.getAlias());
+    memset(result, 0, sizeof(result));
+    static const char *illFormed = "\xf1\x91\x93\x96\x91\x94";  // U+514D6 + two more trail bytes
+    source = illFormed;
+    sourceLimit = illFormed + strlen(illFormed);
+    target = result;
+    pivotSource = pivotTarget = buffer16;
+
+    ucnv_setToUCallBack(cnv1.getAlias(), UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, errorCode);
+
+    // Convert only two bytes and flush (but expect failure).
+    char errorBytes[10];
+    int8_t errorLength;
+    result[0] = 5;
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, source + 2,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+    assertEquals("illFormed truncated", U_TRUNCATED_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed truncated errorLength", 2, (int32_t)errorLength);
+    if (errorLength == 2) {
+        assertEquals("illFormed truncated errorBytes", 0xf191, 
+                     ((int32_t)(uint8_t)errorBytes[0] << 8) | (uint8_t)errorBytes[1]);
+    }
+
+    // Continue conversion starting with a trail byte.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+
+    assertEquals("illFormed trail byte", U_ILLEGAL_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed trail byte number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed trail byte errorLength", 1, (int32_t)errorLength);
+    if (errorLength == 1) {
+        assertEquals("illFormed trail byte errorBytes", 0x93, (int32_t)(uint8_t)errorBytes[0]);
+    }
 }

 // open testdata or ICU data converter ------------------------------------- ***