ICU-21280 Correct source bytes counting in UTF8->UTF8 conversion

2020-09-11 15:50:41 +03:00 · 2020-09-11 15:50:41 +03:00 · 5a42118a6f
commit 5a42118a6f
parent 5ed09dc9b8
3 changed files with 63 additions and 2 deletions
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@ -707,9 +707,9 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

        // Do not go back into the bytes that will be read for finishing a partial
        // sequence from the previous buffer.
-        int32_t length=count-toULimit;
+        int32_t length=count-toULength;
        U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
-        count=toULimit+length;
+        count=toULength+length;
    }

    if(c!=0) {
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@ -77,6 +77,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
    TESTCASE_AUTO(TestGetUnicodeSet2);
    TESTCASE_AUTO(TestDefaultIgnorableCallback);
    TESTCASE_AUTO(TestUTF8ToUTF8Overflow);
+    TESTCASE_AUTO(TestUTF8ToUTF8Streaming);
    TESTCASE_AUTO_END;
 }

@ -830,6 +831,65 @@ ConversionTest::TestUTF8ToUTF8Overflow() {
    }
 }

+void
+ConversionTest::TestUTF8ToUTF8Streaming() {
+    IcuTestErrorCode errorCode(*this, "TestUTF8ToUTF8Streaming");
+    LocalUConverterPointer cnv1(ucnv_open("UTF-8", errorCode));
+    LocalUConverterPointer cnv2(ucnv_open("UTF-8", errorCode));
+
+    // UTF8 encoded cyrillic part of 'Lorem ipsum'
+    static const char* text =
+        "\xd0\xb5\xd1\x82\x20\xd1\x81\xd1\x86\xd0\xb0\xd0\xb5\xd0\xb2\xd0"
+        "\xbe\xd0\xbb\xd0\xb0\x20\xd1\x81\xd0\xb0\xd0\xb4\xd0\xb8\xd0\xbf"
+        "\xd1\x81\xd1\x86\xd0\xb8\xd0\xbd\xd0\xb3\x20\xd0\xb0\xd1\x86\xd1"
+        "\x86\xd0\xbe\xd0\xbc\xd0\xbc\xd0\xbe\xd0\xb4\xd0\xb0\xd1\x80\xd0"
+        "\xb5\x20\xd1\x85\xd0\xb0\xd1\x81";
+
+    int32_t chunk1 = 25; // partial lead at the end: 0xd0
+    int32_t chunk2 = 47; // partial tail at the beginning: 0xb0
+
+    char result[128];
+
+    int32_t sourceLen = (int32_t)strlen(text);
+    const char* source = text;
+    const char* sourceLimit = text + chunk1;
+
+    int32_t targetLen = sizeof(result);
+    char* target = result;
+    const char* targetLimit = result + targetLen;
+
+    UChar buffer16[20];
+    UChar* pivotSource = buffer16;
+    UChar* pivotTarget = buffer16;
+    const UChar* pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
+
+    int32_t length;
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+        &target, result + targetLen, &source, sourceLimit,
+        buffer16, &pivotSource, &pivotTarget, pivotLimit,
+        FALSE, FALSE, errorCode);
+
+    length = (int32_t)(target - result);
+    targetLen -= length;
+    assertEquals("First chunk -1 doesn't match converted length", chunk1 - 1, length);
+
+    source = text + chunk1;
+    sourceLimit = source + chunk2;
+
+    // Convert the rest and flush.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+        &target, targetLimit, &source, sourceLimit,
+        buffer16, &pivotSource, &pivotTarget, pivotLimit,
+        FALSE, TRUE, errorCode);
+
+    length = (int32_t)(target - result - length);
+    targetLen -= length;
+    assertEquals("Second chunk + 2 doesn't  match converted length", chunk2 + 1, length);
+
+    assertEquals("Full text length match", sourceLen, sizeof(result) - targetLen);
+    assertSuccess("UTF-8->UTF-8", errorCode);
+}
+
 // open testdata or ICU data converter ------------------------------------- ***

 UConverter *
--- a/icu4c/source/test/intltest/convtest.h
+++ b/icu4c/source/test/intltest/convtest.h
@ -77,6 +77,7 @@ public:
    void TestGetUnicodeSet2();
    void TestDefaultIgnorableCallback();
    void TestUTF8ToUTF8Overflow();
+    void TestUTF8ToUTF8Streaming();

 private:
    UBool