QLocal8Bit::convertFromUnicode[win]: handle trailing high surrogate

The win32 API doesn't give us much choice. _Some_ code pages support returning an error if we pass a specific flag, but not all of them. Anyway, since the code pages might not support everything that UTF-16 provides, we can't reasonably make it error out on characters that cannot be converted. So, the most reasonable thing we can handle is an unpaired high surrogate at the end of a string: assume that the rest of the string was fine, and that the low surrogate will be provided in the next call. Pick-to: 6.6 6.5 Fixes: QTBUG-118185 Task-number: QTBUG-105105 Change-Id: I1f193c9d8e04bec769d885d32440c759d9dff0c2 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com> Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
2023-10-18 17:30:14 +02:00 · 2023-10-18 17:30:14 +02:00 · d8d5922f16
commit d8d5922f16
parent e34ee8e88a
2 changed files with 66 additions and 16 deletions
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@ -1388,7 +1388,6 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,

    Q_ASSERT(uclen < INT_MAX); // ### FIXME
    Q_ASSERT(state);
-    Q_UNUSED(state); // ### Fixme
    if (state->flags & QStringConverter::Flag::Stateless) // temporary
        state = nullptr;

@ -1402,15 +1401,47 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
    qsizetype outlen = buf.size();
    QByteArray mb;

-    int len;
+    if (state && state->remainingChars > 0) {
+        Q_ASSERT(state->remainingChars == 1);
+        // Let's try to decode the pending character
+        wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
+        int len = WideCharToMultiByte(codePage, 0, wc, int(std::size(wc)), out, outlen, nullptr,
+                                      nullptr);
+        if (!len)
+            return {}; // Cannot recover, and I refuse to believe it was a size limitation
+        out += len;
+        outlen -= len;
+        ++ch;
+        --uclen;
+        state->remainingChars = 0;
+        state->state_data[0] = 0;
+        if (uclen == 0)
+            return QByteArrayView(buf.data(), len).toByteArray();
+    }
+
+    if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
+        // We can handle a missing low surrogate at the end of the string,
+        // so if there is one, exclude it now and store it in the state.
+        state->remainingChars = 1;
+        state->state_data[0] = ch[uclen - 1];
+        --uclen;
+        if (uclen == 0)
+            return QByteArray();
+    }
+
+    Q_ASSERT(uclen > 0);
+
+    int len = 0;
    while (!(len = WideCharToMultiByte(codePage, 0, ch, int(uclen), out, int(outlen), nullptr,
                                       nullptr))) {
        int r = GetLastError();
        if (r == ERROR_INSUFFICIENT_BUFFER) {
            int neededLength = WideCharToMultiByte(codePage, 0, ch, int(uclen), nullptr, 0, nullptr,
                                                   nullptr);
-            mb.resize(neededLength);
-            out = mb.data();
+            const qsizetype currentLength = out - buf.data();
+            mb.resize(currentLength + neededLength);
+            memcpy(mb.data(), out, currentLength * sizeof(*out));
+            out = mb.data() + currentLength;
            outlen = neededLength;
            // and try again...
        } else {
@ -1424,12 +1455,13 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
            break;
        }
    }
-    if (!len)
-        return QByteArray();
-    if (out == buf.data())
-        mb = QByteArray(buf.data(), len);
-    else
-        mb.resize(len);
+    auto end = out + len;
+    if (QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size())) {
+        if (end != buf.data()) // else: we return null-array
+            mb = QByteArrayView(buf.data(), end).toByteArray();
+    } else {
+        mb.truncate(end - mb.data());
+    }
    return mb;
 }
 #endif
--- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
+++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
@ -2508,6 +2508,10 @@ void tst_QStringConverter::fromLocal8Bit_data()
    QTest::newRow("shiftJIS")
            << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba
            << u"こんにちは､世界！"_s << SHIFT_JIS;
+
+    constexpr uint GB_18030 = 54936u;
+    QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界！"_s
+                              << GB_18030;
 }

 void tst_QStringConverter::fromLocal8Bit()
@ -2604,6 +2608,7 @@ void tst_QStringConverter::toLocal8Bit()
    for (QChar c : utf16)
        result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state);
    QCOMPARE(result, eightBit);
+    QCOMPARE(state.remainingChars, 0);
 }

 void tst_QStringConverter::toLocal8Bit_special_cases()
@ -2613,20 +2618,33 @@ void tst_QStringConverter::toLocal8Bit_special_cases()
    constexpr uint UTF8 = 65001u;
    // Decode a 2-code unit character, but only provide 1 code unit at first:
    const char16_t a[] = u"𬽦";
-    QStringView firstHalf = QStringView(a, 1);
-    QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state);
-    QEXPECT_FAIL("", "We don't currently handle missing the low surrogate", Abort);
+    QStringView codeUnits = a;
+    QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
    QCOMPARE(result, QString());
    QVERIFY(result.isNull());
    QCOMPARE_GT(state.remainingChars, 0);
    // Then provide the second code unit:
-    QStringView secondHalf = QStringView(a + 1, 1);
-    result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state);
+    result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
    QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba);
    QCOMPARE(state.remainingChars, 0);

    // Retain compat with the behavior for toLocal8Bit:
-    QCOMPARE(firstHalf.toLocal8Bit(), "?");
+    QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?");
+
+    // Now do the same, but the second time we feed in a character, we also
+    // provide many more so the internal stack buffer is not large enough.
+    result.clear();
+    state.clear();
+    QString str = QStringView(a).toString().repeated(2048);
+    codeUnits = str;
+    result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
+    QCOMPARE(result, QString());
+    QVERIFY(result.isNull());
+    QCOMPARE_GT(state.remainingChars, 0);
+    // Then we provide the rest of the string:
+    result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
+    QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba);
+    QCOMPARE(state.remainingChars, 0);
 }
 #endif // Q_OS_WIN