QLocal8Bit::convertFromUnicode[win]: handle trailing high surrogate

The win32 API doesn't give us much choice. _Some_ code pages support
returning an error if we pass a specific flag, but not all of them
do.

Anyway, since the code pages might not support all that UTF-16 provides,
we can't reasonably make it error out on characters that cannot be
converted.

So, the most reasonable thing we can handle is an unpaired high surrogate
at the end of a string: we assume that the rest of the string was fine and
that the low surrogate will be provided in the next call.

Pick-to: 6.6 6.5
Fixes: QTBUG-118185
Task-number: QTBUG-105105
Change-Id: I1f193c9d8e04bec769d885d32440c759d9dff0c2
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Mårten Nordheim 2023-10-18 17:30:14 +02:00
parent e34ee8e88a
commit d8d5922f16
2 changed files with 66 additions and 16 deletions

View File

@ -1388,7 +1388,6 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
Q_ASSERT(uclen < INT_MAX); // ### FIXME
Q_ASSERT(state);
Q_UNUSED(state); // ### Fixme
if (state->flags & QStringConverter::Flag::Stateless) // temporary
state = nullptr;
@ -1402,15 +1401,47 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
qsizetype outlen = buf.size();
QByteArray mb;
int len;
if (state && state->remainingChars > 0) {
Q_ASSERT(state->remainingChars == 1);
// Let's try to decode the pending character
wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
int len = WideCharToMultiByte(codePage, 0, wc, int(std::size(wc)), out, outlen, nullptr,
nullptr);
if (!len)
return {}; // Cannot recover, and I refuse to believe it was a size limitation
out += len;
outlen -= len;
++ch;
--uclen;
state->remainingChars = 0;
state->state_data[0] = 0;
if (uclen == 0)
return QByteArrayView(buf.data(), len).toByteArray();
}
if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
// We can handle a missing low surrogate at the end of the string,
// so if there is one, exclude it now and store it in the state.
state->remainingChars = 1;
state->state_data[0] = ch[uclen - 1];
--uclen;
if (uclen == 0)
return QByteArray();
}
Q_ASSERT(uclen > 0);
int len = 0;
while (!(len = WideCharToMultiByte(codePage, 0, ch, int(uclen), out, int(outlen), nullptr,
nullptr))) {
int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) {
int neededLength = WideCharToMultiByte(codePage, 0, ch, int(uclen), nullptr, 0, nullptr,
nullptr);
mb.resize(neededLength);
out = mb.data();
const qsizetype currentLength = out - buf.data();
mb.resize(currentLength + neededLength);
memcpy(mb.data(), out, currentLength * sizeof(*out));
out = mb.data() + currentLength;
outlen = neededLength;
// and try again...
} else {
@ -1424,12 +1455,13 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
break;
}
}
if (!len)
return QByteArray();
if (out == buf.data())
mb = QByteArray(buf.data(), len);
else
mb.resize(len);
auto end = out + len;
if (QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size())) {
if (end != buf.data()) // else: we return null-array
mb = QByteArrayView(buf.data(), end).toByteArray();
} else {
mb.truncate(end - mb.data());
}
return mb;
}
#endif

View File

@ -2508,6 +2508,10 @@ void tst_QStringConverter::fromLocal8Bit_data()
QTest::newRow("shiftJIS")
<< "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba
<< u"こんにちは、世界!"_s << SHIFT_JIS;
constexpr uint GB_18030 = 54936u;
QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s
<< GB_18030;
}
void tst_QStringConverter::fromLocal8Bit()
@ -2604,6 +2608,7 @@ void tst_QStringConverter::toLocal8Bit()
for (QChar c : utf16)
result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state);
QCOMPARE(result, eightBit);
QCOMPARE(state.remainingChars, 0);
}
void tst_QStringConverter::toLocal8Bit_special_cases()
@ -2613,20 +2618,33 @@ void tst_QStringConverter::toLocal8Bit_special_cases()
constexpr uint UTF8 = 65001u;
// Decode a 2-code unit character, but only provide 1 code unit at first:
const char16_t a[] = u"𬽦";
QStringView firstHalf = QStringView(a, 1);
QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state);
QEXPECT_FAIL("", "We don't currently handle missing the low surrogate", Abort);
QStringView codeUnits = a;
QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the second code unit:
QStringView secondHalf = QStringView(a + 1, 1);
result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state);
result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba);
QCOMPARE(state.remainingChars, 0);
// Retain compat with the behavior for toLocal8Bit:
QCOMPARE(firstHalf.toLocal8Bit(), "?");
QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?");
// Now do the same, but the second time we feed in a character, we also
// provide many more so the internal stack buffer is not large enough.
result.clear();
state.clear();
QString str = QStringView(a).toString().repeated(2048);
codeUnits = str;
result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then we provide the rest of the string:
result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba);
QCOMPARE(state.remainingChars, 0);
}
#endif // Q_OS_WIN