QLocal8Bit::convertToUnicode[win]: handle more than one octet state

This applies both when storing the state and when restoring it.

Without this, a sequence of three or more octets would cause errors or
wrong output. This can be seen with GB 18030.

Pick-to: 6.6 6.5
Fixes: QTBUG-118318
Task-number: QTBUG-105105
Change-Id: Id1f7f5f2fba4633b9f888add2186f4d8d21b7293
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Mårten Nordheim 2023-10-20 13:48:21 +02:00
parent ef24784f88
commit 94214fe100
2 changed files with 62 additions and 83 deletions

View File

@ -24,6 +24,7 @@
#include <qt_windows.h>
#ifndef QT_BOOTSTRAPPED
#include <QtCore/qvarlengtharray.h>
#include <QtCore/q20iterator.h>
#endif // !QT_BOOTSTRAPPED
#endif
@ -1256,59 +1257,6 @@ int QLocal8Bit::checkUtf8()
return GetACP() == CP_UTF8 ? 1 : -1;
}
/*
    Slow-path decoder: converts \a in from \a codePage to UTF-16 one multibyte
    character at a time, so that a character MultiByteToWideChar() rejects
    only drops that character instead of failing the whole conversion.

    \a state must not be null. If it carries the Stateless flag it is ignored.
    Otherwise a byte saved by a previous call is prepended to the input, and
    if the very last input byte cannot be converted it is saved in \a state
    for the next call.

    Returns the decoded text; a null QString if \a in is empty.
*/
static QString convertToUnicodeCharByChar(QByteArrayView in, quint32 codePage,
                                          QStringConverter::State *state)
{
    const qsizetype length = in.size();
    const char *chars = in.data();
    Q_ASSERT(state);
    if (state->flags & QStringConverter::Flag::Stateless) // temporary
        state = nullptr;

    if (!chars || !length)
        return QString();

    // Reserve room for the saved byte(s) in front of the input and for two
    // terminating NULs behind it.
    qsizetype copyLocation = 0;
    qsizetype extra = 2;
    if (state && state->remainingChars) {
        copyLocation = state->remainingChars;
        extra += copyLocation;
    }
    const qsizetype newLength = length + extra;

    // Zero-filled scratch buffer: this guarantees the NUL termination that
    // CharNextExA() relies on, and the buffer is released on every exit path.
    QByteArray buffer(newLength, '\0');
    char *mbcs = buffer.data();
    memcpy(mbcs + copyLocation, chars, length);
    if (copyLocation) {
        // Restore the byte saved by the previous call.
        // NOTE(review): only state_data[0] is restored; this presumes
        // remainingChars is never larger than 1 on entry - confirm.
        mbcs[0] = char(state->state_data[0]);
        state->remainingChars = 0;
    }

    const char *mb = mbcs;
    const char *next = nullptr;
    QString s;
    // CharNextExA() returns its argument unchanged once it sits on the
    // terminating NUL, which ends the loop.
    while ((next = CharNextExA(codePage, mb, 0)) != mb) {
        wchar_t wc[2] = {0};
        const int charlength = int(next - mb); // always just a few bytes
        const int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
        if (len > 0) {
            // Append every code unit produced: a character outside the BMP
            // yields a surrogate pair, and keeping only wc[0] would leave a
            // lone high surrogate in the result.
            s.append(QStringView(wc, len));
        } else {
            const int r = GetLastError();
            // If the character being dropped is the last input byte, keep it
            // so the next call can complete the sequence.
            if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs + newLength - 3) && state) {
                state->remainingChars = 1;
                state->state_data[0] = *mb;
            }
        }
        mb = next;
    }
    return s;
}
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
{
return convertToUnicode_sys(in, CP_ACP, state);
@ -1330,28 +1278,60 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
wchar_t *out = buf.data();
qsizetype outlen = buf.size();
int len;
int len = 0;
QString sp;
//convert the pending character (if available)
if (state && state->remainingChars) {
char prev[3] = {0};
prev[0] = state->state_data[0];
prev[1] = mb[0];
state->remainingChars = 0;
len = MultiByteToWideChar(codePage, 0, prev, 2, out, outlen);
// Use at most 6 characters as a guess for the longest encoded character
// in any multibyte encoding.
// Even with a total of 2 bytes of overhead that would leave around
// 2^(4 * 8) possible characters
std::array<char, 6> prev = {0};
Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
int remainingChars = state->remainingChars;
for (int i = 0; i < remainingChars; ++i)
prev[i] = state->state_data[i];
do {
prev[remainingChars] = *mb;
++mb;
--mblen;
++remainingChars;
len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, prev.data(),
remainingChars, out, int(outlen));
} while (!len && mblen && remainingChars < int(prev.size()));
if (len) {
if (mblen == 1)
state->remainingChars = 0;
if (mblen == 0)
return QStringView(out, len).toString();
mb++;
mblen--;
++out;
--outlen;
out += len;
outlen -= len;
} else if (mblen == 0 && remainingChars <= q20::ssize(state->state_data)) {
// Update the state, maybe we're lucky next time
for (int i = state->remainingChars; i < remainingChars; ++i)
state->state_data[i] = prev[i];
state->remainingChars = remainingChars;
return QString();
} else {
// Reset the pointer and length, since we used none of it.
mb = in.data();
mblen = in.length();
// We couldn't decode any of the characters in the saved state,
// so output replacement characters
for (int i = 0; i < state->remainingChars; ++i)
out[i] = QChar::ReplacementCharacter;
out += state->remainingChars;
outlen -= state->remainingChars;
state->remainingChars = 0;
}
}
while (!(len=MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS,
mb, mblen, out, int(outlen)))) {
Q_ASSERT(mblen > 0);
Q_ASSERT(state->remainingChars == 0);
while (!(len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, mblen, out,
int(outlen)))) {
int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) {
Q_ASSERT(QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size()));
@ -1362,16 +1342,14 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
it = std::copy_n(buf.data(), offset, it);
out = it;
outlen = wclen;
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
//check whether, we hit an invalid character in the middle
if (state && ((mblen <= 1) || (state->remainingChars && state->state_data[0])))
return convertToUnicodeCharByChar(in, codePage, state);
//Remove the last character and try again...
if (state) {
state->state_data[0] = mb[mblen - 1];
state->remainingChars = 1;
} // else: We have discarded a character that we won't handle? @todo
mblen--;
} else if (r == ERROR_NO_UNICODE_TRANSLATION && state
&& state->remainingChars < q20::ssize(state->state_data)) {
++state->remainingChars;
--mblen;
for (qsizetype i = 0; i < state->remainingChars; ++i)
state->state_data[i] = mb[mblen + i];
if (mblen == 0)
break;
} else {
// Fail.
qWarning("MultiByteToWideChar: Cannot convert multibyte text");

View File

@ -2555,7 +2555,6 @@ void tst_QStringConverter::fromLocal8Bit_special_cases()
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the remaining octet:
result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state);
QEXPECT_FAIL("", "We don't store enough state to handle this case", Abort);
QCOMPARE(result, u"");
QCOMPARE(state.remainingChars, 0);
@ -2564,20 +2563,22 @@ void tst_QStringConverter::fromLocal8Bit_special_cases()
state.clear();
constexpr uint GB_18030 = 54936u;
const char sequence[] = "\x95\x32\x90\x31";
QByteArrayView octets = QByteArrayView(sequence);
// Repeat the sequence multiple times to test handling of exhaustion of
// internal buffer
QByteArray repeated = QByteArray(sequence).repeated(2049);
QByteArrayView octets = QByteArrayView(repeated);
result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QEXPECT_FAIL("", "We don't store enough state to handle this case.", Abort);
QCOMPARE_GT(state.remainingChars, 0);
// Then provide one more octet:
result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the last octet
result = QLocal8Bit::convertToUnicode_sys(octets.last(1), GB_18030, &state);
QCOMPARE(result, u"𠂇");
// Then provide the last octet + the rest of the string
result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state);
QCOMPARE(result.first(2), u"𠂇");
QCOMPARE(state.remainingChars, 0);
}
@ -2627,7 +2628,7 @@ void tst_QStringConverter::toLocal8Bit_special_cases()
// Retain compat with the behavior for toLocal8Bit:
QCOMPARE(firstHalf.toLocal8Bit(), "?");
}
#endif
#endif // Q_OS_WIN
struct DontCrashAtExit {
~DontCrashAtExit() {