QStringConverter[win]: expose+test control of code-page

Then we can easily test how fromLocal8Bit() and toLocal8Bit() behave
with different code-pages.

Pick-to: 6.6 6.5
Task-number: QTBUG-118318
Task-number: QTBUG-118185
Task-number: QTBUG-105105
Change-Id: Ib1cd3bccd27d598f4c80915557e332befcd96354
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Mårten Nordheim 2023-10-16 16:37:34 +02:00
parent 66b7cb2a88
commit 13fbedd162
3 changed files with 172 additions and 10 deletions

View File

@ -1253,7 +1253,8 @@ int QLocal8Bit::checkUtf8()
return GetACP() == CP_UTF8 ? 1 : -1;
}
static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
static QString convertToUnicodeCharByChar(QByteArrayView in, quint32 codePage,
QStringConverter::State *state)
{
qsizetype length = in.size();
const char *chars = in.data();
@ -1285,10 +1286,10 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S
const char *mb = mbcs;
const char *next = 0;
QString s;
while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
while ((next = CharNextExA(codePage, mb, 0)) != mb) {
wchar_t wc[2] ={0};
int charlength = int(next - mb); // always just a few bytes
int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
int len = MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
if (len>0) {
s.append(QChar(wc[0]));
} else {
@ -1305,8 +1306,13 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S
return s;
}
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
{
return convertToUnicode_sys(in, CP_ACP, state);
}
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
QStringConverter::State *state)
{
qsizetype length = in.size();
@ -1336,7 +1342,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
prev[0] = state_data;
prev[1] = mb[0];
remainingChars = 0;
len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
len = MultiByteToWideChar(codePage, MB_PRECOMPOSED,
prev, 2, wc.data(), wc.length());
if (len) {
sp.append(QChar(wc[0]));
@ -1351,11 +1357,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
}
}
while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
while (!(len=MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
mb, mblen, wc.data(), wc.length()))) {
int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) {
const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
const int wclen = MultiByteToWideChar(codePage, MB_PRECOMPOSED,
mb, mblen, 0, 0);
wc.resize(wclen);
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
@ -1364,7 +1370,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
mblen--;
//check whether, we hit an invalid character in the middle
if ((mblen <= 1) || (remainingChars && state_data))
return convertToUnicodeCharByChar(in, state);
return convertToUnicodeCharByChar(in, codePage, state);
//Remove the last character and try again...
state_data = mb[mblen-1];
remainingChars = 1;
@ -1395,6 +1401,12 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
}
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
{
return convertFromUnicode_sys(in, CP_ACP, state);
}
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
QStringConverter::State *state)
{
const QChar *ch = in.data();
qsizetype uclen = in.size();
@ -1412,12 +1424,12 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::
BOOL used_def;
QByteArray mb(4096, 0);
int len;
while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
while (!(len=WideCharToMultiByte(codePage, 0, (const wchar_t*)ch, uclen,
mb.data(), mb.size()-1, 0, &used_def)))
{
int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) {
mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
mb.resize(1+WideCharToMultiByte(codePage, 0,
(const wchar_t*)ch, uclen,
0, 0, 0, &used_def));
// and try again...

View File

@ -362,6 +362,7 @@ struct Q_CORE_EXPORT QLocal8Bit
}
return r > 0;
}
static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *);
static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *);
static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
{
@ -369,6 +370,7 @@ struct Q_CORE_EXPORT QLocal8Bit
return QUtf8::convertToUnicode(in, state);
return convertToUnicode_sys(in, state);
}
static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *);
static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *);
static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
{

View File

@ -180,6 +180,16 @@ private slots:
void encodingForHtml();
void availableCodesAreAvailable();
#ifdef Q_OS_WIN
// On all other systems local 8-bit encoding is UTF-8
void fromLocal8Bit_data();
void fromLocal8Bit();
void fromLocal8Bit_special_cases();
void toLocal8Bit_data();
void toLocal8Bit();
void toLocal8Bit_special_cases();
#endif
};
void tst_QStringConverter::constructByName()
@ -2484,6 +2494,144 @@ void tst_QStringConverter::threadSafety()
QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz"));
}
#ifdef Q_OS_WIN
void tst_QStringConverter::fromLocal8Bit_data()
{
QTest::addColumn<QByteArray>("eightBit");
QTest::addColumn<QString>("utf16");
QTest::addColumn<quint32>("codePage");
constexpr uint WINDOWS_1252 = 1252u;
QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252;
constexpr uint SHIFT_JIS = 932u;
// Mostly two byte characters, but the comma is a single byte character (0xa4)
QTest::newRow("shiftJIS")
<< "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba
<< u"こんにちは、世界!"_s << SHIFT_JIS;
}
void tst_QStringConverter::fromLocal8Bit()
{
QFETCH(const QByteArray, eightBit);
QFETCH(const QString, utf16);
QFETCH(const quint32, codePage);
QStringConverter::State state;
QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state);
QCOMPARE(result, utf16);
QCOMPARE(state.remainingChars, 0);
result.clear();
state.clear();
for (char c : eightBit)
result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state);
QCOMPARE(result, utf16);
QCOMPARE(state.remainingChars, 0);
}
void tst_QStringConverter::fromLocal8Bit_special_cases()
{
QStringConverter::State state;
constexpr uint SHIFT_JIS = 932u;
// Decode a 2-octet character, but only provide 1 octet at first:
QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the second octet:
result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state);
QCOMPARE(result, u"");
QCOMPARE(state.remainingChars, 0);
// Now try a 3-octet UTF-8 sequence:
result.clear();
state.clear();
constexpr uint UTF8 = 65001u;
// First the first 2 octets:
result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the remaining octet:
result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state);
QEXPECT_FAIL("", "We don't store enough state to handle this case", Abort);
QCOMPARE(result, u"");
QCOMPARE(state.remainingChars, 0);
// Now try a 4-octet GB 18030 sequence:
result.clear();
state.clear();
constexpr uint GB_18030 = 54936u;
const char sequence[] = "\x95\x32\x90\x31";
QByteArrayView octets = QByteArrayView(sequence);
result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QEXPECT_FAIL("",
"We don't store enough state to handle this case. + GB 18030 does not work with "
"the MB_PRECOMPOSED flag.",
Abort);
QCOMPARE_GT(state.remainingChars, 0);
// Then provide one more octet:
result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the last octet
result = QLocal8Bit::convertToUnicode_sys(octets.last(1), GB_18030, &state);
QCOMPARE(result, u"𠂇");
QCOMPARE(state.remainingChars, 0);
}
void tst_QStringConverter::toLocal8Bit_data()
{
fromLocal8Bit_data();
}
void tst_QStringConverter::toLocal8Bit()
{
QFETCH(const QByteArray, eightBit);
QFETCH(const QString, utf16);
QFETCH(const quint32, codePage);
QStringConverter::State state;
QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state);
QCOMPARE(result, eightBit);
QCOMPARE(state.remainingChars, 0);
result.clear();
state.clear();
for (QChar c : utf16)
result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state);
QCOMPARE(result, eightBit);
}
void tst_QStringConverter::toLocal8Bit_special_cases()
{
QStringConverter::State state;
// Normally utf8 goes through a different code path, but we can force it here
constexpr uint UTF8 = 65001u;
// Decode a 2-code unit character, but only provide 1 code unit at first:
const char16_t a[] = u"𬽦";
QStringView firstHalf = QStringView(a, 1);
QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state);
QEXPECT_FAIL("", "We don't currently handle missing the low surrogate", Abort);
QCOMPARE(result, QString());
QVERIFY(result.isNull());
QCOMPARE_GT(state.remainingChars, 0);
// Then provide the second code unit:
QStringView secondHalf = QStringView(a + 1, 1);
result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state);
QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba);
QCOMPARE(state.remainingChars, 0);
// Retain compat with the behavior for toLocal8Bit:
QCOMPARE(firstHalf.toLocal8Bit(), "?");
}
#endif
struct DontCrashAtExit {
~DontCrashAtExit() {
QStringDecoder decoder(QStringDecoder::Utf8);