QStringConverter[win]: expose+test control of code-page
This makes it easy to test how fromLocal8Bit() and toLocal8Bit() behave with different code pages.

Pick-to: 6.6 6.5
Task-number: QTBUG-118318
Task-number: QTBUG-118185
Task-number: QTBUG-105105
Change-Id: Ib1cd3bccd27d598f4c80915557e332befcd96354
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
66b7cb2a88
commit
13fbedd162
@ -1253,7 +1253,8 @@ int QLocal8Bit::checkUtf8()
|
||||
return GetACP() == CP_UTF8 ? 1 : -1;
|
||||
}
|
||||
|
||||
static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
|
||||
static QString convertToUnicodeCharByChar(QByteArrayView in, quint32 codePage,
|
||||
QStringConverter::State *state)
|
||||
{
|
||||
qsizetype length = in.size();
|
||||
const char *chars = in.data();
|
||||
@ -1285,10 +1286,10 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S
|
||||
const char *mb = mbcs;
|
||||
const char *next = 0;
|
||||
QString s;
|
||||
while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
|
||||
while ((next = CharNextExA(codePage, mb, 0)) != mb) {
|
||||
wchar_t wc[2] ={0};
|
||||
int charlength = int(next - mb); // always just a few bytes
|
||||
int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
|
||||
int len = MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
|
||||
if (len>0) {
|
||||
s.append(QChar(wc[0]));
|
||||
} else {
|
||||
@ -1305,8 +1306,13 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
// Overload without an explicit code page: decodes `in` using CP_ACP by
// forwarding to the overload that takes the code page as a parameter.
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
{
    constexpr quint32 systemCodePage = CP_ACP;
    return convertToUnicode_sys(in, systemCodePage, state);
}
|
||||
|
||||
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
|
||||
QStringConverter::State *state)
|
||||
{
|
||||
qsizetype length = in.size();
|
||||
|
||||
@ -1336,7 +1342,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
|
||||
prev[0] = state_data;
|
||||
prev[1] = mb[0];
|
||||
remainingChars = 0;
|
||||
len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
|
||||
len = MultiByteToWideChar(codePage, MB_PRECOMPOSED,
|
||||
prev, 2, wc.data(), wc.length());
|
||||
if (len) {
|
||||
sp.append(QChar(wc[0]));
|
||||
@ -1351,11 +1357,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
|
||||
}
|
||||
}
|
||||
|
||||
while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
||||
while (!(len=MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
||||
mb, mblen, wc.data(), wc.length()))) {
|
||||
int r = GetLastError();
|
||||
if (r == ERROR_INSUFFICIENT_BUFFER) {
|
||||
const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
|
||||
const int wclen = MultiByteToWideChar(codePage, MB_PRECOMPOSED,
|
||||
mb, mblen, 0, 0);
|
||||
wc.resize(wclen);
|
||||
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
|
||||
@ -1364,7 +1370,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
|
||||
mblen--;
|
||||
//check whether, we hit an invalid character in the middle
|
||||
if ((mblen <= 1) || (remainingChars && state_data))
|
||||
return convertToUnicodeCharByChar(in, state);
|
||||
return convertToUnicodeCharByChar(in, codePage, state);
|
||||
//Remove the last character and try again...
|
||||
state_data = mb[mblen-1];
|
||||
remainingChars = 1;
|
||||
@ -1395,6 +1401,12 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
|
||||
}
|
||||
|
||||
// Overload without an explicit code page: encodes `in` using CP_ACP by
// forwarding to the overload that takes the code page as a parameter.
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
{
    constexpr quint32 systemCodePage = CP_ACP;
    return convertFromUnicode_sys(in, systemCodePage, state);
}
|
||||
|
||||
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
|
||||
QStringConverter::State *state)
|
||||
{
|
||||
const QChar *ch = in.data();
|
||||
qsizetype uclen = in.size();
|
||||
@ -1412,12 +1424,12 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::
|
||||
BOOL used_def;
|
||||
QByteArray mb(4096, 0);
|
||||
int len;
|
||||
while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
|
||||
while (!(len=WideCharToMultiByte(codePage, 0, (const wchar_t*)ch, uclen,
|
||||
mb.data(), mb.size()-1, 0, &used_def)))
|
||||
{
|
||||
int r = GetLastError();
|
||||
if (r == ERROR_INSUFFICIENT_BUFFER) {
|
||||
mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
|
||||
mb.resize(1+WideCharToMultiByte(codePage, 0,
|
||||
(const wchar_t*)ch, uclen,
|
||||
0, 0, 0, &used_def));
|
||||
// and try again...
|
||||
|
@ -362,6 +362,7 @@ struct Q_CORE_EXPORT QLocal8Bit
|
||||
}
|
||||
return r > 0;
|
||||
}
|
||||
static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *);
|
||||
static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *);
|
||||
static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
|
||||
{
|
||||
@ -369,6 +370,7 @@ struct Q_CORE_EXPORT QLocal8Bit
|
||||
return QUtf8::convertToUnicode(in, state);
|
||||
return convertToUnicode_sys(in, state);
|
||||
}
|
||||
static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *);
|
||||
static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *);
|
||||
static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
|
||||
{
|
||||
|
@ -180,6 +180,16 @@ private slots:
|
||||
void encodingForHtml();
|
||||
|
||||
void availableCodesAreAvailable();
|
||||
|
||||
#ifdef Q_OS_WIN
|
||||
// On all other systems local 8-bit encoding is UTF-8
|
||||
void fromLocal8Bit_data();
|
||||
void fromLocal8Bit();
|
||||
void fromLocal8Bit_special_cases();
|
||||
void toLocal8Bit_data();
|
||||
void toLocal8Bit();
|
||||
void toLocal8Bit_special_cases();
|
||||
#endif
|
||||
};
|
||||
|
||||
void tst_QStringConverter::constructByName()
|
||||
@ -2484,6 +2494,144 @@ void tst_QStringConverter::threadSafety()
|
||||
QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz"));
|
||||
}
|
||||
|
||||
#ifdef Q_OS_WIN
|
||||
void tst_QStringConverter::fromLocal8Bit_data()
|
||||
{
|
||||
QTest::addColumn<QByteArray>("eightBit");
|
||||
QTest::addColumn<QString>("utf16");
|
||||
QTest::addColumn<quint32>("codePage");
|
||||
|
||||
constexpr uint WINDOWS_1252 = 1252u;
|
||||
QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252;
|
||||
constexpr uint SHIFT_JIS = 932u;
|
||||
// Mostly two byte characters, but the comma is a single byte character (0xa4)
|
||||
QTest::newRow("shiftJIS")
|
||||
<< "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba
|
||||
<< u"こんにちは、世界!"_s << SHIFT_JIS;
|
||||
}
|
||||
|
||||
void tst_QStringConverter::fromLocal8Bit()
|
||||
{
|
||||
QFETCH(const QByteArray, eightBit);
|
||||
QFETCH(const QString, utf16);
|
||||
QFETCH(const quint32, codePage);
|
||||
|
||||
QStringConverter::State state;
|
||||
|
||||
QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state);
|
||||
QCOMPARE(result, utf16);
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
|
||||
result.clear();
|
||||
state.clear();
|
||||
for (char c : eightBit)
|
||||
result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state);
|
||||
QCOMPARE(result, utf16);
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
}
|
||||
|
||||
void tst_QStringConverter::fromLocal8Bit_special_cases()
|
||||
{
|
||||
QStringConverter::State state;
|
||||
constexpr uint SHIFT_JIS = 932u;
|
||||
// Decode a 2-octet character, but only provide 1 octet at first:
|
||||
QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state);
|
||||
QCOMPARE(result, QString());
|
||||
QVERIFY(result.isNull());
|
||||
QCOMPARE_GT(state.remainingChars, 0);
|
||||
// Then provide the second octet:
|
||||
result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state);
|
||||
QCOMPARE(result, u"こ");
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
|
||||
// Now try a 3-octet UTF-8 sequence:
|
||||
result.clear();
|
||||
state.clear();
|
||||
constexpr uint UTF8 = 65001u;
|
||||
// First the first 2 octets:
|
||||
result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state);
|
||||
QCOMPARE(result, QString());
|
||||
QVERIFY(result.isNull());
|
||||
QCOMPARE_GT(state.remainingChars, 0);
|
||||
// Then provide the remaining octet:
|
||||
result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state);
|
||||
QEXPECT_FAIL("", "We don't store enough state to handle this case", Abort);
|
||||
QCOMPARE(result, u"你");
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
|
||||
// Now try a 4-octet GB 18030 sequence:
|
||||
result.clear();
|
||||
state.clear();
|
||||
constexpr uint GB_18030 = 54936u;
|
||||
const char sequence[] = "\x95\x32\x90\x31";
|
||||
QByteArrayView octets = QByteArrayView(sequence);
|
||||
result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state);
|
||||
QCOMPARE(result, QString());
|
||||
QVERIFY(result.isNull());
|
||||
QEXPECT_FAIL("",
|
||||
"We don't store enough state to handle this case. + GB 18030 does not work with "
|
||||
"the MB_PRECOMPOSED flag.",
|
||||
Abort);
|
||||
QCOMPARE_GT(state.remainingChars, 0);
|
||||
// Then provide one more octet:
|
||||
result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state);
|
||||
QCOMPARE(result, QString());
|
||||
QVERIFY(result.isNull());
|
||||
QCOMPARE_GT(state.remainingChars, 0);
|
||||
// Then provide the last octet
|
||||
result = QLocal8Bit::convertToUnicode_sys(octets.last(1), GB_18030, &state);
|
||||
QCOMPARE(result, u"𠂇");
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
}
|
||||
|
||||
void tst_QStringConverter::toLocal8Bit_data()
|
||||
{
|
||||
fromLocal8Bit_data();
|
||||
}
|
||||
|
||||
void tst_QStringConverter::toLocal8Bit()
|
||||
{
|
||||
QFETCH(const QByteArray, eightBit);
|
||||
QFETCH(const QString, utf16);
|
||||
QFETCH(const quint32, codePage);
|
||||
|
||||
QStringConverter::State state;
|
||||
|
||||
QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state);
|
||||
QCOMPARE(result, eightBit);
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
|
||||
result.clear();
|
||||
state.clear();
|
||||
for (QChar c : utf16)
|
||||
result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state);
|
||||
QCOMPARE(result, eightBit);
|
||||
}
|
||||
|
||||
void tst_QStringConverter::toLocal8Bit_special_cases()
|
||||
{
|
||||
QStringConverter::State state;
|
||||
// Normally utf8 goes through a different code path, but we can force it here
|
||||
constexpr uint UTF8 = 65001u;
|
||||
// Decode a 2-code unit character, but only provide 1 code unit at first:
|
||||
const char16_t a[] = u"𬽦";
|
||||
QStringView firstHalf = QStringView(a, 1);
|
||||
QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state);
|
||||
QEXPECT_FAIL("", "We don't currently handle missing the low surrogate", Abort);
|
||||
QCOMPARE(result, QString());
|
||||
QVERIFY(result.isNull());
|
||||
QCOMPARE_GT(state.remainingChars, 0);
|
||||
// Then provide the second code unit:
|
||||
QStringView secondHalf = QStringView(a + 1, 1);
|
||||
result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state);
|
||||
QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba);
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
|
||||
// Retain compat with the behavior for toLocal8Bit:
|
||||
QCOMPARE(firstHalf.toLocal8Bit(), "?");
|
||||
}
|
||||
#endif
|
||||
|
||||
struct DontCrashAtExit {
|
||||
~DontCrashAtExit() {
|
||||
QStringDecoder decoder(QStringDecoder::Utf8);
|
||||
|
Loading…
Reference in New Issue
Block a user