Add tests for decoding too-short UTF-8 sequences

We were handling this properly, but not testing it. I guess we weren't
testing because the condition is a valid intermediate state, so
hasFailure() is correct in returning false.

Testing inspired by the bug reported in
https://github.com/intel/tinycbor/issues/137

Change-Id: Ib47c56818178458a88b4fffd1554ecfdd0af637e
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
This commit is contained in:
Thiago Macieira 2018-09-16 09:05:47 -07:00
parent 4d40f09a45
commit 7e1a0c0739
5 changed files with 51 additions and 17 deletions

View File

@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
@ -1236,6 +1237,19 @@ bool QTextDecoder::hasFailure() const
return state.invalidChars != 0;
}
/*!
    \internal
    \since 5.12

    Returns \c true when the decoder is still waiting for further bytes
    before it can carry on decoding, that is, when the input supplied so
    far stopped partway through a multi-byte sequence. Be aware that some
    codecs may not report this condition.
*/
bool QTextDecoder::needsMoreData() const
{
    // Any non-zero remainingChars value in the converter state is
    // reported as "more data needed".
    return state.remainingChars != 0;
}
QT_END_NAMESPACE
#endif // QT_NO_TEXTCODEC

View File

@ -1,6 +1,6 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2018 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
@ -162,6 +162,7 @@ public:
QString toUnicode(const QByteArray &ba);
void toUnicode(QString *target, const char *chars, int len);
bool hasFailure() const;
bool needsMoreData() const;
private:
const QTextCodec *c;
QTextCodec::ConverterState state;

View File

@ -1,7 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2016 Intel Corporation.
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.

View File

@ -1,7 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2016 Intel Corporation.
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the test suite of the Qt Toolkit.
@ -71,7 +71,7 @@ void tst_Utf8::initTestCase()
// is the locale UTF-8?
if (QString(QChar(QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD") {
QTest::newRow("localecodec") << true;
qDebug() << "locale is utf8";
qInfo() << "locale is utf8";
}
}
@ -226,6 +226,15 @@ void tst_Utf8::invalidUtf8()
// The system's UTF-8 codec is sometimes buggy
// GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
// OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
if (!useLocale)
QVERIFY(decoder->hasFailure() || decoder->needsMoreData());
else if (!decoder->hasFailure() && !decoder->needsMoreData())
qWarning("System codec does not report failure when it should. Should report bug upstream.");
// add a continuation character and test that we don't accidentally use it
// (buffer overrun)
utf8 += char(0x80 | 0x3f);
decoder->toUnicode(utf8.constData(), utf8.size() - 1);
if (!useLocale)
QVERIFY(decoder->hasFailure());
else if (!decoder->hasFailure())

View File

@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2018 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the test suite of the Qt Toolkit.
@ -29,15 +30,24 @@
void loadInvalidUtf8Rows()
{
QTest::newRow("1char") << QByteArray("\x80");
QTest::newRow("2chars-1") << QByteArray("\xC2\xC0");
QTest::newRow("2chars-2") << QByteArray("\xC3\xDF");
QTest::newRow("2chars-3") << QByteArray("\xC7\xF0");
QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0");
QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0");
QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0");
QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80");
QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80");
// Wrong continuations
QTest::newRow("bad-continuation-1char") << QByteArray("\x80");
QTest::newRow("bad-continuation-2chars-1") << QByteArray("\xC2\xC0");
QTest::newRow("bad-continuation-2chars-2") << QByteArray("\xC3\xDF");
QTest::newRow("bad-continuation-2chars-3") << QByteArray("\xC7\xF0");
QTest::newRow("bad-continuation-3chars-1") << QByteArray("\xE0\xA0\xC0");
QTest::newRow("bad-continuation-3chars-2") << QByteArray("\xE0\xC0\xA0");
QTest::newRow("bad-continuation-4chars-1") << QByteArray("\xF0\x90\x80\xC0");
QTest::newRow("bad-continuation-4chars-2") << QByteArray("\xF0\x90\xC0\x80");
QTest::newRow("bad-continuation-4chars-3") << QByteArray("\xF0\xC0\x80\x80");
// Too short
QTest::newRow("too-short-2chars") << QByteArray("\xC2");
QTest::newRow("too-short-3chars-1") << QByteArray("\xE0");
QTest::newRow("too-short-3chars-2") << QByteArray("\xE0\xA0");
QTest::newRow("too-short-4chars-1") << QByteArray("\xF0");
QTest::newRow("too-short-4chars-2") << QByteArray("\xF0\x90");
QTest::newRow("too-short-4chars-3") << QByteArray("\xF0\x90\x80");
// Surrogate pairs must not be present either
// U+D800: 1101 10 0000 00 0000