Add tests for decoding too-short UTF-8 sequences

We were handling these properly, but not testing them. I guess we weren't testing because the condition is a valid intermediate state, so hasFailure() is correct in returning false. Testing inspired by the bug reported in https://github.com/intel/tinycbor/issues/137 Change-Id: Ib47c56818178458a88b4fffd1554ecfdd0af637e Reviewed-by: Lars Knoll <lars.knoll@qt.io>
2018-09-16 09:05:47 -07:00 · 2018-09-16 09:05:47 -07:00 · 7e1a0c0739
commit 7e1a0c0739
parent 4d40f09a45
5 changed files with 51 additions and 17 deletions
--- a/src/corelib/codecs/qtextcodec.cpp
+++ b/src/corelib/codecs/qtextcodec.cpp
@ -1,6 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -1236,6 +1237,19 @@ bool QTextDecoder::hasFailure() const
    return state.invalidChars != 0;
 }

+/*!
+    \internal
+    \since 5.12
+
+    Determines whether the decoder needs more bytes to continue decoding. That
+    is, this signifies that the input string ended in the middle of a
+    multi-byte sequence. Note that it's possible some codecs do not report this.
+
+    Returns \c true when the converter state's \c remainingChars field is
+    non-zero, \c false otherwise.
+ */
+bool QTextDecoder::needsMoreData() const
+{
+    // Explicit comparison, matching hasFailure(), rather than relying on an
+    // implicit conversion to bool.
+    return state.remainingChars != 0;
+}
+
 QT_END_NAMESPACE

 #endif // QT_NO_TEXTCODEC
--- a/src/corelib/codecs/qtextcodec.h
+++ b/src/corelib/codecs/qtextcodec.h
@ -1,6 +1,6 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
+** Copyright (C) 2018 The Qt Company Ltd.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -162,6 +162,7 @@ public:
    QString toUnicode(const QByteArray &ba);
    void toUnicode(QString *target, const char *chars, int len);
    bool hasFailure() const;
+    bool needsMoreData() const;
 private:
    const QTextCodec *c;
    QTextCodec::ConverterState state;
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@ -1,7 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
--- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
+++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
@ -1,7 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the test suite of the Qt Toolkit.
@ -71,7 +71,7 @@ void tst_Utf8::initTestCase()
    // is the locale UTF-8?
    if (QString(QChar(QChar::ReplacementCharacter)).toLocal8Bit() == "\xEF\xBF\xBD") {
        QTest::newRow("localecodec") << true;
-        qDebug() << "locale is utf8";
+        qInfo() << "locale is utf8";
    }
 }

@ -226,6 +226,15 @@ void tst_Utf8::invalidUtf8()
    // The system's UTF-8 codec is sometimes buggy
    //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
    //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
+    if (!useLocale)
+        QVERIFY(decoder->hasFailure() || decoder->needsMoreData());
+    else if (!decoder->hasFailure() && !decoder->needsMoreData())
+        qWarning("System codec does not report failure when it should. Should report bug upstream.");
+
+    // add a continuation character and test that we don't accidentally use it
+    // (buffer overrun)
+    utf8 += char(0x80 | 0x3f);
+    decoder->toUnicode(utf8.constData(), utf8.size() - 1);
    if (!useLocale)
        QVERIFY(decoder->hasFailure());
    else if (!decoder->hasFailure())
--- a/tests/auto/corelib/codecs/utf8/utf8data.cpp
+++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp
@ -1,6 +1,7 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the test suite of the Qt Toolkit.
@ -29,15 +30,24 @@

 void loadInvalidUtf8Rows()
 {
-    QTest::newRow("1char") << QByteArray("\x80");
-    QTest::newRow("2chars-1") << QByteArray("\xC2\xC0");
-    QTest::newRow("2chars-2") << QByteArray("\xC3\xDF");
-    QTest::newRow("2chars-3") << QByteArray("\xC7\xF0");
-    QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0");
-    QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0");
-    QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0");
-    QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80");
-    QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80");
+    // Wrong continuations
+    QTest::newRow("bad-continuation-1char") << QByteArray("\x80");
+    QTest::newRow("bad-continuation-2chars-1") << QByteArray("\xC2\xC0");
+    QTest::newRow("bad-continuation-2chars-2") << QByteArray("\xC3\xDF");
+    QTest::newRow("bad-continuation-2chars-3") << QByteArray("\xC7\xF0");
+    QTest::newRow("bad-continuation-3chars-1") << QByteArray("\xE0\xA0\xC0");
+    QTest::newRow("bad-continuation-3chars-2") << QByteArray("\xE0\xC0\xA0");
+    QTest::newRow("bad-continuation-4chars-1") << QByteArray("\xF0\x90\x80\xC0");
+    QTest::newRow("bad-continuation-4chars-2") << QByteArray("\xF0\x90\xC0\x80");
+    QTest::newRow("bad-continuation-4chars-3") << QByteArray("\xF0\xC0\x80\x80");
+
+    // Too short
+    QTest::newRow("too-short-2chars") << QByteArray("\xC2");
+    QTest::newRow("too-short-3chars-1") << QByteArray("\xE0");
+    QTest::newRow("too-short-3chars-2") << QByteArray("\xE0\xA0");
+    QTest::newRow("too-short-4chars-1") << QByteArray("\xF0");
+    QTest::newRow("too-short-4chars-2") << QByteArray("\xF0\x90");
+    QTest::newRow("too-short-4chars-3") << QByteArray("\xF0\x90\x80");

    // Surrogate pairs must not be present either
    // U+D800:        1101   10 0000   00 0000