QDecompressHelper: Skip double-decompression if download is small

To retain backwards compatibility with some QNetworkReply usage, namely connecting solely to finished-signal and allocating a buffer to read into, but without storing the entire decompressed file in memory until read, we may decompress the file twice. With this patch users can now avoid this double decompression if the amount of buffered data stays below 10 MiB. This means any file smaller than 10 MiB will never need to be decompressed twice to know the size of it. On top of that, if the data is handled as it arrives (e.g. in readyRead) and the buffer is kept below 10 MiB it won't need to decompress twice either. This is active as long as "countDecompressed" is true, though it currently always is in QNetworkAccessManger, with a future goal to make it possible to control with public API. Since it requires the user to potentially adapt their usage of QNetworkReply. In this patch we also stop tracking the amount of unhandled uncompressed bytes (uncompressedBytes) in favor of tracking the total amount of bytes which has been read() by the user of QDecompressHelper (totalBytesRead), since we can more intuitively work out the total amount of unread bytes using this value. Change-Id: Ie3d8d6e39a18343fcf9b610f45c7fe7e4cd4e474 Reviewed-by: Timur Pocheptsov <timur.pocheptsov@qt.io>
2020-07-31 17:18:17 +02:00 · 2020-07-31 17:18:17 +02:00 · b41d5f6293
commit b41d5f6293
parent 5889985c8c
3 changed files with 99 additions and 27 deletions
--- a/src/network/access/qdecompresshelper.cpp
+++ b/src/network/access/qdecompresshelper.cpp
@ -237,7 +237,13 @@ void QDecompressHelper::setCountingBytesEnabled(bool shouldCount)
 qint64 QDecompressHelper::uncompressedSize() const
 {
    Q_ASSERT(countDecompressed);
-    return uncompressedBytes;
+    // Use the 'totalUncompressedBytes' from the countHelper if it exceeds the amount of bytes
+    // that we know about.
+    auto totalUncompressed =
+            countHelper && countHelper->totalUncompressedBytes > totalUncompressedBytes
+            ? countHelper->totalUncompressedBytes
+            : totalUncompressedBytes;
+    return totalUncompressed - totalBytesRead;
 }

 /*!
@ -262,10 +268,9 @@ void QDecompressHelper::feed(QByteArray &&data)
 {
    Q_ASSERT(contentEncoding != None);
    totalCompressedBytes += data.size();
-    if (!countInternal(data))
-        clear(); // If our counting brother failed then so will we :|
-    else
    compressedDataBuffer.append(std::move(data));
+    if (!countInternal(compressedDataBuffer[compressedDataBuffer.bufferCount() - 1]))
+        clear(); // If our counting brother failed then so will we :|
 }

 /*!
@ -276,10 +281,9 @@ void QDecompressHelper::feed(const QByteDataBuffer &buffer)
 {
    Q_ASSERT(contentEncoding != None);
    totalCompressedBytes += buffer.byteAmount();
+    compressedDataBuffer.append(buffer);
    if (!countInternal(buffer))
        clear(); // If our counting brother failed then so will we :|
-    else
-        compressedDataBuffer.append(buffer);
 }

 /*!
@ -290,10 +294,10 @@ void QDecompressHelper::feed(QByteDataBuffer &&buffer)
 {
    Q_ASSERT(contentEncoding != None);
    totalCompressedBytes += buffer.byteAmount();
-    if (!countInternal(buffer))
-        clear(); // If our counting brother failed then so will we :|
-    else
+    const QByteDataBuffer copy(buffer);
    compressedDataBuffer.append(std::move(buffer));
+    if (!countInternal(copy))
+        clear(); // If our counting brother failed then so will we :|
 }

 /*!
@ -303,19 +307,34 @@ void QDecompressHelper::feed(QByteDataBuffer &&buffer)
    This lets us know the final size, unfortunately at the cost of
    increased computation.

-    Potential @future improvement:
-    Decompress XX MiB/KiB before starting the count.
-        For smaller files the extra decompression can then be avoided.
+    To save on some of the computation we will store the data until
+    we reach \c MaxDecompressedDataBufferSize stored. In this case the
+    "penalty" is completely removed from users who read the data on
+    readyRead rather than waiting for it all to be received. And
+    any file smaller than \c MaxDecompressedDataBufferSize will
+    avoid this issue as well.
 */
 bool QDecompressHelper::countInternal()
 {
    Q_ASSERT(countDecompressed);
+    while (hasDataInternal()
+           && decompressedDataBuffer.byteAmount() < MaxDecompressedDataBufferSize) {
+        const qsizetype toRead = 256 * 1024;
+        QByteArray buffer(toRead, Qt::Uninitialized);
+        qsizetype bytesRead = readInternal(buffer.data(), buffer.size());
+        if (bytesRead == -1)
+            return false;
+        buffer.truncate(bytesRead);
+        decompressedDataBuffer.append(std::move(buffer));
+    }
+    if (!hasDataInternal())
+        return true; // handled all the data so far, just return
+
    while (countHelper->hasData()) {
        std::array<char, 1024> temp;
        qsizetype bytesRead = countHelper->read(temp.data(), temp.size());
        if (bytesRead == -1)
            return false;
-        uncompressedBytes += bytesRead;
    }
    return true;
 }
@ -358,13 +377,45 @@ bool QDecompressHelper::countInternal(const QByteDataBuffer &buffer)

 qsizetype QDecompressHelper::read(char *data, qsizetype maxSize)
 {
+    if (maxSize <= 0)
+        return 0;
+
    if (!isValid())
        return -1;

-    qsizetype bytesRead = -1;
    if (!hasData())
        return 0;

+    qsizetype cachedRead = 0;
+    if (!decompressedDataBuffer.isEmpty()) {
+        cachedRead = decompressedDataBuffer.read(data, maxSize);
+        data += cachedRead;
+        maxSize -= cachedRead;
+    }
+
+    qsizetype bytesRead = readInternal(data, maxSize);
+    if (bytesRead == -1)
+        return -1;
+    totalBytesRead += bytesRead + cachedRead;
+    return bytesRead + cachedRead;
+}
+
+/*!
+    \internal
+    Like read() but without attempting to read the
+    cached/already-decompressed data.
+*/
+qsizetype QDecompressHelper::readInternal(char *data, qsizetype maxSize)
+{
+    Q_ASSERT(isValid());
+
+    if (maxSize <= 0)
+        return 0;
+
+    if (!hasDataInternal())
+        return 0;
+
+    qsizetype bytesRead = -1;
    switch (contentEncoding) {
    case None:
        Q_UNREACHABLE();
@ -382,8 +433,6 @@ qsizetype QDecompressHelper::read(char *data, qsizetype maxSize)
    }
    if (bytesRead == -1)
        clear();
-    else if (countDecompressed)
-        uncompressedBytes -= bytesRead;

    totalUncompressedBytes += bytesRead;
    if (isPotentialArchiveBomb())
@ -449,6 +498,16 @@ bool QDecompressHelper::isPotentialArchiveBomb() const
    read 0 bytes. This most likely means the decompression is done.
 */
 bool QDecompressHelper::hasData() const
+{
+    return hasDataInternal() || !decompressedDataBuffer.isEmpty();
+}
+
+/*!
+    \internal
+    Like hasData() but internally the buffer of decompressed data is
+    not interesting.
+*/
+bool QDecompressHelper::hasDataInternal() const
 {
    return encodedBytesAvailable() || decoderHasData;
 }
@ -497,11 +556,12 @@ void QDecompressHelper::clear()
    contentEncoding = None;

    compressedDataBuffer.clear();
+    decompressedDataBuffer.clear();
    decoderHasData = false;

    countDecompressed = false;
    countHelper.reset();
-    uncompressedBytes = 0;
+    totalBytesRead = 0;
    totalUncompressedBytes = 0;
    totalCompressedBytes = 0;
 }
--- a/src/network/access/qdecompresshelper_p.h
+++ b/src/network/access/qdecompresshelper_p.h
@ -98,6 +98,8 @@ public:

 private:
    bool isPotentialArchiveBomb() const;
+    bool hasDataInternal() const;
+    qsizetype readInternal(char *data, qsizetype maxSize);

    bool countInternal();
    bool countInternal(const QByteArray &data);
@ -111,16 +113,18 @@ private:
    qsizetype readZstandard(char *data, qsizetype maxSize);

    QByteDataBuffer compressedDataBuffer;
+    QByteDataBuffer decompressedDataBuffer;
+    const qsizetype MaxDecompressedDataBufferSize = 10 * 1024 * 1024;
    bool decoderHasData = false;

    bool countDecompressed = false;
    std::unique_ptr<QDecompressHelper> countHelper;
-    qint64 uncompressedBytes = 0;

    // Used for calculating the ratio
    qint64 archiveBombCheckThreshold = 10 * 1024 * 1024;
    qint64 totalUncompressedBytes = 0;
    qint64 totalCompressedBytes = 0;
+    qint64 totalBytesRead = 0;

    ContentEncoding contentEncoding = None;

--- a/tests/auto/network/access/qdecompresshelper/tst_qdecompresshelper.cpp
+++ b/tests/auto/network/access/qdecompresshelper/tst_qdecompresshelper.cpp
@ -347,20 +347,24 @@ void tst_QDecompressHelper::decompressBigData_data()
    QTest::addColumn<QByteArray>("encoding");
    QTest::addColumn<QString>("path");
    QTest::addColumn<qint64>("size");
+    QTest::addColumn<bool>("countAhead");

    qint64 fourGiB = 4ll * 1024ll * 1024ll * 1024ll;
    qint64 fiveGiB = 5ll * 1024ll * 1024ll * 1024ll;

-    QTest::newRow("gzip-4G") << QByteArray("gzip") << QString(":/4G.gz") << fourGiB;
+    // Only use countAhead on one of these since they share codepath anyway
+    QTest::newRow("gzip-counted-4G") << QByteArray("gzip") << QString(":/4G.gz") << fourGiB << true;
    QTest::newRow("deflate-5G") << QByteArray("deflate") << QString(":/5GiB.txt.inflate")
-                                << fiveGiB;
+                                << fiveGiB << false;

 #if QT_CONFIG(brotli)
-    QTest::newRow("brotli-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB;
+    QTest::newRow("brotli-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB << false;
+    QTest::newRow("brotli-counted-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB << true;
 #endif

 #if QT_CONFIG(zstd)
-    QTest::newRow("zstandard-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB;
+    QTest::newRow("zstandard-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB << false;
+    QTest::newRow("zstandard-counted-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB << true;
 #endif
 }

@ -373,16 +377,20 @@ void tst_QDecompressHelper::decompressBigData()
    const qint64 third = file.bytesAvailable() / 3;

    QDecompressHelper helper;
+    QFETCH(bool, countAhead);
+    helper.setCountingBytesEnabled(countAhead);
    helper.setDecompressedSafetyCheckThreshold(-1);
    QFETCH(QByteArray, encoding);
    helper.setEncoding(encoding);

-    QByteArray output(32 * 1024, Qt::Uninitialized);
+    // The size of 'output' should be at least QDecompressHelper::MaxDecompressedDataBufferSize + 1
+    QByteArray output(10 * 1024 * 1024 + 1, Qt::Uninitialized);
    qint64 totalSize = 0;
    while (!file.atEnd()) {
        helper.feed(file.read(third));
        while (helper.hasData()) {
            qsizetype bytesRead = helper.read(output.data(), output.size());
+            QVERIFY(bytesRead >= 0);
            QVERIFY(bytesRead <= output.size());
            totalSize += bytesRead;
            const auto isZero = [](char c) { return c == '\0'; };
@ -449,10 +457,10 @@ void tst_QDecompressHelper::bigZlib()
    helper.feed(compressedData.mid(firstHalf.size()));

    // We need the whole thing in one go... which is why this test is not available for 32-bit
-    qint64 expected = 5ll * 1024ll * 1024ll * 1024ll;
-    // This can be replaced with QByteArray after the qsizetype change is merged
-    std::unique_ptr<char[]> output(new char[expected]);
-    qsizetype size = helper.read(output.get(), expected);
+    const qint64 expected = 5ll * 1024ll * 1024ll * 1024ll;
+    // Request a few more byte than what is available, to verify exact size
+    QByteArray output(expected + 42, Qt::Uninitialized);
+    const qsizetype size = helper.read(output.data(), output.size());
    QCOMPARE(size, expected);
 #endif
 }