QDecompressHelper: Skip double-decompression if download is small

To retain backwards compatibility with some QNetworkReply usage,
namely connecting solely to finished-signal and allocating a buffer to
read into, but without storing the entire decompressed file in memory
until read, we may decompress the file twice.

With this patch users can now avoid this double decompression if the
amount of buffered data stays below 10 MiB. This means any file smaller
than 10 MiB will never need to be decompressed twice to know the size of
it. On top of that, if the data is handled as it arrives (e.g. in
readyRead) and the buffer is kept below 10 MiB it won't need to
decompress twice either.

This is active as long as "countDecompressed" is true, though it
currently always is in QNetworkAccessManger, with a future goal to make
it possible to control with public API. Since it requires the user to
potentially adapt their usage of QNetworkReply.

In this patch we also stop tracking the amount of unhandled uncompressed
bytes (uncompressedBytes) in favor of tracking the total amount of bytes
which has been read() by the user of QDecompressHelper (totalBytesRead),
since we can more intuitively work out the total amount of unread bytes
using this value.

Change-Id: Ie3d8d6e39a18343fcf9b610f45c7fe7e4cd4e474
Reviewed-by: Timur Pocheptsov <timur.pocheptsov@qt.io>
This commit is contained in:
Mårten Nordheim 2020-07-31 17:18:17 +02:00
parent 5889985c8c
commit b41d5f6293
3 changed files with 99 additions and 27 deletions

View File

@ -237,7 +237,13 @@ void QDecompressHelper::setCountingBytesEnabled(bool shouldCount)
qint64 QDecompressHelper::uncompressedSize() const
{
Q_ASSERT(countDecompressed);
return uncompressedBytes;
// Use the 'totalUncompressedBytes' from the countHelper if it exceeds the amount of bytes
// that we know about.
auto totalUncompressed =
countHelper && countHelper->totalUncompressedBytes > totalUncompressedBytes
? countHelper->totalUncompressedBytes
: totalUncompressedBytes;
return totalUncompressed - totalBytesRead;
}
/*!
@ -262,10 +268,9 @@ void QDecompressHelper::feed(QByteArray &&data)
{
Q_ASSERT(contentEncoding != None);
totalCompressedBytes += data.size();
if (!countInternal(data))
clear(); // If our counting brother failed then so will we :|
else
compressedDataBuffer.append(std::move(data));
if (!countInternal(compressedDataBuffer[compressedDataBuffer.bufferCount() - 1]))
clear(); // If our counting brother failed then so will we :|
}
/*!
@ -276,10 +281,9 @@ void QDecompressHelper::feed(const QByteDataBuffer &buffer)
{
Q_ASSERT(contentEncoding != None);
totalCompressedBytes += buffer.byteAmount();
compressedDataBuffer.append(buffer);
if (!countInternal(buffer))
clear(); // If our counting brother failed then so will we :|
else
compressedDataBuffer.append(buffer);
}
/*!
@ -290,10 +294,10 @@ void QDecompressHelper::feed(QByteDataBuffer &&buffer)
{
Q_ASSERT(contentEncoding != None);
totalCompressedBytes += buffer.byteAmount();
if (!countInternal(buffer))
clear(); // If our counting brother failed then so will we :|
else
const QByteDataBuffer copy(buffer);
compressedDataBuffer.append(std::move(buffer));
if (!countInternal(copy))
clear(); // If our counting brother failed then so will we :|
}
/*!
@ -303,19 +307,34 @@ void QDecompressHelper::feed(QByteDataBuffer &&buffer)
This lets us know the final size, unfortunately at the cost of
increased computation.
Potential @future improvement:
Decompress XX MiB/KiB before starting the count.
For smaller files the extra decompression can then be avoided.
To save on some of the computation we will store the data until
we reach \c MaxDecompressedDataBufferSize stored. In this case the
"penalty" is completely removed from users who read the data on
readyRead rather than waiting for it all to be received. And
any file smaller than \c MaxDecompressedDataBufferSize will
avoid this issue as well.
*/
bool QDecompressHelper::countInternal()
{
Q_ASSERT(countDecompressed);
while (hasDataInternal()
&& decompressedDataBuffer.byteAmount() < MaxDecompressedDataBufferSize) {
const qsizetype toRead = 256 * 1024;
QByteArray buffer(toRead, Qt::Uninitialized);
qsizetype bytesRead = readInternal(buffer.data(), buffer.size());
if (bytesRead == -1)
return false;
buffer.truncate(bytesRead);
decompressedDataBuffer.append(std::move(buffer));
}
if (!hasDataInternal())
return true; // handled all the data so far, just return
while (countHelper->hasData()) {
std::array<char, 1024> temp;
qsizetype bytesRead = countHelper->read(temp.data(), temp.size());
if (bytesRead == -1)
return false;
uncompressedBytes += bytesRead;
}
return true;
}
@ -358,13 +377,45 @@ bool QDecompressHelper::countInternal(const QByteDataBuffer &buffer)
qsizetype QDecompressHelper::read(char *data, qsizetype maxSize)
{
if (maxSize <= 0)
return 0;
if (!isValid())
return -1;
qsizetype bytesRead = -1;
if (!hasData())
return 0;
qsizetype cachedRead = 0;
if (!decompressedDataBuffer.isEmpty()) {
cachedRead = decompressedDataBuffer.read(data, maxSize);
data += cachedRead;
maxSize -= cachedRead;
}
qsizetype bytesRead = readInternal(data, maxSize);
if (bytesRead == -1)
return -1;
totalBytesRead += bytesRead + cachedRead;
return bytesRead + cachedRead;
}
/*!
\internal
Like read() but without attempting to read the
cached/already-decompressed data.
*/
qsizetype QDecompressHelper::readInternal(char *data, qsizetype maxSize)
{
Q_ASSERT(isValid());
if (maxSize <= 0)
return 0;
if (!hasDataInternal())
return 0;
qsizetype bytesRead = -1;
switch (contentEncoding) {
case None:
Q_UNREACHABLE();
@ -382,8 +433,6 @@ qsizetype QDecompressHelper::read(char *data, qsizetype maxSize)
}
if (bytesRead == -1)
clear();
else if (countDecompressed)
uncompressedBytes -= bytesRead;
totalUncompressedBytes += bytesRead;
if (isPotentialArchiveBomb())
@ -449,6 +498,16 @@ bool QDecompressHelper::isPotentialArchiveBomb() const
read 0 bytes. This most likely means the decompression is done.
*/
bool QDecompressHelper::hasData() const
{
return hasDataInternal() || !decompressedDataBuffer.isEmpty();
}
/*!
\internal
Like hasData() but internally the buffer of decompressed data is
not interesting.
*/
bool QDecompressHelper::hasDataInternal() const
{
return encodedBytesAvailable() || decoderHasData;
}
@ -497,11 +556,12 @@ void QDecompressHelper::clear()
contentEncoding = None;
compressedDataBuffer.clear();
decompressedDataBuffer.clear();
decoderHasData = false;
countDecompressed = false;
countHelper.reset();
uncompressedBytes = 0;
totalBytesRead = 0;
totalUncompressedBytes = 0;
totalCompressedBytes = 0;
}

View File

@ -98,6 +98,8 @@ public:
private:
bool isPotentialArchiveBomb() const;
bool hasDataInternal() const;
qsizetype readInternal(char *data, qsizetype maxSize);
bool countInternal();
bool countInternal(const QByteArray &data);
@ -111,16 +113,18 @@ private:
qsizetype readZstandard(char *data, qsizetype maxSize);
QByteDataBuffer compressedDataBuffer;
QByteDataBuffer decompressedDataBuffer;
const qsizetype MaxDecompressedDataBufferSize = 10 * 1024 * 1024;
bool decoderHasData = false;
bool countDecompressed = false;
std::unique_ptr<QDecompressHelper> countHelper;
qint64 uncompressedBytes = 0;
// Used for calculating the ratio
qint64 archiveBombCheckThreshold = 10 * 1024 * 1024;
qint64 totalUncompressedBytes = 0;
qint64 totalCompressedBytes = 0;
qint64 totalBytesRead = 0;
ContentEncoding contentEncoding = None;

View File

@ -347,20 +347,24 @@ void tst_QDecompressHelper::decompressBigData_data()
QTest::addColumn<QByteArray>("encoding");
QTest::addColumn<QString>("path");
QTest::addColumn<qint64>("size");
QTest::addColumn<bool>("countAhead");
qint64 fourGiB = 4ll * 1024ll * 1024ll * 1024ll;
qint64 fiveGiB = 5ll * 1024ll * 1024ll * 1024ll;
QTest::newRow("gzip-4G") << QByteArray("gzip") << QString(":/4G.gz") << fourGiB;
// Only use countAhead on one of these since they share codepath anyway
QTest::newRow("gzip-counted-4G") << QByteArray("gzip") << QString(":/4G.gz") << fourGiB << true;
QTest::newRow("deflate-5G") << QByteArray("deflate") << QString(":/5GiB.txt.inflate")
<< fiveGiB;
<< fiveGiB << false;
#if QT_CONFIG(brotli)
QTest::newRow("brotli-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB;
QTest::newRow("brotli-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB << false;
QTest::newRow("brotli-counted-4G") << QByteArray("br") << (srcDir + "/4G.br") << fourGiB << true;
#endif
#if QT_CONFIG(zstd)
QTest::newRow("zstandard-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB;
QTest::newRow("zstandard-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB << false;
QTest::newRow("zstandard-counted-4G") << QByteArray("zstd") << (":/4G.zst") << fourGiB << true;
#endif
}
@ -373,16 +377,20 @@ void tst_QDecompressHelper::decompressBigData()
const qint64 third = file.bytesAvailable() / 3;
QDecompressHelper helper;
QFETCH(bool, countAhead);
helper.setCountingBytesEnabled(countAhead);
helper.setDecompressedSafetyCheckThreshold(-1);
QFETCH(QByteArray, encoding);
helper.setEncoding(encoding);
QByteArray output(32 * 1024, Qt::Uninitialized);
// The size of 'output' should be at least QDecompressHelper::MaxDecompressedDataBufferSize + 1
QByteArray output(10 * 1024 * 1024 + 1, Qt::Uninitialized);
qint64 totalSize = 0;
while (!file.atEnd()) {
helper.feed(file.read(third));
while (helper.hasData()) {
qsizetype bytesRead = helper.read(output.data(), output.size());
QVERIFY(bytesRead >= 0);
QVERIFY(bytesRead <= output.size());
totalSize += bytesRead;
const auto isZero = [](char c) { return c == '\0'; };
@ -449,10 +457,10 @@ void tst_QDecompressHelper::bigZlib()
helper.feed(compressedData.mid(firstHalf.size()));
// We need the whole thing in one go... which is why this test is not available for 32-bit
qint64 expected = 5ll * 1024ll * 1024ll * 1024ll;
// This can be replaced with QByteArray after the qsizetype change is merged
std::unique_ptr<char[]> output(new char[expected]);
qsizetype size = helper.read(output.get(), expected);
const qint64 expected = 5ll * 1024ll * 1024ll * 1024ll;
// Request a few more byte than what is available, to verify exact size
QByteArray output(expected + 42, Qt::Uninitialized);
const qsizetype size = helper.read(output.data(), output.size());
QCOMPARE(size, expected);
#endif
}