From cc1522351f37e4a5b7a69ed05388003cd5c28ff9 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 18 Jul 2017 11:21:19 -0700 Subject: [PATCH] [libzstd] Fix bug in Huffman encoding Summary: Huffman encoding with a bad dictionary can encode worse than the HUF_BLOCKBOUND(srcSize), since we don't filter out incompressible input, and even if we did, the dictionaries Huffman table could be ill suited to compressing actual data. The fast optimization doesn't seem to improve compression speed, even when I hard coded fast = 1, the speed didn't improve over hard coding it to 0. Benchmarks: $ ./zstd.dev -b1e5 Benchmarking levels from 1 to 5 1#Synthetic 50% : 10000000 -> 3139163 (3.186), 524.8 MB/s ,1890.0 MB/s 2#Synthetic 50% : 10000000 -> 3115138 (3.210), 372.6 MB/s ,1830.2 MB/s 3#Synthetic 50% : 10000000 -> 3222672 (3.103), 223.3 MB/s ,1400.2 MB/s 4#Synthetic 50% : 10000000 -> 3276678 (3.052), 198.0 MB/s ,1280.1 MB/s 5#Synthetic 50% : 10000000 -> 3271570 (3.057), 107.8 MB/s ,1200.0 MB/s $ ./zstd -b1e5 Benchmarking levels from 1 to 5 1#Synthetic 50% : 10000000 -> 3139163 (3.186), 524.8 MB/s ,1870.2 MB/s 2#Synthetic 50% : 10000000 -> 3115138 (3.210), 370.0 MB/s ,1810.3 MB/s 3#Synthetic 50% : 10000000 -> 3222672 (3.103), 223.3 MB/s ,1380.1 MB/s 4#Synthetic 50% : 10000000 -> 3276678 (3.052), 196.1 MB/s ,1270.0 MB/s 5#Synthetic 50% : 10000000 -> 3271570 (3.057), 106.8 MB/s ,1180.1 MB/s $ ./zstd.dev -b1e5 ../silesia.tar Benchmarking levels from 1 to 5 1#silesia.tar : 211988480 -> 73651685 (2.878), 429.7 MB/s ,1096.5 MB/s 2#silesia.tar : 211988480 -> 70158785 (3.022), 321.2 MB/s ,1029.1 MB/s 3#silesia.tar : 211988480 -> 66993813 (3.164), 243.7 MB/s , 981.4 MB/s 4#silesia.tar : 211988480 -> 66306481 (3.197), 226.7 MB/s , 972.4 MB/s 5#silesia.tar : 211988480 -> 64757852 (3.274), 150.3 MB/s , 963.6 MB/s $ ./zstd -b1e5 ../silesia.tar Benchmarking levels from 1 to 5 1#silesia.tar : 211988480 -> 73651685 (2.878), 429.7 MB/s ,1087.1 MB/s 2#silesia.tar : 211988480 -> 70158785 (3.022), 318.8 MB/s ,1029.1 MB/s 3#silesia.tar : 211988480 -> 66993813 (3.164), 246.5 MB/s , 981.4 MB/s 4#silesia.tar : 211988480 -> 66306481 (3.197), 229.2 MB/s , 972.4 MB/s 5#silesia.tar : 211988480 -> 64757852 (3.274), 149.3 MB/s , 963.6 MB/s Test Plan: I added a test case to the fuzzer which crashed with ASAN before the patch and succeeded after. --- lib/compress/huf_compress.c | 3 +-- tests/fuzzer.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 7af0789a..beb4fdb6 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -436,7 +436,7 @@ static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } -#define HUF_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s)) +#define HUF_FLUSHBITS(s) BIT_flushBits(s) #define HUF_FLUSHBITS_1(stream) \ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) @@ -451,7 +451,6 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si BYTE* const oend = ostart + dstSize; BYTE* op = ostart; size_t n; - const unsigned fast = (dstSize >= HUF_BLOCKBOUND(srcSize)); BIT_CStream_t bitC; /* init */ diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 904ce6fd..3099f346 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -1003,6 +1003,42 @@ static int basicUnitTests(U32 seed, double compressibility) if (r != _3BYTESTESTLENGTH) goto _output_error; } DISPLAYLEVEL(4, "OK \n"); + DISPLAYLEVEL(4, "test%3i : incompressible data and ill suited dictionary : ", testNb++); + RDG_genBuffer(CNBuffer, CNBuffSize, 0.0, 0.1, seed); + { /* Train a dictionary on low characters */ + size_t dictSize = 16 KB; + void* const dictBuffer = malloc(dictSize); + size_t const totalSampleSize = 1 MB; + size_t const sampleUnitSize = 8 KB; + U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize); + size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t)); + if (!dictBuffer || !samplesSizes) goto _output_error; + { U32 u; for (u=0; u