From 94efb1749d5b6fd4ce19d7299c1014999bea28cd Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 3 Feb 2018 23:54:10 -0800 Subject: [PATCH] faster decoding in 32-bits mode for long offsets (tentative) On my laptop: Before: ./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.6 MB/s , 400.7 MB/s 3#enwik8 : 100000000 -> 35643153 (2.806), 76.5 MB/s , 303.2 MB/s After: ./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.4 MB/s , 435.0 MB/s 3#enwik8 : 100000000 -> 35643153 (2.806), 76.2 MB/s , 338.1 MB/s Mileage vary, depending on file, and cpu type. But a generic rule is : x86 benefits less from "long-offset mode" than x64, maybe due to register pressure. On "entropy", long-mode is _never_ a win for x86. On my laptop though, it may, depending on file and compression level (enwik8 benefits more from "long-mode" than silesia). --- lib/decompress/zstd_decompress.c | 43 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 6cbce9be..1a1fb250 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -23,7 +23,7 @@ /*! * LEGACY_SUPPORT : -* if set to 1, ZSTD_decompress() can decode older formats (v0.1+) +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) */ #ifndef ZSTD_LEGACY_SUPPORT # define ZSTD_LEGACY_SUPPORT 0 @@ -235,8 +235,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) /*-************************************************************* -* Decompression section -***************************************************************/ + * Frame header decoding + ***************************************************************/ /*! ZSTD_isFrame() : * Tells if the content of `buffer` starts with a valid Frame Identifier. @@ -258,7 +258,7 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) /** ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. - * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. * @return : size of the Frame Header * or an error code, which can be tested with ZSTD_isError() */ static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) @@ -481,6 +481,10 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t he } +/*-************************************************************* + * Block decoding + ***************************************************************/ + /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, @@ -936,13 +940,14 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets) { - U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream); + BIT_reloadDStream(&seqState->DStream); if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ } else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } } @@ -955,7 +960,7 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset = temp; - } else { + } else { /* offset == 0 */ offset = seqState->prevOffset[0]; } } else { @@ -967,16 +972,16 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l } seq.matchLength = ML_base[mlCode] - + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) BIT_reloadDStream(&seqState->DStream); if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) BIT_reloadDStream(&seqState->DStream); - /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */ + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); seq.litLength = LL_base[llCode] - + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); @@ -1364,13 +1369,13 @@ static size_t ZSTD_decompressSequencesLong( FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbfParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))); - /* windowSize could be any value at this point, since it is only validated - * in the streaming API. - */ DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); @@ -1429,7 +1430,9 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += litCSize; srcSize -= litCSize; } - if (frame && dctx->fParams.windowSize > (1<<23)) + if ( frame /* windowSize exists */ + && (dctx->fParams.windowSize > (1<<24)) + && MEM_64bits() /* x86 benefits less from long mode than x64 */ ) return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, isLongOffset); return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, isLongOffset); }