fixed decompression-only benchmark

This commit is contained in:
Yann Collet 2018-11-08 12:36:39 -08:00
parent f6eb12084d
commit 8bed4012bd
2 changed files with 47 additions and 34 deletions

View File

@ -1026,17 +1026,18 @@ ZSTD_decompressSequencesLong_body(
/* Regen sequences */ /* Regen sequences */
if (nbSeq) { if (nbSeq) {
#define STORED_SEQS 4 #define STORED_SEQS 4
#define STOSEQ_MASK (STORED_SEQS-1) #define STORED_SEQS_MASK (STORED_SEQS-1)
#define ADVANCED_SEQS 4 #define ADVANCED_SEQS 4
seq_t sequences[STORED_SEQS]; seq_t sequences[STORED_SEQS];
int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
seqState_t seqState; seqState_t seqState;
int seqNb; int seqNb;
dctx->fseEntropy = 1; dctx->fseEntropy = 1;
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
seqState.prefixStart = prefixStart; seqState.prefixStart = prefixStart;
seqState.pos = (size_t)(op-prefixStart); seqState.pos = (size_t)(op-prefixStart);
seqState.dictEnd = dictEnd; seqState.dictEnd = dictEnd;
assert(iend > ip);
CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
@ -1051,10 +1052,10 @@ ZSTD_decompressSequencesLong_body(
/* decode and decompress */ /* decode and decompress */
for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset); seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize; if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
PREFETCH(sequence.match); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ PREFETCH(sequence.match); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
sequences[seqNb&STOSEQ_MASK] = sequence; sequences[seqNb & STORED_SEQS_MASK] = sequence;
op += oneSeqSize; op += oneSeqSize;
} }
if (seqNb<nbSeq) return ERROR(corruption_detected); if (seqNb<nbSeq) return ERROR(corruption_detected);
@ -1062,7 +1063,7 @@ ZSTD_decompressSequencesLong_body(
/* finish queue */ /* finish queue */
seqNb -= seqAdvance; seqNb -= seqAdvance;
for ( ; seqNb<nbSeq ; seqNb++) { for ( ; seqNb<nbSeq ; seqNb++) {
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize; if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
op += oneSeqSize; op += oneSeqSize;
} }
@ -1070,7 +1071,7 @@ ZSTD_decompressSequencesLong_body(
/* save reps for next block */ /* save reps for next block */
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
#undef STORED_SEQS #undef STORED_SEQS
#undef STOSEQ_MASK #undef STORED_SEQS_MASK
#undef ADVANCED_SEQS #undef ADVANCED_SEQS
} }
@ -1118,9 +1119,10 @@ ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
#endif #endif
typedef size_t (*ZSTD_decompressSequences_t)( typedef size_t (*ZSTD_decompressSequences_t)(
ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, ZSTD_DCtx* dctx,
const void *seqStart, size_t seqSize, int nbSeq, void* dst, size_t maxDstSize,
const ZSTD_longOffset_e isLongOffset); const void* seqStart, size_t seqSize, int nbSeq,
const ZSTD_longOffset_e isLongOffset);
static size_t static size_t
ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
@ -1136,10 +1138,17 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
} }
static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
void* dst, size_t maxDstSize, /* ZSTD_decompressSequencesLong() :
const void* seqStart, size_t seqSize, int nbSeq, * decompression function triggered when a minimum share of offsets is considered "long",
const ZSTD_longOffset_e isLongOffset) * aka out of cache.
* note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
* This function will try to mitigate main memory latency through the use of prefetching */
static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
void* dst, size_t maxDstSize,
const void* seqStart, size_t seqSize, int nbSeq,
const ZSTD_longOffset_e isLongOffset)
{ {
DEBUGLOG(5, "ZSTD_decompressSequencesLong"); DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2 #if DYNAMIC_BMI2
@ -1150,6 +1159,8 @@ static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
} }
/* ZSTD_getLongOffsetsShare() : /* ZSTD_getLongOffsetsShare() :
* condition : offTable must be valid * condition : offTable must be valid
* @return : "share" of long offsets (arbitrarily defined as > (1<<23)) * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
@ -1188,7 +1199,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
* In block mode, window size is not known, so we have to be conservative. * In block mode, window size is not known, so we have to be conservative.
* (note: but it could be evaluated from current-lowLimit) * (note: but it could be evaluated from current-lowLimit)
*/ */
ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))); ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
@ -1208,7 +1219,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
ip += seqHSize; ip += seqHSize;
srcSize -= seqHSize; srcSize -= seqHSize;
if ( (!frame || dctx->fParams.windowSize > (1<<24)) if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
&& (nbSeq>0) ) { /* could probably use a larger nbSeq limit */ && (nbSeq>0) ) { /* could probably use a larger nbSeq limit */
U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */

View File

@ -522,22 +522,24 @@ static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(BMK_benchResult_t resu
/* benchMem with no allocation */ /* benchMem with no allocation */
static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc( static BMK_benchOutcome_t
const void** srcPtrs, size_t* srcSizes, BMK_benchMemAdvancedNoAlloc(
void** cPtrs, size_t* cCapacities, size_t* cSizes, const void** srcPtrs, size_t* srcSizes,
void** resPtrs, size_t* resSizes, void** cPtrs, size_t* cCapacities, size_t* cSizes,
void** resultBufferPtr, void* compressedBuffer, void** resPtrs, size_t* resSizes,
size_t maxCompressedSize, void** resultBufferPtr, void* compressedBuffer,
BMK_timedFnState_t* timeStateCompress, size_t maxCompressedSize,
BMK_timedFnState_t* timeStateDecompress, BMK_timedFnState_t* timeStateCompress,
BMK_timedFnState_t* timeStateDecompress,
const void* srcBuffer, size_t srcSize, const void* srcBuffer, size_t srcSize,
const size_t* fileSizes, unsigned nbFiles, const size_t* fileSizes, unsigned nbFiles,
const int cLevel, const ZSTD_compressionParameters* comprParams, const int cLevel,
const void* dictBuffer, size_t dictBufferSize, const ZSTD_compressionParameters* comprParams,
ZSTD_CCtx* cctx, ZSTD_DCtx* dctx, const void* dictBuffer, size_t dictBufferSize,
int displayLevel, const char* displayName, ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
const BMK_advancedParams_t* adv) int displayLevel, const char* displayName,
const BMK_advancedParams_t* adv)
{ {
size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize); /* avoid div by 0 */ size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize); /* avoid div by 0 */
BMK_benchResult_t benchResult; BMK_benchResult_t benchResult;
@ -599,6 +601,7 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
cPtr += cCapacities[nbBlocks]; cPtr += cCapacities[nbBlocks];
resPtr += thisBlockSize; resPtr += thisBlockSize;
remaining -= thisBlockSize; remaining -= thisBlockSize;
/* Decode-only mode : record the block size as the compressed size (a single block is expected, see assert).
 * Note : the active mode must be tested ; `BMK_decodeOnly` alone is an enum constant, not a condition on `adv->mode`. */
if (adv->mode == BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; }
} }
} }
} }
@ -633,7 +636,6 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize); DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize);
while (!(compressionCompleted && decompressionCompleted)) { while (!(compressionCompleted && decompressionCompleted)) {
if (!compressionCompleted) { if (!compressionCompleted) {
BMK_runOutcome_t const cOutcome = BMK_runOutcome_t const cOutcome =
BMK_benchTimedFn( timeStateCompress, BMK_benchTimedFn( timeStateCompress,
@ -659,7 +661,6 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
} } } }
{ int const ratioAccuracy = (ratio < 10.) ? 3 : 2; { int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
markNb = (markNb+1) % NB_MARKS;
DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r", DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
marks[markNb], displayName, marks[markNb], displayName,
(U32)srcSize, (U32)cSize, (U32)srcSize, (U32)cSize,
@ -690,7 +691,6 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
} }
{ int const ratioAccuracy = (ratio < 10.) ? 3 : 2; { int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
markNb = (markNb+1) % NB_MARKS;
DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r", DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
marks[markNb], displayName, marks[markNb], displayName,
(U32)srcSize, (U32)benchResult.cSize, (U32)srcSize, (U32)benchResult.cSize,
@ -700,6 +700,7 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
} }
decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress); decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
} }
markNb = (markNb+1) % NB_MARKS;
} /* while (!(compressionCompleted && decompressionCompleted)) */ } /* while (!(compressionCompleted && decompressionCompleted)) */
/* CRC Checking */ /* CRC Checking */
@ -707,7 +708,8 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
U64 const crcCheck = XXH64(resultBuffer, srcSize, 0); U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) { if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) {
size_t u; size_t u;
DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x \n", displayName, (unsigned)crcOrig, (unsigned)crcCheck); DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x \n",
displayName, (unsigned)crcOrig, (unsigned)crcCheck);
for (u=0; u<srcSize; u++) { for (u=0; u<srcSize; u++) {
if (((const BYTE*)srcBuffer)[u] != resultBuffer[u]) { if (((const BYTE*)srcBuffer)[u] != resultBuffer[u]) {
U32 segNb, bNb, pos; U32 segNb, bNb, pos;