improve long-range decoder speed

On enwik9 at level 22 (which is almost a worst-case scenario),
decoding speed improves by about 7% on my laptop (415 -> 445 MB/s).
This commit is contained in:
Yann Collet 2018-11-08 12:47:46 -08:00
parent 8bed4012bd
commit 9126da5b5c
3 changed files with 14 additions and 12 deletions

View File

@ -89,23 +89,21 @@
#endif
/* prefetch
* can be disabled, by declaring NO_PREFETCH macro
* All prefetch invocations use a single default locality 2,
* generating instruction prefetcht1,
* which, according to Intel, means "load data into L2 cache".
* This is a good enough "middle ground" for the time being,
* though in theory, it would be better to specialize locality depending on data being prefetched.
* Tests could not determine any sensible difference based on locality value. */
* can be disabled, by declaring NO_PREFETCH build macro */
/* PREFETCH_L1(ptr), PREFETCH(ptr) :
 * read-only prefetch hints for the memory at address `ptr`.
 * - PREFETCH_L1() : _MM_HINT_T0 on MSVC, locality 3 on GCC-compatible compilers
 *                   (highest temporal locality; intended to reach L1, as the name suggests).
 * - PREFETCH()    : _MM_HINT_T1 on MSVC, locality 2 on GCC-compatible compilers
 *                   (instruction prefetcht1, i.e. "load data into L2 cache").
 * When NO_PREFETCH is defined, or when the compiler is not recognized,
 * both macros reduce to `(void)(ptr)` : the argument is still evaluated, nothing is prefetched. */
#if defined(NO_PREFETCH)
#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
#else
   /* _mm_prefetch() is not defined outside of x86/x64.
    * Note : MSVC signals a 32-bit x86 target with _M_IX86 (_M_I86 is only set by 16-bit compilers),
    * so _M_IX86 is the macro to test, otherwise 32-bit builds silently lose prefetching. */
#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#    define PREFETCH(ptr)     _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#    define PREFETCH(ptr)     __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
#  else
#    define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
#    define PREFETCH(ptr)     (void)(ptr)  /* disabled */
#  endif
#endif  /* NO_PREFETCH */

View File

@ -1054,7 +1054,7 @@ ZSTD_decompressSequencesLong_body(
seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
PREFETCH(sequence.match); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
sequences[seqNb & STORED_SEQS_MASK] = sequence;
op += oneSeqSize;
}

View File

@ -601,7 +601,11 @@ BMK_benchMemAdvancedNoAlloc(
cPtr += cCapacities[nbBlocks];
resPtr += thisBlockSize;
remaining -= thisBlockSize;
if (BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; }
if (BMK_decodeOnly) {
assert(nbBlocks==0);
cSizes[nbBlocks] = thisBlockSize;
benchResult.cSize = thisBlockSize;
}
}
}
}