Improve long-range decoder speed
On enwik9 at level 22 (which is almost a worst-case scenario), decoding speed improves by 7% on my laptop (415 -> 445 MB/s).
This commit is contained in:
parent
8bed4012bd
commit
9126da5b5c
@ -89,23 +89,21 @@
|
||||
#endif
|
||||
|
||||
/* prefetch
|
||||
* can be disabled, by declaring NO_PREFETCH macro
|
||||
* All prefetch invocations use a single default locality 2,
|
||||
* generating instruction prefetcht1,
|
||||
* which, according to Intel, means "load data into L2 cache".
|
||||
* This is a good enough "middle ground" for the time being,
|
||||
* though in theory, it would be better to specialize locality depending on data being prefetched.
|
||||
* Tests could not determine any sensible difference based on locality value. */
|
||||
* can be disabled by declaring the NO_PREFETCH build macro */
|
||||
#if defined(NO_PREFETCH)
|
||||
# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
|
||||
# define PREFETCH(ptr) (void)(ptr) /* disabled */
|
||||
#else
|
||||
# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
|
||||
# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
|
||||
# define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
|
||||
# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
|
||||
# define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
|
||||
# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
|
||||
# define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
|
||||
# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
|
||||
# define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
|
||||
# else
|
||||
# define PREFETCH(ptr) (void)(ptr) /* disabled */
|
||||
# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
|
||||
# define PREFETCH(ptr) (void)(ptr) /* disabled */
|
||||
# endif
|
||||
#endif /* NO_PREFETCH */
|
||||
|
||||
|
@ -1054,7 +1054,7 @@ ZSTD_decompressSequencesLong_body(
|
||||
seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
|
||||
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
|
||||
if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
|
||||
PREFETCH(sequence.match); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
|
||||
PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
|
||||
sequences[seqNb & STORED_SEQS_MASK] = sequence;
|
||||
op += oneSeqSize;
|
||||
}
|
||||
|
@ -601,7 +601,11 @@ BMK_benchMemAdvancedNoAlloc(
|
||||
cPtr += cCapacities[nbBlocks];
|
||||
resPtr += thisBlockSize;
|
||||
remaining -= thisBlockSize;
|
||||
if (BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; }
|
||||
if (BMK_decodeOnly) {
|
||||
assert(nbBlocks==0);
|
||||
cSizes[nbBlocks] = thisBlockSize;
|
||||
benchResult.cSize = thisBlockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user