improve long-range decoder speed

On enwik9 compressed at level 22 (almost a worst-case scenario), decoding speed improves by about 7% on my laptop (415 -> 445 MB/s).
2018-11-08 12:47:46 -08:00 · 2018-11-08 12:47:46 -08:00 · 9126da5b5c
commit 9126da5b5c
parent 8bed4012bd
3 changed files with 14 additions and 12 deletions
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@ -89,23 +89,21 @@
 #endif

 /* prefetch
- * can be disabled, by declaring NO_PREFETCH macro
- * All prefetch invocations use a single default locality 2,
- * generating instruction prefetcht1,
- * which, according to Intel, means "load data into L2 cache".
- * This is a good enough "middle ground" for the time being,
- * though in theory, it would be better to specialize locality depending on data being prefetched.
- * Tests could not determine any sensible difference based on locality value. */
+ * can be disabled, by declaring NO_PREFETCH build macro */
 #if defined(NO_PREFETCH)
+#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
 #  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#    define PREFETCH(ptr)     _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#    define PREFETCH(ptr)     __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #  else
-#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
+#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
+#    define PREFETCH(ptr)    (void)(ptr)  /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */

--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@ -1054,7 +1054,7 @@ ZSTD_decompressSequencesLong_body(
            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
-            PREFETCH(sequence.match);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+            PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
            sequences[seqNb & STORED_SEQS_MASK] = sequence;
            op += oneSeqSize;
        }
--- a/programs/bench.c
+++ b/programs/bench.c
@ -601,7 +601,11 @@ BMK_benchMemAdvancedNoAlloc(
                cPtr += cCapacities[nbBlocks];
                resPtr += thisBlockSize;
                remaining -= thisBlockSize;
-                if (BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; }
+                if (BMK_decodeOnly) {
+                    assert(nbBlocks==0);
+                    cSizes[nbBlocks] = thisBlockSize;
+                    benchResult.cSize = thisBlockSize;
+                }
            }
        }
    }