added conditional prefetch

depending on amount of work to do.
2018-09-12 10:29:47 -07:00 · 2018-09-12 10:29:47 -07:00 · 4de344d505
commit 4de344d505
parent 63a519dbf6
2 changed files with 35 additions and 30 deletions
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@ -95,19 +95,21 @@
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#    define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define PREFETCH(ptr)   __builtin_prefetch(ptr, 0 /* rw==read */, 0 /* locality */)
+#    define PREFETCH(ptr)   __builtin_prefetch(ptr, 0 /* rw==read */, 2 /* locality */)
 #  else
 #    define PREFETCH(ptr)   /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */

-#define PREFETCH_AREA(ptr, size)  {   \
-    size_t pos;                       \
-    for (pos=0; pos<size; pos++) {    \
-        PREFETCH( (const char*)(const void*)ptr + pos); \
-    }                                 \
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(ptr, size)  {    \
+    size_t pos;                        \
+    for (pos=0; pos<size; pos+=CACHELINE_SIZE) { \
+        PREFETCH( (const char*)ptr + pos); \
+    }                                  \
 }

 /* disable warnings */
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@ -578,13 +578,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
        {
        case set_repeat:
            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
-
-            /* prefetch huffman table if cold */
-            if (dctx->ddictIsCold) {
-                PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
-            }
-
            /* fall-through */
+
        case set_compressed:
            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
            {   size_t lhSize, litSize, litCSize;
@ -616,6 +611,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);

+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 256 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
                if (HUF_isError((litEncType==set_repeat) ?
                                    ( singleStream ?
                                        HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
@ -897,7 +897,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U32* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
-                                 int ddictIsCold)
+                                 int ddictIsCold, int nbSeq)
 {
    switch(type)
    {
@ -917,7 +917,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
    case set_repeat:
        if (!flagRepeatTable) return ERROR(corruption_detected);
        /* prefetch FSE table if used */
-        if (ddictIsCold) {
+        if (ddictIsCold && (nbSeq > 16 /* heuristic */)) {
+        //if (ddictIsCold) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
@ -974,25 +975,27 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
    const BYTE* const istart = (const BYTE* const)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
+    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);

    /* SeqHead */
-    {   int nbSeq = *ip++;
-        if (!nbSeq) { *nbSeqPtr=0; return 1; }
-        if (nbSeq > 0x7F) {
-            if (nbSeq == 0xFF) {
-                if (ip+2 > iend) return ERROR(srcSize_wrong);
-                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
-            } else {
-                if (ip >= iend) return ERROR(srcSize_wrong);
-                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
-            }
+    nbSeq = *ip++;
+    if (!nbSeq) { *nbSeqPtr=0; return 1; }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+        } else {
+            if (ip >= iend) return ERROR(srcSize_wrong);
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
-        *nbSeqPtr = nbSeq;
    }
+    *nbSeqPtr = nbSeq;
+    DEBUGLOG(2, "nbSeqs=%i", nbSeq);
+

    /* FSE table descriptors */
    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
@ -1007,7 +1010,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
-                                                      dctx->ddictIsCold);
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
            ip += llhSize;
        }
@ -1017,7 +1020,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
-                                                      dctx->ddictIsCold);
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
            ip += ofhSize;
        }
@ -1027,7 +1030,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
-                                                      dctx->ddictIsCold);
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
            ip += mlhSize;
        }
@ -2395,7 +2398,7 @@ size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
            /* prefetch dictionary content */
            if (dctx->ddictIsCold) {
                size_t const dictSize = ddict->dictSize;
-                size_t const pSize = MIN(dictSize, 32 KB);   /* proposed heuristic : 8 x frameContentSize => need to know frameContentSize */
+                size_t const pSize = MIN(dictSize, 2 KB);   /* very conservative; would need to know Nb of Copies in dictionary, or frameContentSize as a proxy */
                const void* const pStart = (const char*)ddict->dictContent + dictSize - pSize;
                PREFETCH_AREA(pStart, pSize);
            }