Renamed the PREFETCH() macro to PREFETCH_L2(),

which is a more accurate name
2018-11-12 17:05:32 -08:00 · 2018-11-12 17:05:32 -08:00 · 626040ab53
commit 626040ab53
parent 7b0c551bff
2 changed files with 23 additions and 16 deletions
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -92,18 +92,18 @@
 * can be disabled, by declaring NO_PREFETCH build macro */
 #if defined(NO_PREFETCH)
 #  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
-#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
+#  define PREFETCH_L2(ptr)  (void)(ptr)  /* disabled */
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
 #    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-#    define PREFETCH(ptr)     _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
 #    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-#    define PREFETCH(ptr)     __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #  else
 #    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
-#    define PREFETCH(ptr)    (void)(ptr)  /* disabled */
+#    define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */

@@ -114,7 +114,7 @@
    size_t const _size = (size_t)(s);     \
    size_t _pos;                          \
    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
-        PREFETCH(_ptr + _pos);            \
+        PREFETCH_L2(_ptr + _pos);         \
    }                                     \
 }

--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -63,12 +63,13 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
 static void
 ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
                 U32 current, const BYTE* inputEnd,
-                 U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode)
+                 U32 nbCompares, U32 btLow,
+                 const ZSTD_dictMode_e dictMode)
 {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    U32*   const bt = ms->chainTable;
-    U32    const btLog  = cParams->chainLog - 1;
-    U32    const btMask = (1 << btLog) - 1;
+    U32* const bt = ms->chainTable;
+    U32  const btLog  = cParams->chainLog - 1;
+    U32  const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
@@ -80,7 +81,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
    const BYTE* match;
    U32* smallerPtr = bt + 2*(current&btMask);
    U32* largerPtr  = smallerPtr + 1;
-    U32 matchIndex = *smallerPtr;
+    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowLow = ms->window.lowLimit;

@@ -93,6 +94,9 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < current);
+        /* note : all candidates are now supposed sorted,
+         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
+         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
@@ -108,7 +112,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
-                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
@@ -258,7 +262,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
-        *unsortedMark = previousCandidate;
+        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
@@ -266,11 +270,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
        nbCandidates --;
    }

+    /* nullify last candidate if it's still unsorted
+     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
-        *nextCandidate = *unsortedMark = 0;   /* nullify next candidate if it's still unsorted (note : simplification, detrimental to compression ratio, beneficial for speed) */
+        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
@@ -285,14 +291,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
    }

    /* find longest match */
-    {   size_t commonLengthSmaller=0, commonLengthLarger=0;
+    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(current&btMask);
        U32* largerPtr  = bt + 2*(current&btMask) + 1;
-        U32 matchEndIdx = current+8+1;
+        U32 matchEndIdx = current + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

@@ -433,7 +439,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
 /* *********************************
 *  Hash Chain
 ***********************************/
-#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & mask]
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]

 /* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
@@ -497,6 +503,7 @@ size_t ZSTD_HcFindBestMatch_generic (
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
+            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            if (match[ml] == ip[ml])   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {