Merge pull request #2295 from felixhandte/dedicated-dict-search-structure-chain
DDSS for Lazy: Implement a Dedicated Dictionary Chain Table
commit d903b552c8
lib/compress/zstd_compress.c

@@ -1647,13 +1647,13 @@ static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
 {
     size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
     int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch;
-    return ( dedicatedDictSearch
-          || pledgedSrcSize <= cutoff
-          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
-          || params->attachDictPref == ZSTD_dictForceAttach )
-        && params->attachDictPref != ZSTD_dictForceCopy
-        && !params->forceWindow; /* dictMatchState isn't correctly
-                                  * handled in _enforceMaxDist */
+    return dedicatedDictSearch
+        || ( ( pledgedSrcSize <= cutoff
+            || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+            || params->attachDictPref == ZSTD_dictForceAttach )
+          && params->attachDictPref != ZSTD_dictForceCopy
+          && !params->forceWindow ); /* dictMatchState isn't correctly
+                                      * handled in _enforceMaxDist */
 }
 
 static size_t
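The reparenthesization is the behavioral core of this hunk: a dedicated-dict-search CDict now attaches unconditionally, where previously ZSTD_dictForceCopy or forceWindow could still veto attachment even though DDSS tables cannot be copied into a CCtx. A minimal sketch of the two predicates (stand-in ints for illustration, not library code):

/* Illustration only; names mirror the diff but this is not library code. */
static int shouldAttach_old(int dds, int small, int forceAttach, int forceCopy, int forceWindow)
{   return (dds || small || forceAttach) && !forceCopy && !forceWindow; }

static int shouldAttach_new(int dds, int small, int forceAttach, int forceCopy, int forceWindow)
{   return dds || ((small || forceAttach) && !forceCopy && !forceWindow); }

/* e.g. dds=1, forceCopy=1: old yields 0 (would have to copy DDSS tables),
 * new yields 1 (the dictionary is always attached). */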
@@ -2914,10 +2914,12 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
-        if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch)
+        if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) {
+            assert(chunk == remaining); /* must load everything in one go */
             ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE);
-        else if (chunk >= HASH_READ_SIZE)
+        } else if (chunk >= HASH_READ_SIZE) {
             ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+        }
         break;
 
     case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
@@ -3416,6 +3418,9 @@ static size_t ZSTD_initCDict_internal(
     assert(!ZSTD_checkCParams(cParams));
     cdict->matchState.cParams = cParams;
     cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch;
+    if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) {
+        cdict->matchState.dedicatedDictSearch = 0;
+    }
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
         cdict->dictContent = dictBuffer;
     } else {
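This guard is what makes the assert(chunk == remaining) in ZSTD_loadDictionaryContent above safe: the loader splits input into chunks of at most ZSTD_CHUNKSIZE_MAX bytes, and DDSS must see the whole dictionary in one such chunk. A sketch of the resulting fallback rule (ZSTD_CHUNKSIZE_MAX is internal to the library, so it is passed in here as a stand-in parameter):

#include <stddef.h>

/* Illustrative helper, not library code: whether a DDSS request survives
 * CDict construction. Oversized dictionaries silently fall back to the
 * regular search structure. */
static int ddsWillBeUsed(int requestedDDS, size_t dictSize, size_t chunkSizeMax)
{
    return requestedDDS && dictSize <= chunkSizeMax;
}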
lib/compress/zstd_lazy.c

@@ -478,23 +478,114 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
 {
-    U32 const target = (U32)(ip - ms->window.base);
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32* const hashTable = ms->hashTable;
     U32* const chainTable = ms->chainTable;
-    U32 const chainMask = (1 << ms->cParams.chainLog) - 1;
+    U32 const chainSize = 1 << ms->cParams.chainLog;
     U32 idx = ms->nextToUpdate;
-    U32 bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const minChain = chainSize < target ? target - chainSize : idx;
+    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const cacheSize = bucketSize - 1;
+    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+    /* We know the hashtable is oversized by a factor of `bucketSize`.
+     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+     * single entry. We will use the rest of the space to construct a temporary
+     * chaintable.
+     */
+    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32* const tmpHashTable = hashTable;
+    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+
+    U32 hashIdx;
+
+    assert(ms->cParams.chainLog <= 24);
+    assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+    assert(idx != 0);
+    assert(tmpMinChain <= minChain);
+
+    /* fill conventional hash table and conventional chain table */
+    for ( ; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+        if (idx >= tmpMinChain) {
+            tmpChainTable[idx - tmpMinChain] = hashTable[h];
+        }
+        tmpHashTable[h] = idx;
+    }
+
+    /* sort chains into ddss chain table */
+    {
+        U32 chainPos = 0;
+        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+            U32 count;
+            U32 countBeyondMinChain = 0;
+            U32 i = tmpHashTable[hashIdx];
+            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+                /* skip through the chain to the first position that won't be
+                 * in the hash cache bucket */
+                if (i < minChain) {
+                    countBeyondMinChain++;
+                }
+                i = tmpChainTable[i - tmpMinChain];
+            }
+            if (count == cacheSize) {
+                for (count = 0; count < chainLimit;) {
+                    if (i < minChain) {
+                        if (!i || countBeyondMinChain++ > cacheSize) {
+                            /* only allow pulling `cacheSize` number of entries
+                             * into the cache or chainTable beyond `minChain`,
+                             * to replace the entries pulled out of the
+                             * chainTable into the cache. This lets us reach
+                             * back further without increasing the total number
+                             * of entries in the chainTable, guaranteeing the
+                             * DDSS chain table will fit into the space
+                             * allocated for the regular one. */
+                            break;
+                        }
+                    }
+                    chainTable[chainPos++] = i;
+                    count++;
+                    if (i < tmpMinChain) {
+                        break;
+                    }
+                    i = tmpChainTable[i - tmpMinChain];
+                }
+            } else {
+                count = 0;
+            }
+            if (count) {
+                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+            } else {
+                tmpHashTable[hashIdx] = 0;
+            }
+        }
+        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
+    }
+
+    /* move chain pointers into the last entry of each hash bucket */
+    for (hashIdx = (1 << hashLog); hashIdx; ) {
+        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 const chainPackedPointer = tmpHashTable[hashIdx];
+        U32 i;
+        for (i = 0; i < cacheSize; i++) {
+            hashTable[bucketIdx + i] = 0;
+        }
+        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+    }
 
+    /* fill the buckets of the hash table */
     for (idx = ms->nextToUpdate; idx < target; idx++) {
-        size_t const h = ZSTD_hashPtr(
-            ms->window.base + idx,
-            ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG,
-            ms->cParams.minMatch) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
         U32 i;
         /* Shift hash cache down 1. */
-        for (i = bucketSize - 1; i; i--)
-            ms->hashTable[h + i] = ms->hashTable[h + i - 1];
-        /* Insert new position. */
-        chainTable[idx & chainMask] = ms->hashTable[h];
-        ms->hashTable[h] = idx;
+        for (i = cacheSize - 1; i; i--)
+            hashTable[h + i] = hashTable[h + i - 1];
+        hashTable[h] = idx;
     }
 
     ms->nextToUpdate = target;
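The last slot of each hash bucket now holds a packed chain pointer: the upper 24 bits index the first entry of that bucket's contiguous segment in the chain table, and the low 8 bits hold the segment's length. That is why the function asserts chainLog <= 24 and clamps chainLimit to 255. A sketch of the format (helper names are illustrative, not part of the library):

#include <assert.h>

typedef unsigned int U32;  /* stand-in for zstd's mem.h typedef */

/* written during construction: tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count */
static U32 ddss_packChain(U32 chainStart, U32 chainLength)
{
    assert(chainStart < (1u << 24));  /* hence assert(chainLog <= 24) */
    assert(chainLength <= 255);       /* hence chainLimit is clamped to 255 */
    return (chainStart << 8) + chainLength;
}

/* read during search: chainIndex = packed >> 8; chainLength = packed & 0xFF */
static U32 ddss_chainStart(U32 packed)  { return packed >> 8; }
static U32 ddss_chainLength(U32 packed) { return packed & 0xFF; }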
@@ -570,32 +661,39 @@ size_t ZSTD_HcFindBestMatch_generic (
     }
 
     if (dictMode == ZSTD_dedicatedDictSearch) {
-        const U32 ddsChainSize = (1 << dms->cParams.chainLog);
-        const U32 ddsChainMask = ddsChainSize - 1;
         const U32 ddsLowestIndex = dms->window.dictLimit;
         const BYTE* const ddsBase = dms->window.base;
         const BYTE* const ddsEnd = dms->window.nextSrc;
         const U32 ddsSize = (U32)(ddsEnd - ddsBase);
         const U32 ddsIndexDelta = dictLimit - ddsSize;
-        const U32 ddsMinChain = ddsSize > ddsChainSize ? ddsSize - ddsChainSize : 0;
         const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
-        const U32 bucketLimit = nbAttempts < bucketSize ? nbAttempts : bucketSize;
+        const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
         U32 ddsAttempt;
 
-        for (ddsAttempt = 0; ddsAttempt < bucketSize; ddsAttempt++) {
+        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
             PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
         }
 
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 const chainIndex = chainPackedPointer >> 8;
+
+            PREFETCH_L1(&dms->chainTable[chainIndex]);
+        }
+
         for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
             size_t currentMl=0;
             const BYTE* match;
             matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
             match = ddsBase + matchIndex;
 
-            if (matchIndex < ddsLowestIndex) {
+            if (!matchIndex) {
                 return ml;
             }
 
+            /* guaranteed by table construction */
+            (void)ddsLowestIndex;
+            assert(matchIndex >= ddsLowestIndex);
             assert(match+4 <= ddsEnd);
             if (MEM_read32(match) == MEM_read32(ip)) {
                 /* assumption : matchIndex <= dictLimit-4 (by table construction) */
@@ -613,27 +711,38 @@ size_t ZSTD_HcFindBestMatch_generic (
             }
         }
 
-        for ( ; (ddsAttempt < nbAttempts) & (matchIndex >= ddsMinChain); ddsAttempt++) {
-            size_t currentMl=0;
-            const BYTE* match;
-            matchIndex = dms->chainTable[matchIndex & ddsChainMask];
-            match = ddsBase + matchIndex;
-
-            if (matchIndex < ddsLowestIndex) {
-                break;
-            }
-
-            assert(match+4 <= ddsEnd);
-            if (MEM_read32(match) == MEM_read32(ip)) {
-                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-            }
-
-            /* save best solution */
-            if (currentMl > ml) {
-                ml = currentMl;
-                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 chainIndex = chainPackedPointer >> 8;
+            U32 const chainLength = chainPackedPointer & 0xFF;
+            U32 const chainAttempts = nbAttempts - ddsAttempt;
+            U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+            U32 chainAttempt;
+
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+            }
+
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+                size_t currentMl=0;
+                const BYTE* match;
+                matchIndex = dms->chainTable[chainIndex];
+                match = ddsBase + matchIndex;
+
+                /* guaranteed by table construction */
+                assert(matchIndex >= ddsLowestIndex);
+                assert(match+4 <= ddsEnd);
+                if (MEM_read32(match) == MEM_read32(ip)) {
+                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+                }
+
+                /* save best solution */
+                if (currentMl > ml) {
+                    ml = currentMl;
+                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+                }
             }
         }
     } else if (dictMode == ZSTD_dictMatchState) {
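The search is now two-phase: probe the bucketSize - 1 cached entries stored inline in the hash bucket (the newest positions), then walk the bucket's chain segment, which the construction above laid out contiguously in chainTable so it can be prefetched with a simple loop instead of chasing linked positions. A compact sketch of the probe order, under the packed-pointer format shown earlier (illustrative, not library code):

#include <stddef.h>

typedef unsigned int U32;

/* Collects the positions a DDSS lookup visits, in order: first the in-bucket
 * cache, then the contiguous chain segment, capped by nbAttempts. */
static U32 ddss_visitOrder(const U32* hashTable, const U32* chainTable,
                           size_t ddsIdx, U32 bucketSize, U32 nbAttempts,
                           U32* out /* room for nbAttempts entries */)
{
    U32 n = 0, i;
    /* phase 1: cached chain heads stored directly in the hash bucket */
    for (i = 0; i < bucketSize - 1 && n < nbAttempts; i++) {
        U32 const matchIndex = hashTable[ddsIdx + i];
        if (!matchIndex) return n;          /* bucket exhausted */
        out[n++] = matchIndex;
    }
    /* phase 2: the bucket's contiguous segment of the chain table */
    {
        U32 const packed = hashTable[ddsIdx + bucketSize - 1];
        U32 const start = packed >> 8;
        U32 const len = packed & 0xFF;
        for (i = 0; i < len && n < nbAttempts; i++) {
            out[n++] = chainTable[start + i];   /* sequential: prefetch-friendly */
        }
    }
    return n;
}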
@@ -763,6 +872,12 @@ ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
 
+    /**
+     * This table is indexed first by the four ZSTD_dictMode_e values, and then
+     * by the two searchMethod_e values. NULLs are placed for configurations
+     * that should never occur (extDict modes go to the other implementation
+     * below and there is no DDSS for binary tree search yet).
+     */
     const searchMax_f searchFuncs[4][2] = {
         {
             ZSTD_HcFindBestMatch_selectMLS,
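Given that comment, dispatch presumably reduces to a two-dimensional lookup. A sketch of the indexing (a guess at the surrounding code, which lies outside this hunk; the simplified signature is not the real searchMax_f):

#include <stddef.h>

typedef size_t (*searchMax_f)(void* ms, const void* ip);  /* simplified signature */

/* Hypothetical dispatch mirroring the table comment: dictMode selects the
 * row (0..3), the search method the column (0..1); NULL slots are
 * unreachable by construction. */
static searchMax_f selectSearch(searchMax_f const table[4][2],
                                int dictMode, int searchMethod)
{
    return table[dictMode][searchMethod];
}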
@@ -787,16 +902,13 @@ ZSTD_compressBlock_lazy_generic(
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+    const int isDxS = isDMS || isDDS;
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
-    const U32 dictLowestIndex      = isDMS || isDDS ?
-                                     dms->window.dictLimit : 0;
-    const BYTE* const dictBase     = isDMS || isDDS ?
-                                     dms->window.base : NULL;
-    const BYTE* const dictLowest   = isDMS || isDDS ?
-                                     dictBase + dictLowestIndex : NULL;
-    const BYTE* const dictEnd      = isDMS || isDDS ?
-                                     dms->window.nextSrc : NULL;
-    const U32 dictIndexDelta       = isDMS || isDDS ?
+    const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
+    const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = isDxS ?
                                      prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                      0;
     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
@@ -814,7 +926,7 @@ ZSTD_compressBlock_lazy_generic(
         if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
         if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
     }
-    if (isDMS || isDDS) {
+    if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
          * disabling. */
         assert(offset_1 <= dictAndPrefixLength);
@@ -834,7 +946,7 @@ ZSTD_compressBlock_lazy_generic(
         const BYTE* start=ip+1;
 
         /* check repCode */
-        if (isDMS || isDDS) {
+        if (isDxS) {
             const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
             const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                 && repIndex < prefixLowestIndex) ?
@@ -877,7 +989,7 @@ ZSTD_compressBlock_lazy_generic(
                     if ((mlRep >= 4) && (gain2 > gain1))
                         matchLength = mlRep, offset = 0, start = ip;
                 }
-                if (isDMS || isDDS) {
+                if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
                     const BYTE* repMatch = repIndex < prefixLowestIndex ?
                             dictBase + (repIndex - dictIndexDelta) :
@@ -912,7 +1024,7 @@ ZSTD_compressBlock_lazy_generic(
                     if ((mlRep >= 4) && (gain2 > gain1))
                         matchLength = mlRep, offset = 0, start = ip;
                 }
-                if (isDMS || isDDS) {
+                if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
                     const BYTE* repMatch = repIndex < prefixLowestIndex ?
                             dictBase + (repIndex - dictIndexDelta) :
@@ -950,7 +1062,7 @@ ZSTD_compressBlock_lazy_generic(
                  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
                 { start--; matchLength++; }
         }
-        if (isDMS || isDDS) {
+        if (isDxS) {
             U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
             const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
             const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
@@ -966,7 +1078,7 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        if (isDMS || isDDS) {
+        if (isDxS) {
             while (ip <= ilimit) {
                 U32 const current2 = (U32)(ip-base);
                 U32 const repIndex = current2 - offset_2;
lib/zstd.h
@@ -1548,15 +1548,16 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
 #define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
 
 /* Controls whether the new and experimental "dedicated dictionary search
- * structure" can be used.
+ * structure" can be used. This feature is still rough around the edges, be
+ * prepared for surprising behavior!
  *
  * How to use it:
  *
  * When using a CDict, whether to use this feature or not is controlled at
  * CDict creation, and it must be set in a CCtxParams set passed into that
- * construction. A compression will then use the feature or not based on how
- * the CDict was constructed; the value of this param, set in the CCtx, will
- * have no effect.
+ * construction (via ZSTD_createCDict_advanced2()). A compression will then
+ * use the feature or not based on how the CDict was constructed; the value of
+ * this param, set in the CCtx, will have no effect.
  *
  * However, when a dictionary buffer is passed into a CCtx, such as via
  * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control
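A minimal sketch of the CDict path described above, mirroring the calls exercised in the test changes below (experimental/static API; error handling mostly omitted for brevity):

#define ZSTD_STATIC_LINKING_ONLY  /* experimental API */
#include <zstd.h>

/* Enable the param in a CCtxParams, pass it to ZSTD_createCDict_advanced2(),
 * then compress as usual through a CCtx that references the CDict. */
static size_t compressWithDDS(void* dst, size_t dstCap,
                              const void* src, size_t srcSize,
                              const void* dict, size_t dictSize, int level)
{
    ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_CDict* cdict;
    size_t cSize;
    ZSTD_CCtxParams_init(params, level);
    ZSTD_CCtxParams_setParameter(params, ZSTD_c_enableDedicatedDictSearch, 1);
    cdict = ZSTD_createCDict_advanced2(dict, dictSize, ZSTD_dlm_byRef,
                                       ZSTD_dct_auto, params, ZSTD_defaultCMem);
    ZSTD_CCtx_refCDict(cctx, cdict);
    cSize = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);
    ZSTD_freeCDict(cdict);
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCCtxParams(params);
    return cSize;  /* check with ZSTD_isError() */
}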
@@ -1578,10 +1579,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
  * written as the compression goes along. This means we can choose a search
  * structure for the dictionary that is read-optimized.
  *
- * This feature enables the use of that different structure. Note that this
- * means that the CDict tables can no longer be copied into the CCtx, so
- * the dict attachment mode ZSTD_dictForceCopy will no longer be useable. The
- * dictionary can only be attached or reloaded.
+ * This feature enables the use of that different structure.
+ *
+ * Note that some of the members of the ZSTD_compressionParameters struct have
+ * different semantics and constraints in the dedicated search structure. It is
+ * highly recommended that you simply set a compression level in the CCtxParams
+ * you pass into the CDict creation call, and avoid messing with the cParams
+ * directly.
  *
  * Effects:
  *
@@ -1589,9 +1593,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
  * implementation supports this feature. Currently, that's limited to
  * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2.
  *
- * In general, you should expect compression to be faster, and CDict creation
- * to be slightly slower. Eventually, we will probably make this mode the
- * default.
+ * Note that this means that the CDict tables can no longer be copied into the
+ * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be
+ * useable. The dictionary can only be attached or reloaded.
+ *
+ * In general, you should expect compression to be faster--sometimes very much
+ * so--and CDict creation to be slightly slower. Eventually, we will probably
+ * make this mode the default.
  */
 #define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8
tests/fuzzer.c

@@ -571,7 +571,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
     r = ZSTD_decompress(decodedBuffer, CNBuffSize, compressedBuffer, cSize);
     if (!ZSTD_isError(r)) goto _output_error;
     if (ZSTD_getErrorCode(r) != ZSTD_error_checksum_wrong) goto _output_error;
 
-
+    CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_forceIgnoreChecksum, ZSTD_d_ignoreChecksum));
     r = ZSTD_decompressDCtx(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize-1);
     if (!ZSTD_isError(r)) goto _output_error; /* wrong checksum size should still throw error */
@@ -2926,7 +2926,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
     {
         ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
-        size_t dictSize = CNBuffSize > 110 KB ? 110 KB : CNBuffSize;
+        size_t dictSize = CNBuffSize;
         void* dict = (void*)malloc(dictSize);
         ZSTD_CCtx_params* cctx_params = ZSTD_createCCtxParams();
         ZSTD_dictAttachPref_e const attachPrefs[] = {
@@ -2934,10 +2934,13 @@ static int basicUnitTests(U32 const seed, double compressibility)
             ZSTD_dictForceAttach,
             ZSTD_dictForceCopy,
             ZSTD_dictForceLoad,
-            ZSTD_dictForceAttach
+            ZSTD_dictDefaultAttach,
+            ZSTD_dictForceAttach,
+            ZSTD_dictForceCopy,
+            ZSTD_dictForceLoad
         };
-        int const enableDedicatedDictSearch[] = {0, 0, 0, 0, 1};
-        int const cLevel = 6;
+        int const enableDedicatedDictSearch[] = {0, 0, 0, 0, 1, 1, 1, 1};
+        int cLevel;
         int i;
 
         RDG_genBuffer(dict, dictSize, 0.5, 0.5, seed);
@@ -2945,28 +2948,35 @@ static int basicUnitTests(U32 const seed, double compressibility)
 
         CHECK(cctx_params != NULL);
 
-        for (i = 0; i < 5; ++i) {
-            ZSTD_dictAttachPref_e const attachPref = attachPrefs[i];
-            int const enableDDS = enableDedicatedDictSearch[i];
-            ZSTD_CDict* cdict;
+        for (dictSize = CNBuffSize; dictSize; dictSize = dictSize >> 3) {
+            DISPLAYLEVEL(3, "\n Testing with dictSize %u ", (U32)dictSize);
+            for (cLevel = 4; cLevel < 13; cLevel++) {
+                for (i = 0; i < 8; ++i) {
+                    ZSTD_dictAttachPref_e const attachPref = attachPrefs[i];
+                    int const enableDDS = enableDedicatedDictSearch[i];
+                    ZSTD_CDict* cdict;
 
-            DISPLAYLEVEL(5, "\n iter %d ", i);
+                    DISPLAYLEVEL(5, "\n dictSize %u cLevel %d iter %d ", (U32)dictSize, cLevel, i);
 
-            ZSTD_CCtxParams_init(cctx_params, cLevel);
-            CHECK_Z(ZSTD_CCtxParams_setParameter(cctx_params, ZSTD_c_enableDedicatedDictSearch, enableDDS));
+                    ZSTD_CCtxParams_init(cctx_params, cLevel);
+                    CHECK_Z(ZSTD_CCtxParams_setParameter(cctx_params, ZSTD_c_enableDedicatedDictSearch, enableDDS));
 
-            cdict = ZSTD_createCDict_advanced2(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cctx_params, ZSTD_defaultCMem);
-            CHECK(cdict != NULL);
+                    cdict = ZSTD_createCDict_advanced2(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cctx_params, ZSTD_defaultCMem);
+                    CHECK(cdict != NULL);
 
-            CHECK_Z(ZSTD_CCtx_refCDict(cctx, cdict));
-            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, attachPref));
+                    CHECK_Z(ZSTD_CCtx_refCDict(cctx, cdict));
+                    CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, attachPref));
 
-            cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
-            CHECK_Z(cSize);
-            CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
+                    cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
+                    CHECK_Z(cSize);
+                    CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
 
-            CHECK_Z(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters));
-            ZSTD_freeCDict(cdict);
+                    DISPLAYLEVEL(5, "compressed to %u bytes ", (U32)cSize);
+
+                    CHECK_Z(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters));
+                    ZSTD_freeCDict(cdict);
+                }
+            }
         }
 
         ZSTD_freeCCtx(cctx);