[zstd] Fix data corruption in niche use case

* Extract the overflow correction into a helper function.
* Load the dictionary in chunks of at most `ZSTD_CHUNKSIZE_MAX` (512 MB)
  and apply overflow correction before each chunk.

Data corruption could happen when all these conditions are true:

* You are using multithreading mode
* Your overlap size is >= 512 MB (implies window size >= 512 MB)
* You are using a strategy >= ZSTD_btlazy2
* You are compressing more than 4 GB

The problem is that when loading a large dictionary we don't do
overflow correction. We can only load 512 MB at a time, and may
need to do overflow correction before each chunk.
This commit is contained in:
Nick Terrell 2019-06-21 15:39:43 -07:00
parent 4156060ca4
commit 674534a700

View File

@ -1826,16 +1826,15 @@ static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const
/*! ZSTD_reduceIndex() :
* rescale all indexes to avoid future overflow (indexes are U32) */
static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue)
{
ZSTD_matchState_t* const ms = &zc->blockState.matchState;
{ U32 const hSize = (U32)1 << zc->appliedParams.cParams.hashLog;
{ U32 const hSize = (U32)1 << params->cParams.hashLog;
ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
}
if (zc->appliedParams.cParams.strategy != ZSTD_fast) {
U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog;
if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2)
if (params->cParams.strategy != ZSTD_fast) {
U32 const chainSize = (U32)1 << params->cParams.chainLog;
if (params->cParams.strategy == ZSTD_btlazy2)
ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
else
ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
@ -2821,6 +2820,25 @@ out:
}
/*! ZSTD_overflowCorrectIfNeeded() :
 *  Asks ZSTD_window_needOverflowCorrection() whether the window of `ms`
 *  needs correcting for input ending at `iend`; returns without side effects
 *  when it does not.
 *  Otherwise :
 *  - corrects the window at `ip` via ZSTD_window_correctOverflow(),
 *  - passes the returned correction to ZSTD_reduceIndex(),
 *  - lowers `ms->nextToUpdate` by the correction, clamping at 0,
 *  - clears `ms->loadedDictEnd` and `ms->dictMatchState`. */
static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, void const* ip, void const* iend)
{
    if (!ZSTD_window_needOverflowCorrection(ms->window, iend)) return;

    {   U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy);
        U32 const maxDist = (U32)1 << params->cParams.windowLog;
        U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);

        /* compile-time bounds on the log limits
         * (presumably what keeps the U32 index arithmetic safe - to confirm) */
        ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);

        ZSTD_reduceIndex(ms, params, correction);

        /* shift nextToUpdate by the same amount, without wrapping below 0 */
        ms->nextToUpdate = (ms->nextToUpdate < correction) ? 0 : ms->nextToUpdate - correction;

        /* invalidate dictionaries on overflow correction */
        ms->dictMatchState = NULL;
        ms->loadedDictEnd = 0;
    }
}
/*! ZSTD_compress_frameChunk() :
* Compress a chunk of data into one or multiple blocks.
* All blocks will be terminated, all input will be consumed.
@ -2854,20 +2872,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
"not enough space to store compressed block");
if (remaining < blockSize) blockSize = remaining;
if (ZSTD_window_needOverflowCorrection(ms->window, ip + blockSize)) {
U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
ZSTD_reduceIndex(cctx, correction);
if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
else ms->nextToUpdate -= correction;
/* invalidate dictionaries on overflow correction */
ms->loadedDictEnd = 0;
ms->dictMatchState = NULL;
}
ZSTD_overflowCorrectIfNeeded(ms, &cctx->appliedParams, ip, ip + blockSize);
ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
/* Ensure hash/chain table insertion resumes no sooner than lowlimit */
@ -3007,18 +3012,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
if (!frame) {
/* overflow check and correction for block mode */
if (ZSTD_window_needOverflowCorrection(ms->window, (const char*)src + srcSize)) {
U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, 1 << cctx->appliedParams.cParams.windowLog, src);
ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
ZSTD_reduceIndex(cctx, correction);
if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
else ms->nextToUpdate -= correction;
ms->loadedDictEnd = 0;
ms->dictMatchState = NULL;
}
ZSTD_overflowCorrectIfNeeded(ms, &cctx->appliedParams, src, (BYTE const*)src + srcSize);
}
DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
@ -3074,7 +3068,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
const void* src, size_t srcSize,
ZSTD_dictTableLoadMethod_e dtlm)
{
const BYTE* const ip = (const BYTE*) src;
const BYTE* ip = (const BYTE*) src;
const BYTE* const iend = ip + srcSize;
ZSTD_window_update(&ms->window, src, srcSize);
@ -3085,32 +3079,42 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
if (srcSize <= HASH_READ_SIZE) return 0;
switch(params->cParams.strategy)
{
case ZSTD_fast:
ZSTD_fillHashTable(ms, iend, dtlm);
break;
case ZSTD_dfast:
ZSTD_fillDoubleHashTable(ms, iend, dtlm);
break;
while (iend - ip > HASH_READ_SIZE) {
size_t const remaining = iend - ip;
size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX);
const BYTE* const ichunk = ip + chunk;
case ZSTD_greedy:
case ZSTD_lazy:
case ZSTD_lazy2:
if (srcSize >= HASH_READ_SIZE)
ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE);
break;
ZSTD_overflowCorrectIfNeeded(ms, params, ip, ichunk);
case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
case ZSTD_btopt:
case ZSTD_btultra:
case ZSTD_btultra2:
if (srcSize >= HASH_READ_SIZE)
ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
break;
switch(params->cParams.strategy)
{
case ZSTD_fast:
ZSTD_fillHashTable(ms, ichunk, dtlm);
break;
case ZSTD_dfast:
ZSTD_fillDoubleHashTable(ms, ichunk, dtlm);
break;
default:
assert(0); /* not possible : not a valid strategy id */
case ZSTD_greedy:
case ZSTD_lazy:
case ZSTD_lazy2:
if (chunk >= HASH_READ_SIZE)
ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
break;
case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
case ZSTD_btopt:
case ZSTD_btultra:
case ZSTD_btultra2:
if (chunk >= HASH_READ_SIZE)
ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk);
break;
default:
assert(0); /* not possible : not a valid strategy id */
}
ip = ichunk;
}
ms->nextToUpdate = (U32)(iend - ms->window.base);