diff --git a/lib/common/huf.h b/lib/common/huf.h index 522bf9b6..1cead357 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks. For example, it's possible to compress several blocks using the same 'CTable', or to save and regenerate 'CTable' using external methods. */ -/* FSE_count() : find it within "fse.h" */ +/* FSE_count() : exposed within "fse.h" */ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 5692d56e..cfc5a98b 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu } /** HUF_buildCTable() : + * @return : maxNbBits * Note : count is used before tree is written, so they can safely overlap */ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index ef7d3a34..b9e0ec44 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2869,7 +2869,7 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* cctx, if (params.nbThreads > 1) { if (cctx->mtctx == NULL || (params.nbThreads != 
ZSTDMT_getNbThreads(cctx->mtctx))) { DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbThreads=%u (previous: %u)", - params.nbThreads, ZSTDMT_getNbThreads(cctx->mtctx)); + params.nbThreads, (U32)ZSTDMT_getNbThreads(cctx->mtctx)); ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbThreads, cctx->customMem); if (cctx->mtctx == NULL) return ERROR(memory_allocation); diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index efdffddb..b5a3957a 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -537,8 +537,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { - DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", - (COVER_MAX_SAMPLES_SIZE >> 20)); + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } /* Zero the context */ @@ -651,12 +651,16 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, } ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_cover_params_t parameters) { - BYTE *const dict = (BYTE *)dictBuffer; + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t parameters) +{ + BYTE* const dict = (BYTE*)dictBuffer; COVER_ctx_t ctx; COVER_map_t activeDmers; + + /* Initialize global data */ + g_displayLevel = parameters.zParams.notificationLevel; /* Checks */ if (!COVER_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); @@ -671,8 +675,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( ZDICT_DICTSIZE_MIN); return ERROR(dstSize_tooSmall); } - /* 
Initialize global data */ - g_displayLevel = parameters.zParams.notificationLevel; /* Initialize context and activeDmers */ if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, parameters.d)) { @@ -947,6 +949,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( unsigned k; COVER_best_t best; POOL_ctx *pool = NULL; + /* Checks */ if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index e8f45404..7d24e499 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos( U32 cumulLength[LLIMIT] = {0}; U32 savings[LLIMIT] = {0}; const BYTE* b = (const BYTE*)buffer; - size_t length; size_t maxLength = LLIMIT; size_t pos = suffix[start]; U32 end = start; @@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos( ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { /* skip and mark segment */ - U16 u16 = MEM_read16(b+pos+4); - U32 u, e = 6; - while (MEM_read16(b+pos+e) == u16) e+=2 ; - if (b[pos+e] == b[pos+e-1]) e++; - for (u=1; u=MINMATCHLENGTH); + { size_t length; + do { + end++; + length = ZDICT_count(b + pos, b + suffix[end]); + } while (length >= MINMATCHLENGTH); + } /* look backward */ - do { - length = ZDICT_count(b + pos, b + *(suffix+start-1)); - if (length >=MINMATCHLENGTH) start--; - } while(length >= MINMATCHLENGTH); + { size_t length; + do { + length = ZDICT_count(b + pos, b + *(suffix+start-1)); + if (length >=MINMATCHLENGTH) start--; + } while(length >= MINMATCHLENGTH); + } /* exit if not found a minimum nb of repetitions */ if (end-start < minRatio) { @@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos( U32 selectedCount = 0; U32 selectedID = currentID; for (id =refinedStart; id < refinedEnd; id++) { - if (b[ suffix[id] + searchLength] != currentChar) { + if (b[suffix[id] + searchLength] != currentChar) { if (currentCount > 
selectedCount) { selectedCount = currentCount; selectedID = currentID; @@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos( memset(lengthList, 0, sizeof(lengthList)); /* look forward */ - do { - end++; - length = ZDICT_count(b + pos, b + suffix[end]); - if (length >= LLIMIT) length = LLIMIT-1; - lengthList[length]++; - } while (length >=MINMATCHLENGTH); + { size_t length; + do { + end++; + length = ZDICT_count(b + pos, b + suffix[end]); + if (length >= LLIMIT) length = LLIMIT-1; + lengthList[length]++; + } while (length >=MINMATCHLENGTH); + } /* look backward */ - length = MINMATCHLENGTH; - while ((length >= MINMATCHLENGTH) & (start > 0)) { - length = ZDICT_count(b + pos, b + suffix[start - 1]); - if (length >= LLIMIT) length = LLIMIT - 1; - lengthList[length]++; - if (length >= MINMATCHLENGTH) start--; + { size_t length = MINMATCHLENGTH; + while ((length >= MINMATCHLENGTH) & (start > 0)) { + length = ZDICT_count(b + pos, b + suffix[start - 1]); + if (length >= LLIMIT) length = LLIMIT - 1; + lengthList[length]++; + if (length >= MINMATCHLENGTH) start--; + } } /* largest useful length */ @@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos( /* mark positions done */ { U32 id; for (id=start; id solution.length) length = solution.length; } pEnd = (U32)(testedPos + length); @@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length) typedef struct { - ZSTD_CCtx* ref; - ZSTD_CCtx* zc; + ZSTD_CCtx* ref; /* contains reference to dictionary */ + ZSTD_CCtx* zc; /* working context */ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; #define MAXREPOFFSET 1024 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, - U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, - const void* src, size_t srcSize, U32 notificationLevel) + U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, + const void* src, size_t srcSize, + U32 
notificationLevel) { size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ - { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); - if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } + { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); + if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } } cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } if (cSize) { /* if == 0; block is not compressible */ - const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); + const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); /* literals stats */ { const BYTE* bytePtr; @@ -659,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, } } +/* ZDICT_flatLit() : + * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals. + * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode. 
+ */ +static void ZDICT_flatLit(U32* countLit) +{ + int u; + for (u=1; u<256; u++) countLit[u] = 2; + countLit[0] = 4; + countLit[253] = 1; + countLit[254] = 1; +} #define OFFCODE_MAX 30 /* only applicable to first block */ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, @@ -688,6 +707,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, BYTE* dstPtr = (BYTE*)dstBuffer; /* init */ + DEBUGLOG(4, "ZDICT_analyzeEntropy"); esr.ref = ZSTD_createCCtx(); esr.zc = ZSTD_createCCtx(); esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); @@ -713,7 +733,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, goto _cleanup; } } - /* collect stats on all files */ + /* collect stats on all samples */ for (u=0; u=1) + params.zParams.notificationLevel = ZSTD_DEBUG; +#endif return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, - samplesBuffer, samplesSizes, - nbSamples, &params); + samplesBuffer, samplesSizes, nbSamples, + &params); } size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) { ZDICT_params_t params; memset(&params, 0, sizeof(params)); diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 5f0000b1..92f66415 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -38,21 +38,21 @@ extern "C" { /*! ZDICT_trainFromBuffer(): - * Train a dictionary from an array of samples. - * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. - * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. - * The resulting dictionary will be saved into `dictBuffer`. + * Train a dictionary from an array of samples. 
+ * Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. - * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. - * It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. - * In general, it's recommended to provide a few thousands samples, but this can vary a lot. + * or an error code, which can be tested with ZDICT_isError(). + * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. 
*/ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); /*====== Helper functions ======*/ @@ -72,14 +72,14 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); * ==================================================================================== */ typedef struct { - int compressionLevel; /* 0 means default; target a specific zstd compression level */ - unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ + int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */ + unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ + unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */ } ZDICT_params_t; /*! ZDICT_cover_params_t: - * For all values 0 means default. * k and d are the only required parameters. + * For others, value 0 means default. */ typedef struct { unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ @@ -91,28 +91,28 @@ typedef struct { /*! ZDICT_trainFromBuffer_cover(): - * Train a dictionary from an array of samples using the COVER algorithm. - * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. - * The resulting dictionary will be saved into `dictBuffer`. + * Train a dictionary from an array of samples using the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. 
+ * The resulting dictionary will be saved into `dictBuffer`. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. - * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. - * It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. - * In general, it's recommended to provide a few thousands samples, but this can vary a lot. + * or an error code, which can be tested with ZDICT_isError(). + * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. */ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_cover_params_t parameters); /*! ZDICT_optimizeTrainFromBuffer_cover(): * The same requirements as above hold for all the parameters except `parameters`. * This function tries many parameter combinations and picks the best parameters. - * `*parameters` is filled with the best parameters found, and the dictionary - * constructed with those parameters is stored in `dictBuffer`. + * `*parameters` is filled with the best parameters found, + * dictionary constructed with those parameters is stored in `dictBuffer`. * * All of the parameters d, k, steps are optional. 
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. @@ -125,9 +125,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. */ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_cover_params_t *parameters); + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t* parameters); /*! ZDICT_finalizeDictionary(): * Given a custom content as a basis for dictionary, and a set of samples, @@ -157,22 +157,23 @@ typedef struct { } ZDICT_legacy_params_t; /*! ZDICT_trainFromBuffer_legacy(): - * Train a dictionary from an array of samples. - * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. - * The resulting dictionary will be saved into `dictBuffer`. + * Train a dictionary from an array of samples. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. * `parameters` is optional and can be provided with values set to 0 to mean "default". * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. - * It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. 
- * In general, it's recommended to provide a few thousands samples, but this can vary a lot. + * or an error code, which can be tested with ZDICT_isError(). + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousands samples, though this can vary a lot. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. - * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. + * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. */ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters); + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t parameters); /* Deprecation warnings */ /* It is generally possible to disable deprecation warnings from compiler, diff --git a/tests/fuzzer.c b/tests/fuzzer.c index a27cc9b5..024a583b 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -666,12 +666,13 @@ static int basicUnitTests(U32 seed, double compressibility) /* Dictionary and dictBuilder tests */ { ZSTD_CCtx* const cctx = ZSTD_createCCtx(); - size_t dictSize = 16 KB; - void* dictBuffer = malloc(dictSize); + size_t const dictBufferCapacity = 16 KB; + void* dictBuffer = malloc(dictBufferCapacity); size_t const totalSampleSize = 1 MB; size_t const sampleUnitSize = 8 KB; U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize); size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t)); + size_t dictSize; U32 dictID; if (dictBuffer==NULL || samplesSizes==NULL) { @@ -680,9 +681,19 @@ static 
int basicUnitTests(U32 seed, double compressibility) goto _output_error; } + DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++); + assert(compressedBufferSize >= totalSampleSize); + { U32 u; for (u=0; u