diff --git a/.gitignore b/.gitignore index e7c9a568..0c458153 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ projects/cmake/ # Test artefacts tmp* +dictionary # tmp files *.swp diff --git a/NEWS b/NEWS index 7ffa4023..d01a3313 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,6 @@ v0.8.0 New : updated compresson format +Improved : better speed on clang and gcc -O2, thanks to Eric Biggers Fixed : legacy mode with ZSTD_HEAPMODE=0, by Christopher Bergqvist Fixed : premature end of frame when zero-sized raw block, reported by Eric Biggers Fixed : checksum correctly checked in single-pass mode diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index f1518555..75a9b1e3 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -924,7 +924,7 @@ size_t ZDICT_trainFromBuffer_unsafe( const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) { - U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); + U32 const dictListSize = MAX(MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); unsigned selectivity = params.selectivityLevel; size_t const targetDictSize = maxDictSize; @@ -957,17 +957,25 @@ size_t ZDICT_trainFromBuffer_unsafe( DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); DISPLAYLEVEL(3, "list %u best segments \n", nb); for (u=1; u<=nb; u++) { - U32 p = dictList[u].pos; - U32 l = dictList[u].length; - U32 d = MIN(40, l); + U32 pos = dictList[u].pos; + U32 length = dictList[u].length; + U32 printedLength = MIN(40, length); DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", - u, l, p, dictList[u].savings); - ZDICT_printHex(3, (const char*)samplesBuffer+p, d); + u, length, pos, dictList[u].savings); + ZDICT_printHex(3, (const char*)samplesBuffer+pos, printedLength); DISPLAYLEVEL(3, "| \n"); } } } /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); + U64 const totalSamplesSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); + if (dictContentSize < targetDictSize/2) { + DISPLAYLEVEL(2, "! warning : created dictionary significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); + DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); + DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); + if (totalSamplesSize < 10 * targetDictSize) + DISPLAYLEVEL(2, "! consider also increasing the number of samples (total size : %u MB)\n", (U32)(totalSamplesSize>>20)); + } /* build dict content */ { U32 u; diff --git a/programs/dibio.c b/programs/dibio.c index a61ea9cc..cb864ec1 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -202,9 +202,16 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, /* Checks */ if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + g_displayLevel = params.notificationLevel; + if (nbFiles < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing \n"); + DISPLAYLEVEL(2, "! Please provide one file per sample \n"); + DISPLAYLEVEL(2, "! Avoid concatenating multiple samples into a single file \n"); + DISPLAYLEVEL(2, "! otherwise, dictBuilder will be unable to find the beginning of each sample \n"); + DISPLAYLEVEL(2, "! resulting in distorted statistics \n"); + } /* init */ - g_displayLevel = params.notificationLevel; if (benchedSize < totalSizeToLoad) DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));