Merge pull request #507 from terrelln/dict-builder-optimize

Add new dictionary builder
This commit is contained in:
Yann Collet 2017-01-03 20:33:33 +01:00 committed by GitHub
commit 4736dffd41
18 changed files with 1286 additions and 15 deletions

View File

@ -331,6 +331,10 @@
RelativePath="..\..\..\programs\datagen.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -343,6 +343,10 @@
RelativePath="..\..\..\programs\dibio.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -327,6 +327,10 @@
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -332,6 +332,10 @@
RelativePath="..\..\..\programs\datagen.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -344,6 +344,10 @@
RelativePath="..\..\..\programs\dibio.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -328,6 +328,10 @@
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>

View File

@ -165,6 +165,7 @@
<ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
<ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
<ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\programs\datagen.c" />

View File

@ -32,6 +32,7 @@
<ClCompile Include="..\..\..\lib\deprecated\zbuff_common.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />

View File

@ -32,6 +32,7 @@
<ClCompile Include="..\..\..\lib\deprecated\zbuff_common.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />

View File

@ -29,6 +29,7 @@
<ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
<ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
<ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />

View File

@ -67,6 +67,7 @@ SET(Sources
${LIBRARY_DIR}/compress/zstd_compress.c
${LIBRARY_DIR}/decompress/huf_decompress.c
${LIBRARY_DIR}/decompress/zstd_decompress.c
${LIBRARY_DIR}/dictBuilder/cover.c
${LIBRARY_DIR}/dictBuilder/divsufsort.c
${LIBRARY_DIR}/dictBuilder/zdict.c
${LIBRARY_DIR}/deprecated/zbuff_common.c

1021
lib/dictBuilder/cover.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -86,6 +86,55 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters);
/*! COVER_params_t :
For all values 0 means default.
kMin and d are the only required parameters.
*/
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (256) : Higher means more parameters checked */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */
int compressionLevel; /* 0 means default; target a specific zstd compression level */
} COVER_params_t;
/*! COVER_trainFromBuffer() :
Train a dictionary from an array of samples using the COVER algorithm.
Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
The resulting dictionary will be saved into `dictBuffer`.
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError().
Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
In general, it's recommended to provide a few thousands samples, but this can vary a lot.
It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/
ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
COVER_params_t parameters);
/*! COVER_optimizeTrainFromBuffer() :
The same requirements as above hold for all the parameters except `parameters`.
This function tries many parameter combinations and picks the best parameters.
`*parameters` is filled with the best parameters found, and the dictionary
constructed with those parameters is stored in `dictBuffer`.
All of the parameters d, k, steps are optional.
If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
if steps is zero it defaults to its default value.
If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError().
On success `*parameters` contains the parameters selected.
*/
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
COVER_params_t *parameters);
/*! ZDICT_finalizeDictionary() :

View File

@ -42,6 +42,7 @@
#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
#define NOISELENGTH 32
@ -118,10 +119,36 @@ static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
fileSizes[n] = fileSize;
fclose(f);
} }
DISPLAYLEVEL(2, "\r%79s\r", "");
*bufferSizePtr = pos;
return n;
}
#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
static const U32 prime1 = 2654435761U;
static const U32 prime2 = 2246822519U;
U32 rand32 = *src;
rand32 *= prime1;
rand32 ^= prime2;
rand32 = DiB_rotl32(rand32, 13);
*src = rand32;
return rand32 >> 5;
}
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
/* Initialize the pseudorandom number generator */
U32 seed = 0xFD2FB528;
unsigned i;
for (i = nbFiles - 1; i > 0; --i) {
unsigned const j = DiB_rand(&seed) % (i + 1);
const char* tmp = fileNamesTable[j];
fileNamesTable[j] = fileNamesTable[i];
fileNamesTable[i] = tmp;
}
}
/*-********************************************************
* Dictionary training functions
@ -202,7 +229,8 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t params)
ZDICT_params_t *params, COVER_params_t *coverParams,
int optimizeCover)
{
void* const dictBuffer = malloc(maxDictSize);
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
@ -213,8 +241,10 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
int result = 0;
/* Checks */
if (params) g_displayLevel = params->notificationLevel;
else if (coverParams) g_displayLevel = coverParams->notificationLevel;
else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
g_displayLevel = params.notificationLevel;
if (g_tooLargeSamples) {
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
@ -233,12 +263,29 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
/* Load input buffer */
DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
{ size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
params);
{
size_t dictSize;
if (params) {
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
*params);
} else if (optimizeCover) {
dictSize = COVER_optimizeTrainFromBuffer(
dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
coverParams);
if (!ZDICT_isError(dictSize)) {
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
}
} else {
dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
*coverParams);
}
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
result = 1;

View File

@ -32,7 +32,7 @@
*/
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t parameters);
ZDICT_params_t *params, COVER_params_t *coverParams,
int optimizeCover);
#endif

View File

@ -127,6 +127,8 @@ static int usage_advanced(const char* programName)
DISPLAY( "\n");
DISPLAY( "Dictionary builder :\n");
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
@ -192,6 +194,27 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
}
#ifndef ZSTD_NODICT
/**
* parseCoverParameters() :
* reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params
* @return 1 means that cover parameters were correct
* @return 0 in case of malformed parameters
*/
static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
{
memset(params, 0, sizeof(*params));
for (; ;) {
if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
return 0;
}
if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
return 1;
}
#endif
/** parseCompressionParameters() :
* reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
* @return 1 means that compression parameters were correct
@ -254,6 +277,10 @@ int main(int argCount, const char* argv[])
char* fileNamesBuf = NULL;
unsigned fileNamesNb;
#endif
#ifndef ZSTD_NODICT
COVER_params_t coverParams;
int cover = 0;
#endif
/* init */
(void)recursive; (void)cLevelLast; /* not used when ZSTD_NOBENCH set */
@ -318,6 +345,20 @@ int main(int argCount, const char* argv[])
if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
/* long commands with arguments */
#ifndef ZSTD_NODICT
if (longCommandWArg(&argument, "--cover=")) {
cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
continue;
}
if (longCommandWArg(&argument, "--optimize-cover")) {
cover=2;
/* Allow optional arguments following an = */
if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
continue;
}
#endif
if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
@ -520,13 +561,20 @@ int main(int argCount, const char* argv[])
/* Check if dictionary builder is selected */
if (operation==zom_train) {
#ifndef ZSTD_NODICT
ZDICT_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
if (cover) {
coverParams.compressionLevel = dictCLevel;
coverParams.notificationLevel = displayLevel;
coverParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
} else {
ZDICT_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
}
#endif
goto _end;
}

View File

@ -28,6 +28,7 @@
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressContinue, ZSTD_compressBlock */
#include "zstd.h" /* ZSTD_VERSION_STRING */
#include "zstd_errors.h" /* ZSTD_getErrorCode */
#define ZDICT_STATIC_LINKING_ONLY
#include "zdict.h" /* ZDICT_trainFromBuffer */
#include "datagen.h" /* RDG_genBuffer */
#include "mem.h"
@ -317,6 +318,61 @@ static int basicUnitTests(U32 seed, double compressibility)
free(samplesSizes);
}
/* COVER dictionary builder tests */
{ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
size_t dictSize = 16 KB;
size_t optDictSize = dictSize;
void* dictBuffer = malloc(dictSize);
size_t const totalSampleSize = 1 MB;
size_t const sampleUnitSize = 8 KB;
U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
COVER_params_t params;
U32 dictID;
if (dictBuffer==NULL || samplesSizes==NULL) {
free(dictBuffer);
free(samplesSizes);
goto _output_error;
}
DISPLAYLEVEL(4, "test%3i : COVER_trainFromBuffer : ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
memset(&params, 0, sizeof(params));
params.d = 1 + (FUZ_rand(&seed) % 16);
params.k = params.d + (FUZ_rand(&seed) % 256);
dictSize = COVER_trainFromBuffer(dictBuffer, dictSize,
CNBuffer, samplesSizes, nbSamples,
params);
if (ZDICT_isError(dictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
DISPLAYLEVEL(4, "test%3i : check dictID : ", testNb++);
dictID = ZDICT_getDictID(dictBuffer, dictSize);
if (dictID==0) goto _output_error;
DISPLAYLEVEL(4, "OK : %u \n", dictID);
DISPLAYLEVEL(4, "test%3i : COVER_optimizeTrainFromBuffer : ", testNb++);
memset(&params, 0, sizeof(params));
params.steps = 4;
optDictSize = COVER_optimizeTrainFromBuffer(dictBuffer, optDictSize,
CNBuffer, samplesSizes, nbSamples,
&params);
if (ZDICT_isError(optDictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)optDictSize);
DISPLAYLEVEL(4, "test%3i : check dictID : ", testNb++);
dictID = ZDICT_getDictID(dictBuffer, optDictSize);
if (dictID==0) goto _output_error;
DISPLAYLEVEL(4, "OK : %u \n", dictID);
ZSTD_freeCCtx(cctx);
ZSTD_freeDCtx(dctx);
free(dictBuffer);
free(samplesSizes);
}
/* Decompression defense tests */
DISPLAYLEVEL(4, "test%3i : Check input length for magic number : ", testNb++);
{ size_t const r = ZSTD_decompress(decodedBuffer, CNBuffSize, CNBuffer, 3);

View File

@ -255,6 +255,27 @@ rm -rf dirTestDict
rm tmp*
$ECHO "\n**** cover dictionary tests **** "
TESTFILE=../programs/zstdcli.c
./datagen > tmpDict
$ECHO "- Create first dictionary"
$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c -o tmpDict
cp $TESTFILE tmp
$ZSTD -f tmp -D tmpDict
$ZSTD -d tmp.zst -D tmpDict -fo result
$DIFF $TESTFILE result
$ECHO "- Create second (different) dictionary"
$ZSTD --train --cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
$ECHO "- Create dictionary with short dictID"
$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c --dictID 1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
$ECHO "- Create dictionary with size limit"
$ZSTD --train --optimize-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict 4K
rm tmp*
$ECHO "\n**** integrity tests **** "
$ECHO "test one file (tmp1.zst) "