Adding shrinking flag for cover and fastcover (#1656)
* Changed ERROR(GENERIC) excluding inits * editing git ignore * Edited init functions to size_t returns * moved declarations earlier * resolved issues with changes to init functions * fixed style and an error check * attempting to add tests that might trigger changes * added && die to cases expecting to fail * resolved no die on expected failed command * fixed accel to be incorrect value * Adding an automated shrinking option * Fixing build * finalizing fixes * fix? * Removing added comment in cover.h * Styling fixes * Merging with fb dev * removing megic number for default regression * Requested revisions * fixing support for fast cover * fixing casting errors * parenthesis fix * fixing some build nits * resolving travis ci syntax * might resolve all compilation issues * removed unused variable * remodeling the selectDict function * fixing bad memory access * fixing error checks * fixed erroring check in selectDict * fixing mixed declarations * modify mixed declaration * fixing nits and adding test cases * Adding requested changes + fixed bug for error checking * switched double comparison from != to < * fixed declaration typing * refactoring COVER_best_finish() and changing shrinkDict * removing the const's * modifying ZDICT_optimizeTrainFromBuffer_cover functions * fixing potential bad memcpy * fixing the error function for dict size
This commit is contained in:
parent
9038579ab2
commit
c55d2e7ba3
@ -889,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
|
||||
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
||||
* If this dictionary is the best so far save it and its parameters.
|
||||
*/
|
||||
void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
||||
ZDICT_cover_params_t parameters, void *dict,
|
||||
size_t dictSize) {
|
||||
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
||||
COVER_dictSelection_t selection) {
|
||||
void* dict = selection.dictContent;
|
||||
size_t compressedSize = selection.totalCompressedSize;
|
||||
size_t dictSize = selection.dictSize;
|
||||
if (!best) {
|
||||
return;
|
||||
}
|
||||
@ -917,6 +919,9 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
||||
}
|
||||
}
|
||||
/* Save the dictionary, parameters, and size */
|
||||
if (!dict) {
|
||||
return;
|
||||
}
|
||||
memcpy(best->dict, dict, dictSize);
|
||||
best->dictSize = dictSize;
|
||||
best->parameters = parameters;
|
||||
@ -929,6 +934,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
||||
}
|
||||
}
|
||||
|
||||
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
|
||||
COVER_dictSelection_t selection = { NULL, 0, error };
|
||||
return selection;
|
||||
}
|
||||
|
||||
/**
 * Returns non-zero when `selection` describes a failure: either it carries no
 * dictionary content, or its totalCompressedSize is a ZSTD error code.
 */
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  if (selection.dictContent == NULL) {
    return 1;
  }
  return ZSTD_isError(selection.totalCompressedSize) ? 1 : 0;
}
|
||||
|
||||
/**
 * Releases the dictionary buffer held by `selection`. An error selection holds
 * a NULL buffer, for which free() is a no-op.
 */
void COVER_dictSelectionFree(COVER_dictSelection_t selection){
  BYTE* const content = selection.dictContent;
  free(content);
}
|
||||
|
||||
/**
 * Finalizes the dictionary built from `customDictContent` and selects which
 * dictionary to return.
 *
 * The full-size dictionary is finalized with ZDICT_finalizeDictionary() and
 * scored with COVER_checkTotalCompressedSize() first. When params.shrinkDict
 * is 0 that dictionary is returned as is. Otherwise candidates are finalized
 * from the trailing bytes of the content, starting at ZDICT_DICTSIZE_MIN and
 * doubling the previously finalized size each round; the first candidate whose
 * total compressed size is within params.shrinkDictMaxRegression percent of
 * the full-size result is returned. If no candidate qualifies, the full-size
 * dictionary is returned.
 *
 * The incoming `totalCompressedSize` argument is overwritten before it is
 * read; it is kept only so the signature stays unchanged.
 *
 * On success the returned dictContent is a malloc()ed buffer that the caller
 * releases with COVER_dictSelectionFree(). On failure the result comes from
 * COVER_dictSelectionError() and both working buffers have been freed.
 */
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
  BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    /* Report a real error code: passing dictContentSize here would leave a
     * plain size in totalCompressedSize instead of a ZSTD error. */
    return COVER_dictSelectionError(ERROR(memory_allocation));
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
    largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }

  /* Shrinking disabled: the full-size dictionary is the answer. */
  if (params.shrinkDict == 0) {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    /* Candidate content is the last dictContentSize bytes of the input. */
    dictContentSize = ZDICT_finalizeDictionary(
      candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }

    /* Accept the first candidate that stays within the allowed regression. */
    if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
      free(largestDictbuffer);
      return selection;
    }
    dictContentSize *= 2;
  }

  /* No smaller candidate qualified: fall back to the full-size dictionary. */
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
}
|
||||
|
||||
/**
|
||||
* Parameters for COVER_tryParameters().
|
||||
*/
|
||||
@ -954,6 +1064,7 @@ static void COVER_tryParameters(void *opaque) {
|
||||
/* Allocate space for hash table, dict, and freqs */
|
||||
COVER_map_t activeDmers;
|
||||
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
||||
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
||||
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
||||
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
||||
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
||||
@ -969,29 +1080,21 @@ static void COVER_tryParameters(void *opaque) {
|
||||
{
|
||||
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
||||
dictBufferCapacity, parameters);
|
||||
dictBufferCapacity = ZDICT_finalizeDictionary(
|
||||
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
|
||||
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
|
||||
parameters.zParams);
|
||||
if (ZDICT_isError(dictBufferCapacity)) {
|
||||
DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
|
||||
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
||||
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
||||
totalCompressedSize);
|
||||
|
||||
if (COVER_dictSelectionIsError(selection)) {
|
||||
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
||||
goto _cleanup;
|
||||
}
|
||||
}
|
||||
/* Check total compressed size */
|
||||
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
||||
ctx->samples, ctx->offsets,
|
||||
ctx->nbTrainSamples, ctx->nbSamples,
|
||||
dict, dictBufferCapacity);
|
||||
|
||||
_cleanup:
|
||||
COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
|
||||
dictBufferCapacity);
|
||||
free(dict);
|
||||
COVER_best_finish(data->best, parameters, selection);
|
||||
free(data);
|
||||
COVER_map_destroy(&activeDmers);
|
||||
if (dict) {
|
||||
free(dict);
|
||||
}
|
||||
COVER_dictSelectionFree(selection);
|
||||
if (freqs) {
|
||||
free(freqs);
|
||||
}
|
||||
@ -1013,6 +1116,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
||||
const unsigned kIterations =
|
||||
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
||||
const unsigned shrinkDict = 0;
|
||||
/* Local variables */
|
||||
const int displayLevel = parameters->zParams.notificationLevel;
|
||||
unsigned iteration = 1;
|
||||
@ -1091,6 +1195,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
data->parameters.d = d;
|
||||
data->parameters.splitPoint = splitPoint;
|
||||
data->parameters.steps = kSteps;
|
||||
data->parameters.shrinkDict = shrinkDict;
|
||||
data->parameters.zParams.notificationLevel = g_displayLevel;
|
||||
/* Check the parameters */
|
||||
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|
||||
|
@ -46,6 +46,15 @@ typedef struct {
|
||||
U32 size;
|
||||
} COVER_epoch_info_t;
|
||||
|
||||
/**
|
||||
* Struct used for the dictionary selection function.
|
||||
*/
|
||||
typedef struct COVER_dictSelection {
|
||||
BYTE* dictContent; /* Selected dictionary buffer; NULL for an error selection; released by COVER_dictSelectionFree() */
|
||||
size_t dictSize; /* Size of the selected dictionary as returned by ZDICT_finalizeDictionary(); 0 for an error selection */
|
||||
size_t totalCompressedSize; /* Result of COVER_checkTotalCompressedSize() for the selected dictionary, or the error value for an error selection */
|
||||
} COVER_dictSelection_t;
|
||||
|
||||
/**
|
||||
* Computes the number of epochs and the size of each epoch.
|
||||
* We will make sure that each epoch gets at least 10 * k bytes.
|
||||
@ -107,6 +116,32 @@ void COVER_best_start(COVER_best_t *best);
|
||||
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
||||
* If this dictionary is the best so far save it and its parameters.
|
||||
*/
|
||||
void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
||||
ZDICT_cover_params_t parameters, void *dict,
|
||||
size_t dictSize);
|
||||
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
||||
COVER_dictSelection_t selection);
|
||||
/**
|
||||
* Error function for COVER_selectDict function. Checks if the return
|
||||
* value is an error.
|
||||
*/
|
||||
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
||||
|
||||
/**
|
||||
* Error function for COVER_selectDict function. Returns a struct where
|
||||
* return.totalCompressedSize is a ZSTD error.
|
||||
*/
|
||||
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
||||
|
||||
/**
|
||||
* Always call after selectDict is called to free up used memory from
|
||||
* newly created dictionary.
|
||||
*/
|
||||
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
||||
|
||||
/**
|
||||
* Called to finalize the dictionary and select one based on whether or not
|
||||
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
||||
* smallest dictionary within a specified regression of the compressed size
|
||||
* from the largest dictionary.
|
||||
*/
|
||||
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
||||
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
||||
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|
||||
|
@ -435,7 +435,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
||||
return tail;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parameters for FASTCOVER_tryParameters().
|
||||
*/
|
||||
@ -464,6 +463,7 @@ static void FASTCOVER_tryParameters(void *opaque)
|
||||
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
|
||||
/* Allocate space for hash table, dict, and freqs */
|
||||
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
||||
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
||||
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
|
||||
if (!segmentFreqs || !dict || !freqs) {
|
||||
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
||||
@ -474,26 +474,23 @@ static void FASTCOVER_tryParameters(void *opaque)
|
||||
/* Build the dictionary */
|
||||
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
|
||||
parameters, segmentFreqs);
|
||||
|
||||
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
|
||||
dictBufferCapacity = ZDICT_finalizeDictionary(
|
||||
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
|
||||
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
|
||||
if (ZDICT_isError(dictBufferCapacity)) {
|
||||
DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
|
||||
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
||||
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
||||
totalCompressedSize);
|
||||
|
||||
if (COVER_dictSelectionIsError(selection)) {
|
||||
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
||||
goto _cleanup;
|
||||
}
|
||||
}
|
||||
/* Check total compressed size */
|
||||
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
||||
ctx->samples, ctx->offsets,
|
||||
ctx->nbTrainSamples, ctx->nbSamples,
|
||||
dict, dictBufferCapacity);
|
||||
_cleanup:
|
||||
COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
|
||||
dictBufferCapacity);
|
||||
free(dict);
|
||||
COVER_best_finish(data->best, parameters, selection);
|
||||
free(data);
|
||||
free(segmentFreqs);
|
||||
free(dict);
|
||||
COVER_dictSelectionFree(selection);
|
||||
free(freqs);
|
||||
}
|
||||
|
||||
@ -508,6 +505,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
|
||||
coverParams->nbThreads = fastCoverParams.nbThreads;
|
||||
coverParams->splitPoint = fastCoverParams.splitPoint;
|
||||
coverParams->zParams = fastCoverParams.zParams;
|
||||
coverParams->shrinkDict = fastCoverParams.shrinkDict;
|
||||
}
|
||||
|
||||
|
||||
@ -524,6 +522,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
|
||||
fastCoverParams->f = f;
|
||||
fastCoverParams->accel = accel;
|
||||
fastCoverParams->zParams = coverParams.zParams;
|
||||
fastCoverParams->shrinkDict = coverParams.shrinkDict;
|
||||
}
|
||||
|
||||
|
||||
@ -619,6 +618,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
||||
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
|
||||
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
|
||||
const unsigned shrinkDict = 0;
|
||||
/* Local variables */
|
||||
const int displayLevel = parameters->zParams.notificationLevel;
|
||||
unsigned iteration = 1;
|
||||
@ -703,6 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
data->parameters.d = d;
|
||||
data->parameters.splitPoint = splitPoint;
|
||||
data->parameters.steps = kSteps;
|
||||
data->parameters.shrinkDict = shrinkDict;
|
||||
data->parameters.zParams.notificationLevel = g_displayLevel;
|
||||
/* Check the parameters */
|
||||
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
|
||||
|
@ -94,6 +94,8 @@ typedef struct {
|
||||
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
|
||||
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
|
||||
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
|
||||
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
|
||||
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
|
||||
ZDICT_params_t zParams;
|
||||
} ZDICT_cover_params_t;
|
||||
|
||||
@ -105,6 +107,9 @@ typedef struct {
|
||||
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
|
||||
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
|
||||
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
|
||||
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
|
||||
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
|
||||
|
||||
ZDICT_params_t zParams;
|
||||
} ZDICT_fastCover_params_t;
|
||||
|
||||
|
@ -179,8 +179,8 @@ static int usage_advanced(const char* programName)
|
||||
DISPLAY( "\n");
|
||||
DISPLAY( "Dictionary builder : \n");
|
||||
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
|
||||
DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#,shrink[=#]] : use the cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#,shrink[=#]] : use the fast cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
|
||||
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
|
||||
DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
|
||||
@ -299,6 +299,7 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
|
||||
* @return 1 means that cover parameters were correct
|
||||
* @return 0 in case of malformed parameters
|
||||
*/
|
||||
static const unsigned kDefaultRegression = 1;
|
||||
static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
|
||||
{
|
||||
memset(params, 0, sizeof(*params));
|
||||
@ -311,10 +312,23 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
|
||||
params->splitPoint = (double)splitPercentage / 100.0;
|
||||
if (stringPtr[0]==',') { stringPtr++; continue; } else break;
|
||||
}
|
||||
if (longCommandWArg(&stringPtr, "shrink")) {
|
||||
params->shrinkDictMaxRegression = kDefaultRegression;
|
||||
params->shrinkDict = 1;
|
||||
if (stringPtr[0]=='=') {
|
||||
stringPtr++;
|
||||
params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
|
||||
}
|
||||
if (stringPtr[0]==',') {
|
||||
stringPtr++;
|
||||
continue;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (stringPtr[0] != 0) return 0;
|
||||
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100));
|
||||
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\nshrink%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100), params->shrinkDictMaxRegression);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -338,10 +352,23 @@ static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_
|
||||
params->splitPoint = (double)splitPercentage / 100.0;
|
||||
if (stringPtr[0]==',') { stringPtr++; continue; } else break;
|
||||
}
|
||||
if (longCommandWArg(&stringPtr, "shrink")) {
|
||||
params->shrinkDictMaxRegression = kDefaultRegression;
|
||||
params->shrinkDict = 1;
|
||||
if (stringPtr[0]=='=') {
|
||||
stringPtr++;
|
||||
params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
|
||||
}
|
||||
if (stringPtr[0]==',') {
|
||||
stringPtr++;
|
||||
continue;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (stringPtr[0] != 0) return 0;
|
||||
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
|
||||
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\nshrink=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel, params->shrinkDictMaxRegression);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -367,6 +394,8 @@ static ZDICT_cover_params_t defaultCoverParams(void)
|
||||
params.d = 8;
|
||||
params.steps = 4;
|
||||
params.splitPoint = 1.0;
|
||||
params.shrinkDict = 0;
|
||||
params.shrinkDictMaxRegression = kDefaultRegression;
|
||||
return params;
|
||||
}
|
||||
|
||||
@ -379,6 +408,8 @@ static ZDICT_fastCover_params_t defaultFastCoverParams(void)
|
||||
params.steps = 4;
|
||||
params.splitPoint = 0.75; /* different from default splitPoint of cover */
|
||||
params.accel = DEFAULT_ACCEL;
|
||||
params.shrinkDict = 0;
|
||||
params.shrinkDictMaxRegression = kDefaultRegression;
|
||||
return params;
|
||||
}
|
||||
#endif
|
||||
|
@ -1104,6 +1104,22 @@ static int basicUnitTests(U32 seed, double compressibility)
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : COVER dictBuilder with shrinkDict: ", testNb++);
|
||||
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
|
||||
{ ZDICT_cover_params_t coverParams;
|
||||
memset(&coverParams, 0, sizeof(coverParams));
|
||||
coverParams.steps = 8;
|
||||
coverParams.nbThreads = 4;
|
||||
coverParams.shrinkDict = 1;
|
||||
coverParams.shrinkDictMaxRegression = 1;
|
||||
dictSize = ZDICT_optimizeTrainFromBuffer_cover(
|
||||
dictBuffer, dictBufferCapacity,
|
||||
CNBuffer, samplesSizes, nbSamples/8, /* less samples for faster tests */
|
||||
&coverParams);
|
||||
if (ZDICT_isError(dictSize)) goto _output_error;
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : Multithreaded FASTCOVER dictBuilder : ", testNb++);
|
||||
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
|
||||
{ ZDICT_fastCover_params_t fastCoverParams;
|
||||
@ -1118,6 +1134,22 @@ static int basicUnitTests(U32 seed, double compressibility)
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : FASTCOVER dictBuilder with shrinkDict: ", testNb++);
|
||||
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
|
||||
{ ZDICT_fastCover_params_t fastCoverParams;
|
||||
memset(&fastCoverParams, 0, sizeof(fastCoverParams));
|
||||
fastCoverParams.steps = 8;
|
||||
fastCoverParams.nbThreads = 4;
|
||||
fastCoverParams.shrinkDict = 1;
|
||||
fastCoverParams.shrinkDictMaxRegression = 1;
|
||||
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
dictBuffer, dictBufferCapacity,
|
||||
CNBuffer, samplesSizes, nbSamples,
|
||||
&fastCoverParams);
|
||||
if (ZDICT_isError(dictSize)) goto _output_error;
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : check dictID : ", testNb++);
|
||||
dictID = ZDICT_getDictID(dictBuffer, dictSize);
|
||||
if (dictID==0) goto _output_error;
|
||||
|
@ -499,6 +499,10 @@ $ZSTD --train-fastcover=k=56,d=8 && die "Create dictionary without input file"
|
||||
println "- Create dictionary with short dictID"
|
||||
$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
|
||||
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
|
||||
println "- Create dictionaries with shrink-dict flag enabled"
|
||||
$ZSTD --train-fastcover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict
|
||||
$ZSTD --train-fastcover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict1
|
||||
$ZSTD --train-fastcover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict2
|
||||
println "- Create dictionary with size limit"
|
||||
$ZSTD --train-fastcover=steps=8 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict2 --maxdict=4K
|
||||
println "- Compare size of dictionary from 90% training samples with 80% training samples"
|
||||
@ -989,6 +993,10 @@ $ZSTD --train-cover=k=56,d=8 && die "Create dictionary without input file (shoul
|
||||
println "- Create second (different) dictionary"
|
||||
$ZSTD --train-cover=k=56,d=8 "$TESTDIR"/*.c "$PRGDIR"/*.c "$PRGDIR"/*.h -o tmpDictC
|
||||
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
|
||||
println "- Create dictionary using shrink-dict flag"
|
||||
$ZSTD --train-cover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict
|
||||
$ZSTD --train-cover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict1
|
||||
$ZSTD --train-cover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict2
|
||||
println "- Create dictionary with short dictID"
|
||||
$ZSTD --train-cover=k=46,d=8,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
|
||||
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
|
||||
|
Loading…
Reference in New Issue
Block a user