Adding shrinking flag for cover and fastcover (#1656)

* Changed ERROR(GENERIC) excluding inits

* editing git ignore

* Edited init functions to size_t returns

* moved declarations earlier

* resolved issues with changes to init functions

* fixed style and an error check

* attempting to add tests that might trigger changes

* added && die to cases expecting to fail

* resolved no die on expected failed command

* fixed accel to be incorrect value

* Adding an automated shrinking option

* Fixing build

* finalizing fixes

* fix?

* Removing added comment in cover.h

* Styling fixes

* Merging with fb dev

* removing magic number for default regression

* Requested revisions

* fixing support for fast cover

* fixing casting errors

* parenthesis fix

* fixing some build nits

* resolving travis ci syntax

* might resolve all compilation issues

* removed unused variable

* remodeling the selectDict function

* fixing bad memory access

* fixing error checks

* fixed erroring check in selectDict

* fixing mixed declarations

* modify mixed declaration

* fixing nits and adding test cases

* Adding requested changes + fixed bug for error checking

* switched double comparison from != to <

* fixed declaration typing

* refactoring COVER_best_finish() and changing shrinkDict

* removing the const's

* modifying ZDICT_optimizeTrainFromBuffer_cover functions

* fixing potential bad memcpy

* fixing the error function for dict size
This commit is contained in:
Tyler-Tran 2019-06-27 16:26:57 -07:00 committed by Nick Terrell
parent 9038579ab2
commit c55d2e7ba3
7 changed files with 259 additions and 42 deletions

View File

@ -889,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
* Decrements liveJobs and signals any waiting threads if liveJobs == 0. * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
* If this dictionary is the best so far save it and its parameters. * If this dictionary is the best so far save it and its parameters.
*/ */
void COVER_best_finish(COVER_best_t *best, size_t compressedSize, void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
ZDICT_cover_params_t parameters, void *dict, COVER_dictSelection_t selection) {
size_t dictSize) { void* dict = selection.dictContent;
size_t compressedSize = selection.totalCompressedSize;
size_t dictSize = selection.dictSize;
if (!best) { if (!best) {
return; return;
} }
@ -917,6 +919,9 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
} }
} }
/* Save the dictionary, parameters, and size */ /* Save the dictionary, parameters, and size */
if (!dict) {
return;
}
memcpy(best->dict, dict, dictSize); memcpy(best->dict, dict, dictSize);
best->dictSize = dictSize; best->dictSize = dictSize;
best->parameters = parameters; best->parameters = parameters;
@ -929,6 +934,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
} }
} }
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
COVER_dictSelection_t selection = { NULL, 0, error };
return selection;
}
/**
 * Returns 1 when `selection` does not hold a usable dictionary, i.e. when
 * its dictContent is NULL or its totalCompressedSize is a ZSTD error code;
 * returns 0 otherwise.
 */
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  if (selection.dictContent == NULL) {
    return 1;
  }
  return ZSTD_isError(selection.totalCompressedSize) ? 1 : 0;
}
/**
 * Releases the dictionary buffer held by `selection`.
 * Safe on an error selection: its dictContent is NULL and free(NULL) is a no-op.
 */
void COVER_dictSelectionFree(COVER_dictSelection_t selection) {
  BYTE* const ownedBuffer = selection.dictContent;
  free(ownedBuffer);
}
/**
 * Releases both scratch buffers of COVER_selectDict() and returns an error
 * selection carrying `error`. Either buffer may be NULL (free(NULL) is a no-op).
 */
static COVER_dictSelection_t COVER_selectDictFail(BYTE* largestDictbuffer,
                                                  BYTE* candidateDictBuffer,
                                                  size_t error) {
  free(largestDictbuffer);
  free(candidateDictBuffer);
  return COVER_dictSelectionError(error);
}

/**
 * Finalizes the dictionary built from `customDictContent` and, when
 * params.shrinkDict is set, looks for a smaller dictionary that compresses
 * almost as well.
 *
 * The full-size dictionary is finalized and evaluated first. With shrinking
 * disabled it is returned directly. Otherwise candidate sizes start at
 * ZDICT_DICTSIZE_MIN and double each round; the first candidate whose total
 * compressed size is <= largest * (1 + params.shrinkDictMaxRegression / 100)
 * is returned. If no candidate qualifies, the full-size dictionary is returned.
 *
 * The incoming value of `totalCompressedSize` is never read; the parameter is
 * overwritten before use and is kept only for interface compatibility.
 *
 * @return On success, a selection whose dictContent was allocated with
 *         malloc(); the caller releases it with COVER_dictSelectionFree().
 *         On failure, a selection for which COVER_dictSelectionIsError()
 *         is true (NULL dictContent, ZSTD error code in totalCompressedSize).
 */
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE* largestDictbuffer = (BYTE*)malloc(dictContentSize);
  BYTE* candidateDictBuffer = (BYTE*)malloc(dictContentSize);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    /* Must be a real error code: a plain byte count here would look like a
     * very good compressed size to any caller comparing totalCompressedSize. */
    return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer,
                                ERROR(memory_allocation));
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
      largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer,
                                dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer,
                                totalCompressedSize);
  }

  /* Shrinking disabled: hand back the full-size dictionary. */
  if (params.shrinkDict == 0) {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    /* Candidate content is the last dictContentSize bytes of the custom content. */
    dictContentSize = ZDICT_finalizeDictionary(
        candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
        samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer,
                                  dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer,
                                  totalCompressedSize);
    }

    /* First (smallest) candidate within the allowed regression wins. */
    if (totalCompressedSize <= largestCompressed * regressionTolerance) {
      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
      free(largestDictbuffer);
      return selection;
    }
    dictContentSize *= 2;
  }

  /* No smaller dictionary was good enough: fall back to the largest one. */
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
}
/** /**
* Parameters for COVER_tryParameters(). * Parameters for COVER_tryParameters().
*/ */
@ -954,6 +1064,7 @@ static void COVER_tryParameters(void *opaque) {
/* Allocate space for hash table, dict, and freqs */ /* Allocate space for hash table, dict, and freqs */
COVER_map_t activeDmers; COVER_map_t activeDmers;
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
@ -969,29 +1080,21 @@ static void COVER_tryParameters(void *opaque) {
{ {
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
dictBufferCapacity, parameters); dictBufferCapacity, parameters);
dictBufferCapacity = ZDICT_finalizeDictionary( selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, totalCompressedSize);
parameters.zParams);
if (ZDICT_isError(dictBufferCapacity)) { if (COVER_dictSelectionIsError(selection)) {
DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); DISPLAYLEVEL(1, "Failed to select dictionary\n");
goto _cleanup; goto _cleanup;
} }
} }
/* Check total compressed size */
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
ctx->samples, ctx->offsets,
ctx->nbTrainSamples, ctx->nbSamples,
dict, dictBufferCapacity);
_cleanup: _cleanup:
COVER_best_finish(data->best, totalCompressedSize, parameters, dict, free(dict);
dictBufferCapacity); COVER_best_finish(data->best, parameters, selection);
free(data); free(data);
COVER_map_destroy(&activeDmers); COVER_map_destroy(&activeDmers);
if (dict) { COVER_dictSelectionFree(selection);
free(dict);
}
if (freqs) { if (freqs) {
free(freqs); free(freqs);
} }
@ -1013,6 +1116,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
const unsigned kIterations = const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
const unsigned shrinkDict = 0;
/* Local variables */ /* Local variables */
const int displayLevel = parameters->zParams.notificationLevel; const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1; unsigned iteration = 1;
@ -1091,6 +1195,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
data->parameters.d = d; data->parameters.d = d;
data->parameters.splitPoint = splitPoint; data->parameters.splitPoint = splitPoint;
data->parameters.steps = kSteps; data->parameters.steps = kSteps;
data->parameters.shrinkDict = shrinkDict;
data->parameters.zParams.notificationLevel = g_displayLevel; data->parameters.zParams.notificationLevel = g_displayLevel;
/* Check the parameters */ /* Check the parameters */
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {

View File

@ -46,6 +46,15 @@ typedef struct {
U32 size; U32 size;
} COVER_epoch_info_t; } COVER_epoch_info_t;
/**
 * Struct used for the dictionary selection function.
 * Produced by COVER_selectDict() or COVER_dictSelectionError() and released
 * with COVER_dictSelectionFree().
 */
typedef struct COVER_dictSelection {
/* Heap buffer holding the selected dictionary; NULL for an error selection.
 * Freed by COVER_dictSelectionFree(). */
BYTE* dictContent;
/* Size in bytes of the dictionary stored in dictContent (0 for an error selection). */
size_t dictSize;
/* Total compressed size measured with this dictionary, or a ZSTD error code
 * when the selection is an error. */
size_t totalCompressedSize;
} COVER_dictSelection_t;
/** /**
* Computes the number of epochs and the size of each epoch. * Computes the number of epochs and the size of each epoch.
* We will make sure that each epoch gets at least 10 * k bytes. * We will make sure that each epoch gets at least 10 * k bytes.
@ -107,6 +116,32 @@ void COVER_best_start(COVER_best_t *best);
* Decrements liveJobs and signals any waiting threads if liveJobs == 0. * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
* If this dictionary is the best so far save it and its parameters. * If this dictionary is the best so far save it and its parameters.
*/ */
void COVER_best_finish(COVER_best_t *best, size_t compressedSize, void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
ZDICT_cover_params_t parameters, void *dict, COVER_dictSelection_t selection);
size_t dictSize); /**
 * Error check for the result of COVER_selectDict. Returns non-zero when
 * `selection` has a NULL dictContent or a totalCompressedSize that is a ZSTD error.
*/
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
/**
 * Error constructor for the COVER_selectDict function. Returns a struct with
 * a NULL dictContent, a dictSize of 0, and totalCompressedSize set to `error`,
 * which is expected to be a ZSTD error code.
 */
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
/**
 * Always call after COVER_selectDict to free the memory used by the newly
 * created dictionary (selection.dictContent). Calling it on an error
 * selection is harmless because its dictContent is NULL.
 */
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
/**
 * Called to finalize the dictionary and select one based on whether or not
 * the shrink-dict flag (params.shrinkDict) was enabled. If enabled, the
 * dictionary used is the smallest candidate whose total compressed size is
 * within params.shrinkDictMaxRegression percent of the compressed size
 * obtained with the largest dictionary; otherwise the largest dictionary is used.
 *
 * The returned dictContent is heap-allocated; release it with
 * COVER_dictSelectionFree(). Check the result with
 * COVER_dictSelectionIsError() before using it.
 */
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);

View File

@ -435,7 +435,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
return tail; return tail;
} }
/** /**
* Parameters for FASTCOVER_tryParameters(). * Parameters for FASTCOVER_tryParameters().
*/ */
@ -464,6 +463,7 @@ static void FASTCOVER_tryParameters(void *opaque)
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
/* Allocate space for hash table, dict, and freqs */ /* Allocate space for hash table, dict, and freqs */
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
if (!segmentFreqs || !dict || !freqs) { if (!segmentFreqs || !dict || !freqs) {
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
@ -473,27 +473,24 @@ static void FASTCOVER_tryParameters(void *opaque)
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32)); memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
/* Build the dictionary */ /* Build the dictionary */
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity, { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
parameters, segmentFreqs); parameters, segmentFreqs);
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
dictBufferCapacity = ZDICT_finalizeDictionary( selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams); totalCompressedSize);
if (ZDICT_isError(dictBufferCapacity)) {
DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); if (COVER_dictSelectionIsError(selection)) {
DISPLAYLEVEL(1, "Failed to select dictionary\n");
goto _cleanup; goto _cleanup;
} }
} }
/* Check total compressed size */
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
ctx->samples, ctx->offsets,
ctx->nbTrainSamples, ctx->nbSamples,
dict, dictBufferCapacity);
_cleanup: _cleanup:
COVER_best_finish(data->best, totalCompressedSize, parameters, dict, free(dict);
dictBufferCapacity); COVER_best_finish(data->best, parameters, selection);
free(data); free(data);
free(segmentFreqs); free(segmentFreqs);
free(dict); COVER_dictSelectionFree(selection);
free(freqs); free(freqs);
} }
@ -508,6 +505,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
coverParams->nbThreads = fastCoverParams.nbThreads; coverParams->nbThreads = fastCoverParams.nbThreads;
coverParams->splitPoint = fastCoverParams.splitPoint; coverParams->splitPoint = fastCoverParams.splitPoint;
coverParams->zParams = fastCoverParams.zParams; coverParams->zParams = fastCoverParams.zParams;
coverParams->shrinkDict = fastCoverParams.shrinkDict;
} }
@ -524,6 +522,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
fastCoverParams->f = f; fastCoverParams->f = f;
fastCoverParams->accel = accel; fastCoverParams->accel = accel;
fastCoverParams->zParams = coverParams.zParams; fastCoverParams->zParams = coverParams.zParams;
fastCoverParams->shrinkDict = coverParams.shrinkDict;
} }
@ -619,6 +618,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f; const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel; const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
const unsigned shrinkDict = 0;
/* Local variables */ /* Local variables */
const int displayLevel = parameters->zParams.notificationLevel; const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1; unsigned iteration = 1;
@ -703,6 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
data->parameters.d = d; data->parameters.d = d;
data->parameters.splitPoint = splitPoint; data->parameters.splitPoint = splitPoint;
data->parameters.steps = kSteps; data->parameters.steps = kSteps;
data->parameters.shrinkDict = shrinkDict;
data->parameters.zParams.notificationLevel = g_displayLevel; data->parameters.zParams.notificationLevel = g_displayLevel;
/* Check the parameters */ /* Check the parameters */
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,

View File

@ -94,6 +94,8 @@ typedef struct {
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
ZDICT_params_t zParams; ZDICT_params_t zParams;
} ZDICT_cover_params_t; } ZDICT_cover_params_t;
@ -105,6 +107,9 @@ typedef struct {
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */ unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
ZDICT_params_t zParams; ZDICT_params_t zParams;
} ZDICT_fastCover_params_t; } ZDICT_fastCover_params_t;

View File

@ -179,8 +179,8 @@ static int usage_advanced(const char* programName)
DISPLAY( "\n"); DISPLAY( "\n");
DISPLAY( "Dictionary builder : \n"); DISPLAY( "Dictionary builder : \n");
DISPLAY( "--train ## : create a dictionary from a training set of files \n"); DISPLAY( "--train ## : create a dictionary from a training set of files \n");
DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n"); DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#,shrink[=#]] : use the cover algorithm with optional args\n");
DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n"); DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#,shrink[=#]] : use the fast cover algorithm with optional args\n");
DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel); DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName); DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize); DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@ -299,6 +299,7 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
* @return 1 means that cover parameters were correct * @return 1 means that cover parameters were correct
* @return 0 in case of malformed parameters * @return 0 in case of malformed parameters
*/ */
static const unsigned kDefaultRegression = 1;
static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params) static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
{ {
memset(params, 0, sizeof(*params)); memset(params, 0, sizeof(*params));
@ -311,10 +312,23 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
params->splitPoint = (double)splitPercentage / 100.0; params->splitPoint = (double)splitPercentage / 100.0;
if (stringPtr[0]==',') { stringPtr++; continue; } else break; if (stringPtr[0]==',') { stringPtr++; continue; } else break;
} }
if (longCommandWArg(&stringPtr, "shrink")) {
params->shrinkDictMaxRegression = kDefaultRegression;
params->shrinkDict = 1;
if (stringPtr[0]=='=') {
stringPtr++;
params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
}
if (stringPtr[0]==',') {
stringPtr++;
continue;
}
else break;
}
return 0; return 0;
} }
if (stringPtr[0] != 0) return 0; if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100)); DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\nshrink%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100), params->shrinkDictMaxRegression);
return 1; return 1;
} }
@ -338,10 +352,23 @@ static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_
params->splitPoint = (double)splitPercentage / 100.0; params->splitPoint = (double)splitPercentage / 100.0;
if (stringPtr[0]==',') { stringPtr++; continue; } else break; if (stringPtr[0]==',') { stringPtr++; continue; } else break;
} }
if (longCommandWArg(&stringPtr, "shrink")) {
params->shrinkDictMaxRegression = kDefaultRegression;
params->shrinkDict = 1;
if (stringPtr[0]=='=') {
stringPtr++;
params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
}
if (stringPtr[0]==',') {
stringPtr++;
continue;
}
else break;
}
return 0; return 0;
} }
if (stringPtr[0] != 0) return 0; if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel); DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\nshrink=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel, params->shrinkDictMaxRegression);
return 1; return 1;
} }
@ -367,6 +394,8 @@ static ZDICT_cover_params_t defaultCoverParams(void)
params.d = 8; params.d = 8;
params.steps = 4; params.steps = 4;
params.splitPoint = 1.0; params.splitPoint = 1.0;
params.shrinkDict = 0;
params.shrinkDictMaxRegression = kDefaultRegression;
return params; return params;
} }
@ -379,6 +408,8 @@ static ZDICT_fastCover_params_t defaultFastCoverParams(void)
params.steps = 4; params.steps = 4;
params.splitPoint = 0.75; /* different from default splitPoint of cover */ params.splitPoint = 0.75; /* different from default splitPoint of cover */
params.accel = DEFAULT_ACCEL; params.accel = DEFAULT_ACCEL;
params.shrinkDict = 0;
params.shrinkDictMaxRegression = kDefaultRegression;
return params; return params;
} }
#endif #endif

View File

@ -1104,6 +1104,22 @@ static int basicUnitTests(U32 seed, double compressibility)
} }
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize); DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
DISPLAYLEVEL(3, "test%3i : COVER dictBuilder with shrinkDict: ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
{ ZDICT_cover_params_t coverParams;
memset(&coverParams, 0, sizeof(coverParams));
coverParams.steps = 8;
coverParams.nbThreads = 4;
coverParams.shrinkDict = 1;
coverParams.shrinkDictMaxRegression = 1;
dictSize = ZDICT_optimizeTrainFromBuffer_cover(
dictBuffer, dictBufferCapacity,
CNBuffer, samplesSizes, nbSamples/8, /* less samples for faster tests */
&coverParams);
if (ZDICT_isError(dictSize)) goto _output_error;
}
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
DISPLAYLEVEL(3, "test%3i : Multithreaded FASTCOVER dictBuilder : ", testNb++); DISPLAYLEVEL(3, "test%3i : Multithreaded FASTCOVER dictBuilder : ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; } { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
{ ZDICT_fastCover_params_t fastCoverParams; { ZDICT_fastCover_params_t fastCoverParams;
@ -1118,6 +1134,22 @@ static int basicUnitTests(U32 seed, double compressibility)
} }
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize); DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
DISPLAYLEVEL(3, "test%3i : FASTCOVER dictBuilder with shrinkDict: ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
{ ZDICT_fastCover_params_t fastCoverParams;
memset(&fastCoverParams, 0, sizeof(fastCoverParams));
fastCoverParams.steps = 8;
fastCoverParams.nbThreads = 4;
fastCoverParams.shrinkDict = 1;
fastCoverParams.shrinkDictMaxRegression = 1;
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(
dictBuffer, dictBufferCapacity,
CNBuffer, samplesSizes, nbSamples,
&fastCoverParams);
if (ZDICT_isError(dictSize)) goto _output_error;
}
DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
DISPLAYLEVEL(3, "test%3i : check dictID : ", testNb++); DISPLAYLEVEL(3, "test%3i : check dictID : ", testNb++);
dictID = ZDICT_getDictID(dictBuffer, dictSize); dictID = ZDICT_getDictID(dictBuffer, dictSize);
if (dictID==0) goto _output_error; if (dictID==0) goto _output_error;

View File

@ -499,6 +499,10 @@ $ZSTD --train-fastcover=k=56,d=8 && die "Create dictionary without input file"
println "- Create dictionary with short dictID" println "- Create dictionary with short dictID"
$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1 $ZSTD --train-fastcover=k=46,d=8,f=15,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
println "- Create dictionaries with shrink-dict flag enabled"
$ZSTD --train-fastcover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict
$ZSTD --train-fastcover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict1
$ZSTD --train-fastcover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict2
println "- Create dictionary with size limit" println "- Create dictionary with size limit"
$ZSTD --train-fastcover=steps=8 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict2 --maxdict=4K $ZSTD --train-fastcover=steps=8 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict2 --maxdict=4K
println "- Compare size of dictionary from 90% training samples with 80% training samples" println "- Compare size of dictionary from 90% training samples with 80% training samples"
@ -989,6 +993,10 @@ $ZSTD --train-cover=k=56,d=8 && die "Create dictionary without input file (shoul
println "- Create second (different) dictionary" println "- Create second (different) dictionary"
$ZSTD --train-cover=k=56,d=8 "$TESTDIR"/*.c "$PRGDIR"/*.c "$PRGDIR"/*.h -o tmpDictC $ZSTD --train-cover=k=56,d=8 "$TESTDIR"/*.c "$PRGDIR"/*.c "$PRGDIR"/*.h -o tmpDictC
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!" $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
println "- Create dictionary using shrink-dict flag"
$ZSTD --train-cover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict
$ZSTD --train-cover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict1
$ZSTD --train-cover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict2
println "- Create dictionary with short dictID" println "- Create dictionary with short dictID"
$ZSTD --train-cover=k=46,d=8,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1 $ZSTD --train-cover=k=46,d=8,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"