Merge pull request #736 from terrelln/cover-default-api

[zdict] Make COVER the default algorithm
This commit is contained in:
Yann Collet 2017-06-28 20:25:36 -07:00 committed by GitHub
commit 811deaea6f
8 changed files with 182 additions and 196 deletions

View File

@ -398,7 +398,8 @@ typedef struct {
*/ */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
COVER_map_t *activeDmers, U32 begin, COVER_map_t *activeDmers, U32 begin,
U32 end, COVER_params_t parameters) { U32 end,
ZDICT_cover_params_t parameters) {
/* Constants */ /* Constants */
const U32 k = parameters.k; const U32 k = parameters.k;
const U32 d = parameters.d; const U32 d = parameters.d;
@ -478,7 +479,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
* Check the validity of the parameters. * Check the validity of the parameters.
* Returns non-zero if the parameters are valid and 0 otherwise. * Returns non-zero if the parameters are valid and 0 otherwise.
*/ */
static int COVER_checkParameters(COVER_params_t parameters) { static int COVER_checkParameters(ZDICT_cover_params_t parameters) {
/* k and d are required parameters */ /* k and d are required parameters */
if (parameters.d == 0 || parameters.k == 0) { if (parameters.d == 0 || parameters.k == 0) {
return 0; return 0;
@ -600,7 +601,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
COVER_map_t *activeDmers, void *dictBuffer, COVER_map_t *activeDmers, void *dictBuffer,
size_t dictBufferCapacity, size_t dictBufferCapacity,
COVER_params_t parameters) { ZDICT_cover_params_t parameters) {
BYTE *const dict = (BYTE *)dictBuffer; BYTE *const dict = (BYTE *)dictBuffer;
size_t tail = dictBufferCapacity; size_t tail = dictBufferCapacity;
/* Divide the data up into epochs of equal size. /* Divide the data up into epochs of equal size.
@ -639,22 +640,10 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
return tail; return tail;
} }
/** ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
* Translate from COVER_params_t to ZDICT_params_t required for finalizing the
* dictionary.
*/
static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
ZDICT_params_t zdictParams;
memset(&zdictParams, 0, sizeof(zdictParams));
zdictParams.notificationLevel = 1;
zdictParams.dictID = parameters.dictID;
zdictParams.compressionLevel = parameters.compressionLevel;
return zdictParams;
}
ZDICTLIB_API size_t COVER_trainFromBuffer(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) { const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t parameters) {
BYTE *const dict = (BYTE *)dictBuffer; BYTE *const dict = (BYTE *)dictBuffer;
COVER_ctx_t ctx; COVER_ctx_t ctx;
COVER_map_t activeDmers; COVER_map_t activeDmers;
@ -673,7 +662,7 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
return ERROR(dstSize_tooSmall); return ERROR(dstSize_tooSmall);
} }
/* Initialize global data */ /* Initialize global data */
g_displayLevel = parameters.notificationLevel; g_displayLevel = parameters.zParams.notificationLevel;
/* Initialize context and activeDmers */ /* Initialize context and activeDmers */
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
parameters.d)) { parameters.d)) {
@ -690,10 +679,9 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
const size_t tail = const size_t tail =
COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer, COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
dictBufferCapacity, parameters); dictBufferCapacity, parameters);
ZDICT_params_t zdictParams = COVER_translateParams(parameters);
const size_t dictionarySize = ZDICT_finalizeDictionary( const size_t dictionarySize = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
samplesBuffer, samplesSizes, nbSamples, zdictParams); samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
if (!ZSTD_isError(dictionarySize)) { if (!ZSTD_isError(dictionarySize)) {
DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
(U32)dictionarySize); (U32)dictionarySize);
@ -718,7 +706,7 @@ typedef struct COVER_best_s {
size_t liveJobs; size_t liveJobs;
void *dict; void *dict;
size_t dictSize; size_t dictSize;
COVER_params_t parameters; ZDICT_cover_params_t parameters;
size_t compressedSize; size_t compressedSize;
} COVER_best_t; } COVER_best_t;
@ -786,7 +774,7 @@ static void COVER_best_start(COVER_best_t *best) {
* If this dictionary is the best so far save it and its parameters. * If this dictionary is the best so far save it and its parameters.
*/ */
static void COVER_best_finish(COVER_best_t *best, size_t compressedSize, static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
COVER_params_t parameters, void *dict, ZDICT_cover_params_t parameters, void *dict,
size_t dictSize) { size_t dictSize) {
if (!best) { if (!best) {
return; return;
@ -830,7 +818,7 @@ typedef struct COVER_tryParameters_data_s {
const COVER_ctx_t *ctx; const COVER_ctx_t *ctx;
COVER_best_t *best; COVER_best_t *best;
size_t dictBufferCapacity; size_t dictBufferCapacity;
COVER_params_t parameters; ZDICT_cover_params_t parameters;
} COVER_tryParameters_data_t; } COVER_tryParameters_data_t;
/** /**
@ -842,7 +830,7 @@ static void COVER_tryParameters(void *opaque) {
/* Save parameters as local variables */ /* Save parameters as local variables */
COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
const COVER_ctx_t *const ctx = data->ctx; const COVER_ctx_t *const ctx = data->ctx;
const COVER_params_t parameters = data->parameters; const ZDICT_cover_params_t parameters = data->parameters;
size_t dictBufferCapacity = data->dictBufferCapacity; size_t dictBufferCapacity = data->dictBufferCapacity;
size_t totalCompressedSize = ERROR(GENERIC); size_t totalCompressedSize = ERROR(GENERIC);
/* Allocate space for hash table, dict, and freqs */ /* Allocate space for hash table, dict, and freqs */
@ -863,10 +851,10 @@ static void COVER_tryParameters(void *opaque) {
{ {
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
dictBufferCapacity, parameters); dictBufferCapacity, parameters);
const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
dictBufferCapacity = ZDICT_finalizeDictionary( dictBufferCapacity = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams); ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
parameters.zParams);
if (ZDICT_isError(dictBufferCapacity)) { if (ZDICT_isError(dictBufferCapacity)) {
DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
goto _cleanup; goto _cleanup;
@ -892,8 +880,8 @@ static void COVER_tryParameters(void *opaque) {
} }
/* Create the cctx and cdict */ /* Create the cctx and cdict */
cctx = ZSTD_createCCtx(); cctx = ZSTD_createCCtx();
cdict = cdict = ZSTD_createCDict(dict, dictBufferCapacity,
ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel); parameters.zParams.compressionLevel);
if (!dst || !cctx || !cdict) { if (!dst || !cctx || !cdict) {
goto _compressCleanup; goto _compressCleanup;
} }
@ -930,12 +918,10 @@ _cleanup:
} }
} }
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer, ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
size_t dictBufferCapacity, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
const size_t *samplesSizes, ZDICT_cover_params_t *parameters) {
unsigned nbSamples,
COVER_params_t *parameters) {
/* constants */ /* constants */
const unsigned nbThreads = parameters->nbThreads; const unsigned nbThreads = parameters->nbThreads;
const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
@ -947,7 +933,7 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
const unsigned kIterations = const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
/* Local variables */ /* Local variables */
const int displayLevel = parameters->notificationLevel; const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1; unsigned iteration = 1;
unsigned d; unsigned d;
unsigned k; unsigned k;
@ -976,7 +962,7 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
/* Initialization */ /* Initialization */
COVER_best_init(&best); COVER_best_init(&best);
/* Turn down global display level to clean up display at level 2 and below */ /* Turn down global display level to clean up display at level 2 and below */
g_displayLevel = parameters->notificationLevel - 1; g_displayLevel = parameters->zParams.notificationLevel - 1;
/* Loop through d first because each new value needs a new context */ /* Loop through d first because each new value needs a new context */
LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
kIterations); kIterations);

View File

@ -487,7 +487,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
} }
static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize, static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
const size_t* fileSizes, unsigned nbFiles, const size_t* fileSizes, unsigned nbFiles,
U32 minRatio, U32 notificationLevel) U32 minRatio, U32 notificationLevel)
@ -634,17 +634,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
} } } } } }
} }
/*
static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
{
unsigned u;
size_t max=0;
for (u=0; u<nbFiles; u++)
if (max < fileSizes[u]) max = fileSizes[u];
return max;
}
*/
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
{ {
size_t total=0; size_t total=0;
@ -930,14 +919,14 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
} }
/*! ZDICT_trainFromBuffer_unsafe() : /*! ZDICT_trainFromBuffer_unsafe_legacy() :
* Warning : `samplesBuffer` must be followed by noisy guard band. * Warning : `samplesBuffer` must be followed by noisy guard band.
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError() * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
*/ */
size_t ZDICT_trainFromBuffer_unsafe( size_t ZDICT_trainFromBuffer_unsafe_legacy(
void* dictBuffer, size_t maxDictSize, void* dictBuffer, size_t maxDictSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t params) ZDICT_legacy_params_t params)
{ {
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@ -946,7 +935,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
size_t const targetDictSize = maxDictSize; size_t const targetDictSize = maxDictSize;
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
size_t dictSize = 0; size_t dictSize = 0;
U32 const notificationLevel = params.notificationLevel; U32 const notificationLevel = params.zParams.notificationLevel;
/* checks */ /* checks */
if (!dictList) return ERROR(memory_allocation); if (!dictList) return ERROR(memory_allocation);
@ -957,13 +946,13 @@ size_t ZDICT_trainFromBuffer_unsafe(
ZDICT_initDictItem(dictList); ZDICT_initDictItem(dictList);
/* build dictionary */ /* build dictionary */
ZDICT_trainBuffer(dictList, dictListSize, ZDICT_trainBuffer_legacy(dictList, dictListSize,
samplesBuffer, samplesBuffSize, samplesBuffer, samplesBuffSize,
samplesSizes, nbSamples, samplesSizes, nbSamples,
minRep, notificationLevel); minRep, notificationLevel);
/* display best matches */ /* display best matches */
if (params.notificationLevel>= 3) { if (params.zParams.notificationLevel>= 3) {
U32 const nb = MIN(25, dictList[0].pos); U32 const nb = MIN(25, dictList[0].pos);
U32 const dictContentSize = ZDICT_dictSize(dictList); U32 const dictContentSize = ZDICT_dictSize(dictList);
U32 u; U32 u;
@ -1026,7 +1015,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
samplesBuffer, samplesSizes, nbSamples, samplesBuffer, samplesSizes, nbSamples,
params); params.zParams);
} }
/* clean up */ /* clean up */
@ -1037,9 +1026,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
/* issue : samplesBuffer needs to be followed by a noisy guard band. /* issue : samplesBuffer needs to be followed by a noisy guard band.
* work around : duplicate the buffer, and add the noise */ * work around : duplicate the buffer, and add the noise */
size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t params) ZDICT_legacy_params_t params)
{ {
size_t result; size_t result;
void* newBuff; void* newBuff;
@ -1052,10 +1041,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
memcpy(newBuff, samplesBuffer, sBuffSize); memcpy(newBuff, samplesBuffer, sBuffSize);
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
result = ZDICT_trainFromBuffer_unsafe( result =
dictBuffer, dictBufferCapacity, ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
newBuff, samplesSizes, nbSamples, samplesSizes, nbSamples, params);
params);
free(newBuff); free(newBuff);
return result; return result;
} }
@ -1064,11 +1052,13 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{ {
ZDICT_params_t params; ZDICT_cover_params_t params;
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));
return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, params.d = 8;
samplesBuffer, samplesSizes, nbSamples, params.steps = 4;
params); return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
samplesBuffer, samplesSizes,
nbSamples, &params);
} }
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,

View File

@ -37,16 +37,18 @@ extern "C" {
/*! ZDICT_trainFromBuffer(): /*! ZDICT_trainFromBuffer():
Train a dictionary from an array of samples. * Train a dictionary from an array of samples.
Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
The resulting dictionary will be saved into `dictBuffer`. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * The resulting dictionary will be saved into `dictBuffer`.
or an error code, which can be tested with ZDICT_isError(). * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
Tips : In general, a reasonable dictionary has a size of ~ 100 KB. * or an error code, which can be tested with ZDICT_isError().
It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
In general, it's recommended to provide a few thousand samples, but this can vary a lot. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
* In general, it's recommended to provide a few thousand samples, but this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/ */
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
@ -69,93 +71,77 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
* ==================================================================================== */ * ==================================================================================== */
typedef struct { typedef struct {
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
int compressionLevel; /* 0 means default; target a specific zstd compression level */ int compressionLevel; /* 0 means default; target a specific zstd compression level */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */
unsigned reserved[2]; /* reserved space for future parameters */
} ZDICT_params_t; } ZDICT_params_t;
/*! ZDICT_cover_params_t:
/*! ZDICT_trainFromBuffer_advanced() : * For all values 0 means default.
Same as ZDICT_trainFromBuffer() with control over more parameters. * k and d are the only required parameters.
`parameters` is optional and can be provided with values set to 0 to mean "default".
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`),
or an error code, which can be tested by ZDICT_isError().
note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using notificationLevel>0.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters);
/*! COVER_params_t :
For all values 0 means default.
k and d are the only required parameters.
*/ */
typedef struct { typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ ZDICT_params_t zParams;
unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ } ZDICT_cover_params_t;
int compressionLevel; /* 0 means default; target a specific zstd compression level */
} COVER_params_t;
/*! COVER_trainFromBuffer() : /*! ZDICT_trainFromBuffer_cover():
Train a dictionary from an array of samples using the COVER algorithm. * Train a dictionary from an array of samples using the COVER algorithm.
Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
The resulting dictionary will be saved into `dictBuffer`. * The resulting dictionary will be saved into `dictBuffer`.
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte. * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
Tips : In general, a reasonable dictionary has a size of ~ 100 KB. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. * It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
In general, it's recommended to provide a few thousand samples, but this can vary a lot. * In general, it's recommended to provide a few thousand samples, but this can vary a lot.
It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/ */
ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
COVER_params_t parameters); const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t parameters);
/*! COVER_optimizeTrainFromBuffer() : /*! ZDICT_optimizeTrainFromBuffer_cover():
The same requirements as above hold for all the parameters except `parameters`. * The same requirements as above hold for all the parameters except `parameters`.
This function tries many parameter combinations and picks the best parameters. * This function tries many parameter combinations and picks the best parameters.
`*parameters` is filled with the best parameters found, and the dictionary * `*parameters` is filled with the best parameters found, and the dictionary
constructed with those parameters is stored in `dictBuffer`. * constructed with those parameters is stored in `dictBuffer`.
*
All of the parameters d, k, steps are optional. * All of the parameters d, k, steps are optional.
If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
If steps is zero, the default value (32) is used. * If steps is zero, the default value (32) is used.
If k is non-zero then we don't check multiple values of k; otherwise we check `steps` values of k in [16, 2048]. * If k is non-zero then we don't check multiple values of k; otherwise we check `steps` values of k in [16, 2048].
*
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
On success `*parameters` contains the parameters selected. * On success `*parameters` contains the parameters selected.
Note : COVER_optimizeTrainFromBuffer() requires about 8 bytes of memory for each input byte, plus another 5 bytes of memory for each input byte for each thread. * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte, plus another 5 bytes of memory for each input byte for each thread.
*/ */
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
COVER_params_t *parameters); const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t *parameters);
/*! ZDICT_finalizeDictionary(): /*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples,
Given a custom content as a basis for dictionary, and a set of samples, * finalize dictionary by adding headers and statistics.
finalize dictionary by adding headers and statistics. *
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
Samples must be stored concatenated in a flat buffer `samplesBuffer`, * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. *
* dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes. * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes. *
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), * or an error code, which can be tested by ZDICT_isError().
or an error code, which can be tested by ZDICT_isError(). * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. * Note 2: dictBuffer and dictContent can overlap
note 2 : dictBuffer and dictContent can overlap
*/ */
#define ZDICT_CONTENTSIZE_MIN 128 #define ZDICT_CONTENTSIZE_MIN 128
#define ZDICT_DICTSIZE_MIN 256 #define ZDICT_DICTSIZE_MIN 256
@ -164,7 +150,28 @@ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBuffer
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters); ZDICT_params_t parameters);
typedef struct {
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
ZDICT_params_t zParams;
} ZDICT_legacy_params_t;
/*! ZDICT_trainFromBuffer_legacy():
* Train a dictionary from an array of samples.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* `parameters` is optional and can be provided with values set to 0 to mean "default".
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
* In general, it's recommended to provide a few thousand samples, but this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
* Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters);
/* Deprecation warnings */ /* Deprecation warnings */
/* It is generally possible to disable deprecation warnings from compiler, /* It is generally possible to disable deprecation warnings from compiler,

View File

@ -216,21 +216,21 @@ static U64 DiB_getTotalCappedFileSize(const char** fileNamesTable, unsigned nbFi
} }
/*! ZDICT_trainFromBuffer_unsafe() : /*! ZDICT_trainFromBuffer_unsafe_legacy() :
Strictly Internal use only !! Strictly Internal use only !!
Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`. Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
`samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads. `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code. or an error code.
*/ */
size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity, size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters); ZDICT_legacy_params_t parameters);
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t *params, COVER_params_t *coverParams, ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
int optimizeCover) int optimizeCover)
{ {
void* const dictBuffer = malloc(maxDictSize); void* const dictBuffer = malloc(maxDictSize);
@ -243,8 +243,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
int result = 0; int result = 0;
/* Checks */ /* Checks */
if (params) g_displayLevel = params->notificationLevel; if (params) g_displayLevel = params->zParams.notificationLevel;
else if (coverParams) g_displayLevel = coverParams->notificationLevel; else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */ else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
if (g_tooLargeSamples) { if (g_tooLargeSamples) {
@ -273,20 +273,20 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
size_t dictSize; size_t dictSize;
if (params) { if (params) {
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */ DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize, dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles, srcBuffer, fileSizes, nbFiles,
*params); *params);
} else if (optimizeCover) { } else if (optimizeCover) {
dictSize = COVER_optimizeTrainFromBuffer( dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles, srcBuffer, fileSizes, nbFiles,
coverParams); coverParams);
if (!ZDICT_isError(dictSize)) { if (!ZDICT_isError(dictSize)) {
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps); DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
} }
} else { } else {
dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize, dictSize =
srcBuffer, fileSizes, nbFiles, ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
*coverParams); fileSizes, nbFiles, *coverParams);
} }
if (ZDICT_isError(dictSize)) { if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */

View File

@ -32,7 +32,7 @@
*/ */
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t *params, COVER_params_t *coverParams, ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
int optimizeCover); int optimizeCover);
#endif #endif

View File

@ -248,7 +248,7 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
* @return 1 means that cover parameters were correct * @return 1 means that cover parameters were correct
* @return 0 in case of malformed parameters * @return 0 in case of malformed parameters
*/ */
static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params) static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
{ {
memset(params, 0, sizeof(*params)); memset(params, 0, sizeof(*params));
for (; ;) { for (; ;) {
@ -277,9 +277,9 @@ static unsigned parseLegacyParameters(const char* stringPtr, unsigned* selectivi
return 1; return 1;
} }
static COVER_params_t defaultCoverParams(void) static ZDICT_cover_params_t defaultCoverParams(void)
{ {
COVER_params_t params; ZDICT_cover_params_t params;
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));
params.d = 8; params.d = 8;
params.steps = 4; params.steps = 4;
@ -358,7 +358,7 @@ int main(int argCount, const char* argv[])
unsigned fileNamesNb; unsigned fileNamesNb;
#endif #endif
#ifndef ZSTD_NODICT #ifndef ZSTD_NODICT
COVER_params_t coverParams = defaultCoverParams(); ZDICT_cover_params_t coverParams = defaultCoverParams();
int cover = 1; int cover = 1;
#endif #endif
@ -699,20 +699,20 @@ int main(int argCount, const char* argv[])
/* Check if dictionary builder is selected */ /* Check if dictionary builder is selected */
if (operation==zom_train) { if (operation==zom_train) {
#ifndef ZSTD_NODICT #ifndef ZSTD_NODICT
ZDICT_params_t zParams;
zParams.compressionLevel = dictCLevel;
zParams.notificationLevel = g_displayLevel;
zParams.dictID = dictID;
if (cover) { if (cover) {
int const optimize = !coverParams.k || !coverParams.d; int const optimize = !coverParams.k || !coverParams.d;
coverParams.nbThreads = nbThreads; coverParams.nbThreads = nbThreads;
coverParams.compressionLevel = dictCLevel; coverParams.zParams = zParams;
coverParams.notificationLevel = g_displayLevel;
coverParams.dictID = dictID;
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize); operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize);
} else { } else {
ZDICT_params_t dictParams; ZDICT_legacy_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams)); memset(&dictParams, 0, sizeof(dictParams));
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect; dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = g_displayLevel; dictParams.zParams = zParams;
dictParams.dictID = dictID;
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0); operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
} }
#endif #endif

View File

@ -638,7 +638,7 @@ static int basicUnitTests(U32 seed, double compressibility)
size_t const sampleUnitSize = 8 KB; size_t const sampleUnitSize = 8 KB;
U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize); U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t)); size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
COVER_params_t params; ZDICT_cover_params_t params;
U32 dictID; U32 dictID;
if (dictBuffer==NULL || samplesSizes==NULL) { if (dictBuffer==NULL || samplesSizes==NULL) {
@ -647,12 +647,12 @@ static int basicUnitTests(U32 seed, double compressibility)
goto _output_error; goto _output_error;
} }
DISPLAYLEVEL(4, "test%3i : COVER_trainFromBuffer : ", testNb++); DISPLAYLEVEL(4, "test%3i : ZDICT_trainFromBuffer_cover : ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; } { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));
params.d = 1 + (FUZ_rand(&seed) % 16); params.d = 1 + (FUZ_rand(&seed) % 16);
params.k = params.d + (FUZ_rand(&seed) % 256); params.k = params.d + (FUZ_rand(&seed) % 256);
dictSize = COVER_trainFromBuffer(dictBuffer, dictSize, dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, dictSize,
CNBuffer, samplesSizes, nbSamples, CNBuffer, samplesSizes, nbSamples,
params); params);
if (ZDICT_isError(dictSize)) goto _output_error; if (ZDICT_isError(dictSize)) goto _output_error;
@ -663,12 +663,12 @@ static int basicUnitTests(U32 seed, double compressibility)
if (dictID==0) goto _output_error; if (dictID==0) goto _output_error;
DISPLAYLEVEL(4, "OK : %u \n", dictID); DISPLAYLEVEL(4, "OK : %u \n", dictID);
DISPLAYLEVEL(4, "test%3i : COVER_optimizeTrainFromBuffer : ", testNb++); DISPLAYLEVEL(4, "test%3i : ZDICT_optimizeTrainFromBuffer_cover : ", testNb++);
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));
params.steps = 4; params.steps = 4;
optDictSize = COVER_optimizeTrainFromBuffer(dictBuffer, optDictSize, optDictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, optDictSize,
CNBuffer, samplesSizes, nbSamples / 4, CNBuffer, samplesSizes,
&params); nbSamples / 4, &params);
if (ZDICT_isError(optDictSize)) goto _output_error; if (ZDICT_isError(optDictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)optDictSize); DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)optDictSize);

View File

@ -131,7 +131,10 @@ static const void *symbols[] = {
&ZDICT_isError, &ZDICT_isError,
&ZDICT_getErrorName, &ZDICT_getErrorName,
/* zdict.h: advanced functions */ /* zdict.h: advanced functions */
&ZDICT_trainFromBuffer_advanced, &ZDICT_trainFromBuffer_cover,
&ZDICT_optimizeTrainFromBuffer_cover,
&ZDICT_finalizeDictionary,
&ZDICT_trainFromBuffer_legacy,
&ZDICT_addEntropyTablesFromBuffer, &ZDICT_addEntropyTablesFromBuffer,
NULL, NULL,
}; };