Add COVER to the zstd cli
This commit is contained in:
parent
9103ed6c8b
commit
df8415c502
@ -42,6 +42,7 @@
|
||||
|
||||
#define SAMPLESIZE_MAX (128 KB)
|
||||
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
|
||||
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
|
||||
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
|
||||
|
||||
#define NOISELENGTH 32
|
||||
@ -118,10 +119,36 @@ static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
|
||||
fileSizes[n] = fileSize;
|
||||
fclose(f);
|
||||
} }
|
||||
DISPLAYLEVEL(2, "\r%79s\r", "");
|
||||
*bufferSizePtr = pos;
|
||||
return n;
|
||||
}
|
||||
|
||||
#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
|
||||
static U32 DiB_rand(U32* src)
|
||||
{
|
||||
static const U32 prime1 = 2654435761U;
|
||||
static const U32 prime2 = 2246822519U;
|
||||
U32 rand32 = *src;
|
||||
rand32 *= prime1;
|
||||
rand32 ^= prime2;
|
||||
rand32 = DiB_rotl32(rand32, 13);
|
||||
*src = rand32;
|
||||
return rand32 >> 5;
|
||||
}
|
||||
|
||||
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
|
||||
/* Initialize the pseudorandom number generator */
|
||||
U32 seed = 0xFD2FB528;
|
||||
unsigned i;
|
||||
for (i = nbFiles - 1; i > 0; --i) {
|
||||
unsigned const j = DiB_rand(&seed) % (i + 1);
|
||||
const char* tmp = fileNamesTable[j];
|
||||
fileNamesTable[j] = fileNamesTable[i];
|
||||
fileNamesTable[i] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*-********************************************************
|
||||
* Dictionary training functions
|
||||
@ -202,7 +229,8 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
|
||||
|
||||
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||
const char** fileNamesTable, unsigned nbFiles,
|
||||
ZDICT_params_t params)
|
||||
ZDICT_params_t *params, COVER_params_t *coverParams,
|
||||
int optimizeCover)
|
||||
{
|
||||
void* const dictBuffer = malloc(maxDictSize);
|
||||
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||
@ -213,8 +241,10 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||
int result = 0;
|
||||
|
||||
/* Checks */
|
||||
if (params) g_displayLevel = params->notificationLevel;
|
||||
else if (coverParams) g_displayLevel = coverParams->notificationLevel;
|
||||
else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
|
||||
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||
g_displayLevel = params.notificationLevel;
|
||||
if (g_tooLargeSamples) {
|
||||
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
|
||||
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
|
||||
@ -233,12 +263,31 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
||||
|
||||
/* Load input buffer */
|
||||
DISPLAYLEVEL(3, "Shuffling input files\n");
|
||||
DiB_shuffle(fileNamesTable, nbFiles);
|
||||
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||
|
||||
{ size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
params);
|
||||
{
|
||||
size_t dictSize;
|
||||
if (params) {
|
||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
*params);
|
||||
} else if (optimizeCover) {
|
||||
dictSize = COVER_optimizeTrainFromBuffer(
|
||||
dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
|
||||
coverParams);
|
||||
if (!ZDICT_isError(dictSize)) {
|
||||
DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n",
|
||||
coverParams->smoothing, coverParams->kMin,
|
||||
coverParams->kStep, coverParams->kMax, coverParams->d);
|
||||
}
|
||||
} else {
|
||||
dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
*coverParams);
|
||||
}
|
||||
if (ZDICT_isError(dictSize)) {
|
||||
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
|
||||
result = 1;
|
||||
|
@ -32,7 +32,7 @@
|
||||
*/
|
||||
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||
const char** fileNamesTable, unsigned nbFiles,
|
||||
ZDICT_params_t parameters);
|
||||
|
||||
ZDICT_params_t *params, COVER_params_t *coverParams,
|
||||
int optimizeCover);
|
||||
|
||||
#endif
|
||||
|
@ -127,6 +127,8 @@ static int usage_advanced(const char* programName)
|
||||
DISPLAY( "\n");
|
||||
DISPLAY( "Dictionary builder :\n");
|
||||
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
|
||||
DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
|
||||
DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
|
||||
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
|
||||
DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
|
||||
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
|
||||
@ -192,6 +194,29 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
|
||||
}
|
||||
|
||||
|
||||
#ifndef ZSTD_NODICT
|
||||
/**
|
||||
* parseCoverParameters() :
|
||||
* reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params
|
||||
* @return 1 means that cover parameters were correct
|
||||
* @return 0 in case of malformed parameters
|
||||
*/
|
||||
static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
|
||||
{
|
||||
memset(params, 0, sizeof(*params));
|
||||
for (; ;) {
|
||||
if (longCommandWArg(&stringPtr, "smoothing=")) { params->smoothing = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "k=") || longCommandWArg(&stringPtr, "kMin=") || longCommandWArg(&stringPtr, "kmin=")) { params->kMin = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "kStep=") || longCommandWArg(&stringPtr, "kstep=")) { params->kStep = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "kMax=") || longCommandWArg(&stringPtr, "kmax=")) { params->kMax = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
return 0;
|
||||
}
|
||||
if (stringPtr[0] != 0) return 0;
|
||||
DISPLAYLEVEL(4, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", params->smoothing, params->kMin, params->kStep, params->kMax, params->d);
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
/** parseCompressionParameters() :
|
||||
* reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
|
||||
* @return 1 means that compression parameters were correct
|
||||
@ -254,6 +279,10 @@ int main(int argCount, const char* argv[])
|
||||
char* fileNamesBuf = NULL;
|
||||
unsigned fileNamesNb;
|
||||
#endif
|
||||
#ifndef ZSTD_NODICT
|
||||
COVER_params_t coverParams;
|
||||
int cover = 0;
|
||||
#endif
|
||||
|
||||
/* init */
|
||||
(void)recursive; (void)cLevelLast; /* not used when ZSTD_NOBENCH set */
|
||||
@ -318,6 +347,20 @@ int main(int argCount, const char* argv[])
|
||||
if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
|
||||
|
||||
/* long commands with arguments */
|
||||
#ifndef ZSTD_NODICT
|
||||
if (longCommandWArg(&argument, "--cover=")) {
|
||||
cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
|
||||
continue;
|
||||
}
|
||||
if (longCommandWArg(&argument, "--optimize-cover")) {
|
||||
cover=2;
|
||||
/* Allow optional arguments following an = */
|
||||
if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
|
||||
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
|
||||
else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
|
||||
if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
|
||||
if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
|
||||
@ -520,13 +563,20 @@ int main(int argCount, const char* argv[])
|
||||
/* Check if dictionary builder is selected */
|
||||
if (operation==zom_train) {
|
||||
#ifndef ZSTD_NODICT
|
||||
ZDICT_params_t dictParams;
|
||||
memset(&dictParams, 0, sizeof(dictParams));
|
||||
dictParams.compressionLevel = dictCLevel;
|
||||
dictParams.selectivityLevel = dictSelect;
|
||||
dictParams.notificationLevel = displayLevel;
|
||||
dictParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
|
||||
if (cover) {
|
||||
coverParams.compressionLevel = dictCLevel;
|
||||
coverParams.notificationLevel = displayLevel;
|
||||
coverParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
|
||||
} else {
|
||||
ZDICT_params_t dictParams;
|
||||
memset(&dictParams, 0, sizeof(dictParams));
|
||||
dictParams.compressionLevel = dictCLevel;
|
||||
dictParams.selectivityLevel = dictSelect;
|
||||
dictParams.notificationLevel = displayLevel;
|
||||
dictParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
|
||||
}
|
||||
#endif
|
||||
goto _end;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user