Add COVER to the zstd cli

This commit is contained in:
Nick Terrell 2016-12-31 21:08:24 -08:00
parent 9103ed6c8b
commit df8415c502
3 changed files with 114 additions and 15 deletions

View File

@ -42,6 +42,7 @@
#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
#define NOISELENGTH 32
@ -118,10 +119,36 @@ static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
fileSizes[n] = fileSize;
fclose(f);
} }
DISPLAYLEVEL(2, "\r%79s\r", "");
*bufferSizePtr = pos;
return n;
}
#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
static const U32 prime1 = 2654435761U;
static const U32 prime2 = 2246822519U;
U32 rand32 = *src;
rand32 *= prime1;
rand32 ^= prime2;
rand32 = DiB_rotl32(rand32, 13);
*src = rand32;
return rand32 >> 5;
}
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
/* Initialize the pseudorandom number generator */
U32 seed = 0xFD2FB528;
unsigned i;
for (i = nbFiles - 1; i > 0; --i) {
unsigned const j = DiB_rand(&seed) % (i + 1);
const char* tmp = fileNamesTable[j];
fileNamesTable[j] = fileNamesTable[i];
fileNamesTable[i] = tmp;
}
}
/*-********************************************************
* Dictionary training functions
@ -202,7 +229,8 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t params)
ZDICT_params_t *params, COVER_params_t *coverParams,
int optimizeCover)
{
void* const dictBuffer = malloc(maxDictSize);
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
@ -213,8 +241,10 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
int result = 0;
/* Checks */
if (params) g_displayLevel = params->notificationLevel;
else if (coverParams) g_displayLevel = coverParams->notificationLevel;
else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
g_displayLevel = params.notificationLevel;
if (g_tooLargeSamples) {
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
@ -233,12 +263,31 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
/* Load input buffer */
DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
{ size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
params);
{
size_t dictSize;
if (params) {
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
*params);
} else if (optimizeCover) {
dictSize = COVER_optimizeTrainFromBuffer(
dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
coverParams);
if (!ZDICT_isError(dictSize)) {
DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n",
coverParams->smoothing, coverParams->kMin,
coverParams->kStep, coverParams->kMax, coverParams->d);
}
} else {
dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles,
*coverParams);
}
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
result = 1;

View File

@ -32,7 +32,7 @@
*/
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t parameters);
ZDICT_params_t *params, COVER_params_t *coverParams,
int optimizeCover);
#endif

View File

@ -127,6 +127,8 @@ static int usage_advanced(const char* programName)
DISPLAY( "\n");
DISPLAY( "Dictionary builder :\n");
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
@ -192,6 +194,29 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
}
#ifndef ZSTD_NODICT
/**
* parseCoverParameters() :
* reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params
* @return 1 means that cover parameters were correct
* @return 0 in case of malformed parameters
*/
static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
{
memset(params, 0, sizeof(*params));
for (; ;) {
if (longCommandWArg(&stringPtr, "smoothing=")) { params->smoothing = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "k=") || longCommandWArg(&stringPtr, "kMin=") || longCommandWArg(&stringPtr, "kmin=")) { params->kMin = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "kStep=") || longCommandWArg(&stringPtr, "kstep=")) { params->kStep = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "kMax=") || longCommandWArg(&stringPtr, "kmax=")) { params->kMax = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
return 0;
}
if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", params->smoothing, params->kMin, params->kStep, params->kMax, params->d);
return 1;
}
#endif
/** parseCompressionParameters() :
* reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
* @return 1 means that compression parameters were correct
@ -254,6 +279,10 @@ int main(int argCount, const char* argv[])
char* fileNamesBuf = NULL;
unsigned fileNamesNb;
#endif
#ifndef ZSTD_NODICT
COVER_params_t coverParams;
int cover = 0;
#endif
/* init */
(void)recursive; (void)cLevelLast; /* not used when ZSTD_NOBENCH set */
@ -318,6 +347,20 @@ int main(int argCount, const char* argv[])
if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
/* long commands with arguments */
#ifndef ZSTD_NODICT
if (longCommandWArg(&argument, "--cover=")) {
cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
continue;
}
if (longCommandWArg(&argument, "--optimize-cover")) {
cover=2;
/* Allow optional arguments following an = */
if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
continue;
}
#endif
if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
@ -520,13 +563,20 @@ int main(int argCount, const char* argv[])
/* Check if dictionary builder is selected */
if (operation==zom_train) {
#ifndef ZSTD_NODICT
ZDICT_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
if (cover) {
coverParams.compressionLevel = dictCLevel;
coverParams.notificationLevel = displayLevel;
coverParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
} else {
ZDICT_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
}
#endif
goto _end;
}