Add split=# to cli

This commit is contained in:
Jennifer Liu 2018-06-29 17:54:41 -07:00
parent 52fbbbcb6b
commit 348e5f77a9
3 changed files with 15 additions and 6 deletions

View File

@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
/* Check if there's training sample */
if (nbTrainSamples < 1) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
return 0;
}
/* Check if there's testing sample when splitPoint is nonzero */
if (nbTestSamples < 1 && splitPoint < 1.0) {
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
return 0;
}
if (nbTrainSamples + nbTestSamples != nbSamples) {
DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
return 0;
}
/* Zero the context */

View File

@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
This compares favorably to 4 bytes default.
However, it's up to the dictionary manager to not assign twice the same ID to
2 different dictionaries.
* `--train-cover[=k#,d=#,steps=#]`:
* `--train-cover[=k#,d=#,steps=#,split=#]`:
Select parameters for the default dictionary builder algorithm named cover.
If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
If _steps_ is not specified, then the default value of 40 is used.
If _split_ is not specified, then the default value of 80 is used.
Requires that _d_ <= _k_.
Selects segments of size _k_ with highest score to put in the dictionary.
@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.
`zstd --train-cover=k=50 FILEs`
`zstd --train-cover=k=50,split=60 FILEs`
* `--train-legacy[=selectivity=#]`:
Use legacy dictionary builder algorithm with the given dictionary
_selectivity_ (default: 9).

View File

@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
{
memset(params, 0, sizeof(*params));
unsigned splitPercentage = 100;
for (; ;) {
if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "split=")) {
splitPercentage = readU32FromChar(&stringPtr);
params->splitPoint = (double)splitPercentage / 100.0;
if (stringPtr[0]==',') { stringPtr++; continue; } else break;
}
return 0;
}
if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%d\n", params->k, params->d, params->steps, splitPercentage);
return 1;
}