Add split=# to cli
This commit is contained in:
parent
52fbbbcb6b
commit
348e5f77a9
@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
||||
/* Check if there's training sample */
|
||||
if (nbTrainSamples < 1) {
|
||||
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
|
||||
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
|
||||
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
|
||||
return 0;
|
||||
}
|
||||
/* Check that there is at least one testing sample when splitPoint is below 1.0 */
|
||||
if (nbTestSamples < 1 && splitPoint < 1.0) {
|
||||
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
|
||||
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
|
||||
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
|
||||
return 0;
|
||||
}
|
||||
if (nbTrainSamples + nbTestSamples != nbSamples) {
|
||||
DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
|
||||
return 0;
|
||||
}
|
||||
/* Zero the context */
|
||||
|
@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
|
||||
This compares favorably to 4 bytes default.
|
||||
However, it's up to the dictionary manager to avoid assigning the same ID to
|
||||
2 different dictionaries.
|
||||
* `--train-cover[=k#,d=#,steps=#]`:
|
||||
* `--train-cover[=k=#,d=#,steps=#,split=#]`:
|
||||
Select parameters for the default dictionary builder algorithm named cover.
|
||||
If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
|
||||
If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
|
||||
If _steps_ is not specified, then the default value of 40 is used.
|
||||
If _split_ is not specified, then the default value of 80 is used. _split_ is given as a percentage (for example, `split=60`).
|
||||
Requires that _d_ <= _k_.
|
||||
|
||||
Selects segments of size _k_ with highest score to put in the dictionary.
|
||||
@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.
|
||||
|
||||
`zstd --train-cover=k=50 FILEs`
|
||||
|
||||
`zstd --train-cover=k=50,split=60 FILEs`
|
||||
|
||||
* `--train-legacy[=selectivity=#]`:
|
||||
Use legacy dictionary builder algorithm with the given dictionary
|
||||
_selectivity_ (default: 9).
|
||||
|
@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
|
||||
/*! parseCoverParameters() :
 *  Reads the comma-separated `name=value` list given to `--train-cover=`
 *  and stores the recognized values into `params`.
 *  Recognized names : `k=`, `d=`, `steps=` and `split=`.
 *  `split=` is read as a percentage and stored into params->splitPoint
 *  as percentage / 100.
 *  `params` is zeroed first, so any field not named in the string stays at 0.
 * @return : 1 if the whole string was consumed by recognized parameters,
 *           0 if an unrecognized name or trailing characters were found
 *           (`params` may then be partially filled).
 */
static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
{
    /* Declared ahead of the first statement so the function stays valid C90
     * (no declaration-after-statement). */
    unsigned splitPercentage = 100;
    memset(params, 0, sizeof(*params));
    for (; ;) {
        if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "split=")) {
            splitPercentage = readU32FromChar(&stringPtr);
            /* Convert the percentage into the fractional split point. */
            params->splitPoint = (double)splitPercentage / 100.0;
            if (stringPtr[0]==',') { stringPtr++; continue; } else break;
        }
        return 0;   /* none of the known parameter names matched */
    }
    if (stringPtr[0] != 0) return 0;   /* characters left over after the last value */
    /* NOTE(review): when `split=` is absent this prints 100 while
     * params->splitPoint is still 0 from the memset above - confirm which
     * default is intended to be reported. */
    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%u\n", params->k, params->d, params->steps, splitPercentage);
    return 1;
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user