Add split=# to cli

This commit is contained in:
Jennifer Liu 2018-06-29 17:54:41 -07:00
parent 52fbbbcb6b
commit 348e5f77a9
3 changed files with 15 additions and 6 deletions

View File

@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
/* Check if there's training sample */ /* Check if there's training sample */
if (nbTrainSamples < 1) { if (nbTrainSamples < 1) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
return 0; return 0;
} }
/* Check if there's testing sample when splitPoint is less than 1.0 */ /* Check if there's testing sample when splitPoint is less than 1.0 */
if (nbTestSamples < 1 && splitPoint < 1.0) { if (nbTestSamples < 1 && splitPoint < 1.0) {
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); return 0;
DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); }
if (nbTrainSamples + nbTestSamples != nbSamples) {
DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
return 0; return 0;
} }
/* Zero the context */ /* Zero the context */

View File

@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
This compares favorably to 4 bytes default. This compares favorably to 4 bytes default.
However, it's up to the dictionary manager to not assign twice the same ID to However, it's up to the dictionary manager to not assign twice the same ID to
2 different dictionaries. 2 different dictionaries.
* `--train-cover[=k=#,d=#,steps=#]`: * `--train-cover[=k=#,d=#,steps=#,split=#]`:
Select parameters for the default dictionary builder algorithm named cover. Select parameters for the default dictionary builder algorithm named cover.
If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
If _k_ is not specified, then it tries _steps_ values in the range [50, 2000]. If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
If _steps_ is not specified, then the default value of 40 is used. If _steps_ is not specified, then the default value of 40 is used.
If _split_ is not specified, then the default value of 80 is used.
Requires that _d_ <= _k_. Requires that _d_ <= _k_.
Selects segments of size _k_ with highest score to put in the dictionary. Selects segments of size _k_ with highest score to put in the dictionary.
@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.
`zstd --train-cover=k=50 FILEs` `zstd --train-cover=k=50 FILEs`
`zstd --train-cover=k=50,split=60 FILEs`
* `--train-legacy[=selectivity=#]`: * `--train-legacy[=selectivity=#]`:
Use legacy dictionary builder algorithm with the given dictionary Use legacy dictionary builder algorithm with the given dictionary
_selectivity_ (default: 9). _selectivity_ (default: 9).

View File

@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
/**
 * parseCoverParameters() :
 * Parses the comma-separated argument list given to `--train-cover=`
 * and stores the result in `*params`.
 *
 * Recognized keys are `k=`, `d=`, `steps=` and `split=`; each value is read
 * with readU32FromChar(). `split=` is given as a percentage and is stored in
 * `params->splitPoint` as a fraction (percentage / 100.0).
 *
 * `*params` is zeroed first, so any key that is not supplied is left at 0.
 * NOTE: when `split=` is absent, `params->splitPoint` stays 0 while the
 * verbose trace below still reports the local default of 100.
 *
 * @param stringPtr  text following `--train-cover=` (NUL-terminated)
 * @param params     destination, fully overwritten
 * @return 1 if the whole string was consumed by known keys, 0 otherwise
 *         (unknown key, or trailing characters after the last value).
 */
static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
{
    /* Declared before any statement: keeps the function valid C90
     * (no mixed declarations and code). */
    unsigned splitPercentage = 100;
    memset(params, 0, sizeof(*params));
    for (; ;) {
        if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "split=")) {
            splitPercentage = readU32FromChar(&stringPtr);
            /* CLI takes a percentage; the parameter struct holds a fraction. */
            params->splitPoint = (double)splitPercentage / 100.0;
            if (stringPtr[0]==',') { stringPtr++; continue; } else break;
        }
        /* No known key matched at the current position. */
        return 0;
    }
    /* A value must be followed by ',' (handled above) or end of string. */
    if (stringPtr[0] != 0) return 0;
    /* splitPercentage is unsigned, so it is printed with %u. */
    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%u\n", params->k, params->d, params->steps, splitPercentage);
    return 1;
}