Add split=# to cli

2018-06-29 17:54:41 -07:00 · 2018-06-29 17:54:41 -07:00 · 348e5f77a9
commit 348e5f77a9
parent 52fbbbcb6b
3 changed files with 15 additions and 6 deletions
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  /* Check if there's training sample */
  if (nbTrainSamples < 1) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
    return 0;
  }
  /* Check if there's testing sample when splitPoint is nonzero */
  if (nbTestSamples < 1 && splitPoint < 1.0) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
+    return 0;
+  }
+  if (nbTrainSamples + nbTestSamples != nbSamples) {
+    DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
    return 0;
  }
  /* Zero the context */
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
    This compares favorably to 4 bytes default.
    However, it's up to the dictionary manager to not assign twice the same ID to
    2 different dictionaries.
-* `--train-cover[=k#,d=#,steps=#]`:
+* `--train-cover[=k=#,d=#,steps=#,split=#]`:
    Select parameters for the default dictionary builder algorithm named cover.
    If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
    If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
    If _steps_ is not specified, then the default value of 40 is used.
+    If _split_ is not specified, then the default value of 80 is used.
    Requires that _d_ <= _k_.

    Selects segments of size _k_ with highest score to put in the dictionary.
@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.

    `zstd --train-cover=k=50 FILEs`

+    `zstd --train-cover=k=50,split=60 FILEs`
+
 * `--train-legacy[=selectivity=#]`:
    Use legacy dictionary builder algorithm with the given dictionary
    _selectivity_ (default: 9).
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
 {
+    /* Reads a comma-separated list of `k=#`, `d=#`, `steps=#`, `split=#` into *params.
+     * Returns 1 when the whole string was consumed; returns 0 when the next token
+     * matches none of the keys or when text remains after the last value.
+     * The local is declared ahead of memset() to avoid a declaration after a statement. */
+    unsigned splitPercentage = 100;  /* value shown in the level-4 trace when split= is absent */
    memset(params, 0, sizeof(*params));
    for (; ;) {
        if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          /* split is given as a percentage; params->splitPoint holds it as a fraction */
+          splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
        return 0;
    }
    if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%u\n", params->k, params->d, params->steps, splitPercentage);
    return 1;
 }