[cover] Improvements for small or homogeneous data
* The algorithm would bail as soon as it found one epoch that contained no new segments. Change it so it now has to fail >= 10 times in a row (10 for fastcover, 10-100 for cover). * The algorithm uses the `maxDict` size to decide the epoch size. When this size is absurdly large, it causes tiny epochs. Lower bound the epoch size at 10x the segment size, and warn the user that their training set is too small. Fixes #1554
This commit is contained in:
parent
0c7668cd06
commit
d0f5ba36fb
@ -627,6 +627,38 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Warns the user when the training corpus is small relative to the requested
 * maximum dictionary size. A source/dictionary ratio of at least 10x is
 * recommended; below that the resulting dictionary may be subpar.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers)
{
  const double ratio = (double)nbDmers / maxDictSize;
  /* Only warn when the corpus falls below the recommended 10x ratio. */
  if (ratio < 10) {
    DISPLAYLEVEL(1, "WARNING: The maximum dictionary size %u is too large "
                    "compared to the source size %u! "
                    "size(source)/size(dictionary) = %f, but it should be >= "
                    "10! This may lead to a subpar dictionary! We recommend "
                    "training on sources at least 10x, and up to 100x the "
                    "size of the dictionary!\n", (U32)maxDictSize,
                 (U32)nbDmers, ratio);
  }
}
|
||||
|
||||
/**
 * Computes the number of epochs and the size of each epoch.
 * Each epoch is guaranteed at least 10 * k dmers, so an absurdly large
 * maxDictSize cannot produce tiny epochs.
 */
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 minEpochSize = k * 10; /* lower bound: 10 segments per epoch */
  COVER_epoch_info_t epochs;
  /* Target `passes` passes over the corpus, one segment per epoch. */
  epochs.num = MAX(1, maxDictSize / k / passes);
  epochs.size = nbDmers / epochs.num;
  if (epochs.size < minEpochSize) {
    /* Epochs came out too small: clamp the size up and recompute the count. */
    epochs.size = MIN(minEpochSize, nbDmers);
    epochs.num = nbDmers / epochs.size;
  }
  assert(epochs.size * epochs.num <= nbDmers);
  return epochs;
}
|
||||
|
||||
/**
|
||||
* Given the prepared context build the dictionary.
|
||||
*/
|
||||
@ -636,28 +668,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
|
||||
ZDICT_cover_params_t parameters) {
|
||||
BYTE *const dict = (BYTE *)dictBuffer;
|
||||
size_t tail = dictBufferCapacity;
|
||||
/* Divide the data up into epochs of equal size.
|
||||
* We will select at least one segment from each epoch.
|
||||
*/
|
||||
const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
|
||||
const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
|
||||
/* Divide the data into epochs. We will select one segment from each epoch. */
|
||||
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
||||
(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
|
||||
const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
|
||||
size_t zeroScoreRun = 0;
|
||||
size_t epoch;
|
||||
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
||||
epochs, epochSize);
|
||||
(U32)epochs.num, (U32)epochs.size);
|
||||
/* Loop through the epochs until there are no more segments or the dictionary
|
||||
* is full.
|
||||
*/
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
||||
const U32 epochBegin = (U32)(epoch * epochSize);
|
||||
const U32 epochEnd = epochBegin + epochSize;
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
||||
const U32 epochBegin = (U32)(epoch * epochs.size);
|
||||
const U32 epochEnd = epochBegin + epochs.size;
|
||||
size_t segmentSize;
|
||||
/* Select a segment */
|
||||
COVER_segment_t segment = COVER_selectSegment(
|
||||
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
|
||||
/* If the segment covers no dmers, then we are out of content */
|
||||
/* If the segment covers no dmers, then we are out of content.
|
||||
* There may be new content in other epochs, so continue for some time.
|
||||
*/
|
||||
if (segment.score == 0) {
|
||||
break;
|
||||
if (++zeroScoreRun >= maxZeroScoreRun) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
zeroScoreRun = 0;
|
||||
/* Trim the segment if necessary and if it is too small then we are done */
|
||||
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
||||
if (segmentSize < parameters.d) {
|
||||
@ -706,6 +744,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
||||
parameters.d, parameters.splitPoint)) {
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
|
||||
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
||||
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
||||
COVER_ctx_destroy(&ctx);
|
||||
@ -977,6 +1016,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
unsigned k;
|
||||
COVER_best_t best;
|
||||
POOL_ctx *pool = NULL;
|
||||
int warned = 0;
|
||||
|
||||
/* Checks */
|
||||
if (splitPoint <= 0 || splitPoint > 1) {
|
||||
@ -1019,6 +1059,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
POOL_free(pool);
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
if (!warned) {
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
|
||||
warned = 1;
|
||||
}
|
||||
/* Loop through k reusing the same context */
|
||||
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
||||
/* Prepare the arguments */
|
||||
|
@ -38,6 +38,35 @@ typedef struct {
|
||||
U32 score;
|
||||
} COVER_segment_t;
|
||||
|
||||
/**
|
||||
* Number of epochs and size of each epoch.
|
||||
*/
|
||||
typedef struct {
|
||||
U32 num;
|
||||
U32 size;
|
||||
} COVER_epoch_info_t;
|
||||
|
||||
/**
|
||||
* Computes the number of epochs and the size of each epoch.
|
||||
* We will make sure that each epoch gets at least 10 * k bytes.
|
||||
*
|
||||
* The COVER algorithms divide the data up into epochs of equal size and
|
||||
* select one segment from each epoch.
|
||||
*
|
||||
* @param maxDictSize The maximum allowed dictionary size.
|
||||
* @param nbDmers The number of dmers we are training on.
|
||||
* @param k The parameter k (segment size).
|
||||
* @param passes The target number of passes over the dmer corpus.
|
||||
* More passes means a better dictionary.
|
||||
*/
|
||||
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
|
||||
U32 k, U32 passes);
|
||||
|
||||
/**
|
||||
* Warns the user when their corpus is too small.
|
||||
*/
|
||||
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers);
|
||||
|
||||
/**
|
||||
* Checks total compressed size of a dictionary
|
||||
*/
|
||||
|
@ -386,29 +386,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
||||
{
|
||||
BYTE *const dict = (BYTE *)dictBuffer;
|
||||
size_t tail = dictBufferCapacity;
|
||||
/* Divide the data up into epochs of equal size.
|
||||
* We will select at least one segment from each epoch.
|
||||
*/
|
||||
const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
|
||||
const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
|
||||
/* Divide the data into epochs. We will select one segment from each epoch. */
|
||||
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
||||
(U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
|
||||
const size_t maxZeroScoreRun = 10;
|
||||
size_t zeroScoreRun = 0;
|
||||
size_t epoch;
|
||||
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
||||
epochs, epochSize);
|
||||
(U32)epochs.num, (U32)epochs.size);
|
||||
/* Loop through the epochs until there are no more segments or the dictionary
|
||||
* is full.
|
||||
*/
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
||||
const U32 epochBegin = (U32)(epoch * epochSize);
|
||||
const U32 epochEnd = epochBegin + epochSize;
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
||||
const U32 epochBegin = (U32)(epoch * epochs.size);
|
||||
const U32 epochEnd = epochBegin + epochs.size;
|
||||
size_t segmentSize;
|
||||
/* Select a segment */
|
||||
COVER_segment_t segment = FASTCOVER_selectSegment(
|
||||
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
|
||||
|
||||
/* If the segment covers no dmers, then we are out of content */
|
||||
/* If the segment covers no dmers, then we are out of content.
|
||||
* There may be new content in other epochs, so continue for some time.
|
||||
*/
|
||||
if (segment.score == 0) {
|
||||
break;
|
||||
if (++zeroScoreRun >= maxZeroScoreRun) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
zeroScoreRun = 0;
|
||||
|
||||
/* Trim the segment if necessary and if it is too small then we are done */
|
||||
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
||||
@ -564,6 +570,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
||||
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
|
||||
/* Build the dictionary */
|
||||
DISPLAYLEVEL(2, "Building dictionary\n");
|
||||
{
|
||||
@ -616,6 +623,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
unsigned k;
|
||||
COVER_best_t best;
|
||||
POOL_ctx *pool = NULL;
|
||||
int warned = 0;
|
||||
/* Checks */
|
||||
if (splitPoint <= 0 || splitPoint > 1) {
|
||||
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
||||
@ -664,6 +672,10 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
POOL_free(pool);
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
if (!warned) {
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
|
||||
warned = 1;
|
||||
}
|
||||
/* Loop through k reusing the same context */
|
||||
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
||||
/* Prepare the arguments */
|
||||
|
Loading…
Reference in New Issue
Block a user