Merge pull request #509 from terrelln/dict-builder-32

Handle cover dictionary builder maximum input size for 32-bit mode
2017-01-10 06:30:28 +01:00 · 2017-01-10 06:30:28 +01:00 · b8cdc16969
commit b8cdc16969
parent 56958500fc 8d984699db
3 changed files with 9 additions and 4 deletions
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -28,7 +28,7 @@
 /*-*************************************
 *  Constants
 ***************************************/
-#define COVER_MAX_SAMPLES_SIZE ((U32)-1)
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))

 /*-*************************************
 *  Console display
@@ -500,7 +500,9 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Checks */
  if (totalSamplesSize < d ||
-      totalSamplesSize > (size_t)COVER_MAX_SAMPLES_SIZE) {
+      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
+                 (COVER_MAX_SAMPLES_SIZE >> 20));
    return 0;
  }
  /* Zero the context */
@@ -518,6 +520,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return 0;
  }
@@ -651,7 +654,6 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
  /* Initialize context and activeDmers */
  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                      parameters.d)) {
-    DISPLAYLEVEL(1, "Failed to initialize context\n");
    return ERROR(GENERIC);
  }
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -108,6 +108,7 @@ typedef struct {
    The resulting dictionary will be saved into `dictBuffer`.
    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
              or an error code, which can be tested with ZDICT_isError().
+    Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte.
    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
           In general, it's recommended to provide a few thousands samples, but this can vary a lot.
@@ -131,6 +132,7 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
              or an error code, which can be tested with ZDICT_isError().
              On success `*parameters` contains the parameters selected.
+    Note : COVER_optimizeTrainFromBuffer() requires about 9 bytes of memory for each input byte.
 */
 ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                                     const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -235,7 +235,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
    void* const dictBuffer = malloc(maxDictSize);
    size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
    unsigned long long const totalSizeToLoad = DiB_getTotalCappedFileSize(fileNamesTable, nbFiles);
-    size_t const maxMem =  DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
+    size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
+    size_t const maxMem =  DiB_findMaxMem(totalSizeToLoad * memMult) / memMult;
    size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad);
    void* const srcBuffer = malloc(benchedSize+NOISELENGTH);
    int result = 0;