From 6d222c437ca1f3b7420f414354643d3f11ba075a Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Thu, 12 Jul 2018 17:56:58 -0700
Subject: [PATCH 01/35] Set requestedParams in ZSTD_initCStream*()

The parameters passed to `ZSTD_initCStream*()` are honored only until the
stream is reset: once `ZSTD_resetCStream()` is called, the default parameters
(level 3) are used instead. Fix this by setting `requestedParams` in the
`ZSTD_initCStream*()` functions.

The added tests both fail before this patch and pass after.
---
 lib/compress/zstd_compress.c | 22 ++++++++++------------
 tests/zstreamtest.c          | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index c6686252..d659baf1 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -3332,9 +3332,11 @@ size_t ZSTD_CStreamOutSize(void)
 static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
                     const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
                     const ZSTD_CDict* const cdict,
-                    ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize)
+                    ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_resetCStream_internal");
+    /* Finalize the compression parameters */
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     /* params are supposed to be fully validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
@@ -3363,7 +3365,6 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
     DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize);
     if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
     params.fParams.contentSizeFlag = 1;
-    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, 0);
     return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
 }
 
@@ -3376,6 +3377,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
                     ZSTD_CCtx_params params, unsigned long long pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_initCStream_internal");
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
@@ -3442,25 +3444,21 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                 (U32)pledgedSrcSize, params.fParams.contentSizeFlag);
     CHECK_F( ZSTD_checkCParams(params.cParams) );
     if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */
-    {   ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-        return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize);
-    }
+    zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    ZSTD_CCtx_params const cctxParams =
-            ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
 {
     U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;  /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
-    ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index 22c49cb3..3d61394a 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -969,6 +969,26 @@ static int basicUnitTests(U32 seed, double compressibility)
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_srcSize sets requestedParams : ", testNb++);
+    {   unsigned level;
+        CHECK_Z(ZSTD_initCStream_srcSize(zc, 11, ZSTD_CONTENTSIZE_UNKNOWN));
+        CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level));
+        CHECK(level != 11, "Compression level does not match");
+        ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN);
+        CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level));
+        CHECK(level != 11, "Compression level does not match");
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced sets requestedParams : ", testNb++);
+    {   ZSTD_parameters const params = ZSTD_getParams(9, 0, 0);
+        CHECK_Z(ZSTD_initCStream_advanced(zc, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN));
+        CHECK(badParameters(zc, params), "Compression parameters do not match");
+        ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN);
+        CHECK(badParameters(zc, params), "Compression parameters do not match");
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     /* Overlen overwriting window data bug */
     DISPLAYLEVEL(3, "test%3i : wildcopy doesn't overwrite potential match data : ", testNb++);
     {   /* This test has a window size of 1024 bytes and consists of 3 blocks:

From a23a3b95f9c00ecf52216bd7fe768e41eac4e269 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 16:05:14 -0700
Subject: [PATCH 02/35] Add random dictionary builder

---
 contrib/randomDictBuilder/Makefile  |  48 +++
 contrib/randomDictBuilder/README.md |  13 +
 contrib/randomDictBuilder/main.c    | 125 ++++++++
 contrib/randomDictBuilder/random.c  | 455 ++++++++++++++++++++++++++++
 contrib/randomDictBuilder/random.h  |  53 ++++
 contrib/randomDictBuilder/test.sh   |  14 +
 6 files changed, 708 insertions(+)
 create mode 100644 contrib/randomDictBuilder/Makefile
 create mode 100644 contrib/randomDictBuilder/README.md
 create mode 100644 contrib/randomDictBuilder/main.c
 create mode 100644 contrib/randomDictBuilder/random.c
 create mode 100644 contrib/randomDictBuilder/random.h
 create mode 100644 contrib/randomDictBuilder/test.sh

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
new file mode 100644
index 00000000..a2aade23
--- /dev/null
+++ b/contrib/randomDictBuilder/Makefile
@@ -0,0 +1,48 @@
+PROGRAM_FILES := ../../programs/fileio.c
+
+TEST_INPUT := ../../lib
+TEST_OUTPUT := randomDict
+ARG :=
+
+all: main testrun test clean
+
+run: main rand clean
+
+.PHONY: rand
+rand:
+	echo "Building a random dictionary with given arguments"
+	./main $(ARG)
+
+
+main: random.o main.o libzstd.a
+	gcc random.o main.o libzstd.a -o main
+
+main.o: main.c random.h
+	gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+# -I expects a directory; random.h sits next to the sources, so quoted includes find it without any flag
+random.o: $(PROGRAM_FILES) random.c random.h
+	gcc -c $(PROGRAM_FILES) random.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+
+libzstd.a:
+	$(MAKE) -C ../../lib libzstd.a
+	mv ../../lib/libzstd.a .
+
+.PHONY: testrun
+testrun: main
+	echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) "
+	./main in=$(TEST_INPUT) out=$(TEST_OUTPUT)
+	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
+	rm -f $(TEST_OUTPUT)
+
+.PHONY: test
+test: test.sh
+	sh test.sh
+	echo "Finish running test.sh"
+
+.PHONY: clean
+clean:
+	rm -f libzstd.a main
+	rm -f ../../lib/*/*.o
+	rm -f ../../programs/*.o
+	rm -f *.o
+	echo "Cleaning is completed"
diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
new file mode 100644
index 00000000..cadffdf2
--- /dev/null
+++ b/contrib/randomDictBuilder/README.md
@@ -0,0 +1,13 @@
+Random Dictionary Builder
+
+### Permitted Arguments:
+- Input Files (in=fileName): required; file or directory used to build the dictionary. Repeat "in=" to provide several inputs.
+- Output Dictionary (out=dictName): name of the dictionary file; defaults to defaultDict if not provided
+- Dictionary ID (dictID=#): non-negative number; defaults to 0 if not provided
+- Maximum Dictionary Size (maxdict=#): positive number, in bytes; defaults to 110KB if not provided
+- Size of Randomly Selected Segment (k=#): positive number, in bytes; defaults to 200 if not provided
+- Compression Level (c=#): positive number; defaults to 3 if not provided
+
+### Examples:
+make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
+make run ARG="in=../../lib/dictBuilder in=../../lib/compress"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
new file mode 100644
index 00000000..15eb5c44
--- /dev/null
+++ b/contrib/randomDictBuilder/main.c
@@ -0,0 +1,125 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "random.h"
+#include "util.h"
+
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define DEFAULT_CLEVEL 3
+#define DEFAULT_INPUTFILE ""
+#define DEFAULT_k 200
+#define DEFAULT_OUTPUTFILE "defaultDict"
+#define DEFAULT_DICTID 0
+
+
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) exit(1);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) exit(1);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) exit(1);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+
+/** longCommandWArg() :
+ *  tests whether *stringPtr starts with longCommand.
+ *  On a match, moves *stringPtr just past that prefix and @return 1.
+ * @return 0, leaving *stringPtr untouched, when there is no match.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;
+    *stringPtr += prefixLen;
+    return 1;
+}
+
+
+int main(int argCount, const char* argv[])
+{
+  int displayLevel = 2;
+  const char* programName = argv[0];
+  int operationResult = 0;
+
+  unsigned cLevel = DEFAULT_CLEVEL;
+  char* inputFile = DEFAULT_INPUTFILE;
+  unsigned k = DEFAULT_k;
+  char* outputFile = DEFAULT_OUTPUTFILE;
+  unsigned dictID = DEFAULT_DICTID;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "in=")) {
+      inputFile = malloc(strlen(argument) + 1);
+      strcpy(inputFile, argument);
+      filenameTable[filenameIdx] = inputFile;
+      filenameIdx++;
+      continue;
+    }
+    if (longCommandWArg(&argument, "out=")) {
+      outputFile = malloc(strlen(argument) + 1);
+      strcpy(outputFile, argument);
+      continue;
+    }
+    DISPLAYLEVEL(1, "Incorrect parameters\n");
+    operationResult = 1;
+    return operationResult;
+  }
+
+
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0;
+  const char** extendedFileList = NULL;
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
+  if (extendedFileList) {
+      unsigned u;
+      for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+      free((void*)filenameTable);
+      filenameTable = extendedFileList;
+      filenameIdx = fileNamesNb;
+  }
+
+  size_t blockSize = 0;
+
+  ZDICT_random_params_t params;
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = cLevel;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+  params.zParams = zParams;
+  params.k = k;
+
+  operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, &params);
+  return operationResult;
+}
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
new file mode 100644
index 00000000..a59427ba
--- /dev/null
+++ b/contrib/randomDictBuilder/random.c
@@ -0,0 +1,455 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>            /* fprintf */
+#include <stdlib.h>           /* malloc, free, qsort */
+#include <string.h>           /* memset */
+#include <time.h>             /* clock */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "random.h"
+#include "platform.h"         /* Large Files support */
+#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
+
+/*-*************************************
+*  Constants
+***************************************/
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+#define DEFAULT_K 200
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table,
+ *  which holds sstSize entries (the caller sizes it from getFileStats()).
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
+ *  A file that cannot be opened or fully read terminates the program (EXM_THROW).
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
+                              size_t* sampleSizes, unsigned sstSize,
+                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
+                              unsigned displayLevel)
+{
+    char* const buff = (char*)buffer;
+    size_t pos = 0;   /* write position within buff == nb of bytes loaded so far */
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;   /* targetChunkSize==0 : the whole file is a single chunk */
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* a sample never exceeds SAMPLESIZE_MAX */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* not enough room left in buffer : stop with this file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {   /* chunk was capped : skip the unread part to reach the next chunk */
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 getRand(U32* src)   /* advances the generator state at *src and returns a value derived from it */
+{
+    static const U32 multiplier = 2654435761U;
+    static const U32 mask = 2246822519U;
+    U32 const mixed = (*src * multiplier) ^ mask;
+    U32 const state = rotl32(mixed, 13);
+
+    /* the full 32-bit state is stored back; the returned value drops its 5 lowest bits */
+    *src = state;
+    return state >> 5;
+}
+
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way (the seed is fixed, so the resulting order is reproducible).
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. An empty table is left untouched. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned remaining;   /* entries [0, remaining) are still to be shuffled */
+    for (remaining = nbFiles; remaining > 1; --remaining) {   /* starting at nbFiles (not nbFiles-1) cannot wrap around when nbFiles == 0 */
+        unsigned const j = getRand(&seed) % remaining;
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[remaining - 1];
+        fileNamesTable[remaining - 1] = tmp;
+    }
+}
+
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem)   /* probes malloc() to find how much memory, close to requiredMem, can be obtained */
+{
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* move up to the next multiple of 2^23 bytes (8 MB) */
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {   /* retry with `step` fewer bytes until an allocation succeeds */
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+
+    free(testmem);   /* the block was only a probe */
+    return (size_t)requiredMem;   /* == size of the successful allocation minus `step` */
+}
+/* saveDict() : writes the buffSize bytes of buff into file dictFileName; any failure terminates the program through EXM_THROW() */
+static void saveDict(const char* dictFileName, const void* buff, size_t buffSize)
+{
+    FILE* const out = fopen(dictFileName, "wb");
+    if (out == NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    if (fwrite(buff, 1, buffSize, out) != buffSize) {
+        EXM_THROW(4, "%s : write error", dictFileName);
+    }
+    if (fclose(out) != 0) {   /* a non-zero fclose() result is reported as a flush error */
+        EXM_THROW(5, "%s : flush error", dictFileName);
+    }
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+{
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* unknown size counts as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);   /* ceil(srcSize / chunkSize), or 1 sample per file without chunking */
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes are counted per sample */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;   /* every sample of the file is counted at the capped size */
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+
+
+
+
+/* ********************************************************
+*  Random Dictionary Builder
+**********************************************************/
+/**
+ * Adds up the first nbSamples entries of samplesSizes and returns the total.
+ */
+static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t total = 0;
+  /* unsigned addition wraps, so walking the table backwards gives the same total */
+  while (nbSamples > 0) {
+    total += samplesSizes[--nbSamples];
+  }
+  return total;
+}
+
+
+/**
+ * Selects a random segment of k bytes among the totalSamplesSize - k + 1 possible ones
+ * (k is clamped to totalSamplesSize). The generator is seeded on first use only.
+ */
+static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx,
+                                            ZDICT_random_params_t parameters) {
+    static int seeded = 0;
+    const U32 total = ctx->totalSamplesSize;
+    /* clamp k so that `total - k + 1` cannot wrap around when k > total */
+    const U32 k = MIN(parameters.k, total);
+    RANDOM_segment_t segment;
+
+    /* Seed only once : re-seeding with time(NULL) on every call repeats the same draw for a whole second */
+    if (!seeded) { srand((unsigned)time(NULL)); seeded = 1; }
+
+    /* inclusive range [begin, end], with begin drawn from 0 to total - k */
+    segment.begin = (U32)rand() % (total - k + 1);
+    segment.end = segment.begin + k - 1;
+    return segment;
+}
+
+
+/**
+ * Validates the training parameters against the dictionary capacity.
+ * Returns 1 when 0 < k <= maxDictSize, and 0 otherwise.
+ */
+static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) {
+    const unsigned segmentSize = parameters.k;
+
+    /* k is a required parameter : 0 is rejected */
+    const int isProvided = (segmentSize != 0);
+
+    /* k may not exceed maxDictSize */
+    const int fitsInDict = (segmentSize <= maxDictSize);
+
+    return isProvided && fitsInDict;
+}
+
+
+/**
+ * Releases the offsets table of a context set up by `RANDOM_ctx_init()`.
+ * A NULL context is ignored; offsets is left NULL afterwards.
+ */
+static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) {
+  if (ctx != NULL) {
+    /* free(NULL) does nothing,
+     * so offsets needs no test of its own */
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `RANDOM_ctx_destroy()`.
+ */
+static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples) {
+    const BYTE *const samples = (const BYTE *)samplesBuffer;
+    const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
+    const int displayLevel = 2;   /* fixed verbosity : no notification level is passed to this function */
+    /* Checks */
+    if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) {   /* ctx is not written at all on this path */
+      DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                   (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20));
+      return 0;
+    }
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples,
+                 (U32)totalSamplesSize);
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));   /* one extra slot for the end offset */
+    ctx->totalSamplesSize = (U32)totalSamplesSize;
+    if (!ctx->offsets) {
+      DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n");
+      RANDOM_ctx_destroy(ctx);
+      return 0;
+    }
+    {
+      U32 i;
+      ctx->offsets[0] = 0;
+      for (i = 1; i <= nbSamples; ++i) {   /* running sum : offsets[i] is where sample i starts, offsets[nbSamples] the total size */
+        ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+      }
+    }
+    return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_random_params_t parameters) {
+    BYTE *const dict = (BYTE *)dictBuffer;
+    size_t tail = dictBufferCapacity;   /* start of the filled area : the buffer is filled from its end backwards */
+    const int displayLevel = parameters.zParams.notificationLevel;
+    while (tail > 0) {
+
+      /* Select a segment */
+      RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters);
+
+      size_t segmentSize;
+      segmentSize = MIN(segment.end - segment.begin + 1, tail);   /* the last segment is cut to the room that is left */
+
+      tail -= segmentSize;
+      memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+      DISPLAYUPDATE(
+          2, "\r%u%%       ",
+          (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+    }
+
+    return tail;   /* the loop only exits once tail has reached 0 */
+}
+
+/*! ZDICT_trainFromBuffer_random():
+ *  Train a dictionary from an array of samples using the RANDOM algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters) {
+      const int displayLevel = parameters.zParams.notificationLevel;
+      BYTE* const dict = (BYTE*)dictBuffer;
+      RANDOM_ctx_t ctx;
+      /* Checks */
+      if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) {   /* requires 0 < k <= dictBufferCapacity */
+          DISPLAYLEVEL(1, "k is incorrect\n");
+          return ERROR(GENERIC);
+      }
+      if (nbSamples == 0) {
+        DISPLAYLEVEL(1, "Random must have at least one input file\n");
+        return ERROR(GENERIC);
+      }
+      if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+        DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                     ZDICT_DICTSIZE_MIN);
+        return ERROR(dstSize_tooSmall);
+      }
+
+      if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) {   /* on failure nothing is left to release here */
+        return ERROR(GENERIC);
+      }
+      DISPLAYLEVEL(2, "Building dictionary\n");
+      {
+        const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters);
+        const size_t dictSize = ZDICT_finalizeDictionary(
+            dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,   /* the selected content occupies [tail, dictBufferCapacity) */
+            samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+        if (!ZSTD_isError(dictSize)) {
+            DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                          (U32)dictSize);
+        }
+        RANDOM_ctx_destroy(&ctx);
+        return dictSize;   /* may be an error code from ZDICT_finalizeDictionary() */
+      }
+}
+
+/* RANDOM_trainFromFiles() : loads samples from fileNamesTable, trains a random dictionary of at most maxDictSize bytes and saves it into dictFileName; returns 0 on success, 1 if training failed (other failures exit through EXM_THROW) */
+int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                       const char** fileNamesTable, unsigned nbFiles,
+                       size_t chunkSize, ZDICT_random_params_t *params){
+    unsigned const displayLevel = params->zParams.notificationLevel;
+    void* const dictBuffer = malloc(maxDictSize);
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+    int result = 0;
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
+        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+
+    {   unsigned const nbSamplesLoaded = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+        /* Train on the samples actually loaded : sampleSizes[] entries beyond nbSamplesLoaded were never written */
+        size_t const dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer,
+                                             sampleSizes, nbSamplesLoaded, *params);
+        DISPLAYLEVEL(2, "k=%u\n", params->k);
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+            goto _cleanup;
+        }
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+        saveDict(dictFileName, dictBuffer, dictSize);
+    }
+
+    /* clean up */
+_cleanup:
+    free(srcBuffer);
+    free(sampleSizes);
+    free(dictBuffer);
+    return result;
+}
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
new file mode 100644
index 00000000..05879641
--- /dev/null
+++ b/contrib/randomDictBuilder/random.h
@@ -0,0 +1,53 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/**************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  U32 totalSamplesSize;
+} RANDOM_ctx_t;
+
+/**
+ * A segment is an inclusive range in the source.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+} RANDOM_segment_t;
+
+
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
+    ZDICT_params_t zParams;
+} ZDICT_random_params_t;
+
+
+typedef struct {
+    U64 totalSizeToLoad;
+    unsigned oneSampleTooLarge;
+    unsigned nbSamples;
+} fileStats;
+
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters);
+
+
+int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                        const char** fileNamesTable, unsigned nbFiles,
+                        size_t chunkSize, ZDICT_random_params_t *params);
diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh
new file mode 100644
index 00000000..552650ee
--- /dev/null
+++ b/contrib/randomDictBuilder/test.sh
@@ -0,0 +1,14 @@
+echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1"
+./main c=5 in=../../lib/common k=200 out=dict1
+zstd -be3 -D dict1 -r ../../lib/common -q
+echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
+./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../lib/common -q
+echo "Building random dictionary with 2 sample sources"
+./main in=../../lib/common in=../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../lib/common -q
+echo "Removing dict1 dict2 dict3"
+rm -f dict1 dict2 dict3
+
+echo "Testing with invalid parameters, should fail"
+! ./main r=10

From 31731df4dab0df7b465de2de5641b2e3416c9086 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 17:38:53 -0700
Subject: [PATCH 03/35] Remove clevel and update documentation

---
 contrib/randomDictBuilder/README.md | 15 ++++++++++-----
 contrib/randomDictBuilder/main.c    | 11 ++++++++---
 contrib/randomDictBuilder/test.sh   |  8 ++++----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index cadffdf2..de2c7ff6 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -1,12 +1,17 @@
 Random Dictionary Builder
 
 ### Permitted Arguments:
-Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
 Output Dictionary (out=dictName): if not provided, default to defaultDict
-Dictionary ID (dictID=#): positive number, if not provided, default to 0
-Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB
-Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200
-Compression Level (c=#): positive number, if not provided, default to 3
+Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
+Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
+Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
+Compression Level (c=#): positive number; if not provided, default to 3
+
+
+###Usage:
+To build a random dictionary with the provided arguments: make run ARG= followed by arguments
+
 
 ### Examples:
 make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 15eb5c44..cf0b9476 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -63,7 +63,7 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  unsigned cLevel = DEFAULT_CLEVEL;
+  /* Initialize parameters with default value */
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -76,10 +76,10 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
-    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
+      /* Allow multiple input files */
       inputFile = malloc(strlen(argument) + 1);
       strcpy(inputFile, argument);
       filenameTable[filenameIdx] = inputFile;
@@ -96,6 +96,11 @@ int main(int argCount, const char* argv[])
     return operationResult;
   }
 
+  if (maxDictSize == 0) {
+    DISPLAYLEVEL(1, "maxDictSize should not be 0.\n");
+    operationResult = 1;
+    return operationResult;
+  }
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
@@ -114,7 +119,7 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = cLevel;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;
diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh
index 552650ee..497820f8 100644
--- a/contrib/randomDictBuilder/test.sh
+++ b/contrib/randomDictBuilder/test.sh
@@ -1,8 +1,8 @@
-echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1"
-./main c=5 in=../../lib/common k=200 out=dict1
+echo "Building random dictionary with in=../../lib/common k=200 out=dict1"
+./main in=../../lib/common k=200 out=dict1
 zstd -be3 -D dict1 -r ../../lib/common -q
-echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
-./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
+./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
 zstd -be3 -D dict2 -r ../../lib/common -q
 echo "Building random dictionary with 2 sample sources"
 ./main in=../../lib/common in=../../lib/compress out=dict3

From 0e5fbc10facdce2def08e4f4ecb67d255694df3a Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 17:41:09 -0700
Subject: [PATCH 04/35] Update README

---
 contrib/randomDictBuilder/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index de2c7ff6..09f1e808 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -6,7 +6,6 @@ Output Dictionary (out=dictName): if not provided, default to defaultDict
 Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
 Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
-Compression Level (c=#): positive number; if not provided, default to 3
 
 
 ###Usage:

From 58b82194755b52ad80b6e7da5aeae8e383f8bb90 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Mon, 9 Jul 2018 18:24:07 -0700
Subject: [PATCH 05/35] zstdcli: Allow -o before --train

Only set the default value if `outFileName` is unset.

Fixes #1227.
---
 programs/zstdcli.c | 14 ++++++++------
 tests/playTests.sh | 24 +++++++++++++++++++++---
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index e0d7807f..36ba2115 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -502,7 +502,7 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; }
                     if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; }
                     if (!strcmp(argument, "--test")) { operation=zom_test; continue; }
-                    if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; }
+                    if (!strcmp(argument, "--train")) { operation=zom_train; if (outFileName==NULL) outFileName=g_defaultDictName; continue; }
                     if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; }
@@ -526,7 +526,8 @@ int main(int argCount, const char* argv[])
 #ifndef ZSTD_NODICT
                     if (longCommandWArg(&argument, "--train-cover")) {
                       operation = zom_train;
-                      outFileName = g_defaultDictName;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
                       cover = 1;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
@@ -536,7 +537,8 @@ int main(int argCount, const char* argv[])
                     }
                     if (longCommandWArg(&argument, "--train-legacy")) {
                       operation = zom_train;
-                      outFileName = g_defaultDictName;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
                       cover = 0;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { continue; }
@@ -718,7 +720,7 @@ int main(int argCount, const char* argv[])
                         break;
 
                         /* Select compressibility of synthetic sample */
-                    case 'P': 
+                    case 'P':
                     {   argument++;
                         compressibility = (double)readU32FromChar(&argument) / 100;
                     }
@@ -841,7 +843,7 @@ int main(int argCount, const char* argv[])
         if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
         if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
         if (cLevelLast < cLevel) cLevelLast = cLevel;
-        if (cLevelLast > cLevel) 
+        if (cLevelLast > cLevel)
             DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
         if(filenameIdx) {
             if(separateFiles) {
@@ -856,7 +858,7 @@ int main(int argCount, const char* argv[])
             } else {
                 for(; cLevel <= cLevelLast; cLevel++) {
                     BMK_benchFilesAdvanced(filenameTable, filenameIdx, dictFileName, cLevel, &compressionParams, g_displayLevel, &adv);
-                }            
+                }
             }
         } else {
             for(; cLevel <= cLevelLast; cLevel++) {
diff --git a/tests/playTests.sh b/tests/playTests.sh
index fb8b1d24..0a1f96c0 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -404,7 +404,13 @@ $ECHO "Hello World" > tmp
 $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source"
 ./datagen -P0 -g10M > tmp
 $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise"
-rm tmp*
+$ECHO "- Test -o before --train"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
 
 
 $ECHO "\n===>  cover dictionary builder : advanced options "
@@ -425,12 +431,18 @@ $ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
-rm tmp*
 $ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
 $ZSTD --train-cover=split=90 -r *.c ../programs/*.c
 $ZSTD --train-cover=split=80 -r *.c ../programs/*.c
 $ECHO "- Create dictionary using all samples for both training and testing"
 $ZSTD --train-cover=split=100 -r *.c ../programs/*.c
+$ECHO "- Test -o before --train-cover"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-cover *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-cover *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
 
 $ECHO "\n===>  legacy dictionary builder "
 
@@ -450,7 +462,13 @@ $ZSTD --train-legacy -s5 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
-rm tmp*
+$ECHO "- Test -o before --train-legacy"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-legacy *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-legacy *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
 
 
 $ECHO "\n===>  integrity tests "

From b5806d33db813dfb2bac7cd3b97b5bcf09ee57b7 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 16:03:04 -0700
Subject: [PATCH 06/35] Refactor RANDOM

---
 contrib/randomDictBuilder/Makefile |  12 +-
 contrib/randomDictBuilder/main.c   | 297 ++++++++++++++++++++++++-
 contrib/randomDictBuilder/random.c | 343 ++---------------------------
 contrib/randomDictBuilder/random.h |  23 --
 4 files changed, 314 insertions(+), 361 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index a2aade23..443f6f04 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -14,14 +14,14 @@ rand:
 	./main $(ARG)
 
 
-main: random.o main.o libzstd.a
-	gcc random.o main.o libzstd.a -o main
+main: main.o random.o libzstd.a
+	gcc main.o random.o libzstd.a -o main
 
-main.o: main.c
-	gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h
+main.o: main.c $(PROGRAM_FILES)
+	gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
-random.o: $(PROGRAM_FILES) random.c
-	gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c
+random.o: random.c
+	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
 
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index cf0b9476..d9295aa9 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -3,13 +3,45 @@
 #include <string.h>   /* strcmp, strlen */
 #include <errno.h>    /* errno */
 #include <ctype.h>
-#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
 #include "random.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
 #include "util.h"
+#include "zdict.h"
 
+/*-*************************************
+*  Console display
+***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+/*-*************************************
+*  Constants
+***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_CLEVEL 3
 #define DEFAULT_INPUTFILE ""
@@ -17,7 +49,33 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
 
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
+#define NOISELENGTH 32
+
+
+/*-*************************************
+*  Structs
+***************************************/
+typedef struct {   /* summary of the input files, filled by getFileStats() */
+    U64 totalSizeToLoad;   /* sum, over all files, of (capped chunk size * nb of samples) */
+    unsigned oneSampleTooLarge;   /* non-zero when chunkSize > 2*SAMPLESIZE_MAX */
+    unsigned nbSamples;   /* total nb of samples, all files included */
+} fileStats;
+
+typedef struct {   /* training input, built by getSampleInfo() and read by RANDOM_trainFromFiles() */
+  const void* srcBuffer;   /* buffer the samples were loaded into, one after another */
+  const size_t *samplesSizes;   /* table holding the size of each sample */
+  size_t nbSamples;   /* nb of samples passed to ZDICT_trainFromBuffer_random() */
+}sampleInfo;
+
+
+/*-*************************************
+*  Commandline related functions
+***************************************/
 static unsigned readU32FromChar(const char** stringPtr)
 {
     const char errorMsg[] = "error: numeric value too large";
@@ -42,7 +100,6 @@ static unsigned readU32FromChar(const char** stringPtr)
     return result;
 }
 
-
 /** longCommandWArg() :
  *  check if *stringPtr is the same as longCommand.
  *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
@@ -56,6 +113,225 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
     return result;
 }
 
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table
+ *  which must be sized correctly, using getFileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
+                              size_t* sampleSizes, unsigned sstSize,
+                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
+                              unsigned displayLevel)
+{
+    char* const buff = (char*)buffer;
+    size_t pos = 0;   /* write position inside buff == total nb of bytes loaded so far */
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;   /* targetChunkSize==0 : whole file is one sample */
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* a sample never exceeds SAMPLESIZE_MAX bytes */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* not enough room left in buffer : stop loading this file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {   /* skip the part of the chunk that was not read */
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");   /* overwrite the "Loading ..." progress line with blanks */
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))   /* left rotation of a 32-bit unsigned x ; requires 0 < r < 32 */
+static U32 getRand(U32* src)   /* advances the generator state *src, @return : a value derived from the new state */
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;   /* the full 32-bit value becomes the next state */
+    return rand32 >> 5;   /* the 5 lowest bits are dropped from the returned value */
+}
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way (fixed seed => same order on every run).
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. Does nothing when nbFiles < 2. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    for (i = nbFiles; i > 1; --i) {   /* slot i-1 receives its final entry ; counting this way cannot wrap when nbFiles==0 */
+        unsigned const j = getRand(&seed) % i;   /* pick among the i entries not placed yet */
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i - 1];
+        fileNamesTable[i - 1] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem)   /* probes malloc() to find a size that can be allocated */
+{
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* move up to the next multiple of 1<<23 bytes (8 MB) */
+    requiredMem += step;   /* one extra step, taken back by the first loop iteration */
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {   /* try decreasing sizes until one allocation succeeds */
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+
+    free(testmem);   /* the allocation was only a probe */
+    return (size_t)requiredMem;   /* one step below the size that was successfully allocated */
+}
+
+static void saveDict(const char* dictFileName,   /* writes buffSize bytes of buff into dictFileName ; any failure exits through EXM_THROW */
+                         const void* buff, size_t buffSize)
+{
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }   /* short write */
+
+    { size_t const n = (size_t)fclose(f);
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }   /* fclose() reported a failure */
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+{
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* unknown size is counted as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);   /* nb of chunks, rounded up ; 1 when not chunking */
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes are counted per sample */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+/* RANDOM_trainFromFiles() :
+ * train a dictionary from the samples described by `info`, then save it into `dictFileName`.
+ * @return : 0 on success, 1 when training fails (nothing is saved in that case) */
+int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize,
+                       ZDICT_random_params_t *params){
+    unsigned const displayLevel = params->zParams.notificationLevel;
+    void* const dictBuffer = malloc(maxDictSize);
+    int result = 0;
+
+    if (!dictBuffer)
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    {   size_t const dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                             info->samplesSizes, info->nbSamples, *params);
+        DISPLAYLEVEL(2, "k=%u\n", params->k);
+        if (ZDICT_isError(dictSize)) {
+            /* dictSize is an error code here, not a size : skip saving ; dictBuffer is freed once, below */
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+        } else {
+            DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+            saveDict(dictFileName, dictBuffer, dictSize);
+        }
+    }
+
+    /* clean up */
+    free(dictBuffer);
+    return result;
+}
+
+/* getSampleInfo() :
+ * load the files of fileNamesTable (which gets shuffled in place) into a newly allocated sampleInfo.
+ * Exits through EXM_THROW when an allocation fails or when fewer than 5 samples are available. */
+sampleInfo* getSampleInfo(const char** fileNamesTable,
+                  unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+    sampleInfo* const info = (sampleInfo *)malloc(sizeof(sampleInfo));
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer) || (!info))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    /* loading can stop early when the buffer is full : only the samples actually loaded have a valid size */
+    info->nbSamples = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+
+    return info;
+}
+
+
 
 int main(int argCount, const char* argv[])
 {
@@ -63,7 +339,7 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  /* Initialize parameters with default value */
+  unsigned cLevel = DEFAULT_CLEVEL;
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -76,10 +352,10 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
-      /* Allow multiple input files */
       inputFile = malloc(strlen(argument) + 1);
       strcpy(inputFile, argument);
       filenameTable[filenameIdx] = inputFile;
@@ -96,12 +372,6 @@ int main(int argCount, const char* argv[])
     return operationResult;
   }
 
-  if (maxDictSize == 0) {
-    DISPLAYLEVEL(1, "maxDictSize should not be 0.\n");
-    operationResult = 1;
-    return operationResult;
-  }
-
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
   int followLinks = 0;
@@ -119,12 +389,15 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = DEFAULT_CLEVEL;
+  zParams.compressionLevel = cLevel;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;
   params.k = k;
 
-  operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, &params);
+  sampleInfo* info= getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
+  operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
+
   return operationResult;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index a59427ba..96c02389 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -5,24 +5,12 @@
 #include <stdlib.h>           /* malloc, free, qsort */
 #include <string.h>           /* memset */
 #include <time.h>             /* clock */
-#include "zstd_internal.h" /* includes zstd.h */
+#include "random.h"
+#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
 #endif
-#include "random.h"
-#include "platform.h"         /* Large Files support */
-#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
-
-/*-*************************************
-*  Constants
-***************************************/
-#define SAMPLESIZE_MAX (128 KB)
-#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
-#define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
-
-#define NOISELENGTH 32
-#define DEFAULT_K 200
+#include "zdict.h"
 
 /*-*************************************
 *  Console display
@@ -30,179 +18,16 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((siz
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 
-static const U64 g_refreshRate = SEC_TO_MICRO / 6;
-static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
-
-#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
-            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
-            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
-            if (displayLevel>=4) fflush(stderr); } } }
-
-
-/*-*************************************
-*  Exceptions
-***************************************/
-#ifndef DEBUG
-#  define DEBUG 0
-#endif
-#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
-#define EXM_THROW(error, ...)                                             \
-{                                                                         \
-    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
-    DISPLAY("Error %i : ", error);                                        \
-    DISPLAY(__VA_ARGS__);                                                 \
-    DISPLAY("\n");                                                        \
-    exit(error);                                                          \
-}
-
-
-/* ********************************************************
-*  File related operations
-**********************************************************/
-/** loadFiles() :
- *  load samples from files listed in fileNamesTable into buffer.
- *  works even if buffer is too small to load all samples.
- *  Also provides the size of each sample into sampleSizes table
- *  which must be sized correctly, using DiB_fileStats().
- * @return : nb of samples effectively loaded into `buffer`
- * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
- *  sampleSizes is filled with the size of each sample.
- */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
-                              size_t* sampleSizes, unsigned sstSize,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
-                              unsigned displayLevel)
-{
-    char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-                pos += readSize;
-                sampleSizes[nbLoadedChunks++] = toLoad;
-                remainingToLoad -= targetChunkSize;
-                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                    fileIndex = nbFiles;  /* stop there */
-                    break;
-                }
-                if (toLoad < targetChunkSize) {
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        }   }   }
-        fclose(f);
-    }
-    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
-    return nbLoadedChunks;
-}
-
-
-
-#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static U32 getRand(U32* src)
-{
-    static const U32 prime1 = 2654435761U;
-    static const U32 prime2 = 2246822519U;
-    U32 rand32 = *src;
-    rand32 *= prime1;
-    rand32 ^= prime2;
-    rand32  = rotl32(rand32, 13);
-    *src = rand32;
-    return rand32 >> 5;
-}
-
-
-/* shuffle() :
- * shuffle a table of file names in a semi-random way
- * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
- * it will load random elements from it, instead of just the first ones. */
-static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
-    U32 seed = 0xFD2FB528;
-    unsigned i;
-    for (i = nbFiles - 1; i > 0; --i) {
-        unsigned const j = getRand(&seed) % (i + 1);
-        const char* const tmp = fileNamesTable[j];
-        fileNamesTable[j] = fileNamesTable[i];
-        fileNamesTable[i] = tmp;
-    }
-}
-
-
-
-/*-********************************************************
-*  Dictionary training functions
-**********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem)
-{
-    size_t const step = 8 MB;
-    void* testmem = NULL;
-
-    requiredMem = (((requiredMem >> 23) + 1) << 23);
-    requiredMem += step;
-    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
-
-    while (!testmem) {
-        testmem = malloc((size_t)requiredMem);
-        requiredMem -= step;
-    }
-
-    free(testmem);
-    return (size_t)requiredMem;
-}
-
-static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize)
-{
-    FILE* const f = fopen(dictFileName, "wb");
-    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
-
-    { size_t const n = fwrite(buff, 1, buffSize, f);
-      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
-
-    { size_t const n = (size_t)fclose(f);
-      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
-}
-
-/*! getFileStats() :
- *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
- *  provides the amount of data to be loaded and the resulting nb of samples.
- *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
- */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
-{
-    fileStats fs;
-    unsigned n;
-    memset(&fs, 0, sizeof(fs));
-    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
-    }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
-    return fs;
-}
-
-
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
 
 
 
@@ -225,16 +50,14 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
 /**
  * Selects a random segment from totalSamplesSize - k + 1 possible segments
  */
-static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx,
+static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize,
                                             ZDICT_random_params_t parameters) {
     const U32 k = parameters.k;
     RANDOM_segment_t segment;
     unsigned index;
 
-    /* Seed random number generator */
-    srand((unsigned)time(NULL));
     /* Randomly generate a number from 0 to sampleSizes - k */
-    index = rand()%(ctx->totalSamplesSize - k + 1);
+    index = rand()%(totalSamplesSize - k + 1);
 
     /* inclusive */
     segment.begin = index;
@@ -261,65 +84,11 @@ static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDi
 }
 
 
-/**
- * Clean up a context initialized with `RANDOM_ctx_init()`.
- */
-static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) {
-  if (!ctx) {
-    return;
-  }
-  if (ctx->offsets) {
-    free(ctx->offsets);
-    ctx->offsets = NULL;
-  }
-}
-
-
-/**
- * Prepare a context for dictionary building.
- * Returns 1 on success or zero on error.
- * The context must be destroyed with `RANDOM_ctx_destroy()`.
- */
-static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer,
-                          const size_t *samplesSizes, unsigned nbSamples) {
-    const BYTE *const samples = (const BYTE *)samplesBuffer;
-    const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
-    const int displayLevel = 2;
-    /* Checks */
-    if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) {
-      DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
-                   (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20));
-      return 0;
-    }
-    memset(ctx, 0, sizeof(*ctx));
-    DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples,
-                 (U32)totalSamplesSize);
-    ctx->samples = samples;
-    ctx->samplesSizes = samplesSizes;
-    ctx->nbSamples = nbSamples;
-    ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
-    ctx->totalSamplesSize = (U32)totalSamplesSize;
-    if (!ctx->offsets) {
-      DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n");
-      RANDOM_ctx_destroy(ctx);
-      return 0;
-    }
-    {
-      U32 i;
-      ctx->offsets[0] = 0;
-      for (i = 1; i <= nbSamples; ++i) {
-        ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
-      }
-    }
-    return 1;
-}
-
-
 /**
  * Given the prepared context build the dictionary.
  */
-static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
-                                    size_t dictBufferCapacity,
+static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples,
+                                    void *dictBuffer, size_t dictBufferCapacity,
                                     ZDICT_random_params_t parameters) {
     BYTE *const dict = (BYTE *)dictBuffer;
     size_t tail = dictBufferCapacity;
@@ -327,13 +96,13 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
     while (tail > 0) {
 
       /* Select a segment */
-      RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters);
+      RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters);
 
       size_t segmentSize;
       segmentSize = MIN(segment.end - segment.begin + 1, tail);
 
       tail -= segmentSize;
-      memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+      memcpy(dict + tail, samples + segment.begin, segmentSize);
       DISPLAYUPDATE(
           2, "\r%u%%       ",
           (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
@@ -342,6 +111,7 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
     return tail;
 }
 
+
 /*! ZDICT_trainFromBuffer_random():
  *  Train a dictionary from an array of samples using the RANDOM algorithm.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
@@ -356,7 +126,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     ZDICT_random_params_t parameters) {
       const int displayLevel = parameters.zParams.notificationLevel;
       BYTE* const dict = (BYTE*)dictBuffer;
-      RANDOM_ctx_t ctx;
       /* Checks */
       if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) {
           DISPLAYLEVEL(1, "k is incorrect\n");
@@ -371,13 +140,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
                      ZDICT_DICTSIZE_MIN);
         return ERROR(dstSize_tooSmall);
       }
+      const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
+      const BYTE *const samples = (const BYTE *)samplesBuffer;
 
-      if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) {
-        return ERROR(GENERIC);
-      }
       DISPLAYLEVEL(2, "Building dictionary\n");
       {
-        const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters);
+        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters);
         const size_t dictSize = ZDICT_finalizeDictionary(
             dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
             samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
@@ -385,71 +153,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
             DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                           (U32)dictSize);
         }
-        RANDOM_ctx_destroy(&ctx);
         return dictSize;
       }
 }
-
-
-int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                       const char** fileNamesTable, unsigned nbFiles,
-                       size_t chunkSize, ZDICT_random_params_t *params){
-    unsigned const displayLevel = params->zParams.notificationLevel;
-    void* const dictBuffer = malloc(maxDictSize);
-    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = RANDOM_MEMMULT;
-    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
-    int result = 0;
-
-    /* Checks */
-    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
-        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
-    if (fs.oneSampleTooLarge) {
-        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
-        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
-        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
-    }
-    if (fs.nbSamples < 5) {
-        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
-        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
-        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
-        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
-    }
-    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
-        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
-        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
-    }
-
-    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
-
-    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
-
-    {   size_t dictSize;
-        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer,
-                                             sampleSizes, fs.nbSamples, *params);
-        DISPLAYLEVEL(2, "k=%u\n", params->k);
-        if (ZDICT_isError(dictSize)) {
-            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
-            result = 1;
-            goto _cleanup;
-        }
-        /* save dict */
-        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
-        saveDict(dictFileName, dictBuffer, dictSize);
-    }
-
-    /* clean up */
-_cleanup:
-    free(srcBuffer);
-    free(sampleSizes);
-    free(dictBuffer);
-    return result;
-}
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index 05879641..77529daf 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -8,18 +8,6 @@
 #endif
 #include "zdict.h"
 
-
-/**************************************
-* Context
-***************************************/
-typedef struct {
-  const BYTE *samples;
-  size_t *offsets;
-  const size_t *samplesSizes;
-  size_t nbSamples;
-  U32 totalSamplesSize;
-} RANDOM_ctx_t;
-
 /**
  * A segment is an inclusive range in the source.
  */
@@ -35,19 +23,8 @@ typedef struct {
 } ZDICT_random_params_t;
 
 
-typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
-} fileStats;
-
 
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);
-
-
-int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                        const char** fileNamesTable, unsigned nbFiles,
-                        size_t chunkSize, ZDICT_random_params_t *params);

From 1f7fa5cdd6555e22dfa8c2dc1f5c17293e703fe3 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 16:31:59 -0700
Subject: [PATCH 07/35] Fix spacing and Edit Makefile (now run with make
 instead of make run)

---
 contrib/randomDictBuilder/Makefile  | 13 +++++----
 contrib/randomDictBuilder/README.md |  9 ++++---
 contrib/randomDictBuilder/main.c    | 42 ++++++++++++++---------------
 contrib/randomDictBuilder/random.c  |  9 ++++---
 contrib/randomDictBuilder/random.h  |  5 ++--
 5 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 443f6f04..77dd2933 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -4,16 +4,15 @@ TEST_INPUT := ../../lib
 TEST_OUTPUT := randomDict
 ARG :=
 
-all: main testrun test clean
+all: main run clean
 
-run: main rand clean
+test: main testrun testshell clean
 
-.PHONY: rand
-rand:
+.PHONY: run
+run:
 	echo "Building a random dictionary with given arguments"
 	./main $(ARG)
 
-
 main: main.o random.o libzstd.a
 	gcc main.o random.o libzstd.a -o main
 
@@ -34,8 +33,8 @@ testrun: main
 	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
 	rm -f $(TEST_OUTPUT)
 
-.PHONY: test
-test: test.sh
+.PHONY: testshell
+testshell: test.sh
 	sh test.sh
 	echo "Finish running test.sh"
 
diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index 09f1e808..0e70d3dc 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -7,11 +7,14 @@ Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
 Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
 
+###Running Test:
+make test
+
 
 ###Usage:
-To build a random dictionary with the provided arguments: make run ARG= followed by arguments
+To build a random dictionary with the provided arguments: make ARG= followed by arguments
 
 
 ### Examples:
-make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
-make run ARG="in=../../lib/dictBuilder in=../../lib/compress"
+make ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../lib/dictBuilder in=../../lib/compress"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index d9295aa9..e195188b 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -52,7 +52,8 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define SAMPLESIZE_MAX (128 KB)
 #define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
 #define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
 
@@ -76,8 +77,7 @@ typedef struct {
 /*-*************************************
 *  Commandline related functions
 ***************************************/
-static unsigned readU32FromChar(const char** stringPtr)
-{
+static unsigned readU32FromChar(const char** stringPtr){
     const char errorMsg[] = "error: numeric value too large";
     unsigned result = 0;
     while ((**stringPtr >='0') && (**stringPtr <='9')) {
@@ -105,8 +105,7 @@ static unsigned readU32FromChar(const char** stringPtr)
  *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
  * @return 0 and doesn't modify *stringPtr otherwise.
  */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
-{
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
     size_t const comSize = strlen(longCommand);
     int const result = !strncmp(*stringPtr, longCommand, comSize);
     if (result) *stringPtr += comSize;
@@ -125,11 +124,9 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
  *  sampleSizes is filled with the size of each sample.
  */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
-                              size_t* sampleSizes, unsigned sstSize,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
-                              unsigned displayLevel)
-{
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
     char* const buff = (char*)buffer;
     size_t pos = 0;
     unsigned nbLoadedChunks = 0, fileIndex;
@@ -200,8 +197,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
 /*-********************************************************
 *  Dictionary training functions
 **********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem)
-{
+static size_t findMaxMem(unsigned long long requiredMem) {
     size_t const step = 8 MB;
     void* testmem = NULL;
 
@@ -219,8 +215,7 @@ static size_t findMaxMem(unsigned long long requiredMem)
 }
 
 static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize)
-{
+                         const void* buff, size_t buffSize) {
     FILE* const f = fopen(dictFileName, "wb");
     if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
 
@@ -236,8 +231,8 @@ static void saveDict(const char* dictFileName,
  *  provides the amount of data to be loaded and the resulting nb of samples.
  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
  */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
-{
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
     fileStats fs;
     unsigned n;
     memset(&fs, 0, sizeof(fs));
@@ -255,8 +250,9 @@ static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, siz
     return fs;
 }
 
-int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize,
-                       ZDICT_random_params_t *params){
+int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
+                          unsigned maxDictSize,
+                          ZDICT_random_params_t *params) {
     unsigned const displayLevel = params->zParams.notificationLevel;
     void* const dictBuffer = malloc(maxDictSize);
 
@@ -285,8 +281,8 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned m
     return result;
 }
 
-sampleInfo* getSampleInfo(const char** fileNamesTable,
-                  unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
     fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
     size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
     size_t const memMult = RANDOM_MEMMULT;
@@ -320,7 +316,8 @@ sampleInfo* getSampleInfo(const char** fileNamesTable,
     /* Load input buffer */
     DISPLAYLEVEL(3, "Shuffling input files\n");
     shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);
 
     sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
 
@@ -376,7 +373,8 @@ int main(int argCount, const char* argv[])
   unsigned fileNamesNb = filenameIdx;
   int followLinks = 0;
   const char** extendedFileList = NULL;
-  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
   if (extendedFileList) {
       unsigned u;
       for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index 96c02389..cfed14a4 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -71,7 +71,8 @@ static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize,
  * Check the validity of the parameters.
  * Returns non-zero if the parameters are valid and 0 otherwise.
  */
-static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) {
+static int RANDOM_checkParameters(ZDICT_random_params_t parameters,
+                                  size_t maxDictSize) {
     /* k is a required parameter */
     if (parameters.k == 0) {
       return 0;
@@ -115,7 +116,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *
 /*! ZDICT_trainFromBuffer_random():
  *  Train a dictionary from an array of samples using the RANDOM algorithm.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each
+ *  sample, in order.
  *  The resulting dictionary will be saved into `dictBuffer`.
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
@@ -145,7 +147,8 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
 
       DISPLAYLEVEL(2, "Building dictionary\n");
       {
-        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters);
+        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples,
+                                  dictBuffer, dictBufferCapacity, parameters);
         const size_t dictSize = ZDICT_finalizeDictionary(
             dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
             samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index 77529daf..b6696323 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -18,13 +18,12 @@ typedef struct {
 
 
 typedef struct {
-    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
+    unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
     ZDICT_params_t zParams;
 } ZDICT_random_params_t;
 
 
 
-ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
-    void *dictBuffer, size_t dictBufferCapacity,
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);

From 4d32339b75c98c4534963eb73a55ae7c4826214e Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 18:59:18 -0700
Subject: [PATCH 08/35] Remove CLevel cli option which was accidentally added
 back in the last commit

---
 contrib/randomDictBuilder/main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index e195188b..e66f2847 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -336,7 +336,6 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  unsigned cLevel = DEFAULT_CLEVEL;
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -349,7 +348,6 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
-    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
@@ -387,7 +385,7 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = cLevel;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;

From 53e1f0504e077f90ecaea3a0bc18327177fd57ee Mon Sep 17 00:00:00 2001
From: cyan4973 <yann.collet.73@gmail.com>
Date: Tue, 17 Jul 2018 14:39:44 +0200
Subject: [PATCH 09/35] Make zstdmt debug traces compatible with mingw

Since mingw does not provide `sys/times.h`,
disable the debug-trace code path when a mingw compilation is detected.
---
 lib/compress/zstdmt_compress.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index 6daedca8..d5193d52 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -37,7 +37,9 @@
 #define ZSTD_RESIZE_SEQPOOL 0
 
 /* ======   Debug   ====== */
-#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) && !defined(_MSC_VER)
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \
+    && !defined(_MSC_VER) \
+    && !defined(__MINGW32__)
 
 #  include <stdio.h>
 #  include <unistd.h>

From 49acfaeaec44a25c4628a2512965445152e8776a Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 12:35:09 -0700
Subject: [PATCH 10/35] Move file loading functions to new file for access by
 benchmarking tool

---
 contrib/randomDictBuilder/Makefile |  11 +-
 contrib/randomDictBuilder/io.c     | 243 +++++++++++++++++++++++++++++
 contrib/randomDictBuilder/io.h     |  33 ++++
 contrib/randomDictBuilder/main.c   | 215 +------------------------
 4 files changed, 290 insertions(+), 212 deletions(-)
 create mode 100644 contrib/randomDictBuilder/io.c
 create mode 100644 contrib/randomDictBuilder/io.h

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 77dd2933..8360a409 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -13,15 +13,18 @@ run:
 	echo "Building a random dictionary with given arguments"
 	./main $(ARG)
 
-main: main.o random.o libzstd.a
-	gcc main.o random.o libzstd.a -o main
+main: main.o io.o random.o libzstd.a
+	gcc main.o io.o random.o libzstd.a -o main
 
-main.o: main.c $(PROGRAM_FILES)
-	gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+main.o: main.c
+	gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
 random.o: random.c
 	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
 
+io.o: io.c $(PROGRAM_FILES)
+	gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
 	mv ../../lib/libzstd.a .
diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
new file mode 100644
index 00000000..a5f71498
--- /dev/null
+++ b/contrib/randomDictBuilder/io.c
@@ -0,0 +1,243 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "io.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table
+ *  which must be sized correctly, using getFileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
+    char* const buff = (char*)buffer;
+    size_t pos = 0;
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 getRand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    for (i = nbFiles - 1; i > 0; --i) {
+        unsigned const j = getRand(&seed) % (i + 1);
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i];
+        fileNamesTable[i] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem) {
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+
+    free(testmem);
+    return (size_t)requiredMem;
+}
+
+void saveDict(const char* dictFileName,
+                         const void* buff, size_t buffSize) {
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+
+
+
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
+
+    info->nbSamples = fs.nbSamples;
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+
+    return info;
+}
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
new file mode 100644
index 00000000..4b5639fe
--- /dev/null
+++ b/contrib/randomDictBuilder/io.h
@@ -0,0 +1,33 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "zstd_internal.h" /* includes zstd.h */
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+*  Structs
+***************************************/
+typedef struct {
+    U64 totalSizeToLoad;
+    unsigned oneSampleTooLarge;
+    unsigned nbSamples;
+} fileStats;
+
+typedef struct {
+  const void* srcBuffer;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+}sampleInfo;
+
+
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel);
+
+
+void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index e66f2847..34a9d99e 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -4,11 +4,11 @@
 #include <errno.h>    /* errno */
 #include <ctype.h>
 #include "random.h"
-#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
-#include "platform.h"         /* Large Files support */
+#include "io.h"
 #include "util.h"
 #include "zdict.h"
 
+
 /*-*************************************
 *  Console display
 ***************************************/
@@ -23,6 +23,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
             if (displayLevel>=4) fflush(stderr); } } }
 
+
 /*-*************************************
 *  Exceptions
 ***************************************/
@@ -39,6 +40,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     exit(error);                                                          \
 }
 
+
 /*-*************************************
 *  Constants
 ***************************************/
@@ -49,29 +51,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
 
-#define SAMPLESIZE_MAX (128 KB)
-#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
-#define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
-                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
-
-#define NOISELENGTH 32
-
-
-/*-*************************************
-*  Structs
-***************************************/
-typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
-} fileStats;
-
-typedef struct {
-  const void* srcBuffer;
-  const size_t *samplesSizes;
-  size_t nbSamples;
-}sampleInfo;
 
 
 /*-*************************************
@@ -112,144 +91,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
     return result;
 }
 
-/* ********************************************************
-*  File related operations
-**********************************************************/
-/** loadFiles() :
- *  load samples from files listed in fileNamesTable into buffer.
- *  works even if buffer is too small to load all samples.
- *  Also provides the size of each sample into sampleSizes table
- *  which must be sized correctly, using DiB_fileStats().
- * @return : nb of samples effectively loaded into `buffer`
- * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
- *  sampleSizes is filled with the size of each sample.
- */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
-                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
-                          size_t targetChunkSize, unsigned displayLevel) {
-    char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-                pos += readSize;
-                sampleSizes[nbLoadedChunks++] = toLoad;
-                remainingToLoad -= targetChunkSize;
-                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                    fileIndex = nbFiles;  /* stop there */
-                    break;
-                }
-                if (toLoad < targetChunkSize) {
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        }   }   }
-        fclose(f);
-    }
-    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
-    return nbLoadedChunks;
-}
-
-#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static U32 getRand(U32* src)
-{
-    static const U32 prime1 = 2654435761U;
-    static const U32 prime2 = 2246822519U;
-    U32 rand32 = *src;
-    rand32 *= prime1;
-    rand32 ^= prime2;
-    rand32  = rotl32(rand32, 13);
-    *src = rand32;
-    return rand32 >> 5;
-}
-
-/* shuffle() :
- * shuffle a table of file names in a semi-random way
- * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
- * it will load random elements from it, instead of just the first ones. */
-static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
-    U32 seed = 0xFD2FB528;
-    unsigned i;
-    for (i = nbFiles - 1; i > 0; --i) {
-        unsigned const j = getRand(&seed) % (i + 1);
-        const char* const tmp = fileNamesTable[j];
-        fileNamesTable[j] = fileNamesTable[i];
-        fileNamesTable[i] = tmp;
-    }
-}
 
 
-/*-********************************************************
-*  Dictionary training functions
-**********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem) {
-    size_t const step = 8 MB;
-    void* testmem = NULL;
-
-    requiredMem = (((requiredMem >> 23) + 1) << 23);
-    requiredMem += step;
-    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
-
-    while (!testmem) {
-        testmem = malloc((size_t)requiredMem);
-        requiredMem -= step;
-    }
-
-    free(testmem);
-    return (size_t)requiredMem;
-}
-
-static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize) {
-    FILE* const f = fopen(dictFileName, "wb");
-    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
-
-    { size_t const n = fwrite(buff, 1, buffSize, f);
-      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
-
-    { size_t const n = (size_t)fclose(f);
-      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
-}
-
-/*! getFileStats() :
- *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
- *  provides the amount of data to be loaded and the resulting nb of samples.
- *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
- */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
-                              size_t chunkSize, unsigned displayLevel) {
-    fileStats fs;
-    unsigned n;
-    memset(&fs, 0, sizeof(fs));
-    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
-    }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
-    return fs;
-}
-
+/*-*************************************
+*  RANDOM
+***************************************/
 int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
                           unsigned maxDictSize,
                           ZDICT_random_params_t *params) {
@@ -281,53 +127,6 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     return result;
 }
 
-sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                          unsigned maxDictSize, const unsigned displayLevel) {
-    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = RANDOM_MEMMULT;
-    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
-
-    /* Checks */
-    if ((!sampleSizes) || (!srcBuffer))
-        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
-    if (fs.oneSampleTooLarge) {
-        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
-        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
-        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
-    }
-    if (fs.nbSamples < 5) {
-        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
-        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
-        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
-        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
-    }
-    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
-        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
-        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
-    }
-
-    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
-
-    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
-                        fileNamesTable, nbFiles, chunkSize, displayLevel);
-
-    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
-
-    info->nbSamples = fs.nbSamples;
-    info->samplesSizes = sampleSizes;
-    info->srcBuffer = srcBuffer;
-
-    return info;
-}
-
 
 
 int main(int argCount, const char* argv[])

From e6fe4058388c820444a80d9d10aa5d840fab3c0c Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 12:42:53 -0700
Subject: [PATCH 11/35] Make test PHONY target

---
 contrib/randomDictBuilder/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 8360a409..678ff28a 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -6,6 +6,7 @@ ARG :=
 
 all: main run clean
 
+.PHONY: test
 test: main testrun testshell clean
 
 .PHONY: run

From 4e706d7f2cb79df257809b45c033b3bcf5822edf Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Tue, 17 Jul 2018 14:57:27 -0700
Subject: [PATCH 12/35] fileio: Error in compression on read errors

We can write a corrupted file if the input file errors during a read.
We should return a non-zero error code in this case.
---
 programs/fileio.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/programs/fileio.c b/programs/fileio.c
index b4eed28d..85367fdf 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -797,6 +797,14 @@ FIO_compressZstdFrame(const cRess_t* ressPtr,
         }
     } while (directive != ZSTD_e_end);
 
+    if (ferror(srcFile)) {
+        EXM_THROW(26, "Read error : I/O error");
+    }
+    if (fileSize != UTIL_FILESIZE_UNKNOWN && *readsize != fileSize) {
+        EXM_THROW(27, "Read error : Incomplete read : %llu / %llu B",
+                (unsigned long long)*readsize, (unsigned long long)fileSize);
+    }
+
     return compressedfilesize;
 }
 

From 896ff0644a2531a22edf78ea9cb6b58a4de9c77f Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 16:01:44 -0700
Subject: [PATCH 13/35] Fix deallocation problem and add documentation

---
 contrib/randomDictBuilder/io.c     |  7 +++++++
 contrib/randomDictBuilder/io.h     | 17 +++++++++++++++++
 contrib/randomDictBuilder/main.c   | 20 +++++++++++---------
 contrib/randomDictBuilder/random.c | 11 ++---------
 contrib/randomDictBuilder/random.h |  9 ++++++++-
 5 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
index a5f71498..1c3eda58 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/randomDictBuilder/io.c
@@ -241,3 +241,10 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t
 
     return info;
 }
+
+
+void freeSampleInfo(sampleInfo *info) {
+    if (info->samplesSizes) free((void*)(info->samplesSizes));
+    if (info->srcBuffer) free((void*)(info->srcBuffer));
+    free(info);
+}
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
index 4b5639fe..55967f76 100644
--- a/contrib/randomDictBuilder/io.h
+++ b/contrib/randomDictBuilder/io.h
@@ -26,8 +26,25 @@ typedef struct {
 }sampleInfo;
 
 
+
+/*! getSampleInfo():
+ *  Load from input files and add samples to buffer
+ * @return: a sampleInfo struct containing information about the buffer where samples are stored,
+ *          size of each sample, and total number of samples
+ */
 sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
                           unsigned maxDictSize, const unsigned displayLevel);
 
 
+
+/*! freeSampleInfo():
+ *  Free memory allocated for info
+ */
+void freeSampleInfo(sampleInfo *info);
+
+
+
+/*! saveDict():
+ *  Save data stored on buff to dictFileName
+ */
 void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 34a9d99e..1f12c7a4 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -46,7 +46,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 ***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_CLEVEL 3
-#define DEFAULT_INPUTFILE ""
 #define DEFAULT_k 200
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
@@ -135,30 +134,29 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  char* inputFile = DEFAULT_INPUTFILE;
+  /* Initialize arguments to default values */
   unsigned k = DEFAULT_k;
-  char* outputFile = DEFAULT_OUTPUTFILE;
+  const char* outputFile = DEFAULT_OUTPUTFILE;
   unsigned dictID = DEFAULT_DICTID;
   unsigned maxDictSize = g_defaultMaxDictSize;
 
+  /* Initialize table to store input files */
   const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
   unsigned filenameIdx = 0;
 
+  /* Parse arguments */
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
-      inputFile = malloc(strlen(argument) + 1);
-      strcpy(inputFile, argument);
-      filenameTable[filenameIdx] = inputFile;
+      filenameTable[filenameIdx] = argument;
       filenameIdx++;
       continue;
     }
     if (longCommandWArg(&argument, "out=")) {
-      outputFile = malloc(strlen(argument) + 1);
-      strcpy(outputFile, argument);
+      outputFile = argument;
       continue;
     }
     DISPLAYLEVEL(1, "Incorrect parameters\n");
@@ -168,7 +166,7 @@ int main(int argCount, const char* argv[])
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
-  int followLinks = 0;
+  int followLinks = 0; /* follow directory recursively */
   const char** extendedFileList = NULL;
   extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
                                         &fileNamesNb, followLinks);
@@ -194,5 +192,9 @@ int main(int argCount, const char* argv[])
                     filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
   operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
 
+  /* Free allocated memory */
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(info);
+
   return operationResult;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index cfed14a4..34aec39e 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -113,15 +113,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *
 }
 
 
-/*! ZDICT_trainFromBuffer_random():
- *  Train a dictionary from an array of samples using the RANDOM algorithm.
- *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- *  supplied with an array of sizes `samplesSizes`, providing the size of each
- *  sample, in order.
- *  The resulting dictionary will be saved into `dictBuffer`.
- * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *          or an error code, which can be tested with ZDICT_isError().
- */
+
+
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index b6696323..c3146f86 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -23,7 +23,14 @@ typedef struct {
 } ZDICT_random_params_t;
 
 
-
+/*! ZDICT_trainFromBuffer_random():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);

From ce09fb723d1311e62c920430fb14634e9b67dd70 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 16:13:40 -0700
Subject: [PATCH 14/35] Update freeSampleInfo

---
 contrib/randomDictBuilder/io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
index 1c3eda58..67c40858 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/randomDictBuilder/io.c
@@ -244,6 +244,7 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t
 
 
 void freeSampleInfo(sampleInfo *info) {
+    if (!info) return;
     if (info->samplesSizes) free((void*)(info->samplesSizes));
     if (info->srcBuffer) free((void*)(info->srcBuffer));
     free(info);

From 52e7cf0e405ac6eb827322b607d094125646bbfb Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 18 Jul 2018 10:40:13 -0700
Subject: [PATCH 15/35] Add cleanup to trainFromFiles and move RANDOM_segment_t
 declaration

---
 contrib/randomDictBuilder/main.c   | 3 ++-
 contrib/randomDictBuilder/random.c | 9 +++++++++
 contrib/randomDictBuilder/random.h | 7 -------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 1f12c7a4..36c4326b 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             result = 1;
-            free(dictBuffer);
+            goto _cleanup;
         }
         /* save dict */
         DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
@@ -122,6 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     }
 
     /* clean up */
+_cleanup:
     free(dictBuffer);
     return result;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index 34aec39e..5276bea9 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -47,6 +47,15 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
 }
 
 
+/**
+ * A segment is an inclusive range in the source.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+} RANDOM_segment_t;
+
+
 /**
  * Selects a random segment from totalSamplesSize - k + 1 possible segments
  */
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index c3146f86..352775f9 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -8,13 +8,6 @@
 #endif
 #include "zdict.h"
 
-/**
- * A segment is an inclusive range in the source.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-} RANDOM_segment_t;
 
 
 typedef struct {

From 5bb46a898e6565e5bc1ee861999384f806f83831 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 18 Jul 2018 12:15:49 -0700
Subject: [PATCH 16/35] Rename cleanup

---
 contrib/randomDictBuilder/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 36c4326b..4751a9e1 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             result = 1;
-            goto _cleanup;
+            goto _done;
         }
         /* save dict */
         DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
@@ -122,7 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     }
 
     /* clean up */
-_cleanup:
+_done:
     free(dictBuffer);
     return result;
 }

From 0c5eaef248443342dd1cd19f5e434334bef6fc4c Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Thu, 19 Jul 2018 13:44:27 -0700
Subject: [PATCH 17/35] Update Makefile

---
 contrib/randomDictBuilder/Makefile | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 678ff28a..5f9240bf 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -1,8 +1,11 @@
-PROGRAM_FILES := ../../programs/fileio.c
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
 TEST_INPUT := ../../lib
 TEST_OUTPUT := randomDict
-ARG :=
 
 all: main run clean
 
@@ -15,16 +18,16 @@ run:
 	./main $(ARG)
 
 main: main.o io.o random.o libzstd.a
-	gcc main.o io.o random.o libzstd.a -o main
+	$(CC) $(CFLAGS) main.o io.o random.o libzstd.a -o main
 
 main.o: main.c
-	gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+	$(CC) $(CFLAGS) $(INCLUDES) -c main.c
 
 random.o: random.c
-	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
+	$(CC) $(CFLAGS) $(INCLUDES) -c random.c
 
-io.o: io.c $(PROGRAM_FILES)
-	gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+io.o: io.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c io.c
 
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
@@ -44,8 +47,6 @@ testshell: test.sh
 
 .PHONY: clean
 clean:
-	rm -f libzstd.a main
-	rm -f ../../lib/*/*.o
-	rm -f ../../programs/*.o
-	rm -f *.o
+	rm -f *.o main libzstd.a
+	$(MAKE) -C ../../lib clean
 	echo "Cleaning is completed"

From 5624f3f1eabf84d603ca5607f59e6aa286d13211 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Thu, 19 Jul 2018 14:35:27 -0700
Subject: [PATCH 18/35] Revert "attempt to re-enable arm64 tests"

This reverts commit 9c277f137cbcaa385ff5b95ec4cbdce50675541d.
---
 .travis.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 80406064..71b27019 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,7 +25,12 @@ matrix:
     - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest'
 
     - env: Cmd='make arminstall && make armfuzz'
-    - env: Cmd='make arminstall && make aarch64fuzz'
+
+# Following test is disabled, as there is a bug in Travis' ld
+# preventing aarch64 compilation to complete.
+# > collect2: error: ld terminated with signal 11 [Segmentation fault], core dumped
+# to be re-enabled in a few commit, as it's possible that a random code change circumvent the ld bug
+#    - env: Cmd='make arminstall && make aarch64fuzz'
 
     - env: Cmd='make ppcinstall && make ppcfuzz'
     - env: Cmd='make ppcinstall && make ppc64fuzz'

From 470c8d42f4bbc8246bcd0bc8438aaad6d1c375ee Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 20 Jul 2018 11:32:39 -0700
Subject: [PATCH 19/35] Benchmark dictionary builders

---
 contrib/benchmarkDictBuilder/Makefile      |  44 ++
 contrib/benchmarkDictBuilder/README.md     |  43 ++
 contrib/benchmarkDictBuilder/benchmark.c   | 458 +++++++++++++++++++++
 contrib/benchmarkDictBuilder/dictBuilder.h |  10 +
 contrib/benchmarkDictBuilder/test.sh       |   2 +
 contrib/randomDictBuilder/io.c             |   2 +-
 contrib/randomDictBuilder/io.h             |   4 +
 7 files changed, 562 insertions(+), 1 deletion(-)
 create mode 100644 contrib/benchmarkDictBuilder/Makefile
 create mode 100644 contrib/benchmarkDictBuilder/README.md
 create mode 100644 contrib/benchmarkDictBuilder/benchmark.c
 create mode 100644 contrib/benchmarkDictBuilder/dictBuilder.h
 create mode 100644 contrib/benchmarkDictBuilder/test.sh

diff --git a/contrib/benchmarkDictBuilder/Makefile b/contrib/benchmarkDictBuilder/Makefile
new file mode 100644
index 00000000..d36d96d5
--- /dev/null
+++ b/contrib/benchmarkDictBuilder/Makefile
@@ -0,0 +1,44 @@
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../randomDictBuilder -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+
+RANDOM_FILE := ../randomDictBuilder/random.c
+IO_FILE := ../randomDictBuilder/io.c
+
+all: run clean
+
+.PHONY: run
+run: benchmark
+	echo "Benchmarking with $(ARG)"
+	./benchmark $(ARG)
+
+.PHONY: test
+test: benchmarkTest clean
+
+.PHONY: benchmarkTest
+benchmarkTest: benchmark test.sh
+	sh test.sh
+
+benchmark: benchmark.o io.o random.o libzstd.a
+	$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark
+
+benchmark.o: benchmark.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
+
+random.o: $(RANDOM_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)
+
+io.o: $(IO_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
+
+libzstd.a:
+	$(MAKE) -C ../../lib libzstd.a
+	mv ../../lib/libzstd.a .
+
+.PHONY: clean
+clean:
+	rm -f *.o benchmark libzstd.a
+	$(MAKE) -C ../../lib clean
+	echo "Cleaning is completed"
diff --git a/contrib/benchmarkDictBuilder/README.md b/contrib/benchmarkDictBuilder/README.md
new file mode 100644
index 00000000..b680a53c
--- /dev/null
+++ b/contrib/benchmarkDictBuilder/README.md
@@ -0,0 +1,43 @@
+Benchmarking Dictionary Builder
+
+### Permitted Argument:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+
+###Running Test:
+make test
+
+###Usage:
+Benchmark given input files: make ARG= followed by permitted arguments
+
+### Examples:
+make ARG="in=../../lib/dictBuilder in=../../lib/compress"
+
+###Benchmarking Result:
+
+github:
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| random        | 0.182254      |  8.786957          |
+| cover         | 34.821007     |  10.430999         |
+| legacy        | 1.125494      |  8.989482          |
+
+hg-commands
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| random        | 0.089231      |  3.489515          |
+| cover         | 32.342462     |  4.030274          |
+| legacy        | 1.066594      |  3.911896          |
+
+hg-manifest
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| random        | 1.095083      |  2.309485          |
+| cover         | 517.999132    |  2.575331          |
+| legacy        | 10.789509     |  2.506775          |
+
+hg-changelog
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| random        | 0.639630      |  2.096785          |
+| cover         | 121.398023    |  2.175706          |
+| legacy        | 3.050893      |  2.058273          |
diff --git a/contrib/benchmarkDictBuilder/benchmark.c b/contrib/benchmarkDictBuilder/benchmark.c
new file mode 100644
index 00000000..aabd96a0
--- /dev/null
+++ b/contrib/benchmarkDictBuilder/benchmark.c
@@ -0,0 +1,458 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include <time.h>
+#include "random.h"
+#include "dictBuilder.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+/*-*************************************
+*  Constants
+***************************************/
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define MEMMULT 11
+#define NOISELENGTH 32
+
+/*-*************************************
+*  Struct
+***************************************/
+typedef struct {
+  const void* dictBuffer;
+  size_t dictSize;
+} dictInfo;
+
+
+/*-*************************************
+*  Commandline related functions
+***************************************/
+static unsigned readU32FromChar(const char** stringPtr){
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) exit(1);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) exit(1);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) exit(1);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+/** longCommandWArg() :
+ *  check if *stringPtr is the same as longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
+static void fillNoise(void* buffer, size_t length)
+{
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned acc = prime1;
+    size_t p=0;;
+
+    for (p=0; p<length; p++) {
+        acc *= prime2;
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
+    }
+}
+
+/*-*************************************
+* Dictionary related operations
+***************************************/
+/** createDictFromFiles() :
+ *  Based on type of param given, train dictionary using the corresponding algorithm
+ *  @return dictInfo containing dictionary buffer and dictionary size
+ */
+dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
+                  ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
+                  ZDICT_legacy_params_t *legacyParams) {
+    unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
+                        coverParams ? coverParams->zParams.notificationLevel :
+                        legacyParams ? legacyParams->zParams.notificationLevel :
+                        0;   /* should never happen */
+    void* const dictBuffer = malloc(maxDictSize);
+
+    dictInfo* dInfo;
+
+    /* Checks */
+    if (!dictBuffer)
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    {   size_t dictSize;
+        if(randomParams) {
+          dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                               info->samplesSizes, info->nbSamples, *randomParams);
+        }else if(coverParams) {
+          dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, coverParams);
+        } else {
+          size_t totalSize= 0;
+          for (int i = 0; i < info->nbSamples; i++) {
+            totalSize += info->samplesSizes[i];
+          }
+          size_t const maxMem = findMaxMem(totalSize * MEMMULT) / MEMMULT;
+          size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, totalSize);
+          fillNoise((char*)(info->srcBuffer) + loadedSize, NOISELENGTH);
+          dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize, info->srcBuffer,
+                                               info->samplesSizes, info->nbSamples, *legacyParams);
+        }
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            free(dictBuffer);
+            freeSampleInfo(info);
+            return dInfo;
+        }
+        dInfo = (dictInfo *)malloc(sizeof(dictInfo));
+        dInfo->dictBuffer = dictBuffer;
+        dInfo->dictSize = dictSize;
+    }
+    return dInfo;
+}
+
+
+/** compressWithDict() :
+ *  Compress samples from sample buffer given dicionary stored on dictionary buffer and compression level
+ *  @return compression ratio
+ */
+double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) {
+  /* Local variables */
+  size_t totalCompressedSize = 0;
+  size_t totalOriginalSize = 0;
+  double cRatio;
+  size_t dstCapacity;
+  int i;
+
+  /* Pointers */
+  ZSTD_CCtx* cctx;
+  ZSTD_CDict *cdict;
+  size_t *offsets;
+  void* dst;
+
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    for (int i = 0; i < srcInfo->nbSamples; i++) {
+      maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
+
+  if(!cctx || !cdict || !dst) {
+    cRatio = -1;
+    goto _cleanup;
+  }
+
+  /* Calculate offset for each sample */
+  offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t));
+  offsets[0] = 0;
+  for (i = 1; i <= srcInfo->nbSamples; i++) {
+    offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1];
+  }
+
+  /* Compress each sample and sum their sizes*/
+  const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer;
+  for (i = 0; i < srcInfo->nbSamples; i++) {
+    const size_t compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    if (ZSTD_isError(compressedSize)) {
+      cRatio = -1;
+      goto _cleanup;
+    }
+    totalCompressedSize += compressedSize;
+  }
+
+  /* Sum orignal sizes */
+  for (i = 0; i<srcInfo->nbSamples; i++) {
+    totalOriginalSize += srcInfo->samplesSizes[i];
+  }
+
+  /* Calculate compression ratio */
+  DISPLAYLEVEL(2, "original size is %lu\n", totalOriginalSize);
+  DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize);
+  cRatio = (double)totalOriginalSize/(double)totalCompressedSize;
+
+_cleanup:
+  if(dst) {
+    free(dst);
+  }
+  if(offsets) {
+    free(offsets);
+  }
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  return cRatio;
+}
+
+
+/** FreeDictInfo() :
+ *  Free memory allocated for dictInfo
+ */
+void freeDictInfo(dictInfo* info) {
+  if (!info) return;
+  if (info->dictBuffer) free((void*)(info->dictBuffer));
+  free(info);
+}
+
+
+
+/*-********************************************************
+  *  Benchmarking functions
+**********************************************************/
+/** benchmarkRandom() :
+ *  Measure how long random dictionary builder takes and compression ratio with the random dictionary
+ *  @return 0 if benchmark successfully, 1 otherwise
+ */
+int benchmarkRandom(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam) {
+  const int displayLevel = randomParam->zParams.notificationLevel;
+  int result = 0;
+  clock_t t;
+  t = clock();
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, NULL, NULL);
+  t = clock() - t;
+  double time_taken = ((double)t)/CLOCKS_PER_SEC;
+  if (!dInfo) {
+    DISPLAYLEVEL(1, "RANDOM does not train successfully\n");
+    result = 1;
+    goto _cleanup;
+  }
+  DISPLAYLEVEL(2, "RANDOM took %f seconds to execute \n", time_taken);
+
+  double cRatio = compressWithDict(srcInfo, dInfo, randomParam->zParams.compressionLevel, displayLevel);
+  if (cRatio < 0) {
+    DISPLAYLEVEL(1, "Compressing with RANDOM dictionary does not work\n");
+    result = 1;
+    goto _cleanup;
+  }
+  DISPLAYLEVEL(2, "Compression ratio with random dictionary is %f\n", cRatio);
+
+
+_cleanup:
+  freeDictInfo(dInfo);
+  return result;
+}
+
+/** benchmarkCover() :
+ *  Measure how long random dictionary builder takes and compression ratio with the cover dictionary
+ *  @return 0 if benchmark successfully, 1 otherwise
+ */
+int benchmarkCover(sampleInfo *srcInfo, unsigned maxDictSize,
+                ZDICT_cover_params_t *coverParam) {
+  const int displayLevel = coverParam->zParams.notificationLevel;
+  int result = 0;
+  clock_t t;
+  t = clock();
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, coverParam, NULL);
+  t = clock() - t;
+  double time_taken = ((double)t)/CLOCKS_PER_SEC;
+  if (!dInfo) {
+    DISPLAYLEVEL(1, "COVER does not train successfully\n");
+    result = 1;
+    goto _cleanup;
+  }
+  DISPLAYLEVEL(2, "COVER took %f seconds to execute \n", time_taken);
+
+  double cRatio = compressWithDict(srcInfo, dInfo, coverParam->zParams.compressionLevel, displayLevel);
+  if (cRatio < 0) {
+    DISPLAYLEVEL(1, "Compressing with COVER dictionary does not work\n");
+    result = 1;
+    goto _cleanup;
+  }
+  DISPLAYLEVEL(2, "Compression ratio with cover dictionary is %f\n", cRatio);
+
+_cleanup:
+  freeDictInfo(dInfo);
+  return result;
+}
+
+
+
+/** benchmarkLegacy() :
+ *  Measure how long legacy dictionary builder takes and compression ratio with the legacy dictionary
+ *  @return 0 if benchmark successfully, 1 otherwise
+ */
+int benchmarkLegacy(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_legacy_params_t *legacyParam) {
+  const int displayLevel = legacyParam->zParams.notificationLevel;
+  int result = 0;
+  clock_t t;
+  t = clock();
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, NULL, legacyParam);
+  t = clock() - t;
+  double time_taken = ((double)t)/CLOCKS_PER_SEC;
+  if (!dInfo) {
+    DISPLAYLEVEL(1, "LEGACY does not train successfully\n");
+    result = 1;
+    goto _cleanup;
+
+  }
+  DISPLAYLEVEL(2, "LEGACY took %f seconds to execute \n", time_taken);
+
+  double cRatio = compressWithDict(srcInfo, dInfo, legacyParam->zParams.compressionLevel, displayLevel);
+  if (cRatio < 0) {
+    DISPLAYLEVEL(1, "Compressing with LEGACY dictionary does not work\n");
+    result = 1;
+    goto _cleanup;
+
+  }
+  DISPLAYLEVEL(2, "Compression ratio with legacy dictionary is %f\n", cRatio);
+
+_cleanup:
+  freeDictInfo(dInfo);
+  return result;
+}
+
+
+
+int main(int argCount, const char* argv[])
+{
+  int displayLevel = 2;
+  const char* programName = argv[0];
+  int result = 0;
+  /* Initialize arguments to default values */
+  unsigned k = 200;
+  unsigned d = 6;
+  unsigned cLevel = 3;
+  unsigned dictID = 0;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  /* Initialize table to store input files */
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0;
+  const char** extendedFileList = NULL;
+
+  /* Parse arguments */
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "in=")) {
+      filenameTable[filenameIdx] = argument;
+      filenameIdx++;
+      continue;
+    }
+    DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n");
+    return 1;
+  }
+
+
+  /* Get the list of all files recursively (because followLinks==0)*/
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
+  if (extendedFileList) {
+    unsigned u;
+    for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+    free((void*)filenameTable);
+    filenameTable = extendedFileList;
+    filenameIdx = fileNamesNb;
+  }
+
+  size_t blockSize = 0;
+  sampleInfo* srcInfo= getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, displayLevel);
+
+  /* set up zParams */
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = cLevel;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+
+  /* for random */
+  ZDICT_random_params_t randomParam;
+  randomParam.zParams = zParams;
+  randomParam.k = k;
+  int randomResult = benchmarkRandom(srcInfo, maxDictSize, &randomParam);
+  if(randomResult) {
+    result = 1;
+    goto _cleanup;
+  }
+
+  /* for cover */
+  ZDICT_cover_params_t coverParam;
+  memset(&coverParam, 0, sizeof(coverParam));
+  coverParam.zParams = zParams;
+  coverParam.splitPoint = 1.0;
+  coverParam.d = d;
+  coverParam.steps = 40;
+  coverParam.nbThreads = 1;
+  int coverOptResult = benchmarkCover(srcInfo, maxDictSize, &coverParam);
+  if(coverOptResult) {
+    result = 1;
+    goto _cleanup;
+  }
+
+  /* for legacy */
+  ZDICT_legacy_params_t legacyParam;
+  legacyParam.zParams = zParams;
+  legacyParam.selectivityLevel = 9;
+  int legacyResult = benchmarkLegacy(srcInfo, maxDictSize, &legacyParam);
+  if(legacyResult) {
+    result = 1;
+    goto _cleanup;
+  }
+
+  /* Free allocated memory */
+_cleanup:
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(srcInfo);
+  return result;
+}
diff --git a/contrib/benchmarkDictBuilder/dictBuilder.h b/contrib/benchmarkDictBuilder/dictBuilder.h
new file mode 100644
index 00000000..a2dae576
--- /dev/null
+++ b/contrib/benchmarkDictBuilder/dictBuilder.h
@@ -0,0 +1,10 @@
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
+    Strictly Internal use only !!
+    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
+    `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code.
+*/
+size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                           ZDICT_legacy_params_t parameters);
diff --git a/contrib/benchmarkDictBuilder/test.sh b/contrib/benchmarkDictBuilder/test.sh
new file mode 100644
index 00000000..6354784e
--- /dev/null
+++ b/contrib/benchmarkDictBuilder/test.sh
@@ -0,0 +1,2 @@
+echo "Benchmark with in=../../lib/common"
+./benchmark in=../../lib/common
diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
index 67c40858..1217b574 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/randomDictBuilder/io.c
@@ -139,7 +139,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
 /*-********************************************************
 *  Dictionary training functions
 **********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem) {
+size_t findMaxMem(unsigned long long requiredMem) {
     size_t const step = 8 MB;
     void* testmem = NULL;
 
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
index 55967f76..e2f454c2 100644
--- a/contrib/randomDictBuilder/io.h
+++ b/contrib/randomDictBuilder/io.h
@@ -48,3 +48,7 @@ void freeSampleInfo(sampleInfo *info);
  *  Save data stored on buff to dictFileName
  */
 void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
+
+
+
+size_t findMaxMem(unsigned long long requiredMem);

From 71e767ac0989c177e970970b8dbe3fa85ff4912f Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 20 Jul 2018 17:03:47 -0700
Subject: [PATCH 20/35] Refactoring and benchmark without dictionary

---
 contrib/benchmarkDictBuilder/README.md        |  43 ---
 contrib/benchmarkDictBuilder/dictBuilder.h    |  10 -
 .../benchmarkDictBuilder/Makefile             |   8 +-
 .../benchmarkDictBuilder/README.md            |  47 +++
 .../benchmarkDictBuilder/benchmark.c          | 322 +++++++-----------
 .../benchmarkDictBuilder/dictBuilder.h        |   6 +
 .../benchmarkDictBuilder/test.sh              |   2 +-
 .../randomDictBuilder/Makefile                |  10 +-
 .../randomDictBuilder/README.md               |   4 +-
 .../randomDictBuilder/io.c                    |  33 ++
 .../randomDictBuilder/io.h                    |   8 +-
 .../randomDictBuilder/main.c                  |  40 ---
 .../randomDictBuilder/random.c                |   0
 .../randomDictBuilder/random.h                |   0
 .../randomDictBuilder/test.sh                 |  12 +-
 15 files changed, 232 insertions(+), 313 deletions(-)
 delete mode 100644 contrib/benchmarkDictBuilder/README.md
 delete mode 100644 contrib/benchmarkDictBuilder/dictBuilder.h
 rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/Makefile (76%)
 create mode 100644 contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
 rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/benchmark.c (53%)
 create mode 100644 contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
 rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/test.sh (54%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/Makefile (79%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/README.md (85%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/io.c (89%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/io.h (78%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/main.c (79%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/random.c (100%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/random.h (100%)
 rename contrib/{ => experimental_dict_builders}/randomDictBuilder/test.sh (52%)

diff --git a/contrib/benchmarkDictBuilder/README.md b/contrib/benchmarkDictBuilder/README.md
deleted file mode 100644
index b680a53c..00000000
--- a/contrib/benchmarkDictBuilder/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-Benchmarking Dictionary Builder
-
-### Permitted Argument:
-Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
-
-###Running Test:
-make test
-
-###Usage:
-Benchmark given input files: make ARG= followed by permitted arguments
-
-### Examples:
-make ARG="in=../../lib/dictBuilder in=../../lib/compress"
-
-###Benchmarking Result:
-
-github:
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.182254      |  8.786957          |
-| cover         | 34.821007     |  10.430999         |
-| legacy        | 1.125494      |  8.989482          |
-
-hg-commands
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.089231      |  3.489515          |
-| cover         | 32.342462     |  4.030274          |
-| legacy        | 1.066594      |  3.911896          |
-
-hg-manifest
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 1.095083      |  2.309485          |
-| cover         | 517.999132    |  2.575331          |
-| legacy        | 10.789509     |  2.506775          |
-
-hg-changelog
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.639630      |  2.096785          |
-| cover         | 121.398023    |  2.175706          |
-| legacy        | 3.050893      |  2.058273          |
diff --git a/contrib/benchmarkDictBuilder/dictBuilder.h b/contrib/benchmarkDictBuilder/dictBuilder.h
deleted file mode 100644
index a2dae576..00000000
--- a/contrib/benchmarkDictBuilder/dictBuilder.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*! ZDICT_trainFromBuffer_unsafe_legacy() :
-    Strictly Internal use only !!
-    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
-    `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code.
-*/
-size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
-                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                           ZDICT_legacy_params_t parameters);
diff --git a/contrib/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
similarity index 76%
rename from contrib/benchmarkDictBuilder/Makefile
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
index d36d96d5..72ce04f2 100644
--- a/contrib/benchmarkDictBuilder/Makefile
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
@@ -2,7 +2,7 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../randomDictBuilder -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
 RANDOM_FILE := ../randomDictBuilder/random.c
 IO_FILE := ../randomDictBuilder/io.c
@@ -34,11 +34,11 @@ io.o: $(IO_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
 
 libzstd.a:
-	$(MAKE) -C ../../lib libzstd.a
-	mv ../../lib/libzstd.a .
+	$(MAKE) -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
 
 .PHONY: clean
 clean:
 	rm -f *.o benchmark libzstd.a
-	$(MAKE) -C ../../lib clean
+	$(MAKE) -C ../../../lib clean
 	echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
new file mode 100644
index 00000000..de783a0e
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -0,0 +1,47 @@
+Benchmarking Dictionary Builder
+
+### Permitted Argument:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+
+###Running Test:
+make test
+
+###Usage:
+Benchmark given input files: make ARG= followed by permitted arguments
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
+
+###Benchmarking Result:
+
+github:
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000004      |  2.999642          |
+| random        | 0.180238      |  8.786957          |
+| cover         | 33.891987     |  10.430999         |
+| legacy        | 1.077569      |  8.989482          |
+
+hg-commands
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000006      |  2.425291          |
+| random        | 0.088735      |  3.489515          |
+| cover         | 35.447300     |  4.030274          |
+| legacy        | 1.048509      |  3.911896          |
+
+hg-manifest
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000005      |  1.866385          |
+| random        | 1.148231      |  2.309485          |
+| cover         | 509.685257    |  2.575331          |
+| legacy        | 10.705866     |  2.506775          |
+
+hg-changelog
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000005      |  1.377613          |
+| random        | 0.706434      |  2.096785          |
+| cover         | 122.815783    |  2.175706          |
+| legacy        | 3.010318      |  2.058273          |
diff --git a/contrib/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
similarity index 53%
rename from contrib/benchmarkDictBuilder/benchmark.c
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index aabd96a0..890afb8b 100644
--- a/contrib/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -44,12 +44,14 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     exit(error);                                                          \
 }
 
+
 /*-*************************************
 *  Constants
 ***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
-#define MEMMULT 11
-#define NOISELENGTH 32
+#define DEFAULT_CLEVEL 3
+#define DEFAULT_DISPLAYLEVEL 2
+
 
 /*-*************************************
 *  Struct
@@ -60,57 +62,6 @@ typedef struct {
 } dictInfo;
 
 
-/*-*************************************
-*  Commandline related functions
-***************************************/
-static unsigned readU32FromChar(const char** stringPtr){
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) exit(1);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) exit(1);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) exit(1);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-/** longCommandWArg() :
- *  check if *stringPtr is the same as longCommand.
- *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}
-
-static void fillNoise(void* buffer, size_t length)
-{
-    unsigned const prime1 = 2654435761U;
-    unsigned const prime2 = 2246822519U;
-    unsigned acc = prime1;
-    size_t p=0;;
-
-    for (p=0; p<length; p++) {
-        acc *= prime2;
-        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
-    }
-}
-
 /*-*************************************
 * Dictionary related operations
 ***************************************/
@@ -122,9 +73,9 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
                   ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
                   ZDICT_legacy_params_t *legacyParams) {
     unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
-                        coverParams ? coverParams->zParams.notificationLevel :
-                        legacyParams ? legacyParams->zParams.notificationLevel :
-                        0;   /* should never happen */
+                                  coverParams ? coverParams->zParams.notificationLevel :
+                                  legacyParams ? legacyParams->zParams.notificationLevel :
+                                  DEFAULT_DISPLAYLEVEL;   /* no dict */
     void* const dictBuffer = malloc(maxDictSize);
 
     dictInfo* dInfo;
@@ -140,21 +91,15 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
         }else if(coverParams) {
           dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
                                                 info->samplesSizes, info->nbSamples, coverParams);
-        } else {
-          size_t totalSize= 0;
-          for (int i = 0; i < info->nbSamples; i++) {
-            totalSize += info->samplesSizes[i];
-          }
-          size_t const maxMem = findMaxMem(totalSize * MEMMULT) / MEMMULT;
-          size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, totalSize);
-          fillNoise((char*)(info->srcBuffer) + loadedSize, NOISELENGTH);
-          dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize, info->srcBuffer,
+        } else if(legacyParams) {
+          dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
                                                info->samplesSizes, info->nbSamples, *legacyParams);
+        } else {
+          dictSize = 0;
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             free(dictBuffer);
-            freeSampleInfo(info);
             return dInfo;
         }
         dInfo = (dictInfo *)malloc(sizeof(dictInfo));
@@ -173,6 +118,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   /* Local variables */
   size_t totalCompressedSize = 0;
   size_t totalOriginalSize = 0;
+  unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
   double cRatio;
   size_t dstCapacity;
   int i;
@@ -193,15 +139,6 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     dst = malloc(dstCapacity);
   }
 
-  /* Create the cctx and cdict */
-  cctx = ZSTD_createCCtx();
-  cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
-
-  if(!cctx || !cdict || !dst) {
-    cRatio = -1;
-    goto _cleanup;
-  }
-
   /* Calculate offset for each sample */
   offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t));
   offsets[0] = 0;
@@ -209,13 +146,35 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1];
   }
 
+  /* Create the cctx */
+  cctx = ZSTD_createCCtx();
+  if(!cctx || !dst) {
+    cRatio = -1;
+    goto _nodictCleanup;
+  }
+
+  /* Create CDict if there's a dictionary stored on buffer */
+  if (hasDict) {
+    cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
+    if(!cdict) {
+      cRatio = -1;
+      goto _dictCleanup;
+    }
+  }
+
   /* Compress each sample and sum their sizes*/
   const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer;
   for (i = 0; i < srcInfo->nbSamples; i++) {
-    const size_t compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    size_t compressedSize;
+    if(hasDict) {
+      compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    } else {
+      compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel);
+    }
     if (ZSTD_isError(compressedSize)) {
       cRatio = -1;
-      goto _cleanup;
+      if(hasDict) goto _dictCleanup;
+      else goto _nodictCleanup;
     }
     totalCompressedSize += compressedSize;
   }
@@ -230,15 +189,14 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize);
   cRatio = (double)totalOriginalSize/(double)totalCompressedSize;
 
-_cleanup:
-  if(dst) {
-    free(dst);
-  }
-  if(offsets) {
-    free(offsets);
-  }
-  ZSTD_freeCCtx(cctx);
+_dictCleanup:
   ZSTD_freeCDict(cdict);
+
+_nodictCleanup:
+  free(dst);
+  free(offsets);
+  ZSTD_freeCCtx(cctx);
+
   return cRatio;
 }
 
@@ -257,102 +215,48 @@ void freeDictInfo(dictInfo* info) {
 /*-********************************************************
   *  Benchmarking functions
 **********************************************************/
-/** benchmarkRandom() :
- *  Measure how long random dictionary builder takes and compression ratio with the random dictionary
+/** benchmarkDictBuilder() :
+ *  Measure how long a dictionary builder takes and compression ratio with the dictionary built
  *  @return 0 if benchmark successfully, 1 otherwise
  */
-int benchmarkRandom(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam) {
-  const int displayLevel = randomParam->zParams.notificationLevel;
+int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam,
+                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) {
+  /* Local variables */
+  const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel :
+                                coverParam ? coverParam->zParams.notificationLevel :
+                                legacyParam ? legacyParam->zParams.notificationLevel :
+                                DEFAULT_DISPLAYLEVEL;   /* no dict */
+  const char* name = randomParam ? "RANDOM" :
+                    coverParam ? "COVER" :
+                    legacyParam ? "LEGACY" :
+                    "NODICT";    /* no dict */
+  const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel :
+                          coverParam ? coverParam->zParams.compressionLevel :
+                          legacyParam ? legacyParam->zParams.compressionLevel :
+                          DEFAULT_CLEVEL;   /* no dict */
   int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, NULL, NULL);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
+
+  /* Calculate speed */
+  const UTIL_time_t begin = UTIL_getTime();
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam);
+  const U64 timeMicro = UTIL_clockSpanMicro(begin);
+  const double timeSec = timeMicro / (double)SEC_TO_MICRO;
   if (!dInfo) {
-    DISPLAYLEVEL(1, "RANDOM does not train successfully\n");
+    DISPLAYLEVEL(1, "%s does not train successfully\n", name);
     result = 1;
     goto _cleanup;
   }
-  DISPLAYLEVEL(2, "RANDOM took %f seconds to execute \n", time_taken);
+  DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec);
 
-  double cRatio = compressWithDict(srcInfo, dInfo, randomParam->zParams.compressionLevel, displayLevel);
+  /* Calculate compression ratio */
+  double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel);
   if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with RANDOM dictionary does not work\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "Compression ratio with random dictionary is %f\n", cRatio);
-
-
-_cleanup:
-  freeDictInfo(dInfo);
-  return result;
-}
-
-/** benchmarkCover() :
- *  Measure how long random dictionary builder takes and compression ratio with the cover dictionary
- *  @return 0 if benchmark successfully, 1 otherwise
- */
-int benchmarkCover(sampleInfo *srcInfo, unsigned maxDictSize,
-                ZDICT_cover_params_t *coverParam) {
-  const int displayLevel = coverParam->zParams.notificationLevel;
-  int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, coverParam, NULL);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
-  if (!dInfo) {
-    DISPLAYLEVEL(1, "COVER does not train successfully\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "COVER took %f seconds to execute \n", time_taken);
-
-  double cRatio = compressWithDict(srcInfo, dInfo, coverParam->zParams.compressionLevel, displayLevel);
-  if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with COVER dictionary does not work\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "Compression ratio with cover dictionary is %f\n", cRatio);
-
-_cleanup:
-  freeDictInfo(dInfo);
-  return result;
-}
-
-
-
-/** benchmarkLegacy() :
- *  Measure how long legacy dictionary builder takes and compression ratio with the legacy dictionary
- *  @return 0 if benchmark successfully, 1 otherwise
- */
-int benchmarkLegacy(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_legacy_params_t *legacyParam) {
-  const int displayLevel = legacyParam->zParams.notificationLevel;
-  int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, NULL, legacyParam);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
-  if (!dInfo) {
-    DISPLAYLEVEL(1, "LEGACY does not train successfully\n");
+    DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name);
     result = 1;
     goto _cleanup;
 
   }
-  DISPLAYLEVEL(2, "LEGACY took %f seconds to execute \n", time_taken);
-
-  double cRatio = compressWithDict(srcInfo, dInfo, legacyParam->zParams.compressionLevel, displayLevel);
-  if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with LEGACY dictionary does not work\n");
-    result = 1;
-    goto _cleanup;
-
-  }
-  DISPLAYLEVEL(2, "Compression ratio with legacy dictionary is %f\n", cRatio);
+  DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio);
 
 _cleanup:
   freeDictInfo(dInfo);
@@ -363,15 +267,16 @@ _cleanup:
 
 int main(int argCount, const char* argv[])
 {
-  int displayLevel = 2;
+  const int displayLevel = DEFAULT_DISPLAYLEVEL;
   const char* programName = argv[0];
   int result = 0;
+
   /* Initialize arguments to default values */
-  unsigned k = 200;
-  unsigned d = 6;
-  unsigned cLevel = 3;
-  unsigned dictID = 0;
-  unsigned maxDictSize = g_defaultMaxDictSize;
+  const unsigned k = 200;
+  const unsigned d = 6;
+  const unsigned cLevel = DEFAULT_CLEVEL;
+  const unsigned dictID = 0;
+  const unsigned maxDictSize = g_defaultMaxDictSize;
 
   /* Initialize table to store input files */
   const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
@@ -379,7 +284,7 @@ int main(int argCount, const char* argv[])
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
-  int followLinks = 0;
+  const int followLinks = 0;
   const char** extendedFileList = NULL;
 
   /* Parse arguments */
@@ -394,7 +299,6 @@ int main(int argCount, const char* argv[])
     return 1;
   }
 
-
   /* Get the list of all files recursively (because followLinks==0)*/
   extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
                                         &fileNamesNb, followLinks);
@@ -406,6 +310,7 @@ int main(int argCount, const char* argv[])
     filenameIdx = fileNamesNb;
   }
 
+  /* get sampleInfo */
   size_t blockSize = 0;
   sampleInfo* srcInfo= getSampleInfo(filenameTable,
                     filenameIdx, blockSize, maxDictSize, displayLevel);
@@ -416,38 +321,53 @@ int main(int argCount, const char* argv[])
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
 
+  /* with no dict */
+  {
+    const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL);
+    if(noDictResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
   /* for random */
-  ZDICT_random_params_t randomParam;
-  randomParam.zParams = zParams;
-  randomParam.k = k;
-  int randomResult = benchmarkRandom(srcInfo, maxDictSize, &randomParam);
-  if(randomResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_random_params_t randomParam;
+    randomParam.zParams = zParams;
+    randomParam.k = k;
+    const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL);
+    if(randomResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* for cover */
-  ZDICT_cover_params_t coverParam;
-  memset(&coverParam, 0, sizeof(coverParam));
-  coverParam.zParams = zParams;
-  coverParam.splitPoint = 1.0;
-  coverParam.d = d;
-  coverParam.steps = 40;
-  coverParam.nbThreads = 1;
-  int coverOptResult = benchmarkCover(srcInfo, maxDictSize, &coverParam);
-  if(coverOptResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_cover_params_t coverParam;
+    memset(&coverParam, 0, sizeof(coverParam));
+    coverParam.zParams = zParams;
+    coverParam.splitPoint = 1.0;
+    coverParam.d = d;
+    coverParam.steps = 40;
+    coverParam.nbThreads = 1;
+    const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL);
+    if(coverOptResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* for legacy */
-  ZDICT_legacy_params_t legacyParam;
-  legacyParam.zParams = zParams;
-  legacyParam.selectivityLevel = 9;
-  int legacyResult = benchmarkLegacy(srcInfo, maxDictSize, &legacyParam);
-  if(legacyResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_legacy_params_t legacyParam;
+    legacyParam.zParams = zParams;
+    legacyParam.selectivityLevel = 9;
+    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam);
+    if(legacyResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* Free allocated memory */
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
new file mode 100644
index 00000000..781ec8c2
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
@@ -0,0 +1,6 @@
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                    ZDICT_legacy_params_t params);
diff --git a/contrib/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
similarity index 54%
rename from contrib/benchmarkDictBuilder/test.sh
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
index 6354784e..5eaf5930 100644
--- a/contrib/benchmarkDictBuilder/test.sh
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
@@ -1,2 +1,2 @@
 echo "Benchmark with in=../../lib/common"
-./benchmark in=../../lib/common
+./benchmark in=../../../lib/common
diff --git a/contrib/randomDictBuilder/Makefile b/contrib/experimental_dict_builders/randomDictBuilder/Makefile
similarity index 79%
rename from contrib/randomDictBuilder/Makefile
rename to contrib/experimental_dict_builders/randomDictBuilder/Makefile
index 5f9240bf..bbd40e47 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/experimental_dict_builders/randomDictBuilder/Makefile
@@ -2,9 +2,9 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+INCLUDES := -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
-TEST_INPUT := ../../lib
+TEST_INPUT := ../../../lib
 TEST_OUTPUT := randomDict
 
 all: main run clean
@@ -30,8 +30,8 @@ io.o: io.c
 	$(CC) $(CFLAGS) $(INCLUDES) -c io.c
 
 libzstd.a:
-	$(MAKE) -C ../../lib libzstd.a
-	mv ../../lib/libzstd.a .
+	$(MAKE) -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
 
 .PHONY: testrun
 testrun: main
@@ -48,5 +48,5 @@ testshell: test.sh
 .PHONY: clean
 clean:
 	rm -f *.o main libzstd.a
-	$(MAKE) -C ../../lib clean
+	$(MAKE) -C ../../../lib clean
 	echo "Cleaning is completed"
diff --git a/contrib/randomDictBuilder/README.md b/contrib/experimental_dict_builders/randomDictBuilder/README.md
similarity index 85%
rename from contrib/randomDictBuilder/README.md
rename to contrib/experimental_dict_builders/randomDictBuilder/README.md
index 0e70d3dc..da12a428 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/randomDictBuilder/README.md
@@ -16,5 +16,5 @@ To build a random dictionary with the provided arguments: make ARG= followed by
 
 
 ### Examples:
-make ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
-make ARG="in=../../lib/dictBuilder in=../../lib/compress"
+make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
diff --git a/contrib/randomDictBuilder/io.c b/contrib/experimental_dict_builders/randomDictBuilder/io.c
similarity index 89%
rename from contrib/randomDictBuilder/io.c
rename to contrib/experimental_dict_builders/randomDictBuilder/io.c
index 1217b574..bfe39eae 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/experimental_dict_builders/randomDictBuilder/io.c
@@ -53,6 +53,39 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
 #define NOISELENGTH 32
 
 
+/*-*************************************
+*  Commandline related functions
+***************************************/
+unsigned readU32FromChar(const char** stringPtr){
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) exit(1);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) exit(1);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) exit(1);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
 
 /* ********************************************************
 *  File related operations
diff --git a/contrib/randomDictBuilder/io.h b/contrib/experimental_dict_builders/randomDictBuilder/io.h
similarity index 78%
rename from contrib/randomDictBuilder/io.h
rename to contrib/experimental_dict_builders/randomDictBuilder/io.h
index e2f454c2..0ee24604 100644
--- a/contrib/randomDictBuilder/io.h
+++ b/contrib/experimental_dict_builders/randomDictBuilder/io.h
@@ -50,5 +50,11 @@ void freeSampleInfo(sampleInfo *info);
 void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
 
 
+unsigned readU32FromChar(const char** stringPtr);
 
-size_t findMaxMem(unsigned long long requiredMem);
+/** longCommandWArg() :
+ *  check if *stringPtr is the same as longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand);
diff --git a/contrib/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c
similarity index 79%
rename from contrib/randomDictBuilder/main.c
rename to contrib/experimental_dict_builders/randomDictBuilder/main.c
index 4751a9e1..3f3a6ca7 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c
@@ -52,46 +52,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 
 
 
-/*-*************************************
-*  Commandline related functions
-***************************************/
-static unsigned readU32FromChar(const char** stringPtr){
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) exit(1);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) exit(1);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) exit(1);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-/** longCommandWArg() :
- *  check if *stringPtr is the same as longCommand.
- *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}
-
-
-
 /*-*************************************
 *  RANDOM
 ***************************************/
diff --git a/contrib/randomDictBuilder/random.c b/contrib/experimental_dict_builders/randomDictBuilder/random.c
similarity index 100%
rename from contrib/randomDictBuilder/random.c
rename to contrib/experimental_dict_builders/randomDictBuilder/random.c
diff --git a/contrib/randomDictBuilder/random.h b/contrib/experimental_dict_builders/randomDictBuilder/random.h
similarity index 100%
rename from contrib/randomDictBuilder/random.h
rename to contrib/experimental_dict_builders/randomDictBuilder/random.h
diff --git a/contrib/randomDictBuilder/test.sh b/contrib/experimental_dict_builders/randomDictBuilder/test.sh
similarity index 52%
rename from contrib/randomDictBuilder/test.sh
rename to contrib/experimental_dict_builders/randomDictBuilder/test.sh
index 497820f8..1eb732e5 100644
--- a/contrib/randomDictBuilder/test.sh
+++ b/contrib/experimental_dict_builders/randomDictBuilder/test.sh
@@ -1,12 +1,12 @@
 echo "Building random dictionary with in=../../lib/common k=200 out=dict1"
-./main in=../../lib/common k=200 out=dict1
-zstd -be3 -D dict1 -r ../../lib/common -q
+./main in=../../../lib/common k=200 out=dict1
+zstd -be3 -D dict1 -r ../../../lib/common -q
 echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
-./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
-zstd -be3 -D dict2 -r ../../lib/common -q
+./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../../lib/common -q
 echo "Building random dictionary with 2 sample sources"
-./main in=../../lib/common in=../../lib/compress out=dict3
-zstd -be3 -D dict3 -r ../../lib/common -q
+./main in=../../../lib/common in=../../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../../lib/common -q
 echo "Removing dict1 dict2 dict3"
 rm -f dict1 dict2 dict3
 

From b6c5d4982c489b76b4b0e994c680b1e3bd01080b Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 20 Jul 2018 17:41:22 -0700
Subject: [PATCH 21/35] Minor fix

---
 .../benchmarkDictBuilder/benchmark.c          | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index 890afb8b..64041964 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -78,7 +78,7 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
                                   DEFAULT_DISPLAYLEVEL;   /* no dict */
     void* const dictBuffer = malloc(maxDictSize);
 
-    dictInfo* dInfo;
+    dictInfo* dInfo = NULL;
 
     /* Checks */
     if (!dictBuffer)
@@ -118,16 +118,16 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   /* Local variables */
   size_t totalCompressedSize = 0;
   size_t totalOriginalSize = 0;
-  unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
+  const unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
   double cRatio;
   size_t dstCapacity;
   int i;
 
   /* Pointers */
-  ZSTD_CCtx* cctx;
-  ZSTD_CDict *cdict;
-  size_t *offsets;
-  void* dst;
+  ZSTD_CDict *cdict = NULL;
+  ZSTD_CCtx* cctx = NULL;
+  size_t *offsets = NULL;
+  void* dst = NULL;
 
   /* Allocate dst with enough space to compress the maximum sized sample */
   {
@@ -150,7 +150,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   cctx = ZSTD_createCCtx();
   if(!cctx || !dst) {
     cRatio = -1;
-    goto _nodictCleanup;
+    goto _cleanup;
   }
 
   /* Create CDict if there's a dictionary stored on buffer */
@@ -158,7 +158,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
     if(!cdict) {
       cRatio = -1;
-      goto _dictCleanup;
+      goto _cleanup;
     }
   }
 
@@ -173,8 +173,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     }
     if (ZSTD_isError(compressedSize)) {
       cRatio = -1;
-      if(hasDict) goto _dictCleanup;
-      else goto _nodictCleanup;
+      goto _cleanup;
     }
     totalCompressedSize += compressedSize;
   }
@@ -189,14 +188,11 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize);
   cRatio = (double)totalOriginalSize/(double)totalCompressedSize;
 
-_dictCleanup:
-  ZSTD_freeCDict(cdict);
-
-_nodictCleanup:
+_cleanup:
   free(dst);
   free(offsets);
   ZSTD_freeCCtx(cctx);
-
+  ZSTD_freeCDict(cdict);
   return cRatio;
 }
 
@@ -249,7 +245,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random
   DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec);
 
   /* Calculate compression ratio */
-  double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel);
+  const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel);
   if (cRatio < 0) {
     DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name);
     result = 1;

From 7f3f70f76621f4e488080d27f09614167c7b9a4b Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 25 Jul 2018 16:34:07 -0700
Subject: [PATCH 22/35] Add Fast Cover Dictionary Builder

---
 .../fastCover/Makefile                        |  54 ++
 .../fastCover/README.md                       |  24 +
 .../fastCover/fastCover.c                     | 738 ++++++++++++++++++
 .../fastCover/fastCover.h                     |  47 ++
 .../fastCover/main.c                          | 177 +++++
 .../fastCover/test.sh                         |  14 +
 6 files changed, 1054 insertions(+)
 create mode 100644 contrib/experimental_dict_builders/fastCover/Makefile
 create mode 100644 contrib/experimental_dict_builders/fastCover/README.md
 create mode 100644 contrib/experimental_dict_builders/fastCover/fastCover.c
 create mode 100644 contrib/experimental_dict_builders/fastCover/fastCover.h
 create mode 100644 contrib/experimental_dict_builders/fastCover/main.c
 create mode 100644 contrib/experimental_dict_builders/fastCover/test.sh

diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile
new file mode 100644
index 00000000..9c56013d
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/Makefile
@@ -0,0 +1,54 @@
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../../../programs -I ../randomDictBuilder -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+
+IO_FILE := ../randomDictBuilder/io.c
+
+TEST_INPUT := ../../../lib
+TEST_OUTPUT := fastCoverDict
+
+all: main run clean
+
+.PHONY: test
+test: main testrun testshell clean
+
+.PHONY: run
+run:
+	echo "Building a fastCover dictionary with given arguments"
+	./main $(ARG)
+
+main: main.o io.o fastCover.o libzstd.a
+	$(CC) $(CFLAGS) main.o io.o fastCover.o libzstd.a -o main
+
+main.o: main.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c main.c
+
+fastCover.o: fastCover.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c fastCover.c
+
+io.o: $(IO_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
+
+libzstd.a:
+	$(MAKE) -C ../../../lib libzstd.a
+	mv ../../../lib/libzstd.a .
+
+.PHONY: testrun
+testrun: main
+	echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) "
+	./main in=$(TEST_INPUT) out=$(TEST_OUTPUT)
+	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
+	rm -f $(TEST_OUTPUT)
+
+.PHONY: testshell
+testshell: test.sh
+	sh test.sh
+	echo "Finish running test.sh"
+
+.PHONY: clean
+clean:
+	rm -f *.o main libzstd.a
+	$(MAKE) -C ../../../lib clean
+	echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md
new file mode 100644
index 00000000..088e38be
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/README.md
@@ -0,0 +1,24 @@
+FastCover Dictionary Builder
+
+### Permitted Arguments:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+Output Dictionary (out=dictName): if not provided, default to fastCoverDict
+Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
+Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
+Size of Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
+Size of Dmer (d=#): positive number; in bytes; if not provided, default to 8
+Number of steps (steps=#): positive number, if not provided, default to 32
+Percentage of samples used for training(split=#): positive number; if not provided, default to 100
+
+
+### Running Test:
+make test
+
+
+### Usage:
+To build a fastCover dictionary with the provided arguments: make ARG= followed by arguments
+
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
new file mode 100644
index 00000000..6d3ad90a
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -0,0 +1,738 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "fastCover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 32
+#define DEFAULT_SPLITPOINT 1.0
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Function
+***************************************/
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+/**
+ * Hash the 8 bytes at p, mod 2^h (64-bit mask: int `1 << h` is UB for h >= 31)
+ */
+static size_t FASTCOVER_hash8PtrToIndex(const void* p, U32 h) {
+  return ZSTD_hash8Ptr(p, h) & (size_t)(((U64)1 << h) - 1);
+}
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
+  size_t nbDmers;
+  U32 *freqs;
+  unsigned d;
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Returns the sum of the sample sizes.
+ */
+static size_t FASTCOVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t sum = 0;
+  unsigned i;
+  for (i = 0; i < nbSamples; ++i) {
+    sum += samplesSizes[i];
+  }
+  return sum;
+}
+
+
+/*-*************************************
+*  fast functions
+***************************************/
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} FASTCOVER_segment_t;
+
+
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = F(d)/2.
+ */
+static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                                  U32 *freqs, U32 begin,U32 end,
+                                                  ZDICT_fastCover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  FASTCOVER_segment_t bestSegment = {0, 0, 0};
+  FASTCOVER_segment_t activeSegment;
+  /* Reset the activeDmers in the segment */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer  */
+    size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f);
+    /* Add frequency of this index to score */
+    activeSegment.score += freqs[index];
+    /* Increment end of segment */
+    activeSegment.end += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f);
+      /* Subtract frequency of this index from score */
+      activeSegment.score -= freqs[delIndex];
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      U32 freq = freqs[index];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /* Half the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      freqs[i] = freqs[i]/2;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Check the validity of the parameters.
+ * Returns non-zero if the parameters are valid and 0 otherwise.
+ */
+static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters,
+                                 size_t maxDictSize) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0 || parameters.f == 0) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F */
+  if (parameters.f > FASTCOVER_MAX_F) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
+ */
+static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+  if (ctx->freqs) {
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+  }
+  if (ctx->offsets) {
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+/**
+ * Count, per hash value, the training samples containing a dmer with that hash.
+ */
+static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
+  const size_t tableBytes = ((size_t)1 << f) * sizeof(size_t);
+  /* inCurrSample[h] != 0 once hash value h was seen in the current sample */
+  size_t* const inCurrSample = (size_t *)malloc(tableBytes);
+  if (!inCurrSample) { DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n"); return; }
+  for (unsigned i = 0; i < ctx->nbTrainSamples; i++) {
+    size_t start = ctx->offsets[i]; /* start of current dmer */
+    const size_t currSampleEnd = ctx->offsets[i+1];
+    memset(inCurrSample, 0, tableBytes); /* Reset ALL entries (size in bytes) */
+    while (start + f < currSampleEnd) { /* NOTE(review): bound uses f, not d - confirm */
+      size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f);
+      /* if no dmer with same hash value has been seen in current sample */
+      if (inCurrSample[dmerIndex] == 0) {
+        inCurrSample[dmerIndex]++;
+        freqs[dmerIndex]++;
+      }
+      start++;
+    }
+  }
+  free(inCurrSample);
+}
+
+/**
+ * Prepare a context for dictionary building: split samples into training and
+ * testing sets, compute sample offsets and the 2^f-entry frequency table.
+ * The context only depends on `d`, `splitPoint` and `f` and can be reused.
+ * Returns 1 on success or zero on error (bad sizes/counts, allocation failure).
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d, double splitPoint, unsigned f) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = FASTCOVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
+
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
+  ctx->nbDmers = trainingSamplesSize - d + 1;
+  ctx->d = d;
+
+  /* The offsets of each file and the frequency array of size 2^f.
+   * calloc zeroes freqs; the count is computed in size_t, not int. */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  ctx->freqs = (U32 *)calloc((size_t)1 << f, sizeof(U32));
+  if (!ctx->offsets || !ctx->freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    FASTCOVER_ctx_destroy(ctx);
+    return 0;
+  }
+
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+
+  /* freqs starts at zero; FASTCOVER_getFrequency() accumulates into it */
+
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  FASTCOVER_getFrequency(ctx->freqs, f, ctx);
+
+  return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs,
+                                    void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_fastCover_params_t parameters){
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    FASTCOVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+
+/**
+ * FASTCOVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except FASTCOVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct fast_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_fastCover_params_t parameters;
+  size_t compressedSize;
+} FASTCOVER_best_t;
+
+/**
+ * Initialize the `FASTCOVER_best_t`.
+ */
+static void FASTCOVER_best_init(FASTCOVER_best_t *best) {
+  if (best==NULL) return; /* compatible with init on NULL */
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+  best->liveJobs = 0;
+  best->dict = NULL;
+  best->dictSize = 0;
+  best->compressedSize = (size_t)-1;
+  memset(&best->parameters, 0, sizeof(best->parameters));
+}
+
+/**
+ * Wait until liveJobs == 0.
+ */
+static void FASTCOVER_best_wait(FASTCOVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  while (best->liveJobs != 0) {
+    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+  }
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Call FASTCOVER_best_wait() and then destroy the FASTCOVER_best_t.
+ */
+static void FASTCOVER_best_destroy(FASTCOVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  FASTCOVER_best_wait(best);
+  if (best->dict) {
+    free(best->dict);
+  }
+  ZSTD_pthread_mutex_destroy(&best->mutex);
+  ZSTD_pthread_cond_destroy(&best->cond);
+}
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+static void FASTCOVER_best_start(FASTCOVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  ++best->liveJobs;
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * Saves this dictionary and its parameters if it is the best so far.
+ */
+static void FASTCOVER_best_finish(FASTCOVER_best_t *best, size_t compressedSize,
+                              ZDICT_fastCover_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  {
+    size_t liveJobs;
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+    /* If the new dictionary is better */
+    if (compressedSize < best->compressedSize) {
+      /* Allocate space if necessary (free(NULL) is a no-op) */
+      if (!best->dict || best->dictSize < dictSize) {
+        free(best->dict);
+        best->dict = malloc(dictSize);
+      }
+      if (best->dict) {
+        /* Save the dictionary, parameters, and size */
+        memcpy(best->dict, dict, dictSize);
+        best->dictSize = dictSize;
+        best->parameters = parameters;
+        best->compressedSize = compressedSize;
+      } else {
+        /* Out of memory: record the error, but do NOT return here; the
+         * mutex must still be released and waiters woken below. */
+        best->compressedSize = ERROR(GENERIC);
+        best->dictSize = 0;
+      }
+    }
+    if (liveJobs == 0) { /* signal under the lock: best may be destroyed after unlock */
+      ZSTD_pthread_cond_broadcast(&best->cond);
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+  const FASTCOVER_ctx_t *ctx;
+  FASTCOVER_best_t *best;
+  size_t dictBufferCapacity;
+  ZDICT_fastCover_params_t parameters;
+} FASTCOVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and updates the FASTCOVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_fastCover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc((1 << parameters.f) * sizeof(U32));
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, (1 << parameters.f) * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict,
+                                              dictBufferCapacity, parameters);
+
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
+        parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+      for (; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    totalCompressedSize = dictBufferCapacity;
+    i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+    for (; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    if (dst) {
+      free(dst);
+    }
+  }
+
+_cleanup:
+  FASTCOVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  if (dict) {
+    free(dict);
+  }
+  if (freqs) {
+    free(freqs);
+  }
+}
+
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_fastCover_params_t *parameters) {
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? 23 : parameters->f;
+
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    FASTCOVER_best_t best;
+    POOL_ctx *pool = NULL;
+
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "fast must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    FASTCOVER_best_init(&best);
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        FASTCOVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          FASTCOVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = *parameters;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.f = f;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity)) {
+          DISPLAYLEVEL(1, "fastCover parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        FASTCOVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      FASTCOVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        FASTCOVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      *parameters = best.parameters;
+      memcpy(dictBuffer, best.dict, dictSize);
+      FASTCOVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+
+}
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.h b/contrib/experimental_dict_builders/fastCover/fastCover.h
new file mode 100644
index 00000000..eca04baa
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.h
@@ -0,0 +1,47 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+
+
+
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned f;                  /* log2 of the size of the frequency array (hash values are reduced mod 2^f) */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Fraction of samples used for training: the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    ZDICT_params_t zParams;      /* Generic dictionary parameters (compressionLevel, notificationLevel, dictID) */
+} ZDICT_fastCover_params_t;
+
+
+
+/*! ZDICT_optimizeTrainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ *  All of the fields of `parameters` except for `f` are optional.
+ *  If `d` is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ *  If `steps` is zero it defaults to its default value.
+ *  If `k` is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [16, 2048].
+ *
+ *  @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_fastCover_params_t *parameters);
diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c
new file mode 100644
index 00000000..260eeb28
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/main.c
@@ -0,0 +1,177 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "fastCover.h"
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) do { if (displayLevel>=(l)) { DISPLAY(__VA_ARGS__); } } while (0)   /* needs a `displayLevel` in scope */
+/* State used by DISPLAYUPDATE to rate-limit progress output */
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+/* Like DISPLAYLEVEL, but prints only if g_refreshRate has elapsed since the last update (always at level >= 4) */
+#define DISPLAYUPDATE(l, ...) do { if (displayLevel>=(l)) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } } while (0)
+
+/* All statement-like macros are wrapped in do { } while (0) so they are safe in unbraced if/else */
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) do { if (DEBUG) DISPLAY(__VA_ARGS__); } while (0)
+#define EXM_THROW(error, ...)                                             \
+do {                                                                      \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);   /* terminates the program with `error` as exit code */ \
+} while (0)
+
+
+/*-*************************************
+*  Constants
+***************************************/
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define DEFAULT_CLEVEL 3
+
+
+/*-*************************************
+*  FASTCOVER
+***************************************/
+int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info,
+                          unsigned maxDictSize,
+                          ZDICT_fastCover_params_t *params) {
+    /* Train a fastCover dictionary on the samples in `info` and save it to `dictFileName`. */
+    /* Returns 0 on success, 1 when training reports an error. */
+    unsigned const displayLevel = params->zParams.notificationLevel;
+    void* const dictBuffer = malloc(maxDictSize);
+    size_t dictSize;
+
+    if (dictBuffer == NULL)   /* EXM_THROW exits the program */
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    /* On success the trainer stores the selected parameters back into `*params` */
+    dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+                                                       info->srcBuffer, info->samplesSizes,
+                                                       info->nbSamples, params);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100));
+    if (ZDICT_isError(dictSize)) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+        free(dictBuffer);
+        return 1;
+    }
+
+    /* save dict */
+    DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+    saveDict(dictFileName, dictBuffer, dictSize);
+
+    /* clean up */
+    free(dictBuffer);
+    return 0;
+}
+
+
+
+int main(int argCount, const char* argv[])
+{
+  /* CLI driver: parse key=value arguments, train a fastCover dictionary, save it */
+  int displayLevel = 2;
+  int operationResult = 0;
+
+  /* Initialize arguments to default values */
+  unsigned k = 200;
+  unsigned d = 8;
+  unsigned f = 23;
+  unsigned steps = 32;
+  unsigned nbThreads = 1;
+  unsigned split = 100;
+  const char* outputFile = "fastCoverDict";
+  unsigned dictID = 0;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  /* Initialize table to store input files */
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
+  unsigned filenameIdx = 0;
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0; /* follow directory recursively */
+  const char** extendedFileList = NULL;
+  size_t blockSize = 0;
+
+  /* Each "in=" argument is written into filenameTable, so it must be valid */
+  if (!filenameTable) EXM_THROW(12, "not enough memory for filenameTable");
+
+  /* Parse arguments */
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "d=")) { d = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "f=")) { f = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "steps=")) { steps = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "split=")) { split = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "in=")) {
+      filenameTable[filenameIdx] = argument;
+      filenameIdx++;
+      continue;
+    }
+    if (longCommandWArg(&argument, "out=")) {
+      outputFile = argument;
+      continue;
+    }
+    DISPLAYLEVEL(1, "Incorrect parameters\n");
+    free((void*)filenameTable);   /* do not leak the table on the error path */
+    return 1;
+  }
+
+  /* Get the list of all files recursively (because followLinks==0)*/
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
+  if (extendedFileList) {
+      for (unsigned u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+      free((void*)filenameTable);
+      filenameTable = extendedFileList;
+      filenameIdx = fileNamesNb;
+  }
+
+  /* Set up zParams */
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+
+  /* Set up fastCover params */
+  ZDICT_fastCover_params_t params;
+  params.zParams = zParams;
+  params.k = k;
+  params.d = d;
+  params.f = f;
+  params.steps = steps;
+  params.nbThreads = nbThreads;
+  params.splitPoint = (double)split/100;
+
+  /* Build dictionary */
+  sampleInfo* info= getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
+  operationResult = FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, &params);
+
+  /* Free allocated memory */
+  if (!extendedFileList) free((void*)filenameTable);   /* table was never replaced */
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(info);
+  return operationResult;
+}
diff --git a/contrib/experimental_dict_builders/fastCover/test.sh b/contrib/experimental_dict_builders/fastCover/test.sh
new file mode 100644
index 00000000..b5570fef
--- /dev/null
+++ b/contrib/experimental_dict_builders/fastCover/test.sh
@@ -0,0 +1,14 @@
+echo "Building fastCover dictionary with in=../../lib/common k=200 f=20 out=dict1"
+./main in=../../../lib/common k=200 f=20 out=dict1
+zstd -be3 -D dict1 -r ../../../lib/common -q
+echo "Building fastCover dictionary with in=../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000"
+./main in=../../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../../lib/common -q
+echo "Building fastCover dictionary with 2 sample sources"
+./main in=../../../lib/common in=../../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../../lib/common -q
+echo "Removing dict1 dict2 dict3"
+rm -f dict1 dict2 dict3
+
+echo "Testing with invalid parameters, should fail"
+! ./main r=10

From f5407e398a4570632b07a52ba86f3a8ac04c80fa Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 25 Jul 2018 16:54:08 -0700
Subject: [PATCH 23/35] Make hash value const

---
 .../experimental_dict_builders/fastCover/fastCover.c   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index 6d3ad90a..32a15a4b 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -138,7 +138,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
    */
   while (activeSegment.end < end) {
     /* Get hash value of current dmer  */
-    size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f);
+    const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f);
     /* Add frequency of this index to score */
     activeSegment.score += freqs[index];
     /* Increment end of segment */
@@ -146,7 +146,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     /* If the window is now too large, drop the first position */
     if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
       /* Get hash value of the dmer to be eliminated from active segment */
-      size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f);
+      const size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f);
       /* Subtract frequency of this index from score */
       activeSegment.score -= freqs[delIndex];
       /* Increment start of segment */
@@ -163,7 +163,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     U32 newEnd = bestSegment.begin;
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
-      size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
       U32 freq = freqs[index];
       if (freq != 0) {
         newBegin = MIN(newBegin, pos);
@@ -177,7 +177,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     /* Half the frequency of hash value of each dmer covered by the chosen segment. */
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
-      size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      const size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
       freqs[i] = freqs[i]/2;
     }
   }
@@ -244,7 +244,7 @@ static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx)
     size_t currSampleEnd = ctx->offsets[i+1];
     start = currSampleStart;
     while (start + f < currSampleEnd) {
-      size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f);
+      const size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f);
       /* if no dmer with same hash value has been seen in current sample */
       if (inCurrSample[dmerIndex] == 0) {
         inCurrSample[dmerIndex]++;

From d1fc507ef998f511f6f1da7edc57670bb6b3404f Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 25 Jul 2018 17:05:54 -0700
Subject: [PATCH 24/35] Initial benchmarking result for fastCover

---
 .../benchmarkDictBuilder/Makefile             | 10 +++--
 .../benchmarkDictBuilder/README.md            | 40 ++++++++++--------
 .../benchmarkDictBuilder/benchmark.c          | 42 +++++++++++++++----
 3 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
index 72ce04f2..68149488 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
@@ -2,9 +2,10 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
 RANDOM_FILE := ../randomDictBuilder/random.c
+FAST_FILE := ../fastCover/fastCover.c
 IO_FILE := ../randomDictBuilder/io.c
 
 all: run clean
@@ -21,8 +22,8 @@ test: benchmarkTest clean
 benchmarkTest: benchmark test.sh
 	sh test.sh
 
-benchmark: benchmark.o io.o random.o libzstd.a
-	$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark
+benchmark: benchmark.o io.o random.o fastCover.o libzstd.a
+	$(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark
 
 benchmark.o: benchmark.c
 	$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
@@ -30,6 +31,9 @@ benchmark.o: benchmark.c
 random.o: $(RANDOM_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)
 
+fastCover.o: $(FAST_FILE)
+	$(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE)
+
 io.o: $(IO_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
 
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index de783a0e..e02d592c 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -18,30 +18,34 @@ github:
 | Algorithm     | Speed(sec)    | Compression Ratio  |
 | ------------- |:-------------:| ------------------:|
 | nodict        | 0.000004      |  2.999642          |
-| random        | 0.180238      |  8.786957          |
-| cover         | 33.891987     |  10.430999         |
-| legacy        | 1.077569      |  8.989482          |
+| random        | 0.135459      |  8.786957          |
+| cover         | 50.341079     |  10.641263         |
+| legacy        | 0.866283      |  8.989482          |
+| fastCover     | 13.450947     |  10.215174         |
 
 hg-commands
 | Algorithm     | Speed(sec)    | Compression Ratio  |
 | ------------- |:-------------:| ------------------:|
-| nodict        | 0.000006      |  2.425291          |
-| random        | 0.088735      |  3.489515          |
-| cover         | 35.447300     |  4.030274          |
-| legacy        | 1.048509      |  3.911896          |
+| nodict        | 0.000020      |  2.425291          |
+| random        | 0.088828      |  3.489515          |
+| cover         | 60.028672     |  4.131136          |
+| legacy        | 0.852481      |  3.911896          |
+| fastCover     | 9.524284      |  3.977229          |
+
+hg-changelog
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000004      |  1.377613          |
+| random        | 0.621812      |  2.096785          |
+| cover         | 217.510962    |  2.188654          |
+| legacy        | 2.559194      |  2.058273          |
+| fastCover     | 51.132516     |  2.124185          |
 
 hg-manifest
 | Algorithm     | Speed(sec)    | Compression Ratio  |
 | ------------- |:-------------:| ------------------:|
 | nodict        | 0.000005      |  1.866385          |
-| random        | 1.148231      |  2.309485          |
-| cover         | 509.685257    |  2.575331          |
-| legacy        | 10.705866     |  2.506775          |
-
-hg-changelog
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| nodict        | 0.000005      |  1.377613          |
-| random        | 0.706434      |  2.096785          |
-| cover         | 122.815783    |  2.175706          |
-| legacy        | 3.010318      |  2.058273          |
+| random        | 1.035220      |  2.309485          |
+| cover         | 930.480173    |  2.582597          |
+| legacy        | 8.916513      |  2.506775          |
+| fastCover     | 116.871089    |  2.525689          |
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index 64041964..865ecb34 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -5,6 +5,7 @@
 #include <ctype.h>
 #include <time.h>
 #include "random.h"
+#include "fastCover.h"
 #include "dictBuilder.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #include "io.h"
@@ -71,10 +72,11 @@ typedef struct {
  */
 dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
                   ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
-                  ZDICT_legacy_params_t *legacyParams) {
+                  ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) {
     unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
                                   coverParams ? coverParams->zParams.notificationLevel :
                                   legacyParams ? legacyParams->zParams.notificationLevel :
+                                  fastParams ? fastParams->zParams.notificationLevel :
                                   DEFAULT_DISPLAYLEVEL;   /* no dict */
     void* const dictBuffer = malloc(maxDictSize);
 
@@ -94,6 +96,9 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
         } else if(legacyParams) {
           dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
                                                info->samplesSizes, info->nbSamples, *legacyParams);
+        } else if(fastParams) {
+          dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, fastParams);
         } else {
           dictSize = 0;
         }
@@ -216,25 +221,29 @@ void freeDictInfo(dictInfo* info) {
  *  @return 0 if benchmark successfully, 1 otherwise
  */
 int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam,
-                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) {
+                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam,
+                        ZDICT_fastCover_params_t *fastParam) {
   /* Local variables */
   const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel :
                                 coverParam ? coverParam->zParams.notificationLevel :
                                 legacyParam ? legacyParam->zParams.notificationLevel :
+                                fastParam ? fastParam->zParams.notificationLevel:
                                 DEFAULT_DISPLAYLEVEL;   /* no dict */
   const char* name = randomParam ? "RANDOM" :
                     coverParam ? "COVER" :
                     legacyParam ? "LEGACY" :
+                    fastParam ? "FAST":
                     "NODICT";    /* no dict */
   const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel :
                           coverParam ? coverParam->zParams.compressionLevel :
                           legacyParam ? legacyParam->zParams.compressionLevel :
+                          fastParam ? fastParam->zParams.compressionLevel:
                           DEFAULT_CLEVEL;   /* no dict */
   int result = 0;
 
   /* Calculate speed */
   const UTIL_time_t begin = UTIL_getTime();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam);
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam);
   const U64 timeMicro = UTIL_clockSpanMicro(begin);
   const double timeSec = timeMicro / (double)SEC_TO_MICRO;
   if (!dInfo) {
@@ -269,7 +278,6 @@ int main(int argCount, const char* argv[])
 
   /* Initialize arguments to default values */
   const unsigned k = 200;
-  const unsigned d = 6;
   const unsigned cLevel = DEFAULT_CLEVEL;
   const unsigned dictID = 0;
   const unsigned maxDictSize = g_defaultMaxDictSize;
@@ -319,7 +327,7 @@ int main(int argCount, const char* argv[])
 
   /* with no dict */
   {
-    const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL);
+    const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, NULL);
     if(noDictResult) {
       result = 1;
       goto _cleanup;
@@ -331,7 +339,7 @@ int main(int argCount, const char* argv[])
     ZDICT_random_params_t randomParam;
     randomParam.zParams = zParams;
     randomParam.k = k;
-    const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL);
+    const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL);
     if(randomResult) {
       result = 1;
       goto _cleanup;
@@ -344,10 +352,9 @@ int main(int argCount, const char* argv[])
     memset(&coverParam, 0, sizeof(coverParam));
     coverParam.zParams = zParams;
     coverParam.splitPoint = 1.0;
-    coverParam.d = d;
     coverParam.steps = 40;
     coverParam.nbThreads = 1;
-    const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL);
+    const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
     if(coverOptResult) {
       result = 1;
       goto _cleanup;
@@ -359,13 +366,30 @@ int main(int argCount, const char* argv[])
     ZDICT_legacy_params_t legacyParam;
     legacyParam.zParams = zParams;
     legacyParam.selectivityLevel = 9;
-    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam);
+    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL);
     if(legacyResult) {
       result = 1;
       goto _cleanup;
     }
   }
 
+  /* for fastCover */
+  {
+    ZDICT_fastCover_params_t fastParam;
+    memset(&fastParam, 0, sizeof(fastParam));
+    fastParam.zParams = zParams;
+    fastParam.splitPoint = 1.0;
+    fastParam.d = 8;
+    fastParam.f = 23;
+    fastParam.steps = 40;
+    fastParam.nbThreads = 1;
+    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+    if(fastOptResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
   /* Free allocated memory */
 _cleanup:
   UTIL_freeFileList(extendedFileList, fileNamesBuf);

From 1e85f314d859c5295f88c98fcd0dc9fa03f68b12 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 25 Jul 2018 17:53:38 -0700
Subject: [PATCH 25/35] Benchmark fast cover optimize vs k=200

---
 .../benchmarkDictBuilder/README.md            | 60 ++++++++++---------
 .../benchmarkDictBuilder/benchmark.c          | 22 ++++++-
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index e02d592c..478d8793 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -15,37 +15,41 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 ###Benchmarking Result:
 
 github:
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| nodict        | 0.000004      |  2.999642          |
-| random        | 0.135459      |  8.786957          |
-| cover         | 50.341079     |  10.641263         |
-| legacy        | 0.866283      |  8.989482          |
-| fastCover     | 13.450947     |  10.215174         |
+| Algorithm         | Speed(sec)    | Compression Ratio  |
+| ------------------|:-------------:| ------------------:|
+| nodict            | 0.000004      |  2.999642          |
+| random            | 0.148247      |  8.786957          |
+| cover             | 56.331553     |  10.641263         |
+| legacy            | 0.917595      |  8.989482          |
+| fastCover(opt)    | 13.169979     |  10.215174         |
+| fastCover(k=200)  | 2.692406      |  8.657219          |
 
 hg-commands
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| nodict        | 0.000020      |  2.425291          |
-| random        | 0.088828      |  3.489515          |
-| cover         | 60.028672     |  4.131136          |
-| legacy        | 0.852481      |  3.911896          |
-| fastCover     | 9.524284      |  3.977229          |
+| Algorithm         | Speed(sec)    | Compression Ratio  |
+| ----------------- |:-------------:| ------------------:|
+| nodict            | 0.000007      |  2.425291          |
+| random            | 0.093990      |  3.489515          |
+| cover             | 58.602385     |  4.131136          |
+| legacy            | 0.865683      |  3.911896          |
+| fastCover(opt)    | 9.404134      |  3.977229          |
+| fastCover(k=200)  | 1.037434      |  3.810326          |
 
 hg-changelog
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| nodict        | 0.000004      |  1.377613          |
-| random        | 0.621812      |  2.096785          |
-| cover         | 217.510962    |  2.188654          |
-| legacy        | 2.559194      |  2.058273          |
-| fastCover     | 51.132516     |  2.124185          |
+| Algorithm         | Speed(sec)    | Compression Ratio  |
+| ----------------- |:-------------:| ------------------:|
+| nodict            | 0.000022      |  1.377613          |
+| random            | 0.551539      |  2.096785          |
+| cover             | 221.370056    |  2.188654          |
+| legacy            | 2.405923      |  2.058273          |
+| fastCover(opt)    | 49.526246     |  2.124185          |
+| fastCover(k=200)  | 9.746872      |  2.114674          |
 
 hg-manifest
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| nodict        | 0.000005      |  1.866385          |
-| random        | 1.035220      |  2.309485          |
-| cover         | 930.480173    |  2.582597          |
-| legacy        | 8.916513      |  2.506775          |
-| fastCover     | 116.871089    |  2.525689          |
+| Algorithm         | Speed(sec)    | Compression Ratio  |
+| ----------------- |:-------------:| ------------------:|
+| nodict            | 0.000019      |  1.866385          |
+| random            | 1.083536      |  2.309485          |
+| cover             | 928.894887    |  2.582597          |
+| legacy            | 9.110371      |  2.506775          |
+| fastCover(opt)    | 116.508270    |  2.525689          |
+| fastCover(k=200)  | 12.176555     |  2.472221          |
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index 865ecb34..62135436 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -373,7 +373,8 @@ int main(int argCount, const char* argv[])
     }
   }
 
-  /* for fastCover */
+
+  /* for fastCover (optimizing k) */
   {
     ZDICT_fastCover_params_t fastParam;
     memset(&fastParam, 0, sizeof(fastParam));
@@ -390,6 +391,25 @@ int main(int argCount, const char* argv[])
     }
   }
 
+  /* for fastCover (with k provided) */
+  {
+    ZDICT_fastCover_params_t fastParam;
+    memset(&fastParam, 0, sizeof(fastParam));
+    fastParam.zParams = zParams;
+    fastParam.splitPoint = 1.0;
+    fastParam.d = 8;
+    fastParam.f = 23;
+    fastParam.k = 200;
+    fastParam.steps = 40;
+    fastParam.nbThreads = 1;
+    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+    if(fastOptResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
+
   /* Free allocated memory */
 _cleanup:
   UTIL_freeFileList(extendedFileList, fileNamesBuf);

From 2333ecb173077edaf34f032baadfcc63531928c1 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 25 Jul 2018 18:10:09 -0700
Subject: [PATCH 26/35] Allow d=6

---
 .../fastCover/README.md                       |  2 +-
 .../fastCover/fastCover.c                     | 27 +++++++++++++------
 .../fastCover/test.sh                         |  3 ++-
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md
index 088e38be..66e00ee0 100644
--- a/contrib/experimental_dict_builders/fastCover/README.md
+++ b/contrib/experimental_dict_builders/fastCover/README.md
@@ -6,7 +6,7 @@ Output Dictionary (out=dictName): if not provided, default to fastCoverDict
 Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
 Size of Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
-Size of Dmer (d=#): positive number; in bytes; if not provided, default to 8
+Size of Dmer (d=#): either 6 or 8; if not provided, default to 8
 Number of steps (steps=#): positive number, if not provided, default to 32
 Percentage of samples used for training(split=#): positive number; if not provided, default to 100
 
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index 32a15a4b..abd592cd 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -50,14 +50,21 @@ static clock_t g_time = 0;
 /*-*************************************
 * Hash Function
 ***************************************/
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
 static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
 static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
 
 /**
- * Hash the 8-byte value pointed to by p and mod 2^f
+ * Hash the d-byte value pointed to by p and mod 2^f
  */
-static size_t FASTCOVER_hash8PtrToIndex(const void* p, U32 h) {
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  if (d == 6) {
+    return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
+  }
   return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
 }
 
@@ -138,7 +145,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
    */
   while (activeSegment.end < end) {
     /* Get hash value of current dmer  */
-    const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f);
+    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
     /* Add frequency of this index to score */
     activeSegment.score += freqs[index];
     /* Increment end of segment */
@@ -146,7 +153,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     /* If the window is now too large, drop the first position */
     if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
       /* Get hash value of the dmer to be eliminated from active segment */
-      const size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f);
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
       /* Subtract frequency of this index from score */
       activeSegment.score -= freqs[delIndex];
       /* Increment start of segment */
@@ -163,7 +170,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     U32 newEnd = bestSegment.begin;
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
-      const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
       U32 freq = freqs[index];
       if (freq != 0) {
         newBegin = MIN(newBegin, pos);
@@ -177,7 +184,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     /* Half the frequency of hash value of each dmer covered by the chosen segment. */
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
-      const size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f);
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
       freqs[i] = freqs[i]/2;
     }
   }
@@ -194,6 +201,10 @@ static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters,
   if (parameters.d == 0 || parameters.k == 0 || parameters.f == 0) {
     return 0;
   }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
   /* 0 < f <= FASTCOVER_MAX_F */
   if (parameters.f > FASTCOVER_MAX_F) {
     return 0;
@@ -244,7 +255,7 @@ static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx)
     size_t currSampleEnd = ctx->offsets[i+1];
     start = currSampleStart;
     while (start + f < currSampleEnd) {
-      const size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f);
+      const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d);
       /* if no dmer with same hash value has been seen in current sample */
       if (inCurrSample[dmerIndex] == 0) {
         inCurrSample[dmerIndex]++;
@@ -615,7 +626,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
     const unsigned nbThreads = parameters->nbThreads;
     const double splitPoint =
         parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
-    const unsigned kMinD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
     const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
     const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
     const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
diff --git a/contrib/experimental_dict_builders/fastCover/test.sh b/contrib/experimental_dict_builders/fastCover/test.sh
index b5570fef..91d4f492 100644
--- a/contrib/experimental_dict_builders/fastCover/test.sh
+++ b/contrib/experimental_dict_builders/fastCover/test.sh
@@ -11,4 +11,5 @@ echo "Removing dict1 dict2 dict3"
 rm -f dict1 dict2 dict3
 
 echo "Testing with invalid parameters, should fail"
-! ./main r=10
+! ./main in=../../../lib/common r=10
+! ./main in=../../../lib/common d=10

From 3b163e0b5b5f9eec427b87001483c3b627c95a8f Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Thu, 26 Jul 2018 13:53:13 -0700
Subject: [PATCH 27/35] Add array to keep track of frequency within active
 segment, fix malloc bug, update benchmarking result

---
 .../benchmarkDictBuilder/README.md            | 60 ++++++++--------
 .../fastCover/fastCover.c                     | 69 +++++++++++--------
 .../fastCover/main.c                          |  2 +-
 .../randomDictBuilder/main.c                  |  2 +-
 4 files changed, 75 insertions(+), 58 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 478d8793..07d65b08 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -14,42 +14,46 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 
 ###Benchmarking Result:
 
+d=8
+f=23
+freq[i] = 0 when dmer added to best segment
+
 github:
 | Algorithm         | Speed(sec)    | Compression Ratio  |
-| ------------------|:-------------:| ------------------:|
-| nodict            | 0.000004      |  2.999642          |
-| random            | 0.148247      |  8.786957          |
-| cover             | 56.331553     |  10.641263         |
-| legacy            | 0.917595      |  8.989482          |
-| fastCover(opt)    | 13.169979     |  10.215174         |
-| fastCover(k=200)  | 2.692406      |  8.657219          |
+| ----------------- | ------------- | ------------------ |
+| nodict            | 0.000007      |  2.999642          |
+| random            | 0.150258      |  8.786957          |
+| cover             | 60.388853     |  10.641263         |
+| legacy            | 0.965050      |  8.989482          |
+| fastCover(opt)    | 84.968131     |  10.614747         |
+| fastCover(k=200)  | 6.465490      |  9.484150          |
 
 hg-commands
 | Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- |:-------------:| ------------------:|
-| nodict            | 0.000007      |  2.425291          |
-| random            | 0.093990      |  3.489515          |
-| cover             | 58.602385     |  4.131136          |
-| legacy            | 0.865683      |  3.911896          |
-| fastCover(opt)    | 9.404134      |  3.977229          |
-| fastCover(k=200)  | 1.037434      |  3.810326          |
+| ----------------- | ------------- | ------------------ |
+| nodict            | 0.000005      |  2.425291          |
+| random            | 0.084348      |  3.489515          |
+| cover             | 60.144894     |  4.131136          |
+| legacy            | 0.831981      |  3.911896          |
+| fastCover(opt)    | 59.030437     |  4.157595          |
+| fastCover(k=200)  | 3.702932      |  4.134222          |
 
 hg-changelog
 | Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- |:-------------:| ------------------:|
-| nodict            | 0.000022      |  1.377613          |
-| random            | 0.551539      |  2.096785          |
-| cover             | 221.370056    |  2.188654          |
-| legacy            | 2.405923      |  2.058273          |
-| fastCover(opt)    | 49.526246     |  2.124185          |
-| fastCover(k=200)  | 9.746872      |  2.114674          |
+| ----------------- | ------------- | ------------------ |
+| nodict            | 0.000004      |  1.377613          |
+| random            | 0.555964      |  2.096785          |
+| cover             | 214.423753    |  2.188654          |
+| legacy            | 2.180249      |  2.058273          |
+| fastCover(opt)    | 102.261452    |  2.180347          |
+| fastCover(k=200)  | 11.81039      |  2.170673          |
 
 hg-manifest
 | Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- |:-------------:| ------------------:|
-| nodict            | 0.000019      |  1.866385          |
-| random            | 1.083536      |  2.309485          |
-| cover             | 928.894887    |  2.582597          |
-| legacy            | 9.110371      |  2.506775          |
-| fastCover(opt)    | 116.508270    |  2.525689          |
-| fastCover(k=200)  | 12.176555     |  2.472221          |
+| ----------------- | ------------- | ------------------ |
+| nodict            | 0.000006      |  1.866385          |
+| random            | 1.063974      |  2.309485          |
+| cover             | 909.101849    |  2.582597          |
+| legacy            | 8.706580      |  2.506775          |
+| fastCover(opt)    | 188.598079    |  2.596761          |
+| fastCover(k=200)  | 13.392734     |  2.592985          |
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index abd592cd..6f990e0c 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -48,7 +48,7 @@ static clock_t g_time = 0;
 
 
 /*-*************************************
-* Hash Function
+* Hash Functions
 ***************************************/
 static const U64 prime6bytes = 227718039650203ULL;
 static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
@@ -58,6 +58,7 @@ static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
 static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
 static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
 
+
 /**
  * Hash the d-byte value pointed to by p and mod 2^f
  */
@@ -140,29 +141,41 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
   activeSegment.begin = begin;
   activeSegment.end = begin;
   activeSegment.score = 0;
-  /* Slide the activeSegment through the whole epoch.
-   * Save the best segment in bestSegment.
-   */
-  while (activeSegment.end < end) {
-    /* Get hash value of current dmer  */
-    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
-    /* Add frequency of this index to score */
-    activeSegment.score += freqs[index];
-    /* Increment end of segment */
-    activeSegment.end += 1;
-    /* If the window is now too large, drop the first position */
-    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
-      /* Get hash value of the dmer to be eliminated from active segment */
-      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
-      /* Subtract frequency of this index from score */
-      activeSegment.score -= freqs[delIndex];
-      /* Increment start of segment */
-      activeSegment.begin += 1;
-    }
-    /* If this segment is the best so far save it */
-    if (activeSegment.score > bestSegment.score) {
-      bestSegment = activeSegment;
+  {
+    /* Keep track of number of times an index has been seen in current segment */
+    U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16));
+    memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs));
+    /* Slide the activeSegment through the whole epoch.
+     * Save the best segment in bestSegment.
+     */
+    while (activeSegment.end < end) {
+      /* Get hash value of current dmer */
+      const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
+      /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+      if (currfreqs[index] == 0) {
+        activeSegment.score += freqs[index];
+      }
+      currfreqs[index] += 1;
+      /* Increment end of segment */
+      activeSegment.end += 1;
+      /* If the window is now too large, drop the first position */
+      if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+        /* Get hash value of the dmer to be eliminated from active segment */
+        const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
+        currfreqs[delIndex] -= 1;
+        /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+        if (currfreqs[delIndex] == 0) {
+          activeSegment.score -= freqs[delIndex];
+        }
+        /* Increment start of segment */
+        activeSegment.begin += 1;
+      }
+      /* If this segment is the best so far save it */
+      if (activeSegment.score > bestSegment.score) {
+        bestSegment = activeSegment;
+      }
     }
+    free(currfreqs);
   }
   {
     /* Trim off the zero frequency head and tail from the segment. */
@@ -185,7 +198,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
       const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
-      freqs[i] = freqs[i]/2;
+      freqs[i] = 0;
     }
   }
   return bestSegment;
@@ -245,12 +258,12 @@ static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
 /**
  * Calculate for frequency of hash value of each dmer in ctx->samples
  */
-static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
+static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){
   /* inCurrSample keeps track of this hash value has already be seen in previous dmers in the same sample*/
-  size_t* inCurrSample = (size_t *)malloc((1<<f)*sizeof(size_t));
+  BYTE* inCurrSample = (BYTE *)malloc((1 << f) * sizeof(BYTE));
   size_t start; /* start of current dmer */
   for (unsigned i = 0; i < ctx->nbTrainSamples; i++) {
-    memset(inCurrSample, 0, (1 << f)); /* Reset inCurrSample for each sample */
+    memset(inCurrSample, 0, (1 << f) * sizeof(*inCurrSample)); /* Reset inCurrSample for each sample */
     size_t currSampleStart = ctx->offsets[i];
     size_t currSampleEnd = ctx->offsets[i+1];
     start = currSampleStart;
@@ -338,7 +351,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
   memset(ctx->freqs, 0, (1 << f) * sizeof(U32));
 
   DISPLAYLEVEL(2, "Computing frequencies\n");
-  FASTCOVER_getFrequency(ctx->freqs, f, ctx);
+  FASTCOVER_computeFrequency(ctx->freqs, f, ctx);
 
   return 1;
 }
diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c
index 260eeb28..f286b050 100644
--- a/contrib/experimental_dict_builders/fastCover/main.c
+++ b/contrib/experimental_dict_builders/fastCover/main.c
@@ -165,7 +165,7 @@ int main(int argCount, const char* argv[])
   params.splitPoint = (double)split/100;
 
   /* Build dictionary */
-  sampleInfo* info= getSampleInfo(filenameTable,
+  sampleInfo* info = getSampleInfo(filenameTable,
                     filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
   operationResult = FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, &params);
 
diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c
index 3f3a6ca7..3ad88574 100644
--- a/contrib/experimental_dict_builders/randomDictBuilder/main.c
+++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c
@@ -149,7 +149,7 @@ int main(int argCount, const char* argv[])
   params.zParams = zParams;
   params.k = k;
 
-  sampleInfo* info= getSampleInfo(filenameTable,
+  sampleInfo* info = getSampleInfo(filenameTable,
                     filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
   operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
 

From 09ccd977c355c07a469a295837397abe28b6fdb2 Mon Sep 17 00:00:00 2001
From: George Lu <gclu@fb.com>
Date: Thu, 26 Jul 2018 15:17:58 -0700
Subject: [PATCH 28/35] no zero

---
 programs/bench.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/programs/bench.c b/programs/bench.c
index a54168c4..76d1ff6d 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -549,7 +549,8 @@ static BMK_return_t BMK_benchMemAdvancedNoAlloc(
                             double const compressionSpeed = ((double)srcSize / intermediateResultCompress.result.result.nanoSecPerRun) * 1000;
                             int const cSpeedAccuracy = (compressionSpeed < 10.) ? 2 : 1;
                             results.result.cSpeed = compressionSpeed * 1000000;
-                            results.result.cSize = intermediateResultCompress.result.result.sumOfReturn;
+                            cSize = intermediateResultCompress.result.result.sumOfReturn;
+                            results.result.cSize = cSize;
                             ratio = (double)srcSize / results.result.cSize;
                             markNb = (markNb+1) % NB_MARKS;
                             DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",

From 3d7941ce41d33bbbedb15fa9794c9fbcb1713384 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Thu, 26 Jul 2018 16:24:13 -0700
Subject: [PATCH 29/35] Benchmark different f values

---
 .../benchmarkDictBuilder/README.md            | 131 +++++++++++++-----
 .../benchmarkDictBuilder/benchmark.c          | 104 +++++++-------
 2 files changed, 152 insertions(+), 83 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 07d65b08..1ee4b19b 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -14,46 +14,107 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 
 ###Benchmarking Result:
 
-d=8
-f=23
-freq[i] = 0 when dmer added to best segment
+For every f value for fast, the first one is optimize and the second one has k=200
 
 github:
-| Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- | ------------- | ------------------ |
-| nodict            | 0.000007      |  2.999642          |
-| random            | 0.150258      |  8.786957          |
-| cover             | 60.388853     |  10.641263         |
-| legacy            | 0.965050      |  8.989482          |
-| fastCover(opt)    | 84.968131     |  10.614747         |
-| fastCover(k=200)  | 6.465490      |  9.484150          |
+NODICT       0.000023       2.999642
+RANDOM       0.149020       8.786957
+LEGACY       0.854277       8.989482
+FAST15       8.764078       10.609015
+FAST15       0.232610       9.135669
+FAST16       9.597777       10.474574
+FAST16       0.243698       9.346482
+FAST17       9.385449       10.611737
+FAST17       0.268376       9.605798
+FAST18       9.988885       10.626382
+FAST18       0.311769       9.130565
+FAST19       10.737259       10.411729
+FAST19       0.331885       9.271814
+FAST20       10.479782       10.388895
+FAST20       0.498416       9.194115
+FAST21       21.189883       10.376394
+FAST21       1.098532       9.244456
+FAST22       39.849935       10.432555
+FAST22       2.590561       9.410930
+FAST23       75.832399       10.614747
+FAST23       6.108487       9.484150
+FAST24       139.782714       10.611753
+FAST24       13.029406       9.379030
+COVER       55.118542       10.641263
 
 hg-commands
-| Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- | ------------- | ------------------ |
-| nodict            | 0.000005      |  2.425291          |
-| random            | 0.084348      |  3.489515          |
-| cover             | 60.144894     |  4.131136          |
-| legacy            | 0.831981      |  3.911896          |
-| fastCover(opt)    | 59.030437     |  4.157595          |
-| fastCover(k=200)  | 3.702932      |  4.134222          |
+NODICT       0.000012       2.425291
+RANDOM       0.083071       3.489515
+LEGACY       0.835195       3.911896
+FAST15       0.163980       3.808375
+FAST16       6.373850       4.010783
+FAST16       0.160299       3.966604
+FAST17       6.668799       4.091602
+FAST17       0.172480       4.062773
+FAST18       6.266105       4.130824
+FAST18       0.171554       4.094666
+FAST19       6.869651       4.158180
+FAST19       0.209468       4.111289
+FAST20       8.267766       4.149707
+FAST20       0.331680       4.119873
+FAST21       18.824296       4.171784
+FAST21       0.783961       4.120884
+FAST22       33.321252       4.152035
+FAST22       1.854215       4.126626
+FAST23       60.775388       4.157595
+FAST23       4.040395       4.134222
+FAST24       110.910038       4.163091
+FAST24       8.505828       4.143533
+COVER       61.654796       4.131136
 
 hg-changelog
-| Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- | ------------- | ------------------ |
-| nodict            | 0.000004      |  1.377613          |
-| random            | 0.555964      |  2.096785          |
-| cover             | 214.423753    |  2.188654          |
-| legacy            | 2.180249      |  2.058273          |
-| fastCover(opt)    | 102.261452    |  2.180347          |
-| fastCover(k=200)  | 11.81039      |  2.170673          |
+NODICT       0.000004       1.377613
+RANDOM       0.582067       2.096785
+LEGACY       2.739515       2.058273
+FAST15       35.682665       2.127596
+FAST15       0.931621       2.115299
+FAST16       36.557988       2.141787
+FAST16       1.008155       2.136080
+FAST17       36.272242       2.155332
+FAST17       0.906803       2.154596
+FAST18       35.542043       2.171997
+FAST18       1.063101       2.167723
+FAST19       37.756934       2.180893
+FAST19       1.257291       2.173768
+FAST20       40.273755       2.179442
+FAST20       1.630522       2.170072
+FAST21       54.606548       2.181400
+FAST21       2.321266       2.171643
+FAST22       72.454066       2.178774
+FAST22       5.092888       2.168885
+FAST23       106.753208       2.180347
+FAST23       14.722222       2.170673
+FAST24       171.083201       2.183426
+FAST24       27.575575       2.170623
+COVER       227.219660       2.188654
 
 hg-manifest
-| Algorithm         | Speed(sec)    | Compression Ratio  |
-| ----------------- | ------------- | ------------------ |
-| nodict            | 0.000006      |  1.866385          |
-| random            | 1.063974      |  2.309485          |
-| cover             | 909.101849    |  2.582597          |
-| legacy            | 8.706580      |  2.506775          |
-| fastCover(opt)    | 188.598079    |  2.596761          |
-| fastCover(k=200)  | 13.392734     |  2.592985          |
+NODICT       0.000007       1.866385
+RANDOM       1.086571       2.309485
+LEGACY       9.567507       2.506775
+FAST15       77.811380       2.380461
+FAST15       1.969718       2.317727
+FAST16       75.789019       2.469144
+FAST16       2.051283       2.375815
+FAST17       79.659040       2.539069
+FAST17       1.995394       2.501047
+FAST18       76.281105       2.578095
+FAST18       2.059272       2.564840
+FAST19       79.395382       2.590433
+FAST19       2.354158       2.591024
+FAST20       87.937568       2.597813
+FAST20       2.922189       2.597104
+FAST21       121.760549       2.598408
+FAST21       4.798981       2.600269
+FAST22       155.878461       2.594560
+FAST22       8.151807       2.601047
+FAST23       194.238003       2.596761
+FAST23       15.160578       2.592985
+FAST24       267.425904       2.597657
+FAST24       29.513286       2.600363
+COVER       930.675322       2.582597
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index 62135436..9feaae59 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -340,12 +340,67 @@ int main(int argCount, const char* argv[])
     randomParam.zParams = zParams;
     randomParam.k = k;
     const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\n", randomParam.k);
     if(randomResult) {
       result = 1;
       goto _cleanup;
     }
   }
 
+  /* for legacy */
+  {
+    ZDICT_legacy_params_t legacyParam;
+    legacyParam.zParams = zParams;
+    legacyParam.selectivityLevel = 9;
+    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL);
+    DISPLAYLEVEL(2, "selectivityLevel=%u\n", legacyParam.selectivityLevel);
+    if(legacyResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
+  /* for fastCover */
+  for (unsigned f = 15; f < 25; f++){
+    DISPLAYLEVEL(2, "current f is %u\n", f);
+    /* for fastCover (optimizing k) */
+    {
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.splitPoint = 1.0;
+      fastParam.d = 8;
+      fastParam.f = f;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
+    }
+
+    /* for fastCover (with k provided) */
+    {
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.splitPoint = 1.0;
+      fastParam.d = 8;
+      fastParam.f = f;
+      fastParam.k = 200;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
+    }
+  }
+
   /* for cover */
   {
     ZDICT_cover_params_t coverParam;
@@ -355,60 +410,13 @@ int main(int argCount, const char* argv[])
     coverParam.steps = 40;
     coverParam.nbThreads = 1;
     const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
     if(coverOptResult) {
       result = 1;
       goto _cleanup;
     }
   }
 
-  /* for legacy */
-  {
-    ZDICT_legacy_params_t legacyParam;
-    legacyParam.zParams = zParams;
-    legacyParam.selectivityLevel = 9;
-    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL);
-    if(legacyResult) {
-      result = 1;
-      goto _cleanup;
-    }
-  }
-
-
-  /* for fastCover (optimizing k) */
-  {
-    ZDICT_fastCover_params_t fastParam;
-    memset(&fastParam, 0, sizeof(fastParam));
-    fastParam.zParams = zParams;
-    fastParam.splitPoint = 1.0;
-    fastParam.d = 8;
-    fastParam.f = 23;
-    fastParam.steps = 40;
-    fastParam.nbThreads = 1;
-    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    if(fastOptResult) {
-      result = 1;
-      goto _cleanup;
-    }
-  }
-
-  /* for fastCover (with k provided) */
-  {
-    ZDICT_fastCover_params_t fastParam;
-    memset(&fastParam, 0, sizeof(fastParam));
-    fastParam.zParams = zParams;
-    fastParam.splitPoint = 1.0;
-    fastParam.d = 8;
-    fastParam.f = 23;
-    fastParam.k = 200;
-    fastParam.steps = 40;
-    fastParam.nbThreads = 1;
-    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    if(fastOptResult) {
-      result = 1;
-      goto _cleanup;
-    }
-  }
-
 
   /* Free allocated memory */
 _cleanup:

From 759c543312fd722c6f351513411d6d57742c7e4e Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Thu, 26 Jul 2018 19:03:01 -0700
Subject: [PATCH 30/35] Rerun cover and fastCover with optimized values

---
 .../benchmarkDictBuilder/README.md            | 197 +++++++++---------
 .../benchmarkDictBuilder/benchmark.c          | 109 ++++++----
 .../fastCover/fastCover.c                     |   2 +-
 3 files changed, 169 insertions(+), 139 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 1ee4b19b..04866b7e 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -13,108 +13,113 @@ Benchmark given input files: make ARG= followed by permitted arguments
 make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 
 ###Benchmarking Result:
-
-For every f value for fast, the first one is optimize and the second one has k=200
+First Cover is optimize cover, second Cover uses optimized d and k from first one.
+For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
 
 github:
-NODICT       0.000023       2.999642
-RANDOM       0.149020       8.786957
-LEGACY       0.854277       8.989482
-FAST15       8.764078       10.609015
-FAST15       0.232610       9.135669
-FAST16       9.597777       10.474574
-FAST16       0.243698       9.346482
-FAST17       9.385449       10.611737
-FAST17       0.268376       9.605798
-FAST18       9.988885       10.626382
-FAST18       0.311769       9.130565
-FAST19       10.737259       10.411729
-FAST19       0.331885       9.271814
-FAST20       10.479782       10.388895
-FAST20       0.498416       9.194115
-FAST21       21.189883       10.376394
-FAST21       1.098532       9.244456
-FAST22       39.849935       10.432555
-FAST22       2.590561       9.410930
-FAST23       75.832399       10.614747
-FAST23       6.108487       9.484150
-FAST24       139.782714       10.611753
-FAST24       13.029406       9.379030
-COVER       55.118542       10.641263
+NODICT       0.000004       2.999642
+RANDOM       0.146096       8.786957
+LEGACY       0.956888       8.989482
+COVER       56.596152       10.641263
+COVER       4.937047       10.641263
+FAST15       17.722269       10.586461
+FAST15       0.239135       10.586461
+FAST16       18.276179       10.492503
+FAST16       0.265285       10.492503
+FAST17       18.077916       10.611737
+FAST17       0.236573       10.611737
+FAST18       19.510150       10.621586
+FAST18       0.278683       10.621586
+FAST19       18.794350       10.629626
+FAST19       0.307943       10.629626
+FAST20       19.671099       10.610308
+FAST20       0.428814       10.610308
+FAST21       36.527238       10.625733
+FAST21       0.716384       10.625733
+FAST22       83.803521       10.625281
+FAST22       1.290246       10.625281
+FAST23       158.287924       10.602342
+FAST23       3.084848       10.602342
+FAST24       283.630941       10.603379
+FAST24       8.088933       10.603379
 
 hg-commands
-NODICT       0.000012       2.425291
-RANDOM       0.083071       3.489515
-LEGACY       0.835195       3.911896
-FAST15       0.163980       3.808375
-FAST16       6.373850       4.010783
-FAST16       0.160299       3.966604
-FAST17       6.668799       4.091602
-FAST17       0.172480       4.062773
-FAST18       6.266105       4.130824
-FAST18       0.171554       4.094666
-FAST19       6.869651       4.158180
-FAST19       0.209468       4.111289
-FAST20       8.267766       4.149707
-FAST20       0.331680       4.119873
-FAST21       18.824296       4.171784
-FAST21       0.783961       4.120884
-FAST22       33.321252       4.152035
-FAST22       1.854215       4.126626
-FAST23       60.775388       4.157595
-FAST23       4.040395       4.134222
-FAST24       110.910038       4.163091
-FAST24       8.505828       4.143533
-COVER       61.654796       4.131136
+NODICT       0.000007       2.425291
+RANDOM       0.084010       3.489515
+LEGACY       0.926763       3.911896
+COVER       62.036915       4.131136
+COVER       2.194398       4.131136
+FAST15       12.169025       3.903719
+FAST15       0.156552       3.903719
+FAST16       11.886255       4.005077
+FAST16       0.155506       4.005077
+FAST17       11.886955       4.097811
+FAST17       0.176327       4.097811
+FAST18       12.544698       4.136081
+FAST18       0.171796       4.136081
+FAST19       12.920868       4.166021
+FAST19       0.207029       4.166021
+FAST20       15.771429       4.163740
+FAST20       0.258685       4.163740
+FAST21       33.165829       4.157057
+FAST21       0.663088       4.157057
+FAST22       68.779201       4.158195
+FAST22       1.568439       4.158195
+FAST23       121.921931       4.161450
+FAST23       2.498972       4.161450
+FAST24       221.990451       4.159658
+FAST24       5.793594       4.159658
 
 hg-changelog
 NODICT       0.000004       1.377613
-RANDOM       0.582067       2.096785
-LEGACY       2.739515       2.058273
-FAST15       35.682665       2.127596
-FAST15       0.931621       2.115299
-FAST16       36.557988       2.141787
-FAST16       1.008155       2.136080
-FAST17       36.272242       2.155332
-FAST17       0.906803       2.154596
-FAST18       35.542043       2.171997
-FAST18       1.063101       2.167723
-FAST19       37.756934       2.180893
-FAST19       1.257291       2.173768
-FAST20       40.273755       2.179442
-FAST20       1.630522       2.170072
-FAST21       54.606548       2.181400
-FAST21       2.321266       2.171643
-FAST22       72.454066       2.178774
-FAST22       5.092888       2.168885
-FAST23       106.753208       2.180347
-FAST23       14.722222       2.170673
-FAST24       171.083201       2.183426
-FAST24       27.575575       2.170623
-COVER       227.219660       2.188654
+RANDOM       0.549307       2.096785
+LEGACY       2.273818       2.058273
+COVER       219.640608       2.188654
+COVER       6.055391       2.188654
+FAST15       67.820700       2.127194
+FAST15       0.824624       2.127194
+FAST16       69.774209       2.145401
+FAST16       0.889737       2.145401
+FAST17       70.027355       2.157544
+FAST17       0.869004       2.157544
+FAST18       68.229652       2.173127
+FAST18       0.930689       2.173127
+FAST19       70.696241       2.179527
+FAST19       1.385515       2.179527
+FAST20       80.618172       2.183233
+FAST20       1.699632       2.183233
+FAST21       96.366254       2.180920
+FAST21       2.606553       2.180920
+FAST22       139.440758       2.184297
+FAST22       5.962606       2.184297
+FAST23       207.791930       2.187666
+FAST23       14.823301       2.187666
+FAST24       322.050385       2.189889
+FAST24       29.294918       2.189889
 
 hg-manifest
-NODICT       0.000007       1.866385
-RANDOM       1.086571       2.309485
-LEGACY       9.567507       2.506775
-FAST15       77.811380       2.380461
-FAST15       1.969718       2.317727
-FAST16       75.789019       2.469144
-FAST16       2.051283       2.375815
-FAST17       79.659040       2.539069
-FAST17       1.995394       2.501047
-FAST18       76.281105       2.578095
-FAST18       2.059272       2.564840
-FAST19       79.395382       2.590433
-FAST19       2.354158       2.591024
-FAST20       87.937568       2.597813
-FAST20       2.922189       2.597104
-FAST21       121.760549       2.598408
-FAST21       4.798981       2.600269
-FAST22       155.878461       2.594560
-FAST22       8.151807       2.601047
-FAST23       194.238003       2.596761
-FAST23       15.160578       2.592985
-FAST24       267.425904       2.597657
-FAST24       29.513286       2.600363
-COVER       930.675322       2.582597
+NODICT       0.000008       1.866385
+RANDOM       1.075766       2.309485
+LEGACY       8.688387       2.506775
+COVER       926.024689       2.582597
+COVER       33.630695       2.582597
+FAST15       152.845945       2.377689
+FAST15       2.206285       2.377689
+FAST16       147.772371       2.464814
+FAST16       1.937997       2.464814
+FAST17       147.729498       2.539834
+FAST17       1.966577       2.539834
+FAST18       144.156821       2.576924
+FAST18       1.954106       2.576924
+FAST19       145.678760       2.592479
+FAST19       2.096876       2.592479
+FAST20       159.634674       2.594551
+FAST20       2.568766       2.594551
+FAST21       228.116552       2.597128
+FAST21       4.634508       2.597128
+FAST22       288.890644       2.596971
+FAST22       6.618204       2.596971
+FAST23       377.196211       2.601416
+FAST23       13.497286       2.601416
+FAST24       503.208577       2.602830
+FAST24       29.538585       2.602830
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index 9feaae59..a775eae3 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -277,7 +277,8 @@ int main(int argCount, const char* argv[])
   int result = 0;
 
   /* Initialize arguments to default values */
-  const unsigned k = 200;
+  unsigned k = 200;
+  unsigned d = 8;
   const unsigned cLevel = DEFAULT_CLEVEL;
   const unsigned dictID = 0;
   const unsigned maxDictSize = g_defaultMaxDictSize;
@@ -360,47 +361,6 @@ int main(int argCount, const char* argv[])
     }
   }
 
-  /* for fastCover */
-  for (unsigned f = 15; f < 25; f++){
-    DISPLAYLEVEL(2, "current f is %u\n", f);
-    /* for fastCover (optimizing k) */
-    {
-      ZDICT_fastCover_params_t fastParam;
-      memset(&fastParam, 0, sizeof(fastParam));
-      fastParam.zParams = zParams;
-      fastParam.splitPoint = 1.0;
-      fastParam.d = 8;
-      fastParam.f = f;
-      fastParam.steps = 40;
-      fastParam.nbThreads = 1;
-      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-      if(fastOptResult) {
-        result = 1;
-        goto _cleanup;
-      }
-    }
-
-    /* for fastCover (with k provided) */
-    {
-      ZDICT_fastCover_params_t fastParam;
-      memset(&fastParam, 0, sizeof(fastParam));
-      fastParam.zParams = zParams;
-      fastParam.splitPoint = 1.0;
-      fastParam.d = 8;
-      fastParam.f = f;
-      fastParam.k = 200;
-      fastParam.steps = 40;
-      fastParam.nbThreads = 1;
-      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-      if(fastOptResult) {
-        result = 1;
-        goto _cleanup;
-      }
-    }
-  }
-
   /* for cover */
   {
     ZDICT_cover_params_t coverParam;
@@ -415,8 +375,73 @@ int main(int argCount, const char* argv[])
       result = 1;
       goto _cleanup;
     }
+
+    k = coverParam.k;
+    d = coverParam.d;
+
+    /* for COVER with k and d provided */
+    ZDICT_cover_params_t covernParam;
+    memset(&covernParam, 0, sizeof(covernParam));
+    covernParam.zParams = zParams;
+    covernParam.splitPoint = 1.0;
+    covernParam.steps = 40;
+    covernParam.nbThreads = 1;
+    covernParam.k = k;
+    covernParam.d = d;
+    const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &covernParam, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", covernParam.k, covernParam.d, covernParam.steps, (unsigned)(covernParam.splitPoint * 100));
+    if(coverResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
+  /* for fastCover */
+  for (unsigned f = 15; f < 25; f++){
+    DISPLAYLEVEL(2, "current f is %u\n", f);
+    /* for fastCover (optimizing k and d) */
+    {
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.splitPoint = 1.0;
+      fastParam.f = f;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
+
+      k = fastParam.k;
+      d = fastParam.d;
+    }
+
+
+    /* for fastCover (with k and d provided) */
+    {
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.splitPoint = 1.0;
+      fastParam.d = d;
+      fastParam.f = f;
+      fastParam.k = k;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
+    }
+  }
+
+
+
 
   /* Free allocated memory */
 _cleanup:
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index 6f990e0c..d6b3254e 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -267,7 +267,7 @@ static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *
     size_t currSampleStart = ctx->offsets[i];
     size_t currSampleEnd = ctx->offsets[i+1];
     start = currSampleStart;
-    while (start + f < currSampleEnd) {
+    while (start + ctx->d <= currSampleEnd) {
       const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d);
       /* if no dmer with same hash value has been seen in current sample */
       if (inCurrSample[dmerIndex] == 0) {

From 49b398e93f5357c4311b678a7e4b4d875035f379 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 27 Jul 2018 13:39:19 -0700
Subject: [PATCH 31/35] Use same param after optimizing cover and fastCover and
 record k and d for benchmarking

---
 .../benchmarkDictBuilder/README.md            | 211 +++++++++---------
 .../benchmarkDictBuilder/benchmark.c          |  74 ++----
 2 files changed, 129 insertions(+), 156 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 04866b7e..654ca409 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -13,113 +13,114 @@ Benchmark given input files: make ARG= followed by permitted arguments
 make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 
 ###Benchmarking Result:
-First Cover is optimize cover, second Cover uses optimized d and k from first one.
-For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
+- First Cover is optimize cover, second Cover uses optimized d and k from first one.
+- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
+- Fourth column is chosen d and fifth column is chosen k
 
 github:
-NODICT       0.000004       2.999642
-RANDOM       0.146096       8.786957
-LEGACY       0.956888       8.989482
-COVER       56.596152       10.641263
-COVER       4.937047       10.641263
-FAST15       17.722269       10.586461
-FAST15       0.239135       10.586461
-FAST16       18.276179       10.492503
-FAST16       0.265285       10.492503
-FAST17       18.077916       10.611737
-FAST17       0.236573       10.611737
-FAST18       19.510150       10.621586
-FAST18       0.278683       10.621586
-FAST19       18.794350       10.629626
-FAST19       0.307943       10.629626
-FAST20       19.671099       10.610308
-FAST20       0.428814       10.610308
-FAST21       36.527238       10.625733
-FAST21       0.716384       10.625733
-FAST22       83.803521       10.625281
-FAST22       1.290246       10.625281
-FAST23       158.287924       10.602342
-FAST23       3.084848       10.602342
-FAST24       283.630941       10.603379
-FAST24       8.088933       10.603379
+NODICT       0.000004       2.999642        
+RANDOM       0.146096       8.786957        
+LEGACY       0.956888       8.989482        
+COVER       56.596152       10.641263        8          1298
+COVER       4.937047       10.641263        8          1298
+FAST15       17.722269       10.586461        8          1778
+FAST15       0.239135       10.586461        8          1778
+FAST16       18.276179       10.492503        6          1778
+FAST16       0.265285       10.492503        6          1778
+FAST17       18.077916       10.611737        8          1778
+FAST17       0.236573       10.611737        8          1778
+FAST18       19.510150       10.621586        8          1778
+FAST18       0.278683       10.621586        8          1778
+FAST19       18.794350       10.629626        8          1778
+FAST19       0.307943       10.629626        8          1778
+FAST20       19.671099       10.610308        8          1778
+FAST20       0.428814       10.610308        8          1778
+FAST21       36.527238       10.625733        8          1778
+FAST21       0.716384       10.625733        8          1778
+FAST22       83.803521       10.625281        8          1778
+FAST22       1.290246       10.625281        8          1778
+FAST23       158.287924       10.602342        8          1778
+FAST23       3.084848       10.602342        8          1778
+FAST24       283.630941       10.603379        8          1778
+FAST24       8.088933       10.603379        8          1778
 
-hg-commands
-NODICT       0.000007       2.425291
-RANDOM       0.084010       3.489515
-LEGACY       0.926763       3.911896
-COVER       62.036915       4.131136
-COVER       2.194398       4.131136
-FAST15       12.169025       3.903719
-FAST15       0.156552       3.903719
-FAST16       11.886255       4.005077
-FAST16       0.155506       4.005077
-FAST17       11.886955       4.097811
-FAST17       0.176327       4.097811
-FAST18       12.544698       4.136081
-FAST18       0.171796       4.136081
-FAST19       12.920868       4.166021
-FAST19       0.207029       4.166021
-FAST20       15.771429       4.163740
-FAST20       0.258685       4.163740
-FAST21       33.165829       4.157057
-FAST21       0.663088       4.157057
-FAST22       68.779201       4.158195
-FAST22       1.568439       4.158195
-FAST23       121.921931       4.161450
-FAST23       2.498972       4.161450
-FAST24       221.990451       4.159658
-FAST24       5.793594       4.159658
+hg-commands:
+NODICT       0.000007       2.425291        
+RANDOM       0.084010       3.489515        
+LEGACY       0.926763       3.911896        
+COVER       62.036915       4.131136        8          386
+COVER       2.194398       4.131136        8          386
+FAST15       12.169025       3.903719        6          1106
+FAST15       0.156552       3.903719        6          1106
+FAST16       11.886255       4.005077        8          530
+FAST16       0.155506       4.005077        8          530
+FAST17       11.886955       4.097811        8          818
+FAST17       0.176327       4.097811        8          818
+FAST18       12.544698       4.136081        8          770
+FAST18       0.171796       4.136081        8          770
+FAST19       12.920868       4.166021        8          530
+FAST19       0.207029       4.166021        8          530
+FAST20       15.771429       4.163740        8          482
+FAST20       0.258685       4.163740        8          482
+FAST21       33.165829       4.157057        8          434
+FAST21       0.663088       4.157057        8          434
+FAST22       68.779201       4.158195        8          290
+FAST22       1.568439       4.158195        8          290
+FAST23       121.921931       4.161450        8          434
+FAST23       2.498972       4.161450        8          434
+FAST24       221.990451       4.159658        8          338
+FAST24       5.793594       4.159658        8          338
 
-hg-changelog
-NODICT       0.000004       1.377613
-RANDOM       0.549307       2.096785
-LEGACY       2.273818       2.058273
-COVER       219.640608       2.188654
-COVER       6.055391       2.188654
-FAST15       67.820700       2.127194
-FAST15       0.824624       2.127194
-FAST16       69.774209       2.145401
-FAST16       0.889737       2.145401
-FAST17       70.027355       2.157544
-FAST17       0.869004       2.157544
-FAST18       68.229652       2.173127
-FAST18       0.930689       2.173127
-FAST19       70.696241       2.179527
-FAST19       1.385515       2.179527
-FAST20       80.618172       2.183233
-FAST20       1.699632       2.183233
-FAST21       96.366254       2.180920
-FAST21       2.606553       2.180920
-FAST22       139.440758       2.184297
-FAST22       5.962606       2.184297
-FAST23       207.791930       2.187666
-FAST23       14.823301       2.187666
-FAST24       322.050385       2.189889
-FAST24       29.294918       2.189889
+hg-changelog:
+NODICT       0.000004       1.377613        
+RANDOM       0.549307       2.096785        
+LEGACY       2.273818       2.058273        
+COVER       219.640608       2.188654        8          98
+COVER       6.055391       2.188654        8          98
+FAST15       67.820700       2.127194        8          866
+FAST15       0.824624       2.127194        8          866
+FAST16       69.774209       2.145401        8          338
+FAST16       0.889737       2.145401        8          338
+FAST17       70.027355       2.157544        8          194
+FAST17       0.869004       2.157544        8          194
+FAST18       68.229652       2.173127        8          98
+FAST18       0.930689       2.173127        8          98
+FAST19       70.696241       2.179527        8          98
+FAST19       1.385515       2.179527        8          98
+FAST20       80.618172       2.183233        6          98
+FAST20       1.699632       2.183233        6          98
+FAST21       96.366254       2.180920        8          98
+FAST21       2.606553       2.180920        8          98
+FAST22       139.440758       2.184297        8          98
+FAST22       5.962606       2.184297        8          98
+FAST23       207.791930       2.187666        6          98
+FAST23       14.823301       2.187666        6          98
+FAST24       322.050385       2.189889        6          98
+FAST24       29.294918       2.189889        6          98
 
-hg-manifest
-NODICT       0.000008       1.866385
-RANDOM       1.075766       2.309485
-LEGACY       8.688387       2.506775
-COVER       926.024689       2.582597
-COVER       33.630695       2.582597
-FAST15       152.845945       2.377689
-FAST15       2.206285       2.377689
-FAST16       147.772371       2.464814
-FAST16       1.937997       2.464814
-FAST17       147.729498       2.539834
-FAST17       1.966577       2.539834
-FAST18       144.156821       2.576924
-FAST18       1.954106       2.576924
-FAST19       145.678760       2.592479
-FAST19       2.096876       2.592479
-FAST20       159.634674       2.594551
-FAST20       2.568766       2.594551
-FAST21       228.116552       2.597128
-FAST21       4.634508       2.597128
-FAST22       288.890644       2.596971
-FAST22       6.618204       2.596971
-FAST23       377.196211       2.601416
-FAST23       13.497286       2.601416
-FAST24       503.208577       2.602830
-FAST24       29.538585       2.602830
+hg-manifest:
+NODICT       0.000008       1.866385        
+RANDOM       1.075766       2.309485        
+LEGACY       8.688387       2.506775        
+COVER       926.024689       2.582597        8          434
+COVER       33.630695       2.582597        8          434
+FAST15       152.845945       2.377689        8          1682
+FAST15       2.206285       2.377689        8          1682
+FAST16       147.772371       2.464814        8          1538
+FAST16       1.937997       2.464814        8          1538
+FAST17       147.729498       2.539834        6          1826
+FAST17       1.966577       2.539834        6          1826
+FAST18       144.156821       2.576924        8          1922
+FAST18       1.954106       2.576924        8          1922
+FAST19       145.678760       2.592479        6          290
+FAST19       2.096876       2.592479        6          290
+FAST20       159.634674       2.594551        8          194
+FAST20       2.568766       2.594551        8          194
+FAST21       228.116552       2.597128        6          194
+FAST21       4.634508       2.597128        6          194
+FAST22       288.890644       2.596971        6          386
+FAST22       6.618204       2.596971        6          386
+FAST23       377.196211       2.601416        8          194
+FAST23       13.497286       2.601416        8          194
+FAST24       503.208577       2.602830        6          194
+FAST24       29.538585       2.602830        6          194
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index a775eae3..75008a08 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -251,7 +251,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random
     result = 1;
     goto _cleanup;
   }
-  DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec);
+  DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, timeSec);
 
   /* Calculate compression ratio */
   const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel);
@@ -261,7 +261,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random
     goto _cleanup;
 
   }
-  DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio);
+  DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, cRatio);
 
 _cleanup:
   freeDictInfo(dInfo);
@@ -376,73 +376,45 @@ int main(int argCount, const char* argv[])
       goto _cleanup;
     }
 
-    k = coverParam.k;
-    d = coverParam.d;
-
-    /* for COVER with k and d provided */
-    ZDICT_cover_params_t covernParam;
-    memset(&covernParam, 0, sizeof(covernParam));
-    covernParam.zParams = zParams;
-    covernParam.splitPoint = 1.0;
-    covernParam.steps = 40;
-    covernParam.nbThreads = 1;
-    covernParam.k = k;
-    covernParam.d = d;
-    const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &covernParam, NULL, NULL);
-    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", covernParam.k, covernParam.d, covernParam.steps, (unsigned)(covernParam.splitPoint * 100));
+    const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
     if(coverResult) {
       result = 1;
       goto _cleanup;
     }
+
   }
 
   /* for fastCover */
   for (unsigned f = 15; f < 25; f++){
     DISPLAYLEVEL(2, "current f is %u\n", f);
     /* for fastCover (optimizing k and d) */
-    {
-      ZDICT_fastCover_params_t fastParam;
-      memset(&fastParam, 0, sizeof(fastParam));
-      fastParam.zParams = zParams;
-      fastParam.splitPoint = 1.0;
-      fastParam.f = f;
-      fastParam.steps = 40;
-      fastParam.nbThreads = 1;
-      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-      if(fastOptResult) {
-        result = 1;
-        goto _cleanup;
-      }
-
-      k = fastParam.k;
-      d = fastParam.d;
+    ZDICT_fastCover_params_t fastParam;
+    memset(&fastParam, 0, sizeof(fastParam));
+    fastParam.zParams = zParams;
+    fastParam.splitPoint = 1.0;
+    fastParam.f = f;
+    fastParam.steps = 40;
+    fastParam.nbThreads = 1;
+    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+    if(fastOptResult) {
+      result = 1;
+      goto _cleanup;
     }
 
 
     /* for fastCover (with k and d provided) */
-    {
-      ZDICT_fastCover_params_t fastParam;
-      memset(&fastParam, 0, sizeof(fastParam));
-      fastParam.zParams = zParams;
-      fastParam.splitPoint = 1.0;
-      fastParam.d = d;
-      fastParam.f = f;
-      fastParam.k = k;
-      fastParam.steps = 40;
-      fastParam.nbThreads = 1;
-      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-      if(fastOptResult) {
-        result = 1;
-        goto _cleanup;
-      }
+    const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
+    if(fastResult) {
+      result = 1;
+      goto _cleanup;
     }
+
   }
 
 
-
-
   /* Free allocated memory */
 _cleanup:
   UTIL_freeFileList(extendedFileList, fileNamesBuf);

From 61262f6c0dc137e078bbc4cd1131fc3b88657414 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 27 Jul 2018 16:51:38 -0700
Subject: [PATCH 32/35] Save segmentFreqs in ctx instead of malloc and memset
 in SelectSegment

---
 .../benchmarkDictBuilder/README.md            | 113 ++++++++++++++++++
 .../benchmarkDictBuilder/test.sh              |  10 +-
 .../fastCover/Makefile                        |   6 +-
 .../fastCover/fastCover.c                     |  28 +++--
 4 files changed, 141 insertions(+), 16 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 654ca409..a818e6eb 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -17,6 +17,8 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 - For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
 - Fourth column is chosen d and fifth column is chosen k
 
+Version 1:
+
 github:
 NODICT       0.000004       2.999642        
 RANDOM       0.146096       8.786957        
@@ -124,3 +126,114 @@ FAST23       377.196211       2.601416        8          194
 FAST23       13.497286       2.601416        8          194
 FAST24       503.208577       2.602830        6          194
 FAST24       29.538585       2.602830        6          194
+
+---------------------------------------------------------------
+Version 2 (save segmentFreqs in ctx instead of malloc and memset in every call to SelectSegment):
+
+github:
+NODICT       0.000005       2.999642        
+RANDOM       0.141553       8.786957        
+LEGACY       0.904340       8.989482        
+COVER       53.621302       10.641263        8          1298
+COVER       4.085037       10.641263        8          1298
+FAST15       17.636211       10.586461        8          1778
+FAST15       0.221236       10.586461        8          1778
+FAST16       18.716259       10.492503        6          1778
+FAST16       0.251522       10.492503        6          1778
+FAST17       17.614391       10.611737        8          1778
+FAST17       0.241011       10.611737        8          1778
+FAST18       19.926270       10.621586        8          1778
+FAST18       0.287195       10.621586        8          1778
+FAST19       19.626808       10.629626        8          1778
+FAST19       0.340191       10.629626        8          1778
+FAST20       18.918657       10.610308        8          1778
+FAST20       0.463307       10.610308        8          1778
+FAST21       20.502362       10.625733        8          1778
+FAST21       0.638202       10.625733        8          1778
+FAST22       22.702695       10.625281        8          1778
+FAST22       1.353399       10.625281        8          1778
+FAST23       28.041990       10.602342        8          1778
+FAST23       3.029502       10.602342        8          1778
+FAST24       35.662961       10.603379        8          1778
+FAST24       6.524258       10.603379        8          1778
+
+hg-commands:
+NODICT       0.000005       2.425291        
+RANDOM       0.080469       3.489515        
+LEGACY       0.794417       3.911896        
+COVER       54.198788       4.131136        8          386
+COVER       2.191729       4.131136        8          386
+FAST15       11.852793       3.903719        6          1106
+FAST15       0.175406       3.903719        6          1106
+FAST16       12.863315       4.005077        8          530
+FAST16       0.158410       4.005077        8          530
+FAST17       11.977917       4.097811        8          818
+FAST17       0.162381       4.097811        8          818
+FAST18       11.749304       4.136081        8          770
+FAST18       0.173242       4.136081        8          770
+FAST19       11.905785       4.166021        8          530
+FAST19       0.186403       4.166021        8          530
+FAST20       13.293999       4.163740        8          482
+FAST20       0.241508       4.163740        8          482
+FAST21       16.623177       4.157057        8          434
+FAST21       0.372647       4.157057        8          434
+FAST22       20.918409       4.158195        8          290
+FAST22       0.570431       4.158195        8          290
+FAST23       21.762805       4.161450        8          434
+FAST23       1.162206       4.161450        8          434
+FAST24       29.133745       4.159658        8          338
+FAST24       3.054376       4.159658        8          338
+
+hg-changelog:
+NODICT       0.000006       1.377613        
+RANDOM       0.601346       2.096785        
+LEGACY       2.544973       2.058273        
+COVER       222.639708       2.188654        8          98
+COVER       6.072892       2.188654        8          98
+FAST15       70.394523       2.127194        8          866
+FAST15       0.899766       2.127194        8          866
+FAST16       69.845529       2.145401        8          338
+FAST16       0.881569       2.145401        8          338
+FAST17       69.382431       2.157544        8          194
+FAST17       0.943291       2.157544        8          194
+FAST18       71.348283       2.173127        8          98
+FAST18       1.034765       2.173127        8          98
+FAST19       71.380923       2.179527        8          98
+FAST19       1.254700       2.179527        8          98
+FAST20       72.802714       2.183233        6          98
+FAST20       1.368704       2.183233        6          98
+FAST21       82.042339       2.180920        8          98
+FAST21       2.213864       2.180920        8          98
+FAST22       90.666200       2.184297        8          98
+FAST22       3.590399       2.184297        8          98
+FAST23       108.926377       2.187666        6          98
+FAST23       8.723759       2.187666        6          98
+FAST24       134.296232       2.189889        6          98
+FAST24       19.396532       2.189889        6          98
+
+hg-manifest:
+NODICT       0.000005       1.866385        
+RANDOM       0.982192       2.309485        
+LEGACY       9.507729       2.506775        
+COVER       922.742066       2.582597        8          434
+COVER       36.500276       2.582597        8          434
+FAST15       163.886717       2.377689        8          1682
+FAST15       2.107328       2.377689        8          1682
+FAST16       152.684592       2.464814        8          1538
+FAST16       2.157789       2.464814        8          1538
+FAST17       154.463459       2.539834        6          1826
+FAST17       2.282455       2.539834        6          1826
+FAST18       155.540044       2.576924        8          1922
+FAST18       2.101807       2.576924        8          1922
+FAST19       152.650343       2.592479        6          290
+FAST19       2.359461       2.592479        6          290
+FAST20       174.623634       2.594551        8          194
+FAST20       2.870022       2.594551        8          194
+FAST21       219.876653       2.597128        6          194
+FAST21       4.386269       2.597128        6          194
+FAST22       247.986803       2.596971        6          386
+FAST22       6.201144       2.596971        6          386
+FAST23       276.051806       2.601416        8          194
+FAST23       11.613477       2.601416        8          194
+FAST24       328.234024       2.602830        6          194
+FAST24       26.710364       2.602830        6          194
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
index 5eaf5930..e5508ded 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
@@ -1,2 +1,8 @@
-echo "Benchmark with in=../../lib/common"
-./benchmark in=../../../lib/common
+echo "-----------------github--------------------"
+./benchmark in=github
+echo "-----------------hg-commands--------------------"
+./benchmark in=hg-commands
+echo "-----------------hg-changelog--------------------"
+./benchmark in=hg-changelog
+echo "------------------hg-manifest-------------------"
+./benchmark in=hg-manifest
diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile
index 9c56013d..4a7cc17d 100644
--- a/contrib/experimental_dict_builders/fastCover/Makefile
+++ b/contrib/experimental_dict_builders/fastCover/Makefile
@@ -1,7 +1,7 @@
 ARG :=
 
 CC ?= gcc
-CFLAGS ?= -O3
+CFLAGS ?= -O3 -g
 INCLUDES := -I ../../../programs -I ../randomDictBuilder -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
 IO_FILE := ../randomDictBuilder/io.c
@@ -9,7 +9,7 @@ IO_FILE := ../randomDictBuilder/io.c
 TEST_INPUT := ../../../lib
 TEST_OUTPUT := fastCoverDict
 
-all: main run clean
+all: main run
 
 .PHONY: test
 test: main testrun testshell clean
@@ -32,7 +32,7 @@ io.o: $(IO_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
 
 libzstd.a:
-	$(MAKE) -C ../../../lib libzstd.a
+	$(MAKE) MOREFLAGS=-g -C ../../../lib libzstd.a
 	mv ../../../lib/libzstd.a .
 
 .PHONY: testrun
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index d6b3254e..3c1aa951 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -82,6 +82,7 @@ typedef struct {
   size_t nbTestSamples;
   size_t nbDmers;
   U32 *freqs;
+  U16 *segmentFreqs;
   unsigned d;
 } FASTCOVER_ctx_t;
 
@@ -142,9 +143,6 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
   activeSegment.end = begin;
   activeSegment.score = 0;
   {
-    /* Keep track of number of times an index has been seen in current segment */
-    U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16));
-    memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs));
     /* Slide the activeSegment through the whole epoch.
      * Save the best segment in bestSegment.
      */
@@ -152,19 +150,19 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
       /* Get hash value of current dmer */
       const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d);
       /* Add frequency of this index to score if this is the first occurence of index in active segment */
-      if (currfreqs[index] == 0) {
+      if (ctx->segmentFreqs[index] == 0) {
         activeSegment.score += freqs[index];
       }
-      currfreqs[index] += 1;
+      ctx->segmentFreqs[index] += 1;
       /* Increment end of segment */
       activeSegment.end += 1;
       /* If the window is now too large, drop the first position */
       if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
         /* Get hash value of the dmer to be eliminated from active segment */
         const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
-        currfreqs[delIndex] -= 1;
+        ctx->segmentFreqs[delIndex] -= 1;
         /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
-        if (currfreqs[delIndex] == 0) {
+        if (ctx->segmentFreqs[delIndex] == 0) {
           activeSegment.score -= freqs[delIndex];
         }
         /* Increment start of segment */
@@ -175,7 +173,12 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
         bestSegment = activeSegment;
       }
     }
-    free(currfreqs);
+    /* Zero out rest of segmentFreqs array */
+    while (activeSegment.begin < end) {
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d);
+      ctx->segmentFreqs[delIndex] -= 1;
+      activeSegment.begin += 1;
+    }
   }
   {
     /* Trim off the zero frequency head and tail from the segment. */
@@ -245,6 +248,10 @@ static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
   if (!ctx) {
     return;
   }
+  if (ctx->segmentFreqs) {
+    free(ctx->segmentFreqs);
+    ctx->segmentFreqs = NULL;
+  }
   if (ctx->freqs) {
     free(ctx->freqs);
     ctx->freqs = NULL;
@@ -347,9 +354,8 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
   }
 
   /* Initialize frequency array of size 2^f */
-  ctx->freqs =(U32 *)malloc((1 << f) * sizeof(U32));
-  memset(ctx->freqs, 0, (1 << f) * sizeof(U32));
-
+  ctx->freqs = (U32 *)calloc((1 << f), sizeof(U32));
+  ctx->segmentFreqs = (U16 *)calloc((1 << f), sizeof(U16));
   DISPLAYLEVEL(2, "Computing frequencies\n");
   FASTCOVER_computeFrequency(ctx->freqs, f, ctx);
 

From 96d84ee235f4d6cbf71c415a1a0327235751ba86 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 27 Jul 2018 16:54:05 -0700
Subject: [PATCH 33/35] Revert test.sh

---
 .../benchmarkDictBuilder/test.sh                       | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
index e5508ded..5eaf5930 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
@@ -1,8 +1,2 @@
-echo "-----------------github--------------------"
-./benchmark in=github
-echo "-----------------hg-commands--------------------"
-./benchmark in=hg-commands
-echo "-----------------hg-changelog--------------------"
-./benchmark in=hg-changelog
-echo "------------------hg-manifest-------------------"
-./benchmark in=hg-manifest
+echo "Benchmark with in=../../lib/common"
+./benchmark in=../../../lib/common

From 53ef22a4bc3844f860531dce31481db8b6fcd9bf Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 27 Jul 2018 16:56:50 -0700
Subject: [PATCH 34/35] Undo deleting clean in make

---
 contrib/experimental_dict_builders/fastCover/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile
index 4a7cc17d..3ba24790 100644
--- a/contrib/experimental_dict_builders/fastCover/Makefile
+++ b/contrib/experimental_dict_builders/fastCover/Makefile
@@ -9,7 +9,7 @@ IO_FILE := ../randomDictBuilder/io.c
 TEST_INPUT := ../../../lib
 TEST_OUTPUT := fastCoverDict
 
-all: main run
+all: main run clean
 
 .PHONY: test
 test: main testrun testshell clean

From 51b109c1b5991d3a9bac7bbd5e82065a816777cb Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 27 Jul 2018 17:31:33 -0700
Subject: [PATCH 35/35] Delete old benchmarking result

---
 .../benchmarkDictBuilder/README.md            | 113 ------------------
 1 file changed, 113 deletions(-)

diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index a818e6eb..20fbde95 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -17,119 +17,6 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 - For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
 - Fourth column is chosen d and fifth column is chosen k
 
-Version 1:
-
-github:
-NODICT       0.000004       2.999642        
-RANDOM       0.146096       8.786957        
-LEGACY       0.956888       8.989482        
-COVER       56.596152       10.641263        8          1298
-COVER       4.937047       10.641263        8          1298
-FAST15       17.722269       10.586461        8          1778
-FAST15       0.239135       10.586461        8          1778
-FAST16       18.276179       10.492503        6          1778
-FAST16       0.265285       10.492503        6          1778
-FAST17       18.077916       10.611737        8          1778
-FAST17       0.236573       10.611737        8          1778
-FAST18       19.510150       10.621586        8          1778
-FAST18       0.278683       10.621586        8          1778
-FAST19       18.794350       10.629626        8          1778
-FAST19       0.307943       10.629626        8          1778
-FAST20       19.671099       10.610308        8          1778
-FAST20       0.428814       10.610308        8          1778
-FAST21       36.527238       10.625733        8          1778
-FAST21       0.716384       10.625733        8          1778
-FAST22       83.803521       10.625281        8          1778
-FAST22       1.290246       10.625281        8          1778
-FAST23       158.287924       10.602342        8          1778
-FAST23       3.084848       10.602342        8          1778
-FAST24       283.630941       10.603379        8          1778
-FAST24       8.088933       10.603379        8          1778
-
-hg-commands:
-NODICT       0.000007       2.425291        
-RANDOM       0.084010       3.489515        
-LEGACY       0.926763       3.911896        
-COVER       62.036915       4.131136        8          386
-COVER       2.194398       4.131136        8          386
-FAST15       12.169025       3.903719        6          1106
-FAST15       0.156552       3.903719        6          1106
-FAST16       11.886255       4.005077        8          530
-FAST16       0.155506       4.005077        8          530
-FAST17       11.886955       4.097811        8          818
-FAST17       0.176327       4.097811        8          818
-FAST18       12.544698       4.136081        8          770
-FAST18       0.171796       4.136081        8          770
-FAST19       12.920868       4.166021        8          530
-FAST19       0.207029       4.166021        8          530
-FAST20       15.771429       4.163740        8          482
-FAST20       0.258685       4.163740        8          482
-FAST21       33.165829       4.157057        8          434
-FAST21       0.663088       4.157057        8          434
-FAST22       68.779201       4.158195        8          290
-FAST22       1.568439       4.158195        8          290
-FAST23       121.921931       4.161450        8          434
-FAST23       2.498972       4.161450        8          434
-FAST24       221.990451       4.159658        8          338
-FAST24       5.793594       4.159658        8          338
-
-hg-changelog:
-NODICT       0.000004       1.377613        
-RANDOM       0.549307       2.096785        
-LEGACY       2.273818       2.058273        
-COVER       219.640608       2.188654        8          98
-COVER       6.055391       2.188654        8          98
-FAST15       67.820700       2.127194        8          866
-FAST15       0.824624       2.127194        8          866
-FAST16       69.774209       2.145401        8          338
-FAST16       0.889737       2.145401        8          338
-FAST17       70.027355       2.157544        8          194
-FAST17       0.869004       2.157544        8          194
-FAST18       68.229652       2.173127        8          98
-FAST18       0.930689       2.173127        8          98
-FAST19       70.696241       2.179527        8          98
-FAST19       1.385515       2.179527        8          98
-FAST20       80.618172       2.183233        6          98
-FAST20       1.699632       2.183233        6          98
-FAST21       96.366254       2.180920        8          98
-FAST21       2.606553       2.180920        8          98
-FAST22       139.440758       2.184297        8          98
-FAST22       5.962606       2.184297        8          98
-FAST23       207.791930       2.187666        6          98
-FAST23       14.823301       2.187666        6          98
-FAST24       322.050385       2.189889        6          98
-FAST24       29.294918       2.189889        6          98
-
-hg-manifest:
-NODICT       0.000008       1.866385        
-RANDOM       1.075766       2.309485        
-LEGACY       8.688387       2.506775        
-COVER       926.024689       2.582597        8          434
-COVER       33.630695       2.582597        8          434
-FAST15       152.845945       2.377689        8          1682
-FAST15       2.206285       2.377689        8          1682
-FAST16       147.772371       2.464814        8          1538
-FAST16       1.937997       2.464814        8          1538
-FAST17       147.729498       2.539834        6          1826
-FAST17       1.966577       2.539834        6          1826
-FAST18       144.156821       2.576924        8          1922
-FAST18       1.954106       2.576924        8          1922
-FAST19       145.678760       2.592479        6          290
-FAST19       2.096876       2.592479        6          290
-FAST20       159.634674       2.594551        8          194
-FAST20       2.568766       2.594551        8          194
-FAST21       228.116552       2.597128        6          194
-FAST21       4.634508       2.597128        6          194
-FAST22       288.890644       2.596971        6          386
-FAST22       6.618204       2.596971        6          386
-FAST23       377.196211       2.601416        8          194
-FAST23       13.497286       2.601416        8          194
-FAST24       503.208577       2.602830        6          194
-FAST24       29.538585       2.602830        6          194
-
----------------------------------------------------------------
-Version 2 (save segmentFreqs in ctx instead of malloc and memset in every call to SelectSegment):
-
 github:
 NODICT       0.000005       2.999642        
 RANDOM       0.141553       8.786957