From 2bd85f41994e9695911cfc4c86fbc04fdb35ee82 Mon Sep 17 00:00:00 2001
From: "W. Felix Handte" <w@felixhandte.com>
Date: Fri, 22 Sep 2017 11:55:42 -0700
Subject: [PATCH 1/3] Add Dictionary Support to the Command Line Tool

---
 programs/lz4cli.c | 27 +++++++++++++++
 programs/lz4io.c  | 84 ++++++++++++++++++++++++++++++++++++++++++++---
 programs/lz4io.h  |  2 ++
 3 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/programs/lz4cli.c b/programs/lz4cli.c
index ff489c6..857fa65 100644
--- a/programs/lz4cli.c
+++ b/programs/lz4cli.c
@@ -113,6 +113,7 @@ static int usage(const char* exeName)
     DISPLAY( " -9     : High compression \n");
     DISPLAY( " -d     : decompression (default for %s extension)\n", LZ4_EXTENSION);
     DISPLAY( " -z     : force compression \n");
+    DISPLAY( " -D FILE: use dictionary in FILE \n");
     DISPLAY( " -f     : overwrite output without prompting \n");
     DISPLAY( " -k     : preserve source files(s)  (default) \n");
     DISPLAY( "--rm    : remove source file(s) after successful de/compression \n");
@@ -290,6 +291,7 @@ int main(int argc, const char** argv)
     operationMode_e mode = om_auto;
     const char* input_filename = NULL;
     const char* output_filename= NULL;
+    const char* dictionary_filename = NULL;
     char* dynNameSpace = NULL;
     const char** inFileNames = (const char**) calloc(argc, sizeof(char*));
     unsigned ifnIdx=0;
@@ -399,6 +401,22 @@ int main(int argc, const char** argv)
                     /* Compression (default) */
                 case 'z': mode = om_compress; break;
 
+                case 'D':
+                    if (argument[1] == '\0') {
+                        /* path is next arg */
+                        if (i + 1 == argc) {
+                            /* there is no next arg */
+                            badusage(exeName);
+                        }
+                        dictionary_filename = argv[++i];
+                    } else {
+                        /* path follows immediately */
+                        dictionary_filename = argument + 1;
+                    }
+                    /* skip to end of argument so that we jump to parsing next argument */
+                    argument += strlen(argument) - 1;
+                    break;
+
                     /* Use Legacy format (ex : Linux kernel compression) */
                 case 'l': legacy_format = 1; blockSize = 8 MB; break;
 
@@ -560,6 +578,15 @@ int main(int argc, const char** argv)
         mode = om_decompress;   /* defer to decompress */
     }
 
+    if (dictionary_filename) {
+        if (!strcmp(dictionary_filename, stdinmark) && IS_CONSOLE(stdin)) {
+            DISPLAYLEVEL(1, "refusing to read from a console\n");
+            exit(1);
+        }
+
+        LZ4IO_setDictionaryFilename(dictionary_filename);
+    }
+
     /* compress or decompress */
     if (!input_filename) input_filename = stdinmark;
     /* Check if input is defined as console; trigger an error in this case */
diff --git a/programs/lz4io.c b/programs/lz4io.c
index 06741b4..642e11c 100644
--- a/programs/lz4io.c
+++ b/programs/lz4io.c
@@ -57,6 +57,7 @@
 #include "lz4.h"       /* still required for legacy format */
 #include "lz4hc.h"     /* still required for legacy format */
 #include "lz4frame.h"
+#include "lz4frame_static.h"
 
 
 /*****************************
@@ -110,6 +111,8 @@ static int g_streamChecksum = 1;
 static int g_blockIndependence = 1;
 static int g_sparseFileSupport = 1;
 static int g_contentSizeFlag = 0;
+static int g_useDictionary = 0;
+static const char* g_dictionaryFilename = NULL;
 
 
 /**************************************
@@ -142,6 +145,12 @@ static int g_contentSizeFlag = 0;
 /* ****************** Parameters ******************** */
 /* ************************************************** */
 
+int LZ4IO_setDictionaryFilename(const char* dictionaryFilename) {
+    g_useDictionary = (dictionaryFilename != NULL);   /* a NULL path turns dictionary mode off */
+    g_dictionaryFilename = dictionaryFilename;        /* pointer is kept as given, not copied */
+    return g_useDictionary;                           /* 1 if a dictionary path is now set, else 0 */
+}
+
 /* Default setting : overwrite = 1; return : overwrite mode (0/1) */
 int LZ4IO_setOverwrite(int yes)
 {
@@ -395,8 +404,53 @@ typedef struct {
     void*  dstBuffer;
     size_t dstBufferSize;
     LZ4F_compressionContext_t ctx;
+    LZ4F_CDict* cdict;
 } cRess_t;
 
+static void* LZ4IO_createDict(const char* dictionaryFilename, size_t *dictionarySize) {
+    FILE* dictionaryFile;
+    size_t blockSize = 64 KB;
+    size_t dictionaryBufferSize = blockSize;
+    size_t readSize;
+    void* dictionaryBuffer;
+    *dictionarySize = 0;
+    dictionaryBuffer = malloc(dictionaryBufferSize);
+
+    if (!dictionaryBuffer) EXM_THROW(25, "Allocation error : not enough memory");
+
+    if (!dictionaryFilename) EXM_THROW(25, "Dictionary error : no filename provided");
+
+    dictionaryFile = LZ4IO_openSrcFile(g_dictionaryFilename);
+    if (!dictionaryFile) EXM_THROW(25, "Dictionary error : could not open dictionary file");
+
+    do {
+        if (*dictionarySize + blockSize > dictionaryBufferSize) {
+            dictionaryBufferSize *= 2;
+            dictionaryBuffer = realloc(dictionaryBuffer, dictionaryBufferSize);
+            if (!dictionaryBuffer) EXM_THROW(26, "Allocation error : not enough memory");
+        }
+        /* Read next block */
+        readSize = fread((char*)dictionaryBuffer + *dictionarySize, (size_t)1, (size_t)blockSize, dictionaryFile);
+        *dictionarySize += readSize;
+    } while (readSize>0);
+
+    return dictionaryBuffer;
+}
+
+static LZ4F_CDict* LZ4IO_createCDict(void) {
+    /* Builds the LZ4F_CDict used for compression; yields NULL when no dictionary is configured. */
+    LZ4F_CDict* cdict = NULL;
+    size_t rawSize;
+    void* rawDict;
+    if (g_useDictionary) {
+        rawDict = LZ4IO_createDict(g_dictionaryFilename, &rawSize);
+        if (!rawDict) EXM_THROW(25, "Dictionary error : could not create dictionary");
+        cdict = LZ4F_createCDict(rawDict, rawSize);
+        free(rawDict);   /* the raw file content is released as soon as the CDict call returns */
+    }
+    return cdict;
+}
+
 static cRess_t LZ4IO_createCResources(void)
 {
     const size_t blockSize = (size_t)LZ4IO_GetBlockSize_FromBlockId (g_blockSizeId);
@@ -412,6 +466,8 @@ static cRess_t LZ4IO_createCResources(void)
     ress.dstBuffer = malloc(ress.dstBufferSize);
     if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(31, "Allocation error : not enough memory");
 
+    ress.cdict = LZ4IO_createCDict();
+
     return ress;
 }
 
@@ -419,6 +475,10 @@ static void LZ4IO_freeCResources(cRess_t ress)
 {
     free(ress.srcBuffer);
     free(ress.dstBuffer);
+
+    LZ4F_freeCDict(ress.cdict);
+    ress.cdict = NULL;
+
     { LZ4F_errorCode_t const errorCode = LZ4F_freeCompressionContext(ress.ctx);
       if (LZ4F_isError(errorCode)) EXM_THROW(38, "Error : can't free LZ4F context resource : %s", LZ4F_getErrorName(errorCode)); }
 }
@@ -472,7 +532,7 @@ static int LZ4IO_compressFilename_extRess(cRess_t ress, const char* srcFileName,
     /* single-block file */
     if (readSize < blockSize) {
         /* Compress in single pass */
-        size_t const cSize = LZ4F_compressFrame(dstBuffer, dstBufferSize, srcBuffer, readSize, &prefs);
+        size_t cSize = LZ4F_compressFrame_usingCDict(dstBuffer, dstBufferSize, srcBuffer, readSize, ress.cdict, &prefs);
         if (LZ4F_isError(cSize)) EXM_THROW(31, "Compression failed : %s", LZ4F_getErrorName(cSize));
         compressedfilesize = cSize;
         DISPLAYUPDATE(2, "\rRead : %u MB   ==> %.2f%%   ",
@@ -488,7 +548,7 @@ static int LZ4IO_compressFilename_extRess(cRess_t ress, const char* srcFileName,
     /* multiple-blocks file */
     {
         /* Write Archive Header */
-        size_t headerSize = LZ4F_compressBegin(ctx, dstBuffer, dstBufferSize, &prefs);
+        size_t headerSize = LZ4F_compressBegin_usingCDict(ctx, dstBuffer, dstBufferSize, ress.cdict, &prefs);
         if (LZ4F_isError(headerSize)) EXM_THROW(33, "File header generation failed : %s", LZ4F_getErrorName(headerSize));
         { size_t const sizeCheck = fwrite(dstBuffer, 1, headerSize, dstFile);
           if (sizeCheck!=headerSize) EXM_THROW(34, "Write error : cannot write header"); }
@@ -745,8 +805,21 @@ typedef struct {
     size_t dstBufferSize;
     FILE*  dstFile;
     LZ4F_decompressionContext_t dCtx;
+    void*  dictBuffer;
+    size_t dictBufferSize;
 } dRess_t;
 
+static void LZ4IO_loadDDict(dRess_t* ress) {
+    /* Populate the dictionary fields of ress, or clear them when no dictionary is in use. */
+    if (g_useDictionary) {
+        ress->dictBuffer = LZ4IO_createDict(g_dictionaryFilename, &ress->dictBufferSize);
+        if (!ress->dictBuffer) EXM_THROW(25, "Dictionary error : could not create dictionary");
+    } else {
+        ress->dictBuffer = NULL;
+        ress->dictBufferSize = 0;
+    }
+}
+
 static const size_t LZ4IO_dBufferSize = 64 KB;
 static dRess_t LZ4IO_createDResources(void)
 {
@@ -763,6 +836,8 @@ static dRess_t LZ4IO_createDResources(void)
     ress.dstBuffer = malloc(ress.dstBufferSize);
     if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(61, "Allocation error : not enough memory");
 
+    LZ4IO_loadDDict(&ress);
+
     ress.dstFile = NULL;
     return ress;
 }
@@ -773,6 +848,7 @@ static void LZ4IO_freeDResources(dRess_t ress)
     if (LZ4F_isError(errorCode)) EXM_THROW(69, "Error : can't free LZ4F context resource : %s", LZ4F_getErrorName(errorCode));
     free(ress.srcBuffer);
     free(ress.dstBuffer);
+    free(ress.dictBuffer);
 }
 
 
@@ -786,7 +862,7 @@ static unsigned long long LZ4IO_decompressLZ4F(dRess_t ress, FILE* srcFile, FILE
     {   size_t inSize = MAGICNUMBER_SIZE;
         size_t outSize= 0;
         LZ4IO_writeLE32(ress.srcBuffer, LZ4IO_MAGICNUMBER);
-        nextToLoad = LZ4F_decompress(ress.dCtx, ress.dstBuffer, &outSize, ress.srcBuffer, &inSize, NULL);
+        nextToLoad = LZ4F_decompress_usingDict(ress.dCtx, ress.dstBuffer, &outSize, ress.srcBuffer, &inSize, ress.dictBuffer, ress.dictBufferSize, NULL);
         if (LZ4F_isError(nextToLoad)) EXM_THROW(62, "Header error : %s", LZ4F_getErrorName(nextToLoad));
     }
 
@@ -805,7 +881,7 @@ static unsigned long long LZ4IO_decompressLZ4F(dRess_t ress, FILE* srcFile, FILE
             /* Decode Input (at least partially) */
             size_t remaining = readSize - pos;
             decodedBytes = ress.dstBufferSize;
-            nextToLoad = LZ4F_decompress(ress.dCtx, ress.dstBuffer, &decodedBytes, (char*)(ress.srcBuffer)+pos, &remaining, NULL);
+            nextToLoad = LZ4F_decompress_usingDict(ress.dCtx, ress.dstBuffer, &decodedBytes, (char*)(ress.srcBuffer)+pos, &remaining, ress.dictBuffer, ress.dictBufferSize, NULL);
             if (LZ4F_isError(nextToLoad)) EXM_THROW(66, "Decompression error : %s", LZ4F_getErrorName(nextToLoad));
             pos += remaining;
 
diff --git a/programs/lz4io.h b/programs/lz4io.h
index 6190f00..b21b8b6 100644
--- a/programs/lz4io.h
+++ b/programs/lz4io.h
@@ -64,6 +64,8 @@ int LZ4IO_decompressMultipleFilenames(const char** inFileNamesTable, int ifntSiz
 /* ****************** Parameters ******************** */
 /* ************************************************** */
 
+int LZ4IO_setDictionaryFilename(const char* dictionaryFilename);
+
 /* Default setting : overwrite = 1;
    return : overwrite mode (0/1) */
 int LZ4IO_setOverwrite(int yes);

From 93f8284c175a4047b0d9df3112927bbb3b832b2a Mon Sep 17 00:00:00 2001
From: "W. Felix Handte" <w@felixhandte.com>
Date: Fri, 22 Sep 2017 14:50:11 -0700
Subject: [PATCH 2/3] Add some tests verifying command line dictionary
 functionality

---
 tests/Makefile | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/tests/Makefile b/tests/Makefile
index e870fcf..1a907b7 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -129,6 +129,8 @@ ifneq (,$(filter $(shell uname),SunOS))
 DIFF:=gdiff
 endif
 
+DD:=dd
+
 
 test: test-lz4 test-lz4c test-frametest test-fullbench test-fuzzer
 
@@ -253,6 +255,31 @@ test-lz4-basic: lz4 datagen unlz4 lz4cat
 	$(LZ4) -BX tmp-tlb-hw -c -q | $(LZ4) -tv  # test block checksum
 	@$(RM) tmp-tlb*
 
+test-lz4-dict: lz4 datagen
+	@echo "\n ---- test lz4 compression/decompression with dictionary ----"
+	./datagen -g16KB > tmp-dict   # 16 KB dictionary used by the round-trip checks below
+	./datagen -g32KB > tmp-dict-sample-32k
+	< tmp-dict-sample-32k $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | $(DIFF) - tmp-dict-sample-32k
+	./datagen -g128MB > tmp-dict-sample-128m
+	< tmp-dict-sample-128m $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | $(DIFF) - tmp-dict-sample-128m
+	touch tmp-dict-sample-0   # empty file, used both as input and as dictionary
+	< tmp-dict-sample-0 $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | $(DIFF) - tmp-dict-sample-0
+
+	< tmp-dict-sample-32k $(LZ4) -D tmp-dict-sample-0 | $(LZ4) -dD tmp-dict-sample-0 | $(DIFF) - tmp-dict-sample-32k
+	< tmp-dict-sample-0 $(LZ4) -D tmp-dict-sample-0 | $(LZ4) -dD tmp-dict-sample-0 | $(DIFF) - tmp-dict-sample-0
+
+	@echo "\n ---- test lz4 dictionary loading ----"
+	./datagen -g128KB > tmp-dict-data-128KB
+	set -e; \
+	for l in 0 1 4 128 32767 32768 32769 65535 65536 65537 98303 98304 98305 131071 131072 131073; do \
+		./datagen -g$$l > tmp-dict-$$l; \
+		$(DD) if=tmp-dict-$$l of=tmp-dict-$$l-tail bs=1 count=65536 skip=$$((l > 65536 ? l - 65536 : 0)); \
+		< tmp-dict-$$l      $(LZ4) -D stdin tmp-dict-data-128KB | $(LZ4) -dD tmp-dict-$$l-tail | $(DIFF) - tmp-dict-data-128KB; \
+		< tmp-dict-$$l-tail $(LZ4) -D stdin tmp-dict-data-128KB | $(LZ4) -dD tmp-dict-$$l      | $(DIFF) - tmp-dict-data-128KB; \
+	done
+
+	@$(RM) tmp-dict*
+
 test-lz4-hugefile: lz4 datagen
 	@echo "\n ---- test huge files compression/decompression ----"
 	./datagen -g6GB   | $(LZ4) -vB5D  | $(LZ4) -qt
@@ -292,7 +319,7 @@ test-lz4-opt-parser: lz4 datagen
 
 test-lz4: lz4 datagen test-lz4-basic test-lz4-opt-parser test-lz4-multiple \
           test-lz4-sparse test-lz4-frame-concatenation test-lz4-testmode \
-          test-lz4-contentSize test-lz4-hugefile
+          test-lz4-contentSize test-lz4-hugefile test-lz4-dict
 	@$(RM) tmp*
 
 test-lz4c: lz4c datagen

From 9a16272261f571d54e4642c760d099adb6cc27b1 Mon Sep 17 00:00:00 2001
From: "W. Felix Handte" <w@felixhandte.com>
Date: Tue, 3 Oct 2017 12:50:28 -0400
Subject: [PATCH 3/3] Read the Dictionary into a Circular Buffer

---
 programs/lz4io.c | 67 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/programs/lz4io.c b/programs/lz4io.c
index 642e11c..57434f7 100644
--- a/programs/lz4io.c
+++ b/programs/lz4io.c
@@ -83,6 +83,7 @@
 #define LEGACY_BLOCKSIZE   (8 MB)
 #define MIN_STREAM_BUFSIZE (192 KB)
 #define LZ4IO_BLOCKSIZEID_DEFAULT 7
+#define LZ4_MAX_DICT_SIZE (64 KB)
 
 
 /**************************************
@@ -407,34 +408,60 @@ typedef struct {
     LZ4F_CDict* cdict;
 } cRess_t;
 
-static void* LZ4IO_createDict(const char* dictionaryFilename, size_t *dictionarySize) {
-    FILE* dictionaryFile;
-    size_t blockSize = 64 KB;
-    size_t dictionaryBufferSize = blockSize;
+static void* LZ4IO_createDict(const char* dictFilename, size_t *dictSize) {   /* loads at most the last LZ4_MAX_DICT_SIZE bytes of dictFilename; length goes to *dictSize */
     size_t readSize;
-    void* dictionaryBuffer;
-    *dictionarySize = 0;
-    dictionaryBuffer = malloc(dictionaryBufferSize);
+    size_t dictEnd = 0;     /* next write offset inside circularBuf */
+    size_t dictLen = 0;     /* bytes read so far; clamped to LZ4_MAX_DICT_SIZE after the loop */
+    size_t dictStart;       /* offset in circularBuf of the oldest byte that is kept */
+    size_t circularBufSize = LZ4_MAX_DICT_SIZE;
+    char* circularBuf;
+    char* dictBuf;          /* buffer returned to the caller, which releases it with free() */
+    FILE* dictFile;
 
-    if (!dictionaryBuffer) EXM_THROW(25, "Allocation error : not enough memory");
+    if (!dictFilename) EXM_THROW(25, "Dictionary error : no filename provided");
 
-    if (!dictionaryFilename) EXM_THROW(25, "Dictionary error : no filename provided");
+    circularBuf = (char *) malloc(circularBufSize);
+    if (!circularBuf) EXM_THROW(25, "Allocation error : not enough memory");
 
-    dictionaryFile = LZ4IO_openSrcFile(g_dictionaryFilename);
-    if (!dictionaryFile) EXM_THROW(25, "Dictionary error : could not open dictionary file");
+    dictFile = LZ4IO_openSrcFile(dictFilename);
+    if (!dictFile) EXM_THROW(25, "Dictionary error : could not open dictionary file");
+
+    /* opportunistically seek to the part of the file we care about. If this */
+    /* fails it's not a problem since we'll just read everything anyways.    */
+    if (strcmp(dictFilename, stdinmark)) {
+        (void)UTIL_fseek(dictFile, -LZ4_MAX_DICT_SIZE, SEEK_END);   /* result deliberately ignored, see above */
+    }
 
     do {
-        if (*dictionarySize + blockSize > dictionaryBufferSize) {
-            dictionaryBufferSize *= 2;
-            dictionaryBuffer = realloc(dictionaryBuffer, dictionaryBufferSize);
-            if (!dictionaryBuffer) EXM_THROW(26, "Allocation error : not enough memory");
-        }
-        /* Read next block */
-        readSize = fread((char*)dictionaryBuffer + *dictionarySize, (size_t)1, (size_t)blockSize, dictionaryFile);
-        *dictionarySize += readSize;
+        readSize = fread(circularBuf + dictEnd, 1, circularBufSize - dictEnd, dictFile);   /* fill up to the end of the buffer */
+        dictEnd = (dictEnd + readSize) % circularBufSize;   /* wrap around: newer data overwrites the oldest */
+        dictLen += readSize;
     } while (readSize>0);
 
-    return dictionaryBuffer;
+    if (dictLen > LZ4_MAX_DICT_SIZE) {
+        dictLen = LZ4_MAX_DICT_SIZE;
+    }
+
+    *dictSize = dictLen;
+
+    dictStart = (circularBufSize + dictEnd - dictLen) % circularBufSize;
+
+    if (dictStart == 0) {
+        /* We're in the simple case where the dict starts at the beginning of our circular buffer. */
+        dictBuf = circularBuf;
+        circularBuf = NULL;
+    } else {
+        /* Otherwise, we will alloc a new buffer and copy our dict into that. */
+        dictBuf = (char *) malloc(dictLen ? dictLen : 1);
+        if (!dictBuf) EXM_THROW(25, "Allocation error : not enough memory");
+
+        memcpy(dictBuf, circularBuf + dictStart, circularBufSize - dictStart);   /* older part: dictStart .. end of buffer */
+        memcpy(dictBuf + circularBufSize - dictStart, circularBuf, dictLen - (circularBufSize - dictStart));   /* newer part: wrapped to the front */
+    }
+
+    free(circularBuf);   /* no-op in the simple case, where circularBuf was reset to NULL */
+    if (dictFile != stdin) fclose(dictFile);   /* the stream used to be leaked; stdin itself is left open */
+    return dictBuf;
 }
 
 static LZ4F_CDict* LZ4IO_createCDict(void) {