From 2bd85f41994e9695911cfc4c86fbc04fdb35ee82 Mon Sep 17 00:00:00 2001 From: "W. Felix Handte" Date: Fri, 22 Sep 2017 11:55:42 -0700 Subject: [PATCH 1/3] Add Dictionary Support to the Command Line Tool --- programs/lz4cli.c | 27 +++++++++++++++ programs/lz4io.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--- programs/lz4io.h | 2 ++ 3 files changed, 109 insertions(+), 4 deletions(-) diff --git a/programs/lz4cli.c b/programs/lz4cli.c index ff489c6..857fa65 100644 --- a/programs/lz4cli.c +++ b/programs/lz4cli.c @@ -113,6 +113,7 @@ static int usage(const char* exeName) DISPLAY( " -9 : High compression \n"); DISPLAY( " -d : decompression (default for %s extension)\n", LZ4_EXTENSION); DISPLAY( " -z : force compression \n"); + DISPLAY( " -D FILE: use dictionary in FILE \n"); DISPLAY( " -f : overwrite output without prompting \n"); DISPLAY( " -k : preserve source files(s) (default) \n"); DISPLAY( "--rm : remove source file(s) after successful de/compression \n"); @@ -290,6 +291,7 @@ int main(int argc, const char** argv) operationMode_e mode = om_auto; const char* input_filename = NULL; const char* output_filename= NULL; + const char* dictionary_filename = NULL; char* dynNameSpace = NULL; const char** inFileNames = (const char**) calloc(argc, sizeof(char*)); unsigned ifnIdx=0; @@ -399,6 +401,22 @@ int main(int argc, const char** argv) /* Compression (default) */ case 'z': mode = om_compress; break; + case 'D': + if (argument[1] == '\0') { + /* path is next arg */ + if (i + 1 == argc) { + /* there is no next arg */ + badusage(exeName); + } + dictionary_filename = argv[++i]; + } else { + /* path follows immediately */ + dictionary_filename = argument + 1; + } + /* skip to end of argument so that we jump to parsing next argument */ + argument += strlen(argument) - 1; + break; + /* Use Legacy format (ex : Linux kernel compression) */ case 'l': legacy_format = 1; blockSize = 8 MB; break; @@ -560,6 +578,15 @@ int main(int argc, const char** argv) mode = om_decompress; /* defer to decompress */ } + if (dictionary_filename) { + if (!strcmp(dictionary_filename, stdinmark) && IS_CONSOLE(stdin)) { + DISPLAYLEVEL(1, "refusing to read from a console\n"); + exit(1); + } + + LZ4IO_setDictionaryFilename(dictionary_filename); + } + /* compress or decompress */ if (!input_filename) input_filename = stdinmark; /* Check if input is defined as console; trigger an error in this case */ diff --git a/programs/lz4io.c b/programs/lz4io.c index 06741b4..642e11c 100644 --- a/programs/lz4io.c +++ b/programs/lz4io.c @@ -57,6 +57,7 @@ #include "lz4.h" /* still required for legacy format */ #include "lz4hc.h" /* still required for legacy format */ #include "lz4frame.h" +#include "lz4frame_static.h" /***************************** @@ -110,6 +111,8 @@ static int g_streamChecksum = 1; static int g_blockIndependence = 1; static int g_sparseFileSupport = 1; static int g_contentSizeFlag = 0; +static int g_useDictionary = 0; +static const char* g_dictionaryFilename = NULL; /************************************** @@ -142,6 +145,12 @@ static int g_contentSizeFlag = 0; /* ****************** Parameters ******************** */ /* ************************************************** */ +int LZ4IO_setDictionaryFilename(const char* dictionaryFilename) { + g_dictionaryFilename = dictionaryFilename; + g_useDictionary = dictionaryFilename != NULL; + return g_useDictionary; +} + /* Default setting : overwrite = 1; return : overwrite mode (0/1) */ int LZ4IO_setOverwrite(int yes) { @@ -395,8 +404,53 @@ typedef struct { void* dstBuffer; size_t dstBufferSize; LZ4F_compressionContext_t ctx; + LZ4F_CDict* cdict; } cRess_t; +static void* LZ4IO_createDict(const char* dictionaryFilename, size_t *dictionarySize) { + FILE* dictionaryFile; + size_t blockSize = 64 KB; + size_t dictionaryBufferSize = blockSize; + size_t readSize; + void* dictionaryBuffer; + *dictionarySize = 0; + dictionaryBuffer = malloc(dictionaryBufferSize); + + if (!dictionaryBuffer) EXM_THROW(25, "Allocation error : not enough memory"); + + if (!dictionaryFilename) EXM_THROW(25, "Dictionary error : no filename provided"); + + dictionaryFile = LZ4IO_openSrcFile(g_dictionaryFilename); + if (!dictionaryFile) EXM_THROW(25, "Dictionary error : could not open dictionary file"); + + do { + if (*dictionarySize + blockSize > dictionaryBufferSize) { + dictionaryBufferSize *= 2; + dictionaryBuffer = realloc(dictionaryBuffer, dictionaryBufferSize); + if (!dictionaryBuffer) EXM_THROW(26, "Allocation error : not enough memory"); + } + /* Read next block */ + readSize = fread((char*)dictionaryBuffer + *dictionarySize, (size_t)1, (size_t)blockSize, dictionaryFile); + *dictionarySize += readSize; + } while (readSize>0); + + return dictionaryBuffer; +} + +static LZ4F_CDict* LZ4IO_createCDict(void) { + size_t dictionarySize; + void* dictionaryBuffer; + LZ4F_CDict* cdict; + if (!g_useDictionary) { + return NULL; + } + dictionaryBuffer = LZ4IO_createDict(g_dictionaryFilename, &dictionarySize); + if (!dictionaryBuffer) EXM_THROW(25, "Dictionary error : could not create dictionary"); + cdict = LZ4F_createCDict(dictionaryBuffer, dictionarySize); + free(dictionaryBuffer); + return cdict; +} + static cRess_t LZ4IO_createCResources(void) { const size_t blockSize = (size_t)LZ4IO_GetBlockSize_FromBlockId (g_blockSizeId); @@ -412,6 +466,8 @@ static cRess_t LZ4IO_createCResources(void) ress.dstBuffer = malloc(ress.dstBufferSize); if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(31, "Allocation error : not enough memory"); + ress.cdict = LZ4IO_createCDict(); + return ress; } @@ -419,6 +475,10 @@ static void LZ4IO_freeCResources(cRess_t ress) { free(ress.srcBuffer); free(ress.dstBuffer); + + LZ4F_freeCDict(ress.cdict); + ress.cdict = NULL; + { LZ4F_errorCode_t const errorCode = LZ4F_freeCompressionContext(ress.ctx); if (LZ4F_isError(errorCode)) EXM_THROW(38, "Error : can't free LZ4F context resource : %s", LZ4F_getErrorName(errorCode)); } } @@ -472,7 +532,7 @@ static int LZ4IO_compressFilename_extRess(cRess_t ress, const char* srcFileName, /* single-block file */ if (readSize < blockSize) { /* Compress in single pass */ - size_t const cSize = LZ4F_compressFrame(dstBuffer, dstBufferSize, srcBuffer, readSize, &prefs); + size_t cSize = LZ4F_compressFrame_usingCDict(dstBuffer, dstBufferSize, srcBuffer, readSize, ress.cdict, &prefs); if (LZ4F_isError(cSize)) EXM_THROW(31, "Compression failed : %s", LZ4F_getErrorName(cSize)); compressedfilesize = cSize; DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%% ", @@ -488,7 +548,7 @@ static int LZ4IO_compressFilename_extRess(cRess_t ress, const char* srcFileName, /* multiple-blocks file */ { /* Write Archive Header */ - size_t headerSize = LZ4F_compressBegin(ctx, dstBuffer, dstBufferSize, &prefs); + size_t headerSize = LZ4F_compressBegin_usingCDict(ctx, dstBuffer, dstBufferSize, ress.cdict, &prefs); if (LZ4F_isError(headerSize)) EXM_THROW(33, "File header generation failed : %s", LZ4F_getErrorName(headerSize)); { size_t const sizeCheck = fwrite(dstBuffer, 1, headerSize, dstFile); if (sizeCheck!=headerSize) EXM_THROW(34, "Write error : cannot write header"); } @@ -745,8 +805,21 @@ typedef struct { size_t dstBufferSize; FILE* dstFile; LZ4F_decompressionContext_t dCtx; + void* dictBuffer; + size_t dictBufferSize; } dRess_t; +static void LZ4IO_loadDDict(dRess_t* ress) { + if (!g_useDictionary) { + ress->dictBuffer = NULL; + ress->dictBufferSize = 0; + return; + } + + ress->dictBuffer = LZ4IO_createDict(g_dictionaryFilename, &ress->dictBufferSize); + if (!ress->dictBuffer) EXM_THROW(25, "Dictionary error : could not create dictionary"); +} + static const size_t LZ4IO_dBufferSize = 64 KB; static dRess_t LZ4IO_createDResources(void) { @@ -763,6 +836,8 @@ static dRess_t LZ4IO_createDResources(void) ress.dstBuffer = malloc(ress.dstBufferSize); if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(61, "Allocation error : not enough memory"); + LZ4IO_loadDDict(&ress); + ress.dstFile = NULL; return ress; } @@ -773,6 +848,7 @@ static void LZ4IO_freeDResources(dRess_t ress) if (LZ4F_isError(errorCode)) EXM_THROW(69, "Error : can't free LZ4F context resource : %s", LZ4F_getErrorName(errorCode)); free(ress.srcBuffer); free(ress.dstBuffer); + free(ress.dictBuffer); } @@ -786,7 +862,7 @@ static unsigned long long LZ4IO_decompressLZ4F(dRess_t ress, FILE* srcFile, FILE { size_t inSize = MAGICNUMBER_SIZE; size_t outSize= 0; LZ4IO_writeLE32(ress.srcBuffer, LZ4IO_MAGICNUMBER); - nextToLoad = LZ4F_decompress(ress.dCtx, ress.dstBuffer, &outSize, ress.srcBuffer, &inSize, NULL); + nextToLoad = LZ4F_decompress_usingDict(ress.dCtx, ress.dstBuffer, &outSize, ress.srcBuffer, &inSize, ress.dictBuffer, ress.dictBufferSize, NULL); if (LZ4F_isError(nextToLoad)) EXM_THROW(62, "Header error : %s", LZ4F_getErrorName(nextToLoad)); } @@ -805,7 +881,7 @@ static unsigned long long LZ4IO_decompressLZ4F(dRess_t ress, FILE* srcFile, FILE /* Decode Input (at least partially) */ size_t remaining = readSize - pos; decodedBytes = ress.dstBufferSize; - nextToLoad = LZ4F_decompress(ress.dCtx, ress.dstBuffer, &decodedBytes, (char*)(ress.srcBuffer)+pos, &remaining, NULL); + nextToLoad = LZ4F_decompress_usingDict(ress.dCtx, ress.dstBuffer, &decodedBytes, (char*)(ress.srcBuffer)+pos, &remaining, ress.dictBuffer, ress.dictBufferSize, NULL); if (LZ4F_isError(nextToLoad)) EXM_THROW(66, "Decompression error : %s", LZ4F_getErrorName(nextToLoad)); pos += remaining; diff --git a/programs/lz4io.h b/programs/lz4io.h index 6190f00..b21b8b6 100644 --- a/programs/lz4io.h +++ b/programs/lz4io.h @@ -64,6 +64,8 @@ int LZ4IO_decompressMultipleFilenames(const char** inFileNamesTable, int ifntSiz /* ****************** Parameters ******************** */ /* ************************************************** */ +int LZ4IO_setDictionaryFilename(const char* dictionaryFilename); + /* Default setting : overwrite = 1; return : overwrite mode (0/1) */ int LZ4IO_setOverwrite(int yes); From 93f8284c175a4047b0d9df3112927bbb3b832b2a Mon Sep 17 00:00:00 2001 From: "W. Felix Handte" Date: Fri, 22 Sep 2017 14:50:11 -0700 Subject: [PATCH 2/3] Add some tests verifying command line dictionary functionality --- tests/Makefile | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/Makefile b/tests/Makefile index e870fcf..1a907b7 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -129,6 +129,8 @@ ifneq (,$(filter $(shell uname),SunOS)) DIFF:=gdiff endif +DD:=dd + test: test-lz4 test-lz4c test-frametest test-fullbench test-fuzzer @@ -253,6 +255,31 @@ test-lz4-basic: lz4 datagen unlz4 lz4cat $(LZ4) -BX tmp-tlb-hw -c -q | $(LZ4) -tv # test block checksum @$(RM) tmp-tlb* +test-lz4-dict: lz4 datagen + @echo "\n ---- test lz4 compression/decompression with dictionary ----" + ./datagen -g16KB > tmp-dict + ./datagen -g32KB > tmp-dict-sample-32k + < tmp-dict-sample-32k $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | diff - tmp-dict-sample-32k + ./datagen -g128MB > tmp-dict-sample-128m + < tmp-dict-sample-128m $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | diff - tmp-dict-sample-128m + touch tmp-dict-sample-0 + < tmp-dict-sample-0 $(LZ4) -D tmp-dict | $(LZ4) -dD tmp-dict | diff - tmp-dict-sample-0 + + < tmp-dict-sample-32k $(LZ4) -D tmp-dict-sample-0 | $(LZ4) -dD tmp-dict-sample-0 | diff - tmp-dict-sample-32k + < tmp-dict-sample-0 $(LZ4) -D tmp-dict-sample-0 | $(LZ4) -dD tmp-dict-sample-0 | diff - tmp-dict-sample-0 + + @echo "\n ---- test lz4 dictionary loading ----" + ./datagen -g128KB > tmp-dict-data-128KB + set -e; \ + for l in 0 1 4 128 32767 32768 32769 65535 65536 65537 98303 98304 98305 131071 131072 131073; do \ + ./datagen -g$$l > tmp-dict-$$l; \ + $(DD) if=tmp-dict-$$l of=tmp-dict-$$l-tail bs=1 count=65536 skip=$$((l > 65536 ? l - 65536 : 0)); \ + < tmp-dict-$$l $(LZ4) -D stdin tmp-dict-data-128KB | $(LZ4) -dD tmp-dict-$$l-tail | $(DIFF) - tmp-dict-data-128KB; \ + < tmp-dict-$$l-tail $(LZ4) -D stdin tmp-dict-data-128KB | $(LZ4) -dD tmp-dict-$$l | $(DIFF) - tmp-dict-data-128KB; \ + done + + @$(RM) tmp-dict* + test-lz4-hugefile: lz4 datagen @echo "\n ---- test huge files compression/decompression ----" ./datagen -g6GB | $(LZ4) -vB5D | $(LZ4) -qt @@ -292,7 +319,7 @@ test-lz4-opt-parser: lz4 datagen test-lz4: lz4 datagen test-lz4-basic test-lz4-opt-parser test-lz4-multiple \ test-lz4-sparse test-lz4-frame-concatenation test-lz4-testmode \ - test-lz4-contentSize test-lz4-hugefile + test-lz4-contentSize test-lz4-hugefile test-lz4-dict @$(RM) tmp* test-lz4c: lz4c datagen From 9a16272261f571d54e4642c760d099adb6cc27b1 Mon Sep 17 00:00:00 2001 From: "W. Felix Handte" Date: Tue, 3 Oct 2017 12:50:28 -0400 Subject: [PATCH 3/3] Read the Dictionary into a Circular Buffer --- programs/lz4io.c | 67 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/programs/lz4io.c b/programs/lz4io.c index 642e11c..57434f7 100644 --- a/programs/lz4io.c +++ b/programs/lz4io.c @@ -83,6 +83,7 @@ #define LEGACY_BLOCKSIZE (8 MB) #define MIN_STREAM_BUFSIZE (192 KB) #define LZ4IO_BLOCKSIZEID_DEFAULT 7 +#define LZ4_MAX_DICT_SIZE (64 KB) /************************************** @@ -407,34 +408,60 @@ typedef struct { LZ4F_CDict* cdict; } cRess_t; -static void* LZ4IO_createDict(const char* dictionaryFilename, size_t *dictionarySize) { - FILE* dictionaryFile; - size_t blockSize = 64 KB; - size_t dictionaryBufferSize = blockSize; +static void* LZ4IO_createDict(const char* dictFilename, size_t *dictSize) { size_t readSize; - void* dictionaryBuffer; - *dictionarySize = 0; - dictionaryBuffer = malloc(dictionaryBufferSize); + size_t dictEnd = 0; + size_t dictLen = 0; + size_t dictStart; + size_t circularBufSize = LZ4_MAX_DICT_SIZE; + char* circularBuf; + char* dictBuf; + FILE* dictFile; - if (!dictionaryBuffer) EXM_THROW(25, "Allocation error : not enough memory"); + if (!dictFilename) EXM_THROW(25, "Dictionary error : no filename provided"); - if (!dictionaryFilename) EXM_THROW(25, "Dictionary error : no filename provided"); + circularBuf = (char *) malloc(circularBufSize); + if (!circularBuf) EXM_THROW(25, "Allocation error : not enough memory"); - dictionaryFile = LZ4IO_openSrcFile(g_dictionaryFilename); - if (!dictionaryFile) EXM_THROW(25, "Dictionary error : could not open dictionary file"); + dictFile = LZ4IO_openSrcFile(dictFilename); + if (!dictFile) EXM_THROW(25, "Dictionary error : could not open dictionary file"); + + /* opportunistically seek to the part of the file we care about. If this */ + /* fails it's not a problem since we'll just read everything anyways. */ + if (strcmp(dictFilename, stdinmark)) { + UTIL_fseek(dictFile, -LZ4_MAX_DICT_SIZE, SEEK_END); + } do { - if (*dictionarySize + blockSize > dictionaryBufferSize) { - dictionaryBufferSize *= 2; - dictionaryBuffer = realloc(dictionaryBuffer, dictionaryBufferSize); - if (!dictionaryBuffer) EXM_THROW(26, "Allocation error : not enough memory"); - } - /* Read next block */ - readSize = fread((char*)dictionaryBuffer + *dictionarySize, (size_t)1, (size_t)blockSize, dictionaryFile); - *dictionarySize += readSize; + readSize = fread(circularBuf + dictEnd, 1, circularBufSize - dictEnd, dictFile); + dictEnd = (dictEnd + readSize) % circularBufSize; + dictLen += readSize; } while (readSize>0); - return dictionaryBuffer; + if (dictLen > LZ4_MAX_DICT_SIZE) { + dictLen = LZ4_MAX_DICT_SIZE; + } + + *dictSize = dictLen; + + dictStart = (circularBufSize + dictEnd - dictLen) % circularBufSize; + + if (dictStart == 0) { + /* We're in the simple case where the dict starts at the beginning of our circular buffer. */ + dictBuf = circularBuf; + circularBuf = NULL; + } else { + /* Otherwise, we will alloc a new buffer and copy our dict into that. */ + dictBuf = (char *) malloc(dictLen ? dictLen : 1); + if (!dictBuf) EXM_THROW(25, "Allocation error : not enough memory"); + + memcpy(dictBuf, circularBuf + dictStart, circularBufSize - dictStart); + memcpy(dictBuf + circularBufSize - dictStart, circularBuf, dictLen - (circularBufSize - dictStart)); + } + + free(circularBuf); + + return dictBuf; } static LZ4F_CDict* LZ4IO_createCDict(void) {