From 6d222c437ca1f3b7420f414354643d3f11ba075a Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 12 Jul 2018 17:56:58 -0700 Subject: [PATCH 01/35] Set requestedParams in ZSTD_initCStream*() The correct parameters are used once, but once `ZSTD_resetCStream()` is called the default parameters (level 3) are used. Fix this by setting `requestedParams` in the `ZSTD_initCStream*()` functions. The added tests both fail before this patch and pass after. --- lib/compress/zstd_compress.c | 22 ++++++++++------------ tests/zstreamtest.c | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index c6686252..d659baf1 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -3332,9 +3332,11 @@ size_t ZSTD_CStreamOutSize(void) static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, const ZSTD_CDict* const cdict, - ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize) + ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) { DEBUGLOG(4, "ZSTD_resetCStream_internal"); + /* Finalize the compression parameters */ + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ @@ -3363,7 +3365,6 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize); if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; params.fParams.contentSizeFlag = 1; - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, 0); return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize); } @@ -3376,6 
+3377,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_internal"); + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ @@ -3442,25 +3444,21 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, (U32)pledgedSrcSize, params.fParams.contentSizeFlag); CHECK_F( ZSTD_checkCParams(params.cParams) ); if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */ - { ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize); - } + zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) { U64 const pledgedSrcSize = (pss==0) ? 
ZSTD_CONTENTSIZE_UNKNOWN : pss; /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */ - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); - ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c index 22c49cb3..3d61394a 100644 --- a/tests/zstreamtest.c +++ b/tests/zstreamtest.c @@ -969,6 +969,26 @@ static int basicUnitTests(U32 seed, double compressibility) } DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_srcSize sets requestedParams : ", testNb++); + { unsigned level; + CHECK_Z(ZSTD_initCStream_srcSize(zc, 11, ZSTD_CONTENTSIZE_UNKNOWN)); + CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level)); + CHECK(level != 11, "Compression level does not match"); + ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN); + CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level)); + CHECK(level != 11, "Compression level does not match"); + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced sets requestedParams : ", testNb++); + { ZSTD_parameters const params = ZSTD_getParams(9, 0, 0); + CHECK_Z(ZSTD_initCStream_advanced(zc, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN)); + CHECK(badParameters(zc, params), "Compression parameters do not match"); + ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN); + CHECK(badParameters(zc, params), "Compression parameters do not match"); + } + DISPLAYLEVEL(3, "OK \n"); + /* Overlen overwriting window 
data bug */ DISPLAYLEVEL(3, "test%3i : wildcopy doesn't overwrite potential match data : ", testNb++); { /* This test has a window size of 1024 bytes and consists of 3 blocks: From a23a3b95f9c00ecf52216bd7fe768e41eac4e269 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 16:05:14 -0700 Subject: [PATCH 02/35] Add random dictionary builder --- contrib/randomDictBuilder/Makefile | 48 +++ contrib/randomDictBuilder/README.md | 13 + contrib/randomDictBuilder/main.c | 125 ++++++++ contrib/randomDictBuilder/random.c | 455 ++++++++++++++++++++++++++++ contrib/randomDictBuilder/random.h | 53 ++++ contrib/randomDictBuilder/test.sh | 14 + 6 files changed, 708 insertions(+) create mode 100644 contrib/randomDictBuilder/Makefile create mode 100644 contrib/randomDictBuilder/README.md create mode 100644 contrib/randomDictBuilder/main.c create mode 100644 contrib/randomDictBuilder/random.c create mode 100644 contrib/randomDictBuilder/random.h create mode 100644 contrib/randomDictBuilder/test.sh diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile new file mode 100644 index 00000000..a2aade23 --- /dev/null +++ b/contrib/randomDictBuilder/Makefile @@ -0,0 +1,48 @@ +PROGRAM_FILES := ../../programs/fileio.c + +TEST_INPUT := ../../lib +TEST_OUTPUT := randomDict +ARG := + +all: main testrun test clean + +run: main rand clean + +.PHONY: rand +rand: + echo "Building a random dictionary with given arguments" + ./main $(ARG) + + +main: random.o main.o libzstd.a + gcc random.o main.o libzstd.a -o main + +main.o: main.c + gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h + +random.o: $(PROGRAM_FILES) random.c + gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c + +libzstd.a: + $(MAKE) -C ../../lib libzstd.a + mv ../../lib/libzstd.a . 
+ +.PHONY: testrun +testrun: main + echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) " + ./main in=$(TEST_INPUT) out=$(TEST_OUTPUT) + zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q + rm -f $(TEST_OUTPUT) + +.PHONY: test +test: test.sh + sh test.sh + echo "Finish running test.sh" + +.PHONY: clean +clean: + rm -f libzstd.a main + rm -f ../../lib/*/*.o + rm -f ../../programs/*.o + rm -f *.o + echo "Cleaning is completed" diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md new file mode 100644 index 00000000..cadffdf2 --- /dev/null +++ b/contrib/randomDictBuilder/README.md @@ -0,0 +1,13 @@ +Random Dictionary Builder + +### Permitted Arguments: +Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required +Output Dictionary (out=dictName): if not provided, default to defaultDict +Dictionary ID (dictID=#): positive number, if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB +Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200 +Compression Level (c=#): positive number, if not provided, default to 3 + +### Examples: +make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" +make run ARG="in=../../lib/dictBuilder in=../../lib/compress" diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c new file mode 100644 index 00000000..15eb5c44 --- /dev/null +++ b/contrib/randomDictBuilder/main.c @@ -0,0 +1,125 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "random.h" +#include "util.h" + +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) 
if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_INPUTFILE "" +#define DEFAULT_k 200 +#define DEFAULT_OUTPUTFILE "defaultDict" +#define DEFAULT_DICTID 0 + + +static unsigned readU32FromChar(const char** stringPtr) +{ + const char errorMsg[] = "error: numeric value too large"; + unsigned result = 0; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. 
+ */ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) +{ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + unsigned cLevel = DEFAULT_CLEVEL; + char* inputFile = DEFAULT_INPUTFILE; + unsigned k = DEFAULT_k; + char* outputFile = DEFAULT_OUTPUTFILE; + unsigned dictID = DEFAULT_DICTID; + unsigned maxDictSize = g_defaultMaxDictSize; + + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + inputFile = malloc(strlen(argument) + 1); + strcpy(inputFile, argument); + filenameTable[filenameIdx] = inputFile; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = malloc(strlen(argument) + 1); + strcpy(outputFile, argument); + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return operationResult; + } + + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; + const char** extendedFileList = NULL; + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include 
/* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "random.h" +#include "platform.h" /* Large Files support */ +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ + +/*-************************************* +* Constants +***************************************/ +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 +#define DEFAULT_K 200 + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. 
+ * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + + + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. 
*/ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + + +/* ******************************************************** +* Random Dictionary Builder +**********************************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/** + * Selects a random segment from totalSamplesSize - k + 1 possible segments + */ +static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, + ZDICT_random_params_t parameters) { + const U32 k = parameters.k; + RANDOM_segment_t segment; + unsigned index; + + /* Seed random number generator */ + srand((unsigned)time(NULL)); + /* Randomly generate a number from 0 to sampleSizes - k */ + index = rand()%(ctx->totalSamplesSize - k + 1); + + /* inclusive */ + segment.begin = index; + segment.end = index + k - 1; + + return segment; +} + + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) { + /* k is a required parameter */ + if (parameters.k == 0) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `RANDOM_ctx_init()`. + */ +static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + + +/** + * Prepare a context for dictionary building. 
+ * Returns 1 on success or zero on error. + * The context must be destroyed with `RANDOM_ctx_destroy()`. + */ +static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const int displayLevel = 2; + /* Checks */ + if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, + (U32)totalSamplesSize); + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + ctx->totalSamplesSize = (U32)totalSamplesSize; + if (!ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); + RANDOM_ctx_destroy(ctx); + return 0; + } + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + return 1; +} + + +/** + * Given the prepared context build the dictionary. 
+ */ +static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, + size_t dictBufferCapacity, + ZDICT_random_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + const int displayLevel = parameters.zParams.notificationLevel; + while (tail > 0) { + + /* Select a segment */ + RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + + size_t segmentSize; + segmentSize = MIN(segment.end - segment.begin + 1, tail); + + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + + return tail; +} + +/*! ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples using the RANDOM algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). 
+ */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters) { + const int displayLevel = parameters.zParams.notificationLevel; + BYTE* const dict = (BYTE*)dictBuffer; + RANDOM_ctx_t ctx; + /* Checks */ + if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "k is incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Random must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + + if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { + return ERROR(GENERIC); + } + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t dictSize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbSamples, parameters.zParams); + if (!ZSTD_isError(dictSize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictSize); + } + RANDOM_ctx_destroy(&ctx); + return dictSize; + } +} + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned 
long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + int result = 0; + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) + EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, + sampleSizes, fs.nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _cleanup; + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_cleanup: + free(srcBuffer); + free(sampleSizes); + free(dictBuffer); + return result; +} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h new file mode 100644 index 00000000..05879641 --- /dev/null +++ b/contrib/randomDictBuilder/random.h @@ -0,0 +1,53 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + +/************************************** +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + U32 totalSamplesSize; +} RANDOM_ctx_t; + +/** + * A segment is an inclusive range in the source. 
+ */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */ + ZDICT_params_t zParams; +} ZDICT_random_params_t; + + +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + + +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters); + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params); diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh new file mode 100644 index 00000000..552650ee --- /dev/null +++ b/contrib/randomDictBuilder/test.sh @@ -0,0 +1,14 @@ +echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1" +./main c=5 in=../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../lib/common -q +echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../lib/common -q +echo "Building random dictionary with 2 sample sources" +./main in=../../lib/common in=../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../lib/common -q +echo "Removing dict1 dict2 dict3" +rm -f dict1 dict2 dict3 + +echo "Testing with invalid parameters, should fail" +! 
./main r=10 From 31731df4dab0df7b465de2de5641b2e3416c9086 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 17:38:53 -0700 Subject: [PATCH 03/35] Remove clevel and update documentation --- contrib/randomDictBuilder/README.md | 15 ++++++++++----- contrib/randomDictBuilder/main.c | 11 ++++++++--- contrib/randomDictBuilder/test.sh | 8 ++++---- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index cadffdf2..de2c7ff6 100644 --- a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -1,12 +1,17 @@ Random Dictionary Builder ### Permitted Arguments: -Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" Output Dictionary (out=dictName): if not provided, default to defaultDict -Dictionary ID (dictID=#): positive number, if not provided, default to 0 -Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB -Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200 -Compression Level (c=#): positive number, if not provided, default to 3 +Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB +Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 +Compression Level (c=#): positive number; if not provided, default to 3 + + +###Usage: +To build a random dictionary with the provided arguments: make run ARG= followed by arguments + ### Examples: make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" diff --git 
a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 15eb5c44..cf0b9476 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -63,7 +63,7 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - unsigned cLevel = DEFAULT_CLEVEL; + /* Initialize parameters with default value */ char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -76,10 +76,10 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } - if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { + /* Allow multiple input files */ inputFile = malloc(strlen(argument) + 1); strcpy(inputFile, argument); filenameTable[filenameIdx] = inputFile; @@ -96,6 +96,11 @@ int main(int argCount, const char* argv[]) return operationResult; } + if (maxDictSize == 0) { + DISPLAYLEVEL(1, "maxDictSize should not be 0.\n"); + operationResult = 1; + return operationResult; + } char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; @@ -114,7 +119,7 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = cLevel; + zParams.compressionLevel = DEFAULT_CLEVEL; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh index 552650ee..497820f8 100644 --- a/contrib/randomDictBuilder/test.sh +++ b/contrib/randomDictBuilder/test.sh @@ -1,8 +1,8 @@ -echo "Building random dictionary with 
c=5 in=../../lib/common k=200 out=dict1" -./main c=5 in=../../lib/common k=200 out=dict1 +echo "Building random dictionary with in=../../lib/common k=200 out=dict1" +./main in=../../lib/common k=200 out=dict1 zstd -be3 -D dict1 -r ../../lib/common -q -echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" -./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 zstd -be3 -D dict2 -r ../../lib/common -q echo "Building random dictionary with 2 sample sources" ./main in=../../lib/common in=../../lib/compress out=dict3 From 0e5fbc10facdce2def08e4f4ecb67d255694df3a Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 17:41:09 -0700 Subject: [PATCH 04/35] Update README --- contrib/randomDictBuilder/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index de2c7ff6..09f1e808 100644 --- a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -6,7 +6,6 @@ Output Dictionary (out=dictName): if not provided, default to defaultDict Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 -Compression Level (c=#): positive number; if not provided, default to 3 ###Usage: From 58b82194755b52ad80b6e7da5aeae8e383f8bb90 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 9 Jul 2018 18:24:07 -0700 Subject: [PATCH 05/35] zstdcli: Allow -o before --train Only set the default value if `outFileName` is unset. Fixes #1227. 
--- programs/zstdcli.c | 14 ++++++++------ tests/playTests.sh | 24 +++++++++++++++++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/programs/zstdcli.c b/programs/zstdcli.c index e0d7807f..36ba2115 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -502,7 +502,7 @@ int main(int argCount, const char* argv[]) if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; } if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; } if (!strcmp(argument, "--test")) { operation=zom_test; continue; } - if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; } + if (!strcmp(argument, "--train")) { operation=zom_train; if (outFileName==NULL) outFileName=g_defaultDictName; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; } /* kept available for compatibility with old syntax ; will be removed one day */ if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; } /* kept available for compatibility with old syntax ; will be removed one day */ if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; } @@ -526,7 +526,8 @@ int main(int argCount, const char* argv[]) #ifndef ZSTD_NODICT if (longCommandWArg(&argument, "--train-cover")) { operation = zom_train; - outFileName = g_defaultDictName; + if (outFileName == NULL) + outFileName = g_defaultDictName; cover = 1; /* Allow optional arguments following an = */ if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); } @@ -536,7 +537,8 @@ int main(int argCount, const char* argv[]) } if (longCommandWArg(&argument, "--train-legacy")) { operation = zom_train; - outFileName = g_defaultDictName; + if (outFileName == NULL) + outFileName = g_defaultDictName; cover = 0; /* Allow optional arguments following an = */ if (*argument == 0) { continue; } @@ -718,7 +720,7 @@ int main(int argCount, const char* argv[]) break; /* Select compressibility 
of synthetic sample */ - case 'P': + case 'P': { argument++; compressibility = (double)readU32FromChar(&argument) / 100; } @@ -841,7 +843,7 @@ int main(int argCount, const char* argv[]) if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel(); if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel(); if (cLevelLast < cLevel) cLevelLast = cLevel; - if (cLevelLast > cLevel) + if (cLevelLast > cLevel) DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast); if(filenameIdx) { if(separateFiles) { @@ -856,7 +858,7 @@ int main(int argCount, const char* argv[]) } else { for(; cLevel <= cLevelLast; cLevel++) { BMK_benchFilesAdvanced(filenameTable, filenameIdx, dictFileName, cLevel, &compressionParams, g_displayLevel, &adv); - } + } } } else { for(; cLevel <= cLevelLast; cLevel++) { diff --git a/tests/playTests.sh b/tests/playTests.sh index fb8b1d24..0a1f96c0 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -404,7 +404,13 @@ $ECHO "Hello World" > tmp $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source" ./datagen -P0 -g10M > tmp $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise" -rm tmp* +$ECHO "- Test -o before --train" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train *.c ../programs/*.c +test -f tmpDict +$ZSTD --train *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> cover dictionary builder : advanced options " @@ -425,12 +431,18 @@ $ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" 
$ECHO "- Create dictionary with size limit" $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K -rm tmp* $ECHO "- Compare size of dictionary from 90% training samples with 80% training samples" $ZSTD --train-cover=split=90 -r *.c ../programs/*.c $ZSTD --train-cover=split=80 -r *.c ../programs/*.c $ECHO "- Create dictionary using all samples for both training and testing" $ZSTD --train-cover=split=100 -r *.c ../programs/*.c +$ECHO "- Test -o before --train-cover" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train-cover *.c ../programs/*.c +test -f tmpDict +$ZSTD --train-cover *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> legacy dictionary builder " @@ -450,7 +462,13 @@ $ZSTD --train-legacy -s5 *.c ../programs/*.c --dictID=1 -o tmpDict1 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" $ECHO "- Create dictionary with size limit" $ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K -rm tmp* +$ECHO "- Test -o before --train-legacy" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train-legacy *.c ../programs/*.c +test -f tmpDict +$ZSTD --train-legacy *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> integrity tests " From b5806d33db813dfb2bac7cd3b97b5bcf09ee57b7 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 16 Jul 2018 16:03:04 -0700 Subject: [PATCH 06/35] Refactor RANDOM --- contrib/randomDictBuilder/Makefile | 12 +- contrib/randomDictBuilder/main.c | 297 ++++++++++++++++++++++++- contrib/randomDictBuilder/random.c | 343 ++--------------------------- contrib/randomDictBuilder/random.h | 23 -- 4 files changed, 314 insertions(+), 361 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index a2aade23..443f6f04 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -14,14 +14,14 @@ rand: ./main $(ARG) -main: random.o main.o libzstd.a - gcc random.o main.o libzstd.a -o main 
+main: main.o random.o libzstd.a + gcc main.o random.o libzstd.a -o main -main.o: main.c - gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h +main.o: main.c $(PROGRAM_FILES) + gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -random.o: $(PROGRAM_FILES) random.c - gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c +random.o: random.c + gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder libzstd.a: $(MAKE) -C ../../lib libzstd.a diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index cf0b9476..d9295aa9 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -3,13 +3,45 @@ #include /* strcmp, strlen */ #include /* errno */ #include -#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ #include "random.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ #include "util.h" +#include "zdict.h" +/*-************************************* +* Console display +***************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) 
\ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + +/*-************************************* +* Constants +***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_CLEVEL 3 #define DEFAULT_INPUTFILE "" @@ -17,7 +49,33 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); +#define NOISELENGTH 32 + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + +/*-************************************* +* Commandline related functions +***************************************/ static unsigned readU32FromChar(const char** stringPtr) { const char errorMsg[] = "error: numeric value too large"; @@ -42,7 +100,6 @@ static unsigned readU32FromChar(const char** stringPtr) return result; } - /** longCommandWArg() : * check if *stringPtr is the same as longCommand. * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. @@ -56,6 +113,225 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) return result; } +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. 
+ * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. 
*/ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + free(dictBuffer); + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ + free(dictBuffer); + return result; +} + +sampleInfo* getSampleInfo(const char** fileNamesTable, + unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){ + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory 
for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} + + int main(int argCount, const char* argv[]) { @@ -63,7 +339,7 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - /* Initialize parameters with default value */ + unsigned cLevel = DEFAULT_CLEVEL; char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -76,10 +352,10 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) 
{ const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { - /* Allow multiple input files */ inputFile = malloc(strlen(argument) + 1); strcpy(inputFile, argument); filenameTable[filenameIdx] = inputFile; @@ -96,12 +372,6 @@ int main(int argCount, const char* argv[]) return operationResult; } - if (maxDictSize == 0) { - DISPLAYLEVEL(1, "maxDictSize should not be 0.\n"); - operationResult = 1; - return operationResult; - } - char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; int followLinks = 0; @@ -119,12 +389,15 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = DEFAULT_CLEVEL; + zParams.compressionLevel = cLevel; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; params.k = k; - operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, ¶ms); + sampleInfo* info= getSampleInfo(filenameTable, + filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); + operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, ¶ms); + return operationResult; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index a59427ba..96c02389 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -5,24 +5,12 @@ #include /* malloc, free, qsort */ #include /* memset */ #include /* clock */ -#include "zstd_internal.h" /* includes zstd.h */ +#include "random.h" +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ #ifndef 
ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif -#include "random.h" -#include "platform.h" /* Large Files support */ -#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ - -/*-************************************* -* Constants -***************************************/ -#define SAMPLESIZE_MAX (128 KB) -#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) -#define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); - -#define NOISELENGTH 32 -#define DEFAULT_K 200 +#include "zdict.h" /*-************************************* * Console display @@ -30,179 +18,16 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((siz #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } -static const U64 g_refreshRate = SEC_TO_MICRO / 6; -static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; - -#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ - if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ - { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ - if (displayLevel>=4) fflush(stderr); } } } - - -/*-************************************* -* Exceptions -***************************************/ -#ifndef DEBUG -# define DEBUG 0 -#endif -#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); -#define EXM_THROW(error, ...) \ -{ \ - DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ - DISPLAY("Error %i : ", error); \ - DISPLAY(__VA_ARGS__); \ - DISPLAY("\n"); \ - exit(error); \ -} - - -/* ******************************************************** -* File related operations -**********************************************************/ -/** loadFiles() : - * load samples from files listed in fileNamesTable into buffer. - * works even if buffer is too small to load all samples. 
- * Also provides the size of each sample into sampleSizes table - * which must be sized correctly, using DiB_fileStats(). - * @return : nb of samples effectively loaded into `buffer` - * *bufferSizePtr is modified, it provides the amount data loaded within buffer. - * sampleSizes is filled with the size of each sample. - */ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, - size_t* sampleSizes, unsigned sstSize, - const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, - unsigned displayLevel) -{ - char* const buff = (char*)buffer; - size_t pos = 0; - unsigned nbLoadedChunks = 0, fileIndex; - - for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; - { size_t const readSize = fread(buff+pos, 1, toLoad, f); - if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); - pos += readSize; - sampleSizes[nbLoadedChunks++] = toLoad; - remainingToLoad -= targetChunkSize; - if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ - fileIndex = nbFiles; /* stop there */ - break; - } - if (toLoad < targetChunkSize) { - fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); - } } } - fclose(f); - } - DISPLAYLEVEL(2, "\r%79s\r", ""); - *bufferSizePtr = pos; - DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) - return nbLoadedChunks; -} - - - -#define rotl32(x,r) ((x << r) | (x >> (32 - r))) -static U32 getRand(U32* src) -{ - static const U32 prime1 = 2654435761U; - static const U32 prime2 = 2246822519U; - U32 rand32 = *src; - rand32 *= prime1; - rand32 ^= prime2; - rand32 = rotl32(rand32, 13); - *src = rand32; - return rand32 >> 5; -} - - -/* shuffle() : - * shuffle a table of file names in a semi-random way - * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, - * it will load random elements from it, instead of just the first ones. 
*/ -static void shuffle(const char** fileNamesTable, unsigned nbFiles) { - U32 seed = 0xFD2FB528; - unsigned i; - for (i = nbFiles - 1; i > 0; --i) { - unsigned const j = getRand(&seed) % (i + 1); - const char* const tmp = fileNamesTable[j]; - fileNamesTable[j] = fileNamesTable[i]; - fileNamesTable[i] = tmp; - } -} - - - -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) -{ - size_t const step = 8 MB; - void* testmem = NULL; - - requiredMem = (((requiredMem >> 23) + 1) << 23); - requiredMem += step; - if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; - - while (!testmem) { - testmem = malloc((size_t)requiredMem); - requiredMem -= step; - } - - free(testmem); - return (size_t)requiredMem; -} - -static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) -{ - FILE* const f = fopen(dictFileName, "wb"); - if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - - { size_t const n = fwrite(buff, 1, buffSize, f); - if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } - - { size_t const n = (size_t)fclose(f); - if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } -} - -/*! getFileStats() : - * Given a list of files, and a chunkSize (0 == no chunk, whole files) - * provides the amount of data to be loaded and the resulting nb of samples. - * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. - */ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) -{ - fileStats fs; - unsigned n; - memset(&fs, 0, sizeof(fs)); - for (n=0; n 2*SAMPLESIZE_MAX); - fs.nbSamples += nbSamples; - } - DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); - return fs; -} - - +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) 
\ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; @@ -225,16 +50,14 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { /** * Selects a random segment from totalSamplesSize - k + 1 possible segments */ -static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, +static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize, ZDICT_random_params_t parameters) { const U32 k = parameters.k; RANDOM_segment_t segment; unsigned index; - /* Seed random number generator */ - srand((unsigned)time(NULL)); /* Randomly generate a number from 0 to sampleSizes - k */ - index = rand()%(ctx->totalSamplesSize - k + 1); + index = rand()%(totalSamplesSize - k + 1); /* inclusive */ segment.begin = index; @@ -261,65 +84,11 @@ static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDi } -/** - * Clean up a context initialized with `RANDOM_ctx_init()`. - */ -static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { - if (!ctx) { - return; - } - if (ctx->offsets) { - free(ctx->offsets); - ctx->offsets = NULL; - } -} - - -/** - * Prepare a context for dictionary building. - * Returns 1 on success or zero on error. - * The context must be destroyed with `RANDOM_ctx_destroy()`. 
- */ -static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples) { - const BYTE *const samples = (const BYTE *)samplesBuffer; - const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); - const int displayLevel = 2; - /* Checks */ - if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { - DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", - (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); - return 0; - } - memset(ctx, 0, sizeof(*ctx)); - DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, - (U32)totalSamplesSize); - ctx->samples = samples; - ctx->samplesSizes = samplesSizes; - ctx->nbSamples = nbSamples; - ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); - ctx->totalSamplesSize = (U32)totalSamplesSize; - if (!ctx->offsets) { - DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); - RANDOM_ctx_destroy(ctx); - return 0; - } - { - U32 i; - ctx->offsets[0] = 0; - for (i = 1; i <= nbSamples; ++i) { - ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; - } - } - return 1; -} - - /** * Given the prepared context build the dictionary. 
*/ -static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, - size_t dictBufferCapacity, +static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples, + void *dictBuffer, size_t dictBufferCapacity, ZDICT_random_params_t parameters) { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; @@ -327,13 +96,13 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, while (tail > 0) { /* Select a segment */ - RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters); size_t segmentSize; segmentSize = MIN(segment.end - segment.begin + 1, tail); tail -= segmentSize; - memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + memcpy(dict + tail, samples + segment.begin, segmentSize); DISPLAYUPDATE( 2, "\r%u%% ", (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); @@ -342,6 +111,7 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, return tail; } + /*! ZDICT_trainFromBuffer_random(): * Train a dictionary from an array of samples using the RANDOM algorithm. 
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, @@ -356,7 +126,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_random_params_t parameters) { const int displayLevel = parameters.zParams.notificationLevel; BYTE* const dict = (BYTE*)dictBuffer; - RANDOM_ctx_t ctx; /* Checks */ if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "k is incorrect\n"); @@ -371,13 +140,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_DICTSIZE_MIN); return ERROR(dstSize_tooSmall); } + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const BYTE *const samples = (const BYTE *)samplesBuffer; - if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { - return ERROR(GENERIC); - } DISPLAYLEVEL(2, "Building dictionary\n"); { - const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters); const size_t dictSize = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, samplesBuffer, samplesSizes, nbSamples, parameters.zParams); @@ -385,71 +153,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)dictSize); } - RANDOM_ctx_destroy(&ctx); return dictSize; } } - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params){ - unsigned const displayLevel = params->zParams.notificationLevel; - void* const dictBuffer = malloc(maxDictSize); - fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); - size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); - size_t const memMult = RANDOM_MEMMULT; - size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; - size_t 
loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); - void* const srcBuffer = malloc(loadedSize+NOISELENGTH); - int result = 0; - - /* Checks */ - if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) - EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ - if (fs.oneSampleTooLarge) { - DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); - DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); - DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); - } - if (fs.nbSamples < 5) { - DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); - DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); - DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); - EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ - } - if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { - DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); - DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); - } - - /* init */ - if (loadedSize < fs.totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); - - /* Load input buffer */ - DISPLAYLEVEL(3, "Shuffling input files\n"); - shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); - - { size_t dictSize; - dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, - sampleSizes, fs.nbSamples, *params); - DISPLAYLEVEL(2, "k=%u\n", params->k); - if (ZDICT_isError(dictSize)) { - DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ - result = 1; - goto _cleanup; - } - /* save dict */ - DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - saveDict(dictFileName, dictBuffer, dictSize); - } - - /* clean up */ -_cleanup: - free(srcBuffer); - free(sampleSizes); - free(dictBuffer); - return result; -} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index 05879641..77529daf 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -8,18 +8,6 @@ #endif #include "zdict.h" - -/************************************** -* Context -***************************************/ -typedef struct { - const BYTE *samples; - size_t *offsets; - const size_t *samplesSizes; - size_t nbSamples; - U32 totalSamplesSize; -} RANDOM_ctx_t; - /** * A segment is an inclusive range in the source. 
*/ @@ -35,19 +23,8 @@ typedef struct { } ZDICT_random_params_t; -typedef struct { - U64 totalSizeToLoad; - unsigned oneSampleTooLarge; - unsigned nbSamples; -} fileStats; - ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_random_params_t parameters); - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params); From 1f7fa5cdd6555e22dfa8c2dc1f5c17293e703fe3 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 16 Jul 2018 16:31:59 -0700 Subject: [PATCH 07/35] Fix spacing and Edit Makefile (now run with make instead of make run) --- contrib/randomDictBuilder/Makefile | 13 +++++---- contrib/randomDictBuilder/README.md | 9 ++++--- contrib/randomDictBuilder/main.c | 42 ++++++++++++++--------------- contrib/randomDictBuilder/random.c | 9 ++++--- contrib/randomDictBuilder/random.h | 5 ++-- 5 files changed, 40 insertions(+), 38 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 443f6f04..77dd2933 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -4,16 +4,15 @@ TEST_INPUT := ../../lib TEST_OUTPUT := randomDict ARG := -all: main testrun test clean +all: main run clean -run: main rand clean +test: main testrun testshell clean -.PHONY: rand -rand: +.PHONY: run +run: echo "Building a random dictionary with given arguments" ./main $(ARG) - main: main.o random.o libzstd.a gcc main.o random.o libzstd.a -o main @@ -34,8 +33,8 @@ testrun: main zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q rm -f $(TEST_OUTPUT) -.PHONY: test -test: test.sh +.PHONY: testshell +testshell: test.sh sh test.sh echo "Finish running test.sh" diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index 09f1e808..0e70d3dc 100644 --- 
a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -7,11 +7,14 @@ Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 +###Running Test: +make test + ###Usage: -To build a random dictionary with the provided arguments: make run ARG= followed by arguments +To build a random dictionary with the provided arguments: make ARG= followed by arguments ### Examples: -make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" -make run ARG="in=../../lib/dictBuilder in=../../lib/compress" +make ARG="in=../../lib/dictBuilder out=dict100 dictID=520" +make ARG="in=../../lib/dictBuilder in=../../lib/compress" diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index d9295aa9..e195188b 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -52,7 +52,8 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define SAMPLESIZE_MAX (128 KB) #define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) #define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? 
+ (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); #define NOISELENGTH 32 @@ -76,8 +77,7 @@ typedef struct { /*-************************************* * Commandline related functions ***************************************/ -static unsigned readU32FromChar(const char** stringPtr) -{ +static unsigned readU32FromChar(const char** stringPtr){ const char errorMsg[] = "error: numeric value too large"; unsigned result = 0; while ((**stringPtr >='0') && (**stringPtr <='9')) { @@ -105,8 +105,7 @@ static unsigned readU32FromChar(const char** stringPtr) * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. * @return 0 and doesn't modify *stringPtr otherwise. */ -static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) -{ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ size_t const comSize = strlen(longCommand); int const result = !strncmp(*stringPtr, longCommand, comSize); if (result) *stringPtr += comSize; @@ -125,11 +124,9 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) * *bufferSizePtr is modified, it provides the amount data loaded within buffer. * sampleSizes is filled with the size of each sample. 
*/ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, - size_t* sampleSizes, unsigned sstSize, - const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, - unsigned displayLevel) -{ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, + unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, + size_t targetChunkSize, unsigned displayLevel) { char* const buff = (char*)buffer; size_t pos = 0; unsigned nbLoadedChunks = 0, fileIndex; @@ -200,8 +197,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) { /*-******************************************************** * Dictionary training functions **********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) -{ +static size_t findMaxMem(unsigned long long requiredMem) { size_t const step = 8 MB; void* testmem = NULL; @@ -219,8 +215,7 @@ static size_t findMaxMem(unsigned long long requiredMem) } static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) -{ + const void* buff, size_t buffSize) { FILE* const f = fopen(dictFileName, "wb"); if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); @@ -236,8 +231,8 @@ static void saveDict(const char* dictFileName, * provides the amount of data to be loaded and the resulting nb of samples. * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
*/ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) -{ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, unsigned displayLevel) { fileStats fs; unsigned n; memset(&fs, 0, sizeof(fs)); @@ -255,8 +250,9 @@ static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, siz return fs; } -int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, - ZDICT_random_params_t *params){ +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, + unsigned maxDictSize, + ZDICT_random_params_t *params) { unsigned const displayLevel = params->zParams.notificationLevel; void* const dictBuffer = malloc(maxDictSize); @@ -285,8 +281,8 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned m return result; } -sampleInfo* getSampleInfo(const char** fileNamesTable, - unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){ +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel) { fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); size_t const memMult = RANDOM_MEMMULT; @@ -320,7 +316,8 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, /* Load input buffer */ DISPLAYLEVEL(3, "Shuffling input files\n"); shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, + fileNamesTable, nbFiles, chunkSize, displayLevel); sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); @@ -376,7 +373,8 @@ int main(int argCount, const char* argv[]) unsigned fileNamesNb = filenameIdx; int followLinks = 
0; const char** extendedFileList = NULL; - extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); if (extendedFileList) { unsigned u; for (u=0; u Date: Mon, 16 Jul 2018 18:59:18 -0700 Subject: [PATCH 08/35] Remove CLevel cli option which was accidentally added back in the last commit --- contrib/randomDictBuilder/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index e195188b..e66f2847 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -336,7 +336,6 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - unsigned cLevel = DEFAULT_CLEVEL; char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -349,7 +348,6 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } - if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { @@ -387,7 +385,7 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = cLevel; + zParams.compressionLevel = DEFAULT_CLEVEL; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; From 53e1f0504e077f90ecaea3a0bc18327177fd57ee Mon Sep 17 00:00:00 2001 From: cyan4973 Date: Tue, 17 Jul 2018 14:39:44 +0200 Subject: [PATCH 09/35] zstdmt debug traces 
compatibles with mingw since mingw does not have `sys/times.h`, remove this path when detecting mingw compilation. --- lib/compress/zstdmt_compress.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c index 6daedca8..d5193d52 100644 --- a/lib/compress/zstdmt_compress.c +++ b/lib/compress/zstdmt_compress.c @@ -37,7 +37,9 @@ #define ZSTD_RESIZE_SEQPOOL 0 /* ====== Debug ====== */ -#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) && !defined(_MSC_VER) +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \ + && !defined(_MSC_VER) \ + && !defined(__MINGW32__) # include # include From 49acfaeaec44a25c4628a2512965445152e8776a Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 12:35:09 -0700 Subject: [PATCH 10/35] Move file loading functions to new file for access by benchmarking tool --- contrib/randomDictBuilder/Makefile | 11 +- contrib/randomDictBuilder/io.c | 243 +++++++++++++++++++++++++++++ contrib/randomDictBuilder/io.h | 33 ++++ contrib/randomDictBuilder/main.c | 215 +------------------------ 4 files changed, 290 insertions(+), 212 deletions(-) create mode 100644 contrib/randomDictBuilder/io.c create mode 100644 contrib/randomDictBuilder/io.h diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 77dd2933..8360a409 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -13,15 +13,18 @@ run: echo "Building a random dictionary with given arguments" ./main $(ARG) -main: main.o random.o libzstd.a - gcc main.o random.o libzstd.a -o main +main: main.o io.o random.o libzstd.a + gcc main.o io.o random.o libzstd.a -o main -main.o: main.c $(PROGRAM_FILES) - gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +main.o: main.c + gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder random.o: random.c gcc 
-c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder +io.o: io.c $(PROGRAM_FILES) + gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder + libzstd.a: $(MAKE) -C ../../lib libzstd.a mv ../../lib/libzstd.a . diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c new file mode 100644 index 00000000..a5f71498 --- /dev/null +++ b/contrib/randomDictBuilder/io.c @@ -0,0 +1,243 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "io.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ + +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? 
((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? + (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 + + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, + unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, + size_t targetChunkSize, unsigned displayLevel) { + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= 
prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. */ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) { + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) { + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, unsigned displayLevel) { + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel) { + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, + fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h new file mode 100644 index 00000000..4b5639fe --- /dev/null +++ b/contrib/randomDictBuilder/io.h @@ -0,0 +1,33 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "zstd_internal.h" /* includes zstd.h */ +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel); + + +void saveDict(const char* dictFileName, const void* buff, size_t buffSize); diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index e66f2847..34a9d99e 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -4,11 +4,11 @@ #include /* errno */ #include #include "random.h" -#include "fileio.h" /* 
stdinmark, stdoutmark, ZSTD_EXTENSION */ -#include "platform.h" /* Large Files support */ +#include "io.h" #include "util.h" #include "zdict.h" + /*-************************************* * Console display ***************************************/ @@ -23,6 +23,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ if (displayLevel>=4) fflush(stderr); } } } + /*-************************************* * Exceptions ***************************************/ @@ -39,6 +40,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; exit(error); \ } + /*-************************************* * Constants ***************************************/ @@ -49,29 +51,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 -#define SAMPLESIZE_MAX (128 KB) -#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) -#define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? - (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); - -#define NOISELENGTH 32 - - -/*-************************************* -* Structs -***************************************/ -typedef struct { - U64 totalSizeToLoad; - unsigned oneSampleTooLarge; - unsigned nbSamples; -} fileStats; - -typedef struct { - const void* srcBuffer; - const size_t *samplesSizes; - size_t nbSamples; -}sampleInfo; /*-************************************* @@ -112,144 +91,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) return result; } -/* ******************************************************** -* File related operations -**********************************************************/ -/** loadFiles() : - * load samples from files listed in fileNamesTable into buffer. - * works even if buffer is too small to load all samples. 
- * Also provides the size of each sample into sampleSizes table - * which must be sized correctly, using DiB_fileStats(). - * @return : nb of samples effectively loaded into `buffer` - * *bufferSizePtr is modified, it provides the amount data loaded within buffer. - * sampleSizes is filled with the size of each sample. - */ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, - unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, - size_t targetChunkSize, unsigned displayLevel) { - char* const buff = (char*)buffer; - size_t pos = 0; - unsigned nbLoadedChunks = 0, fileIndex; - - for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; - { size_t const readSize = fread(buff+pos, 1, toLoad, f); - if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); - pos += readSize; - sampleSizes[nbLoadedChunks++] = toLoad; - remainingToLoad -= targetChunkSize; - if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ - fileIndex = nbFiles; /* stop there */ - break; - } - if (toLoad < targetChunkSize) { - fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); - } } } - fclose(f); - } - DISPLAYLEVEL(2, "\r%79s\r", ""); - *bufferSizePtr = pos; - DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) - return nbLoadedChunks; -} - -#define rotl32(x,r) ((x << r) | (x >> (32 - r))) -static U32 getRand(U32* src) -{ - static const U32 prime1 = 2654435761U; - static const U32 prime2 = 2246822519U; - U32 rand32 = *src; - rand32 *= prime1; - rand32 ^= prime2; - rand32 = rotl32(rand32, 13); - *src = rand32; - return rand32 >> 5; -} - -/* shuffle() : - * shuffle a table of file names in a semi-random way - * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, - * it will load random elements from it, instead of just the first ones. 
*/ -static void shuffle(const char** fileNamesTable, unsigned nbFiles) { - U32 seed = 0xFD2FB528; - unsigned i; - for (i = nbFiles - 1; i > 0; --i) { - unsigned const j = getRand(&seed) % (i + 1); - const char* const tmp = fileNamesTable[j]; - fileNamesTable[j] = fileNamesTable[i]; - fileNamesTable[i] = tmp; - } -} -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) { - size_t const step = 8 MB; - void* testmem = NULL; - - requiredMem = (((requiredMem >> 23) + 1) << 23); - requiredMem += step; - if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; - - while (!testmem) { - testmem = malloc((size_t)requiredMem); - requiredMem -= step; - } - - free(testmem); - return (size_t)requiredMem; -} - -static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) { - FILE* const f = fopen(dictFileName, "wb"); - if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - - { size_t const n = fwrite(buff, 1, buffSize, f); - if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } - - { size_t const n = (size_t)fclose(f); - if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } -} - -/*! getFileStats() : - * Given a list of files, and a chunkSize (0 == no chunk, whole files) - * provides the amount of data to be loaded and the resulting nb of samples. - * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
- */ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, unsigned displayLevel) { - fileStats fs; - unsigned n; - memset(&fs, 0, sizeof(fs)); - for (n=0; n 2*SAMPLESIZE_MAX); - fs.nbSamples += nbSamples; - } - DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); - return fs; -} - +/*-************************************* +* RANDOM +***************************************/ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, ZDICT_random_params_t *params) { @@ -281,53 +127,6 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, return result; } -sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, - unsigned maxDictSize, const unsigned displayLevel) { - fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); - size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); - size_t const memMult = RANDOM_MEMMULT; - size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; - size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); - void* const srcBuffer = malloc(loadedSize+NOISELENGTH); - - /* Checks */ - if ((!sampleSizes) || (!srcBuffer)) - EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ - if (fs.oneSampleTooLarge) { - DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); - DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); - DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); - } - if (fs.nbSamples < 5) { - DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); - DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); - DISPLAYLEVEL(2, "! 
Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); - EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ - } - if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { - DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); - DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); - } - - /* init */ - if (loadedSize < fs.totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); - - /* Load input buffer */ - DISPLAYLEVEL(3, "Shuffling input files\n"); - shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, - fileNamesTable, nbFiles, chunkSize, displayLevel); - - sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); - - info->nbSamples = fs.nbSamples; - info->samplesSizes = sampleSizes; - info->srcBuffer = srcBuffer; - - return info; -} - int main(int argCount, const char* argv[]) From e6fe4058388c820444a80d9d10aa5d840fab3c0c Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 12:42:53 -0700 Subject: [PATCH 11/35] Make test PHONY target --- contrib/randomDictBuilder/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 8360a409..678ff28a 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -6,6 +6,7 @@ ARG := all: main run clean +.PHONY: test test: main testrun testshell clean .PHONY: run From 4e706d7f2cb79df257809b45c033b3bcf5822edf Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 17 Jul 2018 14:57:27 -0700 Subject: [PATCH 12/35] fileio: Error in compression on read errors We can write a corrupted file if the input file errors during a read. We should return a non-zero error code in this case. 
--- programs/fileio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/programs/fileio.c b/programs/fileio.c index b4eed28d..85367fdf 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -797,6 +797,14 @@ FIO_compressZstdFrame(const cRess_t* ressPtr, } } while (directive != ZSTD_e_end); + if (ferror(srcFile)) { + EXM_THROW(26, "Read error : I/O error"); + } + if (fileSize != UTIL_FILESIZE_UNKNOWN && *readsize != fileSize) { + EXM_THROW(27, "Read error : Incomplete read : %llu / %llu B", + (unsigned long long)*readsize, (unsigned long long)fileSize); + } + return compressedfilesize; } From 896ff0644a2531a22edf78ea9cb6b58a4de9c77f Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 16:01:44 -0700 Subject: [PATCH 13/35] Fix deallocation problem and add documentation --- contrib/randomDictBuilder/io.c | 7 +++++++ contrib/randomDictBuilder/io.h | 17 +++++++++++++++++ contrib/randomDictBuilder/main.c | 20 +++++++++++--------- contrib/randomDictBuilder/random.c | 11 ++--------- contrib/randomDictBuilder/random.h | 9 ++++++++- 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c index a5f71498..1c3eda58 100644 --- a/contrib/randomDictBuilder/io.c +++ b/contrib/randomDictBuilder/io.c @@ -241,3 +241,10 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t return info; } + + +void freeSampleInfo(sampleInfo *info) { + if (info->samplesSizes) free((void*)(info->samplesSizes)); + if (info->srcBuffer) free((void*)(info->srcBuffer)); + free(info); +} diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h index 4b5639fe..55967f76 100644 --- a/contrib/randomDictBuilder/io.h +++ b/contrib/randomDictBuilder/io.h @@ -26,8 +26,25 @@ typedef struct { }sampleInfo; + +/*! 
getSampleInfo(): + * Load from input files and add samples to buffer + * @return: a sampleInfo struct containing infomation about buffer where samples are stored, + * size of each sample, and total number of samples + */ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel); + +/*! freeSampleInfo(): + * Free memory allocated for info + */ +void freeSampleInfo(sampleInfo *info); + + + +/*! saveDict(): + * Save data stored on buff to dictFileName + */ void saveDict(const char* dictFileName, const void* buff, size_t buffSize); diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 34a9d99e..1f12c7a4 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -46,7 +46,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; ***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_CLEVEL 3 -#define DEFAULT_INPUTFILE "" #define DEFAULT_k 200 #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 @@ -135,30 +134,29 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - char* inputFile = DEFAULT_INPUTFILE; + /* Initialize arguments to default values */ unsigned k = DEFAULT_k; - char* outputFile = DEFAULT_OUTPUTFILE; + const char* outputFile = DEFAULT_OUTPUTFILE; unsigned dictID = DEFAULT_DICTID; unsigned maxDictSize = g_defaultMaxDictSize; + /* Initialize table to store input files */ const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); unsigned filenameIdx = 0; + /* Parse arguments */ for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { 
maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { - inputFile = malloc(strlen(argument) + 1); - strcpy(inputFile, argument); - filenameTable[filenameIdx] = inputFile; + filenameTable[filenameIdx] = argument; filenameIdx++; continue; } if (longCommandWArg(&argument, "out=")) { - outputFile = malloc(strlen(argument) + 1); - strcpy(outputFile, argument); + outputFile = argument; continue; } DISPLAYLEVEL(1, "Incorrect parameters\n"); @@ -168,7 +166,7 @@ int main(int argCount, const char* argv[]) char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; - int followLinks = 0; + int followLinks = 0; /* follow directory recursively */ const char** extendedFileList = NULL; extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); @@ -194,5 +192,9 @@ int main(int argCount, const char* argv[]) filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params); + /* Free allocated memory */ + UTIL_freeFileList(extendedFileList, fileNamesBuf); + freeSampleInfo(info); + return operationResult; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index cfed14a4..34aec39e 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -113,15 +113,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE * } -/*! ZDICT_trainFromBuffer_random(): - * Train a dictionary from an array of samples using the RANDOM algorithm. - * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each - * sample, in order. - * The resulting dictionary will be saved into `dictBuffer`. - * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError().
- */ + + ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index b6696323..c3146f86 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -23,7 +23,14 @@ typedef struct { } ZDICT_random_params_t; - +/*! ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + */ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_random_params_t parameters); From ce09fb723d1311e62c920430fb14634e9b67dd70 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 16:13:40 -0700 Subject: [PATCH 14/35] Update freeSampleInfo --- contrib/randomDictBuilder/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c index 1c3eda58..67c40858 100644 --- a/contrib/randomDictBuilder/io.c +++ b/contrib/randomDictBuilder/io.c @@ -244,6 +244,7 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t void freeSampleInfo(sampleInfo *info) { + if (!info) return; if (info->samplesSizes) free((void*)(info->samplesSizes)); if (info->srcBuffer) free((void*)(info->srcBuffer)); free(info); From 52e7cf0e405ac6eb827322b607d094125646bbfb Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 18 Jul 2018 10:40:13 -0700 Subject: [PATCH 15/35] Add cleanup 
to trainfromFiles and move RANDOM_segment_t declaration --- contrib/randomDictBuilder/main.c | 3 ++- contrib/randomDictBuilder/random.c | 9 +++++++++ contrib/randomDictBuilder/random.h | 7 ------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 1f12c7a4..36c4326b 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ result = 1; - free(dictBuffer); + goto _cleanup; } /* save dict */ DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); @@ -122,6 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, } /* clean up */ +_cleanup: free(dictBuffer); return result; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index 34aec39e..5276bea9 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -47,6 +47,15 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { } +/** + * A segment is an inclusive range in the source. + */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + /** * Selects a random segment from totalSamplesSize - k + 1 possible segments */ diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index c3146f86..352775f9 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -8,13 +8,6 @@ #endif #include "zdict.h" -/** - * A segment is an inclusive range in the source. 
- */ -typedef struct { - U32 begin; - U32 end; -} RANDOM_segment_t; typedef struct { From 5bb46a898e6565e5bc1ee861999384f806f83831 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 18 Jul 2018 12:15:49 -0700 Subject: [PATCH 16/35] Rename cleanup --- contrib/randomDictBuilder/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 36c4326b..4751a9e1 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ result = 1; - goto _cleanup; + goto _done; } /* save dict */ DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); @@ -122,7 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, } /* clean up */ -_cleanup: +_done: free(dictBuffer); return result; } From 0c5eaef248443342dd1cd19f5e434334bef6fc4c Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 19 Jul 2018 13:44:27 -0700 Subject: [PATCH 17/35] Update Makefile --- contrib/randomDictBuilder/Makefile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 678ff28a..5f9240bf 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -1,8 +1,11 @@ -PROGRAM_FILES := ../../programs/fileio.c +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder TEST_INPUT := ../../lib TEST_OUTPUT := randomDict -ARG := all: main run clean @@ -15,16 +18,16 @@ run: ./main $(ARG) main: main.o io.o random.o libzstd.a - gcc main.o io.o random.o libzstd.a -o main + $(CC) $(CFLAGS) main.o io.o random.o libzstd.a -o 
main main.o: main.c - gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder + $(CC) $(CFLAGS) $(INCLUDES) -c main.c random.o: random.c - gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder + $(CC) $(CFLAGS) $(INCLUDES) -c random.c -io.o: io.c $(PROGRAM_FILES) - gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +io.o: io.c + $(CC) $(CFLAGS) $(INCLUDES) -c io.c libzstd.a: $(MAKE) -C ../../lib libzstd.a @@ -44,8 +47,6 @@ testshell: test.sh .PHONY: clean clean: - rm -f libzstd.a main - rm -f ../../lib/*/*.o - rm -f ../../programs/*.o - rm -f *.o + rm -f *.o main libzstd.a + $(MAKE) -C ../../lib clean echo "Cleaning is completed" From 5624f3f1eabf84d603ca5607f59e6aa286d13211 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 19 Jul 2018 14:35:27 -0700 Subject: [PATCH 18/35] Revert "attempt to re-enable arm64 tests" This reverts commit 9c277f137cbcaa385ff5b95ec4cbdce50675541d. --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 80406064..71b27019 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,7 +25,12 @@ matrix: - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest' - env: Cmd='make arminstall && make armfuzz' - - env: Cmd='make arminstall && make aarch64fuzz' + +# Following test is disabled, as there is a bug in Travis' ld +# preventing aarch64 compilation to complete. 
+# > collect2: error: ld terminated with signal 11 [Segmentation fault], core dumped +# to be re-enabled in a few commit, as it's possible that a random code change circumvent the ld bug +# - env: Cmd='make arminstall && make aarch64fuzz' - env: Cmd='make ppcinstall && make ppcfuzz' - env: Cmd='make ppcinstall && make ppc64fuzz' From 470c8d42f4bbc8246bcd0bc8438aaad6d1c375ee Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 20 Jul 2018 11:32:39 -0700 Subject: [PATCH 19/35] Benchmark dictionary builders --- contrib/benchmarkDictBuilder/Makefile | 44 ++ contrib/benchmarkDictBuilder/README.md | 43 ++ contrib/benchmarkDictBuilder/benchmark.c | 458 +++++++++++++++++++++ contrib/benchmarkDictBuilder/dictBuilder.h | 10 + contrib/benchmarkDictBuilder/test.sh | 2 + contrib/randomDictBuilder/io.c | 2 +- contrib/randomDictBuilder/io.h | 4 + 7 files changed, 562 insertions(+), 1 deletion(-) create mode 100644 contrib/benchmarkDictBuilder/Makefile create mode 100644 contrib/benchmarkDictBuilder/README.md create mode 100644 contrib/benchmarkDictBuilder/benchmark.c create mode 100644 contrib/benchmarkDictBuilder/dictBuilder.h create mode 100644 contrib/benchmarkDictBuilder/test.sh diff --git a/contrib/benchmarkDictBuilder/Makefile b/contrib/benchmarkDictBuilder/Makefile new file mode 100644 index 00000000..d36d96d5 --- /dev/null +++ b/contrib/benchmarkDictBuilder/Makefile @@ -0,0 +1,44 @@ +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../randomDictBuilder -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder + +RANDOM_FILE := ../randomDictBuilder/random.c +IO_FILE := ../randomDictBuilder/io.c + +all: run clean + +.PHONY: run +run: benchmark + echo "Benchmarking with $(ARG)" + ./benchmark $(ARG) + +.PHONY: test +test: benchmarkTest clean + +.PHONY: benchmarkTest +benchmarkTest: benchmark test.sh + sh test.sh + +benchmark: benchmark.o io.o random.o libzstd.a + $(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark + +benchmark.o: benchmark.c 
+ $(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c + +random.o: $(RANDOM_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE) + +io.o: $(IO_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) + +libzstd.a: + $(MAKE) -C ../../lib libzstd.a + mv ../../lib/libzstd.a . + +.PHONY: clean +clean: + rm -f *.o benchmark libzstd.a + $(MAKE) -C ../../lib clean + echo "Cleaning is completed" diff --git a/contrib/benchmarkDictBuilder/README.md b/contrib/benchmarkDictBuilder/README.md new file mode 100644 index 00000000..b680a53c --- /dev/null +++ b/contrib/benchmarkDictBuilder/README.md @@ -0,0 +1,43 @@ +Benchmarking Dictionary Builder + +### Permitted Argument: +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" + +###Running Test: +make test + +###Usage: +Benchmark given input files: make ARG= followed by permitted arguments + +### Examples: +make ARG="in=../../lib/dictBuilder in=../../lib/compress" + +###Benchmarking Result: + +github: +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| random | 0.182254 | 8.786957 | +| cover | 34.821007 | 10.430999 | +| legacy | 1.125494 | 8.989482 | + +hg-commands +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| random | 0.089231 | 3.489515 | +| cover | 32.342462 | 4.030274 | +| legacy | 1.066594 | 3.911896 | + +hg-manifest +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| random | 1.095083 | 2.309485 | +| cover | 517.999132 | 2.575331 | +| legacy | 10.789509 | 2.506775 | + +hg-changelog +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| random | 0.639630 | 2.096785 | +| cover | 121.398023 | 2.175706 | +| legacy | 3.050893 | 2.058273 | diff --git 
a/contrib/benchmarkDictBuilder/benchmark.c b/contrib/benchmarkDictBuilder/benchmark.c new file mode 100644 index 00000000..aabd96a0 --- /dev/null +++ b/contrib/benchmarkDictBuilder/benchmark.c @@ -0,0 +1,458 @@ +#include <stdio.h> /* fprintf */ +#include <stdlib.h> /* malloc, free, qsort */ +#include <string.h> /* strcmp, strlen */ +#include <errno.h> /* errno */ +#include <ctype.h> +#include <time.h> +#include "random.h" +#include "dictBuilder.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "io.h" +#include "util.h" +#include "zdict.h" + + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) 
\ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define MEMMULT 11 +#define NOISELENGTH 32 + +/*-************************************* +* Struct +***************************************/ +typedef struct { + const void* dictBuffer; + size_t dictSize; +} dictInfo; + + +/*-************************************* +* Commandline related functions +***************************************/ +static unsigned readU32FromChar(const char** stringPtr){ + const char errorMsg[] = "error: numeric value too large"; + unsigned result = 0; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. 
+ */ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + +static void fillNoise(void* buffer, size_t length) +{ + unsigned const prime1 = 2654435761U; + unsigned const prime2 = 2246822519U; + unsigned acc = prime1; + size_t p=0;; + + for (p=0; p<length; p++) { + acc *= prime2; + ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); + } +} + +/*-************************************* +* Dictionary related operations +***************************************/ +/** createDictFromFiles() : + * Based on type of param given, train dictionary using the corresponding algorithm + * @return dictInfo containing dictionary buffer and dictionary size + */ +dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, + ZDICT_legacy_params_t *legacyParams) { + unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel : + coverParams ? coverParams->zParams.notificationLevel : + legacyParams ? 
legacyParams->zParams.notificationLevel : + 0; /* should never happen */ + void* const dictBuffer = malloc(maxDictSize); + + dictInfo* dInfo; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + if(randomParams) { + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *randomParams); + }else if(coverParams) { + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, coverParams); + } else { + size_t totalSize= 0; + for (int i = 0; i < info->nbSamples; i++) { + totalSize += info->samplesSizes[i]; + } + size_t const maxMem = findMaxMem(totalSize * MEMMULT) / MEMMULT; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, totalSize); + fillNoise((char*)(info->srcBuffer) + loadedSize, NOISELENGTH); + dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *legacyParams); + } + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + free(dictBuffer); + freeSampleInfo(info); + return dInfo; + } + dInfo = (dictInfo *)malloc(sizeof(dictInfo)); + dInfo->dictBuffer = dictBuffer; + dInfo->dictSize = dictSize; + } + return dInfo; +} + + +/** compressWithDict() : + * Compress samples from sample buffer given dicionary stored on dictionary buffer and compression level + * @return compression ratio + */ +double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) { + /* Local variables */ + size_t totalCompressedSize = 0; + size_t totalOriginalSize = 0; + double cRatio; + size_t dstCapacity; + int i; + + /* Pointers */ + ZSTD_CCtx* cctx; + ZSTD_CDict *cdict; + size_t *offsets; + void* dst; + + /* Allocate dst with enough space to compress the 
maximum sized sample */ + { + size_t maxSampleSize = 0; + for (int i = 0; i < srcInfo->nbSamples; i++) { + maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + + /* Create the cctx and cdict */ + cctx = ZSTD_createCCtx(); + cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); + + if(!cctx || !cdict || !dst) { + cRatio = -1; + goto _cleanup; + } + + /* Calculate offset for each sample */ + offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t)); + offsets[0] = 0; + for (i = 1; i <= srcInfo->nbSamples; i++) { + offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1]; + } + + /* Compress each sample and sum their sizes*/ + const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer; + for (i = 0; i < srcInfo->nbSamples; i++) { + const size_t compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + if (ZSTD_isError(compressedSize)) { + cRatio = -1; + goto _cleanup; + } + totalCompressedSize += compressedSize; + } + + /* Sum orignal sizes */ + for (i = 0; i<srcInfo->nbSamples; i++) { + totalOriginalSize += srcInfo->samplesSizes[i]; + } + + /* Calculate compression ratio */ + DISPLAYLEVEL(2, "original size is %lu\n", totalOriginalSize); + DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); + cRatio = (double)totalOriginalSize/(double)totalCompressedSize; + +_cleanup: + if(dst) { + free(dst); + } + if(offsets) { + free(offsets); + } + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + return cRatio; +} + + +/** FreeDictInfo() : + * Free memory allocated for dictInfo + */ +void freeDictInfo(dictInfo* info) { + if (!info) return; + if (info->dictBuffer) free((void*)(info->dictBuffer)); + free(info); +} + + + +/*-******************************************************** + * Benchmarking functions +**********************************************************/ +/** 
benchmarkRandom() : + * Measure how long random dictionary builder takes and compression ratio with the random dictionary + * @return 0 if benchmark successfully, 1 otherwise + */ +int benchmarkRandom(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam) { + const int displayLevel = randomParam->zParams.notificationLevel; + int result = 0; + clock_t t; + t = clock(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, NULL, NULL); + t = clock() - t; + double time_taken = ((double)t)/CLOCKS_PER_SEC; + if (!dInfo) { + DISPLAYLEVEL(1, "RANDOM does not train successfully\n"); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(2, "RANDOM took %f seconds to execute \n", time_taken); + + double cRatio = compressWithDict(srcInfo, dInfo, randomParam->zParams.compressionLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with RANDOM dictionary does not work\n"); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(2, "Compression ratio with random dictionary is %f\n", cRatio); + + +_cleanup: + freeDictInfo(dInfo); + return result; +} + +/** benchmarkCover() : + * Measure how long random dictionary builder takes and compression ratio with the cover dictionary + * @return 0 if benchmark successfully, 1 otherwise + */ +int benchmarkCover(sampleInfo *srcInfo, unsigned maxDictSize, + ZDICT_cover_params_t *coverParam) { + const int displayLevel = coverParam->zParams.notificationLevel; + int result = 0; + clock_t t; + t = clock(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, coverParam, NULL); + t = clock() - t; + double time_taken = ((double)t)/CLOCKS_PER_SEC; + if (!dInfo) { + DISPLAYLEVEL(1, "COVER does not train successfully\n"); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(2, "COVER took %f seconds to execute \n", time_taken); + + double cRatio = compressWithDict(srcInfo, dInfo, coverParam->zParams.compressionLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with 
COVER dictionary does not work\n"); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(2, "Compression ratio with cover dictionary is %f\n", cRatio); + +_cleanup: + freeDictInfo(dInfo); + return result; +} + + + +/** benchmarkLegacy() : + * Measure how long legacy dictionary builder takes and compression ratio with the legacy dictionary + * @return 0 if benchmark successfully, 1 otherwise + */ +int benchmarkLegacy(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_legacy_params_t *legacyParam) { + const int displayLevel = legacyParam->zParams.notificationLevel; + int result = 0; + clock_t t; + t = clock(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, NULL, legacyParam); + t = clock() - t; + double time_taken = ((double)t)/CLOCKS_PER_SEC; + if (!dInfo) { + DISPLAYLEVEL(1, "LEGACY does not train successfully\n"); + result = 1; + goto _cleanup; + + } + DISPLAYLEVEL(2, "LEGACY took %f seconds to execute \n", time_taken); + + double cRatio = compressWithDict(srcInfo, dInfo, legacyParam->zParams.compressionLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with LEGACY dictionary does not work\n"); + result = 1; + goto _cleanup; + + } + DISPLAYLEVEL(2, "Compression ratio with legacy dictionary is %f\n", cRatio); + +_cleanup: + freeDictInfo(dInfo); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int result = 0; + /* Initialize arguments to default values */ + unsigned k = 200; + unsigned d = 6; + unsigned cLevel = 3; + unsigned dictID = 0; + unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; + const char** extendedFileList = NULL; + + /* Parse arguments */ + for (int i = 1; i < argCount; i++) { + const 
char* argument = argv[i]; + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n"); + return 1; + } + + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u Date: Fri, 20 Jul 2018 17:03:47 -0700 Subject: [PATCH 20/35] Refactoring and benchmark without dictionary --- contrib/benchmarkDictBuilder/README.md | 43 --- contrib/benchmarkDictBuilder/dictBuilder.h | 10 - .../benchmarkDictBuilder/Makefile | 8 +- .../benchmarkDictBuilder/README.md | 47 +++ .../benchmarkDictBuilder/benchmark.c | 322 +++++++----------- .../benchmarkDictBuilder/dictBuilder.h | 6 + .../benchmarkDictBuilder/test.sh | 2 +- .../randomDictBuilder/Makefile | 10 +- .../randomDictBuilder/README.md | 4 +- .../randomDictBuilder/io.c | 33 ++ .../randomDictBuilder/io.h | 8 +- .../randomDictBuilder/main.c | 40 --- .../randomDictBuilder/random.c | 0 .../randomDictBuilder/random.h | 0 .../randomDictBuilder/test.sh | 12 +- 15 files changed, 232 insertions(+), 313 deletions(-) delete mode 100644 contrib/benchmarkDictBuilder/README.md delete mode 100644 contrib/benchmarkDictBuilder/dictBuilder.h rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/Makefile (76%) create mode 100644 contrib/experimental_dict_builders/benchmarkDictBuilder/README.md rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/benchmark.c (53%) create mode 100644 contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h rename contrib/{ => experimental_dict_builders}/benchmarkDictBuilder/test.sh (54%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/Makefile (79%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/README.md (85%) rename contrib/{ => 
experimental_dict_builders}/randomDictBuilder/io.c (89%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/io.h (78%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/main.c (79%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/random.c (100%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/random.h (100%) rename contrib/{ => experimental_dict_builders}/randomDictBuilder/test.sh (52%) diff --git a/contrib/benchmarkDictBuilder/README.md b/contrib/benchmarkDictBuilder/README.md deleted file mode 100644 index b680a53c..00000000 --- a/contrib/benchmarkDictBuilder/README.md +++ /dev/null @@ -1,43 +0,0 @@ -Benchmarking Dictionary Builder - -### Permitted Argument: -Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" - -###Running Test: -make test - -###Usage: -Benchmark given input files: make ARG= followed by permitted arguments - -### Examples: -make ARG="in=../../lib/dictBuilder in=../../lib/compress" - -###Benchmarking Result: - -github: -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| random | 0.182254 | 8.786957 | -| cover | 34.821007 | 10.430999 | -| legacy | 1.125494 | 8.989482 | - -hg-commands -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| random | 0.089231 | 3.489515 | -| cover | 32.342462 | 4.030274 | -| legacy | 1.066594 | 3.911896 | - -hg-manifest -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| random | 1.095083 | 2.309485 | -| cover | 517.999132 | 2.575331 | -| legacy | 10.789509 | 2.506775 | - -hg-changelog -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| random | 0.639630 | 2.096785 | -| cover | 
121.398023 | 2.175706 | -| legacy | 3.050893 | 2.058273 | diff --git a/contrib/benchmarkDictBuilder/dictBuilder.h b/contrib/benchmarkDictBuilder/dictBuilder.h deleted file mode 100644 index a2dae576..00000000 --- a/contrib/benchmarkDictBuilder/dictBuilder.h +++ /dev/null @@ -1,10 +0,0 @@ -/*! ZDICT_trainFromBuffer_unsafe_legacy() : - Strictly Internal use only !! - Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`. - `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads. - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - or an error code. -*/ -size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_legacy_params_t parameters); diff --git a/contrib/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile similarity index 76% rename from contrib/benchmarkDictBuilder/Makefile rename to contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile index d36d96d5..72ce04f2 100644 --- a/contrib/benchmarkDictBuilder/Makefile +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile @@ -2,7 +2,7 @@ ARG := CC ?= gcc CFLAGS ?= -O3 -INCLUDES := -I ../randomDictBuilder -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder RANDOM_FILE := ../randomDictBuilder/random.c IO_FILE := ../randomDictBuilder/io.c @@ -34,11 +34,11 @@ io.o: $(IO_FILE) $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) libzstd.a: - $(MAKE) -C ../../lib libzstd.a - mv ../../lib/libzstd.a . + $(MAKE) -C ../../../lib libzstd.a + mv ../../../lib/libzstd.a . 
.PHONY: clean clean: rm -f *.o benchmark libzstd.a - $(MAKE) -C ../../lib clean + $(MAKE) -C ../../../lib clean echo "Cleaning is completed" diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md new file mode 100644 index 00000000..de783a0e --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -0,0 +1,47 @@ +Benchmarking Dictionary Builder + +### Permitted Argument: +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" + +###Running Test: +make test + +###Usage: +Benchmark given input files: make ARG= followed by permitted arguments + +### Examples: +make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" + +###Benchmarking Result: + +github: +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000004 | 2.999642 | +| random | 0.180238 | 8.786957 | +| cover | 33.891987 | 10.430999 | +| legacy | 1.077569 | 8.989482 | + +hg-commands +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000006 | 2.425291 | +| random | 0.088735 | 3.489515 | +| cover | 35.447300 | 4.030274 | +| legacy | 1.048509 | 3.911896 | + +hg-manifest +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000005 | 1.866385 | +| random | 1.148231 | 2.309485 | +| cover | 509.685257 | 2.575331 | +| legacy | 10.705866 | 2.506775 | + +hg-changelog +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000005 | 1.377613 | +| random | 0.706434 | 2.096785 | +| cover | 122.815783 | 2.175706 | +| legacy | 3.010318 | 2.058273 | diff --git 
a/contrib/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c similarity index 53% rename from contrib/benchmarkDictBuilder/benchmark.c rename to contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index aabd96a0..890afb8b 100644 --- a/contrib/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -44,12 +44,14 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; exit(error); \ } + /*-************************************* * Constants ***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; -#define MEMMULT 11 -#define NOISELENGTH 32 +#define DEFAULT_CLEVEL 3 +#define DEFAULT_DISPLAYLEVEL 2 + /*-************************************* * Struct @@ -60,57 +62,6 @@ typedef struct { } dictInfo; -/*-************************************* -* Commandline related functions -***************************************/ -static unsigned readU32FromChar(const char** stringPtr){ - const char errorMsg[] = "error: numeric value too large"; - unsigned result = 0; - while ((**stringPtr >='0') && (**stringPtr <='9')) { - unsigned const max = (((unsigned)(-1)) / 10) - 1; - if (result > max) exit(1); - result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; - } - if ((**stringPtr=='K') || (**stringPtr=='M')) { - unsigned const maxK = ((unsigned)(-1)) >> 10; - if (result > maxK) exit(1); - result <<= 10; - if (**stringPtr=='M') { - if (result > maxK) exit(1); - result <<= 10; - } - (*stringPtr)++; /* skip `K` or `M` */ - if (**stringPtr=='i') (*stringPtr)++; - if (**stringPtr=='B') (*stringPtr)++; - } - return result; -} - -/** longCommandWArg() : - * check if *stringPtr is the same as longCommand. - * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. - * @return 0 and doesn't modify *stringPtr otherwise. 
- */ -static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ - size_t const comSize = strlen(longCommand); - int const result = !strncmp(*stringPtr, longCommand, comSize); - if (result) *stringPtr += comSize; - return result; -} - -static void fillNoise(void* buffer, size_t length) -{ - unsigned const prime1 = 2654435761U; - unsigned const prime2 = 2246822519U; - unsigned acc = prime1; - size_t p=0;; - - for (p=0; p> 21); - } -} - /*-************************************* * Dictionary related operations ***************************************/ @@ -122,9 +73,9 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, ZDICT_legacy_params_t *legacyParams) { unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel : - coverParams ? coverParams->zParams.notificationLevel : - legacyParams ? legacyParams->zParams.notificationLevel : - 0; /* should never happen */ + coverParams ? coverParams->zParams.notificationLevel : + legacyParams ? 
legacyParams->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ void* const dictBuffer = malloc(maxDictSize); dictInfo* dInfo; @@ -140,21 +91,15 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, }else if(coverParams) { dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, info->samplesSizes, info->nbSamples, coverParams); - } else { - size_t totalSize= 0; - for (int i = 0; i < info->nbSamples; i++) { - totalSize += info->samplesSizes[i]; - } - size_t const maxMem = findMaxMem(totalSize * MEMMULT) / MEMMULT; - size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, totalSize); - fillNoise((char*)(info->srcBuffer) + loadedSize, NOISELENGTH); - dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize, info->srcBuffer, + } else if(legacyParams) { + dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, info->samplesSizes, info->nbSamples, *legacyParams); + } else { + dictSize = 0; } if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ free(dictBuffer); - freeSampleInfo(info); return dInfo; } dInfo = (dictInfo *)malloc(sizeof(dictInfo)); @@ -173,6 +118,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev /* Local variables */ size_t totalCompressedSize = 0; size_t totalOriginalSize = 0; + unsigned hasDict = dInfo->dictSize > 0 ? 
1 : 0; double cRatio; size_t dstCapacity; int i; @@ -193,15 +139,6 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev dst = malloc(dstCapacity); } - /* Create the cctx and cdict */ - cctx = ZSTD_createCCtx(); - cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); - - if(!cctx || !cdict || !dst) { - cRatio = -1; - goto _cleanup; - } - /* Calculate offset for each sample */ offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t)); offsets[0] = 0; @@ -209,13 +146,35 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1]; } + /* Create the cctx */ + cctx = ZSTD_createCCtx(); + if(!cctx || !dst) { + cRatio = -1; + goto _nodictCleanup; + } + + /* Create CDict if there's a dictionary stored on buffer */ + if (hasDict) { + cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); + if(!cdict) { + cRatio = -1; + goto _dictCleanup; + } + } + /* Compress each sample and sum their sizes*/ const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer; for (i = 0; i < srcInfo->nbSamples; i++) { - const size_t compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + size_t compressedSize; + if(hasDict) { + compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + } else { + compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel); + } if (ZSTD_isError(compressedSize)) { cRatio = -1; - goto _cleanup; + if(hasDict) goto _dictCleanup; + else goto _nodictCleanup; } totalCompressedSize += compressedSize; } @@ -230,15 +189,14 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); cRatio = 
(double)totalOriginalSize/(double)totalCompressedSize; -_cleanup: - if(dst) { - free(dst); - } - if(offsets) { - free(offsets); - } - ZSTD_freeCCtx(cctx); +_dictCleanup: ZSTD_freeCDict(cdict); + +_nodictCleanup: + free(dst); + free(offsets); + ZSTD_freeCCtx(cctx); + return cRatio; } @@ -257,102 +215,48 @@ void freeDictInfo(dictInfo* info) { /*-******************************************************** * Benchmarking functions **********************************************************/ -/** benchmarkRandom() : - * Measure how long random dictionary builder takes and compression ratio with the random dictionary +/** benchmarkDictBuilder() : + * Measure how long a dictionary builder takes and compression ratio with the dictionary built * @return 0 if benchmark successfully, 1 otherwise */ -int benchmarkRandom(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam) { - const int displayLevel = randomParam->zParams.notificationLevel; +int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam, + ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) { + /* Local variables */ + const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel : + coverParam ? coverParam->zParams.notificationLevel : + legacyParam ? legacyParam->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ + const char* name = randomParam ? "RANDOM" : + coverParam ? "COVER" : + legacyParam ? "LEGACY" : + "NODICT"; /* no dict */ + const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel : + coverParam ? coverParam->zParams.compressionLevel : + legacyParam ? 
legacyParam->zParams.compressionLevel : + DEFAULT_CLEVEL; /* no dict */ int result = 0; - clock_t t; - t = clock(); - dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, NULL, NULL); - t = clock() - t; - double time_taken = ((double)t)/CLOCKS_PER_SEC; + + /* Calculate speed */ + const UTIL_time_t begin = UTIL_getTime(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam); + const U64 timeMicro = UTIL_clockSpanMicro(begin); + const double timeSec = timeMicro / (double)SEC_TO_MICRO; if (!dInfo) { - DISPLAYLEVEL(1, "RANDOM does not train successfully\n"); + DISPLAYLEVEL(1, "%s does not train successfully\n", name); result = 1; goto _cleanup; } - DISPLAYLEVEL(2, "RANDOM took %f seconds to execute \n", time_taken); + DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec); - double cRatio = compressWithDict(srcInfo, dInfo, randomParam->zParams.compressionLevel, displayLevel); + /* Calculate compression ratio */ + double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); if (cRatio < 0) { - DISPLAYLEVEL(1, "Compressing with RANDOM dictionary does not work\n"); - result = 1; - goto _cleanup; - } - DISPLAYLEVEL(2, "Compression ratio with random dictionary is %f\n", cRatio); - - -_cleanup: - freeDictInfo(dInfo); - return result; -} - -/** benchmarkCover() : - * Measure how long random dictionary builder takes and compression ratio with the cover dictionary - * @return 0 if benchmark successfully, 1 otherwise - */ -int benchmarkCover(sampleInfo *srcInfo, unsigned maxDictSize, - ZDICT_cover_params_t *coverParam) { - const int displayLevel = coverParam->zParams.notificationLevel; - int result = 0; - clock_t t; - t = clock(); - dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, coverParam, NULL); - t = clock() - t; - double time_taken = ((double)t)/CLOCKS_PER_SEC; - if (!dInfo) { - DISPLAYLEVEL(1, "COVER does not train successfully\n"); - result = 1; - goto _cleanup; - } 
- DISPLAYLEVEL(2, "COVER took %f seconds to execute \n", time_taken); - - double cRatio = compressWithDict(srcInfo, dInfo, coverParam->zParams.compressionLevel, displayLevel); - if (cRatio < 0) { - DISPLAYLEVEL(1, "Compressing with COVER dictionary does not work\n"); - result = 1; - goto _cleanup; - } - DISPLAYLEVEL(2, "Compression ratio with cover dictionary is %f\n", cRatio); - -_cleanup: - freeDictInfo(dInfo); - return result; -} - - - -/** benchmarkLegacy() : - * Measure how long legacy dictionary builder takes and compression ratio with the legacy dictionary - * @return 0 if benchmark successfully, 1 otherwise - */ -int benchmarkLegacy(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_legacy_params_t *legacyParam) { - const int displayLevel = legacyParam->zParams.notificationLevel; - int result = 0; - clock_t t; - t = clock(); - dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, NULL, legacyParam); - t = clock() - t; - double time_taken = ((double)t)/CLOCKS_PER_SEC; - if (!dInfo) { - DISPLAYLEVEL(1, "LEGACY does not train successfully\n"); + DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name); result = 1; goto _cleanup; } - DISPLAYLEVEL(2, "LEGACY took %f seconds to execute \n", time_taken); - - double cRatio = compressWithDict(srcInfo, dInfo, legacyParam->zParams.compressionLevel, displayLevel); - if (cRatio < 0) { - DISPLAYLEVEL(1, "Compressing with LEGACY dictionary does not work\n"); - result = 1; - goto _cleanup; - - } - DISPLAYLEVEL(2, "Compression ratio with legacy dictionary is %f\n", cRatio); + DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio); _cleanup: freeDictInfo(dInfo); @@ -363,15 +267,16 @@ _cleanup: int main(int argCount, const char* argv[]) { - int displayLevel = 2; + const int displayLevel = DEFAULT_DISPLAYLEVEL; const char* programName = argv[0]; int result = 0; + /* Initialize arguments to default values */ - unsigned k = 200; - unsigned d = 6; - unsigned cLevel = 3; - unsigned 
dictID = 0; - unsigned maxDictSize = g_defaultMaxDictSize; + const unsigned k = 200; + const unsigned d = 6; + const unsigned cLevel = DEFAULT_CLEVEL; + const unsigned dictID = 0; + const unsigned maxDictSize = g_defaultMaxDictSize; /* Initialize table to store input files */ const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); @@ -379,7 +284,7 @@ int main(int argCount, const char* argv[]) char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; - int followLinks = 0; + const int followLinks = 0; const char** extendedFileList = NULL; /* Parse arguments */ @@ -394,7 +299,6 @@ int main(int argCount, const char* argv[]) return 1; } - /* Get the list of all files recursively (because followLinks==0)*/ extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); @@ -406,6 +310,7 @@ int main(int argCount, const char* argv[]) filenameIdx = fileNamesNb; } + /* get sampleInfo */ size_t blockSize = 0; sampleInfo* srcInfo= getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, displayLevel); @@ -416,38 +321,53 @@ int main(int argCount, const char* argv[]) zParams.notificationLevel = displayLevel; zParams.dictID = dictID; + /* with no dict */ + { + const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL); + if(noDictResult) { + result = 1; + goto _cleanup; + } + } + /* for random */ - ZDICT_random_params_t randomParam; - randomParam.zParams = zParams; - randomParam.k = k; - int randomResult = benchmarkRandom(srcInfo, maxDictSize, &randomParam); - if(randomResult) { - result = 1; - goto _cleanup; + { + ZDICT_random_params_t randomParam; + randomParam.zParams = zParams; + randomParam.k = k; + const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL); + if(randomResult) { + result = 1; + goto _cleanup; + } } /* for cover */ - ZDICT_cover_params_t coverParam; - memset(&coverParam, 0, sizeof(coverParam)); - 
coverParam.zParams = zParams; - coverParam.splitPoint = 1.0; - coverParam.d = d; - coverParam.steps = 40; - coverParam.nbThreads = 1; - int coverOptResult = benchmarkCover(srcInfo, maxDictSize, &coverParam); - if(coverOptResult) { - result = 1; - goto _cleanup; + { + ZDICT_cover_params_t coverParam; + memset(&coverParam, 0, sizeof(coverParam)); + coverParam.zParams = zParams; + coverParam.splitPoint = 1.0; + coverParam.d = d; + coverParam.steps = 40; + coverParam.nbThreads = 1; + const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL); + if(coverOptResult) { + result = 1; + goto _cleanup; + } } /* for legacy */ - ZDICT_legacy_params_t legacyParam; - legacyParam.zParams = zParams; - legacyParam.selectivityLevel = 9; - int legacyResult = benchmarkLegacy(srcInfo, maxDictSize, &legacyParam); - if(legacyResult) { - result = 1; - goto _cleanup; + { + ZDICT_legacy_params_t legacyParam; + legacyParam.zParams = zParams; + legacyParam.selectivityLevel = 9; + const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam); + if(legacyResult) { + result = 1; + goto _cleanup; + } } /* Free allocated memory */ diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h new file mode 100644 index 00000000..781ec8c2 --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h @@ -0,0 +1,6 @@ +/* ZDICT_trainFromBuffer_legacy() : + * issue : samplesBuffer need to be followed by a noisy guard band. 
+ * work around : duplicate the buffer, and add the noise */ +size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t params); diff --git a/contrib/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh similarity index 54% rename from contrib/benchmarkDictBuilder/test.sh rename to contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh index 6354784e..5eaf5930 100644 --- a/contrib/benchmarkDictBuilder/test.sh +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh @@ -1,2 +1,2 @@ echo "Benchmark with in=../../lib/common" -./benchmark in=../../lib/common +./benchmark in=../../../lib/common diff --git a/contrib/randomDictBuilder/Makefile b/contrib/experimental_dict_builders/randomDictBuilder/Makefile similarity index 79% rename from contrib/randomDictBuilder/Makefile rename to contrib/experimental_dict_builders/randomDictBuilder/Makefile index 5f9240bf..bbd40e47 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/experimental_dict_builders/randomDictBuilder/Makefile @@ -2,9 +2,9 @@ ARG := CC ?= gcc CFLAGS ?= -O3 -INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +INCLUDES := -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder -TEST_INPUT := ../../lib +TEST_INPUT := ../../../lib TEST_OUTPUT := randomDict all: main run clean @@ -30,8 +30,8 @@ io.o: io.c $(CC) $(CFLAGS) $(INCLUDES) -c io.c libzstd.a: - $(MAKE) -C ../../lib libzstd.a - mv ../../lib/libzstd.a . + $(MAKE) -C ../../../lib libzstd.a + mv ../../../lib/libzstd.a . 
.PHONY: testrun testrun: main @@ -48,5 +48,5 @@ testshell: test.sh .PHONY: clean clean: rm -f *.o main libzstd.a - $(MAKE) -C ../../lib clean + $(MAKE) -C ../../../lib clean echo "Cleaning is completed" diff --git a/contrib/randomDictBuilder/README.md b/contrib/experimental_dict_builders/randomDictBuilder/README.md similarity index 85% rename from contrib/randomDictBuilder/README.md rename to contrib/experimental_dict_builders/randomDictBuilder/README.md index 0e70d3dc..da12a428 100644 --- a/contrib/randomDictBuilder/README.md +++ b/contrib/experimental_dict_builders/randomDictBuilder/README.md @@ -16,5 +16,5 @@ To build a random dictionary with the provided arguments: make ARG= followed by ### Examples: -make ARG="in=../../lib/dictBuilder out=dict100 dictID=520" -make ARG="in=../../lib/dictBuilder in=../../lib/compress" +make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520" +make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" diff --git a/contrib/randomDictBuilder/io.c b/contrib/experimental_dict_builders/randomDictBuilder/io.c similarity index 89% rename from contrib/randomDictBuilder/io.c rename to contrib/experimental_dict_builders/randomDictBuilder/io.c index 1217b574..bfe39eae 100644 --- a/contrib/randomDictBuilder/io.c +++ b/contrib/experimental_dict_builders/randomDictBuilder/io.c @@ -53,6 +53,39 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? 
#define NOISELENGTH 32 +/*-************************************* +* Commandline related functions +***************************************/ +unsigned readU32FromChar(const char** stringPtr){ + const char errorMsg[] = "error: numeric value too large"; + unsigned result = 0; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + +unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + /* ******************************************************** * File related operations diff --git a/contrib/randomDictBuilder/io.h b/contrib/experimental_dict_builders/randomDictBuilder/io.h similarity index 78% rename from contrib/randomDictBuilder/io.h rename to contrib/experimental_dict_builders/randomDictBuilder/io.h index e2f454c2..0ee24604 100644 --- a/contrib/randomDictBuilder/io.h +++ b/contrib/experimental_dict_builders/randomDictBuilder/io.h @@ -50,5 +50,11 @@ void freeSampleInfo(sampleInfo *info); void saveDict(const char* dictFileName, const void* buff, size_t buffSize); +unsigned readU32FromChar(const char** stringPtr); -size_t findMaxMem(unsigned long long requiredMem); +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. 
+ * @return 0 and doesn't modify *stringPtr otherwise. + */ +unsigned longCommandWArg(const char** stringPtr, const char* longCommand); diff --git a/contrib/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c similarity index 79% rename from contrib/randomDictBuilder/main.c rename to contrib/experimental_dict_builders/randomDictBuilder/main.c index 4751a9e1..3f3a6ca7 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c @@ -52,46 +52,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB; -/*-************************************* -* Commandline related functions -***************************************/ -static unsigned readU32FromChar(const char** stringPtr){ - const char errorMsg[] = "error: numeric value too large"; - unsigned result = 0; - while ((**stringPtr >='0') && (**stringPtr <='9')) { - unsigned const max = (((unsigned)(-1)) / 10) - 1; - if (result > max) exit(1); - result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; - } - if ((**stringPtr=='K') || (**stringPtr=='M')) { - unsigned const maxK = ((unsigned)(-1)) >> 10; - if (result > maxK) exit(1); - result <<= 10; - if (**stringPtr=='M') { - if (result > maxK) exit(1); - result <<= 10; - } - (*stringPtr)++; /* skip `K` or `M` */ - if (**stringPtr=='i') (*stringPtr)++; - if (**stringPtr=='B') (*stringPtr)++; - } - return result; -} - -/** longCommandWArg() : - * check if *stringPtr is the same as longCommand. - * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. - * @return 0 and doesn't modify *stringPtr otherwise. 
- */ -static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ - size_t const comSize = strlen(longCommand); - int const result = !strncmp(*stringPtr, longCommand, comSize); - if (result) *stringPtr += comSize; - return result; -} - - - /*-************************************* * RANDOM ***************************************/ diff --git a/contrib/randomDictBuilder/random.c b/contrib/experimental_dict_builders/randomDictBuilder/random.c similarity index 100% rename from contrib/randomDictBuilder/random.c rename to contrib/experimental_dict_builders/randomDictBuilder/random.c diff --git a/contrib/randomDictBuilder/random.h b/contrib/experimental_dict_builders/randomDictBuilder/random.h similarity index 100% rename from contrib/randomDictBuilder/random.h rename to contrib/experimental_dict_builders/randomDictBuilder/random.h diff --git a/contrib/randomDictBuilder/test.sh b/contrib/experimental_dict_builders/randomDictBuilder/test.sh similarity index 52% rename from contrib/randomDictBuilder/test.sh rename to contrib/experimental_dict_builders/randomDictBuilder/test.sh index 497820f8..1eb732e5 100644 --- a/contrib/randomDictBuilder/test.sh +++ b/contrib/experimental_dict_builders/randomDictBuilder/test.sh @@ -1,12 +1,12 @@ echo "Building random dictionary with in=../../lib/common k=200 out=dict1" -./main in=../../lib/common k=200 out=dict1 -zstd -be3 -D dict1 -r ../../lib/common -q +./main in=../../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../../lib/common -q echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" -./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 -zstd -be3 -D dict2 -r ../../lib/common -q +./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../../lib/common -q echo "Building random dictionary with 2 sample sources" -./main in=../../lib/common in=../../lib/compress out=dict3 -zstd -be3 -D dict3 -r 
../../lib/common -q +./main in=../../../lib/common in=../../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../../lib/common -q echo "Removing dict1 dict2 dict3" rm -f dict1 dict2 dict3 From b6c5d4982c489b76b4b0e994c680b1e3bd01080b Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 20 Jul 2018 17:41:22 -0700 Subject: [PATCH 21/35] Minor fix --- .../benchmarkDictBuilder/benchmark.c | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 890afb8b..64041964 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -78,7 +78,7 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, DEFAULT_DISPLAYLEVEL; /* no dict */ void* const dictBuffer = malloc(maxDictSize); - dictInfo* dInfo; + dictInfo* dInfo = NULL; /* Checks */ if (!dictBuffer) @@ -118,16 +118,16 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev /* Local variables */ size_t totalCompressedSize = 0; size_t totalOriginalSize = 0; - unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0; + const unsigned hasDict = dInfo->dictSize > 0 ? 
1 : 0; double cRatio; size_t dstCapacity; int i; /* Pointers */ - ZSTD_CCtx* cctx; - ZSTD_CDict *cdict; - size_t *offsets; - void* dst; + ZSTD_CDict *cdict = NULL; + ZSTD_CCtx* cctx = NULL; + size_t *offsets = NULL; + void* dst = NULL; /* Allocate dst with enough space to compress the maximum sized sample */ { @@ -150,7 +150,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev cctx = ZSTD_createCCtx(); if(!cctx || !dst) { cRatio = -1; - goto _nodictCleanup; + goto _cleanup; } /* Create CDict if there's a dictionary stored on buffer */ @@ -158,7 +158,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); if(!cdict) { cRatio = -1; - goto _dictCleanup; + goto _cleanup; } } @@ -173,8 +173,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev } if (ZSTD_isError(compressedSize)) { cRatio = -1; - if(hasDict) goto _dictCleanup; - else goto _nodictCleanup; + goto _cleanup; } totalCompressedSize += compressedSize; } @@ -189,14 +188,11 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); cRatio = (double)totalOriginalSize/(double)totalCompressedSize; -_dictCleanup: - ZSTD_freeCDict(cdict); - -_nodictCleanup: +_cleanup: free(dst); free(offsets); ZSTD_freeCCtx(cctx); - + ZSTD_freeCDict(cdict); return cRatio; } @@ -249,7 +245,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec); /* Calculate compression ratio */ - double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); + const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); if (cRatio < 0) { DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name); result = 1; From 7f3f70f76621f4e488080d27f09614167c7b9a4b Mon 
Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 25 Jul 2018 16:34:07 -0700 Subject: [PATCH 22/35] Add Fast Cover Dictionary Builder --- .../fastCover/Makefile | 54 ++ .../fastCover/README.md | 24 + .../fastCover/fastCover.c | 738 ++++++++++++++++++ .../fastCover/fastCover.h | 47 ++ .../fastCover/main.c | 177 +++++ .../fastCover/test.sh | 14 + 6 files changed, 1054 insertions(+) create mode 100644 contrib/experimental_dict_builders/fastCover/Makefile create mode 100644 contrib/experimental_dict_builders/fastCover/README.md create mode 100644 contrib/experimental_dict_builders/fastCover/fastCover.c create mode 100644 contrib/experimental_dict_builders/fastCover/fastCover.h create mode 100644 contrib/experimental_dict_builders/fastCover/main.c create mode 100644 contrib/experimental_dict_builders/fastCover/test.sh diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile new file mode 100644 index 00000000..9c56013d --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/Makefile @@ -0,0 +1,54 @@ +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../../../programs -I ../randomDictBuilder -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder + +IO_FILE := ../randomDictBuilder/io.c + +TEST_INPUT := ../../../lib +TEST_OUTPUT := fastCoverDict + +all: main run clean + +.PHONY: test +test: main testrun testshell clean + +.PHONY: run +run: + echo "Building a fastCover dictionary with given arguments" + ./main $(ARG) + +main: main.o io.o fastCover.o libzstd.a + $(CC) $(CFLAGS) main.o io.o fastCover.o libzstd.a -o main + +main.o: main.c + $(CC) $(CFLAGS) $(INCLUDES) -c main.c + +fastCover.o: fastCover.c + $(CC) $(CFLAGS) $(INCLUDES) -c fastCover.c + +io.o: $(IO_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) + +libzstd.a: + $(MAKE) -C ../../../lib libzstd.a + mv ../../../lib/libzstd.a . 
+ +.PHONY: testrun +testrun: main + echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) " + ./main in=$(TEST_INPUT) out=$(TEST_OUTPUT) + zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q + rm -f $(TEST_OUTPUT) + +.PHONY: testshell +testshell: test.sh + sh test.sh + echo "Finish running test.sh" + +.PHONY: clean +clean: + rm -f *.o main libzstd.a + $(MAKE) -C ../../../lib clean + echo "Cleaning is completed" diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md new file mode 100644 index 00000000..088e38be --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/README.md @@ -0,0 +1,24 @@ +FastCover Dictionary Builder + +### Permitted Arguments: +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" +Output Dictionary (out=dictName): if not provided, default to fastCoverDict +Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB +Size of Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 +Size of Dmer (d=#): positive number; in bytes; if not provided, default to 8 +Number of steps (steps=#): positive number, if not provided, default to 32 +Percentage of samples used for training(split=#): positive number; if not provided, default to 100 + + +###Running Test: +make test + + +###Usage: +To build a random dictionary with the provided arguments: make ARG= followed by arguments + + +### Examples: +make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520" +make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c new file mode 100644 index 
00000000..6d3ad90a --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -0,0 +1,738 @@ +/*-************************************* +* Dependencies +***************************************/ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "fastCover.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "zdict.h" + + +/*-************************************* +* Constants +***************************************/ +#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define FASTCOVER_MAX_F 32 +#define DEFAULT_SPLITPOINT 1.0 + +/*-************************************* +* Console display +***************************************/ +static int g_displayLevel = 2; +#define DISPLAY(...) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + DISPLAY(__VA_ARGS__); \ + } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) + +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) 
LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; + + +/*-************************************* +* Hash Function +***************************************/ +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + +/** + * Hash the 8-byte value pointed to by p and mod 2^f + */ +static size_t FASTCOVER_hash8PtrToIndex(const void* p, U32 h) { + return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1); +} + + +/*-************************************* +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; + size_t nbDmers; + U32 *freqs; + unsigned d; +} FASTCOVER_ctx_t; + + +/*-************************************* +* Helper functions +***************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t FASTCOVER_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/*-************************************* +* fast functions +***************************************/ +/** + * A segment is a range in the source as well as the score of the segment. + */ +typedef struct { + U32 begin; + U32 end; + U32 score; +} FASTCOVER_segment_t; + + +/** + * Selects the best segment in an epoch. + * Segments of are scored according to the function: + * + * Let F(d) be the frequency of all dmers with hash value d. + * Let S_i be hash value of the dmer at position i of segment S which has length k. + * + * Score(S) = F(S_1) + F(S_2) + ... 
+ F(S_{k-d+1}) + * + * Once the dmer with hash value d is in the dictionay we set F(d) = F(d)/2. + */ +static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, + U32 *freqs, U32 begin,U32 end, + ZDICT_fastCover_params_t parameters) { + /* Constants */ + const U32 k = parameters.k; + const U32 d = parameters.d; + const U32 dmersInK = k - d + 1; + /* Try each segment (activeSegment) and save the best (bestSegment) */ + FASTCOVER_segment_t bestSegment = {0, 0, 0}; + FASTCOVER_segment_t activeSegment; + /* Reset the activeDmers in the segment */ + /* The activeSegment starts at the beginning of the epoch. */ + activeSegment.begin = begin; + activeSegment.end = begin; + activeSegment.score = 0; + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. + */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f); + /* Add frequency of this index to score */ + activeSegment.score += freqs[index]; + /* Increment end of segment */ + activeSegment.end += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f); + /* Subtract frequency of this index from score */ + activeSegment.score -= freqs[delIndex]; + /* Increment start of segment */ + activeSegment.begin += 1; + } + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } + } + { + /* Trim off the zero frequency head and tail from the segment. 
*/ + U32 newBegin = bestSegment.end; + U32 newEnd = bestSegment.begin; + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + U32 freq = freqs[index]; + if (freq != 0) { + newBegin = MIN(newBegin, pos); + newEnd = pos + 1; + } + } + bestSegment.begin = newBegin; + bestSegment.end = newEnd; + } + { + /* Half the frequency of hash value of each dmer covered by the chosen segment. */ + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + freqs[i] = freqs[i]/2; + } + } + return bestSegment; +} + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters, + size_t maxDictSize) { + /* k, d, and f are required parameters */ + if (parameters.d == 0 || parameters.k == 0 || parameters.f == 0) { + return 0; + } + /* 0 < f <= FASTCOVER_MAX_F */ + if (parameters.f > FASTCOVER_MAX_F) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + /* d <= k */ + if (parameters.d > parameters.k) { + return 0; + } + /* 0 < splitPoint <= 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `FASTCOVER_ctx_init()`. 
+ */ +static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->freqs) { + free(ctx->freqs); + ctx->freqs = NULL; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + +/** + * Calculate for frequency of hash value of each dmer in ctx->samples + */ +static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ + /* inCurrSample keeps track of this hash value has already be seen in previous dmers in the same sample*/ + size_t* inCurrSample = (size_t *)malloc((1<nbTrainSamples; i++) { + memset(inCurrSample, 0, (1 << f)); /* Reset inCurrSample for each sample */ + size_t currSampleStart = ctx->offsets[i]; + size_t currSampleEnd = ctx->offsets[i+1]; + start = currSampleStart; + while (start + f < currSampleEnd) { + size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f); + /* if no dmer with same hash value has been seen in current sample */ + if (inCurrSample[dmerIndex] == 0) { + inCurrSample[dmerIndex]++; + freqs[dmerIndex]++; + } + start++; + } + } + free(inCurrSample); +} + +/** + * Prepare a context for dictionary building. + * The context is only dependent on the parameter `d` and can used multiple + * times. + * Returns 1 on success or zero on error. + * The context must be destroyed with `FASTCOVER_ctx_destroy()`. + */ +static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + unsigned d, double splitPoint, unsigned f) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = FASTCOVER_sum(samplesSizes, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? 
FASTCOVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; + /* Checks */ + if (totalSamplesSize < MAX(d, sizeof(U64)) || + totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); + return 0; + } + /* Check if there's testing sample */ + if (nbTestSamples < 1) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); + return 0; + } + /* Zero the context */ + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (U32)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (U32)testSamplesSize); + + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; + ctx->nbDmers = trainingSamplesSize - d + 1; + ctx->d = d; + + /* The offsets of each file */ + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + if (!ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n"); + FASTCOVER_ctx_destroy(ctx); + return 0; + } + + /* Fill offsets from the samplesSizes */ + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + + /* Initialize frequency array of size 2^f */ + ctx->freqs =(U32 *)malloc((1 << f) * sizeof(U32)); + memset(ctx->freqs, 0, (1 << f) * sizeof(U32)); + + DISPLAYLEVEL(2, "Computing frequencies\n"); + 
FASTCOVER_getFrequency(ctx->freqs, f, ctx); + + return 1; +} + + +/** + * Given the prepared context build the dictionary. + */ +static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs, + void *dictBuffer, + size_t dictBufferCapacity, + ZDICT_fastCover_params_t parameters){ + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + /* Divide the data up into epochs of equal size. + * We will select at least one segment from each epoch. + */ + const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k)); + const U32 epochSize = (U32)(ctx->nbDmers / epochs); + size_t epoch; + DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, + epochSize); + /* Loop through the epochs until there are no more segments or the dictionary + * is full. + */ + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { + const U32 epochBegin = (U32)(epoch * epochSize); + const U32 epochEnd = epochBegin + epochSize; + size_t segmentSize; + /* Select a segment */ + FASTCOVER_segment_t segment = FASTCOVER_selectSegment( + ctx, freqs, epochBegin, epochEnd, parameters); + + /* If the segment covers no dmers, then we are out of content */ + if (segment.score == 0) { + break; + } + + /* Trim the segment if necessary and if it is too small then we are done */ + segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); + if (segmentSize < parameters.d) { + break; + } + + /* We fill the dictionary from the back to allow the best segments to be + * referenced with the smallest offsets. + */ + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + return tail; +} + + +/** + * FASTCOVER_best_t is used for two purposes: + * 1. Synchronizing threads. + * 2. Saving the best parameters and dictionary. 
+ * + * All of the methods except FASTCOVER_best_init() are thread safe if zstd is + * compiled with multithreaded support. + */ +typedef struct fast_best_s { + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; + size_t liveJobs; + void *dict; + size_t dictSize; + ZDICT_fastCover_params_t parameters; + size_t compressedSize; +} FASTCOVER_best_t; + +/** + * Initialize the `FASTCOVER_best_t`. + */ +static void FASTCOVER_best_init(FASTCOVER_best_t *best) { + if (best==NULL) return; /* compatible with init on NULL */ + (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); + (void)ZSTD_pthread_cond_init(&best->cond, NULL); + best->liveJobs = 0; + best->dict = NULL; + best->dictSize = 0; + best->compressedSize = (size_t)-1; + memset(&best->parameters, 0, sizeof(best->parameters)); +} + +/** + * Wait until liveJobs == 0. + */ +static void FASTCOVER_best_wait(FASTCOVER_best_t *best) { + if (!best) { + return; + } + ZSTD_pthread_mutex_lock(&best->mutex); + while (best->liveJobs != 0) { + ZSTD_pthread_cond_wait(&best->cond, &best->mutex); + } + ZSTD_pthread_mutex_unlock(&best->mutex); +} + +/** + * Call FASTCOVER_best_wait() and then destroy the FASTCOVER_best_t. + */ +static void FASTCOVER_best_destroy(FASTCOVER_best_t *best) { + if (!best) { + return; + } + FASTCOVER_best_wait(best); + if (best->dict) { + free(best->dict); + } + ZSTD_pthread_mutex_destroy(&best->mutex); + ZSTD_pthread_cond_destroy(&best->cond); +} + +/** + * Called when a thread is about to be launched. + * Increments liveJobs. + */ +static void FASTCOVER_best_start(FASTCOVER_best_t *best) { + if (!best) { + return; + } + ZSTD_pthread_mutex_lock(&best->mutex); + ++best->liveJobs; + ZSTD_pthread_mutex_unlock(&best->mutex); +} + +/** + * Called when a thread finishes executing, both on error or success. + * Decrements liveJobs and signals any waiting threads if liveJobs == 0. + * If this dictionary is the best so far save it and its parameters. 
+ */ +static void FASTCOVER_best_finish(FASTCOVER_best_t *best, size_t compressedSize, + ZDICT_fastCover_params_t parameters, void *dict, + size_t dictSize) { + if (!best) { + return; + } + { + size_t liveJobs; + ZSTD_pthread_mutex_lock(&best->mutex); + --best->liveJobs; + liveJobs = best->liveJobs; + /* If the new dictionary is better */ + if (compressedSize < best->compressedSize) { + /* Allocate space if necessary */ + if (!best->dict || best->dictSize < dictSize) { + if (best->dict) { + free(best->dict); + } + best->dict = malloc(dictSize); + if (!best->dict) { + best->compressedSize = ERROR(GENERIC); + best->dictSize = 0; + return; + } + } + /* Save the dictionary, parameters, and size */ + memcpy(best->dict, dict, dictSize); + best->dictSize = dictSize; + best->parameters = parameters; + best->compressedSize = compressedSize; + } + ZSTD_pthread_mutex_unlock(&best->mutex); + if (liveJobs == 0) { + ZSTD_pthread_cond_broadcast(&best->cond); + } + } +} + +/** + * Parameters for FASTCOVER_tryParameters(). + */ +typedef struct FASTCOVER_tryParameters_data_s { + const FASTCOVER_ctx_t *ctx; + FASTCOVER_best_t *best; + size_t dictBufferCapacity; + ZDICT_fastCover_params_t parameters; +} FASTCOVER_tryParameters_data_t; + +/** + * Tries a set of parameters and updates the FASTCOVER_best_t with the results. + * This function is thread safe if zstd is compiled with multithreaded support. + * It takes its parameters as an *OWNING* opaque pointer to support threading. 
+ */ +static void FASTCOVER_tryParameters(void *opaque) { + /* Save parameters as local variables */ + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque; + const FASTCOVER_ctx_t *const ctx = data->ctx; + const ZDICT_fastCover_params_t parameters = data->parameters; + size_t dictBufferCapacity = data->dictBufferCapacity; + size_t totalCompressedSize = ERROR(GENERIC); + /* Allocate space for hash table, dict, and freqs */ + BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + U32 *freqs = (U32*) malloc((1 << parameters.f) * sizeof(U32)); + if (!dict || !freqs) { + DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); + goto _cleanup; + } + /* Copy the frequencies because we need to modify them */ + memcpy(freqs, ctx->freqs, (1 << parameters.f) * sizeof(U32)); + /* Build the dictionary */ + { + const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, + dictBufferCapacity, parameters); + + dictBufferCapacity = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, + parameters.zParams); + if (ZDICT_isError(dictBufferCapacity)) { + DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + goto _cleanup; + } + } + /* Check total compressed size */ + { + /* Pointers */ + ZSTD_CCtx *cctx; + ZSTD_CDict *cdict; + void *dst; + /* Local variables */ + size_t dstCapacity; + size_t i; + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + i = parameters.splitPoint < 1.0 ? 
ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { + maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + /* Create the cctx and cdict */ + cctx = ZSTD_createCCtx(); + cdict = ZSTD_createCDict(dict, dictBufferCapacity, + parameters.zParams.compressionLevel); + if (!dst || !cctx || !cdict) { + goto _compressCleanup; + } + /* Compress each sample and sum their sizes (or error) */ + totalCompressedSize = dictBufferCapacity; + i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { + const size_t size = ZSTD_compress_usingCDict( + cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], + ctx->samplesSizes[i], cdict); + if (ZSTD_isError(size)) { + totalCompressedSize = ERROR(GENERIC); + goto _compressCleanup; + } + totalCompressedSize += size; + } + _compressCleanup: + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + if (dst) { + free(dst); + } + } + +_cleanup: + FASTCOVER_best_finish(data->best, totalCompressedSize, parameters, dict, + dictBufferCapacity); + free(data); + if (dict) { + free(dict); + } + if (freqs) { + free(freqs); + } +} + +ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t *parameters) { + /* constants */ + const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + const unsigned kMinD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 
40 : parameters->steps; + const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); + const unsigned kIterations = + (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + const unsigned f = parameters->f == 0 ? 23 : parameters->f; + + /* Local variables */ + const int displayLevel = parameters->zParams.notificationLevel; + unsigned iteration = 1; + unsigned d; + unsigned k; + FASTCOVER_best_t best; + POOL_ctx *pool = NULL; + + /* Checks */ + if (splitPoint <= 0 || splitPoint > 1) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + return ERROR(GENERIC); + } + if (kMinK < kMaxD || kMaxK < kMinK) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "fast must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + if (nbThreads > 1) { + pool = POOL_create(nbThreads, 1); + if (!pool) { + return ERROR(memory_allocation); + } + } + /* Initialization */ + FASTCOVER_best_init(&best); + /* Turn down global display level to clean up display at level 2 and below */ + g_displayLevel = displayLevel == 0 ? 
0 : displayLevel - 1; + /* Loop through d first because each new value needs a new context */ + LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", + kIterations); + for (d = kMinD; d <= kMaxD; d += 2) { + /* Initialize the context for this value of d */ + FASTCOVER_ctx_t ctx; + LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return ERROR(GENERIC); + } + /* Loop through k reusing the same context */ + for (k = kMinK; k <= kMaxK; k += kStepSize) { + /* Prepare the arguments */ + FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc( + sizeof(FASTCOVER_tryParameters_data_t)); + LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k); + if (!data) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n"); + FASTCOVER_best_destroy(&best); + FASTCOVER_ctx_destroy(&ctx); + POOL_free(pool); + return ERROR(GENERIC); + } + data->ctx = &ctx; + data->best = &best; + data->dictBufferCapacity = dictBufferCapacity; + data->parameters = *parameters; + data->parameters.k = k; + data->parameters.d = d; + data->parameters.f = f; + data->parameters.splitPoint = splitPoint; + data->parameters.steps = kSteps; + data->parameters.zParams.notificationLevel = g_displayLevel; + /* Check the parameters */ + if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "fastCover parameters incorrect\n"); + free(data); + continue; + } + /* Call the function and pass ownership of data to it */ + FASTCOVER_best_start(&best); + if (pool) { + POOL_add(pool, &FASTCOVER_tryParameters, data); + } else { + FASTCOVER_tryParameters(data); + } + /* Print status */ + LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ", + (U32)((iteration * 100) / kIterations)); + ++iteration; + } + 
FASTCOVER_best_wait(&best); + FASTCOVER_ctx_destroy(&ctx); + } + LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", ""); + /* Fill the output buffer and parameters with output of the best parameters */ + { + const size_t dictSize = best.dictSize; + if (ZSTD_isError(best.compressedSize)) { + const size_t compressedSize = best.compressedSize; + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return compressedSize; + } + *parameters = best.parameters; + memcpy(dictBuffer, best.dict, dictSize); + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return dictSize; + } + +} diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.h b/contrib/experimental_dict_builders/fastCover/fastCover.h new file mode 100644 index 00000000..eca04baa --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/fastCover.h @@ -0,0 +1,47 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + + + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ + unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ + unsigned f; /* log of size of frequency array */ + unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ + unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and 
testing */ + ZDICT_params_t zParams; +} ZDICT_fastCover_params_t; + + + +/*! ZDICT_optimizeTrainFromBuffer_fastCover(): + * Train a dictionary from an array of samples using a modified version of the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * All of the parameters except for f are optional. + * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. + * if steps is zero it defaults to its default value. + * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048]. + * + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * On success `*parameters` contains the parameters selected. + */ +ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t *parameters); diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c new file mode 100644 index 00000000..260eeb28 --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/main.c @@ -0,0 +1,177 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "fastCover.h" +#include "io.h" +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) 
if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 + + +/*-************************************* +* FASTCOVER +***************************************/ +int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info, + unsigned maxDictSize, + ZDICT_fastCover_params_t *params) { + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, params); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100)); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _done; + } + /* save dict */ 
+ DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_done: + free(dictBuffer); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + /* Initialize arguments to default values */ + unsigned k = 200; + unsigned d = 8; + unsigned f = 23; + unsigned steps = 32; + unsigned nbThreads = 1; + unsigned split = 100; + const char* outputFile = "fastCoverDict"; + unsigned dictID = 0; + unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; /* follow directory recursively */ + const char** extendedFileList = NULL; + + /* Parse arguments */ + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "d=")) { d = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "f=")) { f = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "steps=")) { steps = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "split=")) { split = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = argument; + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return 
operationResult; + } + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u Date: Wed, 25 Jul 2018 16:54:08 -0700 Subject: [PATCH 23/35] Make hash value const --- .../experimental_dict_builders/fastCover/fastCover.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index 6d3ad90a..32a15a4b 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -138,7 +138,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, */ while (activeSegment.end < end) { /* Get hash value of current dmer */ - size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f); + const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f); /* Add frequency of this index to score */ activeSegment.score += freqs[index]; /* Increment end of segment */ @@ -146,7 +146,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* If the window is now too large, drop the first position */ if (activeSegment.end - activeSegment.begin == dmersInK + 1) { /* Get hash value of the dmer to be eliminated from active segment */ - size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f); + const size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f); /* Subtract frequency of this index from score */ activeSegment.score -= freqs[delIndex]; /* Increment start of segment */ @@ -163,7 +163,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 newEnd = bestSegment.begin; U32 pos; for (pos = 
bestSegment.begin; pos != bestSegment.end; ++pos) { - size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); U32 freq = freqs[index]; if (freq != 0) { newBegin = MIN(newBegin, pos); @@ -177,7 +177,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* Half the frequency of hash value of each dmer covered by the chosen segment. */ U32 pos; for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { - size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + const size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); freqs[i] = freqs[i]/2; } } @@ -244,7 +244,7 @@ static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx) size_t currSampleEnd = ctx->offsets[i+1]; start = currSampleStart; while (start + f < currSampleEnd) { - size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f); + const size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f); /* if no dmer with same hash value has been seen in current sample */ if (inCurrSample[dmerIndex] == 0) { inCurrSample[dmerIndex]++; From d1fc507ef998f511f6f1da7edc57670bb6b3404f Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 25 Jul 2018 17:05:54 -0700 Subject: [PATCH 24/35] Initial benchmarking result for fastCover --- .../benchmarkDictBuilder/Makefile | 10 +++-- .../benchmarkDictBuilder/README.md | 40 ++++++++++-------- .../benchmarkDictBuilder/benchmark.c | 42 +++++++++++++++---- 3 files changed, 62 insertions(+), 30 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile index 72ce04f2..68149488 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile @@ -2,9 +2,10 @@ ARG := CC ?= gcc CFLAGS ?= -O3 
-INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder +INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder RANDOM_FILE := ../randomDictBuilder/random.c +FAST_FILE := ../fastCover/fastCover.c IO_FILE := ../randomDictBuilder/io.c all: run clean @@ -21,8 +22,8 @@ test: benchmarkTest clean benchmarkTest: benchmark test.sh sh test.sh -benchmark: benchmark.o io.o random.o libzstd.a - $(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark +benchmark: benchmark.o io.o random.o fastCover.o libzstd.a + $(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark benchmark.o: benchmark.c $(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c @@ -30,6 +31,9 @@ benchmark.o: benchmark.c random.o: $(RANDOM_FILE) $(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE) +fastCover.o: $(FAST_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE) + io.o: $(IO_FILE) $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index de783a0e..e02d592c 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -18,30 +18,34 @@ github: | Algorithm | Speed(sec) | Compression Ratio | | ------------- |:-------------:| ------------------:| | nodict | 0.000004 | 2.999642 | -| random | 0.180238 | 8.786957 | -| cover | 33.891987 | 10.430999 | -| legacy | 1.077569 | 8.989482 | +| random | 0.135459 | 8.786957 | +| cover | 50.341079 | 10.641263 | +| legacy | 0.866283 | 8.989482 | +| fastCover | 13.450947 | 10.215174 | hg-commands | Algorithm | Speed(sec) | Compression Ratio | | ------------- |:-------------:| ------------------:| -| nodict | 0.000006 | 2.425291 | -| random | 0.088735 | 3.489515 | -| cover | 35.447300 | 
4.030274 | -| legacy | 1.048509 | 3.911896 | +| nodict | 0.000020 | 2.425291 | +| random | 0.088828 | 3.489515 | +| cover | 60.028672 | 4.131136 | +| legacy | 0.852481 | 3.911896 | +| fastCover | 9.524284 | 3.977229 | + +hg-changelog +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000004 | 1.377613 | +| random | 0.621812 | 2.096785 | +| cover | 217.510962 | 2.188654 | +| legacy | 2.559194 | 2.058273 | +| fastCover | 51.132516 | 2.124185 | hg-manifest | Algorithm | Speed(sec) | Compression Ratio | | ------------- |:-------------:| ------------------:| | nodict | 0.000005 | 1.866385 | -| random | 1.148231 | 2.309485 | -| cover | 509.685257 | 2.575331 | -| legacy | 10.705866 | 2.506775 | - -hg-changelog -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| nodict | 0.000005 | 1.377613 | -| random | 0.706434 | 2.096785 | -| cover | 122.815783 | 2.175706 | -| legacy | 3.010318 | 2.058273 | +| random | 1.035220 | 2.309485 | +| cover | 930.480173 | 2.582597 | +| legacy | 8.916513 | 2.506775 | +| fastCover | 116.871089 | 2.525689 | diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 64041964..865ecb34 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -5,6 +5,7 @@ #include #include #include "random.h" +#include "fastCover.h" #include "dictBuilder.h" #include "zstd_internal.h" /* includes zstd.h */ #include "io.h" @@ -71,10 +72,11 @@ typedef struct { */ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, - ZDICT_legacy_params_t *legacyParams) { + ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) { unsigned const displayLevel = 
randomParams ? randomParams->zParams.notificationLevel : coverParams ? coverParams->zParams.notificationLevel : legacyParams ? legacyParams->zParams.notificationLevel : + fastParams ? fastParams->zParams.notificationLevel : DEFAULT_DISPLAYLEVEL; /* no dict */ void* const dictBuffer = malloc(maxDictSize); @@ -94,6 +96,9 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, } else if(legacyParams) { dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, info->samplesSizes, info->nbSamples, *legacyParams); + } else if(fastParams) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, fastParams); } else { dictSize = 0; } @@ -216,25 +221,29 @@ void freeDictInfo(dictInfo* info) { * @return 0 if benchmark successfully, 1 otherwise */ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam, - ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) { + ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam, + ZDICT_fastCover_params_t *fastParam) { /* Local variables */ const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel : coverParam ? coverParam->zParams.notificationLevel : legacyParam ? legacyParam->zParams.notificationLevel : + fastParam ? fastParam->zParams.notificationLevel: DEFAULT_DISPLAYLEVEL; /* no dict */ const char* name = randomParam ? "RANDOM" : coverParam ? "COVER" : legacyParam ? "LEGACY" : + fastParam ? "FAST": "NODICT"; /* no dict */ const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel : coverParam ? coverParam->zParams.compressionLevel : legacyParam ? legacyParam->zParams.compressionLevel : + fastParam ? 
fastParam->zParams.compressionLevel: DEFAULT_CLEVEL; /* no dict */ int result = 0; /* Calculate speed */ const UTIL_time_t begin = UTIL_getTime(); - dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam); const U64 timeMicro = UTIL_clockSpanMicro(begin); const double timeSec = timeMicro / (double)SEC_TO_MICRO; if (!dInfo) { @@ -269,7 +278,6 @@ int main(int argCount, const char* argv[]) /* Initialize arguments to default values */ const unsigned k = 200; - const unsigned d = 6; const unsigned cLevel = DEFAULT_CLEVEL; const unsigned dictID = 0; const unsigned maxDictSize = g_defaultMaxDictSize; @@ -319,7 +327,7 @@ int main(int argCount, const char* argv[]) /* with no dict */ { - const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL); + const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, NULL); if(noDictResult) { result = 1; goto _cleanup; @@ -331,7 +339,7 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t randomParam; randomParam.zParams = zParams; randomParam.k = k; - const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL); + const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL); if(randomResult) { result = 1; goto _cleanup; @@ -344,10 +352,9 @@ int main(int argCount, const char* argv[]) memset(&coverParam, 0, sizeof(coverParam)); coverParam.zParams = zParams; coverParam.splitPoint = 1.0; - coverParam.d = d; coverParam.steps = 40; coverParam.nbThreads = 1; - const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL); + const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL); if(coverOptResult) { result = 1; goto _cleanup; @@ -359,13 +366,30 @@ int main(int argCount, const 
char* argv[]) ZDICT_legacy_params_t legacyParam; legacyParam.zParams = zParams; legacyParam.selectivityLevel = 9; - const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam); + const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL); if(legacyResult) { result = 1; goto _cleanup; } } + /* for fastCover */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.d = 8; + fastParam.f = 23; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + } + /* Free allocated memory */ _cleanup: UTIL_freeFileList(extendedFileList, fileNamesBuf); From 1e85f314d859c5295f88c98fcd0dc9fa03f68b12 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 25 Jul 2018 17:53:38 -0700 Subject: [PATCH 25/35] Benchmark fast cover optimize vs k=200 --- .../benchmarkDictBuilder/README.md | 60 ++++++++++--------- .../benchmarkDictBuilder/benchmark.c | 22 ++++++- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index e02d592c..478d8793 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -15,37 +15,41 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: github: -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| nodict | 0.000004 | 2.999642 | -| random | 0.135459 | 8.786957 | -| cover | 50.341079 | 10.641263 | -| legacy | 0.866283 | 8.989482 | -| fastCover | 13.450947 | 10.215174 | +| Algorithm | Speed(sec) | Compression Ratio | 
+| ------------------|:-------------:| ------------------:| +| nodict | 0.000004 | 2.999642 | +| random | 0.148247 | 8.786957 | +| cover | 56.331553 | 10.641263 | +| legacy | 0.917595 | 8.989482 | +| fastCover(opt) | 13.169979 | 10.215174 | +| fastCover(k=200) | 2.692406 | 8.657219 | hg-commands -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| nodict | 0.000020 | 2.425291 | -| random | 0.088828 | 3.489515 | -| cover | 60.028672 | 4.131136 | -| legacy | 0.852481 | 3.911896 | -| fastCover | 9.524284 | 3.977229 | +| Algorithm | Speed(sec) | Compression Ratio | +| ----------------- |:-------------:| ------------------:| +| nodict | 0.000007 | 2.425291 | +| random | 0.093990 | 3.489515 | +| cover | 58.602385 | 4.131136 | +| legacy | 0.865683 | 3.911896 | +| fastCover(opt) | 9.404134 | 3.977229 | +| fastCover(k=200) | 1.037434 | 3.810326 | hg-changelog -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| nodict | 0.000004 | 1.377613 | -| random | 0.621812 | 2.096785 | -| cover | 217.510962 | 2.188654 | -| legacy | 2.559194 | 2.058273 | -| fastCover | 51.132516 | 2.124185 | +| Algorithm | Speed(sec) | Compression Ratio | +| ----------------- |:-------------:| ------------------:| +| nodict | 0.000022 | 1.377613 | +| random | 0.551539 | 2.096785 | +| cover | 221.370056 | 2.188654 | +| legacy | 2.405923 | 2.058273 | +| fastCover(opt) | 49.526246 | 2.124185 | +| fastCover(k=200) | 9.746872 | 2.114674 | hg-manifest -| Algorithm | Speed(sec) | Compression Ratio | -| ------------- |:-------------:| ------------------:| -| nodict | 0.000005 | 1.866385 | -| random | 1.035220 | 2.309485 | -| cover | 930.480173 | 2.582597 | -| legacy | 8.916513 | 2.506775 | -| fastCover | 116.871089 | 2.525689 | +| Algorithm | Speed(sec) | Compression Ratio | +| ----------------- |:-------------:| ------------------:| +| nodict | 0.000019 | 1.866385 | +| random | 1.083536 | 2.309485 | +| 
cover | 928.894887 | 2.582597 | +| legacy | 9.110371 | 2.506775 | +| fastCover(opt) | 116.508270 | 2.525689 | +| fastCover(k=200) | 12.176555 | 2.472221 | diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 865ecb34..62135436 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -373,7 +373,8 @@ int main(int argCount, const char* argv[]) } } - /* for fastCover */ + + /* for fastCover (optimizing k) */ { ZDICT_fastCover_params_t fastParam; memset(&fastParam, 0, sizeof(fastParam)); @@ -390,6 +391,25 @@ int main(int argCount, const char* argv[]) } } + /* for fastCover (with k provided) */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.d = 8; + fastParam.f = 23; + fastParam.k = 200; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + } + + /* Free allocated memory */ _cleanup: UTIL_freeFileList(extendedFileList, fileNamesBuf); From 2333ecb173077edaf34f032baadfcc63531928c1 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 25 Jul 2018 18:10:09 -0700 Subject: [PATCH 26/35] Allow d=6 --- .../fastCover/README.md | 2 +- .../fastCover/fastCover.c | 27 +++++++++++++------ .../fastCover/test.sh | 3 ++- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md index 088e38be..66e00ee0 100644 --- a/contrib/experimental_dict_builders/fastCover/README.md +++ b/contrib/experimental_dict_builders/fastCover/README.md @@ -6,7 +6,7 @@ Output Dictionary (out=dictName): if not provided, default 
to fastCoverDict Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB Size of Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 -Size of Dmer (d=#): positive number; in bytes; if not provided, default to 8 +Size of Dmer (d=#): either 6 or 8; if not provided, default to 8 Number of steps (steps=#): positive number, if not provided, default to 32 Percentage of samples used for training(split=#): positive number; if not provided, default to 100 diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index 32a15a4b..abd592cd 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -50,14 +50,21 @@ static clock_t g_time = 0; /*-************************************* * Hash Function ***************************************/ +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } /** - * Hash the 8-byte value pointed to by p and mod 2^f + * Hash the d-byte value pointed to by p and mod 2^f */ -static size_t FASTCOVER_hash8PtrToIndex(const void* p, U32 h) { +static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) { + if (d == 6) { + return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1); + } return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1); } @@ -138,7 +145,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, */ 
while (activeSegment.end < end) { /* Get hash value of current dmer */ - const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.end, parameters.f); + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); /* Add frequency of this index to score */ activeSegment.score += freqs[index]; /* Increment end of segment */ @@ -146,7 +153,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* If the window is now too large, drop the first position */ if (activeSegment.end - activeSegment.begin == dmersInK + 1) { /* Get hash value of the dmer to be eliminated from active segment */ - const size_t delIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + activeSegment.begin, parameters.f); + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); /* Subtract frequency of this index from score */ activeSegment.score -= freqs[delIndex]; /* Increment start of segment */ @@ -163,7 +170,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 newEnd = bestSegment.begin; U32 pos; for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { - const size_t index = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); U32 freq = freqs[index]; if (freq != 0) { newBegin = MIN(newBegin, pos); @@ -177,7 +184,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* Half the frequency of hash value of each dmer covered by the chosen segment. 
*/ U32 pos; for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { - const size_t i = FASTCOVER_hash8PtrToIndex(ctx->samples + pos, parameters.f); + const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); freqs[i] = freqs[i]/2; } } @@ -194,6 +201,10 @@ static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters, if (parameters.d == 0 || parameters.k == 0 || parameters.f == 0) { return 0; } + /* d has to be 6 or 8 */ + if (parameters.d != 6 && parameters.d != 8) { + return 0; + } /* 0 < f <= FASTCOVER_MAX_F */ if (parameters.f > FASTCOVER_MAX_F) { return 0; @@ -244,7 +255,7 @@ static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx) size_t currSampleEnd = ctx->offsets[i+1]; start = currSampleStart; while (start + f < currSampleEnd) { - const size_t dmerIndex = FASTCOVER_hash8PtrToIndex(ctx->samples + start, f); + const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d); /* if no dmer with same hash value has been seen in current sample */ if (inCurrSample[dmerIndex] == 0) { inCurrSample[dmerIndex]++; @@ -615,7 +626,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( const unsigned nbThreads = parameters->nbThreads; const double splitPoint = parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; - const unsigned kMinD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; const unsigned kMaxK = parameters->k == 0 ? 
2000 : parameters->k; diff --git a/contrib/experimental_dict_builders/fastCover/test.sh b/contrib/experimental_dict_builders/fastCover/test.sh index b5570fef..91d4f492 100644 --- a/contrib/experimental_dict_builders/fastCover/test.sh +++ b/contrib/experimental_dict_builders/fastCover/test.sh @@ -11,4 +11,5 @@ echo "Removing dict1 dict2 dict3" rm -f dict1 dict2 dict3 echo "Testing with invalid parameters, should fail" -! ./main r=10 +! ./main in=../../../lib/common r=10 +! ./main in=../../../lib/common d=10 From 3b163e0b5b5f9eec427b87001483c3b627c95a8f Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 26 Jul 2018 13:53:13 -0700 Subject: [PATCH 27/35] Add array to keep track of frequency within active segment, fix malloc bug, update benchmarking result --- .../benchmarkDictBuilder/README.md | 60 ++++++++-------- .../fastCover/fastCover.c | 69 +++++++++++-------- .../fastCover/main.c | 2 +- .../randomDictBuilder/main.c | 2 +- 4 files changed, 75 insertions(+), 58 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 478d8793..07d65b08 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -14,42 +14,46 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: +d=8 +f=23 +freq[i] = 0 when dmer added to best segment + github: | Algorithm | Speed(sec) | Compression Ratio | -| ------------------|:-------------:| ------------------:| -| nodict | 0.000004 | 2.999642 | -| random | 0.148247 | 8.786957 | -| cover | 56.331553 | 10.641263 | -| legacy | 0.917595 | 8.989482 | -| fastCover(opt) | 13.169979 | 10.215174 | -| fastCover(k=200) | 2.692406 | 8.657219 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000007 | 2.999642 | +| random | 0.150258 | 8.786957 | +| cover | 60.388853 | 10.641263 | +| legacy | 
0.965050 | 8.989482 | +| fastCover(opt) | 84.968131 | 10.614747 | +| fastCover(k=200) | 6.465490 | 9.484150 | hg-commands | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000007 | 2.425291 | -| random | 0.093990 | 3.489515 | -| cover | 58.602385 | 4.131136 | -| legacy | 0.865683 | 3.911896 | -| fastCover(opt) | 9.404134 | 3.977229 | -| fastCover(k=200) | 1.037434 | 3.810326 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000005 | 2.425291 | +| random | 0.084348 | 3.489515 | +| cover | 60.144894 | 4.131136 | +| legacy | 0.831981 | 3.911896 | +| fastCover(opt) | 59.030437 | 4.157595 | +| fastCover(k=200) | 3.702932 | 4.134222 | hg-changelog | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000022 | 1.377613 | -| random | 0.551539 | 2.096785 | -| cover | 221.370056 | 2.188654 | -| legacy | 2.405923 | 2.058273 | -| fastCover(opt) | 49.526246 | 2.124185 | -| fastCover(k=200) | 9.746872 | 2.114674 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000004 | 1.377613 | +| random | 0.555964 | 2.096785 | +| cover | 214.423753 | 2.188654 | +| legacy | 2.180249 | 2.058273 | +| fastCover(opt) | 102.261452 | 2.180347 | +| fastCover(k=200) | 11.81039 | 2.170673 | hg-manifest | Algorithm | Speed(sec) | Compression Ratio | -| ----------------- |:-------------:| ------------------:| -| nodict | 0.000019 | 1.866385 | -| random | 1.083536 | 2.309485 | -| cover | 928.894887 | 2.582597 | -| legacy | 9.110371 | 2.506775 | -| fastCover(opt) | 116.508270 | 2.525689 | -| fastCover(k=200) | 12.176555 | 2.472221 | +| ----------------- | ------------- | ------------------ | +| nodict | 0.000006 | 1.866385 | +| random | 1.063974 | 2.309485 | +| cover | 909.101849 | 2.582597 | +| legacy | 8.706580 | 2.506775 | +| fastCover(opt) | 188.598079 | 2.596761 | +| fastCover(k=200) | 13.392734 | 2.592985 | diff 
--git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index abd592cd..6f990e0c 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -48,7 +48,7 @@ static clock_t g_time = 0; /*-************************************* -* Hash Function +* Hash Functions ***************************************/ static const U64 prime6bytes = 227718039650203ULL; static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } @@ -58,6 +58,7 @@ static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + /** * Hash the d-byte value pointed to by p and mod 2^f */ @@ -140,29 +141,41 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, activeSegment.begin = begin; activeSegment.end = begin; activeSegment.score = 0; - /* Slide the activeSegment through the whole epoch. - * Save the best segment in bestSegment. 
- */ - while (activeSegment.end < end) { - /* Get hash value of current dmer */ - const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); - /* Add frequency of this index to score */ - activeSegment.score += freqs[index]; - /* Increment end of segment */ - activeSegment.end += 1; - /* If the window is now too large, drop the first position */ - if (activeSegment.end - activeSegment.begin == dmersInK + 1) { - /* Get hash value of the dmer to be eliminated from active segment */ - const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); - /* Subtract frequency of this index from score */ - activeSegment.score -= freqs[delIndex]; - /* Increment start of segment */ - activeSegment.begin += 1; - } - /* If this segment is the best so far save it */ - if (activeSegment.score > bestSegment.score) { - bestSegment = activeSegment; + { + /* Keep track of number of times an index has been seen in current segment */ + U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16)); + memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs)); + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. 
+ */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); + /* Add frequency of this index to score if this is the first occurence of index in active segment */ + if (currfreqs[index] == 0) { + activeSegment.score += freqs[index]; + } + currfreqs[index] += 1; + /* Increment end of segment */ + activeSegment.end += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); + currfreqs[delIndex] -= 1; + /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ + if (currfreqs[delIndex] == 0) { + activeSegment.score -= freqs[delIndex]; + } + /* Increment start of segment */ + activeSegment.begin += 1; + } + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } } + free(currfreqs); } { /* Trim off the zero frequency head and tail from the segment. 
*/ @@ -185,7 +198,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 pos; for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); - freqs[i] = freqs[i]/2; + freqs[i] = 0; } } return bestSegment; @@ -245,12 +258,12 @@ static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) { /** * Calculate for frequency of hash value of each dmer in ctx->samples */ -static void FASTCOVER_getFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ +static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ /* inCurrSample keeps track of this hash value has already be seen in previous dmers in the same sample*/ - size_t* inCurrSample = (size_t *)malloc((1<nbTrainSamples; i++) { - memset(inCurrSample, 0, (1 << f)); /* Reset inCurrSample for each sample */ + memset(inCurrSample, 0, (1 << f) * sizeof(*inCurrSample)); /* Reset inCurrSample for each sample */ size_t currSampleStart = ctx->offsets[i]; size_t currSampleEnd = ctx->offsets[i+1]; start = currSampleStart; @@ -338,7 +351,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer, memset(ctx->freqs, 0, (1 << f) * sizeof(U32)); DISPLAYLEVEL(2, "Computing frequencies\n"); - FASTCOVER_getFrequency(ctx->freqs, f, ctx); + FASTCOVER_computeFrequency(ctx->freqs, f, ctx); return 1; } diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c index 260eeb28..f286b050 100644 --- a/contrib/experimental_dict_builders/fastCover/main.c +++ b/contrib/experimental_dict_builders/fastCover/main.c @@ -165,7 +165,7 @@ int main(int argCount, const char* argv[]) params.splitPoint = (double)split/100; /* Build dictionary */ - sampleInfo* info= getSampleInfo(filenameTable, + sampleInfo* info = getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = 
FASTCOVER_trainFromFiles(outputFile, info, maxDictSize, ¶ms); diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c index 3f3a6ca7..3ad88574 100644 --- a/contrib/experimental_dict_builders/randomDictBuilder/main.c +++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c @@ -149,7 +149,7 @@ int main(int argCount, const char* argv[]) params.zParams = zParams; params.k = k; - sampleInfo* info= getSampleInfo(filenameTable, + sampleInfo* info = getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, ¶ms); From 09ccd977c355c07a469a295837397abe28b6fdb2 Mon Sep 17 00:00:00 2001 From: George Lu Date: Thu, 26 Jul 2018 15:17:58 -0700 Subject: [PATCH 28/35] no zero --- programs/bench.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/programs/bench.c b/programs/bench.c index a54168c4..76d1ff6d 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -549,7 +549,8 @@ static BMK_return_t BMK_benchMemAdvancedNoAlloc( double const compressionSpeed = ((double)srcSize / intermediateResultCompress.result.result.nanoSecPerRun) * 1000; int const cSpeedAccuracy = (compressionSpeed < 10.) ? 
2 : 1; results.result.cSpeed = compressionSpeed * 1000000; - results.result.cSize = intermediateResultCompress.result.result.sumOfReturn; + cSize = intermediateResultCompress.result.result.sumOfReturn; + results.result.cSize = cSize; ratio = (double)srcSize / results.result.cSize; markNb = (markNb+1) % NB_MARKS; DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r", From 3d7941ce41d33bbbedb15fa9794c9fbcb1713384 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 26 Jul 2018 16:24:13 -0700 Subject: [PATCH 29/35] Benchmark different f values --- .../benchmarkDictBuilder/README.md | 131 +++++++++++++----- .../benchmarkDictBuilder/benchmark.c | 104 +++++++------- 2 files changed, 152 insertions(+), 83 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 07d65b08..1ee4b19b 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -14,46 +14,107 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: -d=8 -f=23 -freq[i] = 0 when dmer added to best segment +For every f value for fast, the first one is optimize and the second one has k=200 github: -| Algorithm | Speed(sec) | Compression Ratio | -| ----------------- | ------------- | ------------------ | -| nodict | 0.000007 | 2.999642 | -| random | 0.150258 | 8.786957 | -| cover | 60.388853 | 10.641263 | -| legacy | 0.965050 | 8.989482 | -| fastCover(opt) | 84.968131 | 10.614747 | -| fastCover(k=200) | 6.465490 | 9.484150 | +NODICT 0.000023 2.999642 +RANDOM 0.149020 8.786957 +LEGACY 0.854277 8.989482 +FAST15 8.764078 10.609015 +FAST15 0.232610 9.135669 +FAST16 9.597777 10.474574 +FAST16 0.243698 9.346482 +FAST17 9.385449 10.611737 +FAST17 0.268376 9.605798 +FAST18 9.988885 10.626382 +FAST18 0.311769 9.130565 +FAST19 10.737259 10.411729 +FAST19 0.331885 9.271814 +FAST20 
10.479782 10.388895 +FAST20 0.498416 9.194115 +FAST21 21.189883 10.376394 +FAST21 1.098532 9.244456 +FAST22 39.849935 10.432555 +FAST22 2.590561 9.410930 +FAST23 75.832399 10.614747 +FAST23 6.108487 9.484150 +FAST24 139.782714 10.611753 +FAST24 13.029406 9.379030 +COVER 55.118542 10.641263 hg-commands -| Algorithm | Speed(sec) | Compression Ratio | -| ----------------- | ------------- | ------------------ | -| nodict | 0.000005 | 2.425291 | -| random | 0.084348 | 3.489515 | -| cover | 60.144894 | 4.131136 | -| legacy | 0.831981 | 3.911896 | -| fastCover(opt) | 59.030437 | 4.157595 | -| fastCover(k=200) | 3.702932 | 4.134222 | +NODICT 0.000012 2.425291 +RANDOM 0.083071 3.489515 +LEGACY 0.835195 3.911896 +FAST15 0.163980 3.808375 +FAST16 6.373850 4.010783 +FAST16 0.160299 3.966604 +FAST17 6.668799 4.091602 +FAST17 0.172480 4.062773 +FAST18 6.266105 4.130824 +FAST18 0.171554 4.094666 +FAST19 6.869651 4.158180 +FAST19 0.209468 4.111289 +FAST20 8.267766 4.149707 +FAST20 0.331680 4.119873 +FAST21 18.824296 4.171784 +FAST21 0.783961 4.120884 +FAST22 33.321252 4.152035 +FAST22 1.854215 4.126626 +FAST23 60.775388 4.157595 +FAST23 4.040395 4.134222 +FAST24 110.910038 4.163091 +FAST24 8.505828 4.143533 +COVER 61.654796 4.131136 hg-changelog -| Algorithm | Speed(sec) | Compression Ratio | -| ----------------- | ------------- | ------------------ | -| nodict | 0.000004 | 1.377613 | -| random | 0.555964 | 2.096785 | -| cover | 214.423753 | 2.188654 | -| legacy | 2.180249 | 2.058273 | -| fastCover(opt) | 102.261452 | 2.180347 | -| fastCover(k=200) | 11.81039 | 2.170673 | +NODICT 0.000004 1.377613 +RANDOM 0.582067 2.096785 +LEGACY 2.739515 2.058273 +FAST15 35.682665 2.127596 +FAST15 0.931621 2.115299 +FAST16 36.557988 2.141787 +FAST16 1.008155 2.136080 +FAST17 36.272242 2.155332 +FAST17 0.906803 2.154596 +FAST18 35.542043 2.171997 +FAST18 1.063101 2.167723 +FAST19 37.756934 2.180893 +FAST19 1.257291 2.173768 +FAST20 40.273755 2.179442 +FAST20 1.630522 2.170072 +FAST21 54.606548 
2.181400 +FAST21 2.321266 2.171643 +FAST22 72.454066 2.178774 +FAST22 5.092888 2.168885 +FAST23 106.753208 2.180347 +FAST23 14.722222 2.170673 +FAST24 171.083201 2.183426 +FAST24 27.575575 2.170623 +COVER 227.219660 2.188654 hg-manifest -| Algorithm | Speed(sec) | Compression Ratio | -| ----------------- | ------------- | ------------------ | -| nodict | 0.000006 | 1.866385 | -| random | 1.063974 | 2.309485 | -| cover | 909.101849 | 2.582597 | -| legacy | 8.706580 | 2.506775 | -| fastCover(opt) | 188.598079 | 2.596761 | -| fastCover(k=200) | 13.392734 | 2.592985 | +NODICT 0.000007 1.866385 +RANDOM 1.086571 2.309485 +LEGACY 9.567507 2.506775 +FAST15 77.811380 2.380461 +FAST15 1.969718 2.317727 +FAST16 75.789019 2.469144 +FAST16 2.051283 2.375815 +FAST17 79.659040 2.539069 +FAST17 1.995394 2.501047 +FAST18 76.281105 2.578095 +FAST18 2.059272 2.564840 +FAST19 79.395382 2.590433 +FAST19 2.354158 2.591024 +FAST20 87.937568 2.597813 +FAST20 2.922189 2.597104 +FAST21 121.760549 2.598408 +FAST21 4.798981 2.600269 +FAST22 155.878461 2.594560 +FAST22 8.151807 2.601047 +FAST23 194.238003 2.596761 +FAST23 15.160578 2.592985 +FAST24 267.425904 2.597657 +FAST24 29.513286 2.600363 +COVER 930.675322 2.582597 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 62135436..9feaae59 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -340,12 +340,67 @@ int main(int argCount, const char* argv[]) randomParam.zParams = zParams; randomParam.k = k; const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\n", randomParam.k); if(randomResult) { result = 1; goto _cleanup; } } + /* for legacy */ + { + ZDICT_legacy_params_t legacyParam; + legacyParam.zParams = zParams; + legacyParam.selectivityLevel = 9; + const int 
legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL); + DISPLAYLEVEL(2, "selectivityLevel=%u\n", legacyParam.selectivityLevel); + if(legacyResult) { + result = 1; + goto _cleanup; + } + } + + /* for fastCover */ + for (unsigned f = 15; f < 25; f++){ + DISPLAYLEVEL(2, "current f is %u\n", f); + /* for fastCover (optimizing k) */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.d = 8; + fastParam.f = f; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + } + + /* for fastCover (with k provided) */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.d = 8; + fastParam.f = f; + fastParam.k = 200; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + } + } + /* for cover */ { ZDICT_cover_params_t coverParam; @@ -355,60 +410,13 @@ int main(int argCount, const char* argv[]) coverParam.steps = 40; coverParam.nbThreads = 1; const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100)); if(coverOptResult) { result = 1; goto 
_cleanup; } } - /* for legacy */ - { - ZDICT_legacy_params_t legacyParam; - legacyParam.zParams = zParams; - legacyParam.selectivityLevel = 9; - const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL); - if(legacyResult) { - result = 1; - goto _cleanup; - } - } - - - /* for fastCover (optimizing k) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.d = 8; - fastParam.f = 23; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - if(fastOptResult) { - result = 1; - goto _cleanup; - } - } - - /* for fastCover (with k provided) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.d = 8; - fastParam.f = 23; - fastParam.k = 200; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - if(fastOptResult) { - result = 1; - goto _cleanup; - } - } - /* Free allocated memory */ _cleanup: From 759c543312fd722c6f351513411d6d57742c7e4e Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 26 Jul 2018 19:03:01 -0700 Subject: [PATCH 30/35] Rerun cover and fastCover with optimized values --- .../benchmarkDictBuilder/README.md | 197 +++++++++--------- .../benchmarkDictBuilder/benchmark.c | 109 ++++++---- .../fastCover/fastCover.c | 2 +- 3 files changed, 169 insertions(+), 139 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 1ee4b19b..04866b7e 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -13,108 +13,113 @@ 
Benchmark given input files: make ARG= followed by permitted arguments make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: - -For every f value for fast, the first one is optimize and the second one has k=200 +First Cover is optimize cover, second Cover uses optimized d and k from first one. +For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. github: -NODICT 0.000023 2.999642 -RANDOM 0.149020 8.786957 -LEGACY 0.854277 8.989482 -FAST15 8.764078 10.609015 -FAST15 0.232610 9.135669 -FAST16 9.597777 10.474574 -FAST16 0.243698 9.346482 -FAST17 9.385449 10.611737 -FAST17 0.268376 9.605798 -FAST18 9.988885 10.626382 -FAST18 0.311769 9.130565 -FAST19 10.737259 10.411729 -FAST19 0.331885 9.271814 -FAST20 10.479782 10.388895 -FAST20 0.498416 9.194115 -FAST21 21.189883 10.376394 -FAST21 1.098532 9.244456 -FAST22 39.849935 10.432555 -FAST22 2.590561 9.410930 -FAST23 75.832399 10.614747 -FAST23 6.108487 9.484150 -FAST24 139.782714 10.611753 -FAST24 13.029406 9.379030 -COVER 55.118542 10.641263 +NODICT 0.000004 2.999642 +RANDOM 0.146096 8.786957 +LEGACY 0.956888 8.989482 +COVER 56.596152 10.641263 +COVER 4.937047 10.641263 +FAST15 17.722269 10.586461 +FAST15 0.239135 10.586461 +FAST16 18.276179 10.492503 +FAST16 0.265285 10.492503 +FAST17 18.077916 10.611737 +FAST17 0.236573 10.611737 +FAST18 19.510150 10.621586 +FAST18 0.278683 10.621586 +FAST19 18.794350 10.629626 +FAST19 0.307943 10.629626 +FAST20 19.671099 10.610308 +FAST20 0.428814 10.610308 +FAST21 36.527238 10.625733 +FAST21 0.716384 10.625733 +FAST22 83.803521 10.625281 +FAST22 1.290246 10.625281 +FAST23 158.287924 10.602342 +FAST23 3.084848 10.602342 +FAST24 283.630941 10.603379 +FAST24 8.088933 10.603379 hg-commands -NODICT 0.000012 2.425291 -RANDOM 0.083071 3.489515 -LEGACY 0.835195 3.911896 -FAST15 0.163980 3.808375 -FAST16 6.373850 4.010783 -FAST16 0.160299 3.966604 -FAST17 6.668799 4.091602 -FAST17 
0.172480 4.062773 -FAST18 6.266105 4.130824 -FAST18 0.171554 4.094666 -FAST19 6.869651 4.158180 -FAST19 0.209468 4.111289 -FAST20 8.267766 4.149707 -FAST20 0.331680 4.119873 -FAST21 18.824296 4.171784 -FAST21 0.783961 4.120884 -FAST22 33.321252 4.152035 -FAST22 1.854215 4.126626 -FAST23 60.775388 4.157595 -FAST23 4.040395 4.134222 -FAST24 110.910038 4.163091 -FAST24 8.505828 4.143533 -COVER 61.654796 4.131136 +NODICT 0.000007 2.425291 +RANDOM 0.084010 3.489515 +LEGACY 0.926763 3.911896 +COVER 62.036915 4.131136 +COVER 2.194398 4.131136 +FAST15 12.169025 3.903719 +FAST15 0.156552 3.903719 +FAST16 11.886255 4.005077 +FAST16 0.155506 4.005077 +FAST17 11.886955 4.097811 +FAST17 0.176327 4.097811 +FAST18 12.544698 4.136081 +FAST18 0.171796 4.136081 +FAST19 12.920868 4.166021 +FAST19 0.207029 4.166021 +FAST20 15.771429 4.163740 +FAST20 0.258685 4.163740 +FAST21 33.165829 4.157057 +FAST21 0.663088 4.157057 +FAST22 68.779201 4.158195 +FAST22 1.568439 4.158195 +FAST23 121.921931 4.161450 +FAST23 2.498972 4.161450 +FAST24 221.990451 4.159658 +FAST24 5.793594 4.159658 hg-changelog NODICT 0.000004 1.377613 -RANDOM 0.582067 2.096785 -LEGACY 2.739515 2.058273 -FAST15 35.682665 2.127596 -FAST15 0.931621 2.115299 -FAST16 36.557988 2.141787 -FAST16 1.008155 2.136080 -FAST17 36.272242 2.155332 -FAST17 0.906803 2.154596 -FAST18 35.542043 2.171997 -FAST18 1.063101 2.167723 -FAST19 37.756934 2.180893 -FAST19 1.257291 2.173768 -FAST20 40.273755 2.179442 -FAST20 1.630522 2.170072 -FAST21 54.606548 2.181400 -FAST21 2.321266 2.171643 -FAST22 72.454066 2.178774 -FAST22 5.092888 2.168885 -FAST23 106.753208 2.180347 -FAST23 14.722222 2.170673 -FAST24 171.083201 2.183426 -FAST24 27.575575 2.170623 -COVER 227.219660 2.188654 +RANDOM 0.549307 2.096785 +LEGACY 2.273818 2.058273 +COVER 219.640608 2.188654 +COVER 6.055391 2.188654 +FAST15 67.820700 2.127194 +FAST15 0.824624 2.127194 +FAST16 69.774209 2.145401 +FAST16 0.889737 2.145401 +FAST17 70.027355 2.157544 +FAST17 0.869004 2.157544 +FAST18 
68.229652 2.173127 +FAST18 0.930689 2.173127 +FAST19 70.696241 2.179527 +FAST19 1.385515 2.179527 +FAST20 80.618172 2.183233 +FAST20 1.699632 2.183233 +FAST21 96.366254 2.180920 +FAST21 2.606553 2.180920 +FAST22 139.440758 2.184297 +FAST22 5.962606 2.184297 +FAST23 207.791930 2.187666 +FAST23 14.823301 2.187666 +FAST24 322.050385 2.189889 +FAST24 29.294918 2.189889 hg-manifest -NODICT 0.000007 1.866385 -RANDOM 1.086571 2.309485 -LEGACY 9.567507 2.506775 -FAST15 77.811380 2.380461 -FAST15 1.969718 2.317727 -FAST16 75.789019 2.469144 -FAST16 2.051283 2.375815 -FAST17 79.659040 2.539069 -FAST17 1.995394 2.501047 -FAST18 76.281105 2.578095 -FAST18 2.059272 2.564840 -FAST19 79.395382 2.590433 -FAST19 2.354158 2.591024 -FAST20 87.937568 2.597813 -FAST20 2.922189 2.597104 -FAST21 121.760549 2.598408 -FAST21 4.798981 2.600269 -FAST22 155.878461 2.594560 -FAST22 8.151807 2.601047 -FAST23 194.238003 2.596761 -FAST23 15.160578 2.592985 -FAST24 267.425904 2.597657 -FAST24 29.513286 2.600363 -COVER 930.675322 2.582597 +NODICT 0.000008 1.866385 +RANDOM 1.075766 2.309485 +LEGACY 8.688387 2.506775 +COVER 926.024689 2.582597 +COVER 33.630695 2.582597 +FAST15 152.845945 2.377689 +FAST15 2.206285 2.377689 +FAST16 147.772371 2.464814 +FAST16 1.937997 2.464814 +FAST17 147.729498 2.539834 +FAST17 1.966577 2.539834 +FAST18 144.156821 2.576924 +FAST18 1.954106 2.576924 +FAST19 145.678760 2.592479 +FAST19 2.096876 2.592479 +FAST20 159.634674 2.594551 +FAST20 2.568766 2.594551 +FAST21 228.116552 2.597128 +FAST21 4.634508 2.597128 +FAST22 288.890644 2.596971 +FAST22 6.618204 2.596971 +FAST23 377.196211 2.601416 +FAST23 13.497286 2.601416 +FAST24 503.208577 2.602830 +FAST24 29.538585 2.602830 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 9feaae59..a775eae3 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ 
b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -277,7 +277,8 @@ int main(int argCount, const char* argv[]) int result = 0; /* Initialize arguments to default values */ - const unsigned k = 200; + unsigned k = 200; + unsigned d = 8; const unsigned cLevel = DEFAULT_CLEVEL; const unsigned dictID = 0; const unsigned maxDictSize = g_defaultMaxDictSize; @@ -360,47 +361,6 @@ int main(int argCount, const char* argv[]) } } - /* for fastCover */ - for (unsigned f = 15; f < 25; f++){ - DISPLAYLEVEL(2, "current f is %u\n", f); - /* for fastCover (optimizing k) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.d = 8; - fastParam.f = f; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); - if(fastOptResult) { - result = 1; - goto _cleanup; - } - } - - /* for fastCover (with k provided) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.d = 8; - fastParam.f = f; - fastParam.k = 200; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); - if(fastOptResult) { - result = 1; - goto _cleanup; - } - } - } - /* for cover */ { ZDICT_cover_params_t coverParam; @@ -415,8 +375,73 @@ int main(int argCount, const char* argv[]) result = 1; goto _cleanup; } + + k = coverParam.k; + d = coverParam.d; + + /* for COVER with k and d provided */ + 
ZDICT_cover_params_t covernParam; + memset(&covernParam, 0, sizeof(covernParam)); + covernParam.zParams = zParams; + covernParam.splitPoint = 1.0; + covernParam.steps = 40; + covernParam.nbThreads = 1; + covernParam.k = k; + covernParam.d = d; + const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &covernParam, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", covernParam.k, covernParam.d, covernParam.steps, (unsigned)(covernParam.splitPoint * 100)); + if(coverResult) { + result = 1; + goto _cleanup; + } } + /* for fastCover */ + for (unsigned f = 15; f < 25; f++){ + DISPLAYLEVEL(2, "current f is %u\n", f); + /* for fastCover (optimizing k and d) */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.f = f; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + + k = fastParam.k; + d = fastParam.d; + } + + + /* for fastCover (with k and d provided) */ + { + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.d = d; + fastParam.f = f; + fastParam.k = k; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + } + } + + + /* Free allocated memory */ _cleanup: diff --git 
a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index 6f990e0c..d6b3254e 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -267,7 +267,7 @@ static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t * size_t currSampleStart = ctx->offsets[i]; size_t currSampleEnd = ctx->offsets[i+1]; start = currSampleStart; - while (start + f < currSampleEnd) { + while (start + ctx->d <= currSampleEnd) { const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d); /* if no dmer with same hash value has been seen in current sample */ if (inCurrSample[dmerIndex] == 0) { From 49b398e93f5357c4311b678a7e4b4d875035f379 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 27 Jul 2018 13:39:19 -0700 Subject: [PATCH 31/35] Use same param after optimizing cover and fastCover and record k and d for benchmarking --- .../benchmarkDictBuilder/README.md | 211 +++++++++--------- .../benchmarkDictBuilder/benchmark.c | 74 ++---- 2 files changed, 129 insertions(+), 156 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 04866b7e..654ca409 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -13,113 +13,114 @@ Benchmark given input files: make ARG= followed by permitted arguments make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" ###Benchmarking Result: -First Cover is optimize cover, second Cover uses optimized d and k from first one. -For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. +- First Cover is optimize cover, second Cover uses optimized d and k from first one. 
+- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. +- Fourth column is chosen d and fifth column is chosen k github: -NODICT 0.000004 2.999642 -RANDOM 0.146096 8.786957 -LEGACY 0.956888 8.989482 -COVER 56.596152 10.641263 -COVER 4.937047 10.641263 -FAST15 17.722269 10.586461 -FAST15 0.239135 10.586461 -FAST16 18.276179 10.492503 -FAST16 0.265285 10.492503 -FAST17 18.077916 10.611737 -FAST17 0.236573 10.611737 -FAST18 19.510150 10.621586 -FAST18 0.278683 10.621586 -FAST19 18.794350 10.629626 -FAST19 0.307943 10.629626 -FAST20 19.671099 10.610308 -FAST20 0.428814 10.610308 -FAST21 36.527238 10.625733 -FAST21 0.716384 10.625733 -FAST22 83.803521 10.625281 -FAST22 1.290246 10.625281 -FAST23 158.287924 10.602342 -FAST23 3.084848 10.602342 -FAST24 283.630941 10.603379 -FAST24 8.088933 10.603379 +NODICT 0.000004 2.999642 +RANDOM 0.146096 8.786957 +LEGACY 0.956888 8.989482 +COVER 56.596152 10.641263 8 1298 +COVER 4.937047 10.641263 8 1298 +FAST15 17.722269 10.586461 8 1778 +FAST15 0.239135 10.586461 8 1778 +FAST16 18.276179 10.492503 6 1778 +FAST16 0.265285 10.492503 6 1778 +FAST17 18.077916 10.611737 8 1778 +FAST17 0.236573 10.611737 8 1778 +FAST18 19.510150 10.621586 8 1778 +FAST18 0.278683 10.621586 8 1778 +FAST19 18.794350 10.629626 8 1778 +FAST19 0.307943 10.629626 8 1778 +FAST20 19.671099 10.610308 8 1778 +FAST20 0.428814 10.610308 8 1778 +FAST21 36.527238 10.625733 8 1778 +FAST21 0.716384 10.625733 8 1778 +FAST22 83.803521 10.625281 8 1778 +FAST22 1.290246 10.625281 8 1778 +FAST23 158.287924 10.602342 8 1778 +FAST23 3.084848 10.602342 8 1778 +FAST24 283.630941 10.603379 8 1778 +FAST24 8.088933 10.603379 8 1778 -hg-commands -NODICT 0.000007 2.425291 -RANDOM 0.084010 3.489515 -LEGACY 0.926763 3.911896 -COVER 62.036915 4.131136 -COVER 2.194398 4.131136 -FAST15 12.169025 3.903719 -FAST15 0.156552 3.903719 -FAST16 11.886255 4.005077 -FAST16 0.155506 4.005077 -FAST17 11.886955 4.097811 -FAST17 
0.176327 4.097811 -FAST18 12.544698 4.136081 -FAST18 0.171796 4.136081 -FAST19 12.920868 4.166021 -FAST19 0.207029 4.166021 -FAST20 15.771429 4.163740 -FAST20 0.258685 4.163740 -FAST21 33.165829 4.157057 -FAST21 0.663088 4.157057 -FAST22 68.779201 4.158195 -FAST22 1.568439 4.158195 -FAST23 121.921931 4.161450 -FAST23 2.498972 4.161450 -FAST24 221.990451 4.159658 -FAST24 5.793594 4.159658 +hg-commands: +NODICT 0.000007 2.425291 +RANDOM 0.084010 3.489515 +LEGACY 0.926763 3.911896 +COVER 62.036915 4.131136 8 386 +COVER 2.194398 4.131136 8 386 +FAST15 12.169025 3.903719 6 1106 +FAST15 0.156552 3.903719 6 1106 +FAST16 11.886255 4.005077 8 530 +FAST16 0.155506 4.005077 8 530 +FAST17 11.886955 4.097811 8 818 +FAST17 0.176327 4.097811 8 818 +FAST18 12.544698 4.136081 8 770 +FAST18 0.171796 4.136081 8 770 +FAST19 12.920868 4.166021 8 530 +FAST19 0.207029 4.166021 8 530 +FAST20 15.771429 4.163740 8 482 +FAST20 0.258685 4.163740 8 482 +FAST21 33.165829 4.157057 8 434 +FAST21 0.663088 4.157057 8 434 +FAST22 68.779201 4.158195 8 290 +FAST22 1.568439 4.158195 8 290 +FAST23 121.921931 4.161450 8 434 +FAST23 2.498972 4.161450 8 434 +FAST24 221.990451 4.159658 8 338 +FAST24 5.793594 4.159658 8 338 -hg-changelog -NODICT 0.000004 1.377613 -RANDOM 0.549307 2.096785 -LEGACY 2.273818 2.058273 -COVER 219.640608 2.188654 -COVER 6.055391 2.188654 -FAST15 67.820700 2.127194 -FAST15 0.824624 2.127194 -FAST16 69.774209 2.145401 -FAST16 0.889737 2.145401 -FAST17 70.027355 2.157544 -FAST17 0.869004 2.157544 -FAST18 68.229652 2.173127 -FAST18 0.930689 2.173127 -FAST19 70.696241 2.179527 -FAST19 1.385515 2.179527 -FAST20 80.618172 2.183233 -FAST20 1.699632 2.183233 -FAST21 96.366254 2.180920 -FAST21 2.606553 2.180920 -FAST22 139.440758 2.184297 -FAST22 5.962606 2.184297 -FAST23 207.791930 2.187666 -FAST23 14.823301 2.187666 -FAST24 322.050385 2.189889 -FAST24 29.294918 2.189889 +hg-changelog: +NODICT 0.000004 1.377613 +RANDOM 0.549307 2.096785 +LEGACY 2.273818 2.058273 +COVER 219.640608 2.188654 
8 98 +COVER 6.055391 2.188654 8 98 +FAST15 67.820700 2.127194 8 866 +FAST15 0.824624 2.127194 8 866 +FAST16 69.774209 2.145401 8 338 +FAST16 0.889737 2.145401 8 338 +FAST17 70.027355 2.157544 8 194 +FAST17 0.869004 2.157544 8 194 +FAST18 68.229652 2.173127 8 98 +FAST18 0.930689 2.173127 8 98 +FAST19 70.696241 2.179527 8 98 +FAST19 1.385515 2.179527 8 98 +FAST20 80.618172 2.183233 6 98 +FAST20 1.699632 2.183233 6 98 +FAST21 96.366254 2.180920 8 98 +FAST21 2.606553 2.180920 8 98 +FAST22 139.440758 2.184297 8 98 +FAST22 5.962606 2.184297 8 98 +FAST23 207.791930 2.187666 6 98 +FAST23 14.823301 2.187666 6 98 +FAST24 322.050385 2.189889 6 98 +FAST24 29.294918 2.189889 6 98 -hg-manifest -NODICT 0.000008 1.866385 -RANDOM 1.075766 2.309485 -LEGACY 8.688387 2.506775 -COVER 926.024689 2.582597 -COVER 33.630695 2.582597 -FAST15 152.845945 2.377689 -FAST15 2.206285 2.377689 -FAST16 147.772371 2.464814 -FAST16 1.937997 2.464814 -FAST17 147.729498 2.539834 -FAST17 1.966577 2.539834 -FAST18 144.156821 2.576924 -FAST18 1.954106 2.576924 -FAST19 145.678760 2.592479 -FAST19 2.096876 2.592479 -FAST20 159.634674 2.594551 -FAST20 2.568766 2.594551 -FAST21 228.116552 2.597128 -FAST21 4.634508 2.597128 -FAST22 288.890644 2.596971 -FAST22 6.618204 2.596971 -FAST23 377.196211 2.601416 -FAST23 13.497286 2.601416 -FAST24 503.208577 2.602830 -FAST24 29.538585 2.602830 +hg-manifest: +NODICT 0.000008 1.866385 +RANDOM 1.075766 2.309485 +LEGACY 8.688387 2.506775 +COVER 926.024689 2.582597 8 434 +COVER 33.630695 2.582597 8 434 +FAST15 152.845945 2.377689 8 1682 +FAST15 2.206285 2.377689 8 1682 +FAST16 147.772371 2.464814 8 1538 +FAST16 1.937997 2.464814 8 1538 +FAST17 147.729498 2.539834 6 1826 +FAST17 1.966577 2.539834 6 1826 +FAST18 144.156821 2.576924 8 1922 +FAST18 1.954106 2.576924 8 1922 +FAST19 145.678760 2.592479 6 290 +FAST19 2.096876 2.592479 6 290 +FAST20 159.634674 2.594551 8 194 +FAST20 2.568766 2.594551 8 194 +FAST21 228.116552 2.597128 6 194 +FAST21 4.634508 2.597128 6 194 +FAST22 
288.890644 2.596971 6 386 +FAST22 6.618204 2.596971 6 386 +FAST23 377.196211 2.601416 8 194 +FAST23 13.497286 2.601416 8 194 +FAST24 503.208577 2.602830 6 194 +FAST24 29.538585 2.602830 6 194 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index a775eae3..75008a08 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -251,7 +251,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random result = 1; goto _cleanup; } - DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec); + DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, timeSec); /* Calculate compression ratio */ const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); @@ -261,7 +261,7 @@ int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random goto _cleanup; } - DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio); + DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, cRatio); _cleanup: freeDictInfo(dInfo); @@ -376,73 +376,45 @@ int main(int argCount, const char* argv[]) goto _cleanup; } - k = coverParam.k; - d = coverParam.d; - - /* for COVER with k and d provided */ - ZDICT_cover_params_t covernParam; - memset(&covernParam, 0, sizeof(covernParam)); - covernParam.zParams = zParams; - covernParam.splitPoint = 1.0; - covernParam.steps = 40; - covernParam.nbThreads = 1; - covernParam.k = k; - covernParam.d = d; - const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &covernParam, NULL, NULL); - DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", covernParam.k, covernParam.d, covernParam.steps, (unsigned)(covernParam.splitPoint * 100)); + const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL); + DISPLAYLEVEL(2, 
"k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100)); if(coverResult) { result = 1; goto _cleanup; } + } /* for fastCover */ for (unsigned f = 15; f < 25; f++){ DISPLAYLEVEL(2, "current f is %u\n", f); /* for fastCover (optimizing k and d) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.f = f; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); - if(fastOptResult) { - result = 1; - goto _cleanup; - } - - k = fastParam.k; - d = fastParam.d; + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.splitPoint = 1.0; + fastParam.f = f; + fastParam.steps = 40; + fastParam.nbThreads = 1; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastOptResult) { + result = 1; + goto _cleanup; } /* for fastCover (with k and d provided) */ - { - ZDICT_fastCover_params_t fastParam; - memset(&fastParam, 0, sizeof(fastParam)); - fastParam.zParams = zParams; - fastParam.splitPoint = 1.0; - fastParam.d = d; - fastParam.f = f; - fastParam.k = k; - fastParam.steps = 40; - fastParam.nbThreads = 1; - const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); - DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); - if(fastOptResult) { - 
result = 1; - goto _cleanup; - } + const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); + if(fastResult) { + result = 1; + goto _cleanup; } + } - - /* Free allocated memory */ _cleanup: UTIL_freeFileList(extendedFileList, fileNamesBuf); From 61262f6c0dc137e078bbc4cd1131fc3b88657414 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 27 Jul 2018 16:51:38 -0700 Subject: [PATCH 32/35] Save segmentFreqs in ctx instead of malloc and memset in SelectSegment --- .../benchmarkDictBuilder/README.md | 113 ++++++++++++++++++ .../benchmarkDictBuilder/test.sh | 10 +- .../fastCover/Makefile | 6 +- .../fastCover/fastCover.c | 28 +++-- 4 files changed, 141 insertions(+), 16 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index 654ca409..a818e6eb 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -17,6 +17,8 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" - For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. 
- Fourth column is chosen d and fifth column is chosen k +Version 1: + github: NODICT 0.000004 2.999642 RANDOM 0.146096 8.786957 @@ -124,3 +126,114 @@ FAST23 377.196211 2.601416 8 194 FAST23 13.497286 2.601416 8 194 FAST24 503.208577 2.602830 6 194 FAST24 29.538585 2.602830 6 194 + +--------------------------------------------------------------- +Version 2 (save segmentFreqs in ctx instead of malloc and memset in every call to SelectSegment): + +github: +NODICT 0.000005 2.999642 +RANDOM 0.141553 8.786957 +LEGACY 0.904340 8.989482 +COVER 53.621302 10.641263 8 1298 +COVER 4.085037 10.641263 8 1298 +FAST15 17.636211 10.586461 8 1778 +FAST15 0.221236 10.586461 8 1778 +FAST16 18.716259 10.492503 6 1778 +FAST16 0.251522 10.492503 6 1778 +FAST17 17.614391 10.611737 8 1778 +FAST17 0.241011 10.611737 8 1778 +FAST18 19.926270 10.621586 8 1778 +FAST18 0.287195 10.621586 8 1778 +FAST19 19.626808 10.629626 8 1778 +FAST19 0.340191 10.629626 8 1778 +FAST20 18.918657 10.610308 8 1778 +FAST20 0.463307 10.610308 8 1778 +FAST21 20.502362 10.625733 8 1778 +FAST21 0.638202 10.625733 8 1778 +FAST22 22.702695 10.625281 8 1778 +FAST22 1.353399 10.625281 8 1778 +FAST23 28.041990 10.602342 8 1778 +FAST23 3.029502 10.602342 8 1778 +FAST24 35.662961 10.603379 8 1778 +FAST24 6.524258 10.603379 8 1778 + +hg-commands: +NODICT 0.000005 2.425291 +RANDOM 0.080469 3.489515 +LEGACY 0.794417 3.911896 +COVER 54.198788 4.131136 8 386 +COVER 2.191729 4.131136 8 386 +FAST15 11.852793 3.903719 6 1106 +FAST15 0.175406 3.903719 6 1106 +FAST16 12.863315 4.005077 8 530 +FAST16 0.158410 4.005077 8 530 +FAST17 11.977917 4.097811 8 818 +FAST17 0.162381 4.097811 8 818 +FAST18 11.749304 4.136081 8 770 +FAST18 0.173242 4.136081 8 770 +FAST19 11.905785 4.166021 8 530 +FAST19 0.186403 4.166021 8 530 +FAST20 13.293999 4.163740 8 482 +FAST20 0.241508 4.163740 8 482 +FAST21 16.623177 4.157057 8 434 +FAST21 0.372647 4.157057 8 434 +FAST22 20.918409 4.158195 8 290 +FAST22 0.570431 4.158195 8 290 +FAST23 21.762805 4.161450 
8 434 +FAST23 1.162206 4.161450 8 434 +FAST24 29.133745 4.159658 8 338 +FAST24 3.054376 4.159658 8 338 + +hg-changelog: +NODICT 0.000006 1.377613 +RANDOM 0.601346 2.096785 +LEGACY 2.544973 2.058273 +COVER 222.639708 2.188654 8 98 +COVER 6.072892 2.188654 8 98 +FAST15 70.394523 2.127194 8 866 +FAST15 0.899766 2.127194 8 866 +FAST16 69.845529 2.145401 8 338 +FAST16 0.881569 2.145401 8 338 +FAST17 69.382431 2.157544 8 194 +FAST17 0.943291 2.157544 8 194 +FAST18 71.348283 2.173127 8 98 +FAST18 1.034765 2.173127 8 98 +FAST19 71.380923 2.179527 8 98 +FAST19 1.254700 2.179527 8 98 +FAST20 72.802714 2.183233 6 98 +FAST20 1.368704 2.183233 6 98 +FAST21 82.042339 2.180920 8 98 +FAST21 2.213864 2.180920 8 98 +FAST22 90.666200 2.184297 8 98 +FAST22 3.590399 2.184297 8 98 +FAST23 108.926377 2.187666 6 98 +FAST23 8.723759 2.187666 6 98 +FAST24 134.296232 2.189889 6 98 +FAST24 19.396532 2.189889 6 98 + +hg-manifest: +NODICT 0.000005 1.866385 +RANDOM 0.982192 2.309485 +LEGACY 9.507729 2.506775 +COVER 922.742066 2.582597 8 434 +COVER 36.500276 2.582597 8 434 +FAST15 163.886717 2.377689 8 1682 +FAST15 2.107328 2.377689 8 1682 +FAST16 152.684592 2.464814 8 1538 +FAST16 2.157789 2.464814 8 1538 +FAST17 154.463459 2.539834 6 1826 +FAST17 2.282455 2.539834 6 1826 +FAST18 155.540044 2.576924 8 1922 +FAST18 2.101807 2.576924 8 1922 +FAST19 152.650343 2.592479 6 290 +FAST19 2.359461 2.592479 6 290 +FAST20 174.623634 2.594551 8 194 +FAST20 2.870022 2.594551 8 194 +FAST21 219.876653 2.597128 6 194 +FAST21 4.386269 2.597128 6 194 +FAST22 247.986803 2.596971 6 386 +FAST22 6.201144 2.596971 6 386 +FAST23 276.051806 2.601416 8 194 +FAST23 11.613477 2.601416 8 194 +FAST24 328.234024 2.602830 6 194 +FAST24 26.710364 2.602830 6 194 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh index 5eaf5930..e5508ded 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh +++ 
b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh @@ -1,2 +1,8 @@ -echo "Benchmark with in=../../lib/common" -./benchmark in=../../../lib/common +echo "-----------------github--------------------" +./benchmark in=github +echo "-----------------hg-commands--------------------" +./benchmark in=hg-commands +echo "-----------------hg-changelog--------------------" +./benchmark in=hg-changelog +echo "------------------hg-manifest-------------------" +./benchmark in=hg-manifest diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile index 9c56013d..4a7cc17d 100644 --- a/contrib/experimental_dict_builders/fastCover/Makefile +++ b/contrib/experimental_dict_builders/fastCover/Makefile @@ -1,7 +1,7 @@ ARG := CC ?= gcc -CFLAGS ?= -O3 +CFLAGS ?= -O3 -g INCLUDES := -I ../../../programs -I ../randomDictBuilder -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder IO_FILE := ../randomDictBuilder/io.c @@ -9,7 +9,7 @@ IO_FILE := ../randomDictBuilder/io.c TEST_INPUT := ../../../lib TEST_OUTPUT := fastCoverDict -all: main run clean +all: main run .PHONY: test test: main testrun testshell clean @@ -32,7 +32,7 @@ io.o: $(IO_FILE) $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) libzstd.a: - $(MAKE) -C ../../../lib libzstd.a + $(MAKE) MOREFLAGS=-g -C ../../../lib libzstd.a mv ../../../lib/libzstd.a . 
.PHONY: testrun diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index d6b3254e..3c1aa951 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -82,6 +82,7 @@ typedef struct { size_t nbTestSamples; size_t nbDmers; U32 *freqs; + U16 *segmentFreqs; unsigned d; } FASTCOVER_ctx_t; @@ -142,9 +143,6 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, activeSegment.end = begin; activeSegment.score = 0; { - /* Keep track of number of times an index has been seen in current segment */ - U16* currfreqs =(U16 *)malloc((1 << parameters.f) * sizeof(U16)); - memset(currfreqs, 0, (1 << parameters.f) * sizeof(*currfreqs)); /* Slide the activeSegment through the whole epoch. * Save the best segment in bestSegment. */ @@ -152,19 +150,19 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* Get hash value of current dmer */ const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); /* Add frequency of this index to score if this is the first occurence of index in active segment */ - if (currfreqs[index] == 0) { + if (ctx->segmentFreqs[index] == 0) { activeSegment.score += freqs[index]; } - currfreqs[index] += 1; + ctx->segmentFreqs[index] += 1; /* Increment end of segment */ activeSegment.end += 1; /* If the window is now too large, drop the first position */ if (activeSegment.end - activeSegment.begin == dmersInK + 1) { /* Get hash value of the dmer to be eliminated from active segment */ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); - currfreqs[delIndex] -= 1; + ctx->segmentFreqs[delIndex] -= 1; /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ - if (currfreqs[delIndex] == 0) { + if 
(ctx->segmentFreqs[delIndex] == 0) { activeSegment.score -= freqs[delIndex]; } /* Increment start of segment */ @@ -175,7 +173,12 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, bestSegment = activeSegment; } } - free(currfreqs); + /* Zero out rest of segmentFreqs array */ + while (activeSegment.begin < end) { + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); + ctx->segmentFreqs[delIndex] -= 1; + activeSegment.begin += 1; + } } { /* Trim off the zero frequency head and tail from the segment. */ @@ -245,6 +248,10 @@ static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) { if (!ctx) { return; } + if (ctx->segmentFreqs) { + free(ctx->segmentFreqs); + ctx->segmentFreqs = NULL; + } if (ctx->freqs) { free(ctx->freqs); ctx->freqs = NULL; @@ -347,9 +354,8 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer, } /* Initialize frequency array of size 2^f */ - ctx->freqs =(U32 *)malloc((1 << f) * sizeof(U32)); - memset(ctx->freqs, 0, (1 << f) * sizeof(U32)); - + ctx->freqs = (U32 *)calloc((1 << f), sizeof(U32)); + ctx->segmentFreqs = (U16 *)calloc((1 << f), sizeof(U16)); DISPLAYLEVEL(2, "Computing frequencies\n"); FASTCOVER_computeFrequency(ctx->freqs, f, ctx); From 96d84ee235f4d6cbf71c415a1a0327235751ba86 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 27 Jul 2018 16:54:05 -0700 Subject: [PATCH 33/35] Revert test.sh --- .../benchmarkDictBuilder/test.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh index e5508ded..5eaf5930 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh @@ -1,8 +1,2 @@ -echo "-----------------github--------------------" -./benchmark in=github -echo 
"-----------------hg-commands--------------------" -./benchmark in=hg-commands -echo "-----------------hg-changelog--------------------" -./benchmark in=hg-changelog -echo "------------------hg-manifest-------------------" -./benchmark in=hg-manifest +echo "Benchmark with in=../../lib/common" +./benchmark in=../../../lib/common From 53ef22a4bc3844f860531dce31481db8b6fcd9bf Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 27 Jul 2018 16:56:50 -0700 Subject: [PATCH 34/35] Undo deleting clean in make --- contrib/experimental_dict_builders/fastCover/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/experimental_dict_builders/fastCover/Makefile b/contrib/experimental_dict_builders/fastCover/Makefile index 4a7cc17d..3ba24790 100644 --- a/contrib/experimental_dict_builders/fastCover/Makefile +++ b/contrib/experimental_dict_builders/fastCover/Makefile @@ -9,7 +9,7 @@ IO_FILE := ../randomDictBuilder/io.c TEST_INPUT := ../../../lib TEST_OUTPUT := fastCoverDict -all: main run +all: main run clean .PHONY: test test: main testrun testshell clean From 51b109c1b5991d3a9bac7bbd5e82065a816777cb Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 27 Jul 2018 17:31:33 -0700 Subject: [PATCH 35/35] Delete old benchmarking result --- .../benchmarkDictBuilder/README.md | 113 ------------------ 1 file changed, 113 deletions(-) diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index a818e6eb..20fbde95 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -17,119 +17,6 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" - For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. 
- Fourth column is chosen d and fifth column is chosen k -Version 1: - -github: -NODICT 0.000004 2.999642 -RANDOM 0.146096 8.786957 -LEGACY 0.956888 8.989482 -COVER 56.596152 10.641263 8 1298 -COVER 4.937047 10.641263 8 1298 -FAST15 17.722269 10.586461 8 1778 -FAST15 0.239135 10.586461 8 1778 -FAST16 18.276179 10.492503 6 1778 -FAST16 0.265285 10.492503 6 1778 -FAST17 18.077916 10.611737 8 1778 -FAST17 0.236573 10.611737 8 1778 -FAST18 19.510150 10.621586 8 1778 -FAST18 0.278683 10.621586 8 1778 -FAST19 18.794350 10.629626 8 1778 -FAST19 0.307943 10.629626 8 1778 -FAST20 19.671099 10.610308 8 1778 -FAST20 0.428814 10.610308 8 1778 -FAST21 36.527238 10.625733 8 1778 -FAST21 0.716384 10.625733 8 1778 -FAST22 83.803521 10.625281 8 1778 -FAST22 1.290246 10.625281 8 1778 -FAST23 158.287924 10.602342 8 1778 -FAST23 3.084848 10.602342 8 1778 -FAST24 283.630941 10.603379 8 1778 -FAST24 8.088933 10.603379 8 1778 - -hg-commands: -NODICT 0.000007 2.425291 -RANDOM 0.084010 3.489515 -LEGACY 0.926763 3.911896 -COVER 62.036915 4.131136 8 386 -COVER 2.194398 4.131136 8 386 -FAST15 12.169025 3.903719 6 1106 -FAST15 0.156552 3.903719 6 1106 -FAST16 11.886255 4.005077 8 530 -FAST16 0.155506 4.005077 8 530 -FAST17 11.886955 4.097811 8 818 -FAST17 0.176327 4.097811 8 818 -FAST18 12.544698 4.136081 8 770 -FAST18 0.171796 4.136081 8 770 -FAST19 12.920868 4.166021 8 530 -FAST19 0.207029 4.166021 8 530 -FAST20 15.771429 4.163740 8 482 -FAST20 0.258685 4.163740 8 482 -FAST21 33.165829 4.157057 8 434 -FAST21 0.663088 4.157057 8 434 -FAST22 68.779201 4.158195 8 290 -FAST22 1.568439 4.158195 8 290 -FAST23 121.921931 4.161450 8 434 -FAST23 2.498972 4.161450 8 434 -FAST24 221.990451 4.159658 8 338 -FAST24 5.793594 4.159658 8 338 - -hg-changelog: -NODICT 0.000004 1.377613 -RANDOM 0.549307 2.096785 -LEGACY 2.273818 2.058273 -COVER 219.640608 2.188654 8 98 -COVER 6.055391 2.188654 8 98 -FAST15 67.820700 2.127194 8 866 -FAST15 0.824624 2.127194 8 866 -FAST16 69.774209 2.145401 8 338 -FAST16 0.889737 
2.145401 8 338 -FAST17 70.027355 2.157544 8 194 -FAST17 0.869004 2.157544 8 194 -FAST18 68.229652 2.173127 8 98 -FAST18 0.930689 2.173127 8 98 -FAST19 70.696241 2.179527 8 98 -FAST19 1.385515 2.179527 8 98 -FAST20 80.618172 2.183233 6 98 -FAST20 1.699632 2.183233 6 98 -FAST21 96.366254 2.180920 8 98 -FAST21 2.606553 2.180920 8 98 -FAST22 139.440758 2.184297 8 98 -FAST22 5.962606 2.184297 8 98 -FAST23 207.791930 2.187666 6 98 -FAST23 14.823301 2.187666 6 98 -FAST24 322.050385 2.189889 6 98 -FAST24 29.294918 2.189889 6 98 - -hg-manifest: -NODICT 0.000008 1.866385 -RANDOM 1.075766 2.309485 -LEGACY 8.688387 2.506775 -COVER 926.024689 2.582597 8 434 -COVER 33.630695 2.582597 8 434 -FAST15 152.845945 2.377689 8 1682 -FAST15 2.206285 2.377689 8 1682 -FAST16 147.772371 2.464814 8 1538 -FAST16 1.937997 2.464814 8 1538 -FAST17 147.729498 2.539834 6 1826 -FAST17 1.966577 2.539834 6 1826 -FAST18 144.156821 2.576924 8 1922 -FAST18 1.954106 2.576924 8 1922 -FAST19 145.678760 2.592479 6 290 -FAST19 2.096876 2.592479 6 290 -FAST20 159.634674 2.594551 8 194 -FAST20 2.568766 2.594551 8 194 -FAST21 228.116552 2.597128 6 194 -FAST21 4.634508 2.597128 6 194 -FAST22 288.890644 2.596971 6 386 -FAST22 6.618204 2.596971 6 386 -FAST23 377.196211 2.601416 8 194 -FAST23 13.497286 2.601416 8 194 -FAST24 503.208577 2.602830 6 194 -FAST24 29.538585 2.602830 6 194 - ---------------------------------------------------------------- -Version 2 (save segmentFreqs in ctx instead of malloc and memset in every call to SelectSegment): - github: NODICT 0.000005 2.999642 RANDOM 0.141553 8.786957