From 96b39f65fa0e11664c64b5b82bbfdd433125d203 Mon Sep 17 00:00:00 2001
From: Nick Terrell
Date: Sat, 31 Dec 2016 21:07:44 -0800
Subject: [PATCH] Add COVER dictionary builder

---
 lib/dictBuilder/cover.c | 1041 +++++++++++++++++++++++++++++++++++++++
 lib/dictBuilder/zdict.h |   52 ++
 2 files changed, 1093 insertions(+)
 create mode 100644 lib/dictBuilder/cover.c

diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
new file mode 100644
index 00000000..1bad0186
--- /dev/null
+++ b/lib/dictBuilder/cover.c
@@ -0,0 +1,1041 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#ifdef ZSTD_PTHREAD
+#include "threading.h"
+#endif
+
+#include "mem.h" /* read */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/*-*************************************
+* Constants
+***************************************/
+#define COVER_MAX_SAMPLES_SIZE ((U32)-1)
+
+/*-*************************************
+* Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...) \
+  { \
+    fprintf(stderr, __VA_ARGS__); \
+    fflush(stderr); \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
+  if (displayLevel >= l) { \
+    DISPLAY(__VA_ARGS__); \
+  } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
+  if (displayLevel >= l) { \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \
+      g_time = clock(); \
+      DISPLAY(__VA_ARGS__); \
+      if (displayLevel >= 4) \
+        fflush(stdout); \
+    } \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+/*-*************************************
+* Hash table
+***************************************
+* A small specialized hash map for storing activeDmers.
+* The map does not resize, so if it becomes full it will loop forever.
+* Thus, the map must be large enough to store every value.
+* The map implements linear probing and keeps its load less than 0.5.
+* A slot is empty when its value equals MAP_EMPTY_VALUE.
+*/
+
+#define MAP_EMPTY_VALUE ((U32)-1)
+typedef struct COVER_map_pair_t_s {
+  U32 key;
+  U32 value;
+} COVER_map_pair_t;
+
+typedef struct COVER_map_s {
+  COVER_map_pair_t *data;
+  U32 sizeLog;
+  U32 size;
+  U32 sizeMask;
+} COVER_map_t;
+
+/**
+ * Clear the map.
+ * Every byte of the table is filled from MAP_EMPTY_VALUE, so each key and
+ * value reads back as MAP_EMPTY_VALUE (all bits set).
+ */
+static void COVER_map_clear(COVER_map_t *map) {
+  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
+}
+
+/**
+ * Initializes a map of the given size.
+ * Returns 1 on success and 0 on failure.
+ * The map must be destroyed with COVER_map_destroy().
+ * The map is only guaranteed to be large enough to hold size elements.
+ */
+static int COVER_map_init(COVER_map_t *map, U32 size) {
+  const U32 log = ZSTD_highbit32(size) + 2;
+  const U32 capacity = (U32)1 << log;
+  COVER_map_pair_t *const slots =
+      (COVER_map_pair_t *)malloc(capacity * sizeof(COVER_map_pair_t));
+  map->data = slots;
+  map->sizeMask = capacity - 1;
+  if (slots == NULL) {
+    /* Allocation failed: report an empty table. */
+    map->sizeLog = 0;
+    map->size = 0;
+    return 0;
+  }
+  map->sizeLog = log;
+  map->size = capacity;
+  COVER_map_clear(map);
+  return 1;
+}
+
+/**
+ * Multiplicative hash: keeps the top sizeLog bits of key * prime4bytes.
+ */
+static const U32 prime4bytes = 2654435761U;
+static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
+  const U32 mixed = key * prime4bytes;
+  return mixed >> (32 - map->sizeLog);
+}
+
+/**
+ * Linear probe starting at the key's hash.
+ * Returns the index of the slot that already holds key, or of the first empty
+ * slot met on the way, whichever comes first.
+ */
+static U32 COVER_map_index(COVER_map_t *map, U32 key) {
+  U32 slot = COVER_map_hash(map, key);
+  while (map->data[slot].value != MAP_EMPTY_VALUE &&
+         map->data[slot].key != key) {
+    slot = (slot + 1) & map->sizeMask;
+  }
+  return slot;
+}
+
+/**
+ * Looks key up and returns a pointer to its value.
+ * A key that is not present yet is inserted with the value 0.
+ * The map must not be full.
+ */
+static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
+  COVER_map_pair_t *const entry = map->data + COVER_map_index(map, key);
+  if (entry->value == MAP_EMPTY_VALUE) {
+    entry->value = 0;
+    entry->key = key;
+  }
+  return &entry->value;
+}
+
+/**
+ * Deletes key from the map if present.
+ */
+static void COVER_map_remove(COVER_map_t *map, U32 key) {
+  U32 i = COVER_map_index(map, key);
+  COVER_map_pair_t *del = &map->data[i]; /* slot that currently needs filling */
+  U32 shift = 1; /* distance from del to the slot being examined */
+  if (del->value == MAP_EMPTY_VALUE) {
+    return; /* key is not in the map */
+  }
+  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *const pos = &map->data[i];
+    /* If the position is empty we are done */
+    if (pos->value == MAP_EMPTY_VALUE) {
+      del->value = MAP_EMPTY_VALUE;
+      return;
+    }
+    /* If pos can be moved to del do so.
+     * (i - hash) & sizeMask is how far pos sits from its home slot; when that
+     * is at least shift, the home slot is at or before del, so the entry is
+     * still found by linear probing after the move.
+     */
+    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
+      del->key = pos->key;
+      del->value = pos->value;
+      del = pos;
+      shift = 1;
+    } else {
+      ++shift;
+    }
+  }
+}
+
+/**
+ * Destroys a map that is inited with COVER_map_init().
+ */
+static void COVER_map_destroy(COVER_map_t *map) {
+  if (map->data) {
+    free(map->data);
+  }
+  map->data = NULL;
+  map->size = 0;
+}
+
+/*-*************************************
+* Context
+***************************************/
+
+typedef struct {
+  const BYTE *samples;        /* concatenated samples; not freed by the ctx */
+  size_t *offsets;            /* nbSamples + 1 running sums of samplesSizes */
+  const size_t *samplesSizes; /* caller's size array; not freed by the ctx */
+  size_t nbSamples;
+  U32 *suffix;                /* partial suffix array; NULL once it becomes freqs */
+  size_t suffixSize;          /* total samples size - d + 1 */
+  U32 *freqs;                 /* frequency of each dmer, indexed by dmerId */
+  U32 *dmerAt;                /* dmerId of the dmer starting at each position */
+  unsigned d;                 /* dmer length in bytes */
+} COVER_ctx_t;
+
+/* We need a global context for qsort... */
+static COVER_ctx_t *g_ctx = NULL;
+
+/*-*************************************
+* Helper functions
+***************************************/
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t sum = 0;
+  size_t i;
+  for (i = 0; i < nbSamples; ++i) {
+    sum += samplesSizes[i];
+  }
+  return sum;
+}
+
+/**
+ * Returns a negative value if the dmer at lp is less than the dmer at rp.
+ * Returns 0 if the dmers at lp and rp are equal.
+ * Returns a positive value if the dmer at lp is greater than the dmer at rp.
+ */
+static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  const U32 lhs = *(const U32 *)lp;
+  const U32 rhs = *(const U32 *)rp;
+  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
+}
+
+/**
+ * Same as COVER_cmp() except ties are broken by the position stored in the
+ * element, so equal dmers end up ordered by their position in the input.
+ * The element address must not be used as the tie-break: qsort moves elements
+ * while sorting, so an address-based result is not consistent between calls.
+ * NOTE: g_ctx must be set to call this function. A global is required because
+ * qsort doesn't take an opaque pointer.
+ */
+static int COVER_strict_cmp(const void *lp, const void *rp) {
+  int result = COVER_cmp(g_ctx, lp, rp);
+  if (result == 0) {
+    const U32 lhs = *(const U32 *)lp;
+    const U32 rhs = *(const U32 *)rp;
+    /* Positions are unique, so this is 0 only for an element and itself. */
+    result = (lhs > rhs) - (lhs < rhs);
+  }
+  return result;
+}
+
+/**
+ * Returns the first pointer in [first, last) whose element does not compare
+ * less than value. If no such element exists it returns last.
+ * [first, last) must be sorted in ascending order.
+ */
+static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
+                                       size_t value) {
+  size_t count = last - first;
+  while (count != 0) {
+    size_t step = count / 2;
+    const size_t *ptr = first;
+    ptr += step;
+    if (*ptr < value) {
+      first = ++ptr;
+      count -= step + 1;
+    } else {
+      count = step;
+    }
+  }
+  return first;
+}
+
+/**
+ * Generic groupBy function.
+ * Groups an array sorted by cmp into groups with equivalent values.
+ * Calls grp for each group with pointers to its first element and to one
+ * past its last element. data holds count elements of size bytes each.
+ */
+static void
+COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
+              int (*cmp)(COVER_ctx_t *, const void *, const void *),
+              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
+  const BYTE *ptr = (const BYTE *)data;
+  size_t num = 0;
+  while (num < count) {
+    const BYTE *grpEnd = ptr + size;
+    ++num;
+    /* Extend the group while the next element equals the group's first. */
+    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
+      grpEnd += size;
+      ++num;
+    }
+    grp(ctx, ptr, grpEnd);
+    ptr = grpEnd;
+  }
+}
+
+/*-*************************************
+* Cover functions
+***************************************/
+
+/**
+ * Called on each group of positions with the same dmer.
+ * Counts the frequency of each dmer and saves it in the suffix array.
+ * Fills `ctx->dmerAt`.
+ * The positions in the group must be in ascending order.
+ */
+static void COVER_group(COVER_ctx_t *ctx, const void *group,
+                        const void *groupEnd) {
+  /* The group consists of all the positions with the same first d bytes. */
+  const U32 *grpPtr = (const U32 *)group;
+  const U32 *grpEnd = (const U32 *)groupEnd;
+  /* The dmerId is how we will reference this dmer.
+   * This allows us to map the whole dmer space to a much smaller space, the
+   * size of the suffix array.
+   */
+  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
+  /* Count the number of samples this dmer shows up in */
+  U32 freq = 0;
+  /* Details */
+  const size_t *curOffsetPtr = ctx->offsets;
+  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
+  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
+   * different sample than the last.
+   */
+  size_t curSampleEnd = ctx->offsets[0];
+  for (; grpPtr != grpEnd; ++grpPtr) {
+    /* Save the dmerId for this position so we can get back to it. */
+    ctx->dmerAt[*grpPtr] = dmerId;
+    /* Dictionaries only help for the first reference to the dmer.
+     * After that zstd can reference the match from the previous reference.
+     * So only count each dmer once for each sample it is in.
+     */
+    if (*grpPtr < curSampleEnd) {
+      continue;
+    }
+    freq += 1;
+    /* Binary search to find the end of the sample *grpPtr is in.
+     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
+     * search because the loop is over.
+     * The end of the sample is the first offset strictly greater than the
+     * position, hence the search for *grpPtr + 1. Searching for *grpPtr
+     * itself would return the sample's own start offset when the position is
+     * the first byte of a sample, and the next occurrence in that same sample
+     * would be counted again.
+     * If no offset in [curOffsetPtr, offsetsEnd) qualifies, the result is
+     * offsetsEnd, which points at the last entry of offsets (the total size).
+     */
+    if (grpPtr + 1 != grpEnd) {
+      const size_t *sampleEndPtr =
+          COVER_lower_bound(curOffsetPtr, offsetsEnd, (size_t)*grpPtr + 1);
+      curSampleEnd = *sampleEndPtr;
+      curOffsetPtr = sampleEndPtr + 1;
+    }
+  }
+  /* At this point we are never going to look at this segment of the suffix
+   * array again. We take advantage of this fact to save memory.
+   * We store the frequency of the dmer in the first position of the group,
+   * which is dmerId.
+   */
+  ctx->suffix[dmerId] = freq;
+}
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;    /* first position covered by the segment */
+  U32 end;      /* one past the last position covered by the segment */
+  double score;
+} COVER_segment_t;
+
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of dmer d.
+ * Let L(S) be the length of segment S.
+ * Let S_i be the dmer at position i of segment S.
+ *
+ *             F(S_1) + F(S_2) + ... + F(S_{L(S)-d+1})
+ *  Score(S) = ---------------------------------------
+ *                        smoothing + L(S)
+ *
+ * We try kStep segment lengths in the range [kMin, kMax].
+ * For each segment length we find the best segment according to Score.
+ * We then take the best segment overall according to Score and return it.
+ *
+ * The difference from the paper is that we try multiple segment lengths.
+ * We want to fit the segment length closer to the length of the useful part.
+ * Longer segments allow longer matches, so they are worth more than shorter
+ * ones. However, if the extra length isn't high frequency it hurts us.
+ * We add the smoothing in to give an advantage to longer segments.
+ * The larger smoothing is, the more longer matches are favored.
+ *
+ * The frequencies of the dmers covered by the returned segment are set to
+ * zero in freqs. If no dmer in [begin, end) has a non-zero frequency the
+ * returned segment is empty (begin == end) with a score of 0.
+ */
+static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
+                                           COVER_map_t *activeDmers, U32 begin,
+                                           U32 end, COVER_params_t parameters) {
+  /* Saves the best segment of any length tried */
+  COVER_segment_t globalBestSegment = {0, 0, 0};
+  /* For each segment length */
+  U32 k;
+  U32 step = MAX((parameters.kMax - parameters.kMin) / parameters.kStep, 1);
+  for (k = parameters.kMin; k <= parameters.kMax; k += step) {
+    /* Save the best segment of this length */
+    COVER_segment_t bestSegment = {0, 0, 0};
+    COVER_segment_t activeSegment;
+    const size_t dmersInK = k - ctx->d + 1;
+    /* Reset the activeDmers in the segment */
+    COVER_map_clear(activeDmers);
+    activeSegment.begin = begin;
+    activeSegment.end = begin;
+    activeSegment.score = 0;
+    /* Slide the active segment through the whole epoch.
+     * Save the best segment in bestSegment.
+     */
+    while (activeSegment.end < end) {
+      /* The dmerId for the dmer at the next position */
+      U32 newDmer = ctx->dmerAt[activeSegment.end];
+      /* The entry in activeDmers for this dmerId */
+      U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
+      /* If the dmer isn't already present in the segment add its score. */
+      if (*newDmerOcc == 0) {
+        /* The paper suggests using the L-0.5 norm, but experiments show that
+         * it doesn't help.
+         */
+        activeSegment.score += freqs[newDmer];
+      }
+      /* Add the dmer to the segment */
+      activeSegment.end += 1;
+      *newDmerOcc += 1;
+
+      /* If the window is now too large, drop the first position */
+      if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+        U32 delDmer = ctx->dmerAt[activeSegment.begin];
+        U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
+        activeSegment.begin += 1;
+        *delDmerOcc -= 1;
+        /* If this is the last occurrence of the dmer, subtract its score */
+        if (*delDmerOcc == 0) {
+          COVER_map_remove(activeDmers, delDmer);
+          activeSegment.score -= freqs[delDmer];
+        }
+      }
+
+      /* If this segment is the best so far save it */
+      if (activeSegment.score > bestSegment.score) {
+        bestSegment = activeSegment;
+      }
+    }
+    {
+      /* Trim off the zero frequency head and tail from the segment. */
+      U32 newBegin = bestSegment.end;
+      U32 newEnd = bestSegment.begin;
+      U32 pos;
+      for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+        U32 freq = freqs[ctx->dmerAt[pos]];
+        if (freq != 0) {
+          newBegin = MIN(newBegin, pos);
+          newEnd = pos + 1;
+        }
+      }
+      bestSegment.begin = newBegin;
+      bestSegment.end = newEnd;
+      /* Calculate the final score normalizing for segment length */
+      bestSegment.score /=
+          (parameters.smoothing + (bestSegment.end - bestSegment.begin));
+    }
+    /* If this segment is the best so far for any length save it */
+    if (bestSegment.score > globalBestSegment.score) {
+      globalBestSegment = bestSegment;
+    }
+  }
+  {
+    /* Zero out the frequency of each dmer covered by the chosen segment. */
+    size_t pos;
+    for (pos = globalBestSegment.begin; pos != globalBestSegment.end; ++pos) {
+      freqs[ctx->dmerAt[pos]] = 0;
+    }
+  }
+  return globalBestSegment;
+}
+
+/**
+ * Check the validity of the parameters.
+ * If the parameters are valid and any are default, set them to the correct
+ * values.
+ * Returns 1 on success, 0 on failure.
+ */
+static int COVER_defaultParameters(COVER_params_t *parameters) {
+  /* kMin and d are required parameters */
+  if (parameters->d == 0 || parameters->kMin == 0) {
+    return 0;
+  }
+  /* d <= kMin */
+  if (parameters->d > parameters->kMin) {
+    return 0;
+  }
+  /* If kMax is set (non-zero) then kMin <= kMax */
+  if (parameters->kMax != 0 && parameters->kMax < parameters->kMin) {
+    return 0;
+  }
+  /* If kMax is set, then kStep must be as well */
+  if (parameters->kMax != 0 && parameters->kStep == 0) {
+    return 0;
+  }
+  parameters->kMax = MAX(parameters->kMin, parameters->kMax);
+  parameters->kStep = MAX(1, parameters->kStep);
+  return 1;
+}
+
+/**
+ * Clean up a context initialized with `COVER_ctx_init()`.
+ * Frees suffix, freqs, dmerAt and offsets and resets them to NULL; samples
+ * and samplesSizes are left untouched.
+ */
+static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+  if (ctx->suffix) {
+    free(ctx->suffix);
+    ctx->suffix = NULL;
+  }
+  if (ctx->freqs) {
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+  }
+  if (ctx->dmerAt) {
+    free(ctx->dmerAt);
+    ctx->dmerAt = NULL;
+  }
+  if (ctx->offsets) {
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `COVER_ctx_destroy()`.
+ */
+static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Checks */
+  if (totalSamplesSize < d ||
+      totalSamplesSize > (size_t)COVER_MAX_SAMPLES_SIZE) {
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
+               (U32)totalSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  /* Partial suffix array */
+  ctx->suffixSize = totalSamplesSize - d + 1;
+  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* Maps index to the dmerID */
+  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    COVER_ctx_destroy(ctx);
+    return 0;
+  }
+  ctx->freqs = NULL;
+  ctx->d = d;
+
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
+  {
+    /* suffix is a partial suffix array.
+     * It only sorts suffixes by their first parameters.d bytes.
+     * The sort is stable, so each dmer group is sorted by position in input.
+     */
+    U32 i;
+    for (i = 0; i < ctx->suffixSize; ++i) {
+      ctx->suffix[i] = i;
+    }
+    /* qsort doesn't take an opaque pointer, so pass as a global */
+    g_ctx = ctx;
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  /* For each dmer group (group of positions with the same first d bytes):
+   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
+   *    (groupBeginPtr - suffix). This allows us to go from position to
+   *    dmerID so we can look up values in freq.
+   * 2. We calculate how many samples the dmer occurs in and save it in
+   *    freqs[dmerId].
+   */
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
+                &COVER_group);
+  /* The suffix array now holds the frequencies; hand it over to freqs. */
+  ctx->freqs = ctx->suffix;
+  ctx->suffix = NULL;
+  return 1;
+}
+
+/**
+ * Given the prepared context build the dictionary.
+ * Segments are written from the end of dictBuffer towards its start.
+ * Returns tail, the offset in dictBuffer at which the dictionary content
+ * begins; the content occupies [tail, dictBufferCapacity).
+ * freqs is modified: the dmers of every selected segment are zeroed.
+ */
+static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
+                                    COVER_map_t *activeDmers, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    COVER_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   * The epoch count is clamped to [1, suffixSize]: a dictionary smaller than
+   * kMax would otherwise give zero epochs (division by zero below), and more
+   * epochs than positions would give epochs of size zero.
+   */
+  const size_t wantedEpochs = dictBufferCapacity / parameters.kMax;
+  const U32 epochs = (U32)MAX(1, MIN(wantedEpochs, ctx->suffixSize));
+  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  /* Number of consecutive epochs that had nothing left to select */
+  size_t emptyEpochs = 0;
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    COVER_segment_t segment = COVER_selectSegment(
+        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
+    /* An empty segment means every dmer of this epoch is already covered.
+     * Copy nothing for it, and stop once a full round of epochs is empty:
+     * there is no content left to add.
+     */
+    if (segment.end == segment.begin) {
+      if (++emptyEpochs == epochs) {
+        break;
+      }
+      continue;
+    }
+    emptyEpochs = 0;
+    /* A segment of n dmers spans n + d - 1 bytes; trim it to what fits. */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%% ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+/**
+ * Translate from COVER_params_t to ZDICT_params_t required for finalizing the
+ * dictionary.
+ * Only dictID and compressionLevel are carried over; notificationLevel is
+ * fixed to 1 and every other field is zeroed.
+ */
+static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
+  ZDICT_params_t zdictParams;
+  memset(&zdictParams, 0, sizeof(zdictParams));
+  zdictParams.notificationLevel = 1;
+  zdictParams.dictID = parameters.dictID;
+  zdictParams.compressionLevel = parameters.compressionLevel;
+  return zdictParams;
+}
+
+/**
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Returns the value of ZDICT_finalizeDictionary(), or an error code when the
+ * parameters are invalid, there are no samples, dictBufferCapacity is below
+ * ZDICT_DICTSIZE_MIN, or an allocation fails.
+ */
+ZDICTLIB_API size_t COVER_trainFromBuffer(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  COVER_ctx_t ctx;
+  COVER_map_t activeDmers;
+  size_t rc;
+  /* Checks */
+  if (!COVER_defaultParameters(&parameters)) {
+    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    return ERROR(dstSize_tooSmall);
+  }
+  /* Initialize global data */
+  g_displayLevel = parameters.notificationLevel;
+  /* Initialize context and activeDmers */
+  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                      parameters.d)) {
+    DISPLAYLEVEL(1, "Failed to initialize context\n");
+    return ERROR(GENERIC);
+  }
+  if (!COVER_map_init(&activeDmers, parameters.kMax - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    COVER_ctx_destroy(&ctx);
+    return ERROR(GENERIC);
+  }
+
+  DISPLAYLEVEL(2, "Building dictionary\n");
+  {
+    /* The content is written at the back of dictBuffer, starting at tail */
+    const size_t tail =
+        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
+                              dictBufferCapacity, parameters);
+    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
+    DISPLAYLEVEL(2, "Dictionary content size: %u",
+                 (U32)(dictBufferCapacity - tail));
+    rc = ZDICT_finalizeDictionary(dict, dictBufferCapacity, dict + tail,
+                                  dictBufferCapacity - tail, samplesBuffer,
+                                  samplesSizes, nbSamples, zdictParams);
+  }
+  if (!ZSTD_isError(rc)) {
+    DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)rc);
+  }
+  COVER_ctx_destroy(&ctx);
+  COVER_map_destroy(&activeDmers);
+  return rc;
+}
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods are thread safe if `ZSTD_PTHREAD` is defined.
+ */
+typedef struct COVER_best_s {
+#ifdef ZSTD_PTHREAD
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+  size_t liveJobs;
+#endif
+  void *dict;
+  size_t dictSize;
+  COVER_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * Initialize the `COVER_best_t`.
+ * compressedSize starts at (size_t)-1 so that any real result is smaller.
+ */
+static void COVER_best_init(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+#ifdef ZSTD_PTHREAD
+  pthread_mutex_init(&best->mutex, NULL);
+  pthread_cond_init(&best->cond, NULL);
+  best->liveJobs = 0;
+#endif
+  best->dict = NULL;
+  best->dictSize = 0;
+  best->compressedSize = (size_t)-1;
+  memset(&best->parameters, 0, sizeof(best->parameters));
+}
+
+/**
+ * Wait until liveJobs == 0.
+ * Does nothing when `ZSTD_PTHREAD` is not defined.
+ */
+static void COVER_best_wait(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+#ifdef ZSTD_PTHREAD
+  pthread_mutex_lock(&best->mutex);
+  while (best->liveJobs != 0) {
+    pthread_cond_wait(&best->cond, &best->mutex);
+  }
+  pthread_mutex_unlock(&best->mutex);
+#endif
+}
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+static void COVER_best_destroy(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  COVER_best_wait(best);
+  if (best->dict) {
+    free(best->dict);
+  }
+#ifdef ZSTD_PTHREAD
+  pthread_mutex_destroy(&best->mutex);
+  pthread_cond_destroy(&best->cond);
+#endif
+}
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+static void COVER_best_start(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+#ifdef ZSTD_PTHREAD
+  pthread_mutex_lock(&best->mutex);
+  ++best->liveJobs;
+  pthread_mutex_unlock(&best->mutex);
+#endif
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ * The dictionary is copied, so the caller keeps ownership of dict.
+ * If the copy cannot be allocated, the saved dictionary is dropped and
+ * compressedSize is set to an error code.
+ */
+static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                              COVER_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  {
+#ifdef ZSTD_PTHREAD
+    size_t liveJobs;
+    pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+#endif
+    /* If the new dictionary is better */
+    if (compressedSize < best->compressedSize) {
+      /* Allocate space if necessary */
+      if (!best->dict || best->dictSize < dictSize) {
+        if (best->dict) {
+          free(best->dict);
+        }
+        best->dict = malloc(dictSize);
+      }
+      if (!best->dict) {
+        /* Out of memory: record the failure, then fall through so that the
+         * mutex is released and waiters are woken like on any other exit.
+         */
+        best->compressedSize = ERROR(GENERIC);
+        best->dictSize = 0;
+      } else {
+        /* Save the dictionary, parameters, and size */
+        memcpy(best->dict, dict, dictSize);
+        best->dictSize = dictSize;
+        best->parameters = parameters;
+        best->compressedSize = compressedSize;
+      }
+    }
+#ifdef ZSTD_PTHREAD
+    /* Broadcast while still holding the mutex: once it is released a waiter
+     * may see liveJobs == 0 and destroy the condition variable.
+     */
+    if (liveJobs == 0) {
+      pthread_cond_broadcast(&best->cond);
+    }
+    pthread_mutex_unlock(&best->mutex);
+#endif
+  }
+}
+
+/**
+ * Parameters for COVER_tryParameters().
+ */
+typedef struct COVER_tryParameters_data_s {
+  const COVER_ctx_t *ctx;
+  COVER_best_t *best;
+  size_t dictBufferCapacity;
+  COVER_params_t parameters;
+} COVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if ZSTD_PTHREAD is defined.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ * The opaque pointer is freed before returning, and COVER_best_finish() is
+ * called exactly once, on success and on every error path.
+ */
+static void COVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)opaque;
+  const COVER_ctx_t *ctx = data->ctx;
+  COVER_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  COVER_map_t activeDmers;
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  if (!COVER_map_init(&activeDmers, parameters.kMax - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    goto _cleanup;
+  }
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate dictionary buffer\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
+                                              dictBufferCapacity, parameters);
+    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
+    /* From here on dictBufferCapacity holds the finalized dictionary size */
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      for (i = 0; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict =
+        ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    totalCompressedSize = 0;
+    for (i = 0; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    if (dst) {
+      free(dst);
+    }
+  }
+
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  COVER_map_destroy(&activeDmers);
+  if (dict) {
+    free(dict);
+  }
+  if (freqs) {
+    free(freqs);
+  }
+}
+
+/**
+ * Tries combinations of {d, kMin, kMax, smoothing}, keeps the dictionary
+ * with the smallest total compressed size over the samples, copies it into
+ * dictBuffer and stores the winning parameters in *parameters.
+ * Returns the dictionary size, or an error code if no combination succeeded.
+ */
+ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
+                                                  size_t dictBufferCapacity,
+                                                  const void *samplesBuffer,
+                                                  const size_t *samplesSizes,
+                                                  unsigned nbSamples,
+                                                  COVER_params_t *parameters) {
+  /* constants */
+  const unsigned dMin = parameters->d == 0 ? 6 : parameters->d;
+  const unsigned dMax = parameters->d == 0 ? 16 : parameters->d;
+  const unsigned min = parameters->kMin == 0 ? 32 : parameters->kMin;
+  const unsigned max = parameters->kMax == 0 ? 1024 : parameters->kMax;
+  const unsigned kStep = parameters->kStep == 0 ? 8 : parameters->kStep;
+  const unsigned step = MAX((max - min) / kStep, 1);
+  /* Local variables */
+  unsigned iteration = 1;
+  /* Estimate of the number of combinations; only used for the progress */
+  const unsigned iterations =
+      (1 + (dMax - dMin) / 2) * (((1 + kStep) * (2 + kStep)) / 2) * 4;
+  const int displayLevel = parameters->notificationLevel;
+  unsigned d;
+  COVER_best_t best;
+  COVER_best_init(&best);
+  /* Turn down display level to clean up display at level 2 and below */
+  g_displayLevel = parameters->notificationLevel - 1;
+  /* Loop through d first because each new value needs a new context */
+  LOCALDISPLAYLEVEL(displayLevel, 3, "Trying %u different sets of parameters\n",
+                    iterations);
+  for (d = dMin; d <= dMax; d += 2) {
+    unsigned kMin;
+    /* Initialize the context for this value of d */
+    COVER_ctx_t ctx;
+    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+      COVER_best_destroy(&best);
+      return ERROR(GENERIC);
+    }
+    /* Loop through the rest of the parameters reusing the same context */
+    for (kMin = min; kMin <= max; kMin += step) {
+      unsigned kMax;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "kMin=%u\n", kMin);
+      /* A segment must hold at least one dmer (d <= kMin); otherwise
+       * kMax - d + 1 wraps around when the dmer map is sized.
+       */
+      if (kMin < d) {
+        continue;
+      }
+      for (kMax = kMin; kMax <= max; kMax += step) {
+        unsigned smoothing;
+        LOCALDISPLAYLEVEL(displayLevel, 3, "kMax=%u\n", kMax);
+        /* Start at 1 or more: starting from 0 (kMin < 4), doubling would
+         * never advance and the loop would not terminate.
+         */
+        for (smoothing = MAX(kMin / 4, 1); smoothing <= kMin * 2;
+             smoothing *= 2) {
+          /* Prepare the arguments */
+          COVER_tryParameters_data_t *data =
+              (COVER_tryParameters_data_t *)malloc(
+                  sizeof(COVER_tryParameters_data_t));
+          LOCALDISPLAYLEVEL(displayLevel, 3, "smoothing=%u\n", smoothing);
+          if (!data) {
+            LOCALDISPLAYLEVEL(displayLevel, 1,
+                              "Failed to allocate parameters\n");
+            COVER_best_destroy(&best);
+            COVER_ctx_destroy(&ctx);
+            return ERROR(GENERIC);
+          }
+          data->ctx = &ctx;
+          data->best = &best;
+          data->dictBufferCapacity = dictBufferCapacity;
+          data->parameters = *parameters;
+          data->parameters.d = d;
+          data->parameters.kMin = kMin;
+          data->parameters.kStep = kStep;
+          data->parameters.kMax = kMax;
+          data->parameters.smoothing = smoothing;
+          /* Call the function and pass ownership of data to it */
+          COVER_best_start(&best);
+          COVER_tryParameters(data);
+          /* Print status */
+          LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
+                             (U32)((iteration * 100) / iterations));
+          ++iteration;
+        }
+      }
+    }
+    /* Every job must be done before the context it reads is destroyed */
+    COVER_best_wait(&best);
+    COVER_ctx_destroy(&ctx);
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+  /* Fill the output buffer and parameters with output of the best parameters */
+  {
+    const size_t dictSize = best.dictSize;
+    if (ZSTD_isError(best.compressedSize)) {
+      COVER_best_destroy(&best);
+      return best.compressedSize;
+    }
+    *parameters = best.parameters;
+    memcpy(dictBuffer, best.dict, dictSize);
+    COVER_best_destroy(&best);
+    return dictSize;
+  }
+}
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h
index 63b8f072..4a2e7944 100644
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -86,6 +86,58 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict
         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
         ZDICT_params_t parameters);
 
+/*! COVER_params_t :
+    For all values 0 means default.
+    kMin and d are the only required parameters.
+*/
+typedef struct {
+    unsigned d; /* dmer size : constraint: <= kMin : Should probably be in the range [6, 16]. */
+    unsigned kMin; /* Minimum segment size : constraint: > 0 */
+    unsigned kStep; /* Try kStep segment lengths uniformly distributed in the range [kMin, kMax] : 0 (default) only if kMax == 0 */
+    unsigned kMax; /* Maximum segment size : 0 = kMin (default) : constraint : 0 or >= kMin */
+    unsigned smoothing; /* Higher smoothing => larger segments are selected. Only useful if kMax > kMin. */
+
+    unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */
+    int compressionLevel; /* 0 means default; target a specific zstd compression level */
+} COVER_params_t;
+
+
+/*! COVER_trainFromBuffer() :
+    Train a dictionary from an array of samples using the COVER algorithm.
+    Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+    supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+    The resulting dictionary will be saved into `dictBuffer`.
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code, which can be tested with ZDICT_isError().
+    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
+           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+           In general, it's recommended to provide a few thousand samples, but this can vary a lot.
+           It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+*/
+ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              COVER_params_t parameters);
+
+/*! COVER_optimizeTrainFromBuffer() :
+    The same requirements as above hold for all the parameters except `parameters`.
+    This function tries many parameter combinations and picks the best parameters.
+    `*parameters` is filled with the best parameters found, and the dictionary
+    constructed with those parameters is stored in `dictBuffer`.
+
+    All of the {d, kMin, kStep, kMax} are optional, and smoothing is ignored.
+    If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+    If kStep is non-zero then it is used, otherwise we pick 8.
+    If kMin and kMax are non-zero, then they limit the search space for kMin and kMax,
+    otherwise we check kMin and kMax values in the range [32, 1024].
+
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code, which can be tested with ZDICT_isError().
+              On success `*parameters` contains the parameters selected.
+*/
+ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+                              COVER_params_t *parameters);
 
 /*! ZDICT_finalizeDictionary() :