Edit distance match finder

2020-03-13 09:39:33 -05:00 · 2020-03-13 09:39:33 -05:00 · dbdeca1412
commit dbdeca1412
parent 66607d0eac
3 changed files with 670 additions and 0 deletions
--- a/contrib/match_finders/README.md
+++ b/contrib/match_finders/README.md
@ -0,0 +1,42 @@
 ## Edit Distance Match Finder
 ```
 /* This match finder leverages techniques used in file comparison algorithms
 * to find matches between a dictionary and a source file.
 * 
 * The original motivation for studying this approach was to try and optimize 
 * Zstandard for the use case of patching: the most common scenario being 
 * updating an existing software package with the next version. When patching,
 * the difference between the old version of the package and the new version 
 * is generally tiny (most of the new file will be identical to 
 * the old one). In more technical terms, the edit distance (the minimal number 
 * of changes required to take one sequence of bytes to another) between the 
 * files would be small relative to the size of the file. 
 * 
 * Various 'diffing' algorithms utilize this notion of edit distance and 
 * the corrensponding concept of a minimal edit script between two 
 * sequences to identify the regions within two files where they differ. 
 * The core algorithm used in this match finder is described in: 
 * 
 * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
 *    Algorithmica Vol. 1, 1986, pp. 251-266,
 *    <https://doi.org/10.1007/BF01840446>.
 * 
 * Additional algorithmic heuristics for speed improvement have also been included.
 * These we inspired from implementations of various regular and binary diffing 
 * algorithms such as GNU diff, bsdiff, and Xdelta. 
 * 
 * Note: after some experimentation, this approach proved to not provide enough 
 * utility to justify the additional CPU used in finding matches. The one area
 * where this approach consistenly outperforms Zstandard even on level 19 is 
 * when compressing small files (<10 KB) using a equally small dictionary that 
 * is very similar to the source file. For the use case that this was intended,
 * (large similar files) this approach by itself took 5-10X longer than zstd-19 and 
 * generally resulted in 2-3X larger files. The core advantage that zstd-19 has 
 * over this appraoch for match finding is the overlapping matches. This approach 
 * cannot find any. 
 * 
 * I'm leaving this in the contrib section in case this ever becomes interesting 
 * to explore again.
 * */
 ```
--- a/contrib/match_finders/zstd_edist.c
+++ b/contrib/match_finders/zstd_edist.c
@ -0,0 +1,558 @@
 /*
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
 /*-*************************************
 *  Dependencies
 ***************************************/
 /* Currently relies on qsort when combining contiguous matches. This can probabily 
 * be avoided but would require changes to the algorithm. The qsort is far from 
 * the bottleneck in this algorithm even for medium sized files so it's probably 
 * not worth trying to address */ 
 #include <stdlib.h>
 #include <assert.h>
 #include "zstd_edist.h"
 #include "mem.h"
 /*-*************************************
 *  Constants
 ***************************************/
 /* Just a sential for the entires of the diagnomal matrix */
 #define ZSTD_EDIST_DIAG_MAX (S32)(1 << 30)
 /* How large should a snake be to be considered a 'big' snake. 
 * For an explanation of what a 'snake' is with respect to the 
 * edit distance matrix, see the linked paper in zstd_edist.h */
 #define ZSTD_EDIST_SNAKE_THRESH 20
 /* After how many iterations should we start to use the heuristic
 * based on 'big' snakes */
 #define ZSTD_EDIST_SNAKE_ITER_THRESH 200
 /* After how many iterations should be just give up and take 
 * the best availabe edit script for this round */ 
 #define ZSTD_EDIST_EXPENSIVE_THRESH 1024
 /*-*************************************
 *  Structures
 ***************************************/
 typedef struct {
    U32 dictIdx;
    U32 srcIdx;
    U32 matchLength;
 } ZSTD_eDist_match;
 typedef struct {
    const BYTE* dict;
    const BYTE* src;
    size_t dictSize;
    size_t srcSize;
    S32* forwardDiag;            /* Entires of the forward diagonal stored here */ 
    S32* backwardDiag;           /* Entires of the backward diagonal stored here.
                                  *   Note: this buffer and the 'forwardDiag' buffer 
                                  *   are contiguous. See the ZSTD_eDist_genSequences */ 
    ZSTD_eDist_match* matches;   /* Accumulate matches of length 1 in this buffer. 
                                  *   In a subsequence post-processing step, we combine 
                                  *   contiguous matches. */
    U32 nbMatches;
 } ZSTD_eDist_state;
 typedef struct {
    S32 dictMid;           /* The mid diagonal for the dictionary */
    S32 srcMid;            /* The mid diagonal for the source */ 
    int lowUseHeuristics;  /* Should we use heuristics for the low part */
    int highUseHeuristics; /* Should we use heuristics for the high part */ 
 } ZSTD_eDist_partition;
 /*-*************************************
 *  Internal
 ***************************************/
 static void ZSTD_eDist_diag(ZSTD_eDist_state* state,
                    ZSTD_eDist_partition* partition,
                    S32 dictLow, S32 dictHigh, S32 srcLow, 
                    S32 srcHigh, int useHeuristics)
 {
    S32* const forwardDiag = state->forwardDiag;
    S32* const backwardDiag = state->backwardDiag;
    const BYTE* const dict = state->dict;
    const BYTE* const src = state->src;
    S32 const diagMin = dictLow - srcHigh;
    S32 const diagMax = dictHigh - srcLow;
    S32 const forwardMid = dictLow - srcLow;
    S32 const backwardMid = dictHigh - srcHigh;
    S32 forwardMin = forwardMid;
    S32 forwardMax = forwardMid;
    S32 backwardMin = backwardMid;
    S32 backwardMax = backwardMid;
    int odd = (forwardMid - backwardMid) & 1;
    U32 iterations;
    forwardDiag[forwardMid] = dictLow;
    backwardDiag[backwardMid] = dictHigh;
    /* Main loop for updating diag entries. Unless useHeuristics is 
     * set to false, this loop will run until it finds the minimal 
     * edit script */ 
    for (iterations = 1;;iterations++) {
        S32 diag;
        int bigSnake = 0;
        if (forwardMin > diagMin) {
            forwardMin--;
            forwardDiag[forwardMin - 1] = -1;
        } else {
            forwardMin++;
        }
        if (forwardMax < diagMax) {
            forwardMax++;
            forwardDiag[forwardMax + 1] = -1;
        } else {
            forwardMax--;
        }
        for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
            S32 dictIdx;
            S32 srcIdx;
            S32 low = forwardDiag[diag - 1];
            S32 high = forwardDiag[diag + 1];
            S32 dictIdx0 = low < high ? high : low + 1;
            for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
                dictIdx < dictHigh && srcIdx < srcHigh && dict[dictIdx] == src[srcIdx];
                dictIdx++, srcIdx++) continue;
            if (dictIdx - dictIdx0 > ZSTD_EDIST_SNAKE_THRESH)
                bigSnake = 1;
            forwardDiag[diag] = dictIdx;
            if (odd && backwardMin <= diag && diag <= backwardMax && backwardDiag[diag] <= dictIdx) {
                partition->dictMid = dictIdx;
                partition->srcMid = srcIdx;
                partition->lowUseHeuristics = 0;
                partition->highUseHeuristics = 0;
                return;
            }
        }
        if (backwardMin > diagMin) {
            backwardMin--;
            backwardDiag[backwardMin - 1] = ZSTD_EDIST_DIAG_MAX;
        } else {
            backwardMin++;
        }
        if (backwardMax < diagMax) {
            backwardMax++;
            backwardDiag[backwardMax + 1] = ZSTD_EDIST_DIAG_MAX;
        } else {
            backwardMax--;
        }
        for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
            S32 dictIdx;
            S32 srcIdx;
            S32 low = backwardDiag[diag - 1];
            S32 high = backwardDiag[diag + 1];
            S32 dictIdx0 = low < high ? low : high - 1;
            for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
                dictLow < dictIdx && srcLow < srcIdx && dict[dictIdx - 1] == src[srcIdx - 1];
                dictIdx--, srcIdx--) continue;
            if (dictIdx0 - dictIdx > ZSTD_EDIST_SNAKE_THRESH)
                bigSnake = 1;
            backwardDiag[diag] = dictIdx;
            if (!odd && forwardMin <= diag && diag <= forwardMax && dictIdx <= forwardDiag[diag]) {
                partition->dictMid = dictIdx;
                partition->srcMid = srcIdx;
                partition->lowUseHeuristics = 0;
                partition->highUseHeuristics = 0;
                return;
            }
        }
        if (!useHeuristics)
            continue;
        /* Everything under this point is a heuritic. Using these will 
         * substantially speed up the match finding. In some cases, taking 
         * the total match finding time from several minutes to seconds.
         * Of course, the caveat is that the edit script found may no longer 
         * be optimal */ 
        /* Big snake heuristic */ 
        if (iterations > ZSTD_EDIST_SNAKE_ITER_THRESH && bigSnake) {
            {
                S32 best = 0;
                for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
                    S32 diagDiag = diag - forwardMid;
                    S32 dictIdx = forwardDiag[diag];
                    S32 srcIdx = dictIdx - diag;
                    S32 v = (dictIdx - dictLow) * 2 - diagDiag;
                    if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
                        if (v > best 
                          && dictLow + ZSTD_EDIST_SNAKE_THRESH <= dictIdx && dictIdx <= dictHigh
                          && srcLow + ZSTD_EDIST_SNAKE_THRESH <= srcIdx && srcIdx <= srcHigh) {
                            S32 k;
                            for (k = 1; dict[dictIdx - k] == src[srcIdx - k]; k++) {
                                if (k == ZSTD_EDIST_SNAKE_THRESH) {
                                    best = v;
                                    partition->dictMid = dictIdx;
                                    partition->srcMid = srcIdx;
                                    break;
                                }
                            }
                        }
                    }
                }
                if (best > 0) {
                    partition->lowUseHeuristics = 0;
                    partition->highUseHeuristics = 1;
                    return;
                }
            }
            {
                S32 best = 0;
                for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
                    S32 diagDiag = diag - backwardMid;
                    S32 dictIdx = backwardDiag[diag];
                    S32 srcIdx = dictIdx - diag;
                    S32 v = (dictHigh - dictIdx) * 2 + diagDiag;
                    if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
                        if (v > best 
                          && dictLow < dictIdx && dictIdx <= dictHigh - ZSTD_EDIST_SNAKE_THRESH
                          && srcLow < srcIdx && srcIdx <= srcHigh - ZSTD_EDIST_SNAKE_THRESH) {
                            int k;
                            for (k = 0; dict[dictIdx + k] == src[srcIdx + k]; k++) {
                                if (k == ZSTD_EDIST_SNAKE_THRESH - 1) { 
                                    best = v;
                                    partition->dictMid = dictIdx;
                                    partition->srcMid = srcIdx;
                                    break; 
                                }
                            }
                        }
                    }
                }
                if (best > 0) {
                    partition->lowUseHeuristics = 1;
                    partition->highUseHeuristics = 0;
                    return;
                }
            }
        }
        /* More general 'too expensive' heuristic */ 
        if (iterations >= ZSTD_EDIST_EXPENSIVE_THRESH) {
            S32 forwardDictSrcBest;
            S32 forwardDictBest = 0;
            S32 backwardDictSrcBest;
            S32 backwardDictBest = 0;
            forwardDictSrcBest = -1;
            for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
                S32 dictIdx = MIN(forwardDiag[diag], dictHigh);
                S32 srcIdx = dictIdx - diag;
                if (srcHigh < srcIdx) {
                    dictIdx = srcHigh + diag;
                    srcIdx = srcHigh;
                }
                if (forwardDictSrcBest < dictIdx + srcIdx) {
                    forwardDictSrcBest = dictIdx + srcIdx;
                    forwardDictBest = dictIdx;
                }
            }
            backwardDictSrcBest = ZSTD_EDIST_DIAG_MAX;
            for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
                S32 dictIdx = MAX(dictLow, backwardDiag[diag]);
                S32 srcIdx = dictIdx - diag;
                if (srcIdx < srcLow) {
                    dictIdx = srcLow + diag;
                    srcIdx = srcLow;
                }
                if (dictIdx + srcIdx < backwardDictSrcBest) {
                    backwardDictSrcBest = dictIdx + srcIdx;
                    backwardDictBest = dictIdx;
                }
            }
            if ((dictHigh + srcHigh) - backwardDictSrcBest < forwardDictSrcBest - (dictLow + srcLow)) {
                partition->dictMid = forwardDictBest;
                partition->srcMid = forwardDictSrcBest - forwardDictBest;
                partition->lowUseHeuristics = 0;
                partition->highUseHeuristics = 1;
            } else {
                partition->dictMid = backwardDictBest;
                partition->srcMid = backwardDictSrcBest - backwardDictBest;
                partition->lowUseHeuristics = 1;
                partition->highUseHeuristics = 0;
            }
            return;
        }
    }
 }
 static void ZSTD_eDist_insertMatch(ZSTD_eDist_state* state, 
                    S32 const dictIdx, S32 const srcIdx)
 {
    state->matches[state->nbMatches].dictIdx = dictIdx;
    state->matches[state->nbMatches].srcIdx = srcIdx;
    state->matches[state->nbMatches].matchLength = 1;
    state->nbMatches++;
 }
 static int ZSTD_eDist_compare(ZSTD_eDist_state* state,
                    S32 dictLow, S32 dictHigh, S32 srcLow,
                    S32 srcHigh, int useHeuristics)
 {
    const BYTE* const dict = state->dict;
    const BYTE* const src = state->src;
    /* Found matches while traversing from the low end */ 
    while (dictLow < dictHigh && srcLow < srcHigh && dict[dictLow] == src[srcLow]) {
        ZSTD_eDist_insertMatch(state, dictLow, srcLow);
        dictLow++;
        srcLow++;
    }
    /* Found matches while traversing from the high end */
    while (dictLow < dictHigh && srcLow < srcHigh && dict[dictHigh - 1] == src[srcHigh - 1]) {
        ZSTD_eDist_insertMatch(state, dictHigh - 1, srcHigh - 1);
        dictHigh--;
        srcHigh--;
    }
    /* If the low and high end end up touching. If we wanted to make 
     * note of the differences like most diffing algorithms do, we would 
     * do so here. In our case, we're only concerned with matches 
     * Note: if you wanted to find the edit distance of the algorithm, 
     *   you could just accumulate the cost for an insertion/deletion 
     *   below. */ 
    if (dictLow == dictHigh) {
        while (srcLow < srcHigh) {
            /* Reaching this point means inserting src[srcLow] into 
             * the current position of dict */ 
            srcLow++;
        }
    } else if (srcLow == srcHigh) {
        while (dictLow < dictHigh) {
            /* Reaching this point means deleteing dict[dictLow] from 
             * the current positino of dict */ 
            dictLow++;
        }
    } else {
        ZSTD_eDist_partition partition;
        partition.dictMid = 0;
        partition.srcMid = 0;
        ZSTD_eDist_diag(state, &partition, dictLow, dictHigh, 
            srcLow, srcHigh, useHeuristics);
        if (ZSTD_eDist_compare(state, dictLow, partition.dictMid, 
          srcLow, partition.srcMid, partition.lowUseHeuristics))
            return 1;
        if (ZSTD_eDist_compare(state, partition.dictMid, dictHigh,
          partition.srcMid, srcHigh, partition.highUseHeuristics))
            return 1;
    }
    return 0;
 }
 static int ZSTD_eDist_matchComp(const void* p, const void* q)
 {
    S32 const l = ((ZSTD_eDist_match*)p)->srcIdx;
    S32 const r = ((ZSTD_eDist_match*)q)->srcIdx;
    return (l - r);
 }
 /* The matches from the approach above will all be of the form 
 * (dictIdx, srcIdx, 1). this method combines contiguous matches 
 * of length MINMATCH or greater. Matches less than MINMATCH 
 * are discarded */ 
 static void ZSTD_eDist_combineMatches(ZSTD_eDist_state* state)
 {
    /* Create a new buffer to put the combined matches into 
     * and memcpy to state->matches after */ 
    ZSTD_eDist_match* combinedMatches = 
        ZSTD_malloc(state->nbMatches * sizeof(ZSTD_eDist_match), 
        ZSTD_defaultCMem);
    U32 nbCombinedMatches = 1;
    size_t i;
    /* Make sure that the srcIdx and dictIdx are in sorted order.
     * The combination step won't work otherwise */ 
    qsort(state->matches, state->nbMatches, sizeof(ZSTD_eDist_match), ZSTD_eDist_matchComp);
    memcpy(combinedMatches, state->matches, sizeof(ZSTD_eDist_match));
    for (i = 1; i < state->nbMatches; i++) {
        ZSTD_eDist_match const match = state->matches[i];
        ZSTD_eDist_match const combinedMatch = 
            combinedMatches[nbCombinedMatches - 1];
        if (combinedMatch.srcIdx + combinedMatch.matchLength == match.srcIdx && 
          combinedMatch.dictIdx + combinedMatch.matchLength == match.dictIdx) {
            combinedMatches[nbCombinedMatches - 1].matchLength++;
        } else {
            /* Discard matches that are less than MINMATCH */
            if (combinedMatches[nbCombinedMatches - 1].matchLength < MINMATCH) {
                nbCombinedMatches--;
            }
            memcpy(combinedMatches + nbCombinedMatches, 
                state->matches + i, sizeof(ZSTD_eDist_match));
            nbCombinedMatches++;
        }
    }
    memcpy(state->matches, combinedMatches, nbCombinedMatches * sizeof(ZSTD_eDist_match));
    state->nbMatches = nbCombinedMatches;
    ZSTD_free(combinedMatches, ZSTD_defaultCMem);
 }
 static size_t ZSTD_eDist_convertMatchesToSequences(ZSTD_Sequence* sequences, 
    ZSTD_eDist_state* state)
 {
    const ZSTD_eDist_match* matches = state->matches;
    size_t const nbMatches = state->nbMatches;
    size_t const dictSize = state->dictSize;
    size_t nbSequences = 0;
    size_t i;
    for (i = 0; i < nbMatches; i++) {
        ZSTD_eDist_match const match = matches[i];
        U32 const litLength = !i ? match.srcIdx : 
            match.srcIdx - (matches[i - 1].srcIdx + matches[i - 1].matchLength);
        U32 const offset = (match.srcIdx + dictSize) - match.dictIdx;
        U32 const matchLength = match.matchLength;
        sequences[nbSequences].offset = offset;
        sequences[nbSequences].litLength = litLength;
        sequences[nbSequences].matchLength = matchLength;
        nbSequences++;
    }
    return nbSequences;
 }
 /*-*************************************
 *  Interal utils
 ***************************************/
 static size_t ZSTD_eDist_hamingDist(const BYTE* const a,
                        const BYTE* const b, size_t n)
 {
    size_t i;
    size_t dist = 0;
    for (i = 0; i < n; i++)
        dist += a[i] != b[i];
    return dist; 
 }
 /* This is a pretty naive recursive implementation that should only
 * be used for quick tests obviously. Don't try and run this on a 
 * GB file or something. There are faster implementations. Use those
 * if you need to run it for large files. */
 static size_t ZSTD_eDist_levenshteinDist(const BYTE* const s,
                        size_t const sn, const BYTE* const t,
                        size_t const tn)
 {
    size_t a, b, c;
    if (!sn)
        return tn;
    if (!tn)
        return sn;
    if (s[sn - 1] == t[tn - 1])
        return ZSTD_eDist_levenshteinDist(
            s, sn - 1, t, tn - 1);
    a = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn - 1);
    b = ZSTD_eDist_levenshteinDist(s, sn, t, tn - 1);
    c = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn);
    if (a > b)
        a = b;
    if (a > c)
        a = c;
    return a + 1;
 }
 static void ZSTD_eDist_validateMatches(ZSTD_eDist_match* matches,
                        size_t const nbMatches, const BYTE* const dict,
                        size_t const dictSize, const BYTE* const src,
                        size_t const srcSize)
 {
    size_t i;
    for (i = 0; i < nbMatches; i++) {
        ZSTD_eDist_match match = matches[i];
        U32 const dictIdx = match.dictIdx;
        U32 const srcIdx = match.srcIdx;
        U32 const matchLength = match.matchLength;
        assert(dictIdx + matchLength < dictSize);
        assert(srcIdx + matchLength < srcSize);
        assert(!memcmp(dict + dictIdx, src + srcIdx, matchLength));
    }
 }
 /*-*************************************
 *  API
 ***************************************/
 size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences, 
                        const void* dict, size_t dictSize,
                        const void* src, size_t srcSize,
                        int useHeuristics)
 {
    size_t const nbDiags = dictSize + srcSize + 3;
    S32* buffer = ZSTD_malloc(nbDiags * 2 * sizeof(S32), ZSTD_defaultCMem);
    ZSTD_eDist_state state;
    size_t nbSequences = 0;
    state.dict = (const BYTE*)dict;
    state.src = (const BYTE*)src;
    state.dictSize = dictSize;
    state.srcSize = srcSize;
    state.forwardDiag = buffer;
    state.backwardDiag = buffer + nbDiags;
    state.forwardDiag += srcSize + 1;
    state.backwardDiag += srcSize + 1;
    state.matches = ZSTD_malloc(srcSize * sizeof(ZSTD_eDist_match), ZSTD_defaultCMem);
    state.nbMatches = 0;
    ZSTD_eDist_compare(&state, 0, dictSize, 0, srcSize, 1);
    ZSTD_eDist_combineMatches(&state);
    nbSequences = ZSTD_eDist_convertMatchesToSequences(sequences, &state);
    ZSTD_free(buffer, ZSTD_defaultCMem);
    ZSTD_free(state.matches, ZSTD_defaultCMem);
    return nbSequences;
 }
--- a/contrib/match_finders/zstd_edist.h
+++ b/contrib/match_finders/zstd_edist.h
@ -0,0 +1,70 @@
 /*
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
 /* This match finder leverages techniques used in file comparison algorithms
 * to find matches between a dictionary and a source file.
 * 
 * The original motivation for studying this approach was to try and optimize 
 * Zstandard for the use case of patching: the most common scenario being 
 * updating an existing software package with the next version. When patching,
 * the difference between the old version of the package and the new version 
 * is generally tiny (most of the new file will be identical to 
 * the old one). In more technical terms, the edit distance (the minimal number 
 * of changes required to take one sequence of bytes to another) between the 
 * files would be small relative to the size of the file. 
 * 
 * Various 'diffing' algorithms utilize this notion of edit distance and 
 * the corrensponding concept of a minimal edit script between two 
 * sequences to identify the regions within two files where they differ. 
 * The core algorithm used in this match finder is described in: 
 * 
 * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
 *    Algorithmica Vol. 1, 1986, pp. 251-266,
 *    <https://doi.org/10.1007/BF01840446>.
 * 
 * Additional algorithmic heuristics for speed improvement have also been included.
 * These we inspired from implementations of various regular and binary diffing 
 * algorithms such as GNU diff, bsdiff, and Xdelta. 
 * 
 * Note: after some experimentation, this approach proved to not provide enough 
 * utility to justify the additional CPU used in finding matches. The one area
 * where this approach consistenly outperforms Zstandard even on level 19 is 
 * when compressing small files (<10 KB) using a equally small dictionary that 
 * is very similar to the source file. For the use case that this was intended,
 * (large similar files) this approach by itself took 5-10X longer than zstd-19 and 
 * generally resulted in 2-3X larger files. The core advantage that zstd-19 has 
 * over this appraoch for match finding is the overlapping matches. This approach 
 * cannot find any. 
 * 
 * I'm leaving this in the contrib section in case this ever becomes interesting 
 * to explore again.
 * */
 #ifndef ZSTD_EDIST_H
 #define ZSTD_EDIST_H
 /*-*************************************
 *  Dependencies
 ***************************************/
 #include <stddef.h>
 #include "zstd_internal.h" /* ZSTD_Sequence */
 /*! ZSTD_eDist_genSequences() : 
 * Will populate the provided ZSTD_Sequence buffer with sequences 
 * based on the optimal or near-optimal (depending on 'useHeuristics')
 * edit script between 'dict' and 'src.' 
 * @return : the number of sequences found */ 
 size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences, 
                        const void* dict, size_t dictSize,
                        const void* src, size_t srcSize,
                        int useHeuristics);
 #endif