Merge pull request #2036 from bimbashrestha/edist
[contrib] Edit distance match finder
This commit is contained in:
commit
40414f984a
42
contrib/match_finders/README.md
Normal file
42
contrib/match_finders/README.md
Normal file
@ -0,0 +1,42 @@
|
||||
## Edit Distance Match Finder
|
||||
|
||||
```
|
||||
/* This match finder leverages techniques used in file comparison algorithms
|
||||
* to find matches between a dictionary and a source file.
|
||||
*
|
||||
* The original motivation for studying this approach was to try and optimize
|
||||
* Zstandard for the use case of patching: the most common scenario being
|
||||
* updating an existing software package with the next version. When patching,
|
||||
* the difference between the old version of the package and the new version
|
||||
* is generally tiny (most of the new file will be identical to
|
||||
* the old one). In more technical terms, the edit distance (the minimal number
|
||||
* of changes required to take one sequence of bytes to another) between the
|
||||
* files would be small relative to the size of the file.
|
||||
*
|
||||
* Various 'diffing' algorithms utilize this notion of edit distance and
|
||||
* the corrensponding concept of a minimal edit script between two
|
||||
* sequences to identify the regions within two files where they differ.
|
||||
* The core algorithm used in this match finder is described in:
|
||||
*
|
||||
* "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
|
||||
* Algorithmica Vol. 1, 1986, pp. 251-266,
|
||||
* <https://doi.org/10.1007/BF01840446>.
|
||||
*
|
||||
* Additional algorithmic heuristics for speed improvement have also been included.
|
||||
* These we inspired from implementations of various regular and binary diffing
|
||||
* algorithms such as GNU diff, bsdiff, and Xdelta.
|
||||
*
|
||||
* Note: after some experimentation, this approach proved to not provide enough
|
||||
* utility to justify the additional CPU used in finding matches. The one area
|
||||
* where this approach consistenly outperforms Zstandard even on level 19 is
|
||||
* when compressing small files (<10 KB) using a equally small dictionary that
|
||||
* is very similar to the source file. For the use case that this was intended,
|
||||
* (large similar files) this approach by itself took 5-10X longer than zstd-19 and
|
||||
* generally resulted in 2-3X larger files. The core advantage that zstd-19 has
|
||||
* over this appraoch for match finding is the overlapping matches. This approach
|
||||
* cannot find any.
|
||||
*
|
||||
* I'm leaving this in the contrib section in case this ever becomes interesting
|
||||
* to explore again.
|
||||
* */
|
||||
```
|
558
contrib/match_finders/zstd_edist.c
Normal file
558
contrib/match_finders/zstd_edist.c
Normal file
@ -0,0 +1,558 @@
|
||||
/*
|
||||
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under both the BSD-style license (found in the
|
||||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
||||
* in the COPYING file in the root directory of this source tree).
|
||||
* You may select, at your option, one of the above-listed licenses.
|
||||
*/
|
||||
|
||||
/*-*************************************
|
||||
* Dependencies
|
||||
***************************************/
|
||||
|
||||
/* Currently relies on qsort when combining contiguous matches. This can probabily
|
||||
* be avoided but would require changes to the algorithm. The qsort is far from
|
||||
* the bottleneck in this algorithm even for medium sized files so it's probably
|
||||
* not worth trying to address */
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "zstd_edist.h"
|
||||
#include "mem.h"
|
||||
|
||||
/*-*************************************
|
||||
* Constants
|
||||
***************************************/
|
||||
|
||||
/* Just a sential for the entires of the diagnomal matrix */
|
||||
#define ZSTD_EDIST_DIAG_MAX (S32)(1 << 30)
|
||||
|
||||
/* How large should a snake be to be considered a 'big' snake.
|
||||
* For an explanation of what a 'snake' is with respect to the
|
||||
* edit distance matrix, see the linked paper in zstd_edist.h */
|
||||
#define ZSTD_EDIST_SNAKE_THRESH 20
|
||||
|
||||
/* After how many iterations should we start to use the heuristic
|
||||
* based on 'big' snakes */
|
||||
#define ZSTD_EDIST_SNAKE_ITER_THRESH 200
|
||||
|
||||
/* After how many iterations should be just give up and take
|
||||
* the best availabe edit script for this round */
|
||||
#define ZSTD_EDIST_EXPENSIVE_THRESH 1024
|
||||
|
||||
/*-*************************************
|
||||
* Structures
|
||||
***************************************/
|
||||
|
||||
typedef struct {
|
||||
U32 dictIdx;
|
||||
U32 srcIdx;
|
||||
U32 matchLength;
|
||||
} ZSTD_eDist_match;
|
||||
|
||||
typedef struct {
|
||||
const BYTE* dict;
|
||||
const BYTE* src;
|
||||
size_t dictSize;
|
||||
size_t srcSize;
|
||||
S32* forwardDiag; /* Entires of the forward diagonal stored here */
|
||||
S32* backwardDiag; /* Entires of the backward diagonal stored here.
|
||||
* Note: this buffer and the 'forwardDiag' buffer
|
||||
* are contiguous. See the ZSTD_eDist_genSequences */
|
||||
ZSTD_eDist_match* matches; /* Accumulate matches of length 1 in this buffer.
|
||||
* In a subsequence post-processing step, we combine
|
||||
* contiguous matches. */
|
||||
U32 nbMatches;
|
||||
} ZSTD_eDist_state;
|
||||
|
||||
typedef struct {
|
||||
S32 dictMid; /* The mid diagonal for the dictionary */
|
||||
S32 srcMid; /* The mid diagonal for the source */
|
||||
int lowUseHeuristics; /* Should we use heuristics for the low part */
|
||||
int highUseHeuristics; /* Should we use heuristics for the high part */
|
||||
} ZSTD_eDist_partition;
|
||||
|
||||
/*-*************************************
|
||||
* Internal
|
||||
***************************************/
|
||||
|
||||
static void ZSTD_eDist_diag(ZSTD_eDist_state* state,
|
||||
ZSTD_eDist_partition* partition,
|
||||
S32 dictLow, S32 dictHigh, S32 srcLow,
|
||||
S32 srcHigh, int useHeuristics)
|
||||
{
|
||||
S32* const forwardDiag = state->forwardDiag;
|
||||
S32* const backwardDiag = state->backwardDiag;
|
||||
const BYTE* const dict = state->dict;
|
||||
const BYTE* const src = state->src;
|
||||
|
||||
S32 const diagMin = dictLow - srcHigh;
|
||||
S32 const diagMax = dictHigh - srcLow;
|
||||
S32 const forwardMid = dictLow - srcLow;
|
||||
S32 const backwardMid = dictHigh - srcHigh;
|
||||
|
||||
S32 forwardMin = forwardMid;
|
||||
S32 forwardMax = forwardMid;
|
||||
S32 backwardMin = backwardMid;
|
||||
S32 backwardMax = backwardMid;
|
||||
int odd = (forwardMid - backwardMid) & 1;
|
||||
U32 iterations;
|
||||
|
||||
forwardDiag[forwardMid] = dictLow;
|
||||
backwardDiag[backwardMid] = dictHigh;
|
||||
|
||||
/* Main loop for updating diag entries. Unless useHeuristics is
|
||||
* set to false, this loop will run until it finds the minimal
|
||||
* edit script */
|
||||
for (iterations = 1;;iterations++) {
|
||||
S32 diag;
|
||||
int bigSnake = 0;
|
||||
|
||||
if (forwardMin > diagMin) {
|
||||
forwardMin--;
|
||||
forwardDiag[forwardMin - 1] = -1;
|
||||
} else {
|
||||
forwardMin++;
|
||||
}
|
||||
|
||||
if (forwardMax < diagMax) {
|
||||
forwardMax++;
|
||||
forwardDiag[forwardMax + 1] = -1;
|
||||
} else {
|
||||
forwardMax--;
|
||||
}
|
||||
|
||||
for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
|
||||
S32 dictIdx;
|
||||
S32 srcIdx;
|
||||
S32 low = forwardDiag[diag - 1];
|
||||
S32 high = forwardDiag[diag + 1];
|
||||
S32 dictIdx0 = low < high ? high : low + 1;
|
||||
|
||||
for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
|
||||
dictIdx < dictHigh && srcIdx < srcHigh && dict[dictIdx] == src[srcIdx];
|
||||
dictIdx++, srcIdx++) continue;
|
||||
|
||||
if (dictIdx - dictIdx0 > ZSTD_EDIST_SNAKE_THRESH)
|
||||
bigSnake = 1;
|
||||
|
||||
forwardDiag[diag] = dictIdx;
|
||||
|
||||
if (odd && backwardMin <= diag && diag <= backwardMax && backwardDiag[diag] <= dictIdx) {
|
||||
partition->dictMid = dictIdx;
|
||||
partition->srcMid = srcIdx;
|
||||
partition->lowUseHeuristics = 0;
|
||||
partition->highUseHeuristics = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (backwardMin > diagMin) {
|
||||
backwardMin--;
|
||||
backwardDiag[backwardMin - 1] = ZSTD_EDIST_DIAG_MAX;
|
||||
} else {
|
||||
backwardMin++;
|
||||
}
|
||||
|
||||
if (backwardMax < diagMax) {
|
||||
backwardMax++;
|
||||
backwardDiag[backwardMax + 1] = ZSTD_EDIST_DIAG_MAX;
|
||||
} else {
|
||||
backwardMax--;
|
||||
}
|
||||
|
||||
|
||||
for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
|
||||
S32 dictIdx;
|
||||
S32 srcIdx;
|
||||
S32 low = backwardDiag[diag - 1];
|
||||
S32 high = backwardDiag[diag + 1];
|
||||
S32 dictIdx0 = low < high ? low : high - 1;
|
||||
|
||||
for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
|
||||
dictLow < dictIdx && srcLow < srcIdx && dict[dictIdx - 1] == src[srcIdx - 1];
|
||||
dictIdx--, srcIdx--) continue;
|
||||
|
||||
if (dictIdx0 - dictIdx > ZSTD_EDIST_SNAKE_THRESH)
|
||||
bigSnake = 1;
|
||||
|
||||
backwardDiag[diag] = dictIdx;
|
||||
|
||||
if (!odd && forwardMin <= diag && diag <= forwardMax && dictIdx <= forwardDiag[diag]) {
|
||||
partition->dictMid = dictIdx;
|
||||
partition->srcMid = srcIdx;
|
||||
partition->lowUseHeuristics = 0;
|
||||
partition->highUseHeuristics = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!useHeuristics)
|
||||
continue;
|
||||
|
||||
/* Everything under this point is a heuritic. Using these will
|
||||
* substantially speed up the match finding. In some cases, taking
|
||||
* the total match finding time from several minutes to seconds.
|
||||
* Of course, the caveat is that the edit script found may no longer
|
||||
* be optimal */
|
||||
|
||||
/* Big snake heuristic */
|
||||
if (iterations > ZSTD_EDIST_SNAKE_ITER_THRESH && bigSnake) {
|
||||
{
|
||||
S32 best = 0;
|
||||
|
||||
for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
|
||||
S32 diagDiag = diag - forwardMid;
|
||||
S32 dictIdx = forwardDiag[diag];
|
||||
S32 srcIdx = dictIdx - diag;
|
||||
S32 v = (dictIdx - dictLow) * 2 - diagDiag;
|
||||
|
||||
if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
|
||||
if (v > best
|
||||
&& dictLow + ZSTD_EDIST_SNAKE_THRESH <= dictIdx && dictIdx <= dictHigh
|
||||
&& srcLow + ZSTD_EDIST_SNAKE_THRESH <= srcIdx && srcIdx <= srcHigh) {
|
||||
S32 k;
|
||||
for (k = 1; dict[dictIdx - k] == src[srcIdx - k]; k++) {
|
||||
if (k == ZSTD_EDIST_SNAKE_THRESH) {
|
||||
best = v;
|
||||
partition->dictMid = dictIdx;
|
||||
partition->srcMid = srcIdx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best > 0) {
|
||||
partition->lowUseHeuristics = 0;
|
||||
partition->highUseHeuristics = 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
S32 best = 0;
|
||||
|
||||
for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
|
||||
S32 diagDiag = diag - backwardMid;
|
||||
S32 dictIdx = backwardDiag[diag];
|
||||
S32 srcIdx = dictIdx - diag;
|
||||
S32 v = (dictHigh - dictIdx) * 2 + diagDiag;
|
||||
|
||||
if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
|
||||
if (v > best
|
||||
&& dictLow < dictIdx && dictIdx <= dictHigh - ZSTD_EDIST_SNAKE_THRESH
|
||||
&& srcLow < srcIdx && srcIdx <= srcHigh - ZSTD_EDIST_SNAKE_THRESH) {
|
||||
int k;
|
||||
for (k = 0; dict[dictIdx + k] == src[srcIdx + k]; k++) {
|
||||
if (k == ZSTD_EDIST_SNAKE_THRESH - 1) {
|
||||
best = v;
|
||||
partition->dictMid = dictIdx;
|
||||
partition->srcMid = srcIdx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best > 0) {
|
||||
partition->lowUseHeuristics = 1;
|
||||
partition->highUseHeuristics = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* More general 'too expensive' heuristic */
|
||||
if (iterations >= ZSTD_EDIST_EXPENSIVE_THRESH) {
|
||||
S32 forwardDictSrcBest;
|
||||
S32 forwardDictBest = 0;
|
||||
S32 backwardDictSrcBest;
|
||||
S32 backwardDictBest = 0;
|
||||
|
||||
forwardDictSrcBest = -1;
|
||||
for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
|
||||
S32 dictIdx = MIN(forwardDiag[diag], dictHigh);
|
||||
S32 srcIdx = dictIdx - diag;
|
||||
|
||||
if (srcHigh < srcIdx) {
|
||||
dictIdx = srcHigh + diag;
|
||||
srcIdx = srcHigh;
|
||||
}
|
||||
|
||||
if (forwardDictSrcBest < dictIdx + srcIdx) {
|
||||
forwardDictSrcBest = dictIdx + srcIdx;
|
||||
forwardDictBest = dictIdx;
|
||||
}
|
||||
}
|
||||
|
||||
backwardDictSrcBest = ZSTD_EDIST_DIAG_MAX;
|
||||
for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
|
||||
S32 dictIdx = MAX(dictLow, backwardDiag[diag]);
|
||||
S32 srcIdx = dictIdx - diag;
|
||||
|
||||
if (srcIdx < srcLow) {
|
||||
dictIdx = srcLow + diag;
|
||||
srcIdx = srcLow;
|
||||
}
|
||||
|
||||
if (dictIdx + srcIdx < backwardDictSrcBest) {
|
||||
backwardDictSrcBest = dictIdx + srcIdx;
|
||||
backwardDictBest = dictIdx;
|
||||
}
|
||||
}
|
||||
|
||||
if ((dictHigh + srcHigh) - backwardDictSrcBest < forwardDictSrcBest - (dictLow + srcLow)) {
|
||||
partition->dictMid = forwardDictBest;
|
||||
partition->srcMid = forwardDictSrcBest - forwardDictBest;
|
||||
partition->lowUseHeuristics = 0;
|
||||
partition->highUseHeuristics = 1;
|
||||
} else {
|
||||
partition->dictMid = backwardDictBest;
|
||||
partition->srcMid = backwardDictSrcBest - backwardDictBest;
|
||||
partition->lowUseHeuristics = 1;
|
||||
partition->highUseHeuristics = 0;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ZSTD_eDist_insertMatch(ZSTD_eDist_state* state,
|
||||
S32 const dictIdx, S32 const srcIdx)
|
||||
{
|
||||
state->matches[state->nbMatches].dictIdx = dictIdx;
|
||||
state->matches[state->nbMatches].srcIdx = srcIdx;
|
||||
state->matches[state->nbMatches].matchLength = 1;
|
||||
state->nbMatches++;
|
||||
}
|
||||
|
||||
static int ZSTD_eDist_compare(ZSTD_eDist_state* state,
|
||||
S32 dictLow, S32 dictHigh, S32 srcLow,
|
||||
S32 srcHigh, int useHeuristics)
|
||||
{
|
||||
const BYTE* const dict = state->dict;
|
||||
const BYTE* const src = state->src;
|
||||
|
||||
/* Found matches while traversing from the low end */
|
||||
while (dictLow < dictHigh && srcLow < srcHigh && dict[dictLow] == src[srcLow]) {
|
||||
ZSTD_eDist_insertMatch(state, dictLow, srcLow);
|
||||
dictLow++;
|
||||
srcLow++;
|
||||
}
|
||||
|
||||
/* Found matches while traversing from the high end */
|
||||
while (dictLow < dictHigh && srcLow < srcHigh && dict[dictHigh - 1] == src[srcHigh - 1]) {
|
||||
ZSTD_eDist_insertMatch(state, dictHigh - 1, srcHigh - 1);
|
||||
dictHigh--;
|
||||
srcHigh--;
|
||||
}
|
||||
|
||||
/* If the low and high end end up touching. If we wanted to make
|
||||
* note of the differences like most diffing algorithms do, we would
|
||||
* do so here. In our case, we're only concerned with matches
|
||||
* Note: if you wanted to find the edit distance of the algorithm,
|
||||
* you could just accumulate the cost for an insertion/deletion
|
||||
* below. */
|
||||
if (dictLow == dictHigh) {
|
||||
while (srcLow < srcHigh) {
|
||||
/* Reaching this point means inserting src[srcLow] into
|
||||
* the current position of dict */
|
||||
srcLow++;
|
||||
}
|
||||
} else if (srcLow == srcHigh) {
|
||||
while (dictLow < dictHigh) {
|
||||
/* Reaching this point means deleteing dict[dictLow] from
|
||||
* the current positino of dict */
|
||||
dictLow++;
|
||||
}
|
||||
} else {
|
||||
ZSTD_eDist_partition partition;
|
||||
partition.dictMid = 0;
|
||||
partition.srcMid = 0;
|
||||
ZSTD_eDist_diag(state, &partition, dictLow, dictHigh,
|
||||
srcLow, srcHigh, useHeuristics);
|
||||
if (ZSTD_eDist_compare(state, dictLow, partition.dictMid,
|
||||
srcLow, partition.srcMid, partition.lowUseHeuristics))
|
||||
return 1;
|
||||
if (ZSTD_eDist_compare(state, partition.dictMid, dictHigh,
|
||||
partition.srcMid, srcHigh, partition.highUseHeuristics))
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ZSTD_eDist_matchComp(const void* p, const void* q)
|
||||
{
|
||||
S32 const l = ((ZSTD_eDist_match*)p)->srcIdx;
|
||||
S32 const r = ((ZSTD_eDist_match*)q)->srcIdx;
|
||||
return (l - r);
|
||||
}
|
||||
|
||||
/* The matches from the approach above will all be of the form
|
||||
* (dictIdx, srcIdx, 1). this method combines contiguous matches
|
||||
* of length MINMATCH or greater. Matches less than MINMATCH
|
||||
* are discarded */
|
||||
static void ZSTD_eDist_combineMatches(ZSTD_eDist_state* state)
|
||||
{
|
||||
/* Create a new buffer to put the combined matches into
|
||||
* and memcpy to state->matches after */
|
||||
ZSTD_eDist_match* combinedMatches =
|
||||
ZSTD_malloc(state->nbMatches * sizeof(ZSTD_eDist_match),
|
||||
ZSTD_defaultCMem);
|
||||
|
||||
U32 nbCombinedMatches = 1;
|
||||
size_t i;
|
||||
|
||||
/* Make sure that the srcIdx and dictIdx are in sorted order.
|
||||
* The combination step won't work otherwise */
|
||||
qsort(state->matches, state->nbMatches, sizeof(ZSTD_eDist_match), ZSTD_eDist_matchComp);
|
||||
|
||||
memcpy(combinedMatches, state->matches, sizeof(ZSTD_eDist_match));
|
||||
for (i = 1; i < state->nbMatches; i++) {
|
||||
ZSTD_eDist_match const match = state->matches[i];
|
||||
ZSTD_eDist_match const combinedMatch =
|
||||
combinedMatches[nbCombinedMatches - 1];
|
||||
if (combinedMatch.srcIdx + combinedMatch.matchLength == match.srcIdx &&
|
||||
combinedMatch.dictIdx + combinedMatch.matchLength == match.dictIdx) {
|
||||
combinedMatches[nbCombinedMatches - 1].matchLength++;
|
||||
} else {
|
||||
/* Discard matches that are less than MINMATCH */
|
||||
if (combinedMatches[nbCombinedMatches - 1].matchLength < MINMATCH) {
|
||||
nbCombinedMatches--;
|
||||
}
|
||||
|
||||
memcpy(combinedMatches + nbCombinedMatches,
|
||||
state->matches + i, sizeof(ZSTD_eDist_match));
|
||||
nbCombinedMatches++;
|
||||
}
|
||||
}
|
||||
memcpy(state->matches, combinedMatches, nbCombinedMatches * sizeof(ZSTD_eDist_match));
|
||||
state->nbMatches = nbCombinedMatches;
|
||||
ZSTD_free(combinedMatches, ZSTD_defaultCMem);
|
||||
}
|
||||
|
||||
static size_t ZSTD_eDist_convertMatchesToSequences(ZSTD_Sequence* sequences,
|
||||
ZSTD_eDist_state* state)
|
||||
{
|
||||
const ZSTD_eDist_match* matches = state->matches;
|
||||
size_t const nbMatches = state->nbMatches;
|
||||
size_t const dictSize = state->dictSize;
|
||||
size_t nbSequences = 0;
|
||||
size_t i;
|
||||
for (i = 0; i < nbMatches; i++) {
|
||||
ZSTD_eDist_match const match = matches[i];
|
||||
U32 const litLength = !i ? match.srcIdx :
|
||||
match.srcIdx - (matches[i - 1].srcIdx + matches[i - 1].matchLength);
|
||||
U32 const offset = (match.srcIdx + dictSize) - match.dictIdx;
|
||||
U32 const matchLength = match.matchLength;
|
||||
sequences[nbSequences].offset = offset;
|
||||
sequences[nbSequences].litLength = litLength;
|
||||
sequences[nbSequences].matchLength = matchLength;
|
||||
nbSequences++;
|
||||
}
|
||||
return nbSequences;
|
||||
}
|
||||
|
||||
/*-*************************************
|
||||
* Interal utils
|
||||
***************************************/
|
||||
|
||||
static size_t ZSTD_eDist_hamingDist(const BYTE* const a,
|
||||
const BYTE* const b, size_t n)
|
||||
{
|
||||
size_t i;
|
||||
size_t dist = 0;
|
||||
for (i = 0; i < n; i++)
|
||||
dist += a[i] != b[i];
|
||||
return dist;
|
||||
}
|
||||
|
||||
/* This is a pretty naive recursive implementation that should only
|
||||
* be used for quick tests obviously. Don't try and run this on a
|
||||
* GB file or something. There are faster implementations. Use those
|
||||
* if you need to run it for large files. */
|
||||
static size_t ZSTD_eDist_levenshteinDist(const BYTE* const s,
|
||||
size_t const sn, const BYTE* const t,
|
||||
size_t const tn)
|
||||
{
|
||||
size_t a, b, c;
|
||||
|
||||
if (!sn)
|
||||
return tn;
|
||||
if (!tn)
|
||||
return sn;
|
||||
|
||||
if (s[sn - 1] == t[tn - 1])
|
||||
return ZSTD_eDist_levenshteinDist(
|
||||
s, sn - 1, t, tn - 1);
|
||||
|
||||
a = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn - 1);
|
||||
b = ZSTD_eDist_levenshteinDist(s, sn, t, tn - 1);
|
||||
c = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn);
|
||||
|
||||
if (a > b)
|
||||
a = b;
|
||||
if (a > c)
|
||||
a = c;
|
||||
|
||||
return a + 1;
|
||||
}
|
||||
|
||||
static void ZSTD_eDist_validateMatches(ZSTD_eDist_match* matches,
|
||||
size_t const nbMatches, const BYTE* const dict,
|
||||
size_t const dictSize, const BYTE* const src,
|
||||
size_t const srcSize)
|
||||
{
|
||||
size_t i;
|
||||
for (i = 0; i < nbMatches; i++) {
|
||||
ZSTD_eDist_match match = matches[i];
|
||||
U32 const dictIdx = match.dictIdx;
|
||||
U32 const srcIdx = match.srcIdx;
|
||||
U32 const matchLength = match.matchLength;
|
||||
|
||||
assert(dictIdx + matchLength < dictSize);
|
||||
assert(srcIdx + matchLength < srcSize);
|
||||
assert(!memcmp(dict + dictIdx, src + srcIdx, matchLength));
|
||||
}
|
||||
}
|
||||
|
||||
/*-*************************************
|
||||
* API
|
||||
***************************************/
|
||||
|
||||
size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences,
|
||||
const void* dict, size_t dictSize,
|
||||
const void* src, size_t srcSize,
|
||||
int useHeuristics)
|
||||
{
|
||||
size_t const nbDiags = dictSize + srcSize + 3;
|
||||
S32* buffer = ZSTD_malloc(nbDiags * 2 * sizeof(S32), ZSTD_defaultCMem);
|
||||
ZSTD_eDist_state state;
|
||||
size_t nbSequences = 0;
|
||||
|
||||
state.dict = (const BYTE*)dict;
|
||||
state.src = (const BYTE*)src;
|
||||
state.dictSize = dictSize;
|
||||
state.srcSize = srcSize;
|
||||
state.forwardDiag = buffer;
|
||||
state.backwardDiag = buffer + nbDiags;
|
||||
state.forwardDiag += srcSize + 1;
|
||||
state.backwardDiag += srcSize + 1;
|
||||
state.matches = ZSTD_malloc(srcSize * sizeof(ZSTD_eDist_match), ZSTD_defaultCMem);
|
||||
state.nbMatches = 0;
|
||||
|
||||
ZSTD_eDist_compare(&state, 0, dictSize, 0, srcSize, 1);
|
||||
ZSTD_eDist_combineMatches(&state);
|
||||
nbSequences = ZSTD_eDist_convertMatchesToSequences(sequences, &state);
|
||||
|
||||
ZSTD_free(buffer, ZSTD_defaultCMem);
|
||||
ZSTD_free(state.matches, ZSTD_defaultCMem);
|
||||
|
||||
return nbSequences;
|
||||
}
|
70
contrib/match_finders/zstd_edist.h
Normal file
70
contrib/match_finders/zstd_edist.h
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under both the BSD-style license (found in the
|
||||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
||||
* in the COPYING file in the root directory of this source tree).
|
||||
* You may select, at your option, one of the above-listed licenses.
|
||||
*/
|
||||
|
||||
/* This match finder leverages techniques used in file comparison algorithms
|
||||
* to find matches between a dictionary and a source file.
|
||||
*
|
||||
* The original motivation for studying this approach was to try and optimize
|
||||
* Zstandard for the use case of patching: the most common scenario being
|
||||
* updating an existing software package with the next version. When patching,
|
||||
* the difference between the old version of the package and the new version
|
||||
* is generally tiny (most of the new file will be identical to
|
||||
* the old one). In more technical terms, the edit distance (the minimal number
|
||||
* of changes required to take one sequence of bytes to another) between the
|
||||
* files would be small relative to the size of the file.
|
||||
*
|
||||
* Various 'diffing' algorithms utilize this notion of edit distance and
|
||||
* the corrensponding concept of a minimal edit script between two
|
||||
* sequences to identify the regions within two files where they differ.
|
||||
* The core algorithm used in this match finder is described in:
|
||||
*
|
||||
* "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
|
||||
* Algorithmica Vol. 1, 1986, pp. 251-266,
|
||||
* <https://doi.org/10.1007/BF01840446>.
|
||||
*
|
||||
* Additional algorithmic heuristics for speed improvement have also been included.
|
||||
* These we inspired from implementations of various regular and binary diffing
|
||||
* algorithms such as GNU diff, bsdiff, and Xdelta.
|
||||
*
|
||||
* Note: after some experimentation, this approach proved to not provide enough
|
||||
* utility to justify the additional CPU used in finding matches. The one area
|
||||
* where this approach consistenly outperforms Zstandard even on level 19 is
|
||||
* when compressing small files (<10 KB) using a equally small dictionary that
|
||||
* is very similar to the source file. For the use case that this was intended,
|
||||
* (large similar files) this approach by itself took 5-10X longer than zstd-19 and
|
||||
* generally resulted in 2-3X larger files. The core advantage that zstd-19 has
|
||||
* over this appraoch for match finding is the overlapping matches. This approach
|
||||
* cannot find any.
|
||||
*
|
||||
* I'm leaving this in the contrib section in case this ever becomes interesting
|
||||
* to explore again.
|
||||
* */
|
||||
|
||||
#ifndef ZSTD_EDIST_H
|
||||
#define ZSTD_EDIST_H
|
||||
|
||||
/*-*************************************
|
||||
* Dependencies
|
||||
***************************************/
|
||||
|
||||
#include <stddef.h>
|
||||
#include "zstd_internal.h" /* ZSTD_Sequence */
|
||||
|
||||
/*! ZSTD_eDist_genSequences() :
|
||||
* Will populate the provided ZSTD_Sequence buffer with sequences
|
||||
* based on the optimal or near-optimal (depending on 'useHeuristics')
|
||||
* edit script between 'dict' and 'src.'
|
||||
* @return : the number of sequences found */
|
||||
size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences,
|
||||
const void* dict, size_t dictSize,
|
||||
const void* src, size_t srcSize,
|
||||
int useHeuristics);
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user