Experiment with 64-bit hash and checksum
This commit is contained in:
parent
13a01ffb27
commit
273c17b350
@ -25,7 +25,7 @@ LDFLAGS += -lzstd
|
||||
|
||||
default: all
|
||||
|
||||
all: main-circular-buffer main-integrated
|
||||
all: main-circular-buffer main-integrated main-hf
|
||||
|
||||
#main-basic : basic_table.c ldm.c main-ldm.c
|
||||
# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||
@ -33,12 +33,14 @@ all: main-circular-buffer main-integrated
|
||||
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
|
||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
main-hf: ldm_hf_test.c main-ldm.c
|
||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
main-integrated: ldm_with_table.c main-ldm.c
|
||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
|
||||
clean:
|
||||
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \
|
||||
main-basic main-circular-buffer main-integrated
|
||||
main-basic main-circular-buffer main-integrated main-hf
|
||||
@echo Cleaning completed
|
||||
|
||||
|
@ -5,14 +5,16 @@
|
||||
#include "ldm_hashtable.h"
|
||||
#include "mem.h"
|
||||
|
||||
|
||||
// Number of elements per hash bucket.
|
||||
// HASH_BUCKET_SIZE_LOG defined in ldm.h
|
||||
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
|
||||
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
|
||||
|
||||
|
||||
|
||||
// TODO: rename. Number of hash buckets.
|
||||
// TODO: Link to HASH_ENTRY_SIZE_LOG
|
||||
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-3-(HASH_BUCKET_SIZE_LOG))
|
||||
|
||||
//#define ZSTD_SKIP
|
||||
|
||||
struct LDM_hashTable {
|
||||
@ -175,6 +177,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
|
||||
if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) {
|
||||
U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
|
||||
U32 backwardMatchLength, totalMatchLength;
|
||||
|
||||
if (forwardMatchLength < table->minMatchLength) {
|
||||
continue;
|
||||
}
|
||||
|
@ -4,14 +4,15 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ldm.h"
|
||||
#include "ldm_hashtable.h"
|
||||
|
||||
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
||||
#define LDM_HASH_ENTRY_SIZE_LOG 3
|
||||
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
|
||||
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
|
||||
|
||||
// Insert one out of every (HASH_ONLY_EVERY + 1) positions into the hash table.
|
||||
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
|
||||
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
|
||||
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
|
||||
|
||||
#define ML_BITS 4
|
||||
@ -26,8 +27,7 @@
|
||||
//#define RUN_CHECKS
|
||||
//#define TMP_RECOMPUTE_LENGTHS
|
||||
|
||||
#include "ldm.h"
|
||||
#include "ldm_hashtable.h"
|
||||
typedef U32 checksum_t;
|
||||
|
||||
// TODO: Scanning speed
|
||||
// TODO: Memory usage
|
||||
@ -71,22 +71,22 @@ struct LDM_CCtx {
|
||||
|
||||
LDM_hashTable *hashTable;
|
||||
|
||||
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
|
||||
|
||||
const BYTE *lastPosHashed; /* Last position hashed */
|
||||
hash_t lastHash; /* Hash corresponding to lastPosHashed */
|
||||
U32 lastSum;
|
||||
checksum_t lastSum;
|
||||
|
||||
const BYTE *nextIp; // TODO: this is redundant (ip + step)
|
||||
const BYTE *nextPosHashed;
|
||||
hash_t nextHash; /* Hash corresponding to nextPosHashed */
|
||||
U32 nextSum;
|
||||
checksum_t nextSum;
|
||||
|
||||
|
||||
|
||||
unsigned step; // ip step, should be 1.
|
||||
|
||||
const BYTE *lagIp;
|
||||
hash_t lagHash;
|
||||
U32 lagSum;
|
||||
checksum_t lagSum;
|
||||
|
||||
U64 numHashInserts;
|
||||
// DEBUG
|
||||
@ -191,15 +191,15 @@ static hash_t checksumToHash(U32 sum) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a checksum based on rsync's checksum.
|
||||
* Computes a 32-bit checksum based on rsync's checksum.
|
||||
*
|
||||
* a(k,l) = \sum_{i = k}^l x_i (mod M)
|
||||
* b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M)
|
||||
* checksum(k,l) = a(k,l) + 2^{16} * b(k,l)
|
||||
*/
|
||||
static U32 getChecksum(const BYTE *buf, U32 len) {
|
||||
static checksum_t getChecksum(const BYTE *buf, U32 len) {
|
||||
U32 i;
|
||||
U32 s1, s2;
|
||||
checksum_t s1, s2;
|
||||
|
||||
s1 = s2 = 0;
|
||||
for (i = 0; i < (len - 4); i += 4) {
|
||||
@ -226,7 +226,7 @@ static U32 getChecksum(const BYTE *buf, U32 len) {
|
||||
*
|
||||
* Thus toRemove should correspond to data[0].
|
||||
*/
|
||||
static U32 updateChecksum(U32 sum, U32 len,
|
||||
static checksum_t updateChecksum(checksum_t sum, U32 len,
|
||||
BYTE toRemove, BYTE toAdd) {
|
||||
U32 s1 = (sum & 0xffff) - toRemove + toAdd;
|
||||
U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1;
|
||||
@ -262,7 +262,6 @@ static void setNextHash(LDM_CCtx *cctx) {
|
||||
cctx->nextHash = checksumToHash(cctx->nextSum);
|
||||
|
||||
#if LDM_LAG
|
||||
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
|
||||
if (cctx->ip - cctx->ibase > LDM_LAG) {
|
||||
cctx->lagSum = updateChecksum(
|
||||
cctx->lagSum, LDM_HASH_LENGTH,
|
||||
@ -288,32 +287,28 @@ static void setNextHash(LDM_CCtx *cctx) {
|
||||
}
|
||||
|
||||
static void putHashOfCurrentPositionFromHash(
|
||||
LDM_CCtx *cctx, hash_t hash, U32 sum) {
|
||||
LDM_CCtx *cctx, hash_t hash, U32 checksum) {
|
||||
// Insert only one out of every (HASH_ONLY_EVERY + 1) positions, based on cctx->ip.
|
||||
// Note: this works only when cctx->step is 1.
|
||||
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
||||
/**
|
||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
|
||||
MEM_read32(cctx->ip) };
|
||||
*/
|
||||
#if LDM_LAG
|
||||
// TODO: off by 1, but whatever
|
||||
if (cctx->lagIp - cctx->ibase > 0) {
|
||||
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
|
||||
HASH_insert(cctx->hashTable, cctx->lagHash, entry);
|
||||
} else {
|
||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
|
||||
HASH_insert(cctx->hashTable, hash, entry);
|
||||
}
|
||||
#else
|
||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
|
||||
HASH_insert(cctx->hashTable, hash, entry);
|
||||
#endif
|
||||
}
|
||||
|
||||
cctx->lastPosHashed = cctx->ip;
|
||||
cctx->lastHash = hash;
|
||||
cctx->lastSum = sum;
|
||||
cctx->lastSum = checksum;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -336,7 +331,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) {
|
||||
* Insert hash of the current position into the hash table.
|
||||
*/
|
||||
static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
|
||||
U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH);
|
||||
checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH);
|
||||
hash_t hash = checksumToHash(sum);
|
||||
|
||||
#ifdef RUN_CHECKS
|
||||
@ -441,7 +436,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
||||
|
||||
while (entry == NULL) {
|
||||
hash_t h;
|
||||
U32 sum;
|
||||
checksum_t sum;
|
||||
setNextHash(cctx);
|
||||
h = cctx->nextHash;
|
||||
sum = cctx->nextSum;
|
||||
@ -698,23 +693,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
|
||||
}
|
||||
|
||||
// TODO: implement and test hash function
|
||||
void LDM_test(void) {
|
||||
void LDM_test(const BYTE *src) {
|
||||
(void)src;
|
||||
}
|
||||
|
||||
/*
|
||||
void LDM_test(const void *src, size_t srcSize,
|
||||
void *dst, size_t maxDstSize) {
|
||||
const BYTE *ip = (const BYTE *)src + 1125;
|
||||
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
|
||||
U32 sum2;
|
||||
++ip;
|
||||
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
|
||||
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
|
||||
ip[-1], ip[LDM_HASH_LENGTH - 1]);
|
||||
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
|
||||
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
@ -31,6 +31,7 @@ typedef struct LDM_compressStats LDM_compressStats;
|
||||
typedef struct LDM_CCtx LDM_CCtx;
|
||||
typedef struct LDM_DCtx LDM_DCtx;
|
||||
|
||||
|
||||
/**
|
||||
* Compresses src into dst.
|
||||
*
|
||||
@ -151,6 +152,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
|
||||
|
||||
void LDM_outputConfiguration(void);
|
||||
|
||||
void LDM_test(void);
|
||||
void LDM_test(const BYTE *src);
|
||||
|
||||
#endif /* LDM_H */
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
#include "mem.h"
|
||||
|
||||
#define LDM_HASH_ENTRY_SIZE_LOG 3
|
||||
|
||||
// TODO: clean up comments
|
||||
|
||||
typedef U32 hash_t;
|
||||
|
1048
contrib/long_distance_matching/ldm_hf_test.c
Normal file
1048
contrib/long_distance_matching/ldm_hf_test.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -29,7 +29,7 @@
|
||||
#define CHECKSUM_CHAR_OFFSET 10
|
||||
|
||||
// Take first match only.
|
||||
#define ZSTD_SKIP
|
||||
//#define ZSTD_SKIP
|
||||
|
||||
//#define RUN_CHECKS
|
||||
|
||||
@ -292,8 +292,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
|
||||
|
||||
totalMatchLength = forwardMatchLength + backwardMatchLength;
|
||||
|
||||
if (totalMatchLength >= bestMatchLength &&
|
||||
totalMatchLength >= LDM_MIN_MATCH_LENGTH) {
|
||||
if (totalMatchLength >= bestMatchLength) {
|
||||
bestMatchLength = totalMatchLength;
|
||||
*pForwardMatchLength = forwardMatchLength;
|
||||
*pBackwardMatchLength = backwardMatchLength;
|
||||
@ -305,7 +304,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bestEntry != NULL && bestMatchLength > LDM_MIN_MATCH_LENGTH) {
|
||||
if (bestEntry != NULL) {
|
||||
return bestEntry;
|
||||
}
|
||||
return NULL;
|
||||
@ -951,23 +950,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
|
||||
}
|
||||
|
||||
// TODO: implement and test hash function
|
||||
void LDM_test(void) {
|
||||
void LDM_test(const BYTE *src) {
|
||||
(void)src;
|
||||
}
|
||||
|
||||
/*
|
||||
void LDM_test(const void *src, size_t srcSize,
|
||||
void *dst, size_t maxDstSize) {
|
||||
const BYTE *ip = (const BYTE *)src + 1125;
|
||||
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
|
||||
U32 sum2;
|
||||
++ip;
|
||||
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
|
||||
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
|
||||
ip[-1], ip[LDM_HASH_LENGTH - 1]);
|
||||
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
|
||||
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
@ -13,13 +13,13 @@
|
||||
#include "zstd.h"
|
||||
|
||||
#define DEBUG
|
||||
#define TEST
|
||||
//#define TEST
|
||||
|
||||
/* Compress file given by fname and output to oname.
|
||||
* Returns 0 if successful, error code otherwise.
|
||||
*
|
||||
* TODO: This might segfault if the compressed size is > the decompressed
|
||||
* size due to the mmapping and output file size allocated to be the input size.
|
||||
* size due to the mmapping and output file size allocated to be the input size
|
||||
* The compress function should check before writing or buffer writes.
|
||||
*/
|
||||
static int compress(const char *fname, const char *oname) {
|
||||
@ -69,6 +69,11 @@ static int compress(const char *fname, const char *oname) {
|
||||
perror("mmap error for output");
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef TEST
|
||||
LDM_test((const BYTE *)src);
|
||||
#endif
|
||||
|
||||
gettimeofday(&tv1, NULL);
|
||||
|
||||
compressedSize = LDM_HEADER_SIZE +
|
||||
@ -251,8 +256,5 @@ int main(int argc, const char *argv[]) {
|
||||
/* verify */
|
||||
verify(inpFilename, decFilename);
|
||||
|
||||
#ifdef TEST
|
||||
LDM_test();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user