Experiment with 64-bit hash and checksum

This commit is contained in:
Stella Lau 2017-07-20 16:50:06 -07:00
parent 13a01ffb27
commit 273c17b350
8 changed files with 1096 additions and 75 deletions

View File

@ -25,7 +25,7 @@ LDFLAGS += -lzstd
default: all
all: main-circular-buffer main-integrated
all: main-circular-buffer main-integrated main-hf
#main-basic : basic_table.c ldm.c main-ldm.c
# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
@ -33,12 +33,14 @@ all: main-circular-buffer main-integrated
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-hf: ldm_hf_test.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-integrated: ldm_with_table.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \
main-basic main-circular-buffer main-integrated
main-basic main-circular-buffer main-integrated main-hf
@echo Cleaning completed

View File

@ -5,14 +5,16 @@
#include "ldm_hashtable.h"
#include "mem.h"
// Number of elements per hash bucket.
// HASH_BUCKET_SIZE_LOG defined in ldm.h
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
// TODO: rename. Number of hash buckets.
// TODO: Link to HASH_ENTRY_SIZE_LOG
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-3-(HASH_BUCKET_SIZE_LOG))
//#define ZSTD_SKIP
struct LDM_hashTable {
@ -175,6 +177,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) {
U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
U32 backwardMatchLength, totalMatchLength;
if (forwardMatchLength < table->minMatchLength) {
continue;
}

View File

@ -4,14 +4,15 @@
#include <stdlib.h>
#include <string.h>
#include "ldm.h"
#include "ldm_hashtable.h"
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASH_ENTRY_SIZE_LOG 3
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
// Insert every (HASH_ONLY_EVERY + 1)-th position into the hash table.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
#define ML_BITS 4
@ -26,8 +27,7 @@
//#define RUN_CHECKS
//#define TMP_RECOMPUTE_LENGTHS
#include "ldm.h"
#include "ldm_hashtable.h"
typedef U32 checksum_t;
// TODO: Scanning speed
// TODO: Memory usage
@ -71,22 +71,22 @@ struct LDM_CCtx {
LDM_hashTable *hashTable;
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */
U32 lastSum;
checksum_t lastSum;
const BYTE *nextIp; // TODO: this is redundant (ip + step)
const BYTE *nextPosHashed;
hash_t nextHash; /* Hash corresponding to nextPosHashed */
U32 nextSum;
checksum_t nextSum;
unsigned step; // ip step, should be 1.
const BYTE *lagIp;
hash_t lagHash;
U32 lagSum;
checksum_t lagSum;
U64 numHashInserts;
// DEBUG
@ -191,15 +191,15 @@ static hash_t checksumToHash(U32 sum) {
}
/**
* Computes a checksum based on rsync's checksum.
* Computes a 32-bit checksum based on rsync's checksum.
*
* a(k,l) = \sum_{i = k}^l x_i (mod M)
* b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M)
* checksum(k,l) = a(k,l) + 2^{16} * b(k,l)
*/
static U32 getChecksum(const BYTE *buf, U32 len) {
static checksum_t getChecksum(const BYTE *buf, U32 len) {
U32 i;
U32 s1, s2;
checksum_t s1, s2;
s1 = s2 = 0;
for (i = 0; i < (len - 4); i += 4) {
@ -226,8 +226,8 @@ static U32 getChecksum(const BYTE *buf, U32 len) {
*
* Thus toRemove should correspond to data[0].
*/
static U32 updateChecksum(U32 sum, U32 len,
BYTE toRemove, BYTE toAdd) {
static checksum_t updateChecksum(checksum_t sum, U32 len,
BYTE toRemove, BYTE toAdd) {
U32 s1 = (sum & 0xffff) - toRemove + toAdd;
U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1;
@ -262,7 +262,6 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->nextHash = checksumToHash(cctx->nextSum);
#if LDM_LAG
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
if (cctx->ip - cctx->ibase > LDM_LAG) {
cctx->lagSum = updateChecksum(
cctx->lagSum, LDM_HASH_LENGTH,
@ -288,32 +287,28 @@ static void setNextHash(LDM_CCtx *cctx) {
}
static void putHashOfCurrentPositionFromHash(
LDM_CCtx *cctx, hash_t hash, U32 sum) {
LDM_CCtx *cctx, hash_t hash, U32 checksum) {
// Insert into the hash table only once every (HASH_ONLY_EVERY + 1) positions, based on cctx->ip.
// Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
/**
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
MEM_read32(cctx->ip) };
*/
#if LDM_LAG
// TODO: off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) {
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
HASH_insert(cctx->hashTable, cctx->lagHash, entry);
} else {
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
HASH_insert(cctx->hashTable, hash, entry);
}
#else
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
HASH_insert(cctx->hashTable, hash, entry);
#endif
}
cctx->lastPosHashed = cctx->ip;
cctx->lastHash = hash;
cctx->lastSum = sum;
cctx->lastSum = checksum;
}
/**
@ -336,7 +331,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) {
* Insert hash of the current position into the hash table.
*/
static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH);
checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH);
hash_t hash = checksumToHash(sum);
#ifdef RUN_CHECKS
@ -441,7 +436,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
while (entry == NULL) {
hash_t h;
U32 sum;
checksum_t sum;
setNextHash(cctx);
h = cctx->nextHash;
sum = cctx->nextSum;
@ -698,23 +693,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
}
// TODO: implement and test hash function
void LDM_test(void) {
void LDM_test(const BYTE *src) {
(void)src;
}
/*
void LDM_test(const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
const BYTE *ip = (const BYTE *)src + 1125;
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
U32 sum2;
++ip;
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
ip[-1], ip[LDM_HASH_LENGTH - 1]);
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
}
}
*/

View File

@ -31,6 +31,7 @@ typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx;
/**
* Compresses src into dst.
*
@ -151,6 +152,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
void LDM_outputConfiguration(void);
void LDM_test(void);
void LDM_test(const BYTE *src);
#endif /* LDM_H */

View File

@ -3,6 +3,8 @@
#include "mem.h"
#define LDM_HASH_ENTRY_SIZE_LOG 3
// TODO: clean up comments
typedef U32 hash_t;

File diff suppressed because it is too large Load Diff

View File

@ -29,7 +29,7 @@
#define CHECKSUM_CHAR_OFFSET 10
// Take first match only.
#define ZSTD_SKIP
//#define ZSTD_SKIP
//#define RUN_CHECKS
@ -292,8 +292,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
totalMatchLength = forwardMatchLength + backwardMatchLength;
if (totalMatchLength >= bestMatchLength &&
totalMatchLength >= LDM_MIN_MATCH_LENGTH) {
if (totalMatchLength >= bestMatchLength) {
bestMatchLength = totalMatchLength;
*pForwardMatchLength = forwardMatchLength;
*pBackwardMatchLength = backwardMatchLength;
@ -305,7 +304,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
}
}
}
if (bestEntry != NULL && bestMatchLength > LDM_MIN_MATCH_LENGTH) {
if (bestEntry != NULL) {
return bestEntry;
}
return NULL;
@ -951,23 +950,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
}
// TODO: implement and test hash function
void LDM_test(void) {
void LDM_test(const BYTE *src) {
(void)src;
}
/*
void LDM_test(const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
const BYTE *ip = (const BYTE *)src + 1125;
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
U32 sum2;
++ip;
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
ip[-1], ip[LDM_HASH_LENGTH - 1]);
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
}
}
*/

View File

@ -13,13 +13,13 @@
#include "zstd.h"
#define DEBUG
#define TEST
//#define TEST
/* Compress file given by fname and output to oname.
* Returns 0 if successful, error code otherwise.
*
* TODO: This might segfault if the compressed size is > the decompressed
* size due to the mmapping and output file size allocated to be the input size.
* size due to the mmapping and output file size allocated to be the input size
* The compress function should check before writing or buffer writes.
*/
static int compress(const char *fname, const char *oname) {
@ -69,6 +69,11 @@ static int compress(const char *fname, const char *oname) {
perror("mmap error for output");
return 1;
}
#ifdef TEST
LDM_test((const BYTE *)src);
#endif
gettimeofday(&tv1, NULL);
compressedSize = LDM_HEADER_SIZE +
@ -251,8 +256,5 @@ int main(int argc, const char *argv[]) {
/* verify */
verify(inpFilename, decFilename);
#ifdef TEST
LDM_test();
#endif
return 0;
}