Experiment with using a lag when hashing
This commit is contained in:
parent
a00e406231
commit
fc41a87964
@ -25,7 +25,7 @@ LDFLAGS += -lzstd
|
|||||||
|
|
||||||
default: all
|
default: all
|
||||||
|
|
||||||
all: main-basic main-circular-buffer
|
all: main-basic main-circular-buffer main-lag
|
||||||
|
|
||||||
main-basic : basic_table.c ldm.c main-ldm.c
|
main-basic : basic_table.c ldm.c main-ldm.c
|
||||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
@ -33,9 +33,11 @@ main-basic : basic_table.c ldm.c main-ldm.c
|
|||||||
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
|
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
|
||||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
main-lag: lag_table.c ldm.c main-ldm.c
|
||||||
|
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \
|
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \
|
||||||
main-basic main-circular-buffer
|
main-basic main-circular-buffer main-lag
|
||||||
@echo Cleaning completed
|
@echo Cleaning completed
|
||||||
|
|
||||||
|
@ -27,7 +27,6 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
|
|||||||
return table->entries + hash;
|
return table->entries + hash;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
LDM_hashEntry *HASH_getEntryFromHash(
|
LDM_hashEntry *HASH_getEntryFromHash(
|
||||||
const LDM_hashTable *table, const hash_t hash, const U32 checksum) {
|
const LDM_hashTable *table, const hash_t hash, const U32 checksum) {
|
||||||
(void)checksum;
|
(void)checksum;
|
||||||
@ -43,13 +42,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
|||||||
(void)checksum;
|
(void)checksum;
|
||||||
if ((*isValid)(pIn, entry->offset + table->offsetBase)) {
|
if ((*isValid)(pIn, entry->offset + table->offsetBase)) {
|
||||||
return entry;
|
return entry;
|
||||||
} else {
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void HASH_insert(LDM_hashTable *table,
|
void HASH_insert(LDM_hashTable *table,
|
||||||
const hash_t hash, const LDM_hashEntry entry) {
|
const hash_t hash, const LDM_hashEntry entry) {
|
||||||
*getBucket(table, hash) = entry;
|
*getBucket(table, hash) = entry;
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
// refactor code to scale the number of elements appropriately.
|
// refactor code to scale the number of elements appropriately.
|
||||||
|
|
||||||
// Number of elements per hash bucket.
|
// Number of elements per hash bucket.
|
||||||
#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now
|
#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
|
||||||
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
|
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
|
||||||
|
|
||||||
struct LDM_hashTable {
|
struct LDM_hashTable {
|
||||||
@ -19,6 +19,7 @@ struct LDM_hashTable {
|
|||||||
// Position corresponding to offset=0 in LDM_hashEntry.
|
// Position corresponding to offset=0 in LDM_hashEntry.
|
||||||
const BYTE *offsetBase;
|
const BYTE *offsetBase;
|
||||||
BYTE *bucketOffsets; // Pointer to current insert position.
|
BYTE *bucketOffsets; // Pointer to current insert position.
|
||||||
|
|
||||||
// Last insert was at bucketOffsets - 1?
|
// Last insert was at bucketOffsets - 1?
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -35,15 +36,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
|
|||||||
return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
|
return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table,
|
|
||||||
const hash_t hash) {
|
|
||||||
LDM_hashEntry *bucket = getBucket(table, hash);
|
|
||||||
BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1);
|
|
||||||
return bucket + offset;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||||
const hash_t hash,
|
const hash_t hash,
|
||||||
const U32 checksum,
|
const U32 checksum,
|
||||||
@ -53,7 +45,12 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
|||||||
LDM_hashEntry *cur = bucket;
|
LDM_hashEntry *cur = bucket;
|
||||||
// TODO: in order of recency?
|
// TODO: in order of recency?
|
||||||
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
||||||
// CHeck checksum for faster check.
|
/*
|
||||||
|
if (cur->checksum == 0 && cur->offset == 0) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
// Check checksum for faster check.
|
||||||
if (cur->checksum == checksum &&
|
if (cur->checksum == checksum &&
|
||||||
(*isValid)(pIn, cur->offset + table->offsetBase)) {
|
(*isValid)(pIn, cur->offset + table->offsetBase)) {
|
||||||
return cur;
|
return cur;
|
||||||
@ -62,7 +59,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
||||||
const hash_t hash,
|
const hash_t hash,
|
||||||
const U32 checksum) {
|
const U32 checksum) {
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
|
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
|
||||||
#define HASH_ONLY_EVERY 31
|
#define HASH_ONLY_EVERY 15
|
||||||
|
|
||||||
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
|
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
|
||||||
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
||||||
@ -18,6 +18,10 @@
|
|||||||
|
|
||||||
#define COMPUTE_STATS
|
#define COMPUTE_STATS
|
||||||
#define CHECKSUM_CHAR_OFFSET 10
|
#define CHECKSUM_CHAR_OFFSET 10
|
||||||
|
|
||||||
|
#define LAG 0
|
||||||
|
|
||||||
|
//#define HASH_CHECK
|
||||||
//#define RUN_CHECKS
|
//#define RUN_CHECKS
|
||||||
//#define LDM_DEBUG
|
//#define LDM_DEBUG
|
||||||
|
|
||||||
@ -79,6 +83,10 @@ struct LDM_CCtx {
|
|||||||
|
|
||||||
unsigned step; // ip step, should be 1.
|
unsigned step; // ip step, should be 1.
|
||||||
|
|
||||||
|
const BYTE *lagIp;
|
||||||
|
hash_t lagHash;
|
||||||
|
U32 lagSum;
|
||||||
|
|
||||||
// DEBUG
|
// DEBUG
|
||||||
const BYTE *DEBUG_setNextHash;
|
const BYTE *DEBUG_setNextHash;
|
||||||
};
|
};
|
||||||
@ -253,6 +261,17 @@ static void setNextHash(LDM_CCtx *cctx) {
|
|||||||
cctx->nextPosHashed = cctx->nextIp;
|
cctx->nextPosHashed = cctx->nextIp;
|
||||||
cctx->nextHash = checksumToHash(cctx->nextSum);
|
cctx->nextHash = checksumToHash(cctx->nextSum);
|
||||||
|
|
||||||
|
#if LAG
|
||||||
|
if (cctx->ip - cctx->ibase > LAG) {
|
||||||
|
// printf("LAG %zu\n", cctx->ip - cctx->lagIp);
|
||||||
|
cctx->lagSum = updateChecksum(
|
||||||
|
cctx->lagSum, LDM_HASH_LENGTH,
|
||||||
|
cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]);
|
||||||
|
cctx->lagIp++;
|
||||||
|
cctx->lagHash = checksumToHash(cctx->lagSum);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef RUN_CHECKS
|
#ifdef RUN_CHECKS
|
||||||
check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH);
|
check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH);
|
||||||
|
|
||||||
@ -270,18 +289,6 @@ static void setNextHash(LDM_CCtx *cctx) {
|
|||||||
|
|
||||||
static void putHashOfCurrentPositionFromHash(
|
static void putHashOfCurrentPositionFromHash(
|
||||||
LDM_CCtx *cctx, hash_t hash, U32 sum) {
|
LDM_CCtx *cctx, hash_t hash, U32 sum) {
|
||||||
/*
|
|
||||||
#ifdef COMPUTE_STATS
|
|
||||||
if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) {
|
|
||||||
U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset;
|
|
||||||
cctx->stats.numHashInserts++;
|
|
||||||
if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) {
|
|
||||||
cctx->stats.numCollisions++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
|
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
|
||||||
// Note: this works only when cctx->step is 1.
|
// Note: this works only when cctx->step is 1.
|
||||||
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
||||||
@ -289,8 +296,19 @@ static void putHashOfCurrentPositionFromHash(
|
|||||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
|
||||||
MEM_read32(cctx->ip) };
|
MEM_read32(cctx->ip) };
|
||||||
*/
|
*/
|
||||||
|
#if LAG
|
||||||
|
// TODO: off by 1, but whatever
|
||||||
|
if (cctx->lagIp - cctx->ibase > 0) {
|
||||||
|
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
|
||||||
|
HASH_insert(cctx->hashTable, cctx->lagHash, entry);
|
||||||
|
} else {
|
||||||
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||||
|
HASH_insert(cctx->hashTable, hash, entry);
|
||||||
|
}
|
||||||
|
#else
|
||||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||||
HASH_insert(cctx->hashTable, hash, entry);
|
HASH_insert(cctx->hashTable, hash, entry);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
cctx->lastPosHashed = cctx->ip;
|
cctx->lastPosHashed = cctx->ip;
|
||||||
@ -331,15 +349,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
|
|||||||
putHashOfCurrentPositionFromHash(cctx, hash, sum);
|
putHashOfCurrentPositionFromHash(cctx, hash, sum);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the position of the entry at hashTable[hash].
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) {
|
|
||||||
return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
|
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
|
||||||
const BYTE *pInLimit) {
|
const BYTE *pInLimit) {
|
||||||
const BYTE * const pStart = pIn;
|
const BYTE * const pStart = pIn;
|
||||||
@ -431,12 +440,20 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) {
|
|||||||
if (cctx->ip > cctx->imatchLimit) {
|
if (cctx->ip > cctx->imatchLimit) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
#ifdef HASH_CHECK
|
||||||
|
entry = HASH_getEntryFromHash(cctx->hashTable, h, sum);
|
||||||
|
#else
|
||||||
entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip,
|
entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip,
|
||||||
&LDM_isValidMatch);
|
&LDM_isValidMatch);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (entry != NULL) {
|
if (entry != NULL) {
|
||||||
*match = entry->offset + cctx->ibase;
|
*match = entry->offset + cctx->ibase;
|
||||||
|
#ifdef HASH_CHECK
|
||||||
|
if (!LDM_isValidMatch(cctx->ip, *match)) {
|
||||||
|
entry = NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
putHashOfCurrentPositionFromHash(cctx, h, sum);
|
putHashOfCurrentPositionFromHash(cctx, h, sum);
|
||||||
}
|
}
|
||||||
@ -508,6 +525,12 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
/* Hash the first position and put it into the hash table. */
|
/* Hash the first position and put it into the hash table. */
|
||||||
LDM_putHashOfCurrentPosition(&cctx);
|
LDM_putHashOfCurrentPosition(&cctx);
|
||||||
|
|
||||||
|
#if LAG
|
||||||
|
cctx.lagIp = cctx.ip;
|
||||||
|
cctx.lagHash = cctx.lastHash;
|
||||||
|
cctx.lagSum = cctx.lastSum;
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find a match.
|
* Find a match.
|
||||||
* If no more matches can be found (i.e. the length of the remaining input
|
* If no more matches can be found (i.e. the length of the remaining input
|
||||||
@ -575,7 +598,7 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
|
|
||||||
/* Encode the last literals (no more matches). */
|
/* Encode the last literals (no more matches). */
|
||||||
{
|
{
|
||||||
const size_t lastRun = cctx.iend - cctx.anchor;
|
const U32 lastRun = cctx.iend - cctx.anchor;
|
||||||
BYTE *pToken = cctx.op++;
|
BYTE *pToken = cctx.op++;
|
||||||
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
|
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
|
||||||
}
|
}
|
||||||
|
@ -10,8 +10,8 @@
|
|||||||
#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE))
|
#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE))
|
||||||
#define LDM_OFFSET_SIZE 4
|
#define LDM_OFFSET_SIZE 4
|
||||||
|
|
||||||
// Defines the size of the hash table.
|
// Defines the size of the hash table (currently the number of elements).
|
||||||
#define LDM_MEMORY_USAGE 20
|
#define LDM_MEMORY_USAGE 12
|
||||||
|
|
||||||
#define LDM_WINDOW_SIZE_LOG 30
|
#define LDM_WINDOW_SIZE_LOG 30
|
||||||
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
||||||
|
@ -3,34 +3,54 @@
|
|||||||
|
|
||||||
#include "mem.h"
|
#include "mem.h"
|
||||||
|
|
||||||
|
// TODO: clean up comments
|
||||||
|
|
||||||
typedef U32 hash_t;
|
typedef U32 hash_t;
|
||||||
|
|
||||||
typedef struct LDM_hashEntry {
|
typedef struct LDM_hashEntry {
|
||||||
U32 offset;
|
U32 offset; // TODO: Replace with pointer?
|
||||||
U32 checksum;
|
U32 checksum;
|
||||||
} LDM_hashEntry;
|
} LDM_hashEntry;
|
||||||
|
|
||||||
typedef struct LDM_hashTable LDM_hashTable;
|
typedef struct LDM_hashTable LDM_hashTable;
|
||||||
|
|
||||||
// TODO: rename functions
|
/**
|
||||||
// TODO: comments
|
* Create a hash table with size hash buckets.
|
||||||
|
* LDM_hashEntry.offset is added to offsetBase to calculate pMatch in
|
||||||
|
* HASH_getValidEntry.
|
||||||
|
*/
|
||||||
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase);
|
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase);
|
||||||
|
|
||||||
//TODO: unneeded?
|
/**
|
||||||
|
* Returns an LDM_hashEntry from the table that matches the checksum.
|
||||||
|
* Returns NULL if one does not exist.
|
||||||
|
*/
|
||||||
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
||||||
const hash_t hash,
|
const hash_t hash,
|
||||||
const U32 checksum);
|
const U32 checksum);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a valid entry that matches the checksum. A valid entry is defined by
|
||||||
|
* *isValid.
|
||||||
|
*
|
||||||
|
* The function finds an entry matching the checksum, computes pMatch as
|
||||||
|
* offset + table.offsetBase, and calls isValid.
|
||||||
|
*/
|
||||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||||
const hash_t hash,
|
const hash_t hash,
|
||||||
const U32 checksum,
|
const U32 checksum,
|
||||||
const BYTE *pIn,
|
const BYTE *pIn,
|
||||||
int (*isValid)(const BYTE *pIn, const BYTE *pMatch));
|
int (*isValid)(const BYTE *pIn, const BYTE *pMatch));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Insert an LDM_hashEntry into the bucket corresponding to hash.
|
||||||
|
*/
|
||||||
void HASH_insert(LDM_hashTable *table, const hash_t hash,
|
void HASH_insert(LDM_hashTable *table, const hash_t hash,
|
||||||
const LDM_hashEntry entry);
|
const LDM_hashEntry entry);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the number of distinct hash buckets.
|
||||||
|
*/
|
||||||
U32 HASH_getSize(const LDM_hashTable *table);
|
U32 HASH_getSize(const LDM_hashTable *table);
|
||||||
|
|
||||||
void HASH_destroyTable(LDM_hashTable *table);
|
void HASH_destroyTable(LDM_hashTable *table);
|
||||||
|
@ -163,7 +163,6 @@ static int decompress(const char *fname, const char *oname) {
|
|||||||
outSize = LDM_decompress(
|
outSize = LDM_decompress(
|
||||||
src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE,
|
src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE,
|
||||||
dst, decompressedSize);
|
dst, decompressedSize);
|
||||||
|
|
||||||
printf("Ret size out: %zu\n", outSize);
|
printf("Ret size out: %zu\n", outSize);
|
||||||
ftruncate(fdout, outSize);
|
ftruncate(fdout, outSize);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user