Experiment with using a lag when hashing

This commit is contained in:
Stella Lau 2017-07-17 18:13:09 -07:00
parent a00e406231
commit fc41a87964
7 changed files with 88 additions and 52 deletions

View File

@ -25,7 +25,7 @@ LDFLAGS += -lzstd
default: all default: all
all: main-basic main-circular-buffer all: main-basic main-circular-buffer main-lag
main-basic : basic_table.c ldm.c main-ldm.c main-basic : basic_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
@ -33,9 +33,11 @@ main-basic : basic_table.c ldm.c main-ldm.c
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-lag: lag_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean: clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \ @rm -f core *.o tmp* result* *.ldm *.ldm.dec \
main-basic main-circular-buffer main-basic main-circular-buffer main-lag
@echo Cleaning completed @echo Cleaning completed

View File

@ -27,7 +27,6 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
return table->entries + hash; return table->entries + hash;
} }
LDM_hashEntry *HASH_getEntryFromHash( LDM_hashEntry *HASH_getEntryFromHash(
const LDM_hashTable *table, const hash_t hash, const U32 checksum) { const LDM_hashTable *table, const hash_t hash, const U32 checksum) {
(void)checksum; (void)checksum;
@ -43,13 +42,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
(void)checksum; (void)checksum;
if ((*isValid)(pIn, entry->offset + table->offsetBase)) { if ((*isValid)(pIn, entry->offset + table->offsetBase)) {
return entry; return entry;
} else {
return NULL;
} }
return NULL;
} }
void HASH_insert(LDM_hashTable *table, void HASH_insert(LDM_hashTable *table,
const hash_t hash, const LDM_hashEntry entry) { const hash_t hash, const LDM_hashEntry entry) {
*getBucket(table, hash) = entry; *getBucket(table, hash) = entry;

View File

@ -9,7 +9,7 @@
// refactor code to scale the number of elements appropriately. // refactor code to scale the number of elements appropriately.
// Number of elements per hash bucket. // Number of elements per hash bucket.
#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now #define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
struct LDM_hashTable { struct LDM_hashTable {
@ -19,6 +19,7 @@ struct LDM_hashTable {
// Position corresponding to offset=0 in LDM_hashEntry. // Position corresponding to offset=0 in LDM_hashEntry.
const BYTE *offsetBase; const BYTE *offsetBase;
BYTE *bucketOffsets; // Pointer to current insert position. BYTE *bucketOffsets; // Pointer to current insert position.
// Last insert was at bucketOffsets - 1? // Last insert was at bucketOffsets - 1?
}; };
@ -35,15 +36,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
return table->entries + (hash << HASH_BUCKET_SIZE_LOG); return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
} }
/*
static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table,
const hash_t hash) {
LDM_hashEntry *bucket = getBucket(table, hash);
BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1);
return bucket + offset;
}
*/
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum, const U32 checksum,
@ -53,7 +45,12 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
LDM_hashEntry *cur = bucket; LDM_hashEntry *cur = bucket;
// TODO: in order of recency? // TODO: in order of recency?
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
// CHeck checksum for faster check. /*
if (cur->checksum == 0 && cur->offset == 0) {
return NULL;
}
*/
// Check checksum for faster check.
if (cur->checksum == checksum && if (cur->checksum == checksum &&
(*isValid)(pIn, cur->offset + table->offsetBase)) { (*isValid)(pIn, cur->offset + table->offsetBase)) {
return cur; return cur;
@ -62,7 +59,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
return NULL; return NULL;
} }
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum) { const U32 checksum) {

View File

@ -5,7 +5,7 @@
#include <string.h> #include <string.h>
// Insert every (HASH_ONLY_EVERY + 1) into the hash table. // Insert every (HASH_ONLY_EVERY + 1) into the hash table.
#define HASH_ONLY_EVERY 31 #define HASH_ONLY_EVERY 15
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
@ -18,6 +18,10 @@
#define COMPUTE_STATS #define COMPUTE_STATS
#define CHECKSUM_CHAR_OFFSET 10 #define CHECKSUM_CHAR_OFFSET 10
#define LAG 0
//#define HASH_CHECK
//#define RUN_CHECKS //#define RUN_CHECKS
//#define LDM_DEBUG //#define LDM_DEBUG
@ -79,6 +83,10 @@ struct LDM_CCtx {
unsigned step; // ip step, should be 1. unsigned step; // ip step, should be 1.
const BYTE *lagIp;
hash_t lagHash;
U32 lagSum;
// DEBUG // DEBUG
const BYTE *DEBUG_setNextHash; const BYTE *DEBUG_setNextHash;
}; };
@ -253,6 +261,17 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->nextPosHashed = cctx->nextIp; cctx->nextPosHashed = cctx->nextIp;
cctx->nextHash = checksumToHash(cctx->nextSum); cctx->nextHash = checksumToHash(cctx->nextSum);
#if LAG
if (cctx->ip - cctx->ibase > LAG) {
// printf("LAG %zu\n", cctx->ip - cctx->lagIp);
cctx->lagSum = updateChecksum(
cctx->lagSum, LDM_HASH_LENGTH,
cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]);
cctx->lagIp++;
cctx->lagHash = checksumToHash(cctx->lagSum);
}
#endif
#ifdef RUN_CHECKS #ifdef RUN_CHECKS
check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH);
@ -270,18 +289,6 @@ static void setNextHash(LDM_CCtx *cctx) {
static void putHashOfCurrentPositionFromHash( static void putHashOfCurrentPositionFromHash(
LDM_CCtx *cctx, hash_t hash, U32 sum) { LDM_CCtx *cctx, hash_t hash, U32 sum) {
/*
#ifdef COMPUTE_STATS
if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) {
U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset;
cctx->stats.numHashInserts++;
if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) {
cctx->stats.numCollisions++;
}
}
#endif
*/
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
// Note: this works only when cctx->step is 1. // Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
@ -289,8 +296,19 @@ static void putHashOfCurrentPositionFromHash(
const LDM_hashEntry entry = { cctx->ip - cctx->ibase , const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
MEM_read32(cctx->ip) }; MEM_read32(cctx->ip) };
*/ */
#if LAG
// TODO: off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) {
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
HASH_insert(cctx->hashTable, cctx->lagHash, entry);
} else {
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
HASH_insert(cctx->hashTable, hash, entry);
}
#else
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
HASH_insert(cctx->hashTable, hash, entry); HASH_insert(cctx->hashTable, hash, entry);
#endif
} }
cctx->lastPosHashed = cctx->ip; cctx->lastPosHashed = cctx->ip;
@ -331,15 +349,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
putHashOfCurrentPositionFromHash(cctx, hash, sum); putHashOfCurrentPositionFromHash(cctx, hash, sum);
} }
/**
* Returns the position of the entry at hashTable[hash].
*/
/*
static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) {
return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase;
}
*/
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
const BYTE *pInLimit) { const BYTE *pInLimit) {
const BYTE * const pStart = pIn; const BYTE * const pStart = pIn;
@ -431,12 +440,20 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) {
if (cctx->ip > cctx->imatchLimit) { if (cctx->ip > cctx->imatchLimit) {
return 1; return 1;
} }
#ifdef HASH_CHECK
entry = HASH_getEntryFromHash(cctx->hashTable, h, sum);
#else
entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip,
&LDM_isValidMatch); &LDM_isValidMatch);
#endif
if (entry != NULL) { if (entry != NULL) {
*match = entry->offset + cctx->ibase; *match = entry->offset + cctx->ibase;
#ifdef HASH_CHECK
if (!LDM_isValidMatch(cctx->ip, *match)) {
entry = NULL;
}
#endif
} }
putHashOfCurrentPositionFromHash(cctx, h, sum); putHashOfCurrentPositionFromHash(cctx, h, sum);
} }
@ -508,6 +525,12 @@ size_t LDM_compress(const void *src, size_t srcSize,
/* Hash the first position and put it into the hash table. */ /* Hash the first position and put it into the hash table. */
LDM_putHashOfCurrentPosition(&cctx); LDM_putHashOfCurrentPosition(&cctx);
#if LAG
cctx.lagIp = cctx.ip;
cctx.lagHash = cctx.lastHash;
cctx.lagSum = cctx.lastSum;
#endif
/** /**
* Find a match. * Find a match.
* If no more matches can be found (i.e. the length of the remaining input * If no more matches can be found (i.e. the length of the remaining input
@ -575,7 +598,7 @@ size_t LDM_compress(const void *src, size_t srcSize,
/* Encode the last literals (no more matches). */ /* Encode the last literals (no more matches). */
{ {
const size_t lastRun = cctx.iend - cctx.anchor; const U32 lastRun = cctx.iend - cctx.anchor;
BYTE *pToken = cctx.op++; BYTE *pToken = cctx.op++;
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
} }

View File

@ -10,8 +10,8 @@
#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE))
#define LDM_OFFSET_SIZE 4 #define LDM_OFFSET_SIZE 4
// Defines the size of the hash table. // Defines the size of the hash table (currently the number of elements).
#define LDM_MEMORY_USAGE 20 #define LDM_MEMORY_USAGE 12
#define LDM_WINDOW_SIZE_LOG 30 #define LDM_WINDOW_SIZE_LOG 30
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))

View File

@ -3,34 +3,54 @@
#include "mem.h" #include "mem.h"
// TODO: clean up comments
typedef U32 hash_t; typedef U32 hash_t;
typedef struct LDM_hashEntry { typedef struct LDM_hashEntry {
U32 offset; U32 offset; // TODO: Replace with pointer?
U32 checksum; U32 checksum;
} LDM_hashEntry; } LDM_hashEntry;
typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_hashTable LDM_hashTable;
// TODO: rename functions /**
// TODO: comments * Create a hash table with size hash buckets.
* LDM_hashEntry.offset is added to offsetBase to calculate pMatch in
* HASH_getValidEntry.
*/
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase);
//TODO: unneeded? /**
* Returns an LDM_hashEntry from the table that matches the checksum.
* Returns NULL if one does not exist.
*/
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum); const U32 checksum);
/**
* Gets a valid entry that matches the checksum. A valid entry is defined by
* *isValid.
*
* The function finds an entry matching the checksum, computes pMatch as
* offset + table.offsetBase, and calls isValid.
*/
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum, const U32 checksum,
const BYTE *pIn, const BYTE *pIn,
int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); int (*isValid)(const BYTE *pIn, const BYTE *pMatch));
/**
* Insert an LDM_hashEntry into the bucket corresponding to hash.
*/
void HASH_insert(LDM_hashTable *table, const hash_t hash, void HASH_insert(LDM_hashTable *table, const hash_t hash,
const LDM_hashEntry entry); const LDM_hashEntry entry);
/**
* Return the number of distinct hash buckets.
*/
U32 HASH_getSize(const LDM_hashTable *table); U32 HASH_getSize(const LDM_hashTable *table);
void HASH_destroyTable(LDM_hashTable *table); void HASH_destroyTable(LDM_hashTable *table);

View File

@ -163,7 +163,6 @@ static int decompress(const char *fname, const char *oname) {
outSize = LDM_decompress( outSize = LDM_decompress(
src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE,
dst, decompressedSize); dst, decompressedSize);
printf("Ret size out: %zu\n", outSize); printf("Ret size out: %zu\n", outSize);
ftruncate(fdout, outSize); ftruncate(fdout, outSize);