Move hash table access into its own functions
This commit is contained in:
parent
2d8e6c6608
commit
6e443b4960
@ -23,17 +23,23 @@ struct LDM_hashEntry {
|
|||||||
offset_t offset;
|
offset_t offset;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct LDM_hashTable {
|
// TODO: move to its own file.
|
||||||
U32 numEntries;
|
struct LDM_hashTable {
|
||||||
U32 minimumTagMask; // TODO: what if tag == offset?
|
U32 size;
|
||||||
|
|
||||||
// Maximum number of elements in the table.
|
|
||||||
U32 limit;
|
|
||||||
|
|
||||||
LDM_hashEntry *entries;
|
LDM_hashEntry *entries;
|
||||||
} LDM_hashTable;
|
};
|
||||||
|
|
||||||
|
LDM_hashEntry *HASH_getHash(
|
||||||
|
const LDM_hashTable *table, const hash_t hash) {
|
||||||
|
return &(table->entries[hash]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void HASH_insert(LDM_hashTable *table,
|
||||||
|
const hash_t hash, const LDM_hashEntry entry) {
|
||||||
|
*HASH_getHash(table, hash) = entry;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO: Add offset histogram by powers of two
|
|
||||||
// TODO: Scanning speed
|
// TODO: Scanning speed
|
||||||
// TODO: Memory usage
|
// TODO: Memory usage
|
||||||
struct LDM_compressStats {
|
struct LDM_compressStats {
|
||||||
@ -74,7 +80,9 @@ struct LDM_CCtx {
|
|||||||
|
|
||||||
LDM_compressStats stats; /* Compression statistics */
|
LDM_compressStats stats; /* Compression statistics */
|
||||||
|
|
||||||
LDM_hashEntry *hashTable;
|
LDM_hashTable hashTable;
|
||||||
|
|
||||||
|
// LDM_hashEntry *hashTable;
|
||||||
|
|
||||||
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
|
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
|
||||||
|
|
||||||
@ -93,18 +101,19 @@ struct LDM_CCtx {
|
|||||||
const BYTE *DEBUG_setNextHash;
|
const BYTE *DEBUG_setNextHash;
|
||||||
};
|
};
|
||||||
|
|
||||||
void LDM_outputHashTableOccupancy(
|
|
||||||
const LDM_hashEntry *hashTable, U32 hashTableSize) {
|
|
||||||
|
void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) {
|
||||||
U32 i = 0;
|
U32 i = 0;
|
||||||
U32 ctr = 0;
|
U32 ctr = 0;
|
||||||
for (; i < hashTableSize; i++) {
|
for (; i < hashTable->size; i++) {
|
||||||
if (hashTable[i].offset == 0) {
|
if (HASH_getHash(hashTable, i)->offset == 0) {
|
||||||
ctr++;
|
ctr++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
||||||
hashTableSize, ctr,
|
hashTable->size, ctr,
|
||||||
100.0 * (double)(ctr) / (double)hashTableSize);
|
100.0 * (double)(ctr) / (double)hashTable->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: This can be done more efficiently (but it is not that important as it
|
// TODO: This can be done more efficiently (but it is not that important as it
|
||||||
@ -120,13 +129,14 @@ static int intLog2(U32 x) {
|
|||||||
// TODO: Maybe we would eventually prefer to have linear rather than
|
// TODO: Maybe we would eventually prefer to have linear rather than
|
||||||
// exponential buckets.
|
// exponential buckets.
|
||||||
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
|
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
|
||||||
int i = 0;
|
U32 i = 0;
|
||||||
int buckets[32] = { 0 };
|
int buckets[32] = { 0 };
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Hash table histogram\n");
|
printf("Hash table histogram\n");
|
||||||
for (; i < LDM_HASHTABLESIZE_U32; i++) {
|
for (; i < cctx->hashTable.size; i++) {
|
||||||
int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset;
|
int offset = (cctx->ip - cctx->ibase) -
|
||||||
|
HASH_getHash(&cctx->hashTable, i)->offset;
|
||||||
buckets[intLog2(offset)]++;
|
buckets[intLog2(offset)]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,7 +145,7 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
|
|||||||
printf("2^%*d: %10u %6.3f%%\n", 2, i,
|
printf("2^%*d: %10u %6.3f%%\n", 2, i,
|
||||||
buckets[i],
|
buckets[i],
|
||||||
100.0 * (double) buckets[i] /
|
100.0 * (double) buckets[i] /
|
||||||
(double) LDM_HASHTABLESIZE_U32);
|
(double) cctx->hashTable.size);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
@ -305,7 +315,7 @@ static void putHashOfCurrentPositionFromHash(
|
|||||||
LDM_CCtx *cctx, hash_t hash, U32 sum) {
|
LDM_CCtx *cctx, hash_t hash, U32 sum) {
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) {
|
if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) {
|
||||||
offset_t offset = cctx->hashTable[hash].offset;
|
offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset;
|
||||||
cctx->stats.numHashInserts++;
|
cctx->stats.numHashInserts++;
|
||||||
if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) {
|
if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) {
|
||||||
cctx->stats.numCollisions++;
|
cctx->stats.numCollisions++;
|
||||||
@ -317,7 +327,7 @@ static void putHashOfCurrentPositionFromHash(
|
|||||||
// Note: this works only when cctx->step is 1.
|
// Note: this works only when cctx->step is 1.
|
||||||
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
||||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
|
||||||
cctx->hashTable[hash] = entry;
|
HASH_insert(&cctx->hashTable, hash, entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
cctx->lastPosHashed = cctx->ip;
|
cctx->lastPosHashed = cctx->ip;
|
||||||
@ -362,7 +372,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
|
|||||||
* Returns the position of the entry at hashTable[hash].
|
* Returns the position of the entry at hashTable[hash].
|
||||||
*/
|
*/
|
||||||
static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) {
|
static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) {
|
||||||
return cctx->hashTable[hash].offset + cctx->ibase;
|
return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase;
|
||||||
}
|
}
|
||||||
|
|
||||||
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
|
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
|
||||||
@ -389,6 +399,11 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
|
|||||||
// ip += sizeof(U64);
|
// ip += sizeof(U64);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void LDM_initializeHashTable(LDM_hashTable *table) {
|
||||||
|
table->size = LDM_HASHTABLESIZE_U32;
|
||||||
|
table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
|
||||||
|
}
|
||||||
|
|
||||||
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
||||||
const void *src, size_t srcSize,
|
const void *src, size_t srcSize,
|
||||||
void *dst, size_t maxDstSize) {
|
void *dst, size_t maxDstSize) {
|
||||||
@ -408,7 +423,9 @@ void LDM_initializeCCtx(LDM_CCtx *cctx,
|
|||||||
cctx->anchor = cctx->ibase;
|
cctx->anchor = cctx->ibase;
|
||||||
|
|
||||||
memset(&(cctx->stats), 0, sizeof(cctx->stats));
|
memset(&(cctx->stats), 0, sizeof(cctx->stats));
|
||||||
cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
|
|
||||||
|
LDM_initializeHashTable(&cctx->hashTable);
|
||||||
|
// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
|
||||||
// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
|
// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
|
||||||
cctx->stats.minOffset = UINT_MAX;
|
cctx->stats.minOffset = UINT_MAX;
|
||||||
cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
|
cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
|
||||||
@ -424,6 +441,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx,
|
|||||||
cctx->DEBUG_setNextHash = 0;
|
cctx->DEBUG_setNextHash = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void LDM_destroyCCtx(LDM_CCtx *cctx) {
|
||||||
|
free((cctx->hashTable).entries);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds the "best" match.
|
* Finds the "best" match.
|
||||||
*
|
*
|
||||||
@ -594,10 +615,14 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
|
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
LDM_printCompressStats(&cctx.stats);
|
LDM_printCompressStats(&cctx.stats);
|
||||||
LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
|
LDM_outputHashTableOccupancy(&cctx.hashTable);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return cctx.op - cctx.obase;
|
{
|
||||||
|
const size_t ret = cctx.op - cctx.obase;
|
||||||
|
LDM_destroyCCtx(&cctx);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct LDM_DCtx {
|
struct LDM_DCtx {
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
typedef U32 offset_t;
|
typedef U32 offset_t;
|
||||||
typedef U32 hash_t;
|
typedef U32 hash_t;
|
||||||
typedef struct LDM_hashEntry LDM_hashEntry;
|
typedef struct LDM_hashEntry LDM_hashEntry;
|
||||||
|
typedef struct LDM_hashTable LDM_hashTable;
|
||||||
typedef struct LDM_compressStats LDM_compressStats;
|
typedef struct LDM_compressStats LDM_compressStats;
|
||||||
typedef struct LDM_CCtx LDM_CCtx;
|
typedef struct LDM_CCtx LDM_CCtx;
|
||||||
typedef struct LDM_DCtx LDM_DCtx;
|
typedef struct LDM_DCtx LDM_DCtx;
|
||||||
@ -62,17 +63,23 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the compression context.
|
* Initialize the compression context.
|
||||||
|
*
|
||||||
|
* Allocates memory for the hash table.
|
||||||
*/
|
*/
|
||||||
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
||||||
const void *src, size_t srcSize,
|
const void *src, size_t srcSize,
|
||||||
void *dst, size_t maxDstSize);
|
void *dst, size_t maxDstSize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees up memory allocated in initializeCCtx
|
||||||
|
*/
|
||||||
|
void LDM_destroyCCtx(LDM_CCtx *cctx);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prints the percentage of the hash table occupied (where occupied is defined
|
* Prints the percentage of the hash table occupied (where occupied is defined
|
||||||
* as the entry being non-zero).
|
* as the entry being non-zero).
|
||||||
*/
|
*/
|
||||||
void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable,
|
void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable);
|
||||||
U32 hashTableSize);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prints the distribution of offsets in the hash table.
|
* Prints the distribution of offsets in the hash table.
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
# This Makefile presumes libzstd is installed, using `sudo make install`
|
# This Makefile presumes libzstd is installed, using `sudo make install`
|
||||||
|
|
||||||
CPPFLAGS+= -I../../../../lib/common
|
CPPFLAGS+= -I../../lib/common
|
||||||
CFLAGS ?= -O3
|
CFLAGS ?= -O3
|
||||||
DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
|
DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
|
||||||
-Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
|
-Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
|
||||||
@ -27,7 +27,7 @@ default: all
|
|||||||
|
|
||||||
all: main-ldm
|
all: main-ldm
|
||||||
|
|
||||||
main-ldm : ldm.c main-ldm.c
|
main-ldm : ldm.h ldm.c main-ldm.c
|
||||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
#define RUN_MASK ((1U<<RUN_BITS)-1)
|
#define RUN_MASK ((1U<<RUN_BITS)-1)
|
||||||
|
|
||||||
#define COMPUTE_STATS
|
#define COMPUTE_STATS
|
||||||
#define CHECKSUM_CHAR_OFFSET 0
|
#define CHECKSUM_CHAR_OFFSET 10
|
||||||
//#define RUN_CHECKS
|
//#define RUN_CHECKS
|
||||||
//#define LDM_DEBUG
|
//#define LDM_DEBUG
|
||||||
|
|
||||||
@ -23,10 +23,21 @@ struct LDM_hashEntry {
|
|||||||
offset_t offset;
|
offset_t offset;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef struct LDM_hashTable {
|
||||||
|
U32 numEntries;
|
||||||
|
U32 minimumTagMask; // TODO: what if tag == offset?
|
||||||
|
|
||||||
|
// Maximum number of elements in the table.
|
||||||
|
U32 limit;
|
||||||
|
|
||||||
|
LDM_hashEntry *entries;
|
||||||
|
} LDM_hashTable;
|
||||||
|
|
||||||
// TODO: Add offset histogram by powers of two
|
// TODO: Add offset histogram by powers of two
|
||||||
// TODO: Scanning speed
|
// TODO: Scanning speed
|
||||||
// TODO: Memory usage
|
// TODO: Memory usage
|
||||||
struct LDM_compressStats {
|
struct LDM_compressStats {
|
||||||
|
U32 windowSizeLog, hashTableSizeLog;
|
||||||
U32 numMatches;
|
U32 numMatches;
|
||||||
U64 totalMatchLength;
|
U64 totalMatchLength;
|
||||||
U64 totalLiteralLength;
|
U64 totalLiteralLength;
|
||||||
@ -36,6 +47,8 @@ struct LDM_compressStats {
|
|||||||
|
|
||||||
U32 numCollisions;
|
U32 numCollisions;
|
||||||
U32 numHashInserts;
|
U32 numHashInserts;
|
||||||
|
|
||||||
|
U32 offsetHistogram[32];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LDM_CCtx {
|
struct LDM_CCtx {
|
||||||
@ -61,7 +74,9 @@ struct LDM_CCtx {
|
|||||||
|
|
||||||
LDM_compressStats stats; /* Compression statistics */
|
LDM_compressStats stats; /* Compression statistics */
|
||||||
|
|
||||||
LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
|
LDM_hashEntry *hashTable;
|
||||||
|
|
||||||
|
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
|
||||||
|
|
||||||
const BYTE *lastPosHashed; /* Last position hashed */
|
const BYTE *lastPosHashed; /* Last position hashed */
|
||||||
hash_t lastHash; /* Hash corresponding to lastPosHashed */
|
hash_t lastHash; /* Hash corresponding to lastPosHashed */
|
||||||
@ -78,7 +93,7 @@ struct LDM_CCtx {
|
|||||||
const BYTE *DEBUG_setNextHash;
|
const BYTE *DEBUG_setNextHash;
|
||||||
};
|
};
|
||||||
|
|
||||||
void LDM_outputHashtableOccupancy(
|
void LDM_outputHashTableOccupancy(
|
||||||
const LDM_hashEntry *hashTable, U32 hashTableSize) {
|
const LDM_hashEntry *hashTable, U32 hashTableSize) {
|
||||||
U32 i = 0;
|
U32 i = 0;
|
||||||
U32 ctr = 0;
|
U32 ctr = 0;
|
||||||
@ -92,26 +107,78 @@ void LDM_outputHashtableOccupancy(
|
|||||||
100.0 * (double)(ctr) / (double)hashTableSize);
|
100.0 * (double)(ctr) / (double)hashTableSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: This can be done more efficiently (but it is not that important as it
|
||||||
|
// is only used for computing stats).
|
||||||
|
static int intLog2(U32 x) {
|
||||||
|
int ret = 0;
|
||||||
|
while (x >>= 1) {
|
||||||
|
ret++;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Maybe we would eventually prefer to have linear rather than
|
||||||
|
// exponential buckets.
|
||||||
|
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) {
|
||||||
|
int i = 0;
|
||||||
|
int buckets[32] = { 0 };
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("Hash table histogram\n");
|
||||||
|
for (; i < LDM_HASHTABLESIZE_U32; i++) {
|
||||||
|
int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset;
|
||||||
|
buckets[intLog2(offset)]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
for (; i < 32; i++) {
|
||||||
|
printf("2^%*d: %10u %6.3f%%\n", 2, i,
|
||||||
|
buckets[i],
|
||||||
|
100.0 * (double) buckets[i] /
|
||||||
|
(double) LDM_HASHTABLESIZE_U32);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
void LDM_printCompressStats(const LDM_compressStats *stats) {
|
void LDM_printCompressStats(const LDM_compressStats *stats) {
|
||||||
|
int i = 0;
|
||||||
printf("=====================\n");
|
printf("=====================\n");
|
||||||
printf("Compression statistics\n");
|
printf("Compression statistics\n");
|
||||||
//TODO: compute percentage matched?
|
//TODO: compute percentage matched?
|
||||||
|
printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
|
||||||
|
stats->windowSizeLog, stats->hashTableSizeLog);
|
||||||
printf("num matches, total match length: %u, %llu\n",
|
printf("num matches, total match length: %u, %llu\n",
|
||||||
stats->numMatches,
|
stats->numMatches,
|
||||||
stats->totalMatchLength);
|
stats->totalMatchLength);
|
||||||
printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) /
|
printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) /
|
||||||
(double)stats->numMatches);
|
(double)stats->numMatches);
|
||||||
printf("avg literal length: %.1f\n",
|
printf("avg literal length, total literalLength: %.1f, %llu\n",
|
||||||
((double)stats->totalLiteralLength) / (double)stats->numMatches);
|
((double)stats->totalLiteralLength) / (double)stats->numMatches,
|
||||||
|
stats->totalLiteralLength);
|
||||||
printf("avg offset length: %.1f\n",
|
printf("avg offset length: %.1f\n",
|
||||||
((double)stats->totalOffset) / (double)stats->numMatches);
|
((double)stats->totalOffset) / (double)stats->numMatches);
|
||||||
printf("min offset, max offset: %u %u\n",
|
printf("min offset, max offset: %u, %u\n",
|
||||||
stats->minOffset, stats->maxOffset);
|
stats->minOffset, stats->maxOffset);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("offset histogram: offset, num matches, %% of matches\n");
|
||||||
|
|
||||||
|
for (; i <= intLog2(stats->maxOffset); i++) {
|
||||||
|
printf("2^%*d: %10u %6.3f%%\n", 2, i,
|
||||||
|
stats->offsetHistogram[i],
|
||||||
|
100.0 * (double) stats->offsetHistogram[i] /
|
||||||
|
(double) stats->numMatches);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
|
||||||
printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n",
|
printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n",
|
||||||
stats->numCollisions, stats->numHashInserts,
|
stats->numCollisions, stats->numHashInserts,
|
||||||
stats->numHashInserts == 0 ?
|
stats->numHashInserts == 0 ?
|
||||||
1.0 : (100.0 * (double)stats->numCollisions) /
|
1.0 : (100.0 * (double)stats->numCollisions) /
|
||||||
(double)stats->numHashInserts);
|
(double)stats->numHashInserts);
|
||||||
|
printf("=====================\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
|
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
|
||||||
@ -145,7 +212,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
|
|||||||
* of the hash table.
|
* of the hash table.
|
||||||
*/
|
*/
|
||||||
static hash_t checksumToHash(U32 sum) {
|
static hash_t checksumToHash(U32 sum) {
|
||||||
return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG));
|
return ((sum * 2654435761U) >> (32 - LDM_HASHLOG));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -341,8 +408,12 @@ void LDM_initializeCCtx(LDM_CCtx *cctx,
|
|||||||
cctx->anchor = cctx->ibase;
|
cctx->anchor = cctx->ibase;
|
||||||
|
|
||||||
memset(&(cctx->stats), 0, sizeof(cctx->stats));
|
memset(&(cctx->stats), 0, sizeof(cctx->stats));
|
||||||
memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
|
cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry));
|
||||||
|
// memset(cctx->hashTable, 0, sizeof(cctx->hashTable));
|
||||||
cctx->stats.minOffset = UINT_MAX;
|
cctx->stats.minOffset = UINT_MAX;
|
||||||
|
cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
|
||||||
|
cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE;
|
||||||
|
|
||||||
|
|
||||||
cctx->lastPosHashed = NULL;
|
cctx->lastPosHashed = NULL;
|
||||||
|
|
||||||
@ -353,6 +424,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx,
|
|||||||
cctx->DEBUG_setNextHash = 0;
|
cctx->DEBUG_setNextHash = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void LDM_destroyCCtx(LDM_CCtx *cctx) {
|
||||||
|
free(cctx->hashTable);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds the "best" match.
|
* Finds the "best" match.
|
||||||
*
|
*
|
||||||
@ -379,7 +454,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) {
|
|||||||
*match = getPositionOnHash(cctx, h);
|
*match = getPositionOnHash(cctx, h);
|
||||||
putHashOfCurrentPositionFromHash(cctx, h, sum);
|
putHashOfCurrentPositionFromHash(cctx, h, sum);
|
||||||
|
|
||||||
} while (cctx->ip - *match > WINDOW_SIZE ||
|
} while (cctx->ip - *match > LDM_WINDOW_SIZE ||
|
||||||
!LDM_isValidMatch(cctx->ip, *match));
|
!LDM_isValidMatch(cctx->ip, *match));
|
||||||
setNextHash(cctx);
|
setNextHash(cctx);
|
||||||
return 0;
|
return 0;
|
||||||
@ -443,24 +518,19 @@ void LDM_outputBlock(LDM_CCtx *cctx,
|
|||||||
size_t LDM_compress(const void *src, size_t srcSize,
|
size_t LDM_compress(const void *src, size_t srcSize,
|
||||||
void *dst, size_t maxDstSize) {
|
void *dst, size_t maxDstSize) {
|
||||||
LDM_CCtx cctx;
|
LDM_CCtx cctx;
|
||||||
|
const BYTE *match;
|
||||||
LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize);
|
LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize);
|
||||||
|
|
||||||
/* Hash the first position and put it into the hash table. */
|
/* Hash the first position and put it into the hash table. */
|
||||||
LDM_putHashOfCurrentPosition(&cctx);
|
LDM_putHashOfCurrentPosition(&cctx);
|
||||||
|
|
||||||
// TODO: loop condition is not accurate.
|
/**
|
||||||
while (1) {
|
* Find a match.
|
||||||
const BYTE *match;
|
* If no more matches can be found (i.e. the length of the remaining input
|
||||||
|
* is less than the minimum match length), then stop searching for matches
|
||||||
/**
|
* and encode the final literals.
|
||||||
* Find a match.
|
*/
|
||||||
* If no more matches can be found (i.e. the length of the remaining input
|
while (LDM_findBestMatch(&cctx, &match) == 0) {
|
||||||
* is less than the minimum match length), then stop searching for matches
|
|
||||||
* and encode the final literals.
|
|
||||||
*/
|
|
||||||
if (LDM_findBestMatch(&cctx, &match) != 0) {
|
|
||||||
goto _last_literals;
|
|
||||||
}
|
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
cctx.stats.numMatches++;
|
cctx.stats.numMatches++;
|
||||||
#endif
|
#endif
|
||||||
@ -485,6 +555,8 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH,
|
cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH,
|
||||||
cctx.ihashLimit);
|
cctx.ihashLimit);
|
||||||
|
|
||||||
|
LDM_outputBlock(&cctx, literalLength, offset, matchLength);
|
||||||
|
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
cctx.stats.totalLiteralLength += literalLength;
|
cctx.stats.totalLiteralLength += literalLength;
|
||||||
cctx.stats.totalOffset += offset;
|
cctx.stats.totalOffset += offset;
|
||||||
@ -493,8 +565,8 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset;
|
offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset;
|
||||||
cctx.stats.maxOffset =
|
cctx.stats.maxOffset =
|
||||||
offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
|
offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
|
||||||
|
cctx.stats.offsetHistogram[(U32)intLog2(offset)]++;
|
||||||
#endif
|
#endif
|
||||||
LDM_outputBlock(&cctx, literalLength, offset, matchLength);
|
|
||||||
|
|
||||||
// Move ip to end of block, inserting hashes at each position.
|
// Move ip to end of block, inserting hashes at each position.
|
||||||
cctx.nextIp = cctx.ip + cctx.step;
|
cctx.nextIp = cctx.ip + cctx.step;
|
||||||
@ -514,20 +586,26 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
cctx.anchor = cctx.ip;
|
cctx.anchor = cctx.ip;
|
||||||
LDM_updateLastHashFromNextHash(&cctx);
|
LDM_updateLastHashFromNextHash(&cctx);
|
||||||
}
|
}
|
||||||
_last_literals:
|
|
||||||
|
// LDM_outputHashTableOffsetHistogram(&cctx);
|
||||||
|
|
||||||
/* Encode the last literals (no more matches). */
|
/* Encode the last literals (no more matches). */
|
||||||
{
|
{
|
||||||
const size_t lastRun = (size_t)(cctx.iend - cctx.anchor);
|
const size_t lastRun = cctx.iend - cctx.anchor;
|
||||||
BYTE *pToken = cctx.op++;
|
BYTE *pToken = cctx.op++;
|
||||||
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
|
LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
LDM_printCompressStats(&cctx.stats);
|
LDM_printCompressStats(&cctx.stats);
|
||||||
LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
|
LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return (cctx.op - (const BYTE *)cctx.obase);
|
{
|
||||||
|
const size_t ret = cctx.op - cctx.obase;
|
||||||
|
LDM_destroyCCtx(&cctx);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct LDM_DCtx {
|
struct LDM_DCtx {
|
||||||
@ -611,7 +689,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
|
|||||||
|
|
||||||
// TODO: implement and test hash function
|
// TODO: implement and test hash function
|
||||||
void LDM_test(void) {
|
void LDM_test(void) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -11,16 +11,17 @@
|
|||||||
#define LDM_OFFSET_SIZE 4
|
#define LDM_OFFSET_SIZE 4
|
||||||
|
|
||||||
// Defines the size of the hash table.
|
// Defines the size of the hash table.
|
||||||
#define LDM_MEMORY_USAGE 22
|
#define LDM_MEMORY_USAGE 16
|
||||||
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
|
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
|
||||||
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
||||||
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
|
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
|
||||||
|
|
||||||
#define WINDOW_SIZE (1 << 25)
|
#define LDM_WINDOW_SIZE_LOG 25
|
||||||
|
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
||||||
|
|
||||||
//These should be multiples of four.
|
//These should be multiples of four.
|
||||||
#define LDM_MIN_MATCH_LENGTH 8
|
#define LDM_MIN_MATCH_LENGTH 4
|
||||||
#define LDM_HASH_LENGTH 8
|
#define LDM_HASH_LENGTH 4
|
||||||
|
|
||||||
typedef U32 offset_t;
|
typedef U32 offset_t;
|
||||||
typedef U32 hash_t;
|
typedef U32 hash_t;
|
||||||
@ -61,18 +62,33 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the compression context.
|
* Initialize the compression context.
|
||||||
|
*
|
||||||
|
* Allocates memory for the hash table.
|
||||||
*/
|
*/
|
||||||
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
void LDM_initializeCCtx(LDM_CCtx *cctx,
|
||||||
const void *src, size_t srcSize,
|
const void *src, size_t srcSize,
|
||||||
void *dst, size_t maxDstSize);
|
void *dst, size_t maxDstSize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees up memory allocated in initializeCCtx
|
||||||
|
*/
|
||||||
|
void LDM_destroyCCtx(LDM_CCtx *cctx);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prints the percentage of the hash table occupied (where occupied is defined
|
* Prints the percentage of the hash table occupied (where occupied is defined
|
||||||
* as the entry being non-zero).
|
* as the entry being non-zero).
|
||||||
*/
|
*/
|
||||||
void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable,
|
void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable,
|
||||||
U32 hashTableSize);
|
U32 hashTableSize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prints the distribution of offsets in the hash table.
|
||||||
|
*
|
||||||
|
* The offsets are defined as the distance of the hash table entry from the
|
||||||
|
* current input position of the cctx.
|
||||||
|
*/
|
||||||
|
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Outputs compression statistics to stdout.
|
* Outputs compression statistics to stdout.
|
||||||
*/
|
*/
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#include "zstd.h"
|
#include "zstd.h"
|
||||||
|
|
||||||
#define DEBUG
|
#define DEBUG
|
||||||
//#define TEST
|
#define TEST
|
||||||
|
|
||||||
/* Compress file given by fname and output to oname.
|
/* Compress file given by fname and output to oname.
|
||||||
* Returns 0 if successful, error code otherwise.
|
* Returns 0 if successful, error code otherwise.
|
||||||
|
Loading…
Reference in New Issue
Block a user