diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index c9c6a709..08cb856c 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -23,17 +23,23 @@ struct LDM_hashEntry { offset_t offset; }; -typedef struct LDM_hashTable { - U32 numEntries; - U32 minimumTagMask; // TODO: what if tag == offset? - - // Maximum number of elements in the table. - U32 limit; - +// TODO: move to its own file. +struct LDM_hashTable { + U32 size; LDM_hashEntry *entries; -} LDM_hashTable; +}; + +LDM_hashEntry *HASH_getHash( + const LDM_hashTable *table, const hash_t hash) { + return &(table->entries[hash]); +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *HASH_getHash(table, hash) = entry; +} + -// TODO: Add offset histogram by powers of two // TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { @@ -74,7 +80,9 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashEntry *hashTable; + LDM_hashTable hashTable; + +// LDM_hashEntry *hashTable; // LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; @@ -93,18 +101,19 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; -void LDM_outputHashTableOccupancy( - const LDM_hashEntry *hashTable, U32 hashTableSize) { + + +void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) { U32 i = 0; U32 ctr = 0; - for (; i < hashTableSize; i++) { - if (hashTable[i].offset == 0) { + for (; i < hashTable->size; i++) { + if (HASH_getHash(hashTable, i)->offset == 0) { ctr++; } } printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - hashTableSize, ctr, - 100.0 * (double)(ctr) / (double)hashTableSize); + hashTable->size, ctr, + 100.0 * (double)(ctr) / (double)hashTable->size); } // TODO: This can be done more efficiently (but it is not that important as it @@ -120,13 +129,14 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { - int i = 0; + U32 i = 0; int buckets[32] = { 0 }; printf("\n"); printf("Hash table histogram\n"); - for (; i < LDM_HASHTABLESIZE_U32; i++) { - int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + for (; i < cctx->hashTable.size; i++) { + int offset = (cctx->ip - cctx->ibase) - + HASH_getHash(&cctx->hashTable, i)->offset; buckets[intLog2(offset)]++; } @@ -135,7 +145,7 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { printf("2^%*d: %10u %6.3f%%\n", 2, i, buckets[i], 100.0 * (double) buckets[i] / - (double) LDM_HASHTABLESIZE_U32); + (double) cctx->hashTable.size); } printf("\n"); } @@ -305,7 +315,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = cctx->hashTable[hash].offset; + offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -317,7 +327,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; - cctx->hashTable[hash] = entry; + HASH_insert(&cctx->hashTable, hash, entry); } cctx->lastPosHashed = cctx->ip; @@ -362,7 +372,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { * Returns the position of the entry at hashTable[hash]. */ static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { - return cctx->hashTable[hash].offset + cctx->ibase; + return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase; } U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, @@ -389,6 +399,11 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +static void LDM_initializeHashTable(LDM_hashTable *table) { + table->size = LDM_HASHTABLESIZE_U32; + table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +} + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -408,7 +423,9 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); + + LDM_initializeHashTable(&cctx->hashTable); +// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); // memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; @@ -424,6 +441,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->DEBUG_setNextHash = 0; } +void LDM_destroyCCtx(LDM_CCtx *cctx) { + free((cctx->hashTable).entries); +} + /** * Finds the "best" match. * @@ -594,10 +615,14 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(&cctx.hashTable); #endif - return cctx.op - cctx.obase; + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } } struct LDM_DCtx { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 87444359..8c3aa4e6 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -26,6 +26,7 @@ typedef U32 offset_t; typedef U32 hash_t; typedef struct LDM_hashEntry LDM_hashEntry; +typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -62,17 +63,23 @@ size_t LDM_compress(const void *src, size_t srcSize, /** * Initialize the compression context. + * + * Allocates memory for the hash table. */ void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Frees up memory allocating in initializeCCtx + */ +void LDM_destroyCCtx(LDM_CCtx *cctx); + /** * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, - U32 hashTableSize); +void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable); /** * Prints the distribution of offsets in the hash table. diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile index dee686bc..cff78644 100644 --- a/contrib/long_distance_matching/versions/v0.5/Makefile +++ b/contrib/long_distance_matching/versions/v0.5/Makefile @@ -9,7 +9,7 @@ # This Makefile presumes libzstd is installed, using `sudo make install` -CPPFLAGS+= -I../../../../lib/common +CPPFLAGS+= -I../../lib/common CFLAGS ?= -O3 DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ @@ -27,7 +27,7 @@ default: all all: main-ldm -main-ldm : ldm.c main-ldm.c +main-ldm : ldm.h ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c index b8e8c63b..06c97bc4 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/ldm.c @@ -15,7 +15,7 @@ #define RUN_MASK ((1U<>= 1) { + ret++; + } + return ret; +} + +// TODO: Maybe we would eventually prefer to have linear rather than +// exponential buckets. +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { + int i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < LDM_HASHTABLESIZE_U32; i++) { + int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) LDM_HASHTABLESIZE_U32); + } + printf("\n"); +} + void LDM_printCompressStats(const LDM_compressStats *stats) { + int i = 0; printf("=====================\n"); printf("Compression statistics\n"); //TODO: compute percentage matched? + printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length: %u, %llu\n", stats->numMatches, stats->totalMatchLength); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("avg literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u %u\n", + printf("min offset, max offset: %u, %u\n", stats->minOffset, stats->maxOffset); + + printf("\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + + for (; i <= intLog2(stats->maxOffset); i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches); + } + printf("\n"); + + printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); + printf("=====================\n"); + } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { @@ -145,7 +212,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { * of the hash table. */ static hash_t checksumToHash(U32 sum) { - return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG)); + return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -341,8 +408,12 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + cctx->lastPosHashed = NULL; @@ -353,6 +424,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->DEBUG_setNextHash = 0; } +void LDM_destroyCCtx(LDM_CCtx *cctx) { + free(cctx->hashTable); +} + /** * Finds the "best" match. * @@ -379,7 +454,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { *match = getPositionOnHash(cctx, h); putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > WINDOW_SIZE || + } while (cctx->ip - *match > LDM_WINDOW_SIZE || !LDM_isValidMatch(cctx->ip, *match)); setNextHash(cctx); return 0; @@ -443,24 +518,19 @@ void LDM_outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; + const BYTE *match; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); - // TODO: loop condition is not accurate. - while (1) { - const BYTE *match; - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - if (LDM_findBestMatch(&cctx, &match) != 0) { - goto _last_literals; - } + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + while (LDM_findBestMatch(&cctx, &match) == 0) { #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif @@ -485,6 +555,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, cctx.ihashLimit); + LDM_outputBlock(&cctx, literalLength, offset, matchLength); + #ifdef COMPUTE_STATS cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; @@ -493,8 +565,8 @@ size_t LDM_compress(const void *src, size_t srcSize, offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; + cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; #endif - LDM_outputBlock(&cctx, literalLength, offset, matchLength); // Move ip to end of block, inserting hashes at each position. cctx.nextIp = cctx.ip + cctx.step; @@ -514,20 +586,26 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.anchor = cctx.ip; LDM_updateLastHashFromNextHash(&cctx); } -_last_literals: + + // LDM_outputHashTableOffsetHistogram(&cctx); + /* Encode the last literals (no more matches). */ { - const size_t lastRun = (size_t)(cctx.iend - cctx.anchor); + const size_t lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif - return (cctx.op - (const BYTE *)cctx.obase); + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } } struct LDM_DCtx { @@ -611,7 +689,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize, // TODO: implement and test hash function void LDM_test(void) { - } /* diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.h b/contrib/long_distance_matching/versions/v0.5/ldm.h index 5da3c3b9..70cda8b8 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.h +++ b/contrib/long_distance_matching/versions/v0.5/ldm.h @@ -11,16 +11,17 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. -#define LDM_MEMORY_USAGE 22 +#define LDM_MEMORY_USAGE 16 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define WINDOW_SIZE (1 << 25) +#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 8 -#define LDM_HASH_LENGTH 8 +#define LDM_MIN_MATCH_LENGTH 4 +#define LDM_HASH_LENGTH 4 typedef U32 offset_t; typedef U32 hash_t; @@ -61,18 +62,33 @@ size_t LDM_compress(const void *src, size_t srcSize, /** * Initialize the compression context. + * + * Allocates memory for the hash table. */ void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Frees up memory allocating in initializeCCtx + */ +void LDM_destroyCCtx(LDM_CCtx *cctx); + /** * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, +void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, U32 hashTableSize); +/** + * Prints the distribution of offsets in the hash table. + * + * The offsets are defined as the distance of the hash table entry from the + * current input position of the cctx. + */ +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); + /** * Outputs compression statistics to stdout. */ diff --git a/contrib/long_distance_matching/versions/v0.5/main-ldm.c b/contrib/long_distance_matching/versions/v0.5/main-ldm.c index 40afef8c..ea6375ba 100644 --- a/contrib/long_distance_matching/versions/v0.5/main-ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/main-ldm.c @@ -13,7 +13,7 @@ #include "zstd.h" #define DEBUG -//#define TEST +#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise.