From 2d8e6c6608bc0de2d24a40ad6fe7a2fb8e8377bf Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 12:31:01 -0700 Subject: [PATCH] Add more statistics --- contrib/long_distance_matching/ldm.c | 61 ++++++++++++++++++++++------ contrib/long_distance_matching/ldm.h | 19 ++++++--- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index d935a2bd..c9c6a709 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -37,6 +37,7 @@ typedef struct LDM_hashTable { // TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { + U32 windowSizeLog, hashTableSizeLog; U32 numMatches; U64 totalMatchLength; U64 totalLiteralLength; @@ -73,7 +74,9 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + LDM_hashEntry *hashTable; + +// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ @@ -90,7 +93,7 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; -void LDM_outputHashtableOccupancy( +void LDM_outputHashTableOccupancy( const LDM_hashEntry *hashTable, U32 hashTableSize) { U32 i = 0; U32 ctr = 0; @@ -104,9 +107,8 @@ void LDM_outputHashtableOccupancy( 100.0 * (double)(ctr) / (double)hashTableSize); } -// TODO: This can be done more efficienctly but is not that important as it -// is only used for computing stats. -// +// TODO: This can be done more efficiently (but it is not that important as it +// is only used for computing stats). static int intLog2(U32 x) { int ret = 0; while (x >>= 1) { @@ -115,30 +117,57 @@ static int intLog2(U32 x) { return ret; } +// TODO: Maybe we would eventually prefer to have linear rather than +// exponential buckets. +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { + int i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < LDM_HASHTABLESIZE_U32; i++) { + int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) LDM_HASHTABLESIZE_U32); + } + printf("\n"); +} + void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); printf("Compression statistics\n"); //TODO: compute percentage matched? + printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length: %u, %llu\n", stats->numMatches, stats->totalMatchLength); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("avg literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u %u\n", + printf("min offset, max offset: %u, %u\n", stats->minOffset, stats->maxOffset); printf("\n"); - printf("offset histogram\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + for (; i <= intLog2(stats->maxOffset); i++) { printf("2^%*d: %10u %6.3f%%\n", 2, i, stats->offsetHistogram[i], 100.0 * (double) stats->offsetHistogram[i] / - (double)stats->numMatches); + (double) stats->numMatches); } printf("\n"); @@ -379,8 +408,12 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + cctx->lastPosHashed = NULL; @@ -417,7 +450,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { *match = getPositionOnHash(cctx, h); putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > WINDOW_SIZE || + } while (cctx->ip - *match > LDM_WINDOW_SIZE || !LDM_isValidMatch(cctx->ip, *match)); setNextHash(cctx); return 0; @@ -550,6 +583,8 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } + // LDM_outputHashTableOffsetHistogram(&cctx); + /* Encode the last literals (no more matches). */ { const size_t lastRun = cctx.iend - cctx.anchor; @@ -559,7 +594,7 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif return cctx.op - cctx.obase; diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 3c8c04ec..87444359 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,16 +11,17 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. -#define LDM_MEMORY_USAGE 20 +#define LDM_MEMORY_USAGE 16 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define WINDOW_SIZE (1 << 25) +#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 1024 -#define LDM_HASH_LENGTH 1024 +#define LDM_MIN_MATCH_LENGTH 4 +#define LDM_HASH_LENGTH 4 typedef U32 offset_t; typedef U32 hash_t; @@ -70,9 +71,17 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, +void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, U32 hashTableSize); +/** + * Prints the distribution of offsets in the hash table. + * + * The offsets are defined as the distance of the hash table entry from the + * current input position of the cctx. + */ +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); + /** * Outputs compression statistics to stdout. */