From 38cbcb5f1a0b50d35edaa90acebb26fa64ab807c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 15 Mar 2018 16:26:08 -0700 Subject: [PATCH] removed LRM exploratory experiment --- contrib/long_distance_matching/Makefile | 36 - contrib/long_distance_matching/README.md | 102 --- contrib/long_distance_matching/ldm.c | 856 -------------------- contrib/long_distance_matching/ldm.h | 197 ----- contrib/long_distance_matching/ldm_common.c | 109 --- contrib/long_distance_matching/ldm_params.h | 12 - contrib/long_distance_matching/main.c | 269 ------ 7 files changed, 1581 deletions(-) delete mode 100644 contrib/long_distance_matching/Makefile delete mode 100644 contrib/long_distance_matching/README.md delete mode 100644 contrib/long_distance_matching/ldm.c delete mode 100644 contrib/long_distance_matching/ldm.h delete mode 100644 contrib/long_distance_matching/ldm_common.c delete mode 100644 contrib/long_distance_matching/ldm_params.h delete mode 100644 contrib/long_distance_matching/main.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile deleted file mode 100644 index 6ed1fab4..00000000 --- a/contrib/long_distance_matching/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -# ################################################################ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under both the BSD-style license (found in the -# LICENSE file in the root directory of this source tree) and the GPLv2 (found -# in the COPYING file in the root directory of this source tree). -# ################################################################ - -# This Makefile presumes libzstd is installed, using `sudo make install` - -CPPFLAGS+= -I../../lib/common -CFLAGS ?= -O3 -DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ - -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ - -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ - -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ - -Wredundant-decls -CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) -FLAGS = $(CPPFLAGS) $(CFLAGS) - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: ldm - -ldm: ldm_common.c ldm.c main.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - ldm - @echo Cleaning completed diff --git a/contrib/long_distance_matching/README.md b/contrib/long_distance_matching/README.md deleted file mode 100644 index 771a6c3c..00000000 --- a/contrib/long_distance_matching/README.md +++ /dev/null @@ -1,102 +0,0 @@ -This is a compression algorithm focused on finding long distance matches. - -It is based upon lz4 and uses nearly the same block format (github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md). The number of bytes to encode the offset is four instead of two in lz4 to reflect the longer distance matching. The block format is described in `ldm.h`. - -### Build - -Run `make`. - -### Compressing a file - -`ldm ` - -Decompression and verification can be enabled by defining `DECOMPRESS_AND_VERIFY` in `main.c`. -The output file names are as follows: -- `.ldm` : compressed file -- `.ldm.dec` : decompressed file - -### Parameters - -There are various parameters that can be tuned. These parameters can be tuned in `ldm.h` or, alternatively if `ldm_params.h` is included, in `ldm_params.h` (for easier configuration). - -The parameters are as follows and must all be defined: -- `LDM_MEMORY_USAGE` : the memory usage of the underlying hash table in bytes. -- `HASH_BUCKET_SIZE_LOG` : the log size of each bucket in the hash table (used in collision resolution). -- `LDM_LAG` : the lag (in bytes) in inserting entries into the hash table. -- `LDM_WINDOW_SIZE_LOG` : the log maximum window size when searching for matches. -- `LDM_MIN_MATCH_LENGTH` : the minimum match length. -- `INSERT_BY_TAG` : insert entries into the hash table as a function of the hash. This increases speed by reducing the number of hash table lookups and match comparisons. Certain hashes will never be inserted. -- `USE_CHECKSUM` : store a checksum with the hash table entries for faster comparison. This halves the number of entries the hash table can contain. - -The optional parameter `HASH_ONLY_EVERY_LOG` is the log inverse frequency of insertion into the hash table. That is, an entry is inserted approximately every `1 << HASH_ONLY_EVERY_LOG` times. If this parameter is not defined, the value is computed as a function of the window size and memory usage to approximate an even coverage of the window. - - -### Benchmark - -Below is a comparison of various compression methods on a tar of four versions of llvm (versions `3.9.0`, `3.9.1`, `4.0.0`, `4.0.1`) with a total size of `727900160` B. - -| Method | Size | Ratio | -|:---|---:|---:| -|lrzip -p 32 -n -w 1 | `369968714` | `1.97`| -|ldm | `209391361` | `3.48`| -|lz4 | `189954338` | `3.83`| -|lrzip -p 32 -l -w 1 | `163940343` | `4.44`| -|zstd -1 | `126080293` | `5.77`| -|lrzip -p 32 -n | `124821009` | `5.83`| -|lrzip -p 32 -n -w 1 & zstd -1 | `120317909` | `6.05`| -|zstd -3 -o | `115290952` | `6.31`| -|lrzip -p 32 -g -L 9 -w 1 | `107168979` | `6.79`| -|zstd -6 -o | `102772098` | `7.08`| -|zstd -T16 -9 | `98040470` | `7.42`| -|lrzip -p 32 -n -w 1 & zstd -T32 -19 | `88050289` | `8.27`| -|zstd -T32 -19 | `83626098` | `8.70`| -|lrzip -p 32 -n & zstd -1 | `36335117` | `20.03`| -|ldm & zstd -6 | `32856232` | `22.15`| -|lrzip -p 32 -g -L 9 | `32243594` | `22.58`| -|lrzip -p 32 -n & zstd -6 | `30954572` | `23.52`| -|lrzip -p 32 -n & zstd -T32 -19 | `26472064` | `27.50`| - -The method marked `ldm` was run with the following parameters: - -| Parameter | Value | -|:---|---:| -| `LDM_MEMORY_USAGE` | `23`| -|`HASH_BUCKET_SIZE_LOG` | `3`| -|`LDM_LAG` | `0`| -|`LDM_WINDOW_SIZE_LOG` | `28`| -|`LDM_MIN_MATCH_LENGTH`| `64`| -|`INSERT_BY_TAG` | `1`| -|`USE_CHECKSUM` | `1`| - -The compression speed was `220.5 MB/s`. - -### Parameter selection - -Below is a brief discussion of the effects of the parameters on the speed and compression ratio. - -#### Speed - -A large bottleneck in terms of speed is finding the matches and comparing to see if they are greater than the minimum match length. Generally: -- The fewer matches found (or the lower the percentage of the literals matched), the slower the algorithm will behave. -- Increasing `HASH_ONLY_EVERY_LOG` results in fewer inserts and, if `INSERT_BY_TAG` is set, fewer lookups in the table. This has a large effect on speed, as well as compression ratio. -- If `HASH_ONLY_EVERY_LOG` is not set, its value is calculated based on `LDM_WINDOW_SIZE_LOG` and `LDM_MEMORY_USAGE`. Increasing `LDM_WINDOW_SIZE_LOG` has the effect of increasing `HASH_ONLY_EVERY_LOG` and increasing `LDM_MEMORY_USAGE` decreases `HASH_ONLY_EVERY_LOG`. -- `USE_CHECKSUM` generally improves speed with hash table lookups. - -#### Compression ratio - -The compression ratio is highly correlated with the coverage of matches. As a long distance matcher, the algorithm was designed to "optimize" for long distance matches outside the zstd compression window. The compression ratio after recompressing the output of the long-distance matcher with zstd was a more important signal in development than the raw compression ratio itself. - -Generally, increasing `LDM_MEMORY_USAGE` will improve the compression ratio. However when using the default computed value of `HASH_ONLY_EVERY_LOG`, this increases the frequency of insertion and lookup in the table and thus may result in a decrease in speed. - -Below is a table showing the speed and compression ratio when compressing the llvm tar (as described above) using different settings for `LDM_MEMORY_USAGE`. The other parameters were the same as used in the benchmark above. - -| `LDM_MEMORY_USAGE` | Ratio | Speed (MB/s) | Ratio after zstd -6 | -|---:| ---: | ---: | ---: | -| `18` | `1.85` | `232.4` | `10.92` | -| `21` | `2.79` | `233.9` | `15.92` | -| `23` | `3.48` | `220.5` | `18.29` | -| `25` | `4.56` | `140.8` | `19.21` | - -### Compression statistics - -Compression statistics (and the configuration) can be enabled/disabled via `COMPUTE_STATS` and `OUTPUT_CONFIGURATION` in `ldm.h`. diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c deleted file mode 100644 index 37b188b7..00000000 --- a/contrib/long_distance_matching/ldm.c +++ /dev/null @@ -1,856 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ldm.h" - -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) - -#if USE_CHECKSUM - #define LDM_HASH_ENTRY_SIZE_LOG 3 -#else - #define LDM_HASH_ENTRY_SIZE_LOG 2 -#endif - -// Entries are inserted into the table HASH_ONLY_EVERY + 1 times "on average". -#ifndef HASH_ONLY_EVERY_LOG - #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) -#endif - -#define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1) - -#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) -#define NUM_HASH_BUCKETS_LOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) - -#define HASH_CHAR_OFFSET 10 - -// Take the first match in the hash bucket only. -//#define ZSTD_SKIP - -static const U64 prime8bytes = 11400714785074694791ULL; - -// Type of the small hash used to index into the hash table. -typedef U32 hash_t; - -#if USE_CHECKSUM -typedef struct LDM_hashEntry { - U32 offset; - U32 checksum; -} LDM_hashEntry; -#else -typedef struct LDM_hashEntry { - U32 offset; -} LDM_hashEntry; -#endif - -struct LDM_compressStats { - U32 windowSizeLog, hashTableSizeLog; - U32 numMatches; - U64 totalMatchLength; - U64 totalLiteralLength; - U64 totalOffset; - - U32 matchLengthHistogram[32]; - - U32 minOffset, maxOffset; - U32 offsetHistogram[32]; -}; - -typedef struct LDM_hashTable LDM_hashTable; - -struct LDM_CCtx { - size_t isize; /* Input size */ - size_t maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. - const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashTable *hashTable; - - const BYTE *lastPosHashed; /* Last position hashed */ - U64 lastHash; - - const BYTE *nextIp; // TODO: this is redundant (ip + step) - const BYTE *nextPosHashed; - U64 nextHash; - - unsigned step; // ip step, should be 1. - - const BYTE *lagIp; - U64 lagHash; -}; - -struct LDM_hashTable { - U32 numBuckets; // The number of buckets. - U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. - - LDM_hashEntry *entries; - BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. -}; - -static void HASH_destroyTable(LDM_hashTable *table) { - free(table->entries); - free(table->bucketOffsets); - free(table); -} - -/** - * Create a hash table that can contain size elements. - * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. - * - * Returns NULL if table creation failed. - */ -static LDM_hashTable *HASH_createTable(U32 size) { - LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - if (!table) return NULL; - - table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; - table->numEntries = size; - table->entries = calloc(size, sizeof(LDM_hashEntry)); - table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); - - if (!table->entries || !table->bucketOffsets) { - HASH_destroyTable(table); - return NULL; - } - - return table; -} - -static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { - return table->entries + (hash << HASH_BUCKET_SIZE_LOG); -} - -static unsigned ZSTD_NbCommonBytes (size_t val) { - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[ - ((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[ - ((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - /* calculate this way due to compiler complaining in 32-bits mode */ - const unsigned n32 = sizeof(size_t)*4; - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } - } -} - -// From lib/compress/zstd_compress.c -static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *const pInLimit) { - const BYTE * const pStart = pIn; - const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); - - while (pIn < pInLoopLimit) { - size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); - if (!diff) { - pIn += sizeof(size_t); - pMatch += sizeof(size_t); - continue; - } - pIn += ZSTD_NbCommonBytes(diff); - return (size_t)(pIn - pStart); - } - - if (MEM_64bits()) { - if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { - pIn += 4; - pMatch += 4; - } - } - if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { - pIn += 2; - pMatch += 2; - } - if ((pIn < pInLimit) && (*pMatch == *pIn)) { - pIn++; - } - return (size_t)(pIn - pStart); -} - -/** - * Count number of bytes that match backwards before pIn and pMatch. - * - * We count only bytes where pMatch > pBase and pIn > pAnchor. - */ -static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { - size_t matchLength = 0; - while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { - pIn--; - pMatch--; - matchLength++; - } - return matchLength; -} - -/** - * Returns a pointer to the entry in the hash table matching the hash and - * checksum with the "longest match length" as defined below. The forward and - * backward match lengths are written to *pForwardMatchLength and - * *pBackwardMatchLength. - * - * The match length is defined based on cctx->ip and the entry's offset. - * The forward match is computed from cctx->ip and entry->offset + cctx->ibase. - * The backward match is computed backwards from cctx->ip and - * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH. - */ -static LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, - const hash_t hash, - const U32 checksum, - U64 *pForwardMatchLength, - U64 *pBackwardMatchLength) { - LDM_hashTable *table = cctx->hashTable; - LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur; - LDM_hashEntry *bestEntry = NULL; - U64 bestMatchLength = 0; -#if !(USE_CHECKSUM) - (void)checksum; -#endif - for (cur = bucket; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - const BYTE *pMatch = cur->offset + cctx->ibase; - - // Check checksum for faster check. -#if USE_CHECKSUM - if (cur->checksum == checksum && - cctx->ip - pMatch <= LDM_WINDOW_SIZE) { -#else - if (cctx->ip - pMatch <= LDM_WINDOW_SIZE) { -#endif - U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); - U64 backwardMatchLength, totalMatchLength; - - // Only take matches where the forward match length is large enough - // for speed. - if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { - continue; - } - - backwardMatchLength = - countBackwardsMatch(cctx->ip, cctx->anchor, - cur->offset + cctx->ibase, - cctx->ibase); - - totalMatchLength = forwardMatchLength + backwardMatchLength; - - if (totalMatchLength >= bestMatchLength) { - bestMatchLength = totalMatchLength; - *pForwardMatchLength = forwardMatchLength; - *pBackwardMatchLength = backwardMatchLength; - - bestEntry = cur; -#ifdef ZSTD_SKIP - return cur; -#endif - } - } - } - if (bestEntry != NULL) { - return bestEntry; - } - return NULL; -} - -/** - * Insert an entry into the hash table. The table uses a "circular buffer", - * with the oldest entry overwritten. - */ -static void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; - table->bucketOffsets[hash]++; - table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; -} - -static void HASH_outputTableOccupancy(const LDM_hashTable *table) { - U32 ctr = 0; - LDM_hashEntry *cur = table->entries; - LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); - for (; cur < end; ++cur) { - if (cur->offset == 0) { - ctr++; - } - } - - // The number of buckets is repeated as a check for now. - printf("Num buckets, bucket size: %d (2^%d), %d\n", - table->numBuckets, NUM_HASH_BUCKETS_LOG, HASH_BUCKET_SIZE); - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - table->numEntries, ctr, - 100.0 * (double)(ctr) / table->numEntries); -} - -// TODO: This can be done more efficiently, for example by using builtin -// functions (but it is not that important as it is only used for computing -// stats). -static int intLog2(U64 x) { - int ret = 0; - while (x >>= 1) { - ret++; - } - return ret; -} - -void LDM_printCompressStats(const LDM_compressStats *stats) { - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", - stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", - stats->numMatches, - stats->totalMatchLength, - 100.0 * (double)stats->totalMatchLength / - (double)(stats->totalMatchLength + stats->totalLiteralLength)); - printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("avg literal length, total literalLength: %.1f, %llu\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches, - stats->totalLiteralLength); - printf("avg offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u, %u\n", - stats->minOffset, stats->maxOffset); - - printf("\n"); - printf("offset histogram | match length histogram\n"); - printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n"); - - { - int i; - int logMaxOffset = intLog2(stats->maxOffset); - for (i = 0; i <= logMaxOffset; i++) { - printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", - 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches, - 2, i, - stats->matchLengthHistogram[i], - 100.0 * (double) stats->matchLengthHistogram[i] / - (double) stats->numMatches); - } - } - printf("\n"); - printf("=====================\n"); -} - -/** - * Return the upper (most significant) NUM_HASH_BUCKETS_LOG bits. - */ -static hash_t getSmallHash(U64 hash) { - return hash >> (64 - NUM_HASH_BUCKETS_LOG); -} - -/** - * Return the 32 bits after the upper NUM_HASH_BUCKETS_LOG bits. - */ -static U32 getChecksum(U64 hash) { - return (hash >> (64 - 32 - NUM_HASH_BUCKETS_LOG)) & 0xFFFFFFFF; -} - -#if INSERT_BY_TAG -static U32 lowerBitsFromHfHash(U64 hash) { - // The number of bits used so far is NUM_HASH_BUCKETS_LOG + 32. - // So there are 32 - NUM_HASH_BUCKETS_LOG bits left. - // Occasional hashing requires HASH_ONLY_EVERY_LOG bits. - // So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits - // allowing for reuse of bits. - if (32 - NUM_HASH_BUCKETS_LOG < HASH_ONLY_EVERY_LOG) { - return hash & HASH_ONLY_EVERY; - } else { - // Otherwise shift by - // (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG) bits first. - return (hash >> (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG)) & - HASH_ONLY_EVERY; - } -} -#endif - -/** - * Get a 64-bit hash using the first len bytes from buf. - * - * Giving bytes s = s_1, s_2, ... s_k, the hash is defined to be - * H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0) - * - * where the constant a is defined to be prime8bytes. - * - * The implementation adds an offset to each byte, so - * H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ... - */ -static U64 getHash(const BYTE *buf, U32 len) { - U64 ret = 0; - U32 i; - for (i = 0; i < len; i++) { - ret *= prime8bytes; - ret += buf[i] + HASH_CHAR_OFFSET; - } - return ret; - -} - -static U64 ipow(U64 base, U64 exp) { - U64 ret = 1; - while (exp) { - if (exp & 1) { - ret *= base; - } - exp >>= 1; - base *= base; - } - return ret; -} - -static U64 updateHash(U64 hash, U32 len, - BYTE toRemove, BYTE toAdd) { - // TODO: this relies on compiler optimization. - // The exponential can be calculated explicitly as len is constant. - hash -= ((toRemove + HASH_CHAR_OFFSET) * - ipow(prime8bytes, len - 1)); - hash *= prime8bytes; - hash += toAdd + HASH_CHAR_OFFSET; - return hash; -} - -/** - * Update cctx->nextHash and cctx->nextPosHashed - * based on cctx->lastHash and cctx->lastPosHashed. - * - * This uses a rolling hash and requires that the last position hashed - * corresponds to cctx->nextIp - step. - */ -static void setNextHash(LDM_CCtx *cctx) { - cctx->nextHash = updateHash( - cctx->lastHash, LDM_HASH_LENGTH, - cctx->lastPosHashed[0], - cctx->lastPosHashed[LDM_HASH_LENGTH]); - cctx->nextPosHashed = cctx->nextIp; - -#if LDM_LAG - if (cctx->ip - cctx->ibase > LDM_LAG) { - cctx->lagHash = updateHash( - cctx->lagHash, LDM_HASH_LENGTH, - cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); - cctx->lagIp++; - } -#endif -} - -static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. - // Note: this works only when cctx->step is 1. -#if LDM_LAG - if (cctx -> lagIp - cctx->ibase > 0) { -#if INSERT_BY_TAG - U32 hashEveryMask = lowerBitsFromHfHash(cctx->lagHash); - if (hashEveryMask == HASH_ONLY_EVERY) { -#else - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#endif - U32 smallHash = getSmallHash(cctx->lagHash); - -# if USE_CHECKSUM - U32 checksum = getChecksum(cctx->lagHash); - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; -# else - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; -# endif - - HASH_insert(cctx->hashTable, smallHash, entry); - } - } else { -#endif // LDM_LAG -#if INSERT_BY_TAG - U32 hashEveryMask = lowerBitsFromHfHash(hash); - if (hashEveryMask == HASH_ONLY_EVERY) { -#else - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#endif - U32 smallHash = getSmallHash(hash); - -#if USE_CHECKSUM - U32 checksum = getChecksum(hash); - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; -#endif - HASH_insert(cctx->hashTable, smallHash, entry); - } -#if LDM_LAG - } -#endif - - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; -} - -/** - * Copy over the cctx->lastHash, and cctx->lastPosHashed - * fields from the "next" fields. - * - * This requires that cctx->ip == cctx->nextPosHashed. - */ -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash); -} - -/** - * Insert hash of the current position into the hash table. - */ -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U64 hash = getHash(cctx->ip, LDM_HASH_LENGTH); - - putHashOfCurrentPositionFromHash(cctx, hash); -} - -size_t LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); -#if USE_CHECKSUM - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); -#else - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); -#endif - - if (!cctx->hashTable) return 1; - - cctx->stats.minOffset = UINT_MAX; - cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; - cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - - cctx->lastPosHashed = NULL; - - cctx->step = 1; // Fixed to be 1 for now. Changing may break things. - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - return 0; -} - -void LDM_destroyCCtx(LDM_CCtx *cctx) { - HASH_destroyTable(cctx->hashTable); -} - -/** - * Finds the "best" match. - * - * Returns 0 if successful and 1 otherwise (i.e. no match can be found - * in the remaining input that is long enough). - * - * forwardMatchLength contains the forward length of the match. - */ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U64 *forwardMatchLength, U64 *backwardMatchLength) { - - LDM_hashEntry *entry = NULL; - cctx->nextIp = cctx->ip + cctx->step; - - while (entry == NULL) { - U64 hash; - hash_t smallHash; - U32 checksum; -#if INSERT_BY_TAG - U32 hashEveryMask; -#endif - setNextHash(cctx); - - hash = cctx->nextHash; - smallHash = getSmallHash(hash); - checksum = getChecksum(hash); -#if INSERT_BY_TAG - hashEveryMask = lowerBitsFromHfHash(hash); -#endif - - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } -#if INSERT_BY_TAG - if (hashEveryMask == HASH_ONLY_EVERY) { - - entry = HASH_getBestEntry(cctx, smallHash, checksum, - forwardMatchLength, backwardMatchLength); - } -#else - entry = HASH_getBestEntry(cctx, smallHash, checksum, - forwardMatchLength, backwardMatchLength); -#endif - - if (entry != NULL) { - *match = entry->offset + cctx->ibase; - } - - putHashOfCurrentPositionFromHash(cctx, hash); - - } - setNextHash(cctx); - return 0; -} - -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - U64 len = (U64)literalLength - RUN_MASK; - *pToken = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *pToken = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; -} - -void LDM_outputBlock(LDM_CCtx *cctx, - const U64 literalLength, - const U32 offset, - const U64 matchLength) { - BYTE *pToken = cctx->op++; - - /* Encode the literal length and literals. */ - LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); - - /* Encode the offset. */ - MEM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode the match length. */ - if (matchLength >= ML_MASK) { - U64 matchLengthRemaining = matchLength; - *pToken += ML_MASK; - matchLengthRemaining -= ML_MASK; - MEM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - MEM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *pToken += (BYTE)(matchLength); - } -} - -// TODO: maxDstSize is unused. This function may seg fault when writing -// beyond the size of dst, as it does not check maxDstSize. Writing to -// a buffer and performing checks is a possible solution. -// -// This is based upon lz4. -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - const BYTE *match = NULL; - U64 forwardMatchLength = 0; - U64 backwardsMatchLength = 0; - - if (LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize)) { - // Initialization failed. - return 0; - } - -#ifdef OUTPUT_CONFIGURATION - LDM_outputConfiguration(); -#endif - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - - cctx.lagIp = cctx.ip; - cctx.lagHash = cctx.lastHash; - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - while (!LDM_findBestMatch(&cctx, &match, &forwardMatchLength, - &backwardsMatchLength)) { - -#ifdef COMPUTE_STATS - cctx.stats.numMatches++; -#endif - - cctx.ip -= backwardsMatchLength; - match -= backwardsMatchLength; - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. - */ - { - const U64 literalLength = cctx.ip - cctx.anchor; - const U32 offset = cctx.ip - match; - const U64 matchLength = forwardMatchLength + - backwardsMatchLength - - LDM_MIN_MATCH_LENGTH; - - LDM_outputBlock(&cctx, literalLength, offset, matchLength); - -#ifdef COMPUTE_STATS - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; - cctx.stats.minOffset = - offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; - cctx.stats.maxOffset = - offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; - cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; - cctx.stats.matchLengthHistogram[ - (U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++; -#endif - - // Move ip to end of block, inserting hashes at each position. - cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + - matchLength + literalLength) { - if (cctx.ip > cctx.lastPosHashed) { - // TODO: Simplify. - LDM_updateLastHashFromNextHash(&cctx); - setNextHash(&cctx); - } - cctx.ip++; - cctx.nextIp++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); - } - - /* Encode the last literals (no more matches). */ - { - const U64 lastRun = cctx.iend - cctx.anchor; - BYTE *pToken = cctx.op++; - LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); - } - -#ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats); - HASH_outputTableOccupancy(cctx.hashTable); -#endif - - { - const size_t ret = cctx.op - cctx.obase; - LDM_destroyCCtx(&cctx); - return ret; - } -} - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG: %d\n", LDM_LAG); - printf("USE_CHECKSUM: %d\n", USE_CHECKSUM); - printf("INSERT_BY_TAG: %d\n", INSERT_BY_TAG); - printf("HASH_CHAR_OFFSET: %d\n", HASH_CHAR_OFFSET); - printf("=====================\n"); -} diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h deleted file mode 100644 index 4adadbd0..00000000 --- a/contrib/long_distance_matching/ldm.h +++ /dev/null @@ -1,197 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include "mem.h" // from /lib/common/mem.h - -//#include "ldm_params.h" - -// ============================================================================= -// Modify the parameters in ldm_params.h if "ldm_params.h" is included. -// Otherwise, modify the parameters here. -// ============================================================================= - -#ifndef LDM_PARAMS_H - // Defines the size of the hash table. - // Note that this is not the number of buckets. - // Currently this should be less than WINDOW_SIZE_LOG + 4. - #define LDM_MEMORY_USAGE 23 - - // The number of entries in a hash bucket. - #define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. - - // Defines the lag in inserting elements into the hash table. - #define LDM_LAG 0 - - // The maximum window size when searching for matches. - // The maximum value is 30 - #define LDM_WINDOW_SIZE_LOG 28 - - // The minimum match length. - // This should be a multiple of four. - #define LDM_MIN_MATCH_LENGTH 64 - - // If INSERT_BY_TAG, insert entries into the hash table as a function of the - // hash. Certain hashes will not be inserted. - // - // Otherwise, insert as a function of the position. - #define INSERT_BY_TAG 1 - - // Store a checksum with the hash table entries for faster comparison. - // This halves the number of entries the hash table can contain. - #define USE_CHECKSUM 1 -#endif - -// Output compression statistics. -#define COMPUTE_STATS - -// Output the configuration. -#define OUTPUT_CONFIGURATION - -// If defined, forces the probability of insertion to be approximately -// one per (1 << HASH_ONLY_EVERY_LOG). If not defined, the probability will be -// calculated based on the memory usage and window size for "even" insertion -// throughout the window. - -// #define HASH_ONLY_EVERY_LOG 8 - -// ============================================================================= - -// The number of bytes storing the compressed and decompressed size -// in the header. -#define LDM_COMPRESSED_SIZE 8 -#define LDM_DECOMPRESSED_SIZE 8 -#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE)) - -#define ML_BITS 4 -#define ML_MASK ((1U< - -#include "ldm.h" - -/** - * This function reads the header at the beginning of src and writes - * the compressed and decompressed size to compressedSize and - * decompressedSize. - * - * The header consists of 16 bytes: 8 bytes each in little-endian format - * of the compressed size and the decompressed size. - */ -void LDM_readHeader(const void *src, U64 *compressedSize, - U64 *decompressedSize) { - const BYTE *ip = (const BYTE *)src; - *compressedSize = MEM_readLE64(ip); - *decompressedSize = MEM_readLE64(ip + 8); -} - -/** - * Writes the 16-byte header (8-bytes each of the compressedSize and - * decompressedSize in little-endian format) to memPtr. - */ -void LDM_writeHeader(void *memPtr, U64 compressedSize, - U64 decompressedSize) { - MEM_writeLE64(memPtr, compressedSize); - MEM_writeLE64((BYTE *)memPtr + 8, decompressedSize); -} - -struct LDM_DCtx { - size_t compressedSize; - size_t maxDecompressedSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -}; - -void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - dctx->compressedSize = compressedSize; - dctx->maxDecompressedSize = maxDecompressedSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressedSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressedSize; -} - -size_t LDM_decompress(const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - const unsigned token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy the literals. */ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO: dynamic offset size? - /* Encode the offset. */ - offset = MEM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += LDM_MIN_MATCH_LENGTH; - - /* Copy match. */ - cpy = dctx.op + length; - - // TODO: this can be made more efficient. - while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} diff --git a/contrib/long_distance_matching/ldm_params.h b/contrib/long_distance_matching/ldm_params.h deleted file mode 100644 index a541581b..00000000 --- a/contrib/long_distance_matching/ldm_params.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef LDM_PARAMS_H -#define LDM_PARAMS_H - -#define LDM_MEMORY_USAGE 23 -#define HASH_BUCKET_SIZE_LOG 3 -#define LDM_LAG 0 -#define LDM_WINDOW_SIZE_LOG 28 -#define LDM_MIN_MATCH_LENGTH 64 -#define INSERT_BY_TAG 1 -#define USE_CHECKSUM 1 - -#endif // LDM_PARAMS_H diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c deleted file mode 100644 index 7c7086a5..00000000 --- a/contrib/long_distance_matching/main.c +++ /dev/null @@ -1,269 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" -#include "zstd.h" - -// #define DECOMPRESS_AND_VERIFY - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. - * - * This adds a header from LDM_writeHeader to the beginning of the output. - * - * This might seg fault if the compressed size is > the decompress - * size due to the mmapping and output file size allocated to be the input size - * The compress function should check before writing or buffer writes. - */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t maxCompressedSize, compressedSize; - - struct timeval tv1, tv2; - double timeTaken; - - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); - - // Handle case where compressed size is > decompressed size. - // TODO: The compress function should check before writing or buffer writes. - maxCompressedSize += statbuf.st_size / 255; - - ftruncate(fdout, maxCompressedSize); - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file. */ - if ((dst = mmap(0, maxCompressedSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - gettimeofday(&tv1, NULL); - - compressedSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, maxCompressedSize); - - gettimeofday(&tv2, NULL); - - // Write the header. - LDM_writeHeader(dst, compressedSize, statbuf.st_size); - - // Truncate file to compressedSize. - ftruncate(fdout, compressedSize); - - printf("%25s : %10lu -> %10lu - %s \n", fname, - (size_t)statbuf.st_size, (size_t)compressedSize, oname); - printf("Compression ratio: %.2fx --- %.1f%%\n", - (double)statbuf.st_size / (double)compressedSize, - (double)compressedSize / (double)(statbuf.st_size) * 100.0); - - timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec), - - printf("Total compress time = %.3f seconds, Average scanning speed: %.3f MB/s\n", - timeTaken, - ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -#ifdef DECOMPRESS_AND_VERIFY -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - U64 compressedSize, decompressedSize; - size_t outSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - LDM_readHeader(src, &compressedSize, &decompressedSize); - - ftruncate(fdout, decompressedSize); - - /* mmap the output file */ - if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressedSize); - printf("Ret size out: %zu\n", outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. - */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. */ -static int verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp, *decFp; - - if ((inpFp = fopen(inpFilename, "rb")) == NULL) { - perror("Could not open input file\n"); - return 1; - } - - if ((decFp = fopen(decFilename, "rb")) == NULL) { - perror("Could not open decompressed file\n"); - return 1; - } - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - { - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - return 1; - } - } - - fclose(decFp); - fclose(inpFp); - return 0; -} -#endif - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* Compress */ - { - if (compress(inpFilename, ldmFilename)) { - printf("Compress error\n"); - return 1; - } - } - -#ifdef DECOMPRESS_AND_VERIFY - /* Decompress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error\n"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total decompress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - /* verify */ - if (verify(inpFilename, decFilename)) { - printf("Verification error\n"); - return 1; - } -#endif - return 0; -}