diff --git a/contrib/long_distance_matching/versions/v1/ldm.c b/contrib/long_distance_matching/versions/v1/ldm.c
new file mode 100644
--- /dev/null
+++ b/contrib/long_distance_matching/versions/v1/ldm.c
@@ -0,0 +1,413 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ldm.h"
+
+#define LDM_MEMORY_USAGE 14
+#define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
+#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
+#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
+#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG))
+
+#define WINDOW_SIZE (1 << 20)
+#define MAX_WINDOW_SIZE 31
+#define HASH_SIZE 4
+#define MINMATCH 4
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+/* Bytes compared by one LDM_read64() when validating a match candidate. */
+#define LDM_MATCH_CHECK_SIZE 8
+/* Size in bytes of the match offset stored in every sequence. */
+#define LDM_OFFSET_SIZE 4
+
+//#define LDM_DEBUG
+
+/* NOTE(review): the system header names above, these typedefs and the helpers
+ * down to LDM_writeLE16() were missing from the reviewed copy of this patch
+ * and are reconstructed from their uses; confirm against the original. */
+typedef uint8_t BYTE;
+typedef uint16_t U16;
+typedef uint32_t U32;
+typedef uint64_t U64;
+typedef uint64_t tag;
+
+/* Returns non-zero when the host stores the low-order byte first. */
+static unsigned LDM_isLittleEndian(void) {
+  const union { U32 u; BYTE c[4]; } one = { 1 };
+  return one.c[0];
+}
+
+/* Stores 16 bits at a possibly unaligned address, in host byte order. */
+static void LDM_write16(void *memPtr, U16 value) {
+  memcpy(memPtr, &value, sizeof(value));
+}
+
+/* Stores 32 bits at a possibly unaligned address, in host byte order. */
+static void LDM_write32(void *memPtr, U32 value) {
+  memcpy(memPtr, &value, sizeof(value));
+}
+
+/* Stores 16 bits in little-endian order whatever the host byte order. */
+static void LDM_writeLE16(void *memPtr, U16 value) {
+  if (LDM_isLittleEndian()) {
+    LDM_write16(memPtr, value);
+  } else {
+    BYTE *p = (BYTE *)memPtr;
+    p[0] = (BYTE)value;
+    p[1] = (BYTE)(value>>8);
+  }
+}
+
+/* Loads 32 bits from a possibly unaligned address; memcpy avoids the
+ * alignment and strict-aliasing problems of dereferencing a cast pointer. */
+static U32 LDM_read32(const void *ptr) {
+  U32 value;
+  memcpy(&value, ptr, sizeof(value));
+  return value;
+}
+
+/* Loads 64 bits from a possibly unaligned address (see LDM_read32). */
+static U64 LDM_read64(const void *ptr) {
+  U64 value;
+  memcpy(&value, ptr, sizeof(value));
+  return value;
+}
+
+/* Copies exactly 8 bytes. */
+static void LDM_copy8(void *dst, const void *src) {
+  memcpy(dst, src, 8);
+}
+
+/* Copies in 8-byte steps until dstEnd is reached; always copies at least
+ * 8 bytes and may write up to 7 bytes past dstEnd. */
+static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) {
+  BYTE *d = (BYTE *)dstPtr;
+  const BYTE *s = (const BYTE *)srcPtr;
+  BYTE * const e = (BYTE *)dstEnd;
+
+  do {
+    LDM_copy8(d, s);
+    d += 8;
+    s += 8;
+  } while (d < e);
+}
+
+struct hash_entry {
+  U64 offset;
+  tag t;
+};
+
+/* Multiplicative hash of 4 bytes, reduced to LDM_HASHLOG bits. */
+static U32 LDM_hash(U32 sequence) {
+  return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG));
+}
+
+/* Multiplicative hash of 40 bits of sequence, reduced to LDM_HASHLOG bits. */
+static U32 LDM_hash5(U64 sequence) {
+  static const U64 prime5bytes = 889523592379ULL;
+  static const U64 prime8bytes = 11400714785074694791ULL;
+  const U32 hashLog = LDM_HASHLOG;
+  if (LDM_isLittleEndian())
+    return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog));
+  else
+    return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog));
+}
+
+/* Hashes the HASH_SIZE bytes starting at p. */
+static U32 LDM_hash_position(const void * const p) {
+  return LDM_hash(LDM_read32(p));
+}
+
+/* Records position p, as an offset from srcBase, in bucket h. */
+static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase,
+                                     const BYTE *srcBase) {
+  U32 *hashTable = (U32 *) tableBase;
+  hashTable[h] = (U32)(p - srcBase);
+}
+
+/* Hashes the bytes at p and records p in the matching bucket. */
+static void LDM_put_position(const BYTE *p, void *tableBase,
+                             const BYTE *srcBase) {
+  U32 const h = LDM_hash_position(p);
+  LDM_put_position_on_hash(p, h, tableBase, srcBase);
+}
+
+/* Returns the position stored in bucket h. An empty bucket holds 0 and so
+ * yields srcBase itself. */
+static const BYTE *LDM_get_position_on_hash(
+    U32 h, void *tableBase, const BYTE *srcBase) {
+  const U32 * const hashTable = (U32*)tableBase;
+  return hashTable[h] + srcBase;
+}
+
+/* Reads one byte. */
+static BYTE LDM_read_byte(const void *memPtr) {
+  BYTE val;
+  memcpy(&val, memPtr, 1);
+  return val;
+}
+
+/* Counts how many leading bytes of pIn and pMatch agree, stopping one byte
+ * short of pInLimit. */
+static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch,
+                          const BYTE *pInLimit) {
+  const BYTE * const pStart = pIn;
+  while (pIn < pInLimit - 1 && LDM_read_byte(pMatch) == LDM_read_byte(pIn)) {
+    pIn++;
+    pMatch++;
+  }
+  return (unsigned)(pIn - pStart);
+}
+
+/* Number of continuation bytes that follow a token nibble whose largest
+ * value is mask when it encodes length. */
+static size_t LDM_length_bytes(size_t length, size_t mask) {
+  return (length >= mask) ? (length - mask) / 255 + 1 : 0;
+}
+
+/* Writes a length continuation: a run of 255s, then the remainder.
+ * Returns the position after the last byte written. */
+static BYTE *LDM_write_length(BYTE *op, size_t length) {
+  for (; length >= 255; length -= 255) {
+    *op++ = 255;
+  }
+  *op++ = (BYTE)length;
+  return op;
+}
+
+/* Adds the length continuation at *ipPtr to *length and advances *ipPtr.
+ * Returns 0 if the input ends before the continuation does, 1 otherwise. */
+static int LDM_read_length(const BYTE **ipPtr, const BYTE *iend,
+                           size_t *length) {
+  const BYTE *ip = *ipPtr;
+  unsigned s;
+
+  do {
+    if (ip >= iend) {
+      return 0;
+    }
+    s = *ip++;
+    *length += s;
+  } while (s == 255);
+  *ipPtr = ip;
+  return 1;
+}
+
+void LDM_read_header(void const *source, size_t *compressed_size,
+                     size_t *decompressed_size) {
+  const BYTE *ip = (const BYTE *)source;
+  *compressed_size = LDM_read32(ip);
+  *decompressed_size = LDM_read32(ip + LDM_COMPRESS_SIZE);
+}
+
+size_t LDM_compress(void const *source, void *dest, size_t source_size,
+                    size_t max_dest_size) {
+  const BYTE * const istart = (const BYTE *)source;
+  const BYTE * const iend = istart + source_size;
+  const BYTE *ip = istart;
+  const BYTE *anchor = istart;
+  BYTE * const ostart = (BYTE *)dest;
+  BYTE * const oend = ostart + max_dest_size;
+  BYTE *op = ostart;
+  U32 hashTable[LDM_HASHTABLESIZE_U32];
+
+  /* A match needs two positions with LDM_MATCH_CHECK_SIZE readable bytes
+   * each, so shorter inputs are stored as literals only. This also keeps
+   * the limit pointers below inside the source buffer. */
+  if (source_size > LDM_MATCH_CHECK_SIZE) {
+    /* Last position whose HASH_SIZE bytes can be hashed. */
+    const BYTE * const ilimit = iend - HASH_SIZE;
+    const BYTE * const matchlimit = iend - HASH_SIZE;
+    /* Last position whose LDM_MATCH_CHECK_SIZE bytes can be compared. */
+    const BYTE * const searchlimit = iend - LDM_MATCH_CHECK_SIZE;
+    U32 forwardH;
+
+    memset(hashTable, 0, sizeof(hashTable));
+
+    /* Hash first byte: put into hash table */
+    LDM_put_position(ip, hashTable, istart);
+    ip++;
+    forwardH = LDM_hash_position(ip);
+
+    while (ip < ilimit) {
+      const BYTE *match;
+
+      /* Find a match */
+      {
+        const BYTE *forwardIp = ip;
+        unsigned const step = 1;
+
+        do {
+          U32 const h = forwardH;
+          ip = forwardIp;
+          forwardIp += step;
+
+          if (forwardIp > searchlimit) {
+            goto _last_literals;
+          }
+
+          match = LDM_get_position_on_hash(h, hashTable, istart);
+
+          forwardH = LDM_hash_position(forwardIp);
+          LDM_put_position_on_hash(ip, h, hashTable, istart);
+        } while (ip - match > WINDOW_SIZE ||
+                 LDM_read64(match) != LDM_read64(ip));
+      }
+
+      /* Extend the match backwards over bytes not yet emitted. */
+      while (ip > anchor && match > istart && ip[-1] == match[-1]) {
+        ip--;
+        match--;
+      }
+
+      /* Encode the sequence: token, literals, offset, match length. */
+      {
+        size_t const litLength = (size_t)(ip - anchor);
+        size_t const matchCode = LDM_count(ip + MINMATCH, match + MINMATCH,
+                                           matchlimit);
+        size_t const needed = 1 + LDM_length_bytes(litLength, RUN_MASK) +
+                              litLength + LDM_OFFSET_SIZE +
+                              LDM_length_bytes(matchCode, ML_MASK);
+        BYTE * const token = op;
+
+        if (needed > (size_t)(oend - op)) {
+          return 0;
+        }
+        op++;
+#ifdef LDM_DEBUG
+        printf("Cur position: %zu\n", (size_t)(anchor - istart));
+        printf("LitLength %zu. (Match offset). %zu\n", litLength,
+               (size_t)(ip - match));
+        printf("Match length %zu\n", matchCode + MINMATCH);
+#endif
+        if (litLength >= RUN_MASK) {
+          *token = (BYTE)(RUN_MASK << ML_BITS);
+          op = LDM_write_length(op, litLength - RUN_MASK);
+        } else {
+          *token = (BYTE)(litLength << ML_BITS);
+        }
+        memcpy(op, anchor, litLength);
+        op += litLength;
+
+        LDM_write32(op, (U32)(ip - match));
+        op += LDM_OFFSET_SIZE;
+
+        if (matchCode >= ML_MASK) {
+          *token += ML_MASK;
+          op = LDM_write_length(op, matchCode - ML_MASK);
+        } else {
+          *token += (BYTE)matchCode;
+        }
+        ip += MINMATCH + matchCode;
+      }
+
+      anchor = ip;
+
+      /* Re-seeding hashes ip and ip + 1; stop when either would read past
+       * the end of the source. */
+      if (ip >= ilimit) {
+        break;
+      }
+      LDM_put_position(ip, hashTable, istart);
+      forwardH = LDM_hash_position(++ip);
+    }
+  }
+
+_last_literals:
+  /* Encode last literals */
+  {
+    size_t const lastRun = (size_t)(iend - anchor);
+    size_t const needed = 1 + LDM_length_bytes(lastRun, RUN_MASK) + lastRun;
+
+    if (needed > (size_t)(oend - op)) {
+      return 0;
+    }
+    if (lastRun >= RUN_MASK) {
+      *op++ = (BYTE)(RUN_MASK << ML_BITS);
+      op = LDM_write_length(op, lastRun - RUN_MASK);
+    } else {
+      *op++ = (BYTE)(lastRun << ML_BITS);
+    }
+    memcpy(op, anchor, lastRun);
+    op += lastRun;
+  }
+  return (size_t)(op - ostart);
+}
+
+size_t LDM_decompress(void const *source, void *dest, size_t compressed_size,
+                      size_t max_decompressed_size) {
+  const BYTE *ip = (const BYTE *)source;
+  const BYTE * const iend = ip + compressed_size;
+  BYTE * const ostart = (BYTE *)dest;
+  BYTE * const oend = ostart + max_decompressed_size;
+  BYTE *op = ostart;
+
+  while (ip < iend) {
+    unsigned const token = *ip++;
+    size_t length = token >> ML_BITS;
+    size_t offset;
+    const BYTE *match;
+
+    /* get literal length */
+    if (length == RUN_MASK && !LDM_read_length(&ip, iend, &length)) {
+      break;
+    }
+#ifdef LDM_DEBUG
+    printf("Literal length: %zu\n", length);
+#endif
+
+    /* copy literals */
+    if (length > (size_t)(iend - ip) || length > (size_t)(oend - op)) {
+      break;
+    }
+    memcpy(op, ip, length);
+    ip += length;
+    op += length;
+
+    /* The last sequence holds literals only: no offset or match follows. */
+    if (ip >= iend) {
+      break;
+    }
+
+    /* get offset */
+    if ((size_t)(iend - ip) < LDM_OFFSET_SIZE) {
+      break;
+    }
+    offset = LDM_read32(ip);
+    ip += LDM_OFFSET_SIZE;
+#ifdef LDM_DEBUG
+    printf("Offset: %zu\n", offset);
+#endif
+    /* A match must start inside the bytes already written. */
+    if (offset == 0 || offset > (size_t)(op - ostart)) {
+      break;
+    }
+    match = op - offset;
+
+    /* get matchlength */
+    length = token & ML_MASK;
+    if (length == ML_MASK && !LDM_read_length(&ip, iend, &length)) {
+      break;
+    }
+    length += MINMATCH;
+#ifdef LDM_DEBUG
+    printf("Match length: %zu\n", length);
+#endif
+    if (length > (size_t)(oend - op)) {
+      length = (size_t)(oend - op);
+    }
+
+    /* copy match byte by byte: the ranges overlap when offset < length */
+    while (length > 0) {
+      *op++ = *match++;
+      length--;
+    }
+  }
+  return (size_t)(op - ostart);
+}
diff --git a/contrib/long_distance_matching/versions/v1/ldm.h b/contrib/long_distance_matching/versions/v1/ldm.h
new file mode 100644
--- /dev/null
+++ b/contrib/long_distance_matching/versions/v1/ldm.h
@@ -0,0 +1,43 @@
+#ifndef LDM_H
+#define LDM_H
+
+#include <stddef.h>   /* size_t */
+
+#define LDM_COMPRESS_SIZE 4
+#define LDM_DECOMPRESS_SIZE 4
+#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE))
+
+/* Upper bound on the bytes LDM_compress() writes for source_size input
+ * bytes: the input itself, one length byte per 255 literals, and a small
+ * constant for the tokens of the final sequences. */
+#define LDM_COMPRESS_BOUND(source_size) \
+  ((source_size) + (source_size) / 255 + 16)
+
+/**
+ * Compresses source_size bytes from source into dest.
+ *
+ * Returns the number of bytes written to dest, or 0 if the output does not
+ * fit in max_dest_size bytes. LDM_COMPRESS_BOUND(source_size) is always
+ * large enough.
+ */
+size_t LDM_compress(void const *source, void *dest, size_t source_size,
+                    size_t max_dest_size);
+
+/**
+ * Decompresses compressed_size bytes produced by LDM_compress() into dest,
+ * writing at most max_decompressed_size bytes.
+ *
+ * Returns the number of bytes written to dest. Decoding stops early, and the
+ * count is correspondingly short, if the input is truncated or malformed.
+ */
+size_t LDM_decompress(void const *source, void *dest, size_t compressed_size,
+                      size_t max_decompressed_size);
+
+/**
+ * Reads the two 32-bit sizes, stored in host byte order, from the
+ * LDM_HEADER_SIZE bytes at source.
+ */
+void LDM_read_header(void const *source, size_t *compressed_size,
+                     size_t *decompressed_size);
+
+#endif /* LDM_H */
diff --git a/contrib/long_distance_matching/versions/v1/main-ldm.c b/contrib/long_distance_matching/versions/v1/main-ldm.c
new file mode 100644
--- /dev/null
+++ b/contrib/long_distance_matching/versions/v1/main-ldm.c
@@ -0,0 +1,343 @@
+// TODO: file size must fit into a U32
+// NOTE(review): system header names below were reconstructed; confirm.
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+//#define ZSTD
+
+#ifdef ZSTD
+#include <zstd.h>
+#endif
+#include "ldm.h"
+
+#define DEBUG
+
+#ifdef ZSTD
+#define COMPRESS_BOUND(size) ZSTD_compressBound(size)
+#else
+#define COMPRESS_BOUND(size) LDM_COMPRESS_BOUND(size)
+#endif
+
+/* Unmaps a region returned by mmap(); MAP_FAILED is ignored. */
+static void unmap(void *addr, size_t length) {
+  if (addr != MAP_FAILED) {
+    munmap(addr, length);
+  }
+}
+
+/* Closes fd unless it still holds the "not opened" value -1. */
+static void close_fd(int fd) {
+  if (fd >= 0) {
+    close(fd);
+  }
+}
+
+/**
+ * Compresses the file fname into the file oname. The output starts with an
+ * LDM_HEADER_SIZE-byte header holding the total output size and the input
+ * size, both as 32-bit values in host byte order.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static size_t compress(const char *fname, const char *oname) {
+  int fdin = -1, fdout = -1;
+  struct stat statbuf;
+  void *src = MAP_FAILED, *dst = MAP_FAILED;
+  size_t size_in = 0, max_out = 0, size_out;
+  size_t ret = 1;
+
+  /* open the input file */
+  if ((fdin = open(fname, O_RDONLY)) < 0) {
+    perror("Error in file opening");
+    goto cleanup;
+  }
+
+  /* open the output file */
+  if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) {
+    perror("Can't create output file");
+    goto cleanup;
+  }
+
+  /* find size of input file */
+  if (fstat(fdin, &statbuf) < 0) {
+    perror("Fstat error");
+    goto cleanup;
+  }
+  /* mmap() rejects empty files and the header stores sizes in 32 bits. */
+  if (statbuf.st_size <= 0 || (uintmax_t)statbuf.st_size > UINT32_MAX / 2) {
+    fprintf(stderr, "Input must hold between 1 byte and 2 GiB\n");
+    goto cleanup;
+  }
+  size_in = (size_t)statbuf.st_size;
+  max_out = LDM_HEADER_SIZE + COMPRESS_BOUND(size_in);
+
+  /* size the output file for the worst case; it is trimmed afterwards */
+  if (ftruncate(fdout, (off_t)max_out) < 0) {
+    perror("ftruncate error");
+    goto cleanup;
+  }
+
+  /* mmap the input file */
+  src = mmap(0, size_in, PROT_READ, MAP_SHARED, fdin, 0);
+  if (src == MAP_FAILED) {
+    perror("mmap error for input");
+    goto cleanup;
+  }
+
+  /* mmap the output file */
+  dst = mmap(0, max_out, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0);
+  if (dst == MAP_FAILED) {
+    perror("mmap error for output");
+    goto cleanup;
+  }
+
+#ifdef ZSTD
+  size_out = ZSTD_compress((char *)dst + LDM_HEADER_SIZE,
+                           max_out - LDM_HEADER_SIZE, src, size_in, 1);
+  if (ZSTD_isError(size_out)) {
+    fprintf(stderr, "ZSTD_compress: %s\n", ZSTD_getErrorName(size_out));
+    goto cleanup;
+  }
+#else
+  size_out = LDM_compress(src, (char *)dst + LDM_HEADER_SIZE, size_in,
+                          max_out - LDM_HEADER_SIZE);
+  if (size_out == 0) {
+    fprintf(stderr, "LDM_compress: output buffer too small\n");
+    goto cleanup;
+  }
+#endif
+  size_out += LDM_HEADER_SIZE;
+
+  // TODO: should depend on LDM_DECOMPRESS_SIZE write32
+  {
+    /* Copy fixed-width values: the first 4 bytes of a size_t or off_t are
+     * its low half only on little-endian hosts. */
+    uint32_t const header[2] = { (uint32_t)size_out, (uint32_t)size_in };
+    memcpy(dst, header, sizeof(header));
+  }
+  printf("Compressed size: %zu\n", size_out);
+  printf("Decompressed size: %zu\n", size_in);
+
+  if (ftruncate(fdout, (off_t)size_out) < 0) {
+    perror("ftruncate error");
+    goto cleanup;
+  }
+
+  printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname,
+         (unsigned)size_in, (unsigned)size_out, oname,
+         (double)size_out / size_in * 100);
+  ret = 0;
+
+cleanup:
+  unmap(dst, max_out);
+  unmap(src, size_in);
+  close_fd(fdout);
+  close_fd(fdin);
+  return ret;
+}
+
+/**
+ * Decompresses the file fname, as written by compress(), into oname.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static size_t decompress(const char *fname, const char *oname) {
+  int fdin = -1, fdout = -1;
+  struct stat statbuf;
+  void *src = MAP_FAILED, *dst = MAP_FAILED;
+  size_t size_in = 0, compressed_size = 0, decompressed_size = 0;
+  size_t size_out;
+  size_t ret = 1;
+
+  /* open the input file */
+  if ((fdin = open(fname, O_RDONLY)) < 0) {
+    perror("Error in file opening");
+    goto cleanup;
+  }
+
+  /* open the output file */
+  if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) {
+    perror("Can't create output file");
+    goto cleanup;
+  }
+
+  /* find size of input file */
+  if (fstat(fdin, &statbuf) < 0) {
+    perror("Fstat error");
+    goto cleanup;
+  }
+  if (statbuf.st_size < LDM_HEADER_SIZE) {
+    fprintf(stderr, "Input is too short to hold a header\n");
+    goto cleanup;
+  }
+  size_in = (size_t)statbuf.st_size;
+
+  /* mmap the input file */
+  src = mmap(0, size_in, PROT_READ, MAP_SHARED, fdin, 0);
+  if (src == MAP_FAILED) {
+    perror("mmap error for input");
+    goto cleanup;
+  }
+
+  /* read header */
+  LDM_read_header(src, &compressed_size, &decompressed_size);
+
+  printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n",
+         size_in, compressed_size, decompressed_size);
+
+  /* mmap() cannot map an empty output file */
+  if (decompressed_size == 0) {
+    fprintf(stderr, "Header declares an empty output\n");
+    goto cleanup;
+  }
+
+  /* size the output file so that it can be mapped */
+  if (ftruncate(fdout, (off_t)decompressed_size) < 0) {
+    perror("ftruncate error");
+    goto cleanup;
+  }
+
+  /* mmap the output file */
+  dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+             fdout, 0);
+  if (dst == MAP_FAILED) {
+    perror("mmap error for output");
+    goto cleanup;
+  }
+
+#ifdef ZSTD
+  size_out = ZSTD_decompress(dst, decompressed_size,
+                             (const char *)src + LDM_HEADER_SIZE,
+                             size_in - LDM_HEADER_SIZE);
+  if (ZSTD_isError(size_out)) {
+    fprintf(stderr, "ZSTD_decompress: %s\n", ZSTD_getErrorName(size_out));
+    goto cleanup;
+  }
+#else
+  size_out = LDM_decompress((const char *)src + LDM_HEADER_SIZE, dst,
+                            size_in - LDM_HEADER_SIZE, decompressed_size);
+#endif
+  printf("Ret size out: %zu\n", size_out);
+
+  if (ftruncate(fdout, (off_t)size_out) < 0) {
+    perror("ftruncate error");
+    goto cleanup;
+  }
+  ret = 0;
+
+cleanup:
+  unmap(dst, decompressed_size);
+  unmap(src, size_in);
+  close_fd(fdout);
+  close_fd(fdin);
+  return ret;
+}
+
+/* Compares two streams; returns 0 when their contents are identical. */
+static int compare(FILE *fp0, FILE *fp1) {
+  int result = 0;
+  while (result == 0) {
+    char b0[1024];
+    char b1[1024];
+    const size_t r0 = fread(b0, 1, sizeof(b0), fp0);
+    const size_t r1 = fread(b1, 1, sizeof(b1), fp1);
+
+    result = (int)r0 - (int)r1;
+
+    if (0 == r0 || 0 == r1) {
+      break;
+    }
+    if (0 == result) {
+      result = memcmp(b0, b1, r0);
+    }
+  }
+  return result;
+}
+
+/* Reports whether decFilename holds the same bytes as inpFilename. */
+static void verify(const char *inpFilename, const char *decFilename) {
+  FILE *inpFp = fopen(inpFilename, "rb");
+  FILE *decFp = fopen(decFilename, "rb");
+
+  printf("verify : %s <-> %s\n", inpFilename, decFilename);
+  if (inpFp == NULL || decFp == NULL) {
+    printf("verify : cannot open files\n");
+  } else if (0 == compare(inpFp, decFp)) {
+    printf("verify : OK\n");
+  } else {
+    printf("verify : NG\n");
+  }
+
+  if (decFp != NULL) {
+    fclose(decFp);
+  }
+  if (inpFp != NULL) {
+    fclose(inpFp);
+  }
+}
+
+/* Seconds elapsed between two gettimeofday() samples. */
+static double elapsed_seconds(const struct timeval *start,
+                              const struct timeval *end) {
+  return (double)(end->tv_usec - start->tv_usec) / 1000000 +
+         (double)(end->tv_sec - start->tv_sec);
+}
+
+int main(int argc, const char *argv[]) {
+  const char * const exeName = argv[0];
+  char inpFilename[256] = { 0 };
+  char ldmFilename[256] = { 0 };
+  char decFilename[256] = { 0 };
+  struct timeval tv1, tv2;
+  int n;
+
+  if (argc < 2) {
+    printf("Wrong arguments\n");
+    printf("Usage:\n");
+    printf("%s FILE\n", exeName);
+    return 1;
+  }
+
+  snprintf(inpFilename, sizeof(inpFilename), "%s", argv[1]);
+  snprintf(ldmFilename, sizeof(ldmFilename), "%s.ldm", argv[1]);
+  /* The longest derived name decides whether all three fit. */
+  n = snprintf(decFilename, sizeof(decFilename), "%s.ldm.dec", argv[1]);
+  if (n < 0 || (size_t)n >= sizeof(decFilename)) {
+    printf("File name too long\n");
+    return 1;
+  }
+
+  printf("inp = [%s]\n", inpFilename);
+  printf("ldm = [%s]\n", ldmFilename);
+  printf("dec = [%s]\n", decFilename);
+
+  /* compress */
+  gettimeofday(&tv1, NULL);
+  if (compress(inpFilename, ldmFilename)) {
+    printf("Compress error\n");
+    return 1;
+  }
+  gettimeofday(&tv2, NULL);
+  printf("Total time = %f seconds\n", elapsed_seconds(&tv1, &tv2));
+
+  /* decompress */
+  gettimeofday(&tv1, NULL);
+  if (decompress(ldmFilename, decFilename)) {
+    printf("Decompress error\n");
+    return 1;
+  }
+  gettimeofday(&tv2, NULL);
+  printf("Total time = %f seconds\n", elapsed_seconds(&tv1, &tv2));
+
+  /* verify */
+  verify(inpFilename, decFilename);
+  return 0;
+}