From 88f3d8641e55544fbf22f7226ee1793828c65983 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 5 Jul 2017 13:57:07 -0700 Subject: [PATCH 01/62] Initial long distance matcher commit --- contrib/long_distance_matching/Makefile | 27 +++ contrib/long_distance_matching/ldm.c | 43 +++++ contrib/long_distance_matching/ldm.h | 10 ++ contrib/long_distance_matching/main.c | 227 ++++++++++++++++++++++++ contrib/long_distance_matching/main.h | 7 + 5 files changed, 314 insertions(+) create mode 100644 contrib/long_distance_matching/Makefile create mode 100644 contrib/long_distance_matching/ldm.c create mode 100644 contrib/long_distance_matching/ldm.h create mode 100644 contrib/long_distance_matching/main.c create mode 100644 contrib/long_distance_matching/main.h diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile new file mode 100644 index 00000000..bfe02ea2 --- /dev/null +++ b/contrib/long_distance_matching/Makefile @@ -0,0 +1,27 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+# ################################################################ + +# This Makefile presumes libzstd is installed, using `sudo make install` + + +.PHONY: default all clean + +default: all + +all: main + + +main : ldm.c main.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ + main + @echo Cleaning completed + diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c new file mode 100644 index 00000000..34118c81 --- /dev/null +++ b/contrib/long_distance_matching/ldm.c @@ -0,0 +1,43 @@ +#include +#include +#include + +#include "ldm.h" + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; + +typedef uint64_t tag; + +struct hash_entry { + U64 offset; + tag t; +}; + +size_t LDM_compress(const char *source, char *dest, size_t source_size, size_t max_dest_size) { + // max_dest_size >= source_size + + + /** + * Loop: + * Find match at position k (hash next n bytes, rolling hash) + * Compute match length + * Output literal length: k (sequences of 4 + (k-4) bytes) + * Output match length + * Output literals + * Output offset + */ + + memcpy(dest, source, source_size); + return source_size; +} + +size_t LDM_decompress(const char *source, char *dest, size_t compressed_size, size_t max_decompressed_size) { + memcpy(dest, source, compressed_size); + return compressed_size; +} + + diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h new file mode 100644 index 00000000..d0151373 --- /dev/null +++ b/contrib/long_distance_matching/ldm.h @@ -0,0 +1,10 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +size_t LDM_compress(const char *source, char *dest, size_t source_size, size_t max_dest_size); + +size_t LDM_decompress(const char *source, char *dest, size_t compressed_size, size_t max_decompressed_size); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main.c 
b/contrib/long_distance_matching/main.c new file mode 100644 index 00000000..ddf5145f --- /dev/null +++ b/contrib/long_distance_matching/main.c @@ -0,0 +1,227 @@ +#include +#include +#include + +#include "ldm.h" + +#define BUF_SIZE 16*1024 // Block size +#define LDM_HEADER_SIZE 8 + +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* 
Decompress: + * Continue while there is more input to read. + */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) { + break; + } + if (0 == result) { + result = memcmp(b0, b1, r0); + } + } + return result; +} + +int main(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + 
fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + + + return 0; +} + + diff --git a/contrib/long_distance_matching/main.h b/contrib/long_distance_matching/main.h new file mode 100644 index 00000000..a0b03012 --- /dev/null +++ b/contrib/long_distance_matching/main.h @@ -0,0 +1,7 @@ +#ifndef _MAIN_H +#define _MAIN_H + +void compress_file(FILE *in, FILE *out, int argc, char *argv[]); +void decompress_file(FILE *in, FILE *out, int argc, char *argv[]); + +#endif /* _MAIN_H */ From 8aa34a76086b32ae66119b50961e33c4777e2bbf Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 6 Jul 2017 07:30:49 -0700 Subject: [PATCH 02/62] Switch to mmapping files --- contrib/long_distance_matching/Makefile | 9 +- contrib/long_distance_matching/main-ldm.c | 411 ++++++++++++++++++++++ 2 files changed, 418 insertions(+), 2 deletions(-) create mode 100644 contrib/long_distance_matching/main-ldm.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index bfe02ea2..0efae69b 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -10,18 +10,23 @@ # This Makefile presumes libzstd is installed, using `sudo make install` +LDFLAGS += -lzstd + .PHONY: default all clean default: all -all: main +all: main main-ldm main : 
ldm.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-ldm : ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main + main main-ldm @echo Cleaning completed diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c new file mode 100644 index 00000000..bdcffb0f --- /dev/null +++ b/contrib/long_distance_matching/main-ldm.c @@ -0,0 +1,411 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +#define BUF_SIZE 16*1024 // Block size +#define LDM_HEADER_SIZE 8 +#define DEBUG + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void 
*src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. + */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} +#endif + +static size_t compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, statbuf.st_size - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + 
return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, statbuf.st_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + /* Copy input file to output file */ +// memcpy(dst, src, statbuf.st_size); + size_t size_out = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)size_out, oname, + (double)size_out / (statbuf.st_size) * 100); + + close(fdin); + close(fdout); + return 0; +} + +static size_t decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, statbuf.st_size - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, statbuf.st_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + /* Copy input file to output file */ +// memcpy(dst, src, statbuf.st_size); + + size_t size_out = ZSTD_decompress(dst, statbuf.st_size, + src, statbuf.st_size); + + + close(fdin); + close(fdout); + return 0; +} + +static int compare(FILE *fp0, FILE 
*fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) { + break; + } + if (0 == result) { + result = memcmp(b0, b1, r0); + } + } + return result; +} + +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + + /* decompress */ + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + + /* verify */ + verify(inpFilename, decFilename); +} + +#if 0 +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + 
printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + From b96ad327a48bf4a97574e3d11d2f94741d900616 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 6 Jul 2017 15:23:15 -0700 Subject: [PATCH 03/62] Add simple compress and decompress functions --- contrib/long_distance_matching/ldm.c | 342 ++++++++++++++++++++-- contrib/long_distance_matching/ldm.h | 6 +- contrib/long_distance_matching/main-ldm.c | 44 ++- 3 files changed, 364 insertions(+), 28 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 34118c81..c8051ea4 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -1,9 +1,25 @@ 
#include #include #include +#include #include "ldm.h" +#define LDM_MEMORY_USAGE 14 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define WINDOW_SIZE (1 << 20) +#define HASH_SIZE 4 +#define MINMATCH 4 + +#define ML_BITS 4 +#define ML_MASK ((1U<>8); + } +} + + + +static U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +static void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE * const e = (BYTE *)dstEnd; + + do { + LDM_copy8(d, s); + d += 8; + s += 8; + } while (d < e); + +} + struct hash_entry { U64 offset; tag t; }; -size_t LDM_compress(const char *source, char *dest, size_t source_size, size_t max_dest_size) { - // max_dest_size >= source_size - - - /** - * Loop: - * Find match at position k (hash next n bytes, rolling hash) - * Compute match length - * Output literal length: k (sequences of 4 + (k-4) bytes) - * Output match length - * Output literals - * Output offset - */ - - memcpy(dest, source, source_size); - return source_size; +static U32 LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LDM_HASHLOG)); } -size_t LDM_decompress(const char *source, char *dest, size_t compressed_size, size_t max_decompressed_size) { - memcpy(dest, source, compressed_size); +static U32 LDM_hash_position(const void * const p) { + return LDM_hash(LDM_read32(p)); +} + +static U64 find_best_match(tag t, U64 offset) { + return 0; +} + +static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, + const BYTE *srcBase) { + U32 *hashTable = (U32 *) tableBase; + hashTable[h] = (U32)(p - srcBase); +} + +static void LDM_put_position(const BYTE *p, void *tableBase, + const BYTE *srcBase) { + U32 const h = 
LDM_hash_position(p); + LDM_put_position_on_hash(p, h, tableBase, srcBase); +} + +static const BYTE *LDM_get_position_on_hash( + U32 h, void *tableBase, const BYTE *srcBase) { + const U32 * const hashTable = (U32*)tableBase; + return hashTable[h] + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + + +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size) { + const BYTE * const istart = (const BYTE*)source; + const BYTE *ip = istart; + const BYTE * const iend = istart + source_size; + const BYTE *ilimit = iend - HASH_SIZE; + const BYTE * const matchlimit = iend - HASH_SIZE; + BYTE *op = (BYTE*) dest; + U32 hashTable[LDM_HASHTABLESIZE_U32]; + memset(hashTable, 0, sizeof(hashTable)); + + const BYTE *anchor = (const BYTE *)source; +// struct LDM_cctx cctx; + size_t output_size = 0; + + U32 forwardH; + + /* Hash first byte: put into hash table */ + + LDM_put_position(ip, hashTable, istart); + ip++; + forwardH = LDM_hash_position(ip); + + while (ip < ilimit) { + const BYTE *match; + BYTE *token; + /* Find a match */ + { + const BYTE *forwardIp = ip; + unsigned step = 1; + + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + + match = LDM_get_position_on_hash(h, hashTable, istart); + + forwardH = LDM_hash_position(forwardIp); + LDM_put_position_on_hash(ip, h, hashTable, istart); + } while (ip - match > WINDOW_SIZE || + LDM_read32(match) != LDM_read32(ip)); + } + + /* Encode literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + + printf("Cur position: %zu\n", 
anchor - istart); + printf("LitLength %zu. (Match offset). %zu\n", litLength, ip - match); + /* + fwrite(match, 4, 1, stdout); + printf("\n"); + */ + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *op++ = (BYTE)len; + } + } else { + *token = (BYTE)(litLength << ML_BITS); + } + + printf("Literals "); + fwrite(anchor, litLength, 1, stdout); + printf("\n"); + + LDM_wild_copy(op, anchor, op + litLength); + op += litLength; + } +_next_match: + /* Encode offset */ + { + LDM_writeLE16(op, (U16)(ip - match)); + op += 2; + } + + /* Encode Match Length */ + { + unsigned matchCode; + matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, + matchlimit); + + printf("Match length %zu\n", matchCode + MINMATCH); + fwrite(ip, MINMATCH + matchCode, 1, stdout); + printf("\n"); + ip += MINMATCH + matchCode; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LDM_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*0xFF) { + op += 4; + LDM_write32(op, 0xffffffff); + matchCode -= 4*0xFF; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else { + *token += (BYTE)(matchCode); + } + printf("\n"); + } + + anchor = ip; + + LDM_put_position(ip, hashTable, istart); + forwardH = LDM_hash_position(++ip); + } + /* Encode last literals */ + { + /* + size_t const lastRun = (size_t)(iend - anchor); + printf("last run length: %zu, %zu %zu %zu %zu\n", lastRun, iend-istart, + anchor-istart, ip-istart, ilimit-istart); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *op++ = 255; + } + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + fwrite(anchor, lastRun, 1, stdout); + printf("^last run\n"); + memcpy(op, anchor, lastRun); + op += lastRun; + +// memcpy(dest + (ip - istart), ip, 1); +// */ + } + return (op - (BYTE *)dest); +} + +size_t 
LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)source; + const BYTE * const iend = ip + compressed_size; + BYTE *op = (BYTE *)dest; + BYTE * const oend = op + max_decompressed_size; + BYTE *cpy; + + while (ip < iend) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + printf("Literal length: %zu\n", length); + + /* copy literals */ + cpy = op + length; + LDM_wild_copy(op, ip, cpy); + ip += length; + op = cpy; + + /* get offset */ + offset = LDM_readLE16(ip); + printf("Offset: %zu\n", offset); + ip += 2; + match = op - offset; + // LDM_write32(op, (U32)offset); + + /* get matchlength */ + length = token & ML_MASK; + printf("Match length: %zu\n", length); + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + length += MINMATCH; + + /* copy match */ + cpy = op + length; + + + + } + +// memcpy(dest, source, compressed_size); return compressed_size; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index d0151373..0aab6aa3 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -3,8 +3,10 @@ #include /* size_t */ -size_t LDM_compress(const char *source, char *dest, size_t source_size, size_t max_dest_size); +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size); -size_t LDM_decompress(const char *source, char *dest, size_t compressed_size, size_t max_decompressed_size); +size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index bdcffb0f..8b97ce92 
100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #define BUF_SIZE 16*1024 // Block size #define LDM_HEADER_SIZE 8 #define DEBUG +// #define ZSTD #if 0 static size_t compress_file(FILE *in, FILE *out, size_t *size_in, @@ -163,7 +165,7 @@ static size_t compress(const char *fname, const char *oname) { perror("lseek error"); return 1; } - + /* write a dummy byte at the last location */ if (write(fdout, "", 1) != 1) { perror("write error"); @@ -186,9 +188,15 @@ static size_t compress(const char *fname, const char *oname) { /* Copy input file to output file */ // memcpy(dst, src, statbuf.st_size); - size_t size_out = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + #ifdef ZSTD + size_t size_out = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); + #else + size_t size_out = LDM_compress(src, dst, statbuf.st_size, + statbuf.st_size); + #endif + ftruncate(fdout, size_out); + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, (unsigned)statbuf.st_size, (unsigned)size_out, oname, (double)size_out / (statbuf.st_size) * 100); @@ -225,7 +233,7 @@ static size_t decompress(const char *fname, const char *oname) { perror("lseek error"); return 1; } - + /* write a dummy byte at the last location */ if (write(fdout, "", 1) != 1) { perror("write error"); @@ -249,9 +257,14 @@ static size_t decompress(const char *fname, const char *oname) { /* Copy input file to output file */ // memcpy(dst, src, statbuf.st_size); - size_t size_out = ZSTD_decompress(dst, statbuf.st_size, - src, statbuf.st_size); - + #ifdef ZSTD + size_t size_out = ZSTD_decompress(dst, statbuf.st_size, + src, statbuf.st_size); + #else + size_t size_out = LDM_decompress(src, dst, statbuf.st_size, + statbuf.st_size); + #endif + ftruncate(fdout, size_out); close(fdin); close(fdout); @@ -315,20 
+328,35 @@ int main(int argc, const char *argv[]) { printf("ldm = [%s]\n", ldmFilename); printf("dec = [%s]\n", decFilename); + struct timeval tv1, tv2; /* compress */ + { + gettimeofday(&tv1, NULL); if (compress(inpFilename, ldmFilename)) { printf("Compress error"); return 1; } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } /* decompress */ + + gettimeofday(&tv1, NULL); if (decompress(ldmFilename, decFilename)) { printf("Decompress error"); return 1; } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); /* verify */ verify(inpFilename, decFilename); + return 0; } #if 0 From 3bbfa1249e8ef2f2b966f6b71a75d8cfbb53089a Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 6 Jul 2017 16:47:08 -0700 Subject: [PATCH 04/62] Update compressor and decompressor --- contrib/long_distance_matching/ldm.c | 40 ++++++++++++++++++----- contrib/long_distance_matching/main-ldm.c | 4 +-- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index c8051ea4..908ac2ac 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -20,6 +20,8 @@ #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U<= 255; len -= 255) { - *op++ = (BYTE)len; + *op++ = 255; } + *op++ = (BYTE)len; } else { *token = (BYTE)(litLength << ML_BITS); } - +#ifdef LDM_DEBUG printf("Literals "); fwrite(anchor, litLength, 1, stdout); printf("\n"); - +#endif LDM_wild_copy(op, anchor, op + litLength); op += litLength; } @@ -232,10 +238,11 @@ _next_match: unsigned matchCode; matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, matchlimit); - +#ifdef LDM_DEBUG printf("Match length %zu\n", matchCode + MINMATCH); fwrite(ip, MINMATCH + matchCode, 1, stdout); printf("\n"); +#endif ip += MINMATCH + 
matchCode; if (matchCode >= ML_MASK) { *token += ML_MASK; @@ -251,7 +258,9 @@ _next_match: } else { *token += (BYTE)(matchCode); } +#ifdef LDM_DEBUG printf("\n"); +#endif } anchor = ip; @@ -308,24 +317,33 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, length += s; } while (s == 255); } +#ifdef LDM_DEBUG printf("Literal length: %zu\n", length); +#endif /* copy literals */ cpy = op + length; +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(ip, length, 1, stdout); + printf("\n"); +#endif LDM_wild_copy(op, ip, cpy); ip += length; op = cpy; /* get offset */ offset = LDM_readLE16(ip); + +#ifdef LDM_DEBUG printf("Offset: %zu\n", offset); +#endif ip += 2; match = op - offset; // LDM_write32(op, (U32)offset); /* get matchlength */ length = token & ML_MASK; - printf("Match length: %zu\n", length); if (length == ML_MASK) { unsigned s; do { @@ -334,16 +352,20 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, } while (s == 255); } length += MINMATCH; - +#ifdef LDM_DEBUG + printf("Match length: %zu\n", length); +#endif /* copy match */ cpy = op + length; - - + // Inefficient for now + while (match < cpy - offset) { + *op++ = *match++; + } } // memcpy(dest, source, compressed_size); - return compressed_size; + return op - (BYTE *)dest; } diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 8b97ce92..7f1abdab 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -229,7 +229,7 @@ static size_t decompress(const char *fname, const char *oname) { } /* go to the location corresponding to the last byte */ - if (lseek(fdout, statbuf.st_size - 1, SEEK_SET) == -1) { + if (lseek(fdout, 2*statbuf.st_size - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -264,7 +264,7 @@ static size_t decompress(const char *fname, const char *oname) { size_t size_out = LDM_decompress(src, dst, statbuf.st_size, statbuf.st_size); 
#endif - ftruncate(fdout, size_out); + //ftruncate(fdout, size_out); close(fdin); close(fdout); From f791fc27e3faae15eef3dfa7ce768a05cd2773cb Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 7 Jul 2017 12:44:29 -0700 Subject: [PATCH 05/62] Add header with compress and decompress size --- contrib/long_distance_matching/Makefile | 6 +-- contrib/long_distance_matching/ldm.c | 16 +++--- contrib/long_distance_matching/ldm.h | 7 +++ contrib/long_distance_matching/main-ldm.c | 61 +++++++++++++++-------- 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 0efae69b..4e04fd6a 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -16,11 +16,11 @@ LDFLAGS += -lzstd default: all -all: main main-ldm +all: main-ldm -main : ldm.c main.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +#main : ldm.c main.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-ldm : ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 908ac2ac..cb90efec 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -69,8 +69,6 @@ static void LDM_writeLE16(void *memPtr, U16 value) { } } - - static U32 LDM_read32(const void *ptr) { return *(const U32 *)ptr; } @@ -98,17 +96,13 @@ struct hash_entry { }; static U32 LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((MINMATCH*8)-LDM_HASHLOG)); + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); } static U32 LDM_hash_position(const void * const p) { return LDM_hash(LDM_read32(p)); } -static U64 find_best_match(tag t, U64 offset) { - return 0; -} - static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, const BYTE *srcBase) { U32 *hashTable = (U32 *) tableBase; @@ -148,6 +142,12 @@ static unsigned LDM_count(const BYTE *pIn, 
const BYTE *pMatch, return (unsigned)(pIn - pStart); } +void LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size) { + U32 *ip = (U32 *)source; + *compressed_size = *ip++; + *decompressed_size = *ip; +} size_t LDM_compress(void const *source, void *dest, size_t source_size, size_t max_dest_size) { @@ -359,7 +359,7 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, cpy = op + length; // Inefficient for now - while (match < cpy - offset) { + while (match < cpy - offset && op < oend) { *op++ = *match++; } } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 0aab6aa3..f4ca25a3 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -3,10 +3,17 @@ #include /* size_t */ +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + size_t LDM_compress(void const *source, void *dest, size_t source_size, size_t max_dest_size); size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, size_t max_decompressed_size); +void LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size); + #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 7f1abdab..4d54ef6d 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -11,10 +11,10 @@ #include #include "ldm.h" -#define BUF_SIZE 16*1024 // Block size -#define LDM_HEADER_SIZE 8 +// #define BUF_SIZE 16*1024 // Block size #define DEBUG -// #define ZSTD + +//#define ZSTD #if 0 static size_t compress_file(FILE *in, FILE *out, size_t *size_in, @@ -159,9 +159,10 @@ static size_t compress(const char *fname, const char *oname) { perror("Fstat error"); return 1; } + size_t size_in = statbuf.st_size; /* go to the location corresponding to the last byte */ - if 
(lseek(fdout, statbuf.st_size - 1, SEEK_SET) == -1) { + if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -178,24 +179,31 @@ static size_t compress(const char *fname, const char *oname) { perror("mmap error for input"); return 1; } + size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; /* mmap the output file */ - if ((dst = mmap(0, statbuf.st_size, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; } - /* Copy input file to output file */ -// memcpy(dst, src, statbuf.st_size); #ifdef ZSTD size_t size_out = ZSTD_compress(dst, statbuf.st_size, src, statbuf.st_size, 1); #else - size_t size_out = LDM_compress(src, dst, statbuf.st_size, + size_t size_out = LDM_compress(src, dst + LDM_HEADER_SIZE, statbuf.st_size, statbuf.st_size); + size_out += LDM_HEADER_SIZE; + + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &size_out, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + printf("Compressed size: %zu\n", size_out); + printf("Decompressed size: %zu\n", statbuf.st_size); #endif ftruncate(fdout, size_out); + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, (unsigned)statbuf.st_size, (unsigned)size_out, oname, (double)size_out / (statbuf.st_size) * 100); @@ -228,8 +236,22 @@ static size_t decompress(const char *fname, const char *oname) { return 1; } + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* read header */ + size_t compressed_size, decompressed_size; + LDM_read_header(src, &compressed_size, &decompressed_size); + + printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", + statbuf.st_size, compressed_size, decompressed_size); + /* go to the location corresponding to the last byte */ - if (lseek(fdout, 2*statbuf.st_size - 1, SEEK_SET) == -1) { + 
if (lseek(fdout, decompressed_size - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -240,15 +262,8 @@ static size_t decompress(const char *fname, const char *oname) { return 1; } - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - /* mmap the output file */ - if ((dst = mmap(0, statbuf.st_size, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; @@ -258,13 +273,15 @@ static size_t decompress(const char *fname, const char *oname) { // memcpy(dst, src, statbuf.st_size); #ifdef ZSTD - size_t size_out = ZSTD_decompress(dst, statbuf.st_size, - src, statbuf.st_size); + size_t size_out = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); #else - size_t size_out = LDM_decompress(src, dst, statbuf.st_size, - statbuf.st_size); + size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, + statbuf.st_size - LDM_HEADER_SIZE, + decompressed_size); #endif - //ftruncate(fdout, size_out); + ftruncate(fdout, size_out); close(fdin); close(fdout); From 7945f9ee4757f045d4006813cd1f3e1ddfafa85c Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 7 Jul 2017 14:14:01 -0700 Subject: [PATCH 06/62] Fix offset overflow bug --- contrib/long_distance_matching/ldm.c | 36 ++++++++++++++--------- contrib/long_distance_matching/main-ldm.c | 1 + 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index cb90efec..b02869fe 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -11,7 +11,7 @@ #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) -#define WINDOW_SIZE (1 << 20) +#define WINDOW_SIZE (1 << 15) #define 
HASH_SIZE 4 #define MINMATCH 4 @@ -144,7 +144,7 @@ static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, void LDM_read_header(void const *source, size_t *compressed_size, size_t *decompressed_size) { - U32 *ip = (U32 *)source; + const U32 *ip = (const U32 *)source; *compressed_size = *ip++; *decompressed_size = *ip; } @@ -156,6 +156,7 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, const BYTE * const iend = istart + source_size; const BYTE *ilimit = iend - HASH_SIZE; const BYTE * const matchlimit = iend - HASH_SIZE; + const BYTE * const mflimit = iend - MINMATCH; BYTE *op = (BYTE*) dest; U32 hashTable[LDM_HASHTABLESIZE_U32]; memset(hashTable, 0, sizeof(hashTable)); @@ -172,6 +173,7 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, ip++; forwardH = LDM_hash_position(ip); + //TODO Loop terminates before ip>=ilimit. while (ip < ilimit) { const BYTE *match; BYTE *token; @@ -186,6 +188,10 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, ip = forwardIp; forwardIp += step; + if (forwardIp > mflimit) { + goto _last_literals; + } + match = LDM_get_position_on_hash(h, hashTable, istart); forwardH = LDM_hash_position(forwardIp); @@ -194,6 +200,12 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, LDM_read32(match) != LDM_read32(ip)); } + // TODO catchup + while (ip > anchor && match > istart && ip[-1] == match[-1]) { + ip--; + match--; + } + /* Encode literals */ { unsigned const litLength = (unsigned)(ip - anchor); @@ -223,7 +235,8 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, fwrite(anchor, litLength, 1, stdout); printf("\n"); #endif - LDM_wild_copy(op, anchor, op + litLength); + memcpy(op, anchor, litLength); + //LDM_wild_copy(op, anchor, op + litLength); op += litLength; } _next_match: @@ -268,29 +281,22 @@ _next_match: LDM_put_position(ip, hashTable, istart); forwardH = LDM_hash_position(++ip); } +_last_literals: /* Encode last 
literals */ { - /* size_t const lastRun = (size_t)(iend - anchor); - printf("last run length: %zu, %zu %zu %zu %zu\n", lastRun, iend-istart, - anchor-istart, ip-istart, ilimit-istart); if (lastRun >= RUN_MASK) { size_t accumulator = lastRun - RUN_MASK; *op++ = RUN_MASK << ML_BITS; for(; accumulator >= 255; accumulator -= 255) { *op++ = 255; } - *op++ = (BYTE) accumulator; + *op++ = (BYTE)accumulator; } else { *op++ = (BYTE)(lastRun << ML_BITS); } - fwrite(anchor, lastRun, 1, stdout); - printf("^last run\n"); memcpy(op, anchor, lastRun); op += lastRun; - -// memcpy(dest + (ip - istart), ip, 1); -// */ } return (op - (BYTE *)dest); } @@ -328,7 +334,8 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, fwrite(ip, length, 1, stdout); printf("\n"); #endif - LDM_wild_copy(op, ip, cpy); + memcpy(op, ip, length); +// LDM_wild_copy(op, ip, cpy); ip += length; op = cpy; @@ -358,12 +365,13 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, /* copy match */ cpy = op + length; +// printf("TMP_PREV: %zu\n", op - (BYTE *)dest); // Inefficient for now while (match < cpy - offset && op < oend) { *op++ = *match++; } +// printf("TMP: %zu\n", op - (BYTE *)dest); } - // memcpy(dest, source, compressed_size); return op - (BYTE *)dest; } diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 4d54ef6d..26db1e94 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -280,6 +280,7 @@ static size_t decompress(const char *fname, const char *oname) { size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, statbuf.st_size - LDM_HEADER_SIZE, decompressed_size); + printf("Ret size out: %zu\n", size_out); #endif ftruncate(fdout, size_out); From 4076be09ec17c53680927396ffd85e02e10160ad Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 7 Jul 2017 14:52:40 -0700 Subject: [PATCH 07/62] [ldm] Update to hash every position --- 
contrib/long_distance_matching/ldm.c | 48 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b02869fe..b03e4368 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -11,7 +11,8 @@ #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) -#define WINDOW_SIZE (1 << 15) +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 #define HASH_SIZE 4 #define MINMATCH 4 @@ -73,6 +74,11 @@ static U32 LDM_read32(const void *ptr) { return *(const U32 *)ptr; } +static U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + + static void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } @@ -87,7 +93,6 @@ static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { d += 8; s += 8; } while (d < e); - } struct hash_entry { @@ -99,12 +104,23 @@ static U32 LDM_hash(U32 sequence) { return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); } +static U32 LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + static U32 LDM_hash_position(const void * const p) { return LDM_hash(LDM_read32(p)); } static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, const BYTE *srcBase) { +// printf("Hashing: %zu\n", p - srcBase); U32 *hashTable = (U32 *) tableBase; hashTable[h] = (U32)(p - srcBase); } @@ -170,6 +186,7 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, /* Hash first byte: put into hash table */ LDM_put_position(ip, hashTable, istart); + const BYTE *lastHash = ip; ip++; forwardH = LDM_hash_position(ip); @@ -196,8 
+213,9 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, forwardH = LDM_hash_position(forwardIp); LDM_put_position_on_hash(ip, h, hashTable, istart); + lastHash = ip; } while (ip - match > WINDOW_SIZE || - LDM_read32(match) != LDM_read32(ip)); + LDM_read64(match) != LDM_read64(ip)); } // TODO catchup @@ -215,10 +233,6 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, printf("Cur position: %zu\n", anchor - istart); printf("LitLength %zu. (Match offset). %zu\n", litLength, ip - match); #endif - /* - fwrite(match, 4, 1, stdout); - printf("\n"); - */ if (litLength >= RUN_MASK) { int len = (int)litLength - RUN_MASK; @@ -242,8 +256,8 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, _next_match: /* Encode offset */ { - LDM_writeLE16(op, (U16)(ip - match)); - op += 2; + LDM_write32(op, ip - match); + op += 4; } /* Encode Match Length */ @@ -256,7 +270,13 @@ _next_match: fwrite(ip, MINMATCH + matchCode, 1, stdout); printf("\n"); #endif - ip += MINMATCH + matchCode; + + unsigned ctr = 1; + ip++; + for (; ctr < MINMATCH + matchCode; ip++, ctr++) { + LDM_put_position(ip, hashTable, istart); + } +// ip += MINMATCH + matchCode; if (matchCode >= ML_MASK) { *token += ML_MASK; matchCode -= ML_MASK; @@ -280,6 +300,7 @@ _next_match: LDM_put_position(ip, hashTable, istart); forwardH = LDM_hash_position(++ip); + lastHash = ip; } _last_literals: /* Encode last literals */ @@ -340,12 +361,12 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, op = cpy; /* get offset */ - offset = LDM_readLE16(ip); + offset = LDM_read32(ip); #ifdef LDM_DEBUG printf("Offset: %zu\n", offset); #endif - ip += 2; + ip += 4; match = op - offset; // LDM_write32(op, (U32)offset); @@ -365,12 +386,11 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, /* copy match */ cpy = op + length; -// printf("TMP_PREV: %zu\n", op - (BYTE *)dest); // Inefficient for now + while (match < cpy - 
offset && op < oend) { *op++ = *match++; } -// printf("TMP: %zu\n", op - (BYTE *)dest); } // memcpy(dest, source, compressed_size); return op - (BYTE *)dest; From acdeb9f30211b018460c8f9e71595b49d553e555 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 7 Jul 2017 17:09:28 -0700 Subject: [PATCH 08/62] Add compression statistics --- contrib/long_distance_matching/ldm.c | 57 ++++++++++++++++++++--- contrib/long_distance_matching/main-ldm.c | 2 + 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b03e4368..f8061f53 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -5,6 +5,8 @@ #include "ldm.h" +#define HASH_EVERY 7 + #define LDM_MEMORY_USAGE 14 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) @@ -13,8 +15,8 @@ #define WINDOW_SIZE (1 << 20) #define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define MINMATCH 4 +#define HASH_SIZE 8 +#define MINMATCH 8 #define ML_BITS 4 #define ML_MASK ((1U<num_matches); + printf("Average match length: %.1f\n", ((double)stats->total_match_length) / + (double)stats->num_matches); + printf("Average literal length: %.1f\n", + ((double)stats->total_literal_length) / (double)stats->num_matches); + printf("Average offset length: %.1f\n", + ((double)stats->total_offset) / (double)stats->num_matches); + printf("=====================\n"); +} + struct hash_entry { U64 offset; tag t; @@ -121,12 +143,19 @@ static U32 LDM_hash_position(const void * const p) { static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, const BYTE *srcBase) { // printf("Hashing: %zu\n", p - srcBase); + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { + return; + } + U32 *hashTable = (U32 *) tableBase; hashTable[h] = (U32)(p - srcBase); } static void LDM_put_position(const BYTE *p, void *tableBase, const BYTE *srcBase) { + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { 
+ return; + } U32 const h = LDM_hash_position(p); LDM_put_position_on_hash(p, h, tableBase, srcBase); } @@ -174,6 +203,9 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, const BYTE * const matchlimit = iend - HASH_SIZE; const BYTE * const mflimit = iend - MINMATCH; BYTE *op = (BYTE*) dest; + + compress_stats compressStats = { 0 }; + U32 hashTable[LDM_HASHTABLESIZE_U32]; memset(hashTable, 0, sizeof(hashTable)); @@ -217,8 +249,9 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, } while (ip - match > WINDOW_SIZE || LDM_read64(match) != LDM_read64(ip)); } + compressStats.num_matches++; - // TODO catchup + /* Catchup: look back to extend match from found match */ while (ip > anchor && match > istart && ip[-1] == match[-1]) { ip--; match--; @@ -229,6 +262,8 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, unsigned const litLength = (unsigned)(ip - anchor); token = op++; + compressStats.total_literal_length += litLength; + #ifdef LDM_DEBUG printf("Cur position: %zu\n", anchor - istart); printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); @@ -256,8 +291,13 @@ size_t LDM_compress(void const *source, void *dest, size_t source_size, _next_match: /* Encode offset */ { + /* + LDM_writeLE16(op, ip-match); + op += 2; + */ LDM_write32(op, ip - match); op += 4; + compressStats.total_offset += (ip - match); } /* Encode Match Length */ @@ -270,7 +310,7 @@ _next_match: fwrite(ip, MINMATCH + matchCode, 1, stdout); printf("\n"); #endif - + compressStats.total_match_length += matchCode + MINMATCH; unsigned ctr = 1; ip++; for (; ctr < MINMATCH + matchCode; ip++, ctr++) { @@ -293,6 +333,7 @@ _next_match: } #ifdef LDM_DEBUG printf("\n"); + #endif } @@ -319,6 +360,7 @@ _last_literals: memcpy(op, anchor, lastRun); op += lastRun; } + print_compress_stats(&compressStats); return (op - (BYTE *)dest); } @@ -361,12 +403,15 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, op = cpy; /* get offset */ + /* + offset = LDM_readLE16(ip); + ip += 2; + */ offset = LDM_read32(ip); - + ip += 4; #ifdef LDM_DEBUG printf("Offset: %zu\n", offset); #endif - ip += 4; match = op - offset; // LDM_write32(op, (U32)offset); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 26db1e94..10869cce 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -1,3 +1,5 @@ +// TODO: file size must fit into a U32 + #include #include #include From 719ccdc5a58efc9531a9b0d35ee358da7650cef3 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Sun, 9 Jul 2017 22:45:54 -0700 Subject: [PATCH 09/62] Update mainfile --- contrib/long_distance_matching/main.c | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index ddf5145f..67144166 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -1,12 +1,31 @@ +#include #include #include #include 
+#include +#include #include "ldm.h" #define BUF_SIZE 16*1024 // Block size #define LDM_HEADER_SIZE 8 +/* +static size_t compress_file_mmap(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *dst; + struct stat statbuf; + + if (fstat(in, &statbuf) < 0) { + printf("fstat error\n"); + return 1; + } + + + return 0; +} +*/ + static size_t compress_file(FILE *in, FILE *out, size_t *size_in, size_t *size_out) { char *src, *buf = NULL; @@ -26,7 +45,6 @@ static size_t compress_file(FILE *in, FILE *out, size_t *size_in, goto cleanup; } - for (;;) { k = fread(src, 1, BUF_SIZE, in); if (k == 0) @@ -37,10 +55,8 @@ static size_t compress_file(FILE *in, FILE *out, size_t *size_in, // n = k; // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); + offset = n; + count_out += n; k = fwrite(buf, 1, offset, out); if (k < offset) { @@ -94,8 +110,6 @@ static size_t decompress_file(FILE *in, FILE *out) { } } - // TODO - /* Decompress: * Continue while there is more input to read. 
*/ @@ -220,7 +234,6 @@ int main(int argc, char *argv[]) { fclose(inpFp); } - return 0; } From eb280cd5685ae960ff75ff3c5381507ebaaef403 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 06:32:05 -0700 Subject: [PATCH 10/62] Add folder for old versions --- .../long_distance_matching/versions/v1/ldm.c | 394 +++++++++++++++ .../long_distance_matching/versions/v1/ldm.h | 19 + .../versions/v1/main-ldm.c | 459 ++++++++++++++++++ 3 files changed, 872 insertions(+) create mode 100644 contrib/long_distance_matching/versions/v1/ldm.c create mode 100644 contrib/long_distance_matching/versions/v1/ldm.h create mode 100644 contrib/long_distance_matching/versions/v1/main-ldm.c diff --git a/contrib/long_distance_matching/versions/v1/ldm.c b/contrib/long_distance_matching/versions/v1/ldm.c new file mode 100644 index 00000000..266425f8 --- /dev/null +++ b/contrib/long_distance_matching/versions/v1/ldm.c @@ -0,0 +1,394 @@ +#include +#include +#include +#include + +#include "ldm.h" + +#define LDM_MEMORY_USAGE 14 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 4 +#define MINMATCH 4 + +#define ML_BITS 4 +#define ML_MASK ((1U<>8); + } +} + +static U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +static U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + + +static void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE * const e = (BYTE *)dstEnd; + + do { + LDM_copy8(d, s); + d += 8; + s += 8; + } while (d < e); + +} + +struct hash_entry { + U64 offset; + tag t; +}; + +static U32 LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> 
((32)-LDM_HASHLOG)); +} + +static U32 LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +static U32 LDM_hash_position(const void * const p) { + return LDM_hash(LDM_read32(p)); +} + +static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, + const BYTE *srcBase) { + U32 *hashTable = (U32 *) tableBase; + hashTable[h] = (U32)(p - srcBase); +} + +static void LDM_put_position(const BYTE *p, void *tableBase, + const BYTE *srcBase) { + U32 const h = LDM_hash_position(p); + LDM_put_position_on_hash(p, h, tableBase, srcBase); +} + +static const BYTE *LDM_get_position_on_hash( + U32 h, void *tableBase, const BYTE *srcBase) { + const U32 * const hashTable = (U32*)tableBase; + return hashTable[h] + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size) { + const U32 *ip = (const U32 *)source; + *compressed_size = *ip++; + *decompressed_size = *ip; +} + +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size) { + const BYTE * const istart = (const BYTE*)source; + const BYTE *ip = istart; + const BYTE * const iend = istart + source_size; + const BYTE *ilimit = iend - HASH_SIZE; + const BYTE * const matchlimit = iend - HASH_SIZE; 
+ const BYTE * const mflimit = iend - MINMATCH; + BYTE *op = (BYTE*) dest; + U32 hashTable[LDM_HASHTABLESIZE_U32]; + memset(hashTable, 0, sizeof(hashTable)); + + const BYTE *anchor = (const BYTE *)source; +// struct LDM_cctx cctx; + size_t output_size = 0; + + U32 forwardH; + + /* Hash first byte: put into hash table */ + + LDM_put_position(ip, hashTable, istart); + ip++; + forwardH = LDM_hash_position(ip); + + //TODO Loop terminates before ip>=ilimit. + while (ip < ilimit) { + const BYTE *match; + BYTE *token; + + /* Find a match */ + { + const BYTE *forwardIp = ip; + unsigned step = 1; + + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + match = LDM_get_position_on_hash(h, hashTable, istart); + + forwardH = LDM_hash_position(forwardIp); + LDM_put_position_on_hash(ip, h, hashTable, istart); + } while (ip - match > WINDOW_SIZE || + LDM_read64(match) != LDM_read64(ip)); + } + + // TODO catchup + while (ip > anchor && match > istart && ip[-1] == match[-1]) { + ip--; + match--; + } + + /* Encode literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + +#ifdef LDM_DEBUG + printf("Cur position: %zu\n", anchor - istart); + printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); +#endif + /* + fwrite(match, 4, 1, stdout); + printf("\n"); + */ + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *op++ = 255; + } + *op++ = (BYTE)len; + } else { + *token = (BYTE)(litLength << ML_BITS); + } +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(anchor, litLength, 1, stdout); + printf("\n"); +#endif + memcpy(op, anchor, litLength); + //LDM_wild_copy(op, anchor, op + litLength); + op += litLength; + } +_next_match: + /* Encode offset */ + { + LDM_write32(op, ip - match); + op += 4; + } + + /* Encode Match Length */ + { + unsigned matchCode; + matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, + matchlimit); +#ifdef LDM_DEBUG + printf("Match length %zu\n", matchCode + MINMATCH); + fwrite(ip, MINMATCH + matchCode, 1, stdout); + printf("\n"); +#endif + ip += MINMATCH + matchCode; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LDM_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*0xFF) { + op += 4; + LDM_write32(op, 0xffffffff); + matchCode -= 4*0xFF; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else { + *token += (BYTE)(matchCode); + } +#ifdef LDM_DEBUG + printf("\n"); +#endif + } + + anchor = ip; + + LDM_put_position(ip, hashTable, istart); + forwardH = LDM_hash_position(++ip); + } +_last_literals: + /* Encode last literals */ + { + size_t const lastRun = (size_t)(iend - anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *op++ = 255; + } + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(op, anchor, lastRun); + op += lastRun; + } + return (op - (BYTE *)dest); +} + +size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)source; + const BYTE 
* const iend = ip + compressed_size; + BYTE *op = (BYTE *)dest; + BYTE * const oend = op + max_decompressed_size; + BYTE *cpy; + + while (ip < iend) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } +#ifdef LDM_DEBUG + printf("Literal length: %zu\n", length); +#endif + + /* copy literals */ + cpy = op + length; +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(ip, length, 1, stdout); + printf("\n"); +#endif + memcpy(op, ip, length); +// LDM_wild_copy(op, ip, cpy); + ip += length; + op = cpy; + + /* get offset */ + offset = LDM_read32(ip); + +#ifdef LDM_DEBUG + printf("Offset: %zu\n", offset); +#endif + ip += 4; + match = op - offset; + // LDM_write32(op, (U32)offset); + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + length += MINMATCH; +#ifdef LDM_DEBUG + printf("Match length: %zu\n", length); +#endif + /* copy match */ + cpy = op + length; + + // Inefficient for now + + while (match < cpy - offset && op < oend) { + *op++ = *match++; + } + } +// memcpy(dest, source, compressed_size); + return op - (BYTE *)dest; +} + + diff --git a/contrib/long_distance_matching/versions/v1/ldm.h b/contrib/long_distance_matching/versions/v1/ldm.h new file mode 100644 index 00000000..f4ca25a3 --- /dev/null +++ b/contrib/long_distance_matching/versions/v1/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size); + +size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size); + +void 
LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v1/main-ldm.c b/contrib/long_distance_matching/versions/v1/main-ldm.c new file mode 100644 index 00000000..10869cce --- /dev/null +++ b/contrib/long_distance_matching/versions/v1/main-ldm.c @@ -0,0 +1,459 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 
|| ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. + */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} +#endif + +static size_t compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + size_t size_in = statbuf.st_size; + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 
1; + } + size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; + + /* mmap the output file */ + if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + #ifdef ZSTD + size_t size_out = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); + #else + size_t size_out = LDM_compress(src, dst + LDM_HEADER_SIZE, statbuf.st_size, + statbuf.st_size); + size_out += LDM_HEADER_SIZE; + + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &size_out, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + printf("Compressed size: %zu\n", size_out); + printf("Decompressed size: %zu\n", statbuf.st_size); + #endif + ftruncate(fdout, size_out); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)size_out, oname, + (double)size_out / (statbuf.st_size) * 100); + + close(fdin); + close(fdout); + return 0; +} + +static size_t decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* read header */ + size_t compressed_size, decompressed_size; + LDM_read_header(src, &compressed_size, &decompressed_size); + + printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", + statbuf.st_size, compressed_size, decompressed_size); + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, 
decompressed_size - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + /* Copy input file to output file */ +// memcpy(dst, src, statbuf.st_size); + + #ifdef ZSTD + size_t size_out = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); + #else + size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, + statbuf.st_size - LDM_HEADER_SIZE, + decompressed_size); + printf("Ret size out: %zu\n", size_out); + #endif + ftruncate(fdout, size_out); + + close(fdin); + close(fdout); + return 0; +} + +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) { + break; + } + if (0 == result) { + result = memcmp(b0, b1, r0); + } + } + return result; +} + +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + 
snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + struct timeval tv1, tv2; + /* compress */ + { + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + + /* decompress */ + + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + +#if 0 +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = 
fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + From 474e06ac5bd166b4771dd8dc2640ce8f0440aa77 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 06:32:29 -0700 Subject: [PATCH 11/62] Minor refactoring --- contrib/long_distance_matching/ldm.c | 31 +- contrib/long_distance_matching/ldm.h | 12 +- contrib/long_distance_matching/main-ldm.c | 499 +++++++++++----------- 3 files changed, 279 insertions(+), 263 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index f8061f53..aeef4a33 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -187,29 +187,30 @@ static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, return (unsigned)(pIn - pStart); } -void LDM_read_header(void const *source, size_t *compressed_size, +void LDM_read_header(const void *src, size_t *compressed_size, size_t *decompressed_size) { - const U32 *ip = (const U32 *)source; + const U32 *ip = (const U32 *)src; *compressed_size = *ip++; *decompressed_size = *ip; } -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size) { - const BYTE * const istart = (const BYTE*)source; +// TODO: maxDstSize is unused +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + const BYTE * const 
istart = (const BYTE*)src; const BYTE *ip = istart; - const BYTE * const iend = istart + source_size; + const BYTE * const iend = istart + srcSize; const BYTE *ilimit = iend - HASH_SIZE; const BYTE * const matchlimit = iend - HASH_SIZE; const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = (BYTE*) dest; + BYTE *op = (BYTE*) dst; compress_stats compressStats = { 0 }; U32 hashTable[LDM_HASHTABLESIZE_U32]; memset(hashTable, 0, sizeof(hashTable)); - const BYTE *anchor = (const BYTE *)source; + const BYTE *anchor = (const BYTE *)src; // struct LDM_cctx cctx; size_t output_size = 0; @@ -361,14 +362,14 @@ _last_literals: op += lastRun; } print_compress_stats(&compressStats); - return (op - (BYTE *)dest); + return (op - (BYTE *)dst); } -size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)source; +size_t LDM_decompress(const void *src, size_t compressed_size, + void *dst, size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)src; const BYTE * const iend = ip + compressed_size; - BYTE *op = (BYTE *)dest; + BYTE *op = (BYTE *)dst; BYTE * const oend = op + max_decompressed_size; BYTE *cpy; @@ -437,8 +438,8 @@ size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, *op++ = *match++; } } -// memcpy(dest, source, compressed_size); - return op - (BYTE *)dest; +// memcpy(dst, src, compressed_size); + return op - (BYTE *)dst; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index f4ca25a3..0ac7b2ec 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -7,13 +7,13 @@ #define LDM_DECOMPRESS_SIZE 4 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size); +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); -size_t LDM_decompress(void 
const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size); +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); -void LDM_read_header(void const *source, size_t *compressed_size, - size_t *decompressed_size); +void LDM_read_header(const void *src, size_t *compressSize, + size_t *decompressSize); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 10869cce..0017335b 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -18,6 +18,263 @@ //#define ZSTD +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. + */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? */ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. 
*/ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t compressSize = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); +#else + size_t compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", statbuf.st_size); +#endif +#endif + + // Truncate file to compressSize. + ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if succesful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. 
*/ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + size_t compressSize, decompressSize; + LDM_read_header(src, &compressSize, &decompressSize); + +#ifdef DEBUG + printf("Size, compressSize, decompressSize: %zu %zu %zu\n", + statbuf.st_size, compressSize, decompressSize); +#endif + + /* Go to the location corresponding to the last byte. */ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t outSize = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); +#else + size_t outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + #endif + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. + */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. 
*/ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + struct timeval tv1, tv2; + + /* Compress */ + + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* Decompress */ + + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + #if 0 static size_t compress_file(FILE *in, FILE *out, size_t *size_in, size_t *size_out) { @@ -137,249 +394,7 @@ static size_t decompress_file(FILE *in, FILE *out) { return ret; } -#endif -static size_t compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - 
if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - size_t size_in = statbuf.st_size; - - /* go to the location corresponding to the last byte */ - if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; - - /* mmap the output file */ - if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - #ifdef ZSTD - size_t size_out = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); - #else - size_t size_out = LDM_compress(src, dst + LDM_HEADER_SIZE, statbuf.st_size, - statbuf.st_size); - size_out += LDM_HEADER_SIZE; - - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &size_out, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - printf("Compressed size: %zu\n", size_out); - printf("Decompressed size: %zu\n", statbuf.st_size); - #endif - ftruncate(fdout, size_out); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)size_out, oname, - (double)size_out / (statbuf.st_size) * 100); - - close(fdin); - close(fdout); - return 0; -} - -static size_t decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - if ((fdin = 
open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* read header */ - size_t compressed_size, decompressed_size; - LDM_read_header(src, &compressed_size, &decompressed_size); - - printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", - statbuf.st_size, compressed_size, decompressed_size); - - /* go to the location corresponding to the last byte */ - if (lseek(fdout, decompressed_size - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - /* Copy input file to output file */ -// memcpy(dst, src, statbuf.st_size); - - #ifdef ZSTD - size_t size_out = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); - #else - size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, - statbuf.st_size - LDM_HEADER_SIZE, - decompressed_size); - printf("Ret size out: %zu\n", size_out); - #endif - ftruncate(fdout, size_out); - - close(fdin); - close(fdout); - return 0; -} - -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - 
if (0 == r0 || 0 == r1) { - break; - } - if (0 == result) { - result = memcmp(b0, b1, r0); - } - } - return result; -} - -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - struct timeval tv1, tv2; - /* compress */ - { - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - -#if 0 int main2(int argc, char *argv[]) { char inpFilename[256] = { 0 }; char ldmFilename[256] = { 0 }; From 5432214ee31b0dc56b4311e786e2420548afb7c7 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 06:50:49 
-0700 Subject: [PATCH 12/62] Minor refactoring --- contrib/long_distance_matching/ldm.c | 47 +- .../versions/v2/Makefile | 32 ++ .../long_distance_matching/versions/v2/ldm.c | 436 ++++++++++++++++ .../long_distance_matching/versions/v2/ldm.h | 19 + .../versions/v2/main-ldm.c | 474 ++++++++++++++++++ 5 files changed, 980 insertions(+), 28 deletions(-) create mode 100644 contrib/long_distance_matching/versions/v2/Makefile create mode 100644 contrib/long_distance_matching/versions/v2/ldm.c create mode 100644 contrib/long_distance_matching/versions/v2/ldm.h create mode 100644 contrib/long_distance_matching/versions/v2/main-ldm.c diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index aeef4a33..9081d136 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -33,8 +33,7 @@ typedef uint64_t U64; typedef uint64_t tag; -static unsigned LDM_isLittleEndian(void) -{ +static unsigned LDM_isLittleEndian(void) { const union { U32 u; BYTE c[4]; } one = { 1 }; return one.c[0]; } @@ -54,11 +53,11 @@ static U16 LDM_readLE16(const void *memPtr) { } } -static void LDM_write32(void *memPtr, U32 value) { +static void LDM_write16(void *memPtr, U16 value){ memcpy(memPtr, &value, sizeof(value)); } -static void LDM_write16(void *memPtr, U16 value) { +static void LDM_write32(void *memPtr, U32 value) { memcpy(memPtr, &value, sizeof(value)); } @@ -80,23 +79,10 @@ static U64 LDM_read64(const void *ptr) { return *(const U64 *)ptr; } - static void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } -static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { - BYTE *d = (BYTE *)dstPtr; - const BYTE *s = (const BYTE *)srcPtr; - BYTE * const e = (BYTE *)dstEnd; - - do { - LDM_copy8(d, s); - d += 8; - s += 8; - } while (d < e); -} - typedef struct compress_stats { U32 num_matches; U32 total_match_length; @@ -104,7 +90,7 @@ typedef struct compress_stats { U64 total_offset; } compress_stats; 
-static void print_compress_stats(const compress_stats *stats) { +static void LDM_printCompressStats(const compress_stats *stats) { printf("=====================\n"); printf("Compression statistics\n"); printf("Total number of matches: %u\n", stats->num_matches); @@ -117,6 +103,7 @@ static void print_compress_stats(const compress_stats *stats) { printf("=====================\n"); } +// TODO: unused. struct hash_entry { U64 offset; tag t; @@ -142,7 +129,6 @@ static U32 LDM_hash_position(const void * const p) { static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, const BYTE *srcBase) { -// printf("Hashing: %zu\n", p - srcBase); if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { return; } @@ -187,11 +173,11 @@ static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, return (unsigned)(pIn - pStart); } -void LDM_read_header(const void *src, size_t *compressed_size, - size_t *decompressed_size) { +void LDM_read_header(const void *src, size_t *compressSize, + size_t *decompressSize) { const U32 *ip = (const U32 *)src; - *compressed_size = *ip++; - *decompressed_size = *ip; + *compressSize = *ip++; + *decompressSize = *ip; } // TODO: maxDstSize is unused @@ -286,7 +272,6 @@ size_t LDM_compress(const void *src, size_t srcSize, printf("\n"); #endif memcpy(op, anchor, litLength); - //LDM_wild_copy(op, anchor, op + litLength); op += litLength; } _next_match: @@ -361,10 +346,19 @@ _last_literals: memcpy(op, anchor, lastRun); op += lastRun; } - print_compress_stats(&compressStats); + LDM_printCompressStats(&compressStats); return (op - (BYTE *)dst); } +typedef struct LDM_DCtx { + const BYTE * const ibase; /* Pointer to base of input */ + const BYTE *ip; /* Pointer to current input position */ + const BYTE *iend; /* End of source */ + BYTE *op; /* Pointer to output */ + const BYTE * const oend; /* Pointer to end of output */ + +} LDM_DCtx; + size_t LDM_decompress(const void *src, size_t compressed_size, void *dst, size_t max_decompressed_size) { const 
BYTE *ip = (const BYTE *)src; @@ -399,7 +393,6 @@ size_t LDM_decompress(const void *src, size_t compressed_size, printf("\n"); #endif memcpy(op, ip, length); -// LDM_wild_copy(op, ip, cpy); ip += length; op = cpy; @@ -433,12 +426,10 @@ size_t LDM_decompress(const void *src, size_t compressed_size, cpy = op + length; // Inefficient for now - while (match < cpy - offset && op < oend) { *op++ = *match++; } } -// memcpy(dst, src, compressed_size); return op - (BYTE *)dst; } diff --git a/contrib/long_distance_matching/versions/v2/Makefile b/contrib/long_distance_matching/versions/v2/Makefile new file mode 100644 index 00000000..4e04fd6a --- /dev/null +++ b/contrib/long_distance_matching/versions/v2/Makefile @@ -0,0 +1,32 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+# ################################################################ + +# This Makefile presumes libzstd is installed, using `sudo make install` + + +LDFLAGS += -lzstd + +.PHONY: default all clean + +default: all + +all: main-ldm + + +#main : ldm.c main.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +main-ldm : ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ + main main-ldm + @echo Cleaning completed + diff --git a/contrib/long_distance_matching/versions/v2/ldm.c b/contrib/long_distance_matching/versions/v2/ldm.c new file mode 100644 index 00000000..9081d136 --- /dev/null +++ b/contrib/long_distance_matching/versions/v2/ldm.c @@ -0,0 +1,436 @@ +#include +#include +#include +#include + +#include "ldm.h" + +#define HASH_EVERY 7 + +#define LDM_MEMORY_USAGE 14 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 8 +#define MINMATCH 8 + +#define ML_BITS 4 +#define ML_MASK ((1U<>8); + } +} + +static U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +static U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +static void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +typedef struct compress_stats { + U32 num_matches; + U32 total_match_length; + U32 total_literal_length; + U64 total_offset; +} compress_stats; + +static void LDM_printCompressStats(const compress_stats *stats) { + printf("=====================\n"); + printf("Compression statistics\n"); + printf("Total number of matches: %u\n", stats->num_matches); + printf("Average match length: %.1f\n", ((double)stats->total_match_length) / + (double)stats->num_matches); + printf("Average literal length: %.1f\n", + ((double)stats->total_literal_length) / 
(double)stats->num_matches); + printf("Average offset length: %.1f\n", + ((double)stats->total_offset) / (double)stats->num_matches); + printf("=====================\n"); +} + +// TODO: unused. +struct hash_entry { + U64 offset; + tag t; +}; + +static U32 LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +static U32 LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +static U32 LDM_hash_position(const void * const p) { + return LDM_hash(LDM_read32(p)); +} + +static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, + const BYTE *srcBase) { + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { + return; + } + + U32 *hashTable = (U32 *) tableBase; + hashTable[h] = (U32)(p - srcBase); +} + +static void LDM_put_position(const BYTE *p, void *tableBase, + const BYTE *srcBase) { + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { + return; + } + U32 const h = LDM_hash_position(p); + LDM_put_position_on_hash(p, h, tableBase, srcBase); +} + +static const BYTE *LDM_get_position_on_hash( + U32 h, void *tableBase, const BYTE *srcBase) { + const U32 * const hashTable = (U32*)tableBase; + return hashTable[h] + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_read_header(const void *src, size_t *compressSize, + 
size_t *decompressSize) { + const U32 *ip = (const U32 *)src; + *compressSize = *ip++; + *decompressSize = *ip; +} + +// TODO: maxDstSize is unused +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + const BYTE * const istart = (const BYTE*)src; + const BYTE *ip = istart; + const BYTE * const iend = istart + srcSize; + const BYTE *ilimit = iend - HASH_SIZE; + const BYTE * const matchlimit = iend - HASH_SIZE; + const BYTE * const mflimit = iend - MINMATCH; + BYTE *op = (BYTE*) dst; + + compress_stats compressStats = { 0 }; + + U32 hashTable[LDM_HASHTABLESIZE_U32]; + memset(hashTable, 0, sizeof(hashTable)); + + const BYTE *anchor = (const BYTE *)src; +// struct LDM_cctx cctx; + size_t output_size = 0; + + U32 forwardH; + + /* Hash first byte: put into hash table */ + + LDM_put_position(ip, hashTable, istart); + const BYTE *lastHash = ip; + ip++; + forwardH = LDM_hash_position(ip); + + //TODO Loop terminates before ip>=ilimit. + while (ip < ilimit) { + const BYTE *match; + BYTE *token; + + /* Find a match */ + { + const BYTE *forwardIp = ip; + unsigned step = 1; + + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + match = LDM_get_position_on_hash(h, hashTable, istart); + + forwardH = LDM_hash_position(forwardIp); + LDM_put_position_on_hash(ip, h, hashTable, istart); + lastHash = ip; + } while (ip - match > WINDOW_SIZE || + LDM_read64(match) != LDM_read64(ip)); + } + compressStats.num_matches++; + + /* Catchup: look back to extend match from found match */ + while (ip > anchor && match > istart && ip[-1] == match[-1]) { + ip--; + match--; + } + + /* Encode literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + + compressStats.total_literal_length += litLength; + +#ifdef LDM_DEBUG + printf("Cur position: %zu\n", anchor - istart); + printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); +#endif + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *op++ = 255; + } + *op++ = (BYTE)len; + } else { + *token = (BYTE)(litLength << ML_BITS); + } +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(anchor, litLength, 1, stdout); + printf("\n"); +#endif + memcpy(op, anchor, litLength); + op += litLength; + } +_next_match: + /* Encode offset */ + { + /* + LDM_writeLE16(op, ip-match); + op += 2; + */ + LDM_write32(op, ip - match); + op += 4; + compressStats.total_offset += (ip - match); + } + + /* Encode Match Length */ + { + unsigned matchCode; + matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, + matchlimit); +#ifdef LDM_DEBUG + printf("Match length %zu\n", matchCode + MINMATCH); + fwrite(ip, MINMATCH + matchCode, 1, stdout); + printf("\n"); +#endif + compressStats.total_match_length += matchCode + MINMATCH; + unsigned ctr = 1; + ip++; + for (; ctr < MINMATCH + matchCode; ip++, ctr++) { + LDM_put_position(ip, hashTable, istart); + } +// ip += MINMATCH + matchCode; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LDM_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*0xFF) { + op += 4; + LDM_write32(op, 0xffffffff); + matchCode -= 4*0xFF; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else { + *token += (BYTE)(matchCode); + } +#ifdef LDM_DEBUG + printf("\n"); + +#endif + } + + anchor = ip; + + LDM_put_position(ip, hashTable, istart); + forwardH = LDM_hash_position(++ip); + lastHash = ip; + } +_last_literals: + /* Encode last literals */ + { + size_t const lastRun = (size_t)(iend - anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *op++ = 255; + } + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(op, anchor, lastRun); + op += lastRun; + } 
+ LDM_printCompressStats(&compressStats); + return (op - (BYTE *)dst); +} + +typedef struct LDM_DCtx { + const BYTE * const ibase; /* Pointer to base of input */ + const BYTE *ip; /* Pointer to current input position */ + const BYTE *iend; /* End of source */ + BYTE *op; /* Pointer to output */ + const BYTE * const oend; /* Pointer to end of output */ + +} LDM_DCtx; + +size_t LDM_decompress(const void *src, size_t compressed_size, + void *dst, size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)src; + const BYTE * const iend = ip + compressed_size; + BYTE *op = (BYTE *)dst; + BYTE * const oend = op + max_decompressed_size; + BYTE *cpy; + + while (ip < iend) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } +#ifdef LDM_DEBUG + printf("Literal length: %zu\n", length); +#endif + + /* copy literals */ + cpy = op + length; +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(ip, length, 1, stdout); + printf("\n"); +#endif + memcpy(op, ip, length); + ip += length; + op = cpy; + + /* get offset */ + /* + offset = LDM_readLE16(ip); + ip += 2; + */ + offset = LDM_read32(ip); + ip += 4; +#ifdef LDM_DEBUG + printf("Offset: %zu\n", offset); +#endif + match = op - offset; + // LDM_write32(op, (U32)offset); + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + length += MINMATCH; +#ifdef LDM_DEBUG + printf("Match length: %zu\n", length); +#endif + /* copy match */ + cpy = op + length; + + // Inefficient for now + while (match < cpy - offset && op < oend) { + *op++ = *match++; + } + } + return op - (BYTE *)dst; +} + + diff --git a/contrib/long_distance_matching/versions/v2/ldm.h b/contrib/long_distance_matching/versions/v2/ldm.h new file mode 100644 index 00000000..0ac7b2ec --- 
/dev/null +++ b/contrib/long_distance_matching/versions/v2/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +void LDM_read_header(const void *src, size_t *compressSize, + size_t *decompressSize); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v2/main-ldm.c b/contrib/long_distance_matching/versions/v2/main-ldm.c new file mode 100644 index 00000000..0017335b --- /dev/null +++ b/contrib/long_distance_matching/versions/v2/main-ldm.c @@ -0,0 +1,474 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. + */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? 
*/ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t compressSize = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); +#else + size_t compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", statbuf.st_size); +#endif +#endif + + // Truncate file to compressSize. + ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if succesful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. 
*/ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + size_t compressSize, decompressSize; + LDM_read_header(src, &compressSize, &decompressSize); + +#ifdef DEBUG + printf("Size, compressSize, decompressSize: %zu %zu %zu\n", + statbuf.st_size, compressSize, decompressSize); +#endif + + /* Go to the location corresponding to the last byte. */ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t outSize = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); +#else + size_t outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + #endif + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. 
+ */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. */ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + struct timeval tv1, tv2; + + /* Compress */ + + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* Decompress */ + + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - 
tv1.tv_sec)); + + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ 
+ { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + From ae9cf235d62bb7482496309294852f51bdfd1f1d Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 07:38:09 -0700 Subject: [PATCH 13/62] Add LDM_DCtx --- contrib/long_distance_matching/ldm.c | 74 ++++++++++++++--------- contrib/long_distance_matching/main-ldm.c | 4 +- 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 9081d136..b18ed3d4 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -351,33 +351,49 @@ _last_literals: } typedef struct LDM_DCtx { - const BYTE * const ibase; /* Pointer to base of input */ - const BYTE *ip; /* Pointer to current input position */ - const BYTE *iend; /* End of source */ - BYTE *op; /* Pointer to output */ - const BYTE * const oend; /* Pointer to end of output */ + const BYTE *ibase; /* Pointer to base of input */ + const BYTE *ip; /* Pointer to current input position */ + const BYTE *iend; /* End of source */ + const BYTE *obase; /* Pointer to base of output */ + BYTE *op; /* Pointer to output */ + const BYTE *oend; /* Pointer to end of output */ + + size_t compressSize; + size_t maxDecompressSize; } LDM_DCtx; -size_t LDM_decompress(const void *src, size_t compressed_size, - void *dst, size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)src; - const BYTE * const iend = ip + compressed_size; - BYTE *op = (BYTE *)dst; - BYTE * const oend = op + max_decompressed_size; +static void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + dctx->ibase = src; + dctx->ip = (const 
BYTE *)src; + dctx->iend = dctx->ip + compressSize; + dctx->op = dst; + dctx->oend = dctx->op + maxDecompressSize; + + dctx->compressSize = compressSize; + dctx->maxDecompressSize = maxDecompressSize; +} + +size_t LDM_decompress(const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + BYTE *cpy; - while (ip < iend) { + while (dctx.ip < dctx.iend) { size_t length; const BYTE *match; size_t offset; /* get literal length */ - unsigned const token = *ip++; + unsigned const token = *(dctx.ip)++; if ((length=(token >> ML_BITS)) == RUN_MASK) { unsigned s; do { - s = *ip++; + s = *(dctx.ip)++; length += s; } while (s == 255); } @@ -386,27 +402,27 @@ size_t LDM_decompress(const void *src, size_t compressed_size, #endif /* copy literals */ - cpy = op + length; + cpy = dctx.op + length; #ifdef LDM_DEBUG printf("Literals "); - fwrite(ip, length, 1, stdout); + fwrite(dctx.ip, length, 1, stdout); printf("\n"); #endif - memcpy(op, ip, length); - ip += length; - op = cpy; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; /* get offset */ /* - offset = LDM_readLE16(ip); - ip += 2; + offset = LDM_readLE16(dctx.ip); + dctx.ip += 2; */ - offset = LDM_read32(ip); - ip += 4; + offset = LDM_read32(dctx.ip); + dctx.ip += 4; #ifdef LDM_DEBUG printf("Offset: %zu\n", offset); #endif - match = op - offset; + match = dctx.op - offset; // LDM_write32(op, (U32)offset); /* get matchlength */ @@ -414,7 +430,7 @@ size_t LDM_decompress(const void *src, size_t compressed_size, if (length == ML_MASK) { unsigned s; do { - s = *ip++; + s = *(dctx.ip)++; length += s; } while (s == 255); } @@ -423,14 +439,14 @@ size_t LDM_decompress(const void *src, size_t compressed_size, printf("Match length: %zu\n", length); #endif /* copy match */ - cpy = op + length; + cpy = dctx.op + length; // Inefficient for now - while (match < cpy - offset && op < oend) { - *op++ = *match++; + 
while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; } } - return op - (BYTE *)dst; + return dctx.op - (BYTE *)dst; } diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 0017335b..b529201f 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -88,7 +88,7 @@ static int compress(const char *fname, const char *oname) { #ifdef DEBUG printf("Compressed size: %zu\n", compressSize); - printf("Decompressed size: %zu\n", statbuf.st_size); + printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); #endif #endif @@ -145,7 +145,7 @@ static int decompress(const char *fname, const char *oname) { #ifdef DEBUG printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - statbuf.st_size, compressSize, decompressSize); + (size_t)statbuf.st_size, compressSize, decompressSize); #endif /* Go to the location corresponding to the last byte. */ From 10a71d9f1c31539b79c66c90750025e26094621c Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 12:38:27 -0700 Subject: [PATCH 14/62] Add compression context --- contrib/long_distance_matching/ldm.c | 287 ++++++++++++++++----------- 1 file changed, 167 insertions(+), 120 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b18ed3d4..7bf26781 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -31,7 +31,10 @@ typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; -typedef uint64_t tag; +typedef uint32_t offset_t; +typedef uint32_t hash_t; + +// typedef uint64_t tag; static unsigned LDM_isLittleEndian(void) { const union { U32 u; BYTE c[4]; } one = { 1 }; @@ -65,7 +68,7 @@ static void LDM_writeLE16(void *memPtr, U16 value) { if (LDM_isLittleEndian()) { LDM_write16(memPtr, value); } else { - BYTE* p = (BYTE*)memPtr; + BYTE* p = (BYTE *)memPtr; p[0] = (BYTE) value; p[1] = (BYTE)(value>>8); } @@ 
-82,15 +85,18 @@ static U64 LDM_read64(const void *ptr) { static void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } +typedef struct LDM_hashEntry { + offset_t offset; +} LDM_hashEntry; -typedef struct compress_stats { +typedef struct LDM_compressStats { U32 num_matches; U32 total_match_length; U32 total_literal_length; U64 total_offset; -} compress_stats; +} LDM_compressStats; -static void LDM_printCompressStats(const compress_stats *stats) { +static void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); printf("Compression statistics\n"); printf("Total number of matches: %u\n", stats->num_matches); @@ -103,53 +109,83 @@ static void LDM_printCompressStats(const compress_stats *stats) { printf("=====================\n"); } -// TODO: unused. -struct hash_entry { - U64 offset; - tag t; -}; +typedef struct LDM_CCtx { + size_t isize; /* Input size */ + size_t maxOSize; /* Maximum output size */ -static U32 LDM_hash(U32 sequence) { + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of input */ + + // Maximum input position such that hashing at the position does not exceed + // end of input. + const BYTE *ihashLimit; + + // Maximum input position such that finding a match of at least the minimum + // match length does not exceed end of input. 
+ const BYTE *imatchLimit; + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Output */ + + const BYTE *anchor; /* Anchor to start of current (match) block */ + + LDM_compressStats stats; /* Compression statistics */ + + LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + + const BYTE *lastPosHashed; /* Last position hashed */ + hash_t lastHash; /* Hash corresponding to lastPosHashed */ + +} LDM_CCtx; + + +static hash_t LDM_hash(U32 sequence) { return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); } -static U32 LDM_hash5(U64 sequence) { +static hash_t LDM_hash5(U64 sequence) { static const U64 prime5bytes = 889523592379ULL; static const U64 prime8bytes = 11400714785074694791ULL; const U32 hashLog = LDM_HASHLOG; if (LDM_isLittleEndian()) - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + return (((sequence << 24) * prime5bytes) >> (64 - hashLog)); else - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); } -static U32 LDM_hash_position(const void * const p) { +static hash_t LDM_hash_position(const void * const p) { return LDM_hash(LDM_read32(p)); } -static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, - const BYTE *srcBase) { +static void LDM_put_position_on_hash(const BYTE *p, hash_t h, + void *tableBase, const BYTE *srcBase) { if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { return; } - U32 *hashTable = (U32 *) tableBase; - hashTable[h] = (U32)(p - srcBase); + LDM_hashEntry *hashTable = (LDM_hashEntry *) tableBase; + hashTable[h] = (LDM_hashEntry) { (hash_t )(p - srcBase) }; } -static void LDM_put_position(const BYTE *p, void *tableBase, +static void LDM_putPosition(const BYTE *p, void *tableBase, const BYTE *srcBase) { if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { return; } - U32 const h = LDM_hash_position(p); + hash_t const h = LDM_hash_position(p); LDM_put_position_on_hash(p, h, tableBase, srcBase); } +static void 
LDM_putHashOfCurrentPosition(LDM_CCtx *const cctx) { + LDM_putPosition(cctx->ip, cctx->hashTable, cctx->ibase); +} + + static const BYTE *LDM_get_position_on_hash( - U32 h, void *tableBase, const BYTE *srcBase) { - const U32 * const hashTable = (U32*)tableBase; - return hashTable[h] + srcBase; + hash_t h, void *tableBase, const BYTE *srcBase) { + const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; + return hashTable[h].offset + srcBase; } static BYTE LDM_read_byte(const void *memPtr) { @@ -180,140 +216,149 @@ void LDM_read_header(const void *src, size_t *compressSize, *decompressSize = *ip; } -// TODO: maxDstSize is unused +static void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - HASH_SIZE; + cctx->imatchLimit = cctx->iend - MINMATCH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)cctx->obase; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); + memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + + cctx->lastPosHashed = NULL; +} + +// TODO: srcSize and maxDstSize is unused size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { - const BYTE * const istart = (const BYTE*)src; - const BYTE *ip = istart; - const BYTE * const iend = istart + srcSize; - const BYTE *ilimit = iend - HASH_SIZE; - const BYTE * const matchlimit = iend - HASH_SIZE; - const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = (BYTE*) dst; - - compress_stats compressStats = { 0 }; - - U32 hashTable[LDM_HASHTABLESIZE_U32]; - memset(hashTable, 0, sizeof(hashTable)); - - const BYTE *anchor = (const BYTE *)src; -// struct LDM_cctx cctx; - size_t output_size = 0; + LDM_CCtx cctx; + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); U32 forwardH; - /* Hash first 
byte: put into hash table */ + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + const BYTE *lastHash = cctx.ip; + cctx.ip++; + forwardH = LDM_hash_position(cctx.ip); - LDM_put_position(ip, hashTable, istart); - const BYTE *lastHash = ip; - ip++; - forwardH = LDM_hash_position(ip); - - //TODO Loop terminates before ip>=ilimit. - while (ip < ilimit) { + // TODO: loop condition is not accurate. + while (1) { const BYTE *match; BYTE *token; /* Find a match */ { - const BYTE *forwardIp = ip; + const BYTE *forwardIp = cctx.ip; unsigned step = 1; do { U32 const h = forwardH; - ip = forwardIp; + cctx.ip = forwardIp; forwardIp += step; - if (forwardIp > mflimit) { + if (forwardIp > cctx.imatchLimit) { goto _last_literals; } - match = LDM_get_position_on_hash(h, hashTable, istart); + match = LDM_get_position_on_hash(h, cctx.hashTable, cctx.ibase); forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(ip, h, hashTable, istart); - lastHash = ip; - } while (ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(ip)); + LDM_put_position_on_hash(cctx.ip, h, cctx.hashTable, cctx.ibase); + lastHash = cctx.ip; + } while (cctx.ip - match > WINDOW_SIZE || + LDM_read64(match) != LDM_read64(cctx.ip)); } - compressStats.num_matches++; + cctx.stats.num_matches++; /* Catchup: look back to extend match from found match */ - while (ip > anchor && match > istart && ip[-1] == match[-1]) { - ip--; + while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { + cctx.ip--; match--; } /* Encode literals */ { - unsigned const litLength = (unsigned)(ip - anchor); - token = op++; + unsigned const litLength = (unsigned)(cctx.ip - cctx.anchor); + token = cctx.op++; - compressStats.total_literal_length += litLength; + cctx.stats.total_literal_length += litLength; #ifdef LDM_DEBUG - printf("Cur position: %zu\n", anchor - istart); - printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); + printf("Cur position: %zu\n", cctx.anchor - cctx.ibase); + printf("LitLength %zu. (Match offset). %zu\n", litLength, cctx.ip - match); #endif if (litLength >= RUN_MASK) { int len = (int)litLength - RUN_MASK; *token = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { - *op++ = 255; + *(cctx.op)++ = 255; } - *op++ = (BYTE)len; + *(cctx.op)++ = (BYTE)len; } else { *token = (BYTE)(litLength << ML_BITS); } #ifdef LDM_DEBUG printf("Literals "); - fwrite(anchor, litLength, 1, stdout); + fwrite(cctx.anchor, litLength, 1, stdout); printf("\n"); #endif - memcpy(op, anchor, litLength); - op += litLength; + memcpy(cctx.op, cctx.anchor, litLength); + cctx.op += litLength; } _next_match: /* Encode offset */ { /* - LDM_writeLE16(op, ip-match); - op += 2; + LDM_writeLE16(cctx.op, cctx.ip-match); + cctx.op += 2; */ - LDM_write32(op, ip - match); - op += 4; - compressStats.total_offset += (ip - match); + LDM_write32(cctx.op, cctx.ip - match); + cctx.op += 4; + cctx.stats.total_offset += (cctx.ip - match); } /* Encode Match Length */ { unsigned matchCode; - matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, - matchlimit); + matchCode = LDM_count(cctx.ip + MINMATCH, match + MINMATCH, + cctx.ihashLimit); #ifdef LDM_DEBUG printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(ip, MINMATCH + matchCode, 1, stdout); + fwrite(cctx.ip, MINMATCH + matchCode, 1, stdout); printf("\n"); #endif - compressStats.total_match_length += matchCode + MINMATCH; + cctx.stats.total_match_length += matchCode + MINMATCH; unsigned ctr = 1; - ip++; - for (; ctr < MINMATCH + matchCode; ip++, ctr++) { - LDM_put_position(ip, hashTable, istart); + cctx.ip++; + for (; ctr < MINMATCH + matchCode; cctx.ip++, ctr++) { + LDM_putHashOfCurrentPosition(&cctx); } -// ip += MINMATCH + matchCode; +// cctx.ip += MINMATCH + matchCode; if (matchCode >= ML_MASK) { *token += ML_MASK; matchCode -= ML_MASK; - LDM_write32(op, 0xFFFFFFFF); + LDM_write32(cctx.op, 0xFFFFFFFF); while 
(matchCode >= 4*0xFF) { - op += 4; - LDM_write32(op, 0xffffffff); + cctx.op += 4; + LDM_write32(cctx.op, 0xffffffff); matchCode -= 4*0xFF; } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); + cctx.op += matchCode / 255; + *(cctx.op)++ = (BYTE)(matchCode % 255); } else { *token += (BYTE)(matchCode); } @@ -323,57 +368,58 @@ _next_match: #endif } - anchor = ip; + cctx.anchor = cctx.ip; - LDM_put_position(ip, hashTable, istart); - forwardH = LDM_hash_position(++ip); - lastHash = ip; + LDM_putPosition(cctx.ip, cctx.hashTable, cctx.ibase); + forwardH = LDM_hash_position(++cctx.ip); + lastHash = cctx.ip; } _last_literals: /* Encode last literals */ { - size_t const lastRun = (size_t)(iend - anchor); + size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); if (lastRun >= RUN_MASK) { size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; + *(cctx.op)++ = RUN_MASK << ML_BITS; for(; accumulator >= 255; accumulator -= 255) { - *op++ = 255; + *(cctx.op)++ = 255; } - *op++ = (BYTE)accumulator; + *(cctx.op)++ = (BYTE)accumulator; } else { - *op++ = (BYTE)(lastRun << ML_BITS); + *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); } - memcpy(op, anchor, lastRun); - op += lastRun; + memcpy(cctx.op, cctx.anchor, lastRun); + cctx.op += lastRun; } - LDM_printCompressStats(&compressStats); - return (op - (BYTE *)dst); + LDM_printCompressStats(&cctx.stats); + return (cctx.op - (BYTE *)cctx.obase); } typedef struct LDM_DCtx { - const BYTE *ibase; /* Pointer to base of input */ - const BYTE *ip; /* Pointer to current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Pointer to base of output */ - BYTE *op; /* Pointer to output */ - const BYTE *oend; /* Pointer to end of output */ - size_t compressSize; size_t maxDecompressSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output 
position */ + const BYTE *oend; /* End of output */ } LDM_DCtx; static void LDM_initializeDCtx(LDM_DCtx *dctx, const void *src, size_t compressSize, void *dst, size_t maxDecompressSize) { - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + compressSize; - dctx->op = dst; - dctx->oend = dctx->op + maxDecompressSize; - dctx->compressSize = compressSize; dctx->maxDecompressSize = maxDecompressSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressSize; + } size_t LDM_decompress(const void *src, size_t compressSize, @@ -382,15 +428,14 @@ size_t LDM_decompress(const void *src, size_t compressSize, LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); BYTE *cpy; + size_t length; + const BYTE *match; + size_t offset; while (dctx.ip < dctx.iend) { - size_t length; - const BYTE *match; - size_t offset; - /* get literal length */ unsigned const token = *(dctx.ip)++; - if ((length=(token >> ML_BITS)) == RUN_MASK) { + if ((length = (token >> ML_BITS)) == RUN_MASK) { unsigned s; do { s = *(dctx.ip)++; @@ -417,6 +462,8 @@ size_t LDM_decompress(const void *src, size_t compressSize, offset = LDM_readLE16(dctx.ip); dctx.ip += 2; */ + + //TODO : dynamic offset size offset = LDM_read32(dctx.ip); dctx.ip += 4; #ifdef LDM_DEBUG From e4155b11d749c2c5f6faac9cabbc6e0dc1262e14 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 13:08:19 -0700 Subject: [PATCH 15/62] Add warning flags to makefile and clean up code to remove warnings --- contrib/long_distance_matching/Makefile | 10 ++- contrib/long_distance_matching/ldm.c | 57 +++++++++-------- contrib/long_distance_matching/ldm.h | 4 +- contrib/long_distance_matching/main-ldm.c | 77 ++++++++++++----------- contrib/long_distance_matching/util.c | 64 +++++++++++++++++++ contrib/long_distance_matching/util.h | 23 +++++++ 6 files changed, 171 insertions(+), 64 deletions(-) 
create mode 100644 contrib/long_distance_matching/util.c create mode 100644 contrib/long_distance_matching/util.h diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 4e04fd6a..5ffd4eaf 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -9,6 +9,14 @@ # This Makefile presumes libzstd is installed, using `sudo make install` +CFLAGS ?= -O3 +DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ + -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls +CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) +FLAGS = $(CPPFLAGS) $(CFLAGS) LDFLAGS += -lzstd @@ -22,7 +30,7 @@ all: main-ldm #main : ldm.c main.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-ldm : ldm.c main-ldm.c +main-ldm : util.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 7bf26781..12cffc40 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,6 +4,7 @@ #include #include "ldm.h" +#include "util.h" #define HASH_EVERY 7 @@ -36,6 +37,7 @@ typedef uint32_t hash_t; // typedef uint64_t tag; +/* static unsigned LDM_isLittleEndian(void) { const union { U32 u; BYTE c[4]; } one = { 1 }; return one.c[0]; @@ -85,6 +87,8 @@ static U64 LDM_read64(const void *ptr) { static void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } + +*/ typedef struct LDM_hashEntry { offset_t offset; } LDM_hashEntry; @@ -144,6 +148,7 @@ static hash_t LDM_hash(U32 sequence) { return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); } +/* static hash_t LDM_hash5(U64 sequence) { static const U64 prime5bytes = 889523592379ULL; static const U64 prime8bytes = 11400714785074694791ULL; @@ -153,35 +158,40 @@ static 
hash_t LDM_hash5(U64 sequence) { else return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); } +*/ static hash_t LDM_hash_position(const void * const p) { return LDM_hash(LDM_read32(p)); } -static void LDM_put_position_on_hash(const BYTE *p, hash_t h, - void *tableBase, const BYTE *srcBase) { +static void LDM_putHashOfPosition(const BYTE *p, hash_t h, + void *tableBase, const BYTE *srcBase) { + LDM_hashEntry *hashTable; if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { return; } - LDM_hashEntry *hashTable = (LDM_hashEntry *) tableBase; - hashTable[h] = (LDM_hashEntry) { (hash_t )(p - srcBase) }; + hashTable = (LDM_hashEntry *) tableBase; + hashTable[h] = (LDM_hashEntry) { (hash_t)(p - srcBase) }; } static void LDM_putPosition(const BYTE *p, void *tableBase, - const BYTE *srcBase) { + const BYTE *srcBase) { + hash_t hash; if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { return; } - hash_t const h = LDM_hash_position(p); - LDM_put_position_on_hash(p, h, tableBase, srcBase); + hash = LDM_hash_position(p); + LDM_putHashOfPosition(p, hash, tableBase, srcBase); } static void LDM_putHashOfCurrentPosition(LDM_CCtx *const cctx) { - LDM_putPosition(cctx->ip, cctx->hashTable, cctx->ibase); + hash_t hash = LDM_hash_position(cctx->ip); + LDM_putHashOfPosition(cctx->ip, hash, cctx->hashTable, cctx->ibase); + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; } - static const BYTE *LDM_get_position_on_hash( hash_t h, void *tableBase, const BYTE *srcBase) { const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; @@ -209,8 +219,8 @@ static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, return (unsigned)(pIn - pStart); } -void LDM_read_header(const void *src, size_t *compressSize, - size_t *decompressSize) { +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize) { const U32 *ip = (const U32 *)src; *compressSize = *ip++; *decompressSize = *ip; @@ -230,7 +240,7 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, 
cctx->imatchLimit = cctx->iend - MINMATCH; cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)cctx->obase; + cctx->op = (BYTE *)dst; cctx->anchor = cctx->ibase; @@ -244,13 +254,12 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; + U32 forwardH; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - U32 forwardH; /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); - const BYTE *lastHash = cctx.ip; cctx.ip++; forwardH = LDM_hash_position(cctx.ip); @@ -276,8 +285,7 @@ size_t LDM_compress(const void *src, size_t srcSize, match = LDM_get_position_on_hash(h, cctx.hashTable, cctx.ibase); forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(cctx.ip, h, cctx.hashTable, cctx.ibase); - lastHash = cctx.ip; + LDM_putHashOfPosition(cctx.ip, h, cctx.hashTable, cctx.ibase); } while (cctx.ip - match > WINDOW_SIZE || LDM_read64(match) != LDM_read64(cctx.ip)); } @@ -319,7 +327,7 @@ size_t LDM_compress(const void *src, size_t srcSize, memcpy(cctx.op, cctx.anchor, litLength); cctx.op += litLength; } -_next_match: + /* Encode offset */ { /* @@ -334,6 +342,7 @@ _next_match: /* Encode Match Length */ { unsigned matchCode; + unsigned ctr = 1; matchCode = LDM_count(cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); #ifdef LDM_DEBUG @@ -342,7 +351,6 @@ _next_match: printf("\n"); #endif cctx.stats.total_match_length += matchCode + MINMATCH; - unsigned ctr = 1; cctx.ip++; for (; ctr < MINMATCH + matchCode; cctx.ip++, ctr++) { LDM_putHashOfCurrentPosition(&cctx); @@ -372,7 +380,6 @@ _next_match: LDM_putPosition(cctx.ip, cctx.hashTable, cctx.ibase); forwardH = LDM_hash_position(++cctx.ip); - lastHash = cctx.ip; } _last_literals: /* Encode last literals */ @@ -392,7 +399,7 @@ _last_literals: cctx.op += lastRun; } LDM_printCompressStats(&cctx.stats); - return (cctx.op - (BYTE *)cctx.obase); + return (cctx.op - (const BYTE 
*)cctx.obase); } typedef struct LDM_DCtx { @@ -427,12 +434,12 @@ size_t LDM_decompress(const void *src, size_t compressSize, LDM_DCtx dctx; LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); - BYTE *cpy; - size_t length; - const BYTE *match; - size_t offset; - while (dctx.ip < dctx.iend) { + BYTE *cpy; + size_t length; + const BYTE *match; + size_t offset; + /* get literal length */ unsigned const token = *(dctx.ip)++; if ((length = (token >> ML_BITS)) == RUN_MASK) { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 0ac7b2ec..287d444d 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -13,7 +13,7 @@ size_t LDM_compress(const void *src, size_t srcSize, size_t LDM_decompress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); -void LDM_read_header(const void *src, size_t *compressSize, - size_t *decompressSize); +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index b529201f..724d735d 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -25,6 +25,7 @@ static int compress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; + size_t maxCompressSize, compressSize; /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -44,10 +45,10 @@ static int compress(const char *fname, const char *oname) { return 1; } - size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? */ + /* TODO: fallocate? 
*/ if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; @@ -74,14 +75,14 @@ static int compress(const char *fname, const char *oname) { } #ifdef ZSTD - size_t compressSize = ZSTD_compress(dst, statbuf.st_size, + compressSize = ZSTD_compress(dst, statbuf.st_size, src, statbuf.st_size, 1); #else - size_t compressSize = LDM_HEADER_SIZE + + compressSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, statbuf.st_size); - // Write compress and decompress size to header + // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 memcpy(dst, &compressSize, 4); memcpy(dst + 4, &(statbuf.st_size), 4); @@ -107,12 +108,13 @@ static int compress(const char *fname, const char *oname) { /* Decompress file compressed using LDM_compress. * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. + * Returns 0 if succesful, and an error code otherwise. */ static int decompress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; + size_t compressSize, decompressSize, outSize; /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -140,8 +142,7 @@ static int decompress(const char *fname, const char *oname) { } /* Read the header. 
*/ - size_t compressSize, decompressSize; - LDM_read_header(src, &compressSize, &decompressSize); + LDM_readHeader(src, &compressSize, &decompressSize); #ifdef DEBUG printf("Size, compressSize, decompressSize: %zu %zu %zu\n", @@ -168,11 +169,11 @@ static int decompress(const char *fname, const char *oname) { } #ifdef ZSTD - size_t outSize = ZSTD_decompress(dst, decomrpessed_size, + outSize = ZSTD_decompress(dst, decomrpessed_size, src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE); #else - size_t outSize = LDM_decompress( + outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressSize); @@ -211,12 +212,14 @@ static void verify(const char *inpFilename, const char *decFilename) { FILE *decFp = fopen(decFilename, "rb"); printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } + { + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + } fclose(decFp); fclose(inpFp); @@ -243,32 +246,34 @@ int main(int argc, const char *argv[]) { printf("ldm = [%s]\n", ldmFilename); printf("dec = [%s]\n", decFilename); - struct timeval tv1, tv2; /* Compress */ - - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); /* Decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress 
error"); - return 1; + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - /* verify */ verify(inpFilename, decFilename); return 0; diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c new file mode 100644 index 00000000..9ea4ca1e --- /dev/null +++ b/contrib/long_distance_matching/util.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#include "util.h" + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; + +unsigned LDM_isLittleEndian(void) { + const union { U32 u; BYTE c[4]; } one = { 1 }; + return one.c[0]; +} + +U16 LDM_read16(const void *memPtr) { + U16 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +U16 LDM_readLE16(const void *memPtr) { + if (LDM_isLittleEndian()) { + return LDM_read16(memPtr); + } else { + const BYTE *p = (const BYTE *)memPtr; + return (U16)((U16)p[0] + (p[1] << 8)); + } +} + +void LDM_write16(void *memPtr, U16 value){ + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_write32(void *memPtr, U32 value) { + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_writeLE16(void *memPtr, U16 value) { + if (LDM_isLittleEndian()) { + LDM_write16(memPtr, value); + } else { + BYTE* p = (BYTE *)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + + diff --git a/contrib/long_distance_matching/util.h 
b/contrib/long_distance_matching/util.h new file mode 100644 index 00000000..90726412 --- /dev/null +++ b/contrib/long_distance_matching/util.h @@ -0,0 +1,23 @@ +#ifndef LDM_UTIL_H +#define LDM_UTIL_H + +unsigned LDM_isLittleEndian(void); + +uint16_t LDM_read16(const void *memPtr); + +uint16_t LDM_readLE16(const void *memPtr); + +void LDM_write16(void *memPtr, uint16_t value); + +void LDM_write32(void *memPtr, uint32_t value); + +void LDM_writeLE16(void *memPtr, uint16_t value); + +uint32_t LDM_read32(const void *ptr); + +uint64_t LDM_read64(const void *ptr); + +void LDM_copy8(void *dst, const void *src); + + +#endif /* LDM_UTIL_H */ From ef2b72831636d56d4e18168f5abaa16965e4a1a8 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 15:48:47 -0700 Subject: [PATCH 16/62] Clean up and refactor compress function --- contrib/long_distance_matching/ldm.c | 317 ++++++++++----------------- 1 file changed, 115 insertions(+), 202 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 12cffc40..a1d4449e 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -14,6 +14,8 @@ #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) +#define LDM_OFFSET_SIZE 4 + #define WINDOW_SIZE (1 << 20) #define MAX_WINDOW_SIZE 31 #define HASH_SIZE 8 @@ -35,81 +37,27 @@ typedef uint64_t U64; typedef uint32_t offset_t; typedef uint32_t hash_t; -// typedef uint64_t tag; - -/* -static unsigned LDM_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; - return one.c[0]; -} - -static U16 LDM_read16(const void *memPtr) { - U16 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -static U16 LDM_readLE16(const void *memPtr) { - if (LDM_isLittleEndian()) { - return LDM_read16(memPtr); - } else { - const BYTE *p = (const BYTE *)memPtr; - return (U16)((U16)p[0] + (p[1] << 8)); - } -} - -static void LDM_write16(void *memPtr, U16 
value){ - memcpy(memPtr, &value, sizeof(value)); -} - -static void LDM_write32(void *memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); -} - -static void LDM_writeLE16(void *memPtr, U16 value) { - if (LDM_isLittleEndian()) { - LDM_write16(memPtr, value); - } else { - BYTE* p = (BYTE *)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -static U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -static U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -static void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -*/ typedef struct LDM_hashEntry { offset_t offset; } LDM_hashEntry; typedef struct LDM_compressStats { - U32 num_matches; - U32 total_match_length; - U32 total_literal_length; - U64 total_offset; + U32 numMatches; + U32 totalMatchLength; + U32 totalLiteralLength; + U64 totalOffset; } LDM_compressStats; static void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); printf("Compression statistics\n"); - printf("Total number of matches: %u\n", stats->num_matches); - printf("Average match length: %.1f\n", ((double)stats->total_match_length) / - (double)stats->num_matches); + printf("Total number of matches: %u\n", stats->numMatches); + printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); printf("Average literal length: %.1f\n", - ((double)stats->total_literal_length) / (double)stats->num_matches); + ((double)stats->totalLiteralLength) / (double)stats->numMatches); printf("Average offset length: %.1f\n", - ((double)stats->total_offset) / (double)stats->num_matches); + ((double)stats->totalOffset) / (double)stats->numMatches); printf("=====================\n"); } @@ -140,6 +88,10 @@ typedef struct LDM_CCtx { const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ + const BYTE *forwardIp; + hash_t forwardHash; + + unsigned step; } LDM_CCtx; 
@@ -160,38 +112,25 @@ static hash_t LDM_hash5(U64 sequence) { } */ -static hash_t LDM_hash_position(const void * const p) { +static hash_t LDM_hashPosition(const void * const p) { return LDM_hash(LDM_read32(p)); } -static void LDM_putHashOfPosition(const BYTE *p, hash_t h, - void *tableBase, const BYTE *srcBase) { - LDM_hashEntry *hashTable; - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash) { + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { return; } - - hashTable = (LDM_hashEntry *) tableBase; - hashTable[h] = (LDM_hashEntry) { (hash_t)(p - srcBase) }; -} - -static void LDM_putPosition(const BYTE *p, void *tableBase, - const BYTE *srcBase) { - hash_t hash; - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { - return; - } - hash = LDM_hash_position(p); - LDM_putHashOfPosition(p, hash, tableBase, srcBase); -} - -static void LDM_putHashOfCurrentPosition(LDM_CCtx *const cctx) { - hash_t hash = LDM_hash_position(cctx->ip); - LDM_putHashOfPosition(cctx->ip, hash, cctx->hashTable, cctx->ibase); + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; cctx->lastPosHashed = cctx->ip; cctx->lastHash = hash; } +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash); +} + static const BYTE *LDM_get_position_on_hash( hash_t h, void *tableBase, const BYTE *srcBase) { const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; @@ -248,141 +187,136 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->lastPosHashed = NULL; + cctx->forwardIp = NULL; + + cctx->step = 1; +} + +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->forwardIp = cctx->ip; + + do { + hash_t const h = cctx->forwardHash; + cctx->ip = cctx->forwardIp; + cctx->forwardIp += cctx->step; + + if (cctx->forwardIp 
> cctx->imatchLimit) { + return 1; + } + + *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); + + cctx->forwardHash = LDM_hashPosition(cctx->forwardIp); + LDM_putHashOfCurrentPositionFromHash(cctx, h); + } while (cctx->ip - *match > WINDOW_SIZE || + LDM_read64(*match) != LDM_read64(cctx->ip)); + return 0; } // TODO: srcSize and maxDstSize is unused size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - U32 forwardH; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); cctx.ip++; - forwardH = LDM_hash_position(cctx.ip); + cctx.forwardHash = LDM_hashPosition(cctx.ip); // TODO: loop condition is not accurate. while (1) { const BYTE *match; - BYTE *token; - /* Find a match */ - { - const BYTE *forwardIp = cctx.ip; - unsigned step = 1; - - do { - U32 const h = forwardH; - cctx.ip = forwardIp; - forwardIp += step; - - if (forwardIp > cctx.imatchLimit) { - goto _last_literals; - } - - match = LDM_get_position_on_hash(h, cctx.hashTable, cctx.ibase); - - forwardH = LDM_hash_position(forwardIp); - LDM_putHashOfPosition(cctx.ip, h, cctx.hashTable, cctx.ibase); - } while (cctx.ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(cctx.ip)); + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + if (LDM_findBestMatch(&cctx, &match) != 0) { + goto _last_literals; } - cctx.stats.num_matches++; - /* Catchup: look back to extend match from found match */ - while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { + cctx.stats.numMatches++; + + /** + * Catchup: look back to extend the match backwards from the found match. 
+ */ + while (cctx.ip > cctx.anchor && match > cctx.ibase && + cctx.ip[-1] == match[-1]) { cctx.ip--; match--; } - /* Encode literals */ + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. + */ { - unsigned const litLength = (unsigned)(cctx.ip - cctx.anchor); - token = cctx.op++; + unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); + unsigned const offset = cctx.ip - match; + unsigned const matchLength = LDM_count( + cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + BYTE *token = cctx.op++; - cctx.stats.total_literal_length += litLength; + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + MINMATCH; -#ifdef LDM_DEBUG - printf("Cur position: %zu\n", cctx.anchor - cctx.ibase); - printf("LitLength %zu. (Match offset). %zu\n", litLength, cctx.ip - match); -#endif - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; *token = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx.op)++ = 255; } *(cctx.op)++ = (BYTE)len; } else { - *token = (BYTE)(litLength << ML_BITS); + *token = (BYTE)(literalLength << ML_BITS); } -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(cctx.anchor, litLength, 1, stdout); - printf("\n"); -#endif - memcpy(cctx.op, cctx.anchor, litLength); - cctx.op += litLength; - } - /* Encode offset */ - { - /* - LDM_writeLE16(cctx.op, cctx.ip-match); - cctx.op += 2; - */ - LDM_write32(cctx.op, cctx.ip - match); - cctx.op += 4; - cctx.stats.total_offset += (cctx.ip - match); - } + /* Encode the literals. 
*/ + memcpy(cctx.op, cctx.anchor, literalLength); + cctx.op += literalLength; - /* Encode Match Length */ - { - unsigned matchCode; - unsigned ctr = 1; - matchCode = LDM_count(cctx.ip + MINMATCH, match + MINMATCH, - cctx.ihashLimit); -#ifdef LDM_DEBUG - printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(cctx.ip, MINMATCH + matchCode, 1, stdout); - printf("\n"); -#endif - cctx.stats.total_match_length += matchCode + MINMATCH; - cctx.ip++; - for (; ctr < MINMATCH + matchCode; cctx.ip++, ctr++) { - LDM_putHashOfCurrentPosition(&cctx); - } -// cctx.ip += MINMATCH + matchCode; - if (matchCode >= ML_MASK) { + /* Encode the offset. */ + LDM_write32(cctx.op, offset); + cctx.op += LDM_OFFSET_SIZE; + + /* Encode match length */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; *token += ML_MASK; - matchCode -= ML_MASK; + matchLengthRemaining -= ML_MASK; LDM_write32(cctx.op, 0xFFFFFFFF); - while (matchCode >= 4*0xFF) { + while (matchLengthRemaining >= 4*0xFF) { cctx.op += 4; LDM_write32(cctx.op, 0xffffffff); - matchCode -= 4*0xFF; + matchLengthRemaining -= 4*0xFF; } - cctx.op += matchCode / 255; - *(cctx.op)++ = (BYTE)(matchCode % 255); + cctx.op += matchLengthRemaining / 255; + *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); } else { - *token += (BYTE)(matchCode); + *token += (BYTE)(matchLength); } -#ifdef LDM_DEBUG - printf("\n"); -#endif + /* Update input pointer, inserting hashes into hash table along the + * way. + */ + while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + LDM_putHashOfCurrentPosition(&cctx); + cctx.ip++; + } } + // Set start of next block to current input pointer. cctx.anchor = cctx.ip; - - LDM_putPosition(cctx.ip, cctx.hashTable, cctx.ibase); - forwardH = LDM_hash_position(++cctx.ip); + LDM_putHashOfCurrentPosition(&cctx); + cctx.forwardHash = LDM_hashPosition(++cctx.ip); } _last_literals: - /* Encode last literals */ + /* Encode the last literals (no more matches). 
*/ { size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); if (lastRun >= RUN_MASK) { @@ -436,11 +370,10 @@ size_t LDM_decompress(const void *src, size_t compressSize, while (dctx.ip < dctx.iend) { BYTE *cpy; - size_t length; const BYTE *match; - size_t offset; + size_t length, offset; - /* get literal length */ + /* Get the literal length. */ unsigned const token = *(dctx.ip)++; if ((length = (token >> ML_BITS)) == RUN_MASK) { unsigned s; @@ -449,37 +382,19 @@ size_t LDM_decompress(const void *src, size_t compressSize, length += s; } while (s == 255); } -#ifdef LDM_DEBUG - printf("Literal length: %zu\n", length); -#endif - /* copy literals */ + /* Copy literals. */ cpy = dctx.op + length; -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(dctx.ip, length, 1, stdout); - printf("\n"); -#endif memcpy(dctx.op, dctx.ip, length); dctx.ip += length; dctx.op = cpy; - /* get offset */ - /* - offset = LDM_readLE16(dctx.ip); - dctx.ip += 2; - */ - //TODO : dynamic offset size offset = LDM_read32(dctx.ip); - dctx.ip += 4; -#ifdef LDM_DEBUG - printf("Offset: %zu\n", offset); -#endif + dctx.ip += LDM_OFFSET_SIZE; match = dctx.op - offset; - // LDM_write32(op, (U32)offset); - /* get matchlength */ + /* Get the match length. */ length = token & ML_MASK; if (length == ML_MASK) { unsigned s; @@ -489,10 +404,8 @@ size_t LDM_decompress(const void *src, size_t compressSize, } while (s == 255); } length += MINMATCH; -#ifdef LDM_DEBUG - printf("Match length: %zu\n", length); -#endif - /* copy match */ + + /* Copy match. 
*/ cpy = dctx.op + length; // Inefficient for now From 6c3673f4c388d2a3b0d8af696e1e69c9c8f68b5b Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 10 Jul 2017 22:27:43 -0700 Subject: [PATCH 17/62] Add rolling hash --- contrib/long_distance_matching/ldm.c | 87 +++++++++++++++++++++------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index a1d4449e..42a4affd 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -3,12 +3,13 @@ #include #include + #include "ldm.h" #include "util.h" #define HASH_EVERY 7 -#define LDM_MEMORY_USAGE 14 +#define LDM_MEMORY_USAGE 20 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -16,16 +17,18 @@ #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 20) +#define WINDOW_SIZE (1 << 24) #define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 8 -#define MINMATCH 8 +#define HASH_SIZE 4 +#define LDM_HASH_LENGTH 4 +#define MINMATCH 4 #define ML_BITS 4 #define ML_MASK ((1U<> 2); +// return sum & (LDM_HASHTABLESIZE - 1); +} +static U32 LDM_getRollingHash(const char *data, U32 len) { + U32 i; + U32 s1, s2; + const schar *buf = (const schar *)data; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; + } + for(; i < len; i++) { + s1 += buf[i]; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} + +static hash_t LDM_hashPosition(const void * const p) { + return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); +} + +typedef struct LDM_sumStruct { + U16 s1, s2; +} LDM_sumStruct; + +static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { + sumStruct->s1 = sum & 0xffff; + sumStruct->s2 = sum >> 16; +} + +#else static hash_t LDM_hash(U32 sequence) { 
return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); } +static hash_t LDM_hashPosition(const void * const p) { + return LDM_hash(LDM_read32(p)); +} +#endif + /* static hash_t LDM_hash5(U64 sequence) { static const U64 prime5bytes = 889523592379ULL; @@ -112,10 +161,6 @@ static hash_t LDM_hash5(U64 sequence) { } */ -static hash_t LDM_hashPosition(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - static void LDM_putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash) { if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { @@ -187,26 +232,26 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->lastPosHashed = NULL; - cctx->forwardIp = NULL; + cctx->nextIp = NULL; cctx->step = 1; } static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->forwardIp = cctx->ip; + cctx->nextIp = cctx->ip; do { - hash_t const h = cctx->forwardHash; - cctx->ip = cctx->forwardIp; - cctx->forwardIp += cctx->step; + hash_t const h = cctx->nextHash; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; - if (cctx->forwardIp > cctx->imatchLimit) { + if (cctx->nextIp > cctx->imatchLimit) { return 1; } *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); - cctx->forwardHash = LDM_hashPosition(cctx->forwardIp); + cctx->nextHash = LDM_hashPosition(cctx->nextIp); LDM_putHashOfCurrentPositionFromHash(cctx, h); } while (cctx->ip - *match > WINDOW_SIZE || LDM_read64(*match) != LDM_read64(cctx->ip)); @@ -222,7 +267,7 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); cctx.ip++; - cctx.forwardHash = LDM_hashPosition(cctx.ip); + cctx.nextHash = LDM_hashPosition(cctx.ip); // TODO: loop condition is not accurate. 
while (1) { @@ -241,7 +286,7 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.numMatches++; /** - * Catchup: look back to extend the match backwards from the found match. + * Catch up: look back to extend the match backwards from the found match. */ while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { @@ -313,7 +358,7 @@ size_t LDM_compress(const void *src, size_t srcSize, // Set start of next block to current input pointer. cctx.anchor = cctx.ip; LDM_putHashOfCurrentPosition(&cctx); - cctx.forwardHash = LDM_hashPosition(++cctx.ip); + cctx.nextHash = LDM_hashPosition(++cctx.ip); } _last_literals: /* Encode the last literals (no more matches). */ From f6c5d07fe295997d9f01461d2c40714e10aa5827 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 11 Jul 2017 09:23:44 -0700 Subject: [PATCH 18/62] Save v3 --- contrib/long_distance_matching/ldm.c | 8 +- .../versions/v3/Makefile | 40 ++ .../long_distance_matching/versions/v3/ldm.c | 464 +++++++++++++++++ .../long_distance_matching/versions/v3/ldm.h | 19 + .../versions/v3/main-ldm.c | 479 ++++++++++++++++++ .../long_distance_matching/versions/v3/util.c | 64 +++ .../long_distance_matching/versions/v3/util.h | 23 + 7 files changed, 1093 insertions(+), 4 deletions(-) create mode 100644 contrib/long_distance_matching/versions/v3/Makefile create mode 100644 contrib/long_distance_matching/versions/v3/ldm.c create mode 100644 contrib/long_distance_matching/versions/v3/ldm.h create mode 100644 contrib/long_distance_matching/versions/v3/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v3/util.c create mode 100644 contrib/long_distance_matching/versions/v3/util.h diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 42a4affd..1dedf5c3 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -7,9 +7,9 @@ #include "ldm.h" #include "util.h" -#define HASH_EVERY 7 +#define HASH_EVERY 1 
-#define LDM_MEMORY_USAGE 20 +#define LDM_MEMORY_USAGE 16 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -17,7 +17,7 @@ #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 24) +#define WINDOW_SIZE (1 << 20) #define MAX_WINDOW_SIZE 31 #define HASH_SIZE 4 #define LDM_HASH_LENGTH 4 @@ -28,7 +28,7 @@ #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U< +#include +#include +#include + + +#include "ldm.h" +#include "util.h" + +#define HASH_EVERY 1 + +#define LDM_MEMORY_USAGE 16 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define LDM_OFFSET_SIZE 4 + +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 4 +#define LDM_HASH_LENGTH 4 +#define MINMATCH 4 + +#define ML_BITS 4 +#define ML_MASK ((1U<numMatches); + printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("Average literal length: %.1f\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("Average offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("=====================\n"); +} + +typedef struct LDM_CCtx { + size_t isize; /* Input size */ + size_t maxOSize; /* Maximum output size */ + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of input */ + + // Maximum input position such that hashing at the position does not exceed + // end of input. + const BYTE *ihashLimit; + + // Maximum input position such that finding a match of at least the minimum + // match length does not exceed end of input. 
+ const BYTE *imatchLimit; + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Output */ + + const BYTE *anchor; /* Anchor to start of current (match) block */ + + LDM_compressStats stats; /* Compression statistics */ + + LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + + const BYTE *lastPosHashed; /* Last position hashed */ + hash_t lastHash; /* Hash corresponding to lastPosHashed */ + const BYTE *nextIp; + hash_t nextHash; /* Hash corresponding to nextIp */ + + unsigned step; +} LDM_CCtx; + +#ifdef LDM_ROLLING_HASH +/** + * Convert a sum computed from LDM_getRollingHash to a hash value in the range + * of the hash table. + */ +static hash_t LDM_sumToHash(U32 sum) { + return sum % (LDM_HASHTABLESIZE >> 2); +// return sum & (LDM_HASHTABLESIZE - 1); +} + +static U32 LDM_getRollingHash(const char *data, U32 len) { + U32 i; + U32 s1, s2; + const schar *buf = (const schar *)data; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; + } + for(; i < len; i++) { + s1 += buf[i]; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} + +static hash_t LDM_hashPosition(const void * const p) { + return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); +} + +typedef struct LDM_sumStruct { + U16 s1, s2; +} LDM_sumStruct; + +static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { + sumStruct->s1 = sum & 0xffff; + sumStruct->s2 = sum >> 16; +} + +#else +static hash_t LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +static hash_t LDM_hashPosition(const void * const p) { + return LDM_hash(LDM_read32(p)); +} +#endif + +/* +static hash_t LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (((sequence << 24) * 
prime5bytes) >> (64 - hashLog)); + else + return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} +*/ + +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash) { + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash); +} + +static const BYTE *LDM_get_position_on_hash( + hash_t h, void *tableBase, const BYTE *srcBase) { + const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; + return hashTable[h].offset + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize) { + const U32 *ip = (const U32 *)src; + *compressSize = *ip++; + *decompressSize = *ip; +} + +static void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - HASH_SIZE; + cctx->imatchLimit = cctx->iend - MINMATCH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)dst; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); + memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + + 
cctx->lastPosHashed = NULL; + cctx->nextIp = NULL; + + cctx->step = 1; +} + +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->nextIp = cctx->ip; + + do { + hash_t const h = cctx->nextHash; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->nextIp > cctx->imatchLimit) { + return 1; + } + + *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); + + cctx->nextHash = LDM_hashPosition(cctx->nextIp); + LDM_putHashOfCurrentPositionFromHash(cctx, h); + } while (cctx->ip - *match > WINDOW_SIZE || + LDM_read64(*match) != LDM_read64(cctx->ip)); + return 0; +} + +// TODO: srcSize and maxDstSize is unused +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + LDM_CCtx cctx; + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + cctx.ip++; + cctx.nextHash = LDM_hashPosition(cctx.ip); + + // TODO: loop condition is not accurate. + while (1) { + const BYTE *match; + + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + if (LDM_findBestMatch(&cctx, &match) != 0) { + goto _last_literals; + } + + cctx.stats.numMatches++; + + /** + * Catch up: look back to extend the match backwards from the found match. + */ + while (cctx.ip > cctx.anchor && match > cctx.ibase && + cctx.ip[-1] == match[-1]) { + cctx.ip--; + match--; + } + + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. 
+ */ + { + unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); + unsigned const offset = cctx.ip - match; + unsigned const matchLength = LDM_count( + cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + BYTE *token = cctx.op++; + + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + MINMATCH; + + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx.op)++ = 255; + } + *(cctx.op)++ = (BYTE)len; + } else { + *token = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. */ + memcpy(cctx.op, cctx.anchor, literalLength); + cctx.op += literalLength; + + /* Encode the offset. */ + LDM_write32(cctx.op, offset); + cctx.op += LDM_OFFSET_SIZE; + + /* Encode match length */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *token += ML_MASK; + matchLengthRemaining -= ML_MASK; + LDM_write32(cctx.op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx.op += 4; + LDM_write32(cctx.op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx.op += matchLengthRemaining / 255; + *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *token += (BYTE)(matchLength); + } + + /* Update input pointer, inserting hashes into hash table along the + * way. + */ + while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + LDM_putHashOfCurrentPosition(&cctx); + cctx.ip++; + } + } + + // Set start of next block to current input pointer. + cctx.anchor = cctx.ip; + LDM_putHashOfCurrentPosition(&cctx); + cctx.nextHash = LDM_hashPosition(++cctx.ip); + } +_last_literals: + /* Encode the last literals (no more matches). 
*/ + { + size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *(cctx.op)++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *(cctx.op)++ = 255; + } + *(cctx.op)++ = (BYTE)accumulator; + } else { + *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(cctx.op, cctx.anchor, lastRun); + cctx.op += lastRun; + } + LDM_printCompressStats(&cctx.stats); + return (cctx.op - (const BYTE *)cctx.obase); +} + +typedef struct LDM_DCtx { + size_t compressSize; + size_t maxDecompressSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +} LDM_DCtx; + +static void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + dctx->compressSize = compressSize; + dctx->maxDecompressSize = maxDecompressSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressSize; + +} + +size_t LDM_decompress(const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + unsigned const token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy literals. 
*/ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = LDM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += MINMATCH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + + diff --git a/contrib/long_distance_matching/versions/v3/ldm.h b/contrib/long_distance_matching/versions/v3/ldm.h new file mode 100644 index 00000000..287d444d --- /dev/null +++ b/contrib/long_distance_matching/versions/v3/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v3/main-ldm.c b/contrib/long_distance_matching/versions/v3/main-ldm.c new file mode 100644 index 00000000..724d735d --- /dev/null +++ b/contrib/long_distance_matching/versions/v3/main-ldm.c @@ -0,0 +1,479 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. 
+ */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t maxCompressSize, compressSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? */ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + compressSize = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); +#else + compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); +#endif +#endif + + // Truncate file to compressSize. 
+ ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if succesful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t compressSize, decompressSize, outSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + LDM_readHeader(src, &compressSize, &decompressSize); + +#ifdef DEBUG + printf("Size, compressSize, decompressSize: %zu %zu %zu\n", + (size_t)statbuf.st_size, compressSize, decompressSize); +#endif + + /* Go to the location corresponding to the last byte. 
*/ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + outSize = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); +#else + outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + #endif + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. + */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. 
*/ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + { + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + + /* Compress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + + /* Decompress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; 
+ } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ 
+ { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + diff --git a/contrib/long_distance_matching/versions/v3/util.c b/contrib/long_distance_matching/versions/v3/util.c new file mode 100644 index 00000000..9ea4ca1e --- /dev/null +++ b/contrib/long_distance_matching/versions/v3/util.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#include "util.h" + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; + +unsigned LDM_isLittleEndian(void) { + const union { U32 u; BYTE c[4]; } one = { 1 }; + return one.c[0]; +} + +U16 LDM_read16(const void *memPtr) { + U16 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +U16 LDM_readLE16(const void *memPtr) { + if (LDM_isLittleEndian()) { + return LDM_read16(memPtr); + } else { + const BYTE *p = (const BYTE *)memPtr; + return (U16)((U16)p[0] + (p[1] << 8)); + } +} + +void LDM_write16(void *memPtr, U16 value){ + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_write32(void *memPtr, U32 value) { + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_writeLE16(void *memPtr, U16 value) { + if (LDM_isLittleEndian()) { + LDM_write16(memPtr, value); + } else { + BYTE* p = (BYTE *)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + + diff --git a/contrib/long_distance_matching/versions/v3/util.h b/contrib/long_distance_matching/versions/v3/util.h new file mode 100644 index 00000000..90726412 --- /dev/null +++ 
b/contrib/long_distance_matching/versions/v3/util.h @@ -0,0 +1,23 @@ +#ifndef LDM_UTIL_H +#define LDM_UTIL_H + +unsigned LDM_isLittleEndian(void); + +uint16_t LDM_read16(const void *memPtr); + +uint16_t LDM_readLE16(const void *memPtr); + +void LDM_write16(void *memPtr, uint16_t value); + +void LDM_write32(void *memPtr, uint32_t value); + +void LDM_writeLE16(void *memPtr, uint16_t value); + +uint32_t LDM_read32(const void *ptr); + +uint64_t LDM_read64(const void *ptr); + +void LDM_copy8(void *dst, const void *src); + + +#endif /* LDM_UTIL_H */ From 583dda17a811a9f4aab42ed4178d296ca5551447 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 11 Jul 2017 18:13:26 -0700 Subject: [PATCH 19/62] Update rolling hash --- contrib/long_distance_matching/ldm.c | 435 ++++++++++++---- contrib/long_distance_matching/ldm.h | 3 + contrib/long_distance_matching/main-ldm.c | 15 +- contrib/long_distance_matching/util.c | 5 + contrib/long_distance_matching/util.h | 2 + .../versions/v0.1/ldm.c | 394 ++++++++++++++ .../versions/v0.1/ldm.h | 19 + .../versions/v0.1/main-ldm.c | 459 +++++++++++++++++ .../versions/v0.2/Makefile | 32 ++ .../versions/v0.2/ldm.c | 436 ++++++++++++++++ .../versions/v0.2/ldm.h | 19 + .../versions/v0.2/main-ldm.c | 474 +++++++++++++++++ .../versions/v0.3/Makefile | 40 ++ .../versions/v0.3/ldm.c | 464 +++++++++++++++++ .../versions/v0.3/ldm.h | 19 + .../versions/v0.3/main-ldm.c | 479 ++++++++++++++++++ .../versions/v0.3/util.c | 64 +++ .../versions/v0.3/util.h | 23 + 18 files changed, 3282 insertions(+), 100 deletions(-) create mode 100644 contrib/long_distance_matching/versions/v0.1/ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.1/ldm.h create mode 100644 contrib/long_distance_matching/versions/v0.1/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.2/Makefile create mode 100644 contrib/long_distance_matching/versions/v0.2/ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.2/ldm.h create mode 100644 
contrib/long_distance_matching/versions/v0.2/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.3/Makefile create mode 100644 contrib/long_distance_matching/versions/v0.3/ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.3/ldm.h create mode 100644 contrib/long_distance_matching/versions/v0.3/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.3/util.c create mode 100644 contrib/long_distance_matching/versions/v0.3/util.h diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 1dedf5c3..ca4f0f2c 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -9,7 +9,7 @@ #define HASH_EVERY 1 -#define LDM_MEMORY_USAGE 16 +#define LDM_MEMORY_USAGE 22 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -17,10 +17,12 @@ #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 20) +#define WINDOW_SIZE (1 << 23) #define MAX_WINDOW_SIZE 31 #define HASH_SIZE 4 -#define LDM_HASH_LENGTH 4 +#define LDM_HASH_LENGTH 100 + +// Should be multiple of four #define MINMATCH 4 #define ML_BITS 4 @@ -28,7 +30,9 @@ #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U<totalLiteralLength) / (double)stats->numMatches); printf("Average offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); + printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + stats->numCollisions, stats->numHashInserts, + stats->numHashInserts == 0 ? + 1.0 : (100.0 * (double)stats->numCollisions) / + (double)stats->numHashInserts); printf("=====================\n"); } @@ -95,17 +108,43 @@ typedef struct LDM_CCtx { const BYTE *nextIp; hash_t nextHash; /* Hash corresponding to nextIp */ + // Members for rolling hash. 
+ U32 lastSum; + U32 nextSum; + unsigned step; + + // DEBUG + const BYTE *DEBUG_setNextHash; } LDM_CCtx; +static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { + U16 lengthLeft = MINMATCH; + const BYTE *curP = p; + const BYTE *curMatch = match; + + for (; lengthLeft >= 8; lengthLeft -= 8) { + if (LDM_read64(curP) != LDM_read64(curMatch)) { + return 0; + } + curP += 8; + curMatch += 8; + } + if (lengthLeft > 0) { + return LDM_read32(curP) == LDM_read32(curMatch); + } + return 1; +} + + + #ifdef LDM_ROLLING_HASH /** * Convert a sum computed from LDM_getRollingHash to a hash value in the range * of the hash table. */ static hash_t LDM_sumToHash(U32 sum) { - return sum % (LDM_HASHTABLESIZE >> 2); -// return sum & (LDM_HASHTABLESIZE - 1); + return sum & (LDM_HASH_SIZE_U32 - 1); } static U32 LDM_getRollingHash(const char *data, U32 len) { @@ -126,18 +165,102 @@ static U32 LDM_getRollingHash(const char *data, U32 len) { return (s1 & 0xffff) + (s2 << 16); } -static hash_t LDM_hashPosition(const void * const p) { - return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); -} - typedef struct LDM_sumStruct { U16 s1, s2; } LDM_sumStruct; +static U32 LDM_updateRollingHash(U32 sum, U32 len, + schar toRemove, schar toAdd) { + U32 s1 = (sum & 0xffff) - toRemove + toAdd; + U32 s2 = (sum >> 16) - (toRemove * len) + s1; + + return (s1 & 0xffff) + (s2 << 16); +} + + +/* +static hash_t LDM_hashPosition(const void * const p) { + return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); +} +*/ + +/* static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { sumStruct->s1 = sum & 0xffff; sumStruct->s2 = sum >> 16; } +*/ + +static void LDM_setNextHash(LDM_CCtx *cctx) { + U32 check; + +#ifdef RUN_CHECKS + if ((cctx->nextIp - cctx->ibase != 1) && + (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { + printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, + cctx->DEBUG_setNextHash - cctx->ibase); + } + + 
cctx->DEBUG_setNextHash = cctx->nextIp; +#endif + + cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + /* + check = LDM_updateRollingHash( + cctx->lastSum, LDM_HASH_LENGTH, + (schar)((cctx->lastPosHashed)[0]), + (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + */ + +#ifdef RUN_CHECKS + if (check != cctx->nextSum) { + printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); +// printf("INFO: %u %u %u\n", LDM_read32(cctx->nextIp), + } else { +// printf("CHECK: setNextHash passed\n"); + } +#endif + cctx->nextHash = LDM_sumToHash(cctx->nextSum); + +#ifdef RUN_CHECKS + if ((cctx->nextIp - cctx->lastPosHashed) != 1) { + printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", + cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, + cctx->ip - cctx->ibase); + } +#endif + +} + +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash, U32 sum) { + /* + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + */ +#ifdef COMPUTE_STATS + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + offset_t offset = (cctx->hashTable)[hash].offset; + cctx->stats.numHashInserts++; + if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + cctx->stats.numCollisions++; + } + } +#endif + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; + cctx->lastSum = sum; +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + U32 sum = LDM_getRollingHash((const char *)cctx->ip, LDM_HASH_LENGTH); + hash_t hash = LDM_sumToHash(sum); +// hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash, sum); +// printf("Offset %zu\n", cctx->ip - cctx->ibase); +} #else static hash_t LDM_hash(U32 sequence) { @@ -147,6 +270,39 @@ static hash_t LDM_hash(U32 sequence) { static hash_t LDM_hashPosition(const void * const p) { return LDM_hash(LDM_read32(p)); } 
+ +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash) { + /* + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + */ +#ifdef COMPUTE_STATS + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + offset_t offset = (cctx->hashTable)[hash].offset; + cctx->stats.numHashInserts++; + if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + cctx->stats.numCollisions++; + } + } +#endif + + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; +#ifdef RUN_CHECKS + if (cctx->ip - cctx->lastPosHashed != 1) { + printf("putHashError\n"); + } +#endif + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash); +} + #endif /* @@ -161,38 +317,19 @@ static hash_t LDM_hash5(U64 sequence) { } */ -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash) { - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; -} -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash); -} - -static const BYTE *LDM_get_position_on_hash( +static const BYTE *LDM_getPositionOnHash( hash_t h, void *tableBase, const BYTE *srcBase) { const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; return hashTable[h].offset + srcBase; } -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + BYTE const diff = 
LDM_readByte(pMatch) ^ LDM_readByte(pIn); if (!diff) { pIn++; pMatch++; @@ -220,7 +357,11 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->ip = cctx->ibase; cctx->iend = cctx->ibase + srcSize; +#ifdef LDM_ROLLING_HASH + cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; +#else cctx->ihashLimit = cctx->iend - HASH_SIZE; +#endif cctx->imatchLimit = cctx->iend - MINMATCH; cctx->obase = (BYTE *)dst; @@ -232,11 +373,46 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->lastPosHashed = NULL; - cctx->nextIp = NULL; cctx->step = 1; + cctx->nextIp = cctx->ip + cctx->step; + + cctx->DEBUG_setNextHash = 0; } +#ifdef LDM_ROLLING_HASH +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->nextIp = cctx->ip + cctx->step; + + do { + hash_t h; + U32 sum; +// printf("Call A\n"); + LDM_setNextHash(cctx); +// printf("End call a\n"); + h = cctx->nextHash; + sum = cctx->nextSum; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->ip > cctx->imatchLimit) { + return 1; + } + + *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); + +// // Compute cctx->nextSum and cctx->nextHash from cctx->nextIp. 
+// LDM_setNextHash(cctx); + LDM_putHashOfCurrentPositionFromHash(cctx, h, sum); + +// printf("%u %u\n", cctx->lastHash, cctx->nextHash); + } while (cctx->ip - *match > WINDOW_SIZE || + !LDM_isValidMatch(cctx->ip, *match)); +// LDM_read64(*match) != LDM_read64(cctx->ip)); + LDM_setNextHash(cctx); + return 0; +} +#else static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { cctx->nextIp = cctx->ip; @@ -245,33 +421,131 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { cctx->ip = cctx->nextIp; cctx->nextIp += cctx->step; - if (cctx->nextIp > cctx->imatchLimit) { + if (cctx->ip > cctx->imatchLimit) { return 1; } - *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); + *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); cctx->nextHash = LDM_hashPosition(cctx->nextIp); LDM_putHashOfCurrentPositionFromHash(cctx, h); + } while (cctx->ip - *match > WINDOW_SIZE || - LDM_read64(*match) != LDM_read64(cctx->ip)); + !LDM_isValidMatch(cctx->ip, *match)); return 0; } +#endif + +/** + * Write current block (literals, literal length, match offset, + * match length). + * + * Update input pointer, inserting hashes into hash table along the + * way. + */ +static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { + unsigned const literalLength = (unsigned)(cctx->ip - cctx->anchor); + unsigned const offset = cctx->ip - match; + unsigned const matchLength = LDM_count( + cctx->ip + MINMATCH, match + MINMATCH, cctx->ihashLimit); + BYTE *token = cctx->op++; + + cctx->stats.totalLiteralLength += literalLength; + cctx->stats.totalOffset += offset; + cctx->stats.totalMatchLength += matchLength + MINMATCH; + + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx->op)++ = 255; + } + *(cctx->op)++ = (BYTE)len; + } else { + *token = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. 
*/ + memcpy(cctx->op, cctx->anchor, literalLength); + cctx->op += literalLength; + + /* Encode the offset. */ + LDM_write32(cctx->op, offset); + cctx->op += LDM_OFFSET_SIZE; + + /* Encode match length */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *token += ML_MASK; + matchLengthRemaining -= ML_MASK; + LDM_write32(cctx->op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx->op += 4; + LDM_write32(cctx->op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx->op += matchLengthRemaining / 255; + *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *token += (BYTE)(matchLength); + } + +// LDM_setNextHash(cctx); +// cctx->ip = cctx->lastPosHashed + 1; +// cctx->nextIp = cctx->ip + cctx->step; +// printf("HERE: %zu %zu %zu\n", cctx->ip - cctx->ibase, +// cctx->lastPosHashed - cctx->ibase, cctx->nextIp - cctx->ibase); + + cctx->nextIp = cctx->ip + cctx->step; + + while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { +// printf("Loop\n"); + if (cctx->ip > cctx->lastPosHashed) { + LDM_putHashOfCurrentPosition(cctx); +#ifdef LDM_ROLLING_HASH + LDM_setNextHash(cctx); +#endif + } + /* + printf("Call b %zu %zu %zu\n", + cctx->lastPosHashed - cctx->ibase, + cctx->nextIp - cctx->ibase, + cctx->ip - cctx->ibase); + */ +// printf("end call b\n"); + cctx->ip++; + cctx->nextIp++; + } + +// printf("There: %zu %zu\n", cctx->ip - cctx->ibase, cctx->lastPosHashed - cctx->ibase); +} + // TODO: srcSize and maxDstSize is unused size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; + U32 tmp_hash; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. 
*/ LDM_putHashOfCurrentPosition(&cctx); +#ifdef LDM_ROLLING_HASH +// LDM_setNextHash(&cctx); +// tmp_hash = LDM_updateRollingHash(cctx.lastSum, LDM_HASH_LENGTH, +// cctx.ip[0], cctx.ip[LDM_HASH_LENGTH]); +// printf("Update test: %u %u\n", tmp_hash, cctx.nextSum); +// cctx.ip++; +#else cctx.ip++; cctx.nextHash = LDM_hashPosition(cctx.ip); +#endif // TODO: loop condition is not accurate. while (1) { const BYTE *match; +// printf("Start of loop\n"); /** * Find a match. @@ -282,6 +556,7 @@ size_t LDM_compress(const void *src, size_t srcSize, if (LDM_findBestMatch(&cctx, &match) != 0) { goto _last_literals; } +// printf("End of match finding\n"); cctx.stats.numMatches++; @@ -290,6 +565,7 @@ size_t LDM_compress(const void *src, size_t srcSize, */ while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { +// printf("Catch up\n"); cctx.ip--; match--; } @@ -298,67 +574,24 @@ size_t LDM_compress(const void *src, size_t srcSize, * Write current block (literals, literal length, match offset, match * length) and update pointers and hashes. */ - { - unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); - unsigned const offset = cctx.ip - match; - unsigned const matchLength = LDM_count( - cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); - BYTE *token = cctx.op++; - - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + MINMATCH; - - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)len; - } else { - *token = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx.op, cctx.anchor, literalLength); - cctx.op += literalLength; - - /* Encode the offset. 
*/ - LDM_write32(cctx.op, offset); - cctx.op += LDM_OFFSET_SIZE; - - /* Encode match length */ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; - matchLengthRemaining -= ML_MASK; - LDM_write32(cctx.op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx.op += 4; - LDM_write32(cctx.op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx.op += matchLengthRemaining / 255; - *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *token += (BYTE)(matchLength); - } - - /* Update input pointer, inserting hashes into hash table along the - * way. - */ - while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { - LDM_putHashOfCurrentPosition(&cctx); - cctx.ip++; - } - } + LDM_outputBlock(&cctx, match); +// printf("End of loop\n"); // Set start of next block to current input pointer. cctx.anchor = cctx.ip; LDM_putHashOfCurrentPosition(&cctx); - cctx.nextHash = LDM_hashPosition(++cctx.ip); +#ifndef LDM_ROLLING_HASH + cctx.ip++; +#endif + + /* + LDM_putHashOfCurrentPosition(&cctx); + printf("Call c\n"); + LDM_setNextHash(&cctx); + printf("End call c\n"); + cctx.ip++; + cctx.nextIp++; + */ } _last_literals: /* Encode the last literals (no more matches). */ @@ -453,7 +686,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, /* Copy match. */ cpy = dctx.op + length; - // Inefficient for now + // Inefficient for now. 
while (match < cpy - offset && dctx.op < dctx.oend) { *(dctx.op)++ = *match++; } @@ -461,4 +694,20 @@ size_t LDM_decompress(const void *src, size_t compressSize, return dctx.op - (BYTE *)dst; } +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { +#ifdef LDM_ROLLING_HASH + const BYTE *ip = (const BYTE *)src + 1125; + U32 sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + U32 sum2; + ++ip; + for (; ip < (const BYTE *)src + 1125 + 100; ip++) { + sum2 = LDM_updateRollingHash(sum, LDM_HASH_LENGTH, + ip[-1], ip[LDM_HASH_LENGTH - 1]); + sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); + } +#endif +} + diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 287d444d..a34faac4 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -16,4 +16,7 @@ size_t LDM_decompress(const void *src, size_t srcSize, void LDM_readHeader(const void *src, size_t *compressSize, size_t *decompressSize); +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 724d735d..f8ae5469 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -15,6 +15,7 @@ // #define BUF_SIZE 16*1024 // Block size #define DEBUG +//#define TEST //#define ZSTD @@ -74,6 +75,11 @@ static int compress(const char *fname, const char *oname) { return 1; } +#ifdef TEST + LDM_test(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); +#endif + #ifdef ZSTD compressSize = ZSTD_compress(dst, statbuf.st_size, src, statbuf.st_size, 1); @@ -144,11 +150,6 @@ static int decompress(const char *fname, const char *oname) { /* Read the header. 
*/ LDM_readHeader(src, &compressSize, &decompressSize); -#ifdef DEBUG - printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - (size_t)statbuf.st_size, compressSize, decompressSize); -#endif - /* Go to the location corresponding to the last byte. */ if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { perror("lseek error"); @@ -256,7 +257,7 @@ int main(int argc, const char *argv[]) { return 1; } gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", + printf("Total compress time = %f seconds\n", (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec)); } @@ -270,7 +271,7 @@ int main(int argc, const char *argv[]) { return 1; } gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", + printf("Total decompress time = %f seconds\n", (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec)); } diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c index 9ea4ca1e..70fcbc2c 100644 --- a/contrib/long_distance_matching/util.c +++ b/contrib/long_distance_matching/util.c @@ -61,4 +61,9 @@ void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } +BYTE LDM_readByte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} diff --git a/contrib/long_distance_matching/util.h b/contrib/long_distance_matching/util.h index 90726412..d1c3c999 100644 --- a/contrib/long_distance_matching/util.h +++ b/contrib/long_distance_matching/util.h @@ -19,5 +19,7 @@ uint64_t LDM_read64(const void *ptr); void LDM_copy8(void *dst, const void *src); +uint8_t LDM_readByte(const void *ptr); + #endif /* LDM_UTIL_H */ diff --git a/contrib/long_distance_matching/versions/v0.1/ldm.c b/contrib/long_distance_matching/versions/v0.1/ldm.c new file mode 100644 index 00000000..266425f8 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.1/ldm.c @@ -0,0 +1,394 @@ +#include +#include +#include +#include + +#include "ldm.h" + +#define LDM_MEMORY_USAGE 14 
+#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 4 +#define MINMATCH 4 + +#define ML_BITS 4 +#define ML_MASK ((1U<>8); + } +} + +static U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +static U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + + +static void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE * const e = (BYTE *)dstEnd; + + do { + LDM_copy8(d, s); + d += 8; + s += 8; + } while (d < e); + +} + +struct hash_entry { + U64 offset; + tag t; +}; + +static U32 LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +static U32 LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +static U32 LDM_hash_position(const void * const p) { + return LDM_hash(LDM_read32(p)); +} + +static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, + const BYTE *srcBase) { + U32 *hashTable = (U32 *) tableBase; + hashTable[h] = (U32)(p - srcBase); +} + +static void LDM_put_position(const BYTE *p, void *tableBase, + const BYTE *srcBase) { + U32 const h = LDM_hash_position(p); + LDM_put_position_on_hash(p, h, tableBase, srcBase); +} + +static const BYTE *LDM_get_position_on_hash( + U32 h, void *tableBase, const BYTE *srcBase) { + const U32 * const hashTable = (U32*)tableBase; + return hashTable[h] + srcBase; +} + 
+static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size) { + const U32 *ip = (const U32 *)source; + *compressed_size = *ip++; + *decompressed_size = *ip; +} + +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size) { + const BYTE * const istart = (const BYTE*)source; + const BYTE *ip = istart; + const BYTE * const iend = istart + source_size; + const BYTE *ilimit = iend - HASH_SIZE; + const BYTE * const matchlimit = iend - HASH_SIZE; + const BYTE * const mflimit = iend - MINMATCH; + BYTE *op = (BYTE*) dest; + U32 hashTable[LDM_HASHTABLESIZE_U32]; + memset(hashTable, 0, sizeof(hashTable)); + + const BYTE *anchor = (const BYTE *)source; +// struct LDM_cctx cctx; + size_t output_size = 0; + + U32 forwardH; + + /* Hash first byte: put into hash table */ + + LDM_put_position(ip, hashTable, istart); + ip++; + forwardH = LDM_hash_position(ip); + + //TODO Loop terminates before ip>=ilimit. 
+ while (ip < ilimit) { + const BYTE *match; + BYTE *token; + + /* Find a match */ + { + const BYTE *forwardIp = ip; + unsigned step = 1; + + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + match = LDM_get_position_on_hash(h, hashTable, istart); + + forwardH = LDM_hash_position(forwardIp); + LDM_put_position_on_hash(ip, h, hashTable, istart); + } while (ip - match > WINDOW_SIZE || + LDM_read64(match) != LDM_read64(ip)); + } + + // TODO catchup + while (ip > anchor && match > istart && ip[-1] == match[-1]) { + ip--; + match--; + } + + /* Encode literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + +#ifdef LDM_DEBUG + printf("Cur position: %zu\n", anchor - istart); + printf("LitLength %zu. (Match offset). %zu\n", litLength, ip - match); +#endif + /* + fwrite(match, 4, 1, stdout); + printf("\n"); + */ + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *op++ = 255; + } + *op++ = (BYTE)len; + } else { + *token = (BYTE)(litLength << ML_BITS); + } +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(anchor, litLength, 1, stdout); + printf("\n"); +#endif + memcpy(op, anchor, litLength); + //LDM_wild_copy(op, anchor, op + litLength); + op += litLength; + } +_next_match: + /* Encode offset */ + { + LDM_write32(op, ip - match); + op += 4; + } + + /* Encode Match Length */ + { + unsigned matchCode; + matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, + matchlimit); +#ifdef LDM_DEBUG + printf("Match length %zu\n", matchCode + MINMATCH); + fwrite(ip, MINMATCH + matchCode, 1, stdout); + printf("\n"); +#endif + ip += MINMATCH + matchCode; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LDM_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*0xFF) { + op += 4; + LDM_write32(op, 0xffffffff); + matchCode -= 4*0xFF; + } + op += matchCode / 255; + *op++ 
= (BYTE)(matchCode % 255); + } else { + *token += (BYTE)(matchCode); + } +#ifdef LDM_DEBUG + printf("\n"); +#endif + } + + anchor = ip; + + LDM_put_position(ip, hashTable, istart); + forwardH = LDM_hash_position(++ip); + } +_last_literals: + /* Encode last literals */ + { + size_t const lastRun = (size_t)(iend - anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *op++ = 255; + } + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(op, anchor, lastRun); + op += lastRun; + } + return (op - (BYTE *)dest); +} + +size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)source; + const BYTE * const iend = ip + compressed_size; + BYTE *op = (BYTE *)dest; + BYTE * const oend = op + max_decompressed_size; + BYTE *cpy; + + while (ip < iend) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } +#ifdef LDM_DEBUG + printf("Literal length: %zu\n", length); +#endif + + /* copy literals */ + cpy = op + length; +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(ip, length, 1, stdout); + printf("\n"); +#endif + memcpy(op, ip, length); +// LDM_wild_copy(op, ip, cpy); + ip += length; + op = cpy; + + /* get offset */ + offset = LDM_read32(ip); + +#ifdef LDM_DEBUG + printf("Offset: %zu\n", offset); +#endif + ip += 4; + match = op - offset; + // LDM_write32(op, (U32)offset); + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + length += MINMATCH; +#ifdef LDM_DEBUG + printf("Match length: %zu\n", length); +#endif + /* copy match */ + cpy = op + length; + + // 
Inefficient for now + + while (match < cpy - offset && op < oend) { + *op++ = *match++; + } + } +// memcpy(dest, source, compressed_size); + return op - (BYTE *)dest; +} + + diff --git a/contrib/long_distance_matching/versions/v0.1/ldm.h b/contrib/long_distance_matching/versions/v0.1/ldm.h new file mode 100644 index 00000000..f4ca25a3 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.1/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(void const *source, void *dest, size_t source_size, + size_t max_dest_size); + +size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, + size_t max_decompressed_size); + +void LDM_read_header(void const *source, size_t *compressed_size, + size_t *decompressed_size); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.1/main-ldm.c b/contrib/long_distance_matching/versions/v0.1/main-ldm.c new file mode 100644 index 00000000..10869cce --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.1/main-ldm.c @@ -0,0 +1,459 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = 
LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} +#endif + +static size_t compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + size_t size_in = statbuf.st_size; + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; + + /* mmap the output file */ + if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + #ifdef ZSTD + size_t size_out = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); + #else + size_t size_out = LDM_compress(src, dst + 
LDM_HEADER_SIZE, statbuf.st_size, + statbuf.st_size); + size_out += LDM_HEADER_SIZE; + + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &size_out, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + printf("Compressed size: %zu\n", size_out); + printf("Decompressed size: %zu\n", statbuf.st_size); + #endif + ftruncate(fdout, size_out); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)size_out, oname, + (double)size_out / (statbuf.st_size) * 100); + + close(fdin); + close(fdout); + return 0; +} + +static size_t decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* open the input file */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* open the output file */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* find size of input file */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* read header */ + size_t compressed_size, decompressed_size; + LDM_read_header(src, &compressed_size, &decompressed_size); + + printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", + statbuf.st_size, compressed_size, decompressed_size); + + /* go to the location corresponding to the last byte */ + if (lseek(fdout, decompressed_size - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + /* 
Copy input file to output file */ +// memcpy(dst, src, statbuf.st_size); + + #ifdef ZSTD + size_t size_out = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); + #else + size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, + statbuf.st_size - LDM_HEADER_SIZE, + decompressed_size); + printf("Ret size out: %zu\n", size_out); + #endif + ftruncate(fdout, size_out); + + close(fdin); + close(fdout); + return 0; +} + +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) { + break; + } + if (0 == result) { + result = memcmp(b0, b1, r0); + } + } + return result; +} + +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + struct timeval tv1, tv2; + /* compress */ + { + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, 
NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + + /* decompress */ + + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + +#if 0 +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE 
*decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + diff --git a/contrib/long_distance_matching/versions/v0.2/Makefile b/contrib/long_distance_matching/versions/v0.2/Makefile new file mode 100644 index 00000000..4e04fd6a --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.2/Makefile @@ -0,0 +1,32 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. +# ################################################################ + +# This Makefile presumes libzstd is installed, using `sudo make install` + + +LDFLAGS += -lzstd + +.PHONY: default all clean + +default: all + +all: main-ldm + + +#main : ldm.c main.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +main-ldm : ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ + main main-ldm + @echo Cleaning completed + diff --git a/contrib/long_distance_matching/versions/v0.2/ldm.c b/contrib/long_distance_matching/versions/v0.2/ldm.c new file mode 100644 index 00000000..9081d136 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.2/ldm.c @@ -0,0 +1,436 @@ +#include +#include +#include +#include + +#include "ldm.h" + +#define HASH_EVERY 7 + +#define LDM_MEMORY_USAGE 14 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + 
+#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 8 +#define MINMATCH 8 + +#define ML_BITS 4 +#define ML_MASK ((1U<>8); + } +} + +static U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +static U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +static void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +typedef struct compress_stats { + U32 num_matches; + U32 total_match_length; + U32 total_literal_length; + U64 total_offset; +} compress_stats; + +static void LDM_printCompressStats(const compress_stats *stats) { + printf("=====================\n"); + printf("Compression statistics\n"); + printf("Total number of matches: %u\n", stats->num_matches); + printf("Average match length: %.1f\n", ((double)stats->total_match_length) / + (double)stats->num_matches); + printf("Average literal length: %.1f\n", + ((double)stats->total_literal_length) / (double)stats->num_matches); + printf("Average offset length: %.1f\n", + ((double)stats->total_offset) / (double)stats->num_matches); + printf("=====================\n"); +} + +// TODO: unused. 
+struct hash_entry { + U64 offset; + tag t; +}; + +static U32 LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +static U32 LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +static U32 LDM_hash_position(const void * const p) { + return LDM_hash(LDM_read32(p)); +} + +static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, + const BYTE *srcBase) { + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { + return; + } + + U32 *hashTable = (U32 *) tableBase; + hashTable[h] = (U32)(p - srcBase); +} + +static void LDM_put_position(const BYTE *p, void *tableBase, + const BYTE *srcBase) { + if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { + return; + } + U32 const h = LDM_hash_position(p); + LDM_put_position_on_hash(p, h, tableBase, srcBase); +} + +static const BYTE *LDM_get_position_on_hash( + U32 h, void *tableBase, const BYTE *srcBase) { + const U32 * const hashTable = (U32*)tableBase; + return hashTable[h] + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_read_header(const void *src, size_t *compressSize, + size_t *decompressSize) { + const U32 *ip = (const U32 *)src; + *compressSize = *ip++; + *decompressSize = *ip; +} + +// TODO: maxDstSize is unused +size_t LDM_compress(const void *src, size_t 
srcSize, + void *dst, size_t maxDstSize) { + const BYTE * const istart = (const BYTE*)src; + const BYTE *ip = istart; + const BYTE * const iend = istart + srcSize; + const BYTE *ilimit = iend - HASH_SIZE; + const BYTE * const matchlimit = iend - HASH_SIZE; + const BYTE * const mflimit = iend - MINMATCH; + BYTE *op = (BYTE*) dst; + + compress_stats compressStats = { 0 }; + + U32 hashTable[LDM_HASHTABLESIZE_U32]; + memset(hashTable, 0, sizeof(hashTable)); + + const BYTE *anchor = (const BYTE *)src; +// struct LDM_cctx cctx; + size_t output_size = 0; + + U32 forwardH; + + /* Hash first byte: put into hash table */ + + LDM_put_position(ip, hashTable, istart); + const BYTE *lastHash = ip; + ip++; + forwardH = LDM_hash_position(ip); + + //TODO Loop terminates before ip>=ilimit. + while (ip < ilimit) { + const BYTE *match; + BYTE *token; + + /* Find a match */ + { + const BYTE *forwardIp = ip; + unsigned step = 1; + + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + match = LDM_get_position_on_hash(h, hashTable, istart); + + forwardH = LDM_hash_position(forwardIp); + LDM_put_position_on_hash(ip, h, hashTable, istart); + lastHash = ip; + } while (ip - match > WINDOW_SIZE || + LDM_read64(match) != LDM_read64(ip)); + } + compressStats.num_matches++; + + /* Catchup: look back to extend match from found match */ + while (ip > anchor && match > istart && ip[-1] == match[-1]) { + ip--; + match--; + } + + /* Encode literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + + compressStats.total_literal_length += litLength; + +#ifdef LDM_DEBUG + printf("Cur position: %zu\n", anchor - istart); + printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); +#endif + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *op++ = 255; + } + *op++ = (BYTE)len; + } else { + *token = (BYTE)(litLength << ML_BITS); + } +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(anchor, litLength, 1, stdout); + printf("\n"); +#endif + memcpy(op, anchor, litLength); + op += litLength; + } +_next_match: + /* Encode offset */ + { + /* + LDM_writeLE16(op, ip-match); + op += 2; + */ + LDM_write32(op, ip - match); + op += 4; + compressStats.total_offset += (ip - match); + } + + /* Encode Match Length */ + { + unsigned matchCode; + matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, + matchlimit); +#ifdef LDM_DEBUG + printf("Match length %zu\n", matchCode + MINMATCH); + fwrite(ip, MINMATCH + matchCode, 1, stdout); + printf("\n"); +#endif + compressStats.total_match_length += matchCode + MINMATCH; + unsigned ctr = 1; + ip++; + for (; ctr < MINMATCH + matchCode; ip++, ctr++) { + LDM_put_position(ip, hashTable, istart); + } +// ip += MINMATCH + matchCode; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LDM_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*0xFF) { + op += 4; + LDM_write32(op, 0xffffffff); + matchCode -= 4*0xFF; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else { + *token += (BYTE)(matchCode); + } +#ifdef LDM_DEBUG + printf("\n"); + +#endif + } + + anchor = ip; + + LDM_put_position(ip, hashTable, istart); + forwardH = LDM_hash_position(++ip); + lastHash = ip; + } +_last_literals: + /* Encode last literals */ + { + size_t const lastRun = (size_t)(iend - anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *op++ = 255; + } + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(op, anchor, lastRun); + op += lastRun; + } 
+ LDM_printCompressStats(&compressStats); + return (op - (BYTE *)dst); +} + +typedef struct LDM_DCtx { + const BYTE * const ibase; /* Pointer to base of input */ + const BYTE *ip; /* Pointer to current input position */ + const BYTE *iend; /* End of source */ + BYTE *op; /* Pointer to output */ + const BYTE * const oend; /* Pointer to end of output */ + +} LDM_DCtx; + +size_t LDM_decompress(const void *src, size_t compressed_size, + void *dst, size_t max_decompressed_size) { + const BYTE *ip = (const BYTE *)src; + const BYTE * const iend = ip + compressed_size; + BYTE *op = (BYTE *)dst; + BYTE * const oend = op + max_decompressed_size; + BYTE *cpy; + + while (ip < iend) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } +#ifdef LDM_DEBUG + printf("Literal length: %zu\n", length); +#endif + + /* copy literals */ + cpy = op + length; +#ifdef LDM_DEBUG + printf("Literals "); + fwrite(ip, length, 1, stdout); + printf("\n"); +#endif + memcpy(op, ip, length); + ip += length; + op = cpy; + + /* get offset */ + /* + offset = LDM_readLE16(ip); + ip += 2; + */ + offset = LDM_read32(ip); + ip += 4; +#ifdef LDM_DEBUG + printf("Offset: %zu\n", offset); +#endif + match = op - offset; + // LDM_write32(op, (U32)offset); + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while (s == 255); + } + length += MINMATCH; +#ifdef LDM_DEBUG + printf("Match length: %zu\n", length); +#endif + /* copy match */ + cpy = op + length; + + // Inefficient for now + while (match < cpy - offset && op < oend) { + *op++ = *match++; + } + } + return op - (BYTE *)dst; +} + + diff --git a/contrib/long_distance_matching/versions/v0.2/ldm.h b/contrib/long_distance_matching/versions/v0.2/ldm.h new file mode 100644 index 00000000..0ac7b2ec 
--- /dev/null +++ b/contrib/long_distance_matching/versions/v0.2/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +void LDM_read_header(const void *src, size_t *compressSize, + size_t *decompressSize); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.2/main-ldm.c b/contrib/long_distance_matching/versions/v0.2/main-ldm.c new file mode 100644 index 00000000..0017335b --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.2/main-ldm.c @@ -0,0 +1,474 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. + */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? 
*/ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t compressSize = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); +#else + size_t compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", statbuf.st_size); +#endif +#endif + + // Truncate file to compressSize. + ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if succesful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. 
*/ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + size_t compressSize, decompressSize; + LDM_read_header(src, &compressSize, &decompressSize); + +#ifdef DEBUG + printf("Size, compressSize, decompressSize: %zu %zu %zu\n", + statbuf.st_size, compressSize, decompressSize); +#endif + + /* Go to the location corresponding to the last byte. */ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + size_t outSize = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); +#else + size_t outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + #endif + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. 
+ */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. */ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + struct timeval tv1, tv2; + + /* Compress */ + + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + + /* Decompress */ + + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - 
tv1.tv_sec)); + + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ 
+ { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + diff --git a/contrib/long_distance_matching/versions/v0.3/Makefile b/contrib/long_distance_matching/versions/v0.3/Makefile new file mode 100644 index 00000000..5ffd4eaf --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/Makefile @@ -0,0 +1,40 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. +# ################################################################ + +# This Makefile presumes libzstd is installed, using `sudo make install` + +CFLAGS ?= -O3 +DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ + -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls +CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) +FLAGS = $(CPPFLAGS) $(CFLAGS) + +LDFLAGS += -lzstd + +.PHONY: default all clean + +default: all + +all: main-ldm + + +#main : ldm.c main.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +main-ldm : util.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ + main main-ldm + @echo Cleaning completed + diff --git a/contrib/long_distance_matching/versions/v0.3/ldm.c b/contrib/long_distance_matching/versions/v0.3/ldm.c new file mode 100644 index 
00000000..1dedf5c3 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/ldm.c @@ -0,0 +1,464 @@ +#include +#include +#include +#include + + +#include "ldm.h" +#include "util.h" + +#define HASH_EVERY 1 + +#define LDM_MEMORY_USAGE 16 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) + +#define LDM_OFFSET_SIZE 4 + +#define WINDOW_SIZE (1 << 20) +#define MAX_WINDOW_SIZE 31 +#define HASH_SIZE 4 +#define LDM_HASH_LENGTH 4 +#define MINMATCH 4 + +#define ML_BITS 4 +#define ML_MASK ((1U<numMatches); + printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("Average literal length: %.1f\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("Average offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("=====================\n"); +} + +typedef struct LDM_CCtx { + size_t isize; /* Input size */ + size_t maxOSize; /* Maximum output size */ + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of input */ + + // Maximum input position such that hashing at the position does not exceed + // end of input. + const BYTE *ihashLimit; + + // Maximum input position such that finding a match of at least the minimum + // match length does not exceed end of input. 
+ const BYTE *imatchLimit; + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Output */ + + const BYTE *anchor; /* Anchor to start of current (match) block */ + + LDM_compressStats stats; /* Compression statistics */ + + LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + + const BYTE *lastPosHashed; /* Last position hashed */ + hash_t lastHash; /* Hash corresponding to lastPosHashed */ + const BYTE *nextIp; + hash_t nextHash; /* Hash corresponding to nextIp */ + + unsigned step; +} LDM_CCtx; + +#ifdef LDM_ROLLING_HASH +/** + * Convert a sum computed from LDM_getRollingHash to a hash value in the range + * of the hash table. + */ +static hash_t LDM_sumToHash(U32 sum) { + return sum % (LDM_HASHTABLESIZE >> 2); +// return sum & (LDM_HASHTABLESIZE - 1); +} + +static U32 LDM_getRollingHash(const char *data, U32 len) { + U32 i; + U32 s1, s2; + const schar *buf = (const schar *)data; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; + } + for(; i < len; i++) { + s1 += buf[i]; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} + +static hash_t LDM_hashPosition(const void * const p) { + return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); +} + +typedef struct LDM_sumStruct { + U16 s1, s2; +} LDM_sumStruct; + +static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { + sumStruct->s1 = sum & 0xffff; + sumStruct->s2 = sum >> 16; +} + +#else +static hash_t LDM_hash(U32 sequence) { + return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +static hash_t LDM_hashPosition(const void * const p) { + return LDM_hash(LDM_read32(p)); +} +#endif + +/* +static hash_t LDM_hash5(U64 sequence) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = LDM_HASHLOG; + if (LDM_isLittleEndian()) + return (((sequence << 24) * 
prime5bytes) >> (64 - hashLog)); + else + return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} +*/ + +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash) { + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash); +} + +static const BYTE *LDM_get_position_on_hash( + hash_t h, void *tableBase, const BYTE *srcBase) { + const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; + return hashTable[h].offset + srcBase; +} + +static BYTE LDM_read_byte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + +static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize) { + const U32 *ip = (const U32 *)src; + *compressSize = *ip++; + *decompressSize = *ip; +} + +static void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - HASH_SIZE; + cctx->imatchLimit = cctx->iend - MINMATCH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)dst; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); + memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + + 
cctx->lastPosHashed = NULL; + cctx->nextIp = NULL; + + cctx->step = 1; +} + +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->nextIp = cctx->ip; + + do { + hash_t const h = cctx->nextHash; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->nextIp > cctx->imatchLimit) { + return 1; + } + + *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); + + cctx->nextHash = LDM_hashPosition(cctx->nextIp); + LDM_putHashOfCurrentPositionFromHash(cctx, h); + } while (cctx->ip - *match > WINDOW_SIZE || + LDM_read64(*match) != LDM_read64(cctx->ip)); + return 0; +} + +// TODO: srcSize and maxDstSize is unused +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + LDM_CCtx cctx; + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + cctx.ip++; + cctx.nextHash = LDM_hashPosition(cctx.ip); + + // TODO: loop condition is not accurate. + while (1) { + const BYTE *match; + + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + if (LDM_findBestMatch(&cctx, &match) != 0) { + goto _last_literals; + } + + cctx.stats.numMatches++; + + /** + * Catch up: look back to extend the match backwards from the found match. + */ + while (cctx.ip > cctx.anchor && match > cctx.ibase && + cctx.ip[-1] == match[-1]) { + cctx.ip--; + match--; + } + + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. 
+ */ + { + unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); + unsigned const offset = cctx.ip - match; + unsigned const matchLength = LDM_count( + cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + BYTE *token = cctx.op++; + + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + MINMATCH; + + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx.op)++ = 255; + } + *(cctx.op)++ = (BYTE)len; + } else { + *token = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. */ + memcpy(cctx.op, cctx.anchor, literalLength); + cctx.op += literalLength; + + /* Encode the offset. */ + LDM_write32(cctx.op, offset); + cctx.op += LDM_OFFSET_SIZE; + + /* Encode match length */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *token += ML_MASK; + matchLengthRemaining -= ML_MASK; + LDM_write32(cctx.op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx.op += 4; + LDM_write32(cctx.op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx.op += matchLengthRemaining / 255; + *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *token += (BYTE)(matchLength); + } + + /* Update input pointer, inserting hashes into hash table along the + * way. + */ + while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + LDM_putHashOfCurrentPosition(&cctx); + cctx.ip++; + } + } + + // Set start of next block to current input pointer. + cctx.anchor = cctx.ip; + LDM_putHashOfCurrentPosition(&cctx); + cctx.nextHash = LDM_hashPosition(++cctx.ip); + } +_last_literals: + /* Encode the last literals (no more matches). 
*/ + { + size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *(cctx.op)++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *(cctx.op)++ = 255; + } + *(cctx.op)++ = (BYTE)accumulator; + } else { + *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(cctx.op, cctx.anchor, lastRun); + cctx.op += lastRun; + } + LDM_printCompressStats(&cctx.stats); + return (cctx.op - (const BYTE *)cctx.obase); +} + +typedef struct LDM_DCtx { + size_t compressSize; + size_t maxDecompressSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +} LDM_DCtx; + +static void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + dctx->compressSize = compressSize; + dctx->maxDecompressSize = maxDecompressSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressSize; + +} + +size_t LDM_decompress(const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + unsigned const token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy literals. 
*/ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = LDM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += MINMATCH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + + diff --git a/contrib/long_distance_matching/versions/v0.3/ldm.h b/contrib/long_distance_matching/versions/v0.3/ldm.h new file mode 100644 index 00000000..287d444d --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/ldm.h @@ -0,0 +1,19 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.3/main-ldm.c b/contrib/long_distance_matching/versions/v0.3/main-ldm.c new file mode 100644 index 00000000..724d735d --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/main-ldm.c @@ -0,0 +1,479 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +// #define BUF_SIZE 16*1024 // Block size +#define DEBUG + +//#define ZSTD + +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. 
+ */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t maxCompressSize, compressSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? */ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + compressSize = ZSTD_compress(dst, statbuf.st_size, + src, statbuf.st_size, 1); +#else + compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); +#endif +#endif + + // Truncate file to compressSize. 
+ ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if successful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t compressSize, decompressSize, outSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. */ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + LDM_readHeader(src, &compressSize, &decompressSize); + +#ifdef DEBUG + printf("Size, compressSize, decompressSize: %zu %zu %zu\n", + (size_t)statbuf.st_size, compressSize, decompressSize); +#endif + + /* Go to the location corresponding to the last byte. 
*/ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +#ifdef ZSTD + outSize = ZSTD_decompress(dst, decomrpessed_size, + src + LDM_HEADER_SIZE, + statbuf.st_size - LDM_HEADER_SIZE); +#else + outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + #endif + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. + */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. 
*/ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + { + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + + /* Compress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + + /* Decompress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough memory\n"); + goto cleanup; 
+ } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ 
+ { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + diff --git a/contrib/long_distance_matching/versions/v0.3/util.c b/contrib/long_distance_matching/versions/v0.3/util.c new file mode 100644 index 00000000..9ea4ca1e --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/util.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#include "util.h" + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; + +unsigned LDM_isLittleEndian(void) { + const union { U32 u; BYTE c[4]; } one = { 1 }; + return one.c[0]; +} + +U16 LDM_read16(const void *memPtr) { + U16 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +U16 LDM_readLE16(const void *memPtr) { + if (LDM_isLittleEndian()) { + return LDM_read16(memPtr); + } else { + const BYTE *p = (const BYTE *)memPtr; + return (U16)((U16)p[0] + (p[1] << 8)); + } +} + +void LDM_write16(void *memPtr, U16 value){ + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_write32(void *memPtr, U32 value) { + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_writeLE16(void *memPtr, U16 value) { + if (LDM_isLittleEndian()) { + LDM_write16(memPtr, value); + } else { + BYTE* p = (BYTE *)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + + diff --git a/contrib/long_distance_matching/versions/v0.3/util.h b/contrib/long_distance_matching/versions/v0.3/util.h new file mode 100644 index 00000000..90726412 --- /dev/null +++ 
b/contrib/long_distance_matching/versions/v0.3/util.h @@ -0,0 +1,23 @@ +#ifndef LDM_UTIL_H +#define LDM_UTIL_H + +unsigned LDM_isLittleEndian(void); + +uint16_t LDM_read16(const void *memPtr); + +uint16_t LDM_readLE16(const void *memPtr); + +void LDM_write16(void *memPtr, uint16_t value); + +void LDM_write32(void *memPtr, uint32_t value); + +void LDM_writeLE16(void *memPtr, uint16_t value); + +uint32_t LDM_read32(const void *ptr); + +uint64_t LDM_read64(const void *ptr); + +void LDM_copy8(void *dst, const void *src); + + +#endif /* LDM_UTIL_H */ From 50502519fbd8adeeebed6f0b5232bcfc100bc63a Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 09:47:00 -0700 Subject: [PATCH 20/62] Switch to using rolling hash only --- contrib/long_distance_matching/ldm.c | 40 +- .../versions/{v3 => v0.4}/Makefile | 0 .../versions/{v3 => v0.4}/ldm.c | 451 +++++++++++++---- .../versions/{v3 => v0.4}/ldm.h | 3 + .../versions/{v3 => v0.4}/main-ldm.c | 15 +- .../versions/{v3 => v0.4}/util.c | 5 + .../versions/{v3 => v0.4}/util.h | 2 + .../long_distance_matching/versions/v1/ldm.c | 394 --------------- .../long_distance_matching/versions/v1/ldm.h | 19 - .../versions/v1/main-ldm.c | 459 ----------------- .../versions/v2/Makefile | 32 -- .../long_distance_matching/versions/v2/ldm.c | 436 ---------------- .../long_distance_matching/versions/v2/ldm.h | 19 - .../versions/v2/main-ldm.c | 474 ------------------ 14 files changed, 404 insertions(+), 1945 deletions(-) rename contrib/long_distance_matching/versions/{v3 => v0.4}/Makefile (100%) rename contrib/long_distance_matching/versions/{v3 => v0.4}/ldm.c (51%) rename contrib/long_distance_matching/versions/{v3 => v0.4}/ldm.h (85%) rename contrib/long_distance_matching/versions/{v3 => v0.4}/main-ldm.c (98%) rename contrib/long_distance_matching/versions/{v3 => v0.4}/util.c (92%) rename contrib/long_distance_matching/versions/{v3 => v0.4}/util.h (91%) delete mode 100644 contrib/long_distance_matching/versions/v1/ldm.c delete mode 100644 
contrib/long_distance_matching/versions/v1/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v1/main-ldm.c delete mode 100644 contrib/long_distance_matching/versions/v2/Makefile delete mode 100644 contrib/long_distance_matching/versions/v2/ldm.c delete mode 100644 contrib/long_distance_matching/versions/v2/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v2/main-ldm.c diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index ca4f0f2c..79648097 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -20,7 +20,7 @@ #define WINDOW_SIZE (1 << 23) #define MAX_WINDOW_SIZE 31 #define HASH_SIZE 4 -#define LDM_HASH_LENGTH 100 +#define LDM_HASH_LENGTH 4 // Should be multiple of four #define MINMATCH 4 @@ -106,7 +106,8 @@ typedef struct LDM_CCtx { const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ const BYTE *nextIp; - hash_t nextHash; /* Hash corresponding to nextIp */ + const BYTE *nextPosHashed; + hash_t nextHash; /* Hash corresponding to nextPosHashed */ // Members for rolling hash. 
U32 lastSum; @@ -192,9 +193,9 @@ static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { */ static void LDM_setNextHash(LDM_CCtx *cctx) { - U32 check; #ifdef RUN_CHECKS + U32 check; if ((cctx->nextIp - cctx->ibase != 1) && (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, @@ -204,22 +205,21 @@ static void LDM_setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif - cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); - /* - check = LDM_updateRollingHash( +// cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = LDM_updateRollingHash( cctx->lastSum, LDM_HASH_LENGTH, (schar)((cctx->lastPosHashed)[0]), (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); - */ #ifdef RUN_CHECKS + check = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + if (check != cctx->nextSum) { printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); // printf("INFO: %u %u %u\n", LDM_read32(cctx->nextIp), - } else { -// printf("CHECK: setNextHash passed\n"); } #endif + cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = LDM_sumToHash(cctx->nextSum); #ifdef RUN_CHECKS @@ -254,9 +254,23 @@ static void LDM_putHashOfCurrentPositionFromHash( cctx->lastSum = sum; } +static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + if (cctx->ip != cctx->nextPosHashed) { + printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); + } +#endif + LDM_putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); +} + static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { U32 sum = LDM_getRollingHash((const char *)cctx->ip, LDM_HASH_LENGTH); hash_t hash = LDM_sumToHash(sum); +#ifdef RUN_CHECKS + if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { + printf("CHECK failed: putHashOfCurrentPosition %zu\n", cctx->ip - cctx->ibase); + } +#endif // 
hash_t hash = LDM_hashPosition(cctx->ip); LDM_putHashOfCurrentPositionFromHash(cctx, hash, sum); // printf("Offset %zu\n", cctx->ip - cctx->ibase); @@ -376,6 +390,7 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->step = 1; cctx->nextIp = cctx->ip + cctx->step; + cctx->nextPosHashed = 0; cctx->DEBUG_setNextHash = 0; } @@ -503,7 +518,8 @@ static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { // printf("Loop\n"); if (cctx->ip > cctx->lastPosHashed) { - LDM_putHashOfCurrentPosition(cctx); + LDM_updateLastHashFromNextHash(cctx); +// LDM_putHashOfCurrentPosition(cctx); #ifdef LDM_ROLLING_HASH LDM_setNextHash(cctx); #endif @@ -526,7 +542,6 @@ static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - U32 tmp_hash; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ @@ -579,7 +594,8 @@ size_t LDM_compress(const void *src, size_t srcSize, // Set start of next block to current input pointer. 
cctx.anchor = cctx.ip; - LDM_putHashOfCurrentPosition(&cctx); + LDM_updateLastHashFromNextHash(&cctx); +// LDM_putHashOfCurrentPosition(&cctx); #ifndef LDM_ROLLING_HASH cctx.ip++; #endif diff --git a/contrib/long_distance_matching/versions/v3/Makefile b/contrib/long_distance_matching/versions/v0.4/Makefile similarity index 100% rename from contrib/long_distance_matching/versions/v3/Makefile rename to contrib/long_distance_matching/versions/v0.4/Makefile diff --git a/contrib/long_distance_matching/versions/v3/ldm.c b/contrib/long_distance_matching/versions/v0.4/ldm.c similarity index 51% rename from contrib/long_distance_matching/versions/v3/ldm.c rename to contrib/long_distance_matching/versions/v0.4/ldm.c index 1dedf5c3..79648097 100644 --- a/contrib/long_distance_matching/versions/v3/ldm.c +++ b/contrib/long_distance_matching/versions/v0.4/ldm.c @@ -9,7 +9,7 @@ #define HASH_EVERY 1 -#define LDM_MEMORY_USAGE 16 +#define LDM_MEMORY_USAGE 22 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -17,10 +17,12 @@ #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 20) +#define WINDOW_SIZE (1 << 23) #define MAX_WINDOW_SIZE 31 #define HASH_SIZE 4 #define LDM_HASH_LENGTH 4 + +// Should be multiple of four #define MINMATCH 4 #define ML_BITS 4 @@ -28,7 +30,9 @@ #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U<totalLiteralLength) / (double)stats->numMatches); printf("Average offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); + printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + stats->numCollisions, stats->numHashInserts, + stats->numHashInserts == 0 ? 
+ 1.0 : (100.0 * (double)stats->numCollisions) / + (double)stats->numHashInserts); printf("=====================\n"); } @@ -93,19 +106,46 @@ typedef struct LDM_CCtx { const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ const BYTE *nextIp; - hash_t nextHash; /* Hash corresponding to nextIp */ + const BYTE *nextPosHashed; + hash_t nextHash; /* Hash corresponding to nextPosHashed */ + + // Members for rolling hash. + U32 lastSum; + U32 nextSum; unsigned step; + + // DEBUG + const BYTE *DEBUG_setNextHash; } LDM_CCtx; +static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { + U16 lengthLeft = MINMATCH; + const BYTE *curP = p; + const BYTE *curMatch = match; + + for (; lengthLeft >= 8; lengthLeft -= 8) { + if (LDM_read64(curP) != LDM_read64(curMatch)) { + return 0; + } + curP += 8; + curMatch += 8; + } + if (lengthLeft > 0) { + return LDM_read32(curP) == LDM_read32(curMatch); + } + return 1; +} + + + #ifdef LDM_ROLLING_HASH /** * Convert a sum computed from LDM_getRollingHash to a hash value in the range * of the hash table. 
*/ static hash_t LDM_sumToHash(U32 sum) { - return sum % (LDM_HASHTABLESIZE >> 2); -// return sum & (LDM_HASHTABLESIZE - 1); + return sum & (LDM_HASH_SIZE_U32 - 1); } static U32 LDM_getRollingHash(const char *data, U32 len) { @@ -126,18 +166,115 @@ static U32 LDM_getRollingHash(const char *data, U32 len) { return (s1 & 0xffff) + (s2 << 16); } -static hash_t LDM_hashPosition(const void * const p) { - return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); -} - typedef struct LDM_sumStruct { U16 s1, s2; } LDM_sumStruct; +static U32 LDM_updateRollingHash(U32 sum, U32 len, + schar toRemove, schar toAdd) { + U32 s1 = (sum & 0xffff) - toRemove + toAdd; + U32 s2 = (sum >> 16) - (toRemove * len) + s1; + + return (s1 & 0xffff) + (s2 << 16); +} + + +/* +static hash_t LDM_hashPosition(const void * const p) { + return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); +} +*/ + +/* static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { sumStruct->s1 = sum & 0xffff; sumStruct->s2 = sum >> 16; } +*/ + +static void LDM_setNextHash(LDM_CCtx *cctx) { + +#ifdef RUN_CHECKS + U32 check; + if ((cctx->nextIp - cctx->ibase != 1) && + (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { + printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, + cctx->DEBUG_setNextHash - cctx->ibase); + } + + cctx->DEBUG_setNextHash = cctx->nextIp; +#endif + +// cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = LDM_updateRollingHash( + cctx->lastSum, LDM_HASH_LENGTH, + (schar)((cctx->lastPosHashed)[0]), + (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + +#ifdef RUN_CHECKS + check = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + + if (check != cctx->nextSum) { + printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); +// printf("INFO: %u %u %u\n", LDM_read32(cctx->nextIp), + } +#endif + cctx->nextPosHashed = cctx->nextIp; + cctx->nextHash = 
LDM_sumToHash(cctx->nextSum); + +#ifdef RUN_CHECKS + if ((cctx->nextIp - cctx->lastPosHashed) != 1) { + printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", + cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, + cctx->ip - cctx->ibase); + } +#endif + +} + +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash, U32 sum) { + /* + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + */ +#ifdef COMPUTE_STATS + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + offset_t offset = (cctx->hashTable)[hash].offset; + cctx->stats.numHashInserts++; + if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + cctx->stats.numCollisions++; + } + } +#endif + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; + cctx->lastSum = sum; +} + +static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + if (cctx->ip != cctx->nextPosHashed) { + printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); + } +#endif + LDM_putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + U32 sum = LDM_getRollingHash((const char *)cctx->ip, LDM_HASH_LENGTH); + hash_t hash = LDM_sumToHash(sum); +#ifdef RUN_CHECKS + if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { + printf("CHECK failed: putHashOfCurrentPosition %zu\n", cctx->ip - cctx->ibase); + } +#endif +// hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash, sum); +// printf("Offset %zu\n", cctx->ip - cctx->ibase); +} #else static hash_t LDM_hash(U32 sequence) { @@ -147,6 +284,39 @@ static hash_t LDM_hash(U32 sequence) { static hash_t LDM_hashPosition(const void * const p) { return LDM_hash(LDM_read32(p)); } + +static void LDM_putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t 
hash) { + /* + if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { + return; + } + */ +#ifdef COMPUTE_STATS + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + offset_t offset = (cctx->hashTable)[hash].offset; + cctx->stats.numHashInserts++; + if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + cctx->stats.numCollisions++; + } + } +#endif + + (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; +#ifdef RUN_CHECKS + if (cctx->ip - cctx->lastPosHashed != 1) { + printf("putHashError\n"); + } +#endif + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; +} + +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + hash_t hash = LDM_hashPosition(cctx->ip); + LDM_putHashOfCurrentPositionFromHash(cctx, hash); +} + #endif /* @@ -161,38 +331,19 @@ static hash_t LDM_hash5(U64 sequence) { } */ -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash) { - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; -} -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash); -} - -static const BYTE *LDM_get_position_on_hash( +static const BYTE *LDM_getPositionOnHash( hash_t h, void *tableBase, const BYTE *srcBase) { const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; return hashTable[h].offset + srcBase; } -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); + BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); if (!diff) { pIn++; pMatch++; @@ 
-220,7 +371,11 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->ip = cctx->ibase; cctx->iend = cctx->ibase + srcSize; +#ifdef LDM_ROLLING_HASH + cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; +#else cctx->ihashLimit = cctx->iend - HASH_SIZE; +#endif cctx->imatchLimit = cctx->iend - MINMATCH; cctx->obase = (BYTE *)dst; @@ -232,11 +387,47 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->lastPosHashed = NULL; - cctx->nextIp = NULL; cctx->step = 1; + cctx->nextIp = cctx->ip + cctx->step; + cctx->nextPosHashed = 0; + + cctx->DEBUG_setNextHash = 0; } +#ifdef LDM_ROLLING_HASH +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->nextIp = cctx->ip + cctx->step; + + do { + hash_t h; + U32 sum; +// printf("Call A\n"); + LDM_setNextHash(cctx); +// printf("End call a\n"); + h = cctx->nextHash; + sum = cctx->nextSum; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->ip > cctx->imatchLimit) { + return 1; + } + + *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); + +// // Compute cctx->nextSum and cctx->nextHash from cctx->nextIp. 
+// LDM_setNextHash(cctx); + LDM_putHashOfCurrentPositionFromHash(cctx, h, sum); + +// printf("%u %u\n", cctx->lastHash, cctx->nextHash); + } while (cctx->ip - *match > WINDOW_SIZE || + !LDM_isValidMatch(cctx->ip, *match)); +// LDM_read64(*match) != LDM_read64(cctx->ip)); + LDM_setNextHash(cctx); + return 0; +} +#else static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { cctx->nextIp = cctx->ip; @@ -245,19 +436,108 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { cctx->ip = cctx->nextIp; cctx->nextIp += cctx->step; - if (cctx->nextIp > cctx->imatchLimit) { + if (cctx->ip > cctx->imatchLimit) { return 1; } - *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); + *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); cctx->nextHash = LDM_hashPosition(cctx->nextIp); LDM_putHashOfCurrentPositionFromHash(cctx, h); + } while (cctx->ip - *match > WINDOW_SIZE || - LDM_read64(*match) != LDM_read64(cctx->ip)); + !LDM_isValidMatch(cctx->ip, *match)); return 0; } +#endif + +/** + * Write current block (literals, literal length, match offset, + * match length). + * + * Update input pointer, inserting hashes into hash table along the + * way. + */ +static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { + unsigned const literalLength = (unsigned)(cctx->ip - cctx->anchor); + unsigned const offset = cctx->ip - match; + unsigned const matchLength = LDM_count( + cctx->ip + MINMATCH, match + MINMATCH, cctx->ihashLimit); + BYTE *token = cctx->op++; + + cctx->stats.totalLiteralLength += literalLength; + cctx->stats.totalOffset += offset; + cctx->stats.totalMatchLength += matchLength + MINMATCH; + + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx->op)++ = 255; + } + *(cctx->op)++ = (BYTE)len; + } else { + *token = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. 
*/ + memcpy(cctx->op, cctx->anchor, literalLength); + cctx->op += literalLength; + + /* Encode the offset. */ + LDM_write32(cctx->op, offset); + cctx->op += LDM_OFFSET_SIZE; + + /* Encode match length */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *token += ML_MASK; + matchLengthRemaining -= ML_MASK; + LDM_write32(cctx->op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx->op += 4; + LDM_write32(cctx->op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx->op += matchLengthRemaining / 255; + *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *token += (BYTE)(matchLength); + } + +// LDM_setNextHash(cctx); +// cctx->ip = cctx->lastPosHashed + 1; +// cctx->nextIp = cctx->ip + cctx->step; +// printf("HERE: %zu %zu %zu\n", cctx->ip - cctx->ibase, +// cctx->lastPosHashed - cctx->ibase, cctx->nextIp - cctx->ibase); + + cctx->nextIp = cctx->ip + cctx->step; + + while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { +// printf("Loop\n"); + if (cctx->ip > cctx->lastPosHashed) { + LDM_updateLastHashFromNextHash(cctx); +// LDM_putHashOfCurrentPosition(cctx); +#ifdef LDM_ROLLING_HASH + LDM_setNextHash(cctx); +#endif + } + /* + printf("Call b %zu %zu %zu\n", + cctx->lastPosHashed - cctx->ibase, + cctx->nextIp - cctx->ibase, + cctx->ip - cctx->ibase); + */ +// printf("end call b\n"); + cctx->ip++; + cctx->nextIp++; + } + +// printf("There: %zu %zu\n", cctx->ip - cctx->ibase, cctx->lastPosHashed - cctx->ibase); +} + // TODO: srcSize and maxDstSize is unused size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -266,12 +546,21 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. 
*/ LDM_putHashOfCurrentPosition(&cctx); +#ifdef LDM_ROLLING_HASH +// LDM_setNextHash(&cctx); +// tmp_hash = LDM_updateRollingHash(cctx.lastSum, LDM_HASH_LENGTH, +// cctx.ip[0], cctx.ip[LDM_HASH_LENGTH]); +// printf("Update test: %u %u\n", tmp_hash, cctx.nextSum); +// cctx.ip++; +#else cctx.ip++; cctx.nextHash = LDM_hashPosition(cctx.ip); +#endif // TODO: loop condition is not accurate. while (1) { const BYTE *match; +// printf("Start of loop\n"); /** * Find a match. @@ -282,6 +571,7 @@ size_t LDM_compress(const void *src, size_t srcSize, if (LDM_findBestMatch(&cctx, &match) != 0) { goto _last_literals; } +// printf("End of match finding\n"); cctx.stats.numMatches++; @@ -290,6 +580,7 @@ size_t LDM_compress(const void *src, size_t srcSize, */ while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { +// printf("Catch up\n"); cctx.ip--; match--; } @@ -298,67 +589,25 @@ size_t LDM_compress(const void *src, size_t srcSize, * Write current block (literals, literal length, match offset, match * length) and update pointers and hashes. */ - { - unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); - unsigned const offset = cctx.ip - match; - unsigned const matchLength = LDM_count( - cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); - BYTE *token = cctx.op++; - - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + MINMATCH; - - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)len; - } else { - *token = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx.op, cctx.anchor, literalLength); - cctx.op += literalLength; - - /* Encode the offset. 
*/ - LDM_write32(cctx.op, offset); - cctx.op += LDM_OFFSET_SIZE; - - /* Encode match length */ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; - matchLengthRemaining -= ML_MASK; - LDM_write32(cctx.op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx.op += 4; - LDM_write32(cctx.op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx.op += matchLengthRemaining / 255; - *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *token += (BYTE)(matchLength); - } - - /* Update input pointer, inserting hashes into hash table along the - * way. - */ - while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { - LDM_putHashOfCurrentPosition(&cctx); - cctx.ip++; - } - } + LDM_outputBlock(&cctx, match); +// printf("End of loop\n"); // Set start of next block to current input pointer. cctx.anchor = cctx.ip; + LDM_updateLastHashFromNextHash(&cctx); +// LDM_putHashOfCurrentPosition(&cctx); +#ifndef LDM_ROLLING_HASH + cctx.ip++; +#endif + + /* LDM_putHashOfCurrentPosition(&cctx); - cctx.nextHash = LDM_hashPosition(++cctx.ip); + printf("Call c\n"); + LDM_setNextHash(&cctx); + printf("End call c\n"); + cctx.ip++; + cctx.nextIp++; + */ } _last_literals: /* Encode the last literals (no more matches). */ @@ -453,7 +702,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, /* Copy match. */ cpy = dctx.op + length; - // Inefficient for now + // Inefficient for now. 
while (match < cpy - offset && dctx.op < dctx.oend) { *(dctx.op)++ = *match++; } @@ -461,4 +710,20 @@ size_t LDM_decompress(const void *src, size_t compressSize, return dctx.op - (BYTE *)dst; } +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { +#ifdef LDM_ROLLING_HASH + const BYTE *ip = (const BYTE *)src + 1125; + U32 sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + U32 sum2; + ++ip; + for (; ip < (const BYTE *)src + 1125 + 100; ip++) { + sum2 = LDM_updateRollingHash(sum, LDM_HASH_LENGTH, + ip[-1], ip[LDM_HASH_LENGTH - 1]); + sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); + } +#endif +} + diff --git a/contrib/long_distance_matching/versions/v3/ldm.h b/contrib/long_distance_matching/versions/v0.4/ldm.h similarity index 85% rename from contrib/long_distance_matching/versions/v3/ldm.h rename to contrib/long_distance_matching/versions/v0.4/ldm.h index 287d444d..a34faac4 100644 --- a/contrib/long_distance_matching/versions/v3/ldm.h +++ b/contrib/long_distance_matching/versions/v0.4/ldm.h @@ -16,4 +16,7 @@ size_t LDM_decompress(const void *src, size_t srcSize, void LDM_readHeader(const void *src, size_t *compressSize, size_t *decompressSize); +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v3/main-ldm.c b/contrib/long_distance_matching/versions/v0.4/main-ldm.c similarity index 98% rename from contrib/long_distance_matching/versions/v3/main-ldm.c rename to contrib/long_distance_matching/versions/v0.4/main-ldm.c index 724d735d..f8ae5469 100644 --- a/contrib/long_distance_matching/versions/v3/main-ldm.c +++ b/contrib/long_distance_matching/versions/v0.4/main-ldm.c @@ -15,6 +15,7 @@ // #define BUF_SIZE 16*1024 // Block size #define DEBUG +//#define TEST //#define ZSTD @@ -74,6 +75,11 @@ static int compress(const char *fname, const char 
*oname) { return 1; } +#ifdef TEST + LDM_test(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); +#endif + #ifdef ZSTD compressSize = ZSTD_compress(dst, statbuf.st_size, src, statbuf.st_size, 1); @@ -144,11 +150,6 @@ static int decompress(const char *fname, const char *oname) { /* Read the header. */ LDM_readHeader(src, &compressSize, &decompressSize); -#ifdef DEBUG - printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - (size_t)statbuf.st_size, compressSize, decompressSize); -#endif - /* Go to the location corresponding to the last byte. */ if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { perror("lseek error"); @@ -256,7 +257,7 @@ int main(int argc, const char *argv[]) { return 1; } gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", + printf("Total compress time = %f seconds\n", (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec)); } @@ -270,7 +271,7 @@ int main(int argc, const char *argv[]) { return 1; } gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", + printf("Total decompress time = %f seconds\n", (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec)); } diff --git a/contrib/long_distance_matching/versions/v3/util.c b/contrib/long_distance_matching/versions/v0.4/util.c similarity index 92% rename from contrib/long_distance_matching/versions/v3/util.c rename to contrib/long_distance_matching/versions/v0.4/util.c index 9ea4ca1e..70fcbc2c 100644 --- a/contrib/long_distance_matching/versions/v3/util.c +++ b/contrib/long_distance_matching/versions/v0.4/util.c @@ -61,4 +61,9 @@ void LDM_copy8(void *dst, const void *src) { memcpy(dst, src, 8); } +BYTE LDM_readByte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} diff --git a/contrib/long_distance_matching/versions/v3/util.h b/contrib/long_distance_matching/versions/v0.4/util.h similarity index 91% rename from contrib/long_distance_matching/versions/v3/util.h rename to 
contrib/long_distance_matching/versions/v0.4/util.h index 90726412..d1c3c999 100644 --- a/contrib/long_distance_matching/versions/v3/util.h +++ b/contrib/long_distance_matching/versions/v0.4/util.h @@ -19,5 +19,7 @@ uint64_t LDM_read64(const void *ptr); void LDM_copy8(void *dst, const void *src); +uint8_t LDM_readByte(const void *ptr); + #endif /* LDM_UTIL_H */ diff --git a/contrib/long_distance_matching/versions/v1/ldm.c b/contrib/long_distance_matching/versions/v1/ldm.c deleted file mode 100644 index 266425f8..00000000 --- a/contrib/long_distance_matching/versions/v1/ldm.c +++ /dev/null @@ -1,394 +0,0 @@ -#include -#include -#include -#include - -#include "ldm.h" - -#define LDM_MEMORY_USAGE 14 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define WINDOW_SIZE (1 << 20) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define MINMATCH 4 - -#define ML_BITS 4 -#define ML_MASK ((1U<>8); - } -} - -static U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -static U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - - -static void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { - BYTE *d = (BYTE *)dstPtr; - const BYTE *s = (const BYTE *)srcPtr; - BYTE * const e = (BYTE *)dstEnd; - - do { - LDM_copy8(d, s); - d += 8; - s += 8; - } while (d < e); - -} - -struct hash_entry { - U64 offset; - tag t; -}; - -static U32 LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static U32 LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return 
(U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} - -static U32 LDM_hash_position(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, - const BYTE *srcBase) { - U32 *hashTable = (U32 *) tableBase; - hashTable[h] = (U32)(p - srcBase); -} - -static void LDM_put_position(const BYTE *p, void *tableBase, - const BYTE *srcBase) { - U32 const h = LDM_hash_position(p); - LDM_put_position_on_hash(p, h, tableBase, srcBase); -} - -static const BYTE *LDM_get_position_on_hash( - U32 h, void *tableBase, const BYTE *srcBase) { - const U32 * const hashTable = (U32*)tableBase; - return hashTable[h] + srcBase; -} - -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_read_header(void const *source, size_t *compressed_size, - size_t *decompressed_size) { - const U32 *ip = (const U32 *)source; - *compressed_size = *ip++; - *decompressed_size = *ip; -} - -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size) { - const BYTE * const istart = (const BYTE*)source; - const BYTE *ip = istart; - const BYTE * const iend = istart + source_size; - const BYTE *ilimit = iend - HASH_SIZE; - const BYTE * const matchlimit = iend - HASH_SIZE; - const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = (BYTE*) dest; - U32 hashTable[LDM_HASHTABLESIZE_U32]; - memset(hashTable, 0, sizeof(hashTable)); - - const BYTE *anchor = (const BYTE *)source; -// struct LDM_cctx cctx; - size_t output_size = 0; - - U32 forwardH; - - /* Hash first byte: put into hash table 
*/ - - LDM_put_position(ip, hashTable, istart); - ip++; - forwardH = LDM_hash_position(ip); - - //TODO Loop terminates before ip>=ilimit. - while (ip < ilimit) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned step = 1; - - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - match = LDM_get_position_on_hash(h, hashTable, istart); - - forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(ip, h, hashTable, istart); - } while (ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(ip)); - } - - // TODO catchup - while (ip > anchor && match > istart && ip[-1] == match[-1]) { - ip--; - match--; - } - - /* Encode literals */ - { - unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - -#ifdef LDM_DEBUG - printf("Cur position: %zu\n", anchor - istart); - printf("LitLength %zu. (Match offset). %zu\n", litLength, ip - match); -#endif - /* - fwrite(match, 4, 1, stdout); - printf("\n"); - */ - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *op++ = 255; - } - *op++ = (BYTE)len; - } else { - *token = (BYTE)(litLength << ML_BITS); - } -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(anchor, litLength, 1, stdout); - printf("\n"); -#endif - memcpy(op, anchor, litLength); - //LDM_wild_copy(op, anchor, op + litLength); - op += litLength; - } -_next_match: - /* Encode offset */ - { - LDM_write32(op, ip - match); - op += 4; - } - - /* Encode Match Length */ - { - unsigned matchCode; - matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, - matchlimit); -#ifdef LDM_DEBUG - printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(ip, MINMATCH + matchCode, 1, stdout); - printf("\n"); -#endif - ip += MINMATCH + matchCode; - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LDM_write32(op, 
0xFFFFFFFF); - while (matchCode >= 4*0xFF) { - op += 4; - LDM_write32(op, 0xffffffff); - matchCode -= 4*0xFF; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else { - *token += (BYTE)(matchCode); - } -#ifdef LDM_DEBUG - printf("\n"); -#endif - } - - anchor = ip; - - LDM_put_position(ip, hashTable, istart); - forwardH = LDM_hash_position(++ip); - } -_last_literals: - /* Encode last literals */ - { - size_t const lastRun = (size_t)(iend - anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *op++ = 255; - } - *op++ = (BYTE)accumulator; - } else { - *op++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(op, anchor, lastRun); - op += lastRun; - } - return (op - (BYTE *)dest); -} - -size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)source; - const BYTE * const iend = ip + compressed_size; - BYTE *op = (BYTE *)dest; - BYTE * const oend = op + max_decompressed_size; - BYTE *cpy; - - while (ip < iend) { - size_t length; - const BYTE *match; - size_t offset; - - /* get literal length */ - unsigned const token = *ip++; - if ((length=(token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } -#ifdef LDM_DEBUG - printf("Literal length: %zu\n", length); -#endif - - /* copy literals */ - cpy = op + length; -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(ip, length, 1, stdout); - printf("\n"); -#endif - memcpy(op, ip, length); -// LDM_wild_copy(op, ip, cpy); - ip += length; - op = cpy; - - /* get offset */ - offset = LDM_read32(ip); - -#ifdef LDM_DEBUG - printf("Offset: %zu\n", offset); -#endif - ip += 4; - match = op - offset; - // LDM_write32(op, (U32)offset); - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); 
- } - length += MINMATCH; -#ifdef LDM_DEBUG - printf("Match length: %zu\n", length); -#endif - /* copy match */ - cpy = op + length; - - // Inefficient for now - - while (match < cpy - offset && op < oend) { - *op++ = *match++; - } - } -// memcpy(dest, source, compressed_size); - return op - (BYTE *)dest; -} - - diff --git a/contrib/long_distance_matching/versions/v1/ldm.h b/contrib/long_distance_matching/versions/v1/ldm.h deleted file mode 100644 index f4ca25a3..00000000 --- a/contrib/long_distance_matching/versions/v1/ldm.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size); - -size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size); - -void LDM_read_header(void const *source, size_t *compressed_size, - size_t *decompressed_size); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v1/main-ldm.c b/contrib/long_distance_matching/versions/v1/main-ldm.c deleted file mode 100644 index 10869cce..00000000 --- a/contrib/long_distance_matching/versions/v1/main-ldm.c +++ /dev/null @@ -1,459 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG - -//#define ZSTD - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough 
memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} -#endif - -static size_t compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - size_t size_in = statbuf.st_size; - - /* go to the location corresponding to the last byte */ - if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; - - /* mmap the output file */ - if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - #ifdef ZSTD - size_t size_out = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); - #else - size_t size_out = LDM_compress(src, dst + 
LDM_HEADER_SIZE, statbuf.st_size, - statbuf.st_size); - size_out += LDM_HEADER_SIZE; - - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &size_out, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - printf("Compressed size: %zu\n", size_out); - printf("Decompressed size: %zu\n", statbuf.st_size); - #endif - ftruncate(fdout, size_out); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)size_out, oname, - (double)size_out / (statbuf.st_size) * 100); - - close(fdin); - close(fdout); - return 0; -} - -static size_t decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* read header */ - size_t compressed_size, decompressed_size; - LDM_read_header(src, &compressed_size, &decompressed_size); - - printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", - statbuf.st_size, compressed_size, decompressed_size); - - /* go to the location corresponding to the last byte */ - if (lseek(fdout, decompressed_size - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - /* 
Copy input file to output file */ -// memcpy(dst, src, statbuf.st_size); - - #ifdef ZSTD - size_t size_out = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); - #else - size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, - statbuf.st_size - LDM_HEADER_SIZE, - decompressed_size); - printf("Ret size out: %zu\n", size_out); - #endif - ftruncate(fdout, size_out); - - close(fdin); - close(fdout); - return 0; -} - -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) { - break; - } - if (0 == result) { - result = memcmp(b0, b1, r0); - } - } - return result; -} - -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - struct timeval tv1, tv2; - /* compress */ - { - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, 
NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - -#if 0 -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE 
*decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/versions/v2/Makefile b/contrib/long_distance_matching/versions/v2/Makefile deleted file mode 100644 index 4e04fd6a..00000000 --- a/contrib/long_distance_matching/versions/v2/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. -# ################################################################ - -# This Makefile presumes libzstd is installed, using `sudo make install` - - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: main-ldm - - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-ldm : ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm - @echo Cleaning completed - diff --git a/contrib/long_distance_matching/versions/v2/ldm.c b/contrib/long_distance_matching/versions/v2/ldm.c deleted file mode 100644 index 9081d136..00000000 --- a/contrib/long_distance_matching/versions/v2/ldm.c +++ /dev/null @@ -1,436 +0,0 @@ -#include -#include -#include -#include - -#include "ldm.h" - -#define HASH_EVERY 7 - -#define LDM_MEMORY_USAGE 14 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define 
WINDOW_SIZE (1 << 20) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 8 -#define MINMATCH 8 - -#define ML_BITS 4 -#define ML_MASK ((1U<>8); - } -} - -static U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -static U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -static void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -typedef struct compress_stats { - U32 num_matches; - U32 total_match_length; - U32 total_literal_length; - U64 total_offset; -} compress_stats; - -static void LDM_printCompressStats(const compress_stats *stats) { - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Total number of matches: %u\n", stats->num_matches); - printf("Average match length: %.1f\n", ((double)stats->total_match_length) / - (double)stats->num_matches); - printf("Average literal length: %.1f\n", - ((double)stats->total_literal_length) / (double)stats->num_matches); - printf("Average offset length: %.1f\n", - ((double)stats->total_offset) / (double)stats->num_matches); - printf("=====================\n"); -} - -// TODO: unused. 
-struct hash_entry { - U64 offset; - tag t; -}; - -static U32 LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static U32 LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} - -static U32 LDM_hash_position(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, - const BYTE *srcBase) { - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { - return; - } - - U32 *hashTable = (U32 *) tableBase; - hashTable[h] = (U32)(p - srcBase); -} - -static void LDM_put_position(const BYTE *p, void *tableBase, - const BYTE *srcBase) { - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { - return; - } - U32 const h = LDM_hash_position(p); - LDM_put_position_on_hash(p, h, tableBase, srcBase); -} - -static const BYTE *LDM_get_position_on_hash( - U32 h, void *tableBase, const BYTE *srcBase) { - const U32 * const hashTable = (U32*)tableBase; - return hashTable[h] + srcBase; -} - -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_read_header(const void *src, size_t *compressSize, - size_t *decompressSize) { - const U32 *ip = (const U32 *)src; - *compressSize = *ip++; - *decompressSize = *ip; -} - -// TODO: maxDstSize is unused -size_t LDM_compress(const void *src, size_t 
srcSize, - void *dst, size_t maxDstSize) { - const BYTE * const istart = (const BYTE*)src; - const BYTE *ip = istart; - const BYTE * const iend = istart + srcSize; - const BYTE *ilimit = iend - HASH_SIZE; - const BYTE * const matchlimit = iend - HASH_SIZE; - const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = (BYTE*) dst; - - compress_stats compressStats = { 0 }; - - U32 hashTable[LDM_HASHTABLESIZE_U32]; - memset(hashTable, 0, sizeof(hashTable)); - - const BYTE *anchor = (const BYTE *)src; -// struct LDM_cctx cctx; - size_t output_size = 0; - - U32 forwardH; - - /* Hash first byte: put into hash table */ - - LDM_put_position(ip, hashTable, istart); - const BYTE *lastHash = ip; - ip++; - forwardH = LDM_hash_position(ip); - - //TODO Loop terminates before ip>=ilimit. - while (ip < ilimit) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned step = 1; - - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - match = LDM_get_position_on_hash(h, hashTable, istart); - - forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(ip, h, hashTable, istart); - lastHash = ip; - } while (ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(ip)); - } - compressStats.num_matches++; - - /* Catchup: look back to extend match from found match */ - while (ip > anchor && match > istart && ip[-1] == match[-1]) { - ip--; - match--; - } - - /* Encode literals */ - { - unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - - compressStats.total_literal_length += litLength; - -#ifdef LDM_DEBUG - printf("Cur position: %zu\n", anchor - istart); - printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); -#endif - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *op++ = 255; - } - *op++ = (BYTE)len; - } else { - *token = (BYTE)(litLength << ML_BITS); - } -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(anchor, litLength, 1, stdout); - printf("\n"); -#endif - memcpy(op, anchor, litLength); - op += litLength; - } -_next_match: - /* Encode offset */ - { - /* - LDM_writeLE16(op, ip-match); - op += 2; - */ - LDM_write32(op, ip - match); - op += 4; - compressStats.total_offset += (ip - match); - } - - /* Encode Match Length */ - { - unsigned matchCode; - matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, - matchlimit); -#ifdef LDM_DEBUG - printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(ip, MINMATCH + matchCode, 1, stdout); - printf("\n"); -#endif - compressStats.total_match_length += matchCode + MINMATCH; - unsigned ctr = 1; - ip++; - for (; ctr < MINMATCH + matchCode; ip++, ctr++) { - LDM_put_position(ip, hashTable, istart); - } -// ip += MINMATCH + matchCode; - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LDM_write32(op, 0xFFFFFFFF); - while (matchCode >= 4*0xFF) { - op += 4; - LDM_write32(op, 0xffffffff); - matchCode -= 4*0xFF; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else { - *token += (BYTE)(matchCode); - } -#ifdef LDM_DEBUG - printf("\n"); - -#endif - } - - anchor = ip; - - LDM_put_position(ip, hashTable, istart); - forwardH = LDM_hash_position(++ip); - lastHash = ip; - } -_last_literals: - /* Encode last literals */ - { - size_t const lastRun = (size_t)(iend - anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *op++ = 255; - } - *op++ = (BYTE)accumulator; - } else { - *op++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(op, anchor, lastRun); - op += lastRun; - } 
- LDM_printCompressStats(&compressStats); - return (op - (BYTE *)dst); -} - -typedef struct LDM_DCtx { - const BYTE * const ibase; /* Pointer to base of input */ - const BYTE *ip; /* Pointer to current input position */ - const BYTE *iend; /* End of source */ - BYTE *op; /* Pointer to output */ - const BYTE * const oend; /* Pointer to end of output */ - -} LDM_DCtx; - -size_t LDM_decompress(const void *src, size_t compressed_size, - void *dst, size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)src; - const BYTE * const iend = ip + compressed_size; - BYTE *op = (BYTE *)dst; - BYTE * const oend = op + max_decompressed_size; - BYTE *cpy; - - while (ip < iend) { - size_t length; - const BYTE *match; - size_t offset; - - /* get literal length */ - unsigned const token = *ip++; - if ((length=(token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } -#ifdef LDM_DEBUG - printf("Literal length: %zu\n", length); -#endif - - /* copy literals */ - cpy = op + length; -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(ip, length, 1, stdout); - printf("\n"); -#endif - memcpy(op, ip, length); - ip += length; - op = cpy; - - /* get offset */ - /* - offset = LDM_readLE16(ip); - ip += 2; - */ - offset = LDM_read32(ip); - ip += 4; -#ifdef LDM_DEBUG - printf("Offset: %zu\n", offset); -#endif - match = op - offset; - // LDM_write32(op, (U32)offset); - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } - length += MINMATCH; -#ifdef LDM_DEBUG - printf("Match length: %zu\n", length); -#endif - /* copy match */ - cpy = op + length; - - // Inefficient for now - while (match < cpy - offset && op < oend) { - *op++ = *match++; - } - } - return op - (BYTE *)dst; -} - - diff --git a/contrib/long_distance_matching/versions/v2/ldm.h b/contrib/long_distance_matching/versions/v2/ldm.h deleted file mode 100644 index 0ac7b2ec..00000000 
--- a/contrib/long_distance_matching/versions/v2/ldm.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -size_t LDM_decompress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -void LDM_read_header(const void *src, size_t *compressSize, - size_t *decompressSize); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v2/main-ldm.c b/contrib/long_distance_matching/versions/v2/main-ldm.c deleted file mode 100644 index 0017335b..00000000 --- a/contrib/long_distance_matching/versions/v2/main-ldm.c +++ /dev/null @@ -1,474 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG - -//#define ZSTD - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. - */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; - - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? 
*/ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - size_t compressSize = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); -#else - size_t compressSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); - - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); - printf("Decompressed size: %zu\n", statbuf.st_size); -#endif -#endif - - // Truncate file to compressSize. - ftruncate(fdout, compressSize); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. 
*/ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - size_t compressSize, decompressSize; - LDM_read_header(src, &compressSize, &decompressSize); - -#ifdef DEBUG - printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - statbuf.st_size, compressSize, decompressSize); -#endif - - /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - size_t outSize = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); -#else - size_t outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); - - printf("Ret size out: %zu\n", outSize); - #endif - ftruncate(fdout, outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. 
- */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. */ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - struct timeval tv1, tv2; - - /* Compress */ - - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - - /* Decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - 
tv1.tv_sec)); - - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ 
- { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - From e0d416246403c6606bff15eb844b6c12155751de Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 09:50:24 -0700 Subject: [PATCH 21/62] Minor fix for non-rolling hash --- contrib/long_distance_matching/ldm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 79648097..a1fe6174 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -518,10 +518,11 @@ static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { // printf("Loop\n"); if (cctx->ip > cctx->lastPosHashed) { - LDM_updateLastHashFromNextHash(cctx); -// LDM_putHashOfCurrentPosition(cctx); #ifdef LDM_ROLLING_HASH + LDM_updateLastHashFromNextHash(cctx); LDM_setNextHash(cctx); +#else + LDM_putHashOfCurrentPosition(cctx); #endif } /* @@ -594,9 +595,10 @@ size_t LDM_compress(const void *src, size_t srcSize, // Set start of next block to current input pointer. 
cctx.anchor = cctx.ip; +#ifdef LDM_ROLLING_HASH LDM_updateLastHashFromNextHash(&cctx); -// LDM_putHashOfCurrentPosition(&cctx); -#ifndef LDM_ROLLING_HASH +#else + LDM_putHashOfCurrentPosition(&cctx); cctx.ip++; #endif From 3a48ffd4fd7a63ea36e2abae141a4fd4b7df847d Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 10:53:19 -0700 Subject: [PATCH 22/62] Fix sumToHash to use hash space more efficiently --- contrib/long_distance_matching/ldm.c | 52 +++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index a1fe6174..4ed09cff 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -7,9 +7,9 @@ #include "ldm.h" #include "util.h" -#define HASH_EVERY 1 +#define HASH_EVERY 7 -#define LDM_MEMORY_USAGE 22 +#define LDM_MEMORY_USAGE 18 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -132,7 +132,7 @@ static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { curMatch += 8; } if (lengthLeft > 0) { - return LDM_read32(curP) == LDM_read32(curMatch); + return (LDM_read32(curP) == LDM_read32(curMatch)); } return 1; } @@ -144,8 +144,15 @@ static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { * Convert a sum computed from LDM_getRollingHash to a hash value in the range * of the hash table. 
*/ +#define LDM_SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) +#define LDM_SUM2HASH(sum) (LDM_SUM2HASH2((sum)&0xFFFF,(sum)>>16)) + static hash_t LDM_sumToHash(U32 sum) { - return sum & (LDM_HASH_SIZE_U32 - 1); +// return sum & (LDM_HASH_SIZE_U32 - 1); +// return sum % (LDM_HASHTABLESIZE_U32 ); + + return ((sum* 2654435761U) >> ((32)-LDM_HASHLOG)); +// return LDM_SUM2HASH2(sum&0xFFFF, sum >> 16); } static U32 LDM_getRollingHash(const char *data, U32 len) { @@ -240,15 +247,32 @@ static void LDM_putHashOfCurrentPositionFromHash( } */ #ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32 ) { offset_t offset = (cctx->hashTable)[hash].offset; cctx->stats.numHashInserts++; - if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { +// printf("%u %u %zu\n", hash, offset, cctx->ip - cctx->ibase); +// printf("TST: %u %u\n", LDM_read32(cctx->ip), LDM_read32(offset + cctx->ibase)); cctx->stats.numCollisions++; } } + #endif - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; + + if (((cctx->ip - cctx->ibase) & HASH_EVERY) == HASH_EVERY) { +#ifdef COMPUTE_STATS + /* + offset_t offset = (cctx->hashTable)[hash].offset; + if (offset == 0) { + printf("NEW HASH: %u\n", hash); + } + */ +#endif + + (cctx->hashTable)[hash] = (LDM_hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + } + + // Book-keeping cctx->lastPosHashed = cctx->ip; cctx->lastHash = hash; cctx->lastSum = sum; @@ -296,7 +320,8 @@ static void LDM_putHashOfCurrentPositionFromHash( if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { offset_t offset = (cctx->hashTable)[hash].offset; cctx->stats.numHashInserts++; - if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + + if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; } } @@ -629,6 +654,17 @@ 
_last_literals: cctx.op += lastRun; } LDM_printCompressStats(&cctx.stats); + + { + U32 tmp = 0; + U32 ctr = 0; + for (; tmp < LDM_HASH_SIZE_U32; tmp++) { + if ((cctx.hashTable)[tmp].offset == 0) { + ctr++; + } + } + printf("HASH: %u %u\n", ctr, LDM_HASH_SIZE_U32); + } return (cctx.op - (const BYTE *)cctx.obase); } From 8ff8cdb15bae9c2d5db78477958ff90172cb83b9 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 15:11:06 -0700 Subject: [PATCH 23/62] [ldm] Clean up code --- contrib/long_distance_matching/ldm.c | 515 ++++++-------- contrib/long_distance_matching/ldm.h | 33 + contrib/long_distance_matching/main-ldm.c | 20 +- .../versions/v0.5/Makefile | 40 ++ .../versions/v0.5/ldm.c | 659 ++++++++++++++++++ .../versions/v0.5/ldm.h | 26 + .../versions/v0.5/main-ldm.c | 468 +++++++++++++ .../versions/v0.5/util.c | 69 ++ .../versions/v0.5/util.h | 25 + 9 files changed, 1528 insertions(+), 327 deletions(-) create mode 100644 contrib/long_distance_matching/versions/v0.5/Makefile create mode 100644 contrib/long_distance_matching/versions/v0.5/ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.5/ldm.h create mode 100644 contrib/long_distance_matching/versions/v0.5/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.5/util.c create mode 100644 contrib/long_distance_matching/versions/v0.5/util.h diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 4ed09cff..b17a0f15 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -3,34 +3,30 @@ #include #include - #include "ldm.h" #include "util.h" -#define HASH_EVERY 7 +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. 
+#define HASH_ONLY_EVERY 0 -#define LDM_MEMORY_USAGE 18 +#define LDM_MEMORY_USAGE 20 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 23) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define LDM_HASH_LENGTH 4 +#define WINDOW_SIZE (1 << 20) -// Should be multiple of four -#define MINMATCH 4 +//These should be multiples of four. +#define LDM_HASH_LENGTH 100 +#define MINMATCH 100 #define ML_BITS 4 #define ML_MASK ((1U<numMatches); - printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("Average literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); - printf("Average offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", - stats->numCollisions, stats->numHashInserts, - stats->numHashInserts == 0 ? - 1.0 : (100.0 * (double)stats->numCollisions) / - (double)stats->numHashInserts); - printf("=====================\n"); -} - typedef struct LDM_CCtx { size_t isize; /* Input size */ size_t maxOSize; /* Maximum output size */ @@ -101,25 +78,79 @@ typedef struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + hashEntry hashTable[LDM_HASHTABLESIZE_U32]; const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ - const BYTE *nextIp; + U32 lastSum; + + const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextPosHashed; hash_t nextHash; /* Hash corresponding to nextPosHashed */ - - // Members for rolling hash. - U32 lastSum; U32 nextSum; - unsigned step; + unsigned step; // ip step, should be 1. 
// DEBUG const BYTE *DEBUG_setNextHash; } LDM_CCtx; +/** + * Outputs compression statistics. + */ +static void printCompressStats(const LDM_CCtx *cctx) { + const LDM_compressStats *stats = &(cctx->stats); +#ifdef COMPUTE_STATS + printf("=====================\n"); + printf("Compression statistics\n"); + printf("Total number of matches: %u\n", stats->numMatches); + printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("Average literal length: %.1f\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("Average offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + stats->numCollisions, stats->numHashInserts, + stats->numHashInserts == 0 ? + 1.0 : (100.0 * (double)stats->numCollisions) / + (double)stats->numHashInserts); + + // Output occupancy of hash table. + { + U32 i = 0; + U32 ctr = 0; + for (; i < LDM_HASHTABLESIZE_U32; i++) { + if ((cctx->hashTable)[i].offset == 0) { + ctr++; + } + } + printf("Hash table size, empty slots, %% empty: %u %u %.3f\n", + LDM_HASHTABLESIZE_U32, ctr, + 100.0 * (double)(ctr) / (double)LDM_HASHTABLESIZE_U32); + } + + printf("=====================\n"); +#endif +} + +/** + * Checks whether the MINMATCH bytes from p are the same as the MINMATCH + * bytes from match. + * + * This assumes MINMATCH is a multiple of four. + * + * Return 1 if valid, 0 otherwise. + */ static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { + /* + if (memcmp(p, match, MINMATCH) == 0) { + return 1; + } + return 0; + */ + + //TODO: This seems to be faster for some reason? 
U16 lengthLeft = MINMATCH; const BYTE *curP = p; const BYTE *curMatch = match; @@ -137,25 +168,22 @@ static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { return 1; } - - -#ifdef LDM_ROLLING_HASH /** - * Convert a sum computed from LDM_getRollingHash to a hash value in the range + * Convert a sum computed from getChecksum to a hash value in the range * of the hash table. */ -#define LDM_SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF) -#define LDM_SUM2HASH(sum) (LDM_SUM2HASH2((sum)&0xFFFF,(sum)>>16)) - -static hash_t LDM_sumToHash(U32 sum) { -// return sum & (LDM_HASH_SIZE_U32 - 1); -// return sum % (LDM_HASHTABLESIZE_U32 ); - - return ((sum* 2654435761U) >> ((32)-LDM_HASHLOG)); -// return LDM_SUM2HASH2(sum&0xFFFF, sum >> 16); +static hash_t checksumToHash(U32 sum) { + return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG)); } -static U32 LDM_getRollingHash(const char *data, U32 len) { +/** + * Computes a checksum based on rsync's checksum. + * + * a(k,l) = \sum_{i = k}^l x_i (mod M) + * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) + * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) + */ +static U32 getChecksum(const char *data, U32 len) { U32 i; U32 s1, s2; const schar *buf = (const schar *)data; @@ -173,34 +201,31 @@ static U32 LDM_getRollingHash(const char *data, U32 len) { return (s1 & 0xffff) + (s2 << 16); } -typedef struct LDM_sumStruct { - U16 s1, s2; -} LDM_sumStruct; - -static U32 LDM_updateRollingHash(U32 sum, U32 len, - schar toRemove, schar toAdd) { +/** + * Update a checksum computed from getChecksum(data, len). + * + * The checksum can be updated along its ends as follows: + * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) + * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) + * + * Thus toRemove should correspond to data[0]. 
+ */ +static U32 updateChecksum(U32 sum, U32 len, + schar toRemove, schar toAdd) { U32 s1 = (sum & 0xffff) - toRemove + toAdd; U32 s2 = (sum >> 16) - (toRemove * len) + s1; return (s1 & 0xffff) + (s2 << 16); } - -/* -static hash_t LDM_hashPosition(const void * const p) { - return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); -} -*/ - -/* -static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { - sumStruct->s1 = sum & 0xffff; - sumStruct->s2 = sum >> 16; -} -*/ - -static void LDM_setNextHash(LDM_CCtx *cctx) { - +/** + * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed + * based on cctx->lastSum and cctx->lastPosHashed. + * + * This uses a rolling hash and requires that the last position hashed + * corresponds to cctx->nextIp - step. + */ +static void setNextHash(LDM_CCtx *cctx) { #ifdef RUN_CHECKS U32 check; if ((cctx->nextIp - cctx->ibase != 1) && @@ -212,160 +237,100 @@ static void LDM_setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); - cctx->nextSum = LDM_updateRollingHash( +// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, (schar)((cctx->lastPosHashed)[0]), (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + cctx->nextPosHashed = cctx->nextIp; + cctx->nextHash = checksumToHash(cctx->nextSum); #ifdef RUN_CHECKS - check = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); + check = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); if (check != cctx->nextSum) { printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); -// printf("INFO: %u %u %u\n", LDM_read32(cctx->nextIp), } -#endif - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = LDM_sumToHash(cctx->nextSum); -#ifdef RUN_CHECKS if ((cctx->nextIp - cctx->lastPosHashed) != 1) { printf("setNextHash: nextIp != lastPosHashed + 
1. %zu %zu %zu\n", cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, cctx->ip - cctx->ibase); } #endif - } -static void LDM_putHashOfCurrentPositionFromHash( +static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { - /* - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - */ #ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32 ) { + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { offset_t offset = (cctx->hashTable)[hash].offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { -// printf("%u %u %zu\n", hash, offset, cctx->ip - cctx->ibase); -// printf("TST: %u %u\n", LDM_read32(cctx->ip), LDM_read32(offset + cctx->ibase)); cctx->stats.numCollisions++; } } - #endif - if (((cctx->ip - cctx->ibase) & HASH_EVERY) == HASH_EVERY) { -#ifdef COMPUTE_STATS - /* - offset_t offset = (cctx->hashTable)[hash].offset; - if (offset == 0) { - printf("NEW HASH: %u\n", hash); - } - */ -#endif - - (cctx->hashTable)[hash] = (LDM_hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. + // Note: this works only when cctx->step is 1. + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + (cctx->hashTable)[hash] = (hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; } - // Book-keeping cctx->lastPosHashed = cctx->ip; cctx->lastHash = hash; cctx->lastSum = sum; } +/** + * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed + * fields from the "next" fields. + * + * This requires that cctx->ip == cctx->nextPosHashed. 
+ */ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { #ifdef RUN_CHECKS if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); + printf("CHECK failed: updateLastHashFromNextHash %zu\n", + cctx->ip - cctx->ibase); } #endif - LDM_putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); + putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); } +/** + * Insert hash of the current position into the hash table. + */ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = LDM_getRollingHash((const char *)cctx->ip, LDM_HASH_LENGTH); - hash_t hash = LDM_sumToHash(sum); + U32 sum = getChecksum((const char *)cctx->ip, LDM_HASH_LENGTH); + hash_t hash = checksumToHash(sum); + #ifdef RUN_CHECKS if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", cctx->ip - cctx->ibase); - } -#endif -// hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash, sum); -// printf("Offset %zu\n", cctx->ip - cctx->ibase); -} - -#else -static hash_t LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static hash_t LDM_hashPosition(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash) { - /* - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - */ -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = (cctx->hashTable)[hash].offset; - cctx->stats.numHashInserts++; - - if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - cctx->stats.numCollisions++; - } + printf("CHECK failed: putHashOfCurrentPosition %zu\n", + cctx->ip - cctx->ibase); } #endif - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; -#ifdef RUN_CHECKS - if (cctx->ip 
- cctx->lastPosHashed != 1) { - printf("putHashError\n"); - } -#endif - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; + putHashOfCurrentPositionFromHash(cctx, hash, sum); } -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash); +/** + * Returns the position of the entry at hashTable[hash]. + */ +static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { + return cctx->hashTable[hash].offset + cctx->ibase; } -#endif - -/* -static hash_t LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} -*/ - - -static const BYTE *LDM_getPositionOnHash( - hash_t h, void *tableBase, const BYTE *srcBase) { - const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; - return hashTable[h].offset + srcBase; -} - - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { +/** + * Counts the number of bytes that match from pIn and pMatch, + * up to pInLimit. + * + * TODO: make more efficient. + */ +static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); @@ -386,9 +351,12 @@ void LDM_readHeader(const void *src, size_t *compressSize, *decompressSize = *ip; } -static void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { +/** + * Initialize a compression context. 
+ */ +static void initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { cctx->isize = srcSize; cctx->maxOSize = maxDstSize; @@ -396,11 +364,7 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->ip = cctx->ibase; cctx->iend = cctx->ibase + srcSize; -#ifdef LDM_ROLLING_HASH cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; -#else - cctx->ihashLimit = cctx->iend - HASH_SIZE; -#endif cctx->imatchLimit = cctx->iend - MINMATCH; cctx->obase = (BYTE *)dst; @@ -413,23 +377,27 @@ static void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->lastPosHashed = NULL; - cctx->step = 1; + cctx->step = 1; // Fixed to be 1 for now. Changing may break things. cctx->nextIp = cctx->ip + cctx->step; cctx->nextPosHashed = 0; cctx->DEBUG_setNextHash = 0; } -#ifdef LDM_ROLLING_HASH +/** + * Finds the "best" match. + * + * Returns 0 if successful and 1 otherwise (i.e. no match can be found + * in the remaining input that is long enough). + * + */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { cctx->nextIp = cctx->ip + cctx->step; do { hash_t h; U32 sum; -// printf("Call A\n"); - LDM_setNextHash(cctx); -// printf("End call a\n"); + setNextHash(cctx); h = cctx->nextHash; sum = cctx->nextSum; cctx->ip = cctx->nextIp; @@ -439,62 +407,27 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); - -// // Compute cctx->nextSum and cctx->nextHash from cctx->nextIp. 
-// LDM_setNextHash(cctx); - LDM_putHashOfCurrentPositionFromHash(cctx, h, sum); - -// printf("%u %u\n", cctx->lastHash, cctx->nextHash); - } while (cctx->ip - *match > WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); -// LDM_read64(*match) != LDM_read64(cctx->ip)); - LDM_setNextHash(cctx); - return 0; -} -#else -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->nextIp = cctx->ip; - - do { - hash_t const h = cctx->nextHash; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); - - cctx->nextHash = LDM_hashPosition(cctx->nextIp); - LDM_putHashOfCurrentPositionFromHash(cctx, h); + *match = getPositionOnHash(cctx, h); + putHashOfCurrentPositionFromHash(cctx, h, sum); } while (cctx->ip - *match > WINDOW_SIZE || !LDM_isValidMatch(cctx->ip, *match)); + setNextHash(cctx); return 0; } -#endif - /** * Write current block (literals, literal length, match offset, * match length). * - * Update input pointer, inserting hashes into hash table along the - * way. + * Update input pointer, inserting hashes into hash table along the way. */ -static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { - unsigned const literalLength = (unsigned)(cctx->ip - cctx->anchor); - unsigned const offset = cctx->ip - match; - unsigned const matchLength = LDM_count( - cctx->ip + MINMATCH, match + MINMATCH, cctx->ihashLimit); +static void outputBlock(LDM_CCtx *cctx, + unsigned const literalLength, + unsigned const offset, + unsigned const matchLength) { BYTE *token = cctx->op++; - cctx->stats.totalLiteralLength += literalLength; - cctx->stats.totalOffset += offset; - cctx->stats.totalMatchLength += matchLength + MINMATCH; - /* Encode the literal length. 
*/ if (literalLength >= RUN_MASK) { int len = (int)literalLength - RUN_MASK; @@ -515,7 +448,7 @@ static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { LDM_write32(cctx->op, offset); cctx->op += LDM_OFFSET_SIZE; - /* Encode match length */ + /* Encode the match length. */ if (matchLength >= ML_MASK) { unsigned matchLengthRemaining = matchLength; *token += ML_MASK; @@ -531,62 +464,21 @@ static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { } else { *token += (BYTE)(matchLength); } - -// LDM_setNextHash(cctx); -// cctx->ip = cctx->lastPosHashed + 1; -// cctx->nextIp = cctx->ip + cctx->step; -// printf("HERE: %zu %zu %zu\n", cctx->ip - cctx->ibase, -// cctx->lastPosHashed - cctx->ibase, cctx->nextIp - cctx->ibase); - - cctx->nextIp = cctx->ip + cctx->step; - - while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { -// printf("Loop\n"); - if (cctx->ip > cctx->lastPosHashed) { -#ifdef LDM_ROLLING_HASH - LDM_updateLastHashFromNextHash(cctx); - LDM_setNextHash(cctx); -#else - LDM_putHashOfCurrentPosition(cctx); -#endif - } - /* - printf("Call b %zu %zu %zu\n", - cctx->lastPosHashed - cctx->ibase, - cctx->nextIp - cctx->ibase, - cctx->ip - cctx->ibase); - */ -// printf("end call b\n"); - cctx->ip++; - cctx->nextIp++; - } - -// printf("There: %zu %zu\n", cctx->ip - cctx->ibase, cctx->lastPosHashed - cctx->ibase); } // TODO: srcSize and maxDstSize is unused +// This is based upon lz4. size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. 
*/ LDM_putHashOfCurrentPosition(&cctx); -#ifdef LDM_ROLLING_HASH -// LDM_setNextHash(&cctx); -// tmp_hash = LDM_updateRollingHash(cctx.lastSum, LDM_HASH_LENGTH, -// cctx.ip[0], cctx.ip[LDM_HASH_LENGTH]); -// printf("Update test: %u %u\n", tmp_hash, cctx.nextSum); -// cctx.ip++; -#else - cctx.ip++; - cctx.nextHash = LDM_hashPosition(cctx.ip); -#endif // TODO: loop condition is not accurate. while (1) { const BYTE *match; -// printf("Start of loop\n"); /** * Find a match. @@ -597,16 +489,15 @@ size_t LDM_compress(const void *src, size_t srcSize, if (LDM_findBestMatch(&cctx, &match) != 0) { goto _last_literals; } -// printf("End of match finding\n"); - +#ifdef COMPUTE_STATS cctx.stats.numMatches++; +#endif /** * Catch up: look back to extend the match backwards from the found match. */ while (cctx.ip > cctx.anchor && match > cctx.ibase && cctx.ip[-1] == match[-1]) { -// printf("Catch up\n"); cctx.ip--; match--; } @@ -615,26 +506,35 @@ size_t LDM_compress(const void *src, size_t srcSize, * Write current block (literals, literal length, match offset, match * length) and update pointers and hashes. */ - LDM_outputBlock(&cctx, match); -// printf("End of loop\n"); + { + unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); + unsigned const offset = cctx.ip - match; + unsigned const matchLength = countMatchLength( + cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + +#ifdef COMPUTE_STATS + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + MINMATCH; +#endif + outputBlock(&cctx, literalLength, offset, matchLength); + + // Move ip to end of block, inserting hashes at each position. + cctx.nextIp = cctx.ip + cctx.step; + while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + if (cctx.ip > cctx.lastPosHashed) { + // TODO: Simplify. 
+ LDM_updateLastHashFromNextHash(&cctx); + setNextHash(&cctx); + } + cctx.ip++; + cctx.nextIp++; + } + } // Set start of next block to current input pointer. cctx.anchor = cctx.ip; -#ifdef LDM_ROLLING_HASH LDM_updateLastHashFromNextHash(&cctx); -#else - LDM_putHashOfCurrentPosition(&cctx); - cctx.ip++; -#endif - - /* - LDM_putHashOfCurrentPosition(&cctx); - printf("Call c\n"); - LDM_setNextHash(&cctx); - printf("End call c\n"); - cctx.ip++; - cctx.nextIp++; - */ } _last_literals: /* Encode the last literals (no more matches). */ @@ -653,18 +553,11 @@ _last_literals: memcpy(cctx.op, cctx.anchor, lastRun); cctx.op += lastRun; } - LDM_printCompressStats(&cctx.stats); - { - U32 tmp = 0; - U32 ctr = 0; - for (; tmp < LDM_HASH_SIZE_U32; tmp++) { - if ((cctx.hashTable)[tmp].offset == 0) { - ctr++; - } - } - printf("HASH: %u %u\n", ctr, LDM_HASH_SIZE_U32); - } +#ifdef COMPUTE_STATS + printCompressStats(&cctx); +#endif + return (cctx.op - (const BYTE *)cctx.obase); } @@ -715,7 +608,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, } while (s == 255); } - /* Copy literals. */ + /* Copy the literals. 
*/ cpy = dctx.op + length; memcpy(dctx.op, dctx.ip, length); dctx.ip += length; @@ -748,20 +641,20 @@ size_t LDM_decompress(const void *src, size_t compressSize, return dctx.op - (BYTE *)dst; } +/* void LDM_test(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { -#ifdef LDM_ROLLING_HASH const BYTE *ip = (const BYTE *)src + 1125; - U32 sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); U32 sum2; ++ip; for (; ip < (const BYTE *)src + 1125 + 100; ip++) { - sum2 = LDM_updateRollingHash(sum, LDM_HASH_LENGTH, + sum2 = updateChecksum(sum, LDM_HASH_LENGTH, ip[-1], ip[LDM_HASH_LENGTH - 1]); - sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); + sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); } -#endif } +*/ diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index a34faac4..d7f977d9 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -7,12 +7,45 @@ #define LDM_DECOMPRESS_SIZE 4 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +/** + * Compresses src into dst. + * + * NB: This currently ignores maxDstSize and assumes enough space is available. + * + * Block format (see lz4 documentation for more information): + * github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md + * + * A block is composed of sequences. Each sequence begins with a token, which + * is a one-byte value separated into two 4-bit fields. + * + * The first field uses the four high bits of the token and encodes the literal + * length. If the field value is 0, there is no literal. If it is 15, + * additional bytes are added (each ranging from 0 to 255) to the previous + * value to produce a total length. + * + * Following the token and optional length bytes are the literals. 
+ * + * Next are the 4 bytes representing the offset of the match (2 in lz4), + * representing the position to copy the literals. + * + * The lower four bits of the token encode the match length. With additional + * bytes added similarly to the additional literal length bytes after the offset. + * + * The last sequence is incomplete and stops right after the lieterals. + * + */ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); size_t LDM_decompress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Reads the header from src and writes the compressed size and + * decompressed size into compressSize and decompressSize respectively. + * + * NB: LDM_compress and LDM_decompress currently do not add/read headers. + */ void LDM_readHeader(const void *src, size_t *compressSize, size_t *decompressSize); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index f8ae5469..fbfd789b 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -13,12 +13,9 @@ #include #include "ldm.h" -// #define BUF_SIZE 16*1024 // Block size #define DEBUG //#define TEST -//#define ZSTD - /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. 
*/ @@ -75,28 +72,25 @@ static int compress(const char *fname, const char *oname) { return 1; } +/* #ifdef TEST LDM_test(src, statbuf.st_size, dst + LDM_HEADER_SIZE, statbuf.st_size); #endif +*/ -#ifdef ZSTD - compressSize = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); -#else compressSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, statbuf.st_size); - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 memcpy(dst, &compressSize, 4); memcpy(dst + 4, &(statbuf.st_size), 4); #ifdef DEBUG printf("Compressed size: %zu\n", compressSize); printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif #endif // Truncate file to compressSize. @@ -169,17 +163,11 @@ static int decompress(const char *fname, const char *oname) { return 1; } -#ifdef ZSTD - outSize = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); -#else outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressSize); printf("Ret size out: %zu\n", outSize); - #endif ftruncate(fdout, outSize); close(fdin); diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile new file mode 100644 index 00000000..5ffd4eaf --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/Makefile @@ -0,0 +1,40 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+# ################################################################ + +# This Makefile presumes libzstd is installed, using `sudo make install` + +CFLAGS ?= -O3 +DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ + -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls +CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) +FLAGS = $(CPPFLAGS) $(CFLAGS) + +LDFLAGS += -lzstd + +.PHONY: default all clean + +default: all + +all: main-ldm + + +#main : ldm.c main.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +main-ldm : util.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ + main main-ldm + @echo Cleaning completed + diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c new file mode 100644 index 00000000..325c5040 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/ldm.c @@ -0,0 +1,659 @@ +#include +#include +#include +#include + +#include "ldm.h" +#include "util.h" + +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. +#define HASH_ONLY_EVERY 0 + +#define LDM_MEMORY_USAGE 20 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) + +#define LDM_OFFSET_SIZE 4 + +#define WINDOW_SIZE (1 << 20) + +//These should be multiples of four. 
+#define LDM_HASH_LENGTH 100 +#define MINMATCH 100 + +#define ML_BITS 4 +#define ML_MASK ((1U<stats); +#ifdef COMPUTE_STATS + printf("=====================\n"); + printf("Compression statistics\n"); + printf("Total number of matches: %u\n", stats->numMatches); + printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("Average literal length: %.1f\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("Average offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + stats->numCollisions, stats->numHashInserts, + stats->numHashInserts == 0 ? + 1.0 : (100.0 * (double)stats->numCollisions) / + (double)stats->numHashInserts); + + // Output occupancy of hash table. + { + U32 i = 0; + U32 ctr = 0; + for (; i < LDM_HASHTABLESIZE_U32; i++) { + if ((cctx->hashTable)[i].offset == 0) { + ctr++; + } + } + printf("Hash table size, empty slots, %% empty: %u %u %.3f\n", + LDM_HASHTABLESIZE_U32, ctr, + 100.0 * (double)(ctr) / (double)LDM_HASHTABLESIZE_U32); + } + + printf("=====================\n"); +#endif +} + +/** + * Checks whether the MINMATCH bytes from p are the same as the MINMATCH + * bytes from match. + * + * This assumes MINMATCH is a multiple of four. + * + * Return 1 if valid, 0 otherwise. + */ +static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { + /* + if (memcmp(p, match, MINMATCH) == 0) { + return 1; + } + return 0; + */ + + //TODO: This seems to be faster for some reason? 
+ U16 lengthLeft = MINMATCH; + const BYTE *curP = p; + const BYTE *curMatch = match; + + for (; lengthLeft >= 8; lengthLeft -= 8) { + if (LDM_read64(curP) != LDM_read64(curMatch)) { + return 0; + } + curP += 8; + curMatch += 8; + } + if (lengthLeft > 0) { + return (LDM_read32(curP) == LDM_read32(curMatch)); + } + return 1; +} + +/** + * Convert a sum computed from getChecksum to a hash value in the range + * of the hash table. + */ +static hash_t checksumToHash(U32 sum) { + return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG)); +} + +/** + * Computes a checksum based on rsync's checksum. + * + * a(k,l) = \sum_{i = k}^l x_i (mod M) + * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) + * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) + */ +static U32 getChecksum(const char *data, U32 len) { + U32 i; + U32 s1, s2; + const schar *buf = (const schar *)data; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; + } + for(; i < len; i++) { + s1 += buf[i]; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} + +/** + * Update a checksum computed from getChecksum(data, len). + * + * The checksum can be updated along its ends as follows: + * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) + * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) + * + * Thus toRemove should correspond to data[0]. + */ +static U32 updateChecksum(U32 sum, U32 len, + schar toRemove, schar toAdd) { + U32 s1 = (sum & 0xffff) - toRemove + toAdd; + U32 s2 = (sum >> 16) - (toRemove * len) + s1; + + return (s1 & 0xffff) + (s2 << 16); +} + +/** + * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed + * based on cctx->lastSum and cctx->lastPosHashed. + * + * This uses a rolling hash and requires that the last position hashed + * corresponds to cctx->nextIp - step. 
+ */ +static void setNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + U32 check; + if ((cctx->nextIp - cctx->ibase != 1) && + (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { + printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, + cctx->DEBUG_setNextHash - cctx->ibase); + } + + cctx->DEBUG_setNextHash = cctx->nextIp; +#endif + +// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = updateChecksum( + cctx->lastSum, LDM_HASH_LENGTH, + (schar)((cctx->lastPosHashed)[0]), + (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + cctx->nextPosHashed = cctx->nextIp; + cctx->nextHash = checksumToHash(cctx->nextSum); + +#ifdef RUN_CHECKS + check = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + + if (check != cctx->nextSum) { + printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); + } + + if ((cctx->nextIp - cctx->lastPosHashed) != 1) { + printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", + cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, + cctx->ip - cctx->ibase); + } +#endif +} + +static void putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash, U32 sum) { +#ifdef COMPUTE_STATS + if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + offset_t offset = (cctx->hashTable)[hash].offset; + cctx->stats.numHashInserts++; + if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { + cctx->stats.numCollisions++; + } + } +#endif + + // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. + // Note: this works only when cctx->step is 1. + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + (cctx->hashTable)[hash] = (hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + } + + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; + cctx->lastSum = sum; +} + +/** + * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed + * fields from the "next" fields. + * + * This requires that cctx->ip == cctx->nextPosHashed. 
+ */ +static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + if (cctx->ip != cctx->nextPosHashed) { + printf("CHECK failed: updateLastHashFromNextHash %zu\n", + cctx->ip - cctx->ibase); + } +#endif + putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); +} + +/** + * Insert hash of the current position into the hash table. + */ +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + U32 sum = getChecksum((const char *)cctx->ip, LDM_HASH_LENGTH); + hash_t hash = checksumToHash(sum); + +#ifdef RUN_CHECKS + if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { + printf("CHECK failed: putHashOfCurrentPosition %zu\n", + cctx->ip - cctx->ibase); + } +#endif + + putHashOfCurrentPositionFromHash(cctx, hash, sum); +} + +/** + * Returns the position of the entry at hashTable[hash]. + */ +static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { + return cctx->hashTable[hash].offset + cctx->ibase; +} + +/** + * Counts the number of bytes that match from pIn and pMatch, + * up to pInLimit. + * + * TODO: make more efficient. + */ +static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (unsigned)(pIn - pStart); + } + return (unsigned)(pIn - pStart); +} + +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize) { + const U32 *ip = (const U32 *)src; + *compressSize = *ip++; + *decompressSize = *ip; +} + +/** + * Initialize a compression context. 
+ */ +static void initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; + cctx->imatchLimit = cctx->iend - MINMATCH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)dst; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); + memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + + cctx->lastPosHashed = NULL; + + cctx->step = 1; // Fixed to be 1 for now. Changing may break things. + cctx->nextIp = cctx->ip + cctx->step; + cctx->nextPosHashed = 0; + + cctx->DEBUG_setNextHash = 0; +} + +/** + * Finds the "best" match. + * + * Returns 0 if successful and 1 otherwise (i.e. no match can be found + * in the remaining input that is long enough). + * + */ +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + cctx->nextIp = cctx->ip + cctx->step; + + do { + hash_t h; + U32 sum; + setNextHash(cctx); + h = cctx->nextHash; + sum = cctx->nextSum; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->ip > cctx->imatchLimit) { + return 1; + } + + *match = getPositionOnHash(cctx, h); + putHashOfCurrentPositionFromHash(cctx, h, sum); + + } while (cctx->ip - *match > WINDOW_SIZE || + !LDM_isValidMatch(cctx->ip, *match)); + setNextHash(cctx); + return 0; +} + +/** + * Write current block (literals, literal length, match offset, + * match length). + * + * Update input pointer, inserting hashes into hash table along the way. + */ +static void outputBlock(LDM_CCtx *cctx, + unsigned const literalLength, + unsigned const offset, + unsigned const matchLength) { + BYTE *token = cctx->op++; + + /* Encode the literal length. 
*/ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx->op)++ = 255; + } + *(cctx->op)++ = (BYTE)len; + } else { + *token = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. */ + memcpy(cctx->op, cctx->anchor, literalLength); + cctx->op += literalLength; + + /* Encode the offset. */ + LDM_write32(cctx->op, offset); + cctx->op += LDM_OFFSET_SIZE; + + /* Encode the match length. */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *token += ML_MASK; + matchLengthRemaining -= ML_MASK; + LDM_write32(cctx->op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx->op += 4; + LDM_write32(cctx->op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx->op += matchLengthRemaining / 255; + *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *token += (BYTE)(matchLength); + } +} + +// TODO: srcSize and maxDstSize is unused +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + LDM_CCtx cctx; + initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + + // TODO: loop condition is not accurate. + while (1) { + const BYTE *match; + + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + if (LDM_findBestMatch(&cctx, &match) != 0) { + goto _last_literals; + } +#ifdef COMPUTE_STATS + cctx.stats.numMatches++; +#endif + + /** + * Catch up: look back to extend the match backwards from the found match. 
+ */ + while (cctx.ip > cctx.anchor && match > cctx.ibase && + cctx.ip[-1] == match[-1]) { + cctx.ip--; + match--; + } + + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. + */ + { + unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); + unsigned const offset = cctx.ip - match; + unsigned const matchLength = countMatchLength( + cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + +#ifdef COMPUTE_STATS + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + MINMATCH; +#endif + outputBlock(&cctx, literalLength, offset, matchLength); + + // Move ip to end of block, inserting hashes at each position. + cctx.nextIp = cctx.ip + cctx.step; + while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + if (cctx.ip > cctx.lastPosHashed) { + // TODO: Simplify. + LDM_updateLastHashFromNextHash(&cctx); + setNextHash(&cctx); + } + cctx.ip++; + cctx.nextIp++; + } + } + + // Set start of next block to current input pointer. + cctx.anchor = cctx.ip; + LDM_updateLastHashFromNextHash(&cctx); + } +_last_literals: + /* Encode the last literals (no more matches). 
*/ + { + size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *(cctx.op)++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255; accumulator -= 255) { + *(cctx.op)++ = 255; + } + *(cctx.op)++ = (BYTE)accumulator; + } else { + *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); + } + memcpy(cctx.op, cctx.anchor, lastRun); + cctx.op += lastRun; + } + +#ifdef COMPUTE_STATS + printCompressStats(&cctx); +#endif + + return (cctx.op - (const BYTE *)cctx.obase); +} + +typedef struct LDM_DCtx { + size_t compressSize; + size_t maxDecompressSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +} LDM_DCtx; + +static void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + dctx->compressSize = compressSize; + dctx->maxDecompressSize = maxDecompressSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressSize; + +} + +size_t LDM_decompress(const void *src, size_t compressSize, + void *dst, size_t maxDecompressSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + unsigned const token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy the literals. 
*/ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = LDM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += MINMATCH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now. + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + +/* +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + const BYTE *ip = (const BYTE *)src + 1125; + U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); + U32 sum2; + ++ip; + for (; ip < (const BYTE *)src + 1125 + 100; ip++) { + sum2 = updateChecksum(sum, LDM_HASH_LENGTH, + ip[-1], ip[LDM_HASH_LENGTH - 1]); + sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); + printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); + } +} +*/ + + diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.h b/contrib/long_distance_matching/versions/v0.5/ldm.h new file mode 100644 index 00000000..1bd19745 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/ldm.h @@ -0,0 +1,26 @@ +#ifndef LDM_H +#define LDM_H + +#include /* size_t */ + +#define LDM_COMPRESS_SIZE 4 +#define LDM_DECOMPRESS_SIZE 4 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) + +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +size_t LDM_decompress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +/** + * Reads the header from src and writes the compressed size and + * decompressed size into compressSize and decompressSize respectively. 
+ */ +void LDM_readHeader(const void *src, size_t *compressSize, + size_t *decompressSize); + +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.5/main-ldm.c b/contrib/long_distance_matching/versions/v0.5/main-ldm.c new file mode 100644 index 00000000..fbfd789b --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/main-ldm.c @@ -0,0 +1,468 @@ +// TODO: file size must fit into a U32 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ldm.h" + +#define DEBUG +//#define TEST + +/* Compress file given by fname and output to oname. + * Returns 0 if successful, error code otherwise. + */ +static int compress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t maxCompressSize, compressSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + + /* Go to the location corresponding to the last byte. */ + /* TODO: fallocate? */ + if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* Write a dummy byte at the last location. */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the input file. 
*/ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + +/* +#ifdef TEST + LDM_test(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); +#endif +*/ + + compressSize = LDM_HEADER_SIZE + + LDM_compress(src, statbuf.st_size, + dst + LDM_HEADER_SIZE, statbuf.st_size); + + // Write compress and decompress size to header + // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + memcpy(dst, &compressSize, 4); + memcpy(dst + 4, &(statbuf.st_size), 4); + +#ifdef DEBUG + printf("Compressed size: %zu\n", compressSize); + printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); +#endif + + // Truncate file to compressSize. + ftruncate(fdout, compressSize); + + printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, + (unsigned)statbuf.st_size, (unsigned)compressSize, oname, + (double)compressSize / (statbuf.st_size) * 100); + + // Close files. + close(fdin); + close(fdout); + return 0; +} + +/* Decompress file compressed using LDM_compress. + * The input file should have the LDM_HEADER followed by payload. + * Returns 0 if succesful, and an error code otherwise. + */ +static int decompress(const char *fname, const char *oname) { + int fdin, fdout; + struct stat statbuf; + char *src, *dst; + size_t compressSize, decompressSize, outSize; + + /* Open the input file. */ + if ((fdin = open(fname, O_RDONLY)) < 0) { + perror("Error in file opening"); + return 1; + } + + /* Open the output file. */ + if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { + perror("Can't create output file"); + return 1; + } + + /* Find the size of the input file. */ + if (fstat (fdin, &statbuf) < 0) { + perror("Fstat error"); + return 1; + } + + /* mmap the input file. 
*/ + if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) + == (caddr_t) - 1) { + perror("mmap error for input"); + return 1; + } + + /* Read the header. */ + LDM_readHeader(src, &compressSize, &decompressSize); + + /* Go to the location corresponding to the last byte. */ + if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + perror("lseek error"); + return 1; + } + + /* write a dummy byte at the last location */ + if (write(fdout, "", 1) != 1) { + perror("write error"); + return 1; + } + + /* mmap the output file */ + if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { + perror("mmap error for output"); + return 1; + } + + outSize = LDM_decompress( + src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, + dst, decompressSize); + + printf("Ret size out: %zu\n", outSize); + ftruncate(fdout, outSize); + + close(fdin); + close(fdout); + return 0; +} + +/* Compare two files. + * Returns 0 iff they are the same. + */ +static int compare(FILE *fp0, FILE *fp1) { + int result = 0; + while (result == 0) { + char b0[1024]; + char b1[1024]; + const size_t r0 = fread(b0, 1, sizeof(b0), fp0); + const size_t r1 = fread(b1, 1, sizeof(b1), fp1); + + result = (int)r0 - (int)r1; + + if (0 == r0 || 0 == r1) break; + + if (0 == result) result = memcmp(b0, b1, r0); + } + return result; +} + +/* Verify the input file is the same as the decompressed file. 
*/ +static void verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + { + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + } + + fclose(decFp); + fclose(inpFp); +} + +int main(int argc, const char *argv[]) { + const char * const exeName = argv[0]; + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Wrong arguments\n"); + printf("Usage:\n"); + printf("%s FILE\n", exeName); + return 1; + } + + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + + /* Compress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (compress(inpFilename, ldmFilename)) { + printf("Compress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total compress time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + + /* Decompress */ + { + struct timeval tv1, tv2; + gettimeofday(&tv1, NULL); + if (decompress(ldmFilename, decFilename)) { + printf("Decompress error"); + return 1; + } + gettimeofday(&tv2, NULL); + printf("Total decompress time = %f seconds\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec)); + } + /* verify */ + verify(inpFilename, decFilename); + return 0; +} + + +#if 0 +static size_t compress_file(FILE *in, FILE *out, size_t *size_in, + size_t *size_out) { + char *src, *buf = NULL; + size_t r = 1; + size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; + + src = malloc(BUF_SIZE); + if (!src) { + printf("Not enough 
memory\n"); + goto cleanup; + } + + size = BUF_SIZE + LDM_HEADER_SIZE; + buf = malloc(size); + if (!buf) { + printf("Not enough memory\n"); + goto cleanup; + } + + + for (;;) { + k = fread(src, 1, BUF_SIZE, in); + if (k == 0) + break; + count_in += k; + + n = LDM_compress(src, buf, k, BUF_SIZE); + + // n = k; + // offset += n; + offset = k; + count_out += k; + +// k = fwrite(src, 1, offset, out); + + k = fwrite(buf, 1, offset, out); + if (k < offset) { + if (ferror(out)) + printf("Write failed\n"); + else + printf("Short write\n"); + goto cleanup; + } + + } + *size_in = count_in; + *size_out = count_out; + r = 0; + cleanup: + free(src); + free(buf); + return r; +} + +static size_t decompress_file(FILE *in, FILE *out) { + void *src = malloc(BUF_SIZE); + void *dst = NULL; + size_t dst_capacity = BUF_SIZE; + size_t ret = 1; + size_t bytes_written = 0; + + if (!src) { + perror("decompress_file(src)"); + goto cleanup; + } + + while (ret != 0) { + /* Load more input */ + size_t src_size = fread(src, 1, BUF_SIZE, in); + void *src_ptr = src; + void *src_end = src_ptr + src_size; + if (src_size == 0 || ferror(in)) { + printf("(TODO): Decompress: not enough input or error reading file\n"); + //TODO + ret = 0; + goto cleanup; + } + + /* Allocate destination buffer if it hasn't been allocated already */ + if (!dst) { + dst = malloc(dst_capacity); + if (!dst) { + perror("decompress_file(dst)"); + goto cleanup; + } + } + + // TODO + + /* Decompress: + * Continue while there is more input to read. 
+ */ + while (src_ptr != src_end && ret != 0) { + // size_t dst_size = src_size; + size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); + size_t written = fwrite(dst, 1, dst_size, out); +// printf("Writing %zu bytes\n", dst_size); + bytes_written += dst_size; + if (written != dst_size) { + printf("Decompress: Failed to write to file\n"); + goto cleanup; + } + src_ptr += src_size; + src_size = src_end - src_ptr; + } + + /* Update input */ + + } + + printf("Wrote %zu bytes\n", bytes_written); + + cleanup: + free(src); + free(dst); + + return ret; +} + +int main2(int argc, char *argv[]) { + char inpFilename[256] = { 0 }; + char ldmFilename[256] = { 0 }; + char decFilename[256] = { 0 }; + + if (argc < 2) { + printf("Please specify input filename\n"); + return 0; + } + snprintf(inpFilename, 256, "%s", argv[1]); + snprintf(ldmFilename, 256, "%s.ldm", argv[1]); + snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); + + printf("inp = [%s]\n", inpFilename); + printf("ldm = [%s]\n", ldmFilename); + printf("dec = [%s]\n", decFilename); + + /* compress */ + { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *outFp = fopen(ldmFilename, "wb"); + size_t sizeIn = 0; + size_t sizeOut = 0; + size_t ret; + printf("compress : %s -> %s\n", inpFilename, ldmFilename); + ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); + if (ret) { + printf("compress : failed with code %zu\n", ret); + return ret; + } + printf("%s: %zu → %zu bytes, %.1f%%\n", + inpFilename, sizeIn, sizeOut, + (double)sizeOut / sizeIn * 100); + printf("compress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* decompress */ + { + FILE *inpFp = fopen(ldmFilename, "rb"); + FILE *outFp = fopen(decFilename, "wb"); + size_t ret; + + printf("decompress : %s -> %s\n", ldmFilename, decFilename); + ret = decompress_file(inpFp, outFp); + if (ret) { + printf("decompress : failed with code %zu\n", ret); + return ret; + } + printf("decompress : done\n"); + + fclose(outFp); + fclose(inpFp); + } + + /* verify */ 
+ { + FILE *inpFp = fopen(inpFilename, "rb"); + FILE *decFp = fopen(decFilename, "rb"); + + printf("verify : %s <-> %s\n", inpFilename, decFilename); + const int cmp = compare(inpFp, decFp); + if(0 == cmp) { + printf("verify : OK\n"); + } else { + printf("verify : NG\n"); + } + + fclose(decFp); + fclose(inpFp); + } + return 0; +} +#endif + diff --git a/contrib/long_distance_matching/versions/v0.5/util.c b/contrib/long_distance_matching/versions/v0.5/util.c new file mode 100644 index 00000000..70fcbc2c --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/util.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include "util.h" + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; + +unsigned LDM_isLittleEndian(void) { + const union { U32 u; BYTE c[4]; } one = { 1 }; + return one.c[0]; +} + +U16 LDM_read16(const void *memPtr) { + U16 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +U16 LDM_readLE16(const void *memPtr) { + if (LDM_isLittleEndian()) { + return LDM_read16(memPtr); + } else { + const BYTE *p = (const BYTE *)memPtr; + return (U16)((U16)p[0] + (p[1] << 8)); + } +} + +void LDM_write16(void *memPtr, U16 value){ + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_write32(void *memPtr, U32 value) { + memcpy(memPtr, &value, sizeof(value)); +} + +void LDM_writeLE16(void *memPtr, U16 value) { + if (LDM_isLittleEndian()) { + LDM_write16(memPtr, value); + } else { + BYTE* p = (BYTE *)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +U32 LDM_read32(const void *ptr) { + return *(const U32 *)ptr; +} + +U64 LDM_read64(const void *ptr) { + return *(const U64 *)ptr; +} + +void LDM_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} + +BYTE LDM_readByte(const void *memPtr) { + BYTE val; + memcpy(&val, memPtr, 1); + return val; +} + diff --git a/contrib/long_distance_matching/versions/v0.5/util.h 
b/contrib/long_distance_matching/versions/v0.5/util.h new file mode 100644 index 00000000..d1c3c999 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/util.h @@ -0,0 +1,25 @@ +#ifndef LDM_UTIL_H +#define LDM_UTIL_H + +unsigned LDM_isLittleEndian(void); + +uint16_t LDM_read16(const void *memPtr); + +uint16_t LDM_readLE16(const void *memPtr); + +void LDM_write16(void *memPtr, uint16_t value); + +void LDM_write32(void *memPtr, uint32_t value); + +void LDM_writeLE16(void *memPtr, uint16_t value); + +uint32_t LDM_read32(const void *ptr); + +uint64_t LDM_read64(const void *ptr); + +void LDM_copy8(void *dst, const void *src); + +uint8_t LDM_readByte(const void *ptr); + + +#endif /* LDM_UTIL_H */ From 8de82b6eb045d66dc8f0c4d54a7bbe7f144a05cc Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 16:31:31 -0700 Subject: [PATCH 24/62] [ldm] Clean up versions --- contrib/long_distance_matching/ldm.c | 5 +- contrib/long_distance_matching/util.c | 1 - .../versions/v0.1/ldm.c | 394 ---------- .../versions/v0.1/ldm.h | 19 - .../versions/v0.1/main-ldm.c | 459 ----------- .../versions/v0.2/Makefile | 32 - .../versions/v0.2/ldm.c | 436 ----------- .../versions/v0.2/ldm.h | 19 - .../versions/v0.2/main-ldm.c | 474 ------------ .../versions/v0.3/Makefile | 10 - .../versions/v0.3/README | 3 + .../versions/v0.4/Makefile | 40 - .../versions/v0.4/ldm.c | 729 ------------------ .../versions/v0.4/ldm.h | 22 - .../versions/v0.4/main-ldm.c | 480 ------------ .../versions/v0.4/util.c | 69 -- .../versions/v0.4/util.h | 25 - .../versions/v0.5/Makefile | 15 +- .../versions/v0.5/README | 5 + .../versions/v0.5/ldm.c | 8 +- 20 files changed, 15 insertions(+), 3230 deletions(-) delete mode 100644 contrib/long_distance_matching/versions/v0.1/ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.1/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v0.1/main-ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.2/Makefile delete mode 
100644 contrib/long_distance_matching/versions/v0.2/ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.2/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v0.2/main-ldm.c create mode 100644 contrib/long_distance_matching/versions/v0.3/README delete mode 100644 contrib/long_distance_matching/versions/v0.4/Makefile delete mode 100644 contrib/long_distance_matching/versions/v0.4/ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.4/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v0.4/main-ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.4/util.c delete mode 100644 contrib/long_distance_matching/versions/v0.4/util.h create mode 100644 contrib/long_distance_matching/versions/v0.5/README diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b17a0f15..87645b76 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -19,8 +19,8 @@ #define WINDOW_SIZE (1 << 20) //These should be multiples of four. 
-#define LDM_HASH_LENGTH 100 -#define MINMATCH 100 +#define LDM_HASH_LENGTH 4 +#define MINMATCH 4 #define ML_BITS 4 #define ML_MASK ((1U<iend = dctx->ip + dctx->compressSize; dctx->op = dst; dctx->oend = dctx->op + dctx->maxDecompressSize; - } size_t LDM_decompress(const void *src, size_t compressSize, diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c index 70fcbc2c..47ac8a12 100644 --- a/contrib/long_distance_matching/util.c +++ b/contrib/long_distance_matching/util.c @@ -66,4 +66,3 @@ BYTE LDM_readByte(const void *memPtr) { memcpy(&val, memPtr, 1); return val; } - diff --git a/contrib/long_distance_matching/versions/v0.1/ldm.c b/contrib/long_distance_matching/versions/v0.1/ldm.c deleted file mode 100644 index 266425f8..00000000 --- a/contrib/long_distance_matching/versions/v0.1/ldm.c +++ /dev/null @@ -1,394 +0,0 @@ -#include -#include -#include -#include - -#include "ldm.h" - -#define LDM_MEMORY_USAGE 14 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define WINDOW_SIZE (1 << 20) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define MINMATCH 4 - -#define ML_BITS 4 -#define ML_MASK ((1U<>8); - } -} - -static U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -static U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - - -static void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -static void LDM_wild_copy(void *dstPtr, const void *srcPtr, void *dstEnd) { - BYTE *d = (BYTE *)dstPtr; - const BYTE *s = (const BYTE *)srcPtr; - BYTE * const e = (BYTE *)dstEnd; - - do { - LDM_copy8(d, s); - d += 8; - s += 8; - } while (d < e); - -} - -struct hash_entry { - U64 offset; - tag t; -}; - -static U32 LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static U32 LDM_hash5(U64 sequence) { - 
static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} - -static U32 LDM_hash_position(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, - const BYTE *srcBase) { - U32 *hashTable = (U32 *) tableBase; - hashTable[h] = (U32)(p - srcBase); -} - -static void LDM_put_position(const BYTE *p, void *tableBase, - const BYTE *srcBase) { - U32 const h = LDM_hash_position(p); - LDM_put_position_on_hash(p, h, tableBase, srcBase); -} - -static const BYTE *LDM_get_position_on_hash( - U32 h, void *tableBase, const BYTE *srcBase) { - const U32 * const hashTable = (U32*)tableBase; - return hashTable[h] + srcBase; -} - -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_read_header(void const *source, size_t *compressed_size, - size_t *decompressed_size) { - const U32 *ip = (const U32 *)source; - *compressed_size = *ip++; - *decompressed_size = *ip; -} - -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size) { - const BYTE * const istart = (const BYTE*)source; - const BYTE *ip = istart; - const BYTE * const iend = istart + source_size; - const BYTE *ilimit = iend - HASH_SIZE; - const BYTE * const matchlimit = iend - HASH_SIZE; - const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = 
(BYTE*) dest; - U32 hashTable[LDM_HASHTABLESIZE_U32]; - memset(hashTable, 0, sizeof(hashTable)); - - const BYTE *anchor = (const BYTE *)source; -// struct LDM_cctx cctx; - size_t output_size = 0; - - U32 forwardH; - - /* Hash first byte: put into hash table */ - - LDM_put_position(ip, hashTable, istart); - ip++; - forwardH = LDM_hash_position(ip); - - //TODO Loop terminates before ip>=ilimit. - while (ip < ilimit) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned step = 1; - - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - match = LDM_get_position_on_hash(h, hashTable, istart); - - forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(ip, h, hashTable, istart); - } while (ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(ip)); - } - - // TODO catchup - while (ip > anchor && match > istart && ip[-1] == match[-1]) { - ip--; - match--; - } - - /* Encode literals */ - { - unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - -#ifdef LDM_DEBUG - printf("Cur position: %zu\n", anchor - istart); - printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); -#endif - /* - fwrite(match, 4, 1, stdout); - printf("\n"); - */ - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *op++ = 255; - } - *op++ = (BYTE)len; - } else { - *token = (BYTE)(litLength << ML_BITS); - } -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(anchor, litLength, 1, stdout); - printf("\n"); -#endif - memcpy(op, anchor, litLength); - //LDM_wild_copy(op, anchor, op + litLength); - op += litLength; - } -_next_match: - /* Encode offset */ - { - LDM_write32(op, ip - match); - op += 4; - } - - /* Encode Match Length */ - { - unsigned matchCode; - matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, - matchlimit); -#ifdef LDM_DEBUG - printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(ip, MINMATCH + matchCode, 1, stdout); - printf("\n"); -#endif - ip += MINMATCH + matchCode; - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LDM_write32(op, 0xFFFFFFFF); - while (matchCode >= 4*0xFF) { - op += 4; - LDM_write32(op, 0xffffffff); - matchCode -= 4*0xFF; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else { - *token += (BYTE)(matchCode); - } -#ifdef LDM_DEBUG - printf("\n"); -#endif - } - - anchor = ip; - - LDM_put_position(ip, hashTable, istart); - forwardH = LDM_hash_position(++ip); - } -_last_literals: - /* Encode last literals */ - { - size_t const lastRun = (size_t)(iend - anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *op++ = 255; - } - *op++ = (BYTE)accumulator; - } else { - *op++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(op, anchor, lastRun); - op += lastRun; - } - return (op - (BYTE *)dest); -} - -size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)source; - const BYTE 
* const iend = ip + compressed_size; - BYTE *op = (BYTE *)dest; - BYTE * const oend = op + max_decompressed_size; - BYTE *cpy; - - while (ip < iend) { - size_t length; - const BYTE *match; - size_t offset; - - /* get literal length */ - unsigned const token = *ip++; - if ((length=(token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } -#ifdef LDM_DEBUG - printf("Literal length: %zu\n", length); -#endif - - /* copy literals */ - cpy = op + length; -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(ip, length, 1, stdout); - printf("\n"); -#endif - memcpy(op, ip, length); -// LDM_wild_copy(op, ip, cpy); - ip += length; - op = cpy; - - /* get offset */ - offset = LDM_read32(ip); - -#ifdef LDM_DEBUG - printf("Offset: %zu\n", offset); -#endif - ip += 4; - match = op - offset; - // LDM_write32(op, (U32)offset); - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } - length += MINMATCH; -#ifdef LDM_DEBUG - printf("Match length: %zu\n", length); -#endif - /* copy match */ - cpy = op + length; - - // Inefficient for now - - while (match < cpy - offset && op < oend) { - *op++ = *match++; - } - } -// memcpy(dest, source, compressed_size); - return op - (BYTE *)dest; -} - - diff --git a/contrib/long_distance_matching/versions/v0.1/ldm.h b/contrib/long_distance_matching/versions/v0.1/ldm.h deleted file mode 100644 index f4ca25a3..00000000 --- a/contrib/long_distance_matching/versions/v0.1/ldm.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(void const *source, void *dest, size_t source_size, - size_t max_dest_size); - -size_t LDM_decompress(void const *source, void *dest, size_t compressed_size, - size_t max_decompressed_size); - -void 
LDM_read_header(void const *source, size_t *compressed_size, - size_t *decompressed_size); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.1/main-ldm.c b/contrib/long_distance_matching/versions/v0.1/main-ldm.c deleted file mode 100644 index 10869cce..00000000 --- a/contrib/long_distance_matching/versions/v0.1/main-ldm.c +++ /dev/null @@ -1,459 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG - -//#define ZSTD - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if 
(src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. - */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} -#endif - -static size_t compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - size_t size_in = statbuf.st_size; - - /* go to the location corresponding to the last byte */ - if (lseek(fdout, size_in + LDM_HEADER_SIZE - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for 
input"); - return 1; - } - size_t out_size = statbuf.st_size + LDM_HEADER_SIZE; - - /* mmap the output file */ - if ((dst = mmap(0, out_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - #ifdef ZSTD - size_t size_out = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); - #else - size_t size_out = LDM_compress(src, dst + LDM_HEADER_SIZE, statbuf.st_size, - statbuf.st_size); - size_out += LDM_HEADER_SIZE; - - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &size_out, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - printf("Compressed size: %zu\n", size_out); - printf("Decompressed size: %zu\n", statbuf.st_size); - #endif - ftruncate(fdout, size_out); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)size_out, oname, - (double)size_out / (statbuf.st_size) * 100); - - close(fdin); - close(fdout); - return 0; -} - -static size_t decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* open the input file */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* open the output file */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* find size of input file */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* read header */ - size_t compressed_size, decompressed_size; - LDM_read_header(src, &compressed_size, &decompressed_size); - - printf("Size, compressed_size, decompressed_size: %zu %zu %zu\n", - statbuf.st_size, compressed_size, decompressed_size); - - /* go to the location corresponding to the last byte */ - if 
(lseek(fdout, decompressed_size - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressed_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - /* Copy input file to output file */ -// memcpy(dst, src, statbuf.st_size); - - #ifdef ZSTD - size_t size_out = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); - #else - size_t size_out = LDM_decompress(src + LDM_HEADER_SIZE, dst, - statbuf.st_size - LDM_HEADER_SIZE, - decompressed_size); - printf("Ret size out: %zu\n", size_out); - #endif - ftruncate(fdout, size_out); - - close(fdin); - close(fdout); - return 0; -} - -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) { - break; - } - if (0 == result) { - result = memcmp(b0, b1, r0); - } - } - return result; -} - -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", 
argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - struct timeval tv1, tv2; - /* compress */ - { - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - -#if 0 -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp 
= fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/versions/v0.2/Makefile b/contrib/long_distance_matching/versions/v0.2/Makefile deleted file mode 100644 index 4e04fd6a..00000000 --- a/contrib/long_distance_matching/versions/v0.2/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. 
-# ################################################################ - -# This Makefile presumes libzstd is installed, using `sudo make install` - - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: main-ldm - - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-ldm : ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm - @echo Cleaning completed - diff --git a/contrib/long_distance_matching/versions/v0.2/ldm.c b/contrib/long_distance_matching/versions/v0.2/ldm.c deleted file mode 100644 index 9081d136..00000000 --- a/contrib/long_distance_matching/versions/v0.2/ldm.c +++ /dev/null @@ -1,436 +0,0 @@ -#include -#include -#include -#include - -#include "ldm.h" - -#define HASH_EVERY 7 - -#define LDM_MEMORY_USAGE 14 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define WINDOW_SIZE (1 << 20) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 8 -#define MINMATCH 8 - -#define ML_BITS 4 -#define ML_MASK ((1U<>8); - } -} - -static U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -static U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -static void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -typedef struct compress_stats { - U32 num_matches; - U32 total_match_length; - U32 total_literal_length; - U64 total_offset; -} compress_stats; - -static void LDM_printCompressStats(const compress_stats *stats) { - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Total number of matches: %u\n", stats->num_matches); - printf("Average match length: %.1f\n", ((double)stats->total_match_length) / - (double)stats->num_matches); - printf("Average literal length: %.1f\n", - ((double)stats->total_literal_length) / 
(double)stats->num_matches); - printf("Average offset length: %.1f\n", - ((double)stats->total_offset) / (double)stats->num_matches); - printf("=====================\n"); -} - -// TODO: unused. -struct hash_entry { - U64 offset; - tag t; -}; - -static U32 LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static U32 LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} - -static U32 LDM_hash_position(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_put_position_on_hash(const BYTE *p, U32 h, void *tableBase, - const BYTE *srcBase) { - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { - return; - } - - U32 *hashTable = (U32 *) tableBase; - hashTable[h] = (U32)(p - srcBase); -} - -static void LDM_put_position(const BYTE *p, void *tableBase, - const BYTE *srcBase) { - if (((p - srcBase) & HASH_EVERY) != HASH_EVERY) { - return; - } - U32 const h = LDM_hash_position(p); - LDM_put_position_on_hash(p, h, tableBase, srcBase); -} - -static const BYTE *LDM_get_position_on_hash( - U32 h, void *tableBase, const BYTE *srcBase) { - const U32 * const hashTable = (U32*)tableBase; - return hashTable[h] + srcBase; -} - -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_read_header(const void *src, size_t *compressSize, - 
size_t *decompressSize) { - const U32 *ip = (const U32 *)src; - *compressSize = *ip++; - *decompressSize = *ip; -} - -// TODO: maxDstSize is unused -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - const BYTE * const istart = (const BYTE*)src; - const BYTE *ip = istart; - const BYTE * const iend = istart + srcSize; - const BYTE *ilimit = iend - HASH_SIZE; - const BYTE * const matchlimit = iend - HASH_SIZE; - const BYTE * const mflimit = iend - MINMATCH; - BYTE *op = (BYTE*) dst; - - compress_stats compressStats = { 0 }; - - U32 hashTable[LDM_HASHTABLESIZE_U32]; - memset(hashTable, 0, sizeof(hashTable)); - - const BYTE *anchor = (const BYTE *)src; -// struct LDM_cctx cctx; - size_t output_size = 0; - - U32 forwardH; - - /* Hash first byte: put into hash table */ - - LDM_put_position(ip, hashTable, istart); - const BYTE *lastHash = ip; - ip++; - forwardH = LDM_hash_position(ip); - - //TODO Loop terminates before ip>=ilimit. - while (ip < ilimit) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned step = 1; - - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - match = LDM_get_position_on_hash(h, hashTable, istart); - - forwardH = LDM_hash_position(forwardIp); - LDM_put_position_on_hash(ip, h, hashTable, istart); - lastHash = ip; - } while (ip - match > WINDOW_SIZE || - LDM_read64(match) != LDM_read64(ip)); - } - compressStats.num_matches++; - - /* Catchup: look back to extend match from found match */ - while (ip > anchor && match > istart && ip[-1] == match[-1]) { - ip--; - match--; - } - - /* Encode literals */ - { - unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - - compressStats.total_literal_length += litLength; - -#ifdef LDM_DEBUG - printf("Cur position: %zu\n", anchor - istart); - printf("LitLength %zu. (Match offset). 
%zu\n", litLength, ip - match); -#endif - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *op++ = 255; - } - *op++ = (BYTE)len; - } else { - *token = (BYTE)(litLength << ML_BITS); - } -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(anchor, litLength, 1, stdout); - printf("\n"); -#endif - memcpy(op, anchor, litLength); - op += litLength; - } -_next_match: - /* Encode offset */ - { - /* - LDM_writeLE16(op, ip-match); - op += 2; - */ - LDM_write32(op, ip - match); - op += 4; - compressStats.total_offset += (ip - match); - } - - /* Encode Match Length */ - { - unsigned matchCode; - matchCode = LDM_count(ip + MINMATCH, match + MINMATCH, - matchlimit); -#ifdef LDM_DEBUG - printf("Match length %zu\n", matchCode + MINMATCH); - fwrite(ip, MINMATCH + matchCode, 1, stdout); - printf("\n"); -#endif - compressStats.total_match_length += matchCode + MINMATCH; - unsigned ctr = 1; - ip++; - for (; ctr < MINMATCH + matchCode; ip++, ctr++) { - LDM_put_position(ip, hashTable, istart); - } -// ip += MINMATCH + matchCode; - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LDM_write32(op, 0xFFFFFFFF); - while (matchCode >= 4*0xFF) { - op += 4; - LDM_write32(op, 0xffffffff); - matchCode -= 4*0xFF; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else { - *token += (BYTE)(matchCode); - } -#ifdef LDM_DEBUG - printf("\n"); - -#endif - } - - anchor = ip; - - LDM_put_position(ip, hashTable, istart); - forwardH = LDM_hash_position(++ip); - lastHash = ip; - } -_last_literals: - /* Encode last literals */ - { - size_t const lastRun = (size_t)(iend - anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *op++ = 255; - } - *op++ = (BYTE)accumulator; - } else { - *op++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(op, anchor, lastRun); - op += lastRun; - } 
- LDM_printCompressStats(&compressStats); - return (op - (BYTE *)dst); -} - -typedef struct LDM_DCtx { - const BYTE * const ibase; /* Pointer to base of input */ - const BYTE *ip; /* Pointer to current input position */ - const BYTE *iend; /* End of source */ - BYTE *op; /* Pointer to output */ - const BYTE * const oend; /* Pointer to end of output */ - -} LDM_DCtx; - -size_t LDM_decompress(const void *src, size_t compressed_size, - void *dst, size_t max_decompressed_size) { - const BYTE *ip = (const BYTE *)src; - const BYTE * const iend = ip + compressed_size; - BYTE *op = (BYTE *)dst; - BYTE * const oend = op + max_decompressed_size; - BYTE *cpy; - - while (ip < iend) { - size_t length; - const BYTE *match; - size_t offset; - - /* get literal length */ - unsigned const token = *ip++; - if ((length=(token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } -#ifdef LDM_DEBUG - printf("Literal length: %zu\n", length); -#endif - - /* copy literals */ - cpy = op + length; -#ifdef LDM_DEBUG - printf("Literals "); - fwrite(ip, length, 1, stdout); - printf("\n"); -#endif - memcpy(op, ip, length); - ip += length; - op = cpy; - - /* get offset */ - /* - offset = LDM_readLE16(ip); - ip += 2; - */ - offset = LDM_read32(ip); - ip += 4; -#ifdef LDM_DEBUG - printf("Offset: %zu\n", offset); -#endif - match = op - offset; - // LDM_write32(op, (U32)offset); - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *ip++; - length += s; - } while (s == 255); - } - length += MINMATCH; -#ifdef LDM_DEBUG - printf("Match length: %zu\n", length); -#endif - /* copy match */ - cpy = op + length; - - // Inefficient for now - while (match < cpy - offset && op < oend) { - *op++ = *match++; - } - } - return op - (BYTE *)dst; -} - - diff --git a/contrib/long_distance_matching/versions/v0.2/ldm.h b/contrib/long_distance_matching/versions/v0.2/ldm.h deleted file mode 100644 index 
0ac7b2ec..00000000 --- a/contrib/long_distance_matching/versions/v0.2/ldm.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -size_t LDM_decompress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -void LDM_read_header(const void *src, size_t *compressSize, - size_t *decompressSize); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.2/main-ldm.c b/contrib/long_distance_matching/versions/v0.2/main-ldm.c deleted file mode 100644 index 0017335b..00000000 --- a/contrib/long_distance_matching/versions/v0.2/main-ldm.c +++ /dev/null @@ -1,474 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG - -//#define ZSTD - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. - */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - size_t maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; - - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? 
*/ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - size_t compressSize = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); -#else - size_t compressSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); - - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); - printf("Decompressed size: %zu\n", statbuf.st_size); -#endif -#endif - - // Truncate file to compressSize. - ftruncate(fdout, compressSize); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. 
*/ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - size_t compressSize, decompressSize; - LDM_read_header(src, &compressSize, &decompressSize); - -#ifdef DEBUG - printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - statbuf.st_size, compressSize, decompressSize); -#endif - - /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - size_t outSize = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); -#else - size_t outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); - - printf("Ret size out: %zu\n", outSize); - #endif - ftruncate(fdout, outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. 
- */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. */ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - struct timeval tv1, tv2; - - /* Compress */ - - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - - /* Decompress */ - - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - 
tv1.tv_sec)); - - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ 
- { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/versions/v0.3/Makefile b/contrib/long_distance_matching/versions/v0.3/Makefile index 5ffd4eaf..e5153970 100644 --- a/contrib/long_distance_matching/versions/v0.3/Makefile +++ b/contrib/long_distance_matching/versions/v0.3/Makefile @@ -1,12 +1,3 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. -# ################################################################ - # This Makefile presumes libzstd is installed, using `sudo make install` CFLAGS ?= -O3 @@ -26,7 +17,6 @@ default: all all: main-ldm - #main : ldm.c main.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/contrib/long_distance_matching/versions/v0.3/README b/contrib/long_distance_matching/versions/v0.3/README new file mode 100644 index 00000000..8699562e --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.3/README @@ -0,0 +1,3 @@ +This version uses simple lz4-style compression: +- A 4-byte hash is inserted into the hash table for every position. +- Hash table replacement policy: direct overwrite. 
diff --git a/contrib/long_distance_matching/versions/v0.4/Makefile b/contrib/long_distance_matching/versions/v0.4/Makefile deleted file mode 100644 index 5ffd4eaf..00000000 --- a/contrib/long_distance_matching/versions/v0.4/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. -# ################################################################ - -# This Makefile presumes libzstd is installed, using `sudo make install` - -CFLAGS ?= -O3 -DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ - -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ - -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ - -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ - -Wredundant-decls -CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) -FLAGS = $(CPPFLAGS) $(CFLAGS) - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: main-ldm - - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-ldm : util.c ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm - @echo Cleaning completed - diff --git a/contrib/long_distance_matching/versions/v0.4/ldm.c b/contrib/long_distance_matching/versions/v0.4/ldm.c deleted file mode 100644 index 79648097..00000000 --- a/contrib/long_distance_matching/versions/v0.4/ldm.c +++ /dev/null @@ -1,729 +0,0 @@ -#include -#include -#include -#include - - -#include "ldm.h" -#include "util.h" - -#define HASH_EVERY 1 - -#define LDM_MEMORY_USAGE 22 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) 
-#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define LDM_OFFSET_SIZE 4 - -#define WINDOW_SIZE (1 << 23) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define LDM_HASH_LENGTH 4 - -// Should be multiple of four -#define MINMATCH 4 - -#define ML_BITS 4 -#define ML_MASK ((1U<numMatches); - printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("Average literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); - printf("Average offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", - stats->numCollisions, stats->numHashInserts, - stats->numHashInserts == 0 ? - 1.0 : (100.0 * (double)stats->numCollisions) / - (double)stats->numHashInserts); - printf("=====================\n"); -} - -typedef struct LDM_CCtx { - size_t isize; /* Input size */ - size_t maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. - const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - const BYTE *nextIp; - const BYTE *nextPosHashed; - hash_t nextHash; /* Hash corresponding to nextPosHashed */ - - // Members for rolling hash. 
- U32 lastSum; - U32 nextSum; - - unsigned step; - - // DEBUG - const BYTE *DEBUG_setNextHash; -} LDM_CCtx; - -static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { - U16 lengthLeft = MINMATCH; - const BYTE *curP = p; - const BYTE *curMatch = match; - - for (; lengthLeft >= 8; lengthLeft -= 8) { - if (LDM_read64(curP) != LDM_read64(curMatch)) { - return 0; - } - curP += 8; - curMatch += 8; - } - if (lengthLeft > 0) { - return LDM_read32(curP) == LDM_read32(curMatch); - } - return 1; -} - - - -#ifdef LDM_ROLLING_HASH -/** - * Convert a sum computed from LDM_getRollingHash to a hash value in the range - * of the hash table. - */ -static hash_t LDM_sumToHash(U32 sum) { - return sum & (LDM_HASH_SIZE_U32 - 1); -} - -static U32 LDM_getRollingHash(const char *data, U32 len) { - U32 i; - U32 s1, s2; - const schar *buf = (const schar *)data; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; - } - for(; i < len; i++) { - s1 += buf[i]; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -typedef struct LDM_sumStruct { - U16 s1, s2; -} LDM_sumStruct; - -static U32 LDM_updateRollingHash(U32 sum, U32 len, - schar toRemove, schar toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - (toRemove * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} - - -/* -static hash_t LDM_hashPosition(const void * const p) { - return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); -} -*/ - -/* -static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { - sumStruct->s1 = sum & 0xffff; - sumStruct->s2 = sum >> 16; -} -*/ - -static void LDM_setNextHash(LDM_CCtx *cctx) { - -#ifdef RUN_CHECKS - U32 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - 
cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - -// cctx->nextSum = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); - cctx->nextSum = LDM_updateRollingHash( - cctx->lastSum, LDM_HASH_LENGTH, - (schar)((cctx->lastPosHashed)[0]), - (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); - -#ifdef RUN_CHECKS - check = LDM_getRollingHash((const char *)cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); -// printf("INFO: %u %u %u\n", LDM_read32(cctx->nextIp), - } -#endif - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = LDM_sumToHash(cctx->nextSum); - -#ifdef RUN_CHECKS - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif - -} - -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 sum) { - /* - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - */ -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = (cctx->hashTable)[hash].offset; - cctx->stats.numHashInserts++; - if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - cctx->stats.numCollisions++; - } - } -#endif - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; - cctx->lastSum = sum; -} - -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); - } -#endif - LDM_putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); -} - -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = LDM_getRollingHash((const char 
*)cctx->ip, LDM_HASH_LENGTH); - hash_t hash = LDM_sumToHash(sum); -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", cctx->ip - cctx->ibase); - } -#endif -// hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash, sum); -// printf("Offset %zu\n", cctx->ip - cctx->ibase); -} - -#else -static hash_t LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static hash_t LDM_hashPosition(const void * const p) { - return LDM_hash(LDM_read32(p)); -} - -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash) { - /* - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - */ -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = (cctx->hashTable)[hash].offset; - cctx->stats.numHashInserts++; - if (offset == 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - cctx->stats.numCollisions++; - } - } -#endif - - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; -#ifdef RUN_CHECKS - if (cctx->ip - cctx->lastPosHashed != 1) { - printf("putHashError\n"); - } -#endif - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; -} - -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash); -} - -#endif - -/* -static hash_t LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (((sequence << 24) * prime5bytes) >> (64 - hashLog)); - else - return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} -*/ - - -static const BYTE *LDM_getPositionOnHash( - hash_t h, void *tableBase, const BYTE *srcBase) { - const LDM_hashEntry * const hashTable = 
(LDM_hashEntry *)tableBase; - return hashTable[h].offset + srcBase; -} - - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize) { - const U32 *ip = (const U32 *)src; - *compressSize = *ip++; - *decompressSize = *ip; -} - -static void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - -#ifdef LDM_ROLLING_HASH - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; -#else - cctx->ihashLimit = cctx->iend - HASH_SIZE; -#endif - cctx->imatchLimit = cctx->iend - MINMATCH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); - - cctx->lastPosHashed = NULL; - - cctx->step = 1; - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - cctx->DEBUG_setNextHash = 0; -} - -#ifdef LDM_ROLLING_HASH -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->nextIp = cctx->ip + cctx->step; - - do { - hash_t h; - U32 sum; -// printf("Call A\n"); - LDM_setNextHash(cctx); -// printf("End call a\n"); - h = cctx->nextHash; - sum = cctx->nextSum; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); - -// // Compute cctx->nextSum and cctx->nextHash from cctx->nextIp. 
-// LDM_setNextHash(cctx); - LDM_putHashOfCurrentPositionFromHash(cctx, h, sum); - -// printf("%u %u\n", cctx->lastHash, cctx->nextHash); - } while (cctx->ip - *match > WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); -// LDM_read64(*match) != LDM_read64(cctx->ip)); - LDM_setNextHash(cctx); - return 0; -} -#else -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->nextIp = cctx->ip; - - do { - hash_t const h = cctx->nextHash; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - *match = LDM_getPositionOnHash(h, cctx->hashTable, cctx->ibase); - - cctx->nextHash = LDM_hashPosition(cctx->nextIp); - LDM_putHashOfCurrentPositionFromHash(cctx, h); - - } while (cctx->ip - *match > WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); - return 0; -} - -#endif - -/** - * Write current block (literals, literal length, match offset, - * match length). - * - * Update input pointer, inserting hashes into hash table along the - * way. - */ -static void LDM_outputBlock(LDM_CCtx *cctx, const BYTE *match) { - unsigned const literalLength = (unsigned)(cctx->ip - cctx->anchor); - unsigned const offset = cctx->ip - match; - unsigned const matchLength = LDM_count( - cctx->ip + MINMATCH, match + MINMATCH, cctx->ihashLimit); - BYTE *token = cctx->op++; - - cctx->stats.totalLiteralLength += literalLength; - cctx->stats.totalOffset += offset; - cctx->stats.totalMatchLength += matchLength + MINMATCH; - - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *token = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; - - /* Encode the offset. 
*/ - LDM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode match length */ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; - matchLengthRemaining -= ML_MASK; - LDM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - LDM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *token += (BYTE)(matchLength); - } - -// LDM_setNextHash(cctx); -// cctx->ip = cctx->lastPosHashed + 1; -// cctx->nextIp = cctx->ip + cctx->step; -// printf("HERE: %zu %zu %zu\n", cctx->ip - cctx->ibase, -// cctx->lastPosHashed - cctx->ibase, cctx->nextIp - cctx->ibase); - - cctx->nextIp = cctx->ip + cctx->step; - - while (cctx->ip < cctx->anchor + MINMATCH + matchLength + literalLength) { -// printf("Loop\n"); - if (cctx->ip > cctx->lastPosHashed) { - LDM_updateLastHashFromNextHash(cctx); -// LDM_putHashOfCurrentPosition(cctx); -#ifdef LDM_ROLLING_HASH - LDM_setNextHash(cctx); -#endif - } - /* - printf("Call b %zu %zu %zu\n", - cctx->lastPosHashed - cctx->ibase, - cctx->nextIp - cctx->ibase, - cctx->ip - cctx->ibase); - */ -// printf("end call b\n"); - cctx->ip++; - cctx->nextIp++; - } - -// printf("There: %zu %zu\n", cctx->ip - cctx->ibase, cctx->lastPosHashed - cctx->ibase); -} - -// TODO: srcSize and maxDstSize is unused -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - - /* Hash the first position and put it into the hash table. 
*/ - LDM_putHashOfCurrentPosition(&cctx); -#ifdef LDM_ROLLING_HASH -// LDM_setNextHash(&cctx); -// tmp_hash = LDM_updateRollingHash(cctx.lastSum, LDM_HASH_LENGTH, -// cctx.ip[0], cctx.ip[LDM_HASH_LENGTH]); -// printf("Update test: %u %u\n", tmp_hash, cctx.nextSum); -// cctx.ip++; -#else - cctx.ip++; - cctx.nextHash = LDM_hashPosition(cctx.ip); -#endif - - // TODO: loop condition is not accurate. - while (1) { - const BYTE *match; -// printf("Start of loop\n"); - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - if (LDM_findBestMatch(&cctx, &match) != 0) { - goto _last_literals; - } -// printf("End of match finding\n"); - - cctx.stats.numMatches++; - - /** - * Catch up: look back to extend the match backwards from the found match. - */ - while (cctx.ip > cctx.anchor && match > cctx.ibase && - cctx.ip[-1] == match[-1]) { -// printf("Catch up\n"); - cctx.ip--; - match--; - } - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. - */ - LDM_outputBlock(&cctx, match); -// printf("End of loop\n"); - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); -// LDM_putHashOfCurrentPosition(&cctx); -#ifndef LDM_ROLLING_HASH - cctx.ip++; -#endif - - /* - LDM_putHashOfCurrentPosition(&cctx); - printf("Call c\n"); - LDM_setNextHash(&cctx); - printf("End call c\n"); - cctx.ip++; - cctx.nextIp++; - */ - } -_last_literals: - /* Encode the last literals (no more matches). 
*/ - { - size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *(cctx.op)++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)accumulator; - } else { - *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(cctx.op, cctx.anchor, lastRun); - cctx.op += lastRun; - } - LDM_printCompressStats(&cctx.stats); - return (cctx.op - (const BYTE *)cctx.obase); -} - -typedef struct LDM_DCtx { - size_t compressSize; - size_t maxDecompressSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -} LDM_DCtx; - -static void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - dctx->compressSize = compressSize; - dctx->maxDecompressSize = maxDecompressSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressSize; - -} - -size_t LDM_decompress(const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - unsigned const token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy literals. 
*/ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = LDM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += MINMATCH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now. - while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { -#ifdef LDM_ROLLING_HASH - const BYTE *ip = (const BYTE *)src + 1125; - U32 sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); - U32 sum2; - ++ip; - for (; ip < (const BYTE *)src + 1125 + 100; ip++) { - sum2 = LDM_updateRollingHash(sum, LDM_HASH_LENGTH, - ip[-1], ip[LDM_HASH_LENGTH - 1]); - sum = LDM_getRollingHash((const char *)ip, LDM_HASH_LENGTH); - printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); - } -#endif -} - - diff --git a/contrib/long_distance_matching/versions/v0.4/ldm.h b/contrib/long_distance_matching/versions/v0.4/ldm.h deleted file mode 100644 index a34faac4..00000000 --- a/contrib/long_distance_matching/versions/v0.4/ldm.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -size_t LDM_decompress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize); - -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -#endif /* LDM_H */ diff --git 
a/contrib/long_distance_matching/versions/v0.4/main-ldm.c b/contrib/long_distance_matching/versions/v0.4/main-ldm.c deleted file mode 100644 index f8ae5469..00000000 --- a/contrib/long_distance_matching/versions/v0.4/main-ldm.c +++ /dev/null @@ -1,480 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG -//#define TEST - -//#define ZSTD - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. - */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t maxCompressSize, compressSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; - - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? */ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file. 
*/ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef TEST - LDM_test(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); -#endif - -#ifdef ZSTD - compressSize = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); -#else - compressSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); - - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); - printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif -#endif - - // Truncate file to compressSize. - ftruncate(fdout, compressSize); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t compressSize, decompressSize, outSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. 
*/ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - LDM_readHeader(src, &compressSize, &decompressSize); - - /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - outSize = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); -#else - outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); - - printf("Ret size out: %zu\n", outSize); - #endif - ftruncate(fdout, outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. - */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. 
*/ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - { - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - - /* Compress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total compress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* Decompress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total decompress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough 
memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ 
- { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/versions/v0.4/util.c b/contrib/long_distance_matching/versions/v0.4/util.c deleted file mode 100644 index 70fcbc2c..00000000 --- a/contrib/long_distance_matching/versions/v0.4/util.c +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include -#include - -#include "util.h" - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; - -unsigned LDM_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; - return one.c[0]; -} - -U16 LDM_read16(const void *memPtr) { - U16 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -U16 LDM_readLE16(const void *memPtr) { - if (LDM_isLittleEndian()) { - return LDM_read16(memPtr); - } else { - const BYTE *p = (const BYTE *)memPtr; - return (U16)((U16)p[0] + (p[1] << 8)); - } -} - -void LDM_write16(void *memPtr, U16 value){ - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_write32(void *memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_writeLE16(void *memPtr, U16 value) { - if (LDM_isLittleEndian()) { - LDM_write16(memPtr, value); - } else { - BYTE* p = (BYTE *)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -BYTE LDM_readByte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - diff --git a/contrib/long_distance_matching/versions/v0.4/util.h 
b/contrib/long_distance_matching/versions/v0.4/util.h deleted file mode 100644 index d1c3c999..00000000 --- a/contrib/long_distance_matching/versions/v0.4/util.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef LDM_UTIL_H -#define LDM_UTIL_H - -unsigned LDM_isLittleEndian(void); - -uint16_t LDM_read16(const void *memPtr); - -uint16_t LDM_readLE16(const void *memPtr); - -void LDM_write16(void *memPtr, uint16_t value); - -void LDM_write32(void *memPtr, uint32_t value); - -void LDM_writeLE16(void *memPtr, uint16_t value); - -uint32_t LDM_read32(const void *ptr); - -uint64_t LDM_read64(const void *ptr); - -void LDM_copy8(void *dst, const void *src); - -uint8_t LDM_readByte(const void *ptr); - - -#endif /* LDM_UTIL_H */ diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile index 5ffd4eaf..fa4abce6 100644 --- a/contrib/long_distance_matching/versions/v0.5/Makefile +++ b/contrib/long_distance_matching/versions/v0.5/Makefile @@ -1,12 +1,3 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. 
-# ################################################################ - # This Makefile presumes libzstd is installed, using `sudo make install` CFLAGS ?= -O3 @@ -26,15 +17,11 @@ default: all all: main-ldm - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - main-ldm : util.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm + main-ldm @echo Cleaning completed diff --git a/contrib/long_distance_matching/versions/v0.5/README b/contrib/long_distance_matching/versions/v0.5/README new file mode 100644 index 00000000..7901ae76 --- /dev/null +++ b/contrib/long_distance_matching/versions/v0.5/README @@ -0,0 +1,5 @@ +This version uses simple lz4-style compression with a rolling hash. +- A rolling checksum based on rsync's Adler-32 style checksum is used. +- The checksum is hashed using lz4's hash function. +- Hash table replacement policy: direct overwrite. +- The length of input to the hash function can be set with LDM_HASH_LENGTH. diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c index 325c5040..5fa20c06 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/ldm.c @@ -19,8 +19,8 @@ #define WINDOW_SIZE (1 << 20) //These should be multiples of four. 
-#define LDM_HASH_LENGTH 100 -#define MINMATCH 100 +#define LDM_HASH_LENGTH 4 +#define MINMATCH 4 #define ML_BITS 4 #define ML_MASK ((1U<stats); -#ifdef COMPUTE_STATS printf("=====================\n"); printf("Compression statistics\n"); printf("Total number of matches: %u\n", stats->numMatches); @@ -131,8 +131,8 @@ static void printCompressStats(const LDM_CCtx *cctx) { } printf("=====================\n"); -#endif } +#endif /** * Checks whether the MINMATCH bytes from p are the same as the MINMATCH From 92bed4a7e0051c06dc6ceca058b5559615fcc2a8 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 12 Jul 2017 18:47:26 -0700 Subject: [PATCH 25/62] [ldm] Add CHAR_OFFSET in hash function and extend header size --- contrib/long_distance_matching/ldm.c | 18 +- contrib/long_distance_matching/ldm.h | 4 +- contrib/long_distance_matching/main-ldm.c | 203 +--------------------- contrib/long_distance_matching/util.c | 5 + contrib/long_distance_matching/util.h | 2 + 5 files changed, 22 insertions(+), 210 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 87645b76..e64d2865 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -28,6 +28,7 @@ #define RUN_MASK ((1U<> 16) - (toRemove * len) + s1; + U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; return (s1 & 0xffff) + (s2 << 16); } @@ -344,9 +348,9 @@ static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, return (unsigned)(pIn - pStart); } -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize) { - const U32 *ip = (const U32 *)src; +void LDM_readHeader(const void *src, U64 *compressSize, + U64 *decompressSize) { + const U64 *ip = (const U64 *)src; *compressSize = *ip++; *decompressSize = *ip; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index d7f977d9..f04b6e95 100644 --- a/contrib/long_distance_matching/ldm.h +++ 
b/contrib/long_distance_matching/ldm.h @@ -3,8 +3,8 @@ #include /* size_t */ -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 +#define LDM_COMPRESS_SIZE 8 +#define LDM_DECOMPRESS_SIZE 8 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) /** diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index fbfd789b..8354b795 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -85,8 +85,8 @@ static int compress(const char *fname, const char *oname) { // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); + memcpy(dst, &compressSize, 8); + memcpy(dst + 8, &(statbuf.st_size), 8); #ifdef DEBUG printf("Compressed size: %zu\n", compressSize); @@ -267,202 +267,3 @@ int main(int argc, const char *argv[]) { verify(inpFilename, decFilename); return 0; } - - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE 
*out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. - */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t 
ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c index 47ac8a12..62749215 100644 --- a/contrib/long_distance_matching/util.c +++ b/contrib/long_distance_matching/util.c @@ -53,6 +53,11 @@ U32 LDM_read32(const void *ptr) { return *(const U32 *)ptr; } +//TODO: endianness? 
+void LDM_write64(void *memPtr, U64 value) { + memcpy(memPtr, &value, sizeof(value)); +} + U64 LDM_read64(const void *ptr) { return *(const U64 *)ptr; } diff --git a/contrib/long_distance_matching/util.h b/contrib/long_distance_matching/util.h index d1c3c999..dbf55cbc 100644 --- a/contrib/long_distance_matching/util.h +++ b/contrib/long_distance_matching/util.h @@ -21,5 +21,7 @@ void LDM_copy8(void *dst, const void *src); uint8_t LDM_readByte(const void *ptr); +void LDM_write64(void *memPtr, uint64_t value); + #endif /* LDM_UTIL_H */ From 68c4560701cca29837dfa5a57a730a56be0199fb Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 10:38:19 -0700 Subject: [PATCH 26/62] [ldm] Add TODO and comment for segfaulting in compress function --- contrib/long_distance_matching/ldm.c | 7 +++++-- contrib/long_distance_matching/main-ldm.c | 10 ++++++---- contrib/long_distance_matching/util.c | 5 ----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index e64d2865..7a545073 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -28,7 +28,7 @@ #define RUN_MASK ((1U< #include #include @@ -12,12 +10,17 @@ #include #include "ldm.h" +#include "zstd.h" #define DEBUG //#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. + * + * TODO: This currently seg faults if the compressed size is > the decompress + * size due to the mmapping and output file size allocated to be the input size. + * The compress function should check before writing or buffer writes. 
*/ static int compress(const char *fname, const char *oname) { int fdin, fdout; @@ -78,10 +81,9 @@ static int compress(const char *fname, const char *oname) { dst + LDM_HEADER_SIZE, statbuf.st_size); #endif */ - compressSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); + dst + LDM_HEADER_SIZE, maxCompressSize); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c index 62749215..47ac8a12 100644 --- a/contrib/long_distance_matching/util.c +++ b/contrib/long_distance_matching/util.c @@ -53,11 +53,6 @@ U32 LDM_read32(const void *ptr) { return *(const U32 *)ptr; } -//TODO: endianness? -void LDM_write64(void *memPtr, U64 value) { - memcpy(memPtr, &value, sizeof(value)); -} - U64 LDM_read64(const void *ptr) { return *(const U64 *)ptr; } From 50421d9474710b44618d843fb000b08b3a96df65 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 11:45:00 -0700 Subject: [PATCH 27/62] [ldm] Remove old main files --- contrib/long_distance_matching/main.c | 240 -------------------------- contrib/long_distance_matching/main.h | 7 - 2 files changed, 247 deletions(-) delete mode 100644 contrib/long_distance_matching/main.c delete mode 100644 contrib/long_distance_matching/main.h diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c deleted file mode 100644 index 67144166..00000000 --- a/contrib/long_distance_matching/main.c +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "ldm.h" - -#define BUF_SIZE 16*1024 // Block size -#define LDM_HEADER_SIZE 8 - -/* -static size_t compress_file_mmap(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *dst; - struct stat statbuf; - - if (fstat(in, &statbuf) < 0) { - printf("fstat error\n"); - return 1; - } - - - return 0; -} -*/ - -static 
size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = n; - count_out += n; - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) { - break; - } - if (0 == result) { - result = memcmp(b0, b1, r0); - } - } - return result; -} - -int main(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - 
{ - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - - return 0; -} - - diff --git a/contrib/long_distance_matching/main.h b/contrib/long_distance_matching/main.h deleted file mode 100644 index a0b03012..00000000 --- a/contrib/long_distance_matching/main.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _MAIN_H -#define _MAIN_H - -void compress_file(FILE *in, FILE *out, int argc, char *argv[]); -void decompress_file(FILE *in, FILE *out, int argc, char *argv[]); - -#endif /* _MAIN_H */ From 9306feb8fabdbaa1e4c4ccee90eb3cf8848556e6 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 13:44:48 -0700 Subject: [PATCH 28/62] [ldm] Switch to using lib/common/mem.h and move typedefs to ldm.h Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Blame Revision: --- contrib/long_distance_matching/Makefile | 7 +- contrib/long_distance_matching/ldm.c | 82 +++++++++++------------ contrib/long_distance_matching/ldm.h | 15 +++-- contrib/long_distance_matching/main-ldm.c | 13 ++-- contrib/long_distance_matching/util.c | 68 ------------------- contrib/long_distance_matching/util.h | 27 -------- 6 files changed, 57 insertions(+), 155 deletions(-) delete mode 100644 contrib/long_distance_matching/util.c delete mode 100644 contrib/long_distance_matching/util.h diff --git a/contrib/long_distance_matching/Makefile 
b/contrib/long_distance_matching/Makefile index 5ffd4eaf..8ba16d03 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -9,6 +9,7 @@ # This Makefile presumes libzstd is installed, using `sudo make install` +CPPFLAGS+= -I../../lib/common CFLAGS ?= -O3 DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ @@ -26,11 +27,7 @@ default: all all: main-ldm - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-ldm : util.c ldm.c main-ldm.c +main-ldm : ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 7a545073..00099fbe 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,23 +4,22 @@ #include #include "ldm.h" -#include "util.h" // Insert every (HASH_ONLY_EVERY + 1) into the hash table. #define HASH_ONLY_EVERY 0 -#define LDM_MEMORY_USAGE 20 +#define LDM_MEMORY_USAGE 22 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_OFFSET_SIZE 4 -#define WINDOW_SIZE (1 << 20) +#define WINDOW_SIZE (1 << 29) //These should be multiples of four. 
-#define LDM_HASH_LENGTH 4 -#define MINMATCH 4 +#define LDM_HASH_LENGTH 8 +#define MINMATCH 8 #define ML_BITS 4 #define ML_MASK ((1U<= 8; lengthLeft -= 8) { - if (LDM_read64(curP) != LDM_read64(curMatch)) { + if (MEM_read64(curP) != MEM_read64(curMatch)) { return 0; } curP += 8; curMatch += 8; } if (lengthLeft > 0) { - return (LDM_read32(curP) == LDM_read32(curMatch)); + return (MEM_read32(curP) == MEM_read32(curMatch)); } return 1; } @@ -184,10 +173,9 @@ static hash_t checksumToHash(U32 sum) { * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) */ -static U32 getChecksum(const char *data, U32 len) { +static U32 getChecksum(const BYTE *buf, U32 len) { U32 i; U32 s1, s2; - const schar *buf = (const schar *)data; s1 = s2 = 0; for (i = 0; i < (len - 4); i += 4) { @@ -215,7 +203,7 @@ static U32 getChecksum(const char *data, U32 len) { * Thus toRemove should correspond to data[0]. */ static U32 updateChecksum(U32 sum, U32 len, - schar toRemove, schar toAdd) { + BYTE toRemove, BYTE toAdd) { U32 s1 = (sum & 0xffff) - toRemove + toAdd; U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; @@ -244,13 +232,13 @@ static void setNextHash(LDM_CCtx *cctx) { // cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, - (schar)((cctx->lastPosHashed)[0]), - (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + (cctx->lastPosHashed)[0], + (cctx->lastPosHashed)[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); #ifdef RUN_CHECKS - check = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); if (check != cctx->nextSum) { printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); @@ -279,7 +267,8 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. 
// Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - (cctx->hashTable)[hash] = (hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + (cctx->hashTable)[hash] = + (LDM_hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; } cctx->lastPosHashed = cctx->ip; @@ -307,7 +296,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { * Insert hash of the current position into the hash table. */ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum((const char *)cctx->ip, LDM_HASH_LENGTH); + U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); hash_t hash = checksumToHash(sum); #ifdef RUN_CHECKS @@ -337,7 +326,7 @@ static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { - BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); + BYTE const diff = (*pMatch) ^ *(pIn); if (!diff) { pIn++; pMatch++; @@ -427,9 +416,9 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { * Update input pointer, inserting hashes into hash table along the way. */ static void outputBlock(LDM_CCtx *cctx, - unsigned const literalLength, - unsigned const offset, - unsigned const matchLength) { + const unsigned literalLength, + const unsigned offset, + const unsigned matchLength) { BYTE *token = cctx->op++; /* Encode the literal length. */ @@ -449,7 +438,7 @@ static void outputBlock(LDM_CCtx *cctx, cctx->op += literalLength; /* Encode the offset. */ - LDM_write32(cctx->op, offset); + MEM_write32(cctx->op, offset); cctx->op += LDM_OFFSET_SIZE; /* Encode the match length. 
*/ @@ -457,10 +446,10 @@ static void outputBlock(LDM_CCtx *cctx, unsigned matchLengthRemaining = matchLength; *token += ML_MASK; matchLengthRemaining -= ML_MASK; - LDM_write32(cctx->op, 0xFFFFFFFF); + MEM_write32(cctx->op, 0xFFFFFFFF); while (matchLengthRemaining >= 4*0xFF) { cctx->op += 4; - LDM_write32(cctx->op, 0xffffffff); + MEM_write32(cctx->op, 0xffffffff); matchLengthRemaining -= 4*0xFF; } cctx->op += matchLengthRemaining / 255; @@ -514,9 +503,9 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); - unsigned const offset = cctx.ip - match; - unsigned const matchLength = countMatchLength( + const unsigned literalLength = (unsigned)(cctx.ip - cctx.anchor); + const unsigned offset = cctx.ip - match; + const unsigned matchLength = countMatchLength( cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); #ifdef COMPUTE_STATS @@ -605,7 +594,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, size_t length, offset; /* Get the literal length. 
*/ - unsigned const token = *(dctx.ip)++; + const unsigned token = *(dctx.ip)++; if ((length = (token >> ML_BITS)) == RUN_MASK) { unsigned s; do { @@ -621,7 +610,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, dctx.op = cpy; //TODO : dynamic offset size - offset = LDM_read32(dctx.ip); + offset = MEM_read32(dctx.ip); dctx.ip += LDM_OFFSET_SIZE; match = dctx.op - offset; @@ -647,6 +636,11 @@ size_t LDM_decompress(const void *src, size_t compressSize, return dctx.op - (BYTE *)dst; } +// TODO: implement and test hash function +void LDM_test(void) { + +} + /* void LDM_test(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index f04b6e95..fd8c2ab8 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -3,10 +3,18 @@ #include /* size_t */ +#include "mem.h" // from /lib/common/mem.h + #define LDM_COMPRESS_SIZE 8 #define LDM_DECOMPRESS_SIZE 8 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +typedef U32 offset_t; +typedef U32 hash_t; +typedef struct LDM_hashEntry LDM_hashEntry; +typedef struct LDM_compressStats LDM_compressStats; +typedef struct LDM_CCtx LDM_CCtx; + /** * Compresses src into dst. * @@ -46,10 +54,9 @@ size_t LDM_decompress(const void *src, size_t srcSize, * * NB: LDM_compress and LDM_decompress currently do not add/read headers. 
*/ -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize); +void LDM_readHeader(const void *src, U64 *compressSize, + U64 *decompressSize); -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); +void LDM_test(void); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 9e7d4526..2017cf4e 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -75,12 +75,6 @@ static int compress(const char *fname, const char *oname) { return 1; } -/* -#ifdef TEST - LDM_test(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); -#endif -*/ compressSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, maxCompressSize); @@ -116,7 +110,8 @@ static int decompress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; - size_t compressSize, decompressSize, outSize; + U64 compressSize, decompressSize; + size_t outSize; /* Open the input file. 
*/ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -267,5 +262,9 @@ int main(int argc, const char *argv[]) { } /* verify */ verify(inpFilename, decFilename); + +#ifdef TEST + LDM_test(); +#endif return 0; } diff --git a/contrib/long_distance_matching/util.c b/contrib/long_distance_matching/util.c deleted file mode 100644 index 47ac8a12..00000000 --- a/contrib/long_distance_matching/util.c +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include - -#include "util.h" - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; - -unsigned LDM_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; - return one.c[0]; -} - -U16 LDM_read16(const void *memPtr) { - U16 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -U16 LDM_readLE16(const void *memPtr) { - if (LDM_isLittleEndian()) { - return LDM_read16(memPtr); - } else { - const BYTE *p = (const BYTE *)memPtr; - return (U16)((U16)p[0] + (p[1] << 8)); - } -} - -void LDM_write16(void *memPtr, U16 value){ - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_write32(void *memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_writeLE16(void *memPtr, U16 value) { - if (LDM_isLittleEndian()) { - LDM_write16(memPtr, value); - } else { - BYTE* p = (BYTE *)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -BYTE LDM_readByte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} diff --git a/contrib/long_distance_matching/util.h b/contrib/long_distance_matching/util.h deleted file mode 100644 index dbf55cbc..00000000 --- a/contrib/long_distance_matching/util.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef LDM_UTIL_H -#define LDM_UTIL_H - -unsigned 
LDM_isLittleEndian(void); - -uint16_t LDM_read16(const void *memPtr); - -uint16_t LDM_readLE16(const void *memPtr); - -void LDM_write16(void *memPtr, uint16_t value); - -void LDM_write32(void *memPtr, uint32_t value); - -void LDM_writeLE16(void *memPtr, uint16_t value); - -uint32_t LDM_read32(const void *ptr); - -uint64_t LDM_read64(const void *ptr); - -void LDM_copy8(void *dst, const void *src); - -uint8_t LDM_readByte(const void *ptr); - -void LDM_write64(void *memPtr, uint64_t value); - - -#endif /* LDM_UTIL_H */ From 2b3c7e4199842a7ae2038e31adb36be69bb0725d Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 14:39:35 -0700 Subject: [PATCH 29/62] [ldm] Make some functions shared --- contrib/long_distance_matching/ldm.c | 178 +++++++++------------- contrib/long_distance_matching/ldm.h | 71 ++++++++- contrib/long_distance_matching/main-ldm.c | 34 ++--- 3 files changed, 160 insertions(+), 123 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 00099fbe..a5057aec 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -19,7 +19,6 @@ //These should be multiples of four. 
#define LDM_HASH_LENGTH 8 -#define MINMATCH 8 #define ML_BITS 4 #define ML_MASK ((1U<stats); -#ifdef COMPUTE_STATS +void LDM_printCompressStats(const LDM_compressStats *stats, + const LDM_hashEntry *hashTable, + U32 hashTableSize) { printf("=====================\n"); printf("Compression statistics\n"); printf("Total number of matches: %u\n", stats->numMatches); @@ -110,50 +106,41 @@ static void printCompressStats(const LDM_CCtx *cctx) { { U32 i = 0; U32 ctr = 0; - for (; i < LDM_HASHTABLESIZE_U32; i++) { - if ((cctx->hashTable)[i].offset == 0) { + for (; i < hashTableSize; i++) { + if (hashTable[i].offset == 0) { ctr++; } } printf("Hash table size, empty slots, %% empty: %u %u %.3f\n", - LDM_HASHTABLESIZE_U32, ctr, - 100.0 * (double)(ctr) / (double)LDM_HASHTABLESIZE_U32); + hashTableSize, ctr, + 100.0 * (double)(ctr) / (double)hashTableSize); } printf("=====================\n"); -#endif } -/** - * Checks whether the MINMATCH bytes from p are the same as the MINMATCH - * bytes from match. - * - * This assumes MINMATCH is a multiple of four. - * - * Return 1 if valid, 0 otherwise. - */ -static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { /* - if (memcmp(p, match, MINMATCH) == 0) { + if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) { return 1; } return 0; */ //TODO: This seems to be faster for some reason? 
- U16 lengthLeft = MINMATCH; - const BYTE *curP = p; - const BYTE *curMatch = match; + U32 lengthLeft = LDM_MIN_MATCH_LENGTH; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; for (; lengthLeft >= 8; lengthLeft -= 8) { - if (MEM_read64(curP) != MEM_read64(curMatch)) { + if (MEM_read64(curIn) != MEM_read64(curMatch)) { return 0; } - curP += 8; + curIn += 8; curMatch += 8; } if (lengthLeft > 0) { - return (MEM_read32(curP) == MEM_read32(curMatch)); + return (MEM_read32(curIn) == MEM_read32(curMatch)); } return 1; } @@ -316,14 +303,8 @@ static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { return cctx->hashTable[hash].offset + cctx->ibase; } -/** - * Counts the number of bytes that match from pIn and pMatch, - * up to pInLimit. - * - * TODO: make more efficient. - */ -static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { BYTE const diff = (*pMatch) ^ *(pIn); @@ -332,24 +313,23 @@ static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, pMatch++; continue; } - return (unsigned)(pIn - pStart); + return (U32)(pIn - pStart); } - return (unsigned)(pIn - pStart); + return (U32)(pIn - pStart); } -void LDM_readHeader(const void *src, U64 *compressSize, - U64 *decompressSize) { - const U64 *ip = (const U64 *)src; - *compressSize = *ip++; - *decompressSize = *ip; +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize) { + const BYTE *ip = (const BYTE *)src; + *compressedSize = MEM_readLE64(ip); + ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip); + // ip += sizeof(U64); } -/** - * Initialize a compression context. 
- */ -static void initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { cctx->isize = srcSize; cctx->maxOSize = maxDstSize; @@ -358,7 +338,7 @@ static void initializeCCtx(LDM_CCtx *cctx, cctx->iend = cctx->ibase + srcSize; cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - MINMATCH; + cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; cctx->obase = (BYTE *)dst; cctx->op = (BYTE *)dst; @@ -409,33 +389,33 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 0; } -/** - * Write current block (literals, literal length, match offset, - * match length). - * - * Update input pointer, inserting hashes into hash table along the way. - */ -static void outputBlock(LDM_CCtx *cctx, - const unsigned literalLength, - const unsigned offset, - const unsigned matchLength) { - BYTE *token = cctx->op++; - +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); + *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; } *(cctx->op)++ = (BYTE)len; } else { - *token = (BYTE)(literalLength << ML_BITS); + *pToken = (BYTE)(literalLength << ML_BITS); } /* Encode the literals. */ memcpy(cctx->op, cctx->anchor, literalLength); cctx->op += literalLength; +} + +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength) { + BYTE *pToken = cctx->op++; + + /* Encode the literal length and literals. */ + LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); /* Encode the offset. */ MEM_write32(cctx->op, offset); @@ -444,7 +424,7 @@ static void outputBlock(LDM_CCtx *cctx, /* Encode the match length. 
*/ if (matchLength >= ML_MASK) { unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; + *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; MEM_write32(cctx->op, 0xFFFFFFFF); while (matchLengthRemaining >= 4*0xFF) { @@ -455,7 +435,7 @@ static void outputBlock(LDM_CCtx *cctx, cctx->op += matchLengthRemaining / 255; *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); } else { - *token += (BYTE)(matchLength); + *pToken += (BYTE)(matchLength); } } @@ -467,7 +447,7 @@ static void outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); @@ -503,21 +483,23 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - const unsigned literalLength = (unsigned)(cctx.ip - cctx.anchor); - const unsigned offset = cctx.ip - match; - const unsigned matchLength = countMatchLength( - cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + const U32 literalLength = cctx.ip - cctx.anchor; + const U32 offset = cctx.ip - match; + const U32 matchLength = LDM_countMatchLength( + cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, + cctx.ihashLimit); #ifdef COMPUTE_STATS cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + MINMATCH; + cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; #endif - outputBlock(&cctx, literalLength, offset, matchLength); + LDM_outputBlock(&cctx, literalLength, offset, matchLength); // Move ip to end of block, inserting hashes at each position. 
cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + + matchLength + literalLength) { if (cctx.ip > cctx.lastPosHashed) { // TODO: Simplify. LDM_updateLastHashFromNextHash(&cctx); @@ -535,31 +517,21 @@ size_t LDM_compress(const void *src, size_t srcSize, _last_literals: /* Encode the last literals (no more matches). */ { - size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *(cctx.op)++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)accumulator; - } else { - *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(cctx.op, cctx.anchor, lastRun); - cctx.op += lastRun; + const size_t lastRun = (size_t)(cctx.iend - cctx.anchor); + BYTE *pToken = cctx.op++; + LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } #ifdef COMPUTE_STATS - printCompressStats(&cctx); + LDM_printCompressStats(&cctx.stats, cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif return (cctx.op - (const BYTE *)cctx.obase); } -typedef struct LDM_DCtx { - size_t compressSize; - size_t maxDecompressSize; +struct LDM_DCtx { + size_t compressedSize; + size_t maxDecompressedSize; const BYTE *ibase; /* Base of input */ const BYTE *ip; /* Current input position */ @@ -568,25 +540,25 @@ typedef struct LDM_DCtx { const BYTE *obase; /* Base of output */ BYTE *op; /* Current output position */ const BYTE *oend; /* End of output */ -} LDM_DCtx; +}; -static void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - dctx->compressSize = compressSize; - dctx->maxDecompressSize = maxDecompressSize; +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + dctx->compressedSize = compressedSize; + 
dctx->maxDecompressedSize = maxDecompressedSize; dctx->ibase = src; dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressSize; + dctx->iend = dctx->ip + dctx->compressedSize; dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressSize; + dctx->oend = dctx->op + dctx->maxDecompressedSize; } -size_t LDM_decompress(const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { +size_t LDM_decompress(const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); while (dctx.ip < dctx.iend) { BYTE *cpy; @@ -623,7 +595,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, length += s; } while (s == 255); } - length += MINMATCH; + length += LDM_MIN_MATCH_LENGTH; /* Copy match. */ cpy = dctx.op + length; diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index fd8c2ab8..19d475dc 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -9,11 +9,15 @@ #define LDM_DECOMPRESS_SIZE 8 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +// This should be a multiple of four. +#define LDM_MIN_MATCH_LENGTH 8 + typedef U32 offset_t; typedef U32 hash_t; typedef struct LDM_hashEntry LDM_hashEntry; typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; +typedef struct LDM_DCtx LDM_DCtx; /** * Compresses src into dst. @@ -45,17 +49,78 @@ typedef struct LDM_CCtx LDM_CCtx; size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Initialize the compression context. + */ +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize); +/** + * Outputs compression statistics to stdout. 
+ */ +void LDM_printCompressStats(const LDM_compressStats *stats, + const LDM_hashEntry *hashTable, + U32 hashTableSize); +/** + * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the + * LDM_MIN_MATCH_LENGTH bytes from match. + * + * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. + * + * Return 1 if valid, 0 otherwise. + */ +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); + +/** + * Counts the number of bytes that match from pIn and pMatch, + * up to pInLimit. + */ +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit); + +/** + * Encode the literal length followed by the literals. + * + * The literal length is written to the upper four bits of pToken, with + * additional bytes written to the output as needed (see lz4). + * + * This is followed by literalLength bytes corresponding to the literals. + */ +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength); + +/** + * Write current block (literals, literal length, match offset, + * match length). + */ +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength); + +/** + * Decompresses src into dst. + * + * Note: assumes src does not have a header. + */ size_t LDM_decompress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Initialize the decompression context. + */ +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize); + /** * Reads the header from src and writes the compressed size and - * decompressed size into compressSize and decompressSize respectively. + * decompressed size into compressedSize and decompressedSize respectively. * * NB: LDM_compress and LDM_decompress currently do not add/read headers. 
*/ -void LDM_readHeader(const void *src, U64 *compressSize, - U64 *decompressSize); +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize); void LDM_test(void); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 2017cf4e..40afef8c 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -26,7 +26,7 @@ static int compress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; - size_t maxCompressSize, compressSize; + size_t maxCompressedSize, compressedSize; /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -46,11 +46,11 @@ static int compress(const char *fname, const char *oname) { return 1; } - maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + maxCompressedSize = statbuf.st_size + LDM_HEADER_SIZE; /* Go to the location corresponding to the last byte. */ /* TODO: fallocate? */ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + if (lseek(fdout, maxCompressedSize - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -69,32 +69,32 @@ static int compress(const char *fname, const char *oname) { } /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, maxCompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; } - compressSize = LDM_HEADER_SIZE + + compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, maxCompressSize); + dst + LDM_HEADER_SIZE, maxCompressedSize); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 8); + memcpy(dst, &compressedSize, 8); memcpy(dst + 8, &(statbuf.st_size), 8); #ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); + printf("Compressed size: %zu\n", compressedSize); 
printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); #endif - // Truncate file to compressSize. - ftruncate(fdout, compressSize); + // Truncate file to compressedSize. + ftruncate(fdout, compressedSize); printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); + (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, + (double)compressedSize / (statbuf.st_size) * 100); // Close files. close(fdin); @@ -110,7 +110,7 @@ static int decompress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; - U64 compressSize, decompressSize; + U64 compressedSize, decompressedSize; size_t outSize; /* Open the input file. */ @@ -139,10 +139,10 @@ static int decompress(const char *fname, const char *oname) { } /* Read the header. */ - LDM_readHeader(src, &compressSize, &decompressSize); + LDM_readHeader(src, &compressedSize, &decompressedSize); /* Go to the location corresponding to the last byte. 
*/ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + if (lseek(fdout, decompressedSize - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -154,7 +154,7 @@ static int decompress(const char *fname, const char *oname) { } /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; @@ -162,7 +162,7 @@ static int decompress(const char *fname, const char *oname) { outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); + dst, decompressedSize); printf("Ret size out: %zu\n", outSize); ftruncate(fdout, outSize); From 361c06df75d154b793cfb02b7bd279e9a6846cbd Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 15:29:41 -0700 Subject: [PATCH 30/62] Add min/max offset to stats --- contrib/long_distance_matching/ldm.c | 97 +++--- contrib/long_distance_matching/ldm.h | 24 +- .../versions/v0.5/Makefile | 14 +- .../versions/v0.5/ldm.c | 328 ++++++++---------- .../versions/v0.5/ldm.h | 135 ++++++- .../versions/v0.5/main-ldm.c | 254 ++------------ .../versions/v0.5/util.c | 69 ---- .../versions/v0.5/util.h | 25 -- 8 files changed, 386 insertions(+), 560 deletions(-) delete mode 100644 contrib/long_distance_matching/versions/v0.5/util.c delete mode 100644 contrib/long_distance_matching/versions/v0.5/util.h diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index a5057aec..b8e8c63b 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -1,25 +1,14 @@ -#include -#include +#include #include #include +#include +#include #include "ldm.h" // Insert every (HASH_ONLY_EVERY + 1) into the hash table. 
#define HASH_ONLY_EVERY 0 -#define LDM_MEMORY_USAGE 22 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) - -#define LDM_OFFSET_SIZE 4 - -#define WINDOW_SIZE (1 << 29) - -//These should be multiples of four. -#define LDM_HASH_LENGTH 8 - #define ML_BITS 4 #define ML_MASK ((1U<numMatches); - printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + //TODO: compute percentage matched? + printf("num matches, total match length: %u, %llu\n", + stats->numMatches, + stats->totalMatchLength); + printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("Average literal length: %.1f\n", + printf("avg literal length: %.1f\n", ((double)stats->totalLiteralLength) / (double)stats->numMatches); - printf("Average offset length: %.1f\n", + printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + printf("min offset, max offset: %u %u\n", + stats->minOffset, stats->maxOffset); + printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); - - // Output occupancy of hash table. 
- { - U32 i = 0; - U32 ctr = 0; - for (; i < hashTableSize; i++) { - if (hashTable[i].offset == 0) { - ctr++; - } - } - printf("Hash table size, empty slots, %% empty: %u %u %.3f\n", - hashTableSize, ctr, - 100.0 * (double)(ctr) / (double)hashTableSize); - } - - printf("=====================\n"); } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { @@ -219,8 +214,8 @@ static void setNextHash(LDM_CCtx *cctx) { // cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, - (cctx->lastPosHashed)[0], - (cctx->lastPosHashed)[LDM_HASH_LENGTH]); + cctx->lastPosHashed[0], + cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); @@ -243,7 +238,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = (cctx->hashTable)[hash].offset; + offset_t offset = cctx->hashTable[hash].offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -254,8 +249,8 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - (cctx->hashTable)[hash] = - (LDM_hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; + cctx->hashTable[hash] = entry; } cctx->lastPosHashed = cctx->ip; @@ -347,6 +342,7 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, memset(&(cctx->stats), 0, sizeof(cctx->stats)); memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->stats.minOffset = UINT_MAX; cctx->lastPosHashed = NULL; @@ -493,6 +489,10 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; + cctx.stats.minOffset = + offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; + cctx.stats.maxOffset = + offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; #endif LDM_outputBlock(&cctx, literalLength, offset, matchLength); @@ -523,7 +523,8 @@ _last_literals: } #ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats, cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_printCompressStats(&cctx.stats); + LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif return (cctx.op - (const BYTE *)cctx.obase); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 19d475dc..5da3c3b9 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -8,9 +8,19 @@ #define LDM_COMPRESS_SIZE 8 #define LDM_DECOMPRESS_SIZE 8 #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +#define LDM_OFFSET_SIZE 4 -// This should be a multiple of four. +// Defines the size of the hash table. +#define LDM_MEMORY_USAGE 22 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) + +#define WINDOW_SIZE (1 << 25) + +//These should be multiples of four. 
#define LDM_MIN_MATCH_LENGTH 8 +#define LDM_HASH_LENGTH 8 typedef U32 offset_t; typedef U32 hash_t; @@ -55,12 +65,18 @@ size_t LDM_compress(const void *src, size_t srcSize, void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize); + +/** + * Prints the percentage of the hash table occupied (where occupied is defined + * as the entry being non-zero). + */ +void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, + U32 hashTableSize); + /** * Outputs compression statistics to stdout. */ -void LDM_printCompressStats(const LDM_compressStats *stats, - const LDM_hashEntry *hashTable, - U32 hashTableSize); +void LDM_printCompressStats(const LDM_compressStats *stats); /** * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the * LDM_MIN_MATCH_LENGTH bytes from match. diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile index fa4abce6..dee686bc 100644 --- a/contrib/long_distance_matching/versions/v0.5/Makefile +++ b/contrib/long_distance_matching/versions/v0.5/Makefile @@ -1,5 +1,15 @@ +# ################################################################ +# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+# ################################################################ + # This Makefile presumes libzstd is installed, using `sudo make install` +CPPFLAGS+= -I../../../../lib/common CFLAGS ?= -O3 DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ @@ -17,11 +27,11 @@ default: all all: main-ldm -main-ldm : util.c ldm.c main-ldm.c +main-ldm : ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-ldm + main main-ldm @echo Cleaning completed diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c index 5fa20c06..b8e8c63b 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/ldm.c @@ -1,63 +1,46 @@ -#include -#include +#include #include #include +#include +#include #include "ldm.h" -#include "util.h" // Insert every (HASH_ONLY_EVERY + 1) into the hash table. #define HASH_ONLY_EVERY 0 -#define LDM_MEMORY_USAGE 20 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) - -#define LDM_OFFSET_SIZE 4 - -#define WINDOW_SIZE (1 << 20) - -//These should be multiples of four. 
-#define LDM_HASH_LENGTH 4 -#define MINMATCH 4 - #define ML_BITS 4 #define ML_MASK ((1U<stats); +void LDM_outputHashtableOccupancy( + const LDM_hashEntry *hashTable, U32 hashTableSize) { + U32 i = 0; + U32 ctr = 0; + for (; i < hashTableSize; i++) { + if (hashTable[i].offset == 0) { + ctr++; + } + } + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + hashTableSize, ctr, + 100.0 * (double)(ctr) / (double)hashTableSize); +} + +void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); printf("Compression statistics\n"); - printf("Total number of matches: %u\n", stats->numMatches); - printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / + //TODO: compute percentage matched? + printf("num matches, total match length: %u, %llu\n", + stats->numMatches, + stats->totalMatchLength); + printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("Average literal length: %.1f\n", + printf("avg literal length: %.1f\n", ((double)stats->totalLiteralLength) / (double)stats->numMatches); - printf("Average offset length: %.1f\n", + printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("Num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", + printf("min offset, max offset: %u %u\n", + stats->minOffset, stats->maxOffset); + printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); - - // Output occupancy of hash table. 
- { - U32 i = 0; - U32 ctr = 0; - for (; i < LDM_HASHTABLESIZE_U32; i++) { - if ((cctx->hashTable)[i].offset == 0) { - ctr++; - } - } - printf("Hash table size, empty slots, %% empty: %u %u %.3f\n", - LDM_HASHTABLESIZE_U32, ctr, - 100.0 * (double)(ctr) / (double)LDM_HASHTABLESIZE_U32); - } - - printf("=====================\n"); } -#endif -/** - * Checks whether the MINMATCH bytes from p are the same as the MINMATCH - * bytes from match. - * - * This assumes MINMATCH is a multiple of four. - * - * Return 1 if valid, 0 otherwise. - */ -static int LDM_isValidMatch(const BYTE *p, const BYTE *match) { +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { /* - if (memcmp(p, match, MINMATCH) == 0) { + if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) { return 1; } return 0; */ //TODO: This seems to be faster for some reason? - U16 lengthLeft = MINMATCH; - const BYTE *curP = p; - const BYTE *curMatch = match; + U32 lengthLeft = LDM_MIN_MATCH_LENGTH; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; for (; lengthLeft >= 8; lengthLeft -= 8) { - if (LDM_read64(curP) != LDM_read64(curMatch)) { + if (MEM_read64(curIn) != MEM_read64(curMatch)) { return 0; } - curP += 8; + curIn += 8; curMatch += 8; } if (lengthLeft > 0) { - return (LDM_read32(curP) == LDM_read32(curMatch)); + return (MEM_read32(curIn) == MEM_read32(curMatch)); } return 1; } @@ -183,19 +155,21 @@ static hash_t checksumToHash(U32 sum) { * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) */ -static U32 getChecksum(const char *data, U32 len) { +static U32 getChecksum(const BYTE *buf, U32 len) { U32 i; U32 s1, s2; - const schar *buf = (const schar *)data; s1 = s2 = 0; for (i = 0; i < (len - 4); i += 4) { s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; + (2 * buf[i + 2]) + (buf[i + 3]) + + (10 * CHECKSUM_CHAR_OFFSET); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i 
+ 3] + + + (4 * CHECKSUM_CHAR_OFFSET); + } for(; i < len; i++) { - s1 += buf[i]; + s1 += buf[i] + CHECKSUM_CHAR_OFFSET; s2 += s1; } return (s1 & 0xffff) + (s2 << 16); @@ -211,9 +185,9 @@ static U32 getChecksum(const char *data, U32 len) { * Thus toRemove should correspond to data[0]. */ static U32 updateChecksum(U32 sum, U32 len, - schar toRemove, schar toAdd) { + BYTE toRemove, BYTE toAdd) { U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - (toRemove * len) + s1; + U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; return (s1 & 0xffff) + (s2 << 16); } @@ -240,13 +214,13 @@ static void setNextHash(LDM_CCtx *cctx) { // cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, - (schar)((cctx->lastPosHashed)[0]), - (schar)((cctx->lastPosHashed)[LDM_HASH_LENGTH])); + cctx->lastPosHashed[0], + cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); #ifdef RUN_CHECKS - check = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); if (check != cctx->nextSum) { printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); @@ -264,7 +238,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = (cctx->hashTable)[hash].offset; + offset_t offset = cctx->hashTable[hash].offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -275,7 +249,8 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - (cctx->hashTable)[hash] = (hashEntry){ (offset_t)(cctx->ip - cctx->ibase) }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; + cctx->hashTable[hash] = entry; } cctx->lastPosHashed = cctx->ip; @@ -303,7 +278,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { * Insert hash of the current position into the hash table. */ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum((const char *)cctx->ip, LDM_HASH_LENGTH); + U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); hash_t hash = checksumToHash(sum); #ifdef RUN_CHECKS @@ -323,40 +298,33 @@ static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { return cctx->hashTable[hash].offset + cctx->ibase; } -/** - * Counts the number of bytes that match from pIn and pMatch, - * up to pInLimit. - * - * TODO: make more efficient. - */ -static unsigned countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { - BYTE const diff = LDM_readByte(pMatch) ^ LDM_readByte(pIn); + BYTE const diff = (*pMatch) ^ *(pIn); if (!diff) { pIn++; pMatch++; continue; } - return (unsigned)(pIn - pStart); + return (U32)(pIn - pStart); } - return (unsigned)(pIn - pStart); + return (U32)(pIn - pStart); } -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize) { - const U32 *ip = (const U32 *)src; - *compressSize = *ip++; - *decompressSize = *ip; +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize) { + const BYTE *ip = (const BYTE *)src; + *compressedSize = MEM_readLE64(ip); + ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip); + // ip += sizeof(U64); } -/** - * Initialize a compression context. 
- */ -static void initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { cctx->isize = srcSize; cctx->maxOSize = maxDstSize; @@ -365,7 +333,7 @@ static void initializeCCtx(LDM_CCtx *cctx, cctx->iend = cctx->ibase + srcSize; cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - MINMATCH; + cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; cctx->obase = (BYTE *)dst; cctx->op = (BYTE *)dst; @@ -374,6 +342,7 @@ static void initializeCCtx(LDM_CCtx *cctx, memset(&(cctx->stats), 0, sizeof(cctx->stats)); memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->stats.minOffset = UINT_MAX; cctx->lastPosHashed = NULL; @@ -416,61 +385,65 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 0; } -/** - * Write current block (literals, literal length, match offset, - * match length). - * - * Update input pointer, inserting hashes into hash table along the way. - */ -static void outputBlock(LDM_CCtx *cctx, - unsigned const literalLength, - unsigned const offset, - unsigned const matchLength) { - BYTE *token = cctx->op++; - +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); + *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; } *(cctx->op)++ = (BYTE)len; } else { - *token = (BYTE)(literalLength << ML_BITS); + *pToken = (BYTE)(literalLength << ML_BITS); } /* Encode the literals. */ memcpy(cctx->op, cctx->anchor, literalLength); cctx->op += literalLength; +} + +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength) { + BYTE *pToken = cctx->op++; + + /* Encode the literal length and literals. 
*/ + LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); /* Encode the offset. */ - LDM_write32(cctx->op, offset); + MEM_write32(cctx->op, offset); cctx->op += LDM_OFFSET_SIZE; /* Encode the match length. */ if (matchLength >= ML_MASK) { unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; + *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; - LDM_write32(cctx->op, 0xFFFFFFFF); + MEM_write32(cctx->op, 0xFFFFFFFF); while (matchLengthRemaining >= 4*0xFF) { cctx->op += 4; - LDM_write32(cctx->op, 0xffffffff); + MEM_write32(cctx->op, 0xffffffff); matchLengthRemaining -= 4*0xFF; } cctx->op += matchLengthRemaining / 255; *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); } else { - *token += (BYTE)(matchLength); + *pToken += (BYTE)(matchLength); } } -// TODO: srcSize and maxDstSize is unused +// TODO: maxDstSize is unused. This function may seg fault when writing +// beyond the size of dst, as it does not check maxDstSize. Writing to +// a buffer and performing checks is a possible solution. +// +// This is based upon lz4. size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); @@ -506,21 +479,27 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. 
*/ { - unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); - unsigned const offset = cctx.ip - match; - unsigned const matchLength = countMatchLength( - cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); + const U32 literalLength = cctx.ip - cctx.anchor; + const U32 offset = cctx.ip - match; + const U32 matchLength = LDM_countMatchLength( + cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, + cctx.ihashLimit); #ifdef COMPUTE_STATS cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + MINMATCH; + cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; + cctx.stats.minOffset = + offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; + cctx.stats.maxOffset = + offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; #endif - outputBlock(&cctx, literalLength, offset, matchLength); + LDM_outputBlock(&cctx, literalLength, offset, matchLength); // Move ip to end of block, inserting hashes at each position. cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { + while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + + matchLength + literalLength) { if (cctx.ip > cctx.lastPosHashed) { // TODO: Simplify. LDM_updateLastHashFromNextHash(&cctx); @@ -538,31 +517,22 @@ size_t LDM_compress(const void *src, size_t srcSize, _last_literals: /* Encode the last literals (no more matches). 
*/ { - size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *(cctx.op)++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)accumulator; - } else { - *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(cctx.op, cctx.anchor, lastRun); - cctx.op += lastRun; + const size_t lastRun = (size_t)(cctx.iend - cctx.anchor); + BYTE *pToken = cctx.op++; + LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } #ifdef COMPUTE_STATS - printCompressStats(&cctx); + LDM_printCompressStats(&cctx.stats); + LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif return (cctx.op - (const BYTE *)cctx.obase); } -typedef struct LDM_DCtx { - size_t compressSize; - size_t maxDecompressSize; +struct LDM_DCtx { + size_t compressedSize; + size_t maxDecompressedSize; const BYTE *ibase; /* Base of input */ const BYTE *ip; /* Current input position */ @@ -571,26 +541,25 @@ typedef struct LDM_DCtx { const BYTE *obase; /* Base of output */ BYTE *op; /* Current output position */ const BYTE *oend; /* End of output */ -} LDM_DCtx; +}; -static void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - dctx->compressSize = compressSize; - dctx->maxDecompressSize = maxDecompressSize; +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + dctx->compressedSize = compressedSize; + dctx->maxDecompressedSize = maxDecompressedSize; dctx->ibase = src; dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressSize; + dctx->iend = dctx->ip + dctx->compressedSize; dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressSize; - + dctx->oend = dctx->op + dctx->maxDecompressedSize; } -size_t LDM_decompress(const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { 
+size_t LDM_decompress(const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); + LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); while (dctx.ip < dctx.iend) { BYTE *cpy; @@ -598,7 +567,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, size_t length, offset; /* Get the literal length. */ - unsigned const token = *(dctx.ip)++; + const unsigned token = *(dctx.ip)++; if ((length = (token >> ML_BITS)) == RUN_MASK) { unsigned s; do { @@ -614,7 +583,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, dctx.op = cpy; //TODO : dynamic offset size - offset = LDM_read32(dctx.ip); + offset = MEM_read32(dctx.ip); dctx.ip += LDM_OFFSET_SIZE; match = dctx.op - offset; @@ -627,7 +596,7 @@ size_t LDM_decompress(const void *src, size_t compressSize, length += s; } while (s == 255); } - length += MINMATCH; + length += LDM_MIN_MATCH_LENGTH; /* Copy match. */ cpy = dctx.op + length; @@ -640,6 +609,11 @@ size_t LDM_decompress(const void *src, size_t compressSize, return dctx.op - (BYTE *)dst; } +// TODO: implement and test hash function +void LDM_test(void) { + +} + /* void LDM_test(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.h b/contrib/long_distance_matching/versions/v0.5/ldm.h index 1bd19745..5da3c3b9 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.h +++ b/contrib/long_distance_matching/versions/v0.5/ldm.h @@ -3,24 +3,141 @@ #include /* size_t */ -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +#include "mem.h" // from /lib/common/mem.h +#define LDM_COMPRESS_SIZE 8 +#define LDM_DECOMPRESS_SIZE 8 +#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +#define LDM_OFFSET_SIZE 4 + +// Defines the size of the hash table. 
+#define LDM_MEMORY_USAGE 22 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) + +#define WINDOW_SIZE (1 << 25) + +//These should be multiples of four. +#define LDM_MIN_MATCH_LENGTH 8 +#define LDM_HASH_LENGTH 8 + +typedef U32 offset_t; +typedef U32 hash_t; +typedef struct LDM_hashEntry LDM_hashEntry; +typedef struct LDM_compressStats LDM_compressStats; +typedef struct LDM_CCtx LDM_CCtx; +typedef struct LDM_DCtx LDM_DCtx; + +/** + * Compresses src into dst. + * + * NB: This currently ignores maxDstSize and assumes enough space is available. + * + * Block format (see lz4 documentation for more information): + * github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md + * + * A block is composed of sequences. Each sequence begins with a token, which + * is a one-byte value separated into two 4-bit fields. + * + * The first field uses the four high bits of the token and encodes the literal + * length. If the field value is 0, there is no literal. If it is 15, + * additional bytes are added (each ranging from 0 to 255) to the previous + * value to produce a total length. + * + * Following the token and optional length bytes are the literals. + * + * Next are the 4 bytes representing the offset of the match (2 in lz4), + * representing the position to copy the literals. + * + * The lower four bits of the token encode the match length. With additional + * bytes added similarly to the additional literal length bytes after the offset. + * + * The last sequence is incomplete and stops right after the lieterals. + * + */ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Initialize the compression context. + */ +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize); + +/** + * Prints the percentage of the hash table occupied (where occupied is defined + * as the entry being non-zero). 
+ */ +void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, + U32 hashTableSize); + +/** + * Outputs compression statistics to stdout. + */ +void LDM_printCompressStats(const LDM_compressStats *stats); +/** + * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the + * LDM_MIN_MATCH_LENGTH bytes from match. + * + * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. + * + * Return 1 if valid, 0 otherwise. + */ +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); + +/** + * Counts the number of bytes that match from pIn and pMatch, + * up to pInLimit. + */ +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit); + +/** + * Encode the literal length followed by the literals. + * + * The literal length is written to the upper four bits of pToken, with + * additional bytes written to the output as needed (see lz4). + * + * This is followed by literalLength bytes corresponding to the literals. + */ +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength); + +/** + * Write current block (literals, literal length, match offset, + * match length). + */ +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength); + +/** + * Decompresses src into dst. + * + * Note: assumes src does not have a header. + */ size_t LDM_decompress(const void *src, size_t srcSize, void *dst, size_t maxDstSize); /** - * Reads the header from src and writes the compressed size and - * decompressed size into compressSize and decompressSize respectively. + * Initialize the decompression context. 
*/ -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize); +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize); -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); +/** + * Reads the header from src and writes the compressed size and + * decompressed size into compressedSize and decompressedSize respectively. + * + * NB: LDM_compress and LDM_decompress currently do not add/read headers. + */ +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize); + +void LDM_test(void); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.5/main-ldm.c b/contrib/long_distance_matching/versions/v0.5/main-ldm.c index fbfd789b..40afef8c 100644 --- a/contrib/long_distance_matching/versions/v0.5/main-ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/main-ldm.c @@ -1,5 +1,3 @@ -// TODO: file size must fit into a U32 - #include #include #include @@ -12,18 +10,23 @@ #include #include "ldm.h" +#include "zstd.h" #define DEBUG //#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. + * + * TODO: This currently seg faults if the compressed size is > the decompress + * size due to the mmapping and output file size allocated to be the input size. + * The compress function should check before writing or buffer writes. */ static int compress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; - size_t maxCompressSize, compressSize; + size_t maxCompressedSize, compressedSize; /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -43,11 +46,11 @@ static int compress(const char *fname, const char *oname) { return 1; } - maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; + maxCompressedSize = statbuf.st_size + LDM_HEADER_SIZE; /* Go to the location corresponding to the last byte. 
*/ /* TODO: fallocate? */ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { + if (lseek(fdout, maxCompressedSize - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -66,39 +69,32 @@ static int compress(const char *fname, const char *oname) { } /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, maxCompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; } -/* -#ifdef TEST - LDM_test(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); -#endif -*/ - - compressSize = LDM_HEADER_SIZE + + compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); + dst + LDM_HEADER_SIZE, maxCompressedSize); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); + memcpy(dst, &compressedSize, 8); + memcpy(dst + 8, &(statbuf.st_size), 8); #ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); + printf("Compressed size: %zu\n", compressedSize); printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); #endif - // Truncate file to compressSize. - ftruncate(fdout, compressSize); + // Truncate file to compressedSize. + ftruncate(fdout, compressedSize); printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); + (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, + (double)compressedSize / (statbuf.st_size) * 100); // Close files. close(fdin); @@ -114,7 +110,8 @@ static int decompress(const char *fname, const char *oname) { int fdin, fdout; struct stat statbuf; char *src, *dst; - size_t compressSize, decompressSize, outSize; + U64 compressedSize, decompressedSize; + size_t outSize; /* Open the input file. 
*/ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -142,10 +139,10 @@ static int decompress(const char *fname, const char *oname) { } /* Read the header. */ - LDM_readHeader(src, &compressSize, &decompressSize); + LDM_readHeader(src, &compressedSize, &decompressedSize); /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { + if (lseek(fdout, decompressedSize - 1, SEEK_SET) == -1) { perror("lseek error"); return 1; } @@ -157,7 +154,7 @@ static int decompress(const char *fname, const char *oname) { } /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, + if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); return 1; @@ -165,7 +162,7 @@ static int decompress(const char *fname, const char *oname) { outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); + dst, decompressedSize); printf("Ret size out: %zu\n", outSize); ftruncate(fdout, outSize); @@ -265,204 +262,9 @@ int main(int argc, const char *argv[]) { } /* verify */ verify(inpFilename, decFilename); - return 0; -} - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; - } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - 
printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. - */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", 
ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} +#ifdef TEST + LDM_test(); #endif - + return 0; +} diff --git a/contrib/long_distance_matching/versions/v0.5/util.c b/contrib/long_distance_matching/versions/v0.5/util.c deleted file mode 100644 index 70fcbc2c..00000000 --- a/contrib/long_distance_matching/versions/v0.5/util.c +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include -#include - -#include "util.h" - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; - -unsigned LDM_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; - return one.c[0]; -} - -U16 LDM_read16(const void *memPtr) { - U16 val; - memcpy(&val, 
memPtr, sizeof(val)); - return val; -} - -U16 LDM_readLE16(const void *memPtr) { - if (LDM_isLittleEndian()) { - return LDM_read16(memPtr); - } else { - const BYTE *p = (const BYTE *)memPtr; - return (U16)((U16)p[0] + (p[1] << 8)); - } -} - -void LDM_write16(void *memPtr, U16 value){ - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_write32(void *memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_writeLE16(void *memPtr, U16 value) { - if (LDM_isLittleEndian()) { - LDM_write16(memPtr, value); - } else { - BYTE* p = (BYTE *)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - -BYTE LDM_readByte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - diff --git a/contrib/long_distance_matching/versions/v0.5/util.h b/contrib/long_distance_matching/versions/v0.5/util.h deleted file mode 100644 index d1c3c999..00000000 --- a/contrib/long_distance_matching/versions/v0.5/util.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef LDM_UTIL_H -#define LDM_UTIL_H - -unsigned LDM_isLittleEndian(void); - -uint16_t LDM_read16(const void *memPtr); - -uint16_t LDM_readLE16(const void *memPtr); - -void LDM_write16(void *memPtr, uint16_t value); - -void LDM_write32(void *memPtr, uint32_t value); - -void LDM_writeLE16(void *memPtr, uint16_t value); - -uint32_t LDM_read32(const void *ptr); - -uint64_t LDM_read64(const void *ptr); - -void LDM_copy8(void *dst, const void *src); - -uint8_t LDM_readByte(const void *ptr); - - -#endif /* LDM_UTIL_H */ From 175a6c602928cd1277d961271b7d782d496520eb Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 13 Jul 2017 16:16:31 -0700 Subject: [PATCH 31/62] [ldm] Minor refactoring --- contrib/long_distance_matching/ldm.c | 30 ++++++++++++---------------- 1 file changed, 13 insertions(+), 
17 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b8e8c63b..437feb1c 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -443,24 +443,19 @@ void LDM_outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; + const BYTE *match; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); - // TODO: loop condition is not accurate. - while (1) { - const BYTE *match; - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - if (LDM_findBestMatch(&cctx, &match) != 0) { - goto _last_literals; - } + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + while (LDM_findBestMatch(&cctx, &match) == 0) { #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif @@ -485,6 +480,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, cctx.ihashLimit); + LDM_outputBlock(&cctx, literalLength, offset, matchLength); + #ifdef COMPUTE_STATS cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; @@ -494,7 +491,6 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; #endif - LDM_outputBlock(&cctx, literalLength, offset, matchLength); // Move ip to end of block, inserting hashes at each position. 
cctx.nextIp = cctx.ip + cctx.step; @@ -514,10 +510,10 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.anchor = cctx.ip; LDM_updateLastHashFromNextHash(&cctx); } -_last_literals: + /* Encode the last literals (no more matches). */ { - const size_t lastRun = (size_t)(cctx.iend - cctx.anchor); + const size_t lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } @@ -527,7 +523,7 @@ _last_literals: LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif - return (cctx.op - (const BYTE *)cctx.obase); + return cctx.op - cctx.obase; } struct LDM_DCtx { From 4db7f12ef3a3466684a786de4f61b8ad8ef6950e Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 10:52:03 -0700 Subject: [PATCH 32/62] Add offset histogram --- contrib/long_distance_matching/Makefile | 2 +- contrib/long_distance_matching/ldm.c | 41 +++++++++++++++++++++-- contrib/long_distance_matching/ldm.h | 6 ++-- contrib/long_distance_matching/main-ldm.c | 2 +- 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 8ba16d03..cff78644 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,7 +27,7 @@ default: all all: main-ldm -main-ldm : ldm.c main-ldm.c +main-ldm : ldm.h ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 437feb1c..186fa08e 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -15,7 +15,7 @@ #define RUN_MASK ((1U<>= 1) { + ret++; + } + return ret; +} + void LDM_printCompressStats(const LDM_compressStats *stats) { + int i = 0; printf("=====================\n"); printf("Compression statistics\n"); //TODO: compute percentage matched? 
@@ -107,11 +131,22 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { ((double)stats->totalOffset) / (double)stats->numMatches); printf("min offset, max offset: %u %u\n", stats->minOffset, stats->maxOffset); + + printf("\n"); + printf("offset histogram\n"); + for (; i <= intLog2(stats->maxOffset); i++) { + printf("2^%*d: %10u\n", 2, i, stats->offsetHistogram[i]); + } + printf("\n"); + + printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); + printf("=====================\n"); + } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { @@ -145,7 +180,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { * of the hash table. */ static hash_t checksumToHash(U32 sum) { - return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG)); + return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -490,6 +525,7 @@ size_t LDM_compress(const void *src, size_t srcSize, offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; + cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; #endif // Move ip to end of block, inserting hashes at each position. @@ -607,7 +643,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize, // TODO: implement and test hash function void LDM_test(void) { - } /* diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 5da3c3b9..0e54faa7 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,7 +11,7 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. 
-#define LDM_MEMORY_USAGE 22 +#define LDM_MEMORY_USAGE 20 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -19,8 +19,8 @@ #define WINDOW_SIZE (1 << 25) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 8 -#define LDM_HASH_LENGTH 8 +#define LDM_MIN_MATCH_LENGTH 4 +#define LDM_HASH_LENGTH 4 typedef U32 offset_t; typedef U32 hash_t; diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 40afef8c..ea6375ba 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -13,7 +13,7 @@ #include "zstd.h" #define DEBUG -//#define TEST +#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. From 55f960e8db9e3df9322ae9c973c699f671f51c90 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 11:00:20 -0700 Subject: [PATCH 33/62] Add percentages to offset histogram --- contrib/long_distance_matching/ldm.c | 5 ++++- contrib/long_distance_matching/ldm.h | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 186fa08e..d935a2bd 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -135,7 +135,10 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("\n"); printf("offset histogram\n"); for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u\n", 2, i, stats->offsetHistogram[i]); + printf("2^%*d: %10u %6.3f%%\n", 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double)stats->numMatches); } printf("\n"); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 0e54faa7..3c8c04ec 100644 --- a/contrib/long_distance_matching/ldm.h +++ 
b/contrib/long_distance_matching/ldm.h @@ -19,8 +19,8 @@ #define WINDOW_SIZE (1 << 25) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 4 -#define LDM_HASH_LENGTH 4 +#define LDM_MIN_MATCH_LENGTH 1024 +#define LDM_HASH_LENGTH 1024 typedef U32 offset_t; typedef U32 hash_t; From 2d8e6c6608bc0de2d24a40ad6fe7a2fb8e8377bf Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 12:31:01 -0700 Subject: [PATCH 34/62] Add more statistics --- contrib/long_distance_matching/ldm.c | 61 ++++++++++++++++++++++------ contrib/long_distance_matching/ldm.h | 19 ++++++--- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index d935a2bd..c9c6a709 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -37,6 +37,7 @@ typedef struct LDM_hashTable { // TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { + U32 windowSizeLog, hashTableSizeLog; U32 numMatches; U64 totalMatchLength; U64 totalLiteralLength; @@ -73,7 +74,9 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; + LDM_hashEntry *hashTable; + +// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ @@ -90,7 +93,7 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; -void LDM_outputHashtableOccupancy( +void LDM_outputHashTableOccupancy( const LDM_hashEntry *hashTable, U32 hashTableSize) { U32 i = 0; U32 ctr = 0; @@ -104,9 +107,8 @@ void LDM_outputHashtableOccupancy( 100.0 * (double)(ctr) / (double)hashTableSize); } -// TODO: This can be done more efficienctly but is not that important as it -// is only used for computing stats. -// +// TODO: This can be done more efficiently (but it is not that important as it +// is only used for computing stats). 
static int intLog2(U32 x) { int ret = 0; while (x >>= 1) { @@ -115,30 +117,57 @@ static int intLog2(U32 x) { return ret; } +// TODO: Maybe we would eventually prefer to have linear rather than +// exponential buckets. +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { + int i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < LDM_HASHTABLESIZE_U32; i++) { + int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) LDM_HASHTABLESIZE_U32); + } + printf("\n"); +} + void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); printf("Compression statistics\n"); //TODO: compute percentage matched? + printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length: %u, %llu\n", stats->numMatches, stats->totalMatchLength); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("avg literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u %u\n", + printf("min offset, max offset: %u, %u\n", stats->minOffset, stats->maxOffset); printf("\n"); - printf("offset histogram\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + for (; i <= intLog2(stats->maxOffset); i++) { printf("2^%*d: %10u %6.3f%%\n", 2, i, stats->offsetHistogram[i], 100.0 * (double) stats->offsetHistogram[i] / - (double)stats->numMatches); + (double) 
stats->numMatches); } printf("\n"); @@ -379,8 +408,12 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + cctx->lastPosHashed = NULL; @@ -417,7 +450,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { *match = getPositionOnHash(cctx, h); putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > WINDOW_SIZE || + } while (cctx->ip - *match > LDM_WINDOW_SIZE || !LDM_isValidMatch(cctx->ip, *match)); setNextHash(cctx); return 0; @@ -550,6 +583,8 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } + // LDM_outputHashTableOffsetHistogram(&cctx); + /* Encode the last literals (no more matches). */ { const size_t lastRun = cctx.iend - cctx.anchor; @@ -559,7 +594,7 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif return cctx.op - cctx.obase; diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 3c8c04ec..87444359 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,16 +11,17 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. 
-#define LDM_MEMORY_USAGE 20 +#define LDM_MEMORY_USAGE 16 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define WINDOW_SIZE (1 << 25) +#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 1024 -#define LDM_HASH_LENGTH 1024 +#define LDM_MIN_MATCH_LENGTH 4 +#define LDM_HASH_LENGTH 4 typedef U32 offset_t; typedef U32 hash_t; @@ -70,9 +71,17 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, +void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, U32 hashTableSize); +/** + * Prints the distribution of offsets in the hash table. + * + * The offsets are defined as the distance of the hash table entry from the + * current input position of the cctx. + */ +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); + /** * Outputs compression statistics to stdout. 
*/ From 6e443b4960091fb7b885b83372b1edf79f873564 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 14:27:55 -0700 Subject: [PATCH 35/62] Move hash table access for own functions --- contrib/long_distance_matching/ldm.c | 77 ++++++---- contrib/long_distance_matching/ldm.h | 11 +- .../versions/v0.5/Makefile | 4 +- .../versions/v0.5/ldm.c | 133 ++++++++++++++---- .../versions/v0.5/ldm.h | 26 +++- .../versions/v0.5/main-ldm.c | 2 +- 6 files changed, 189 insertions(+), 64 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index c9c6a709..08cb856c 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -23,17 +23,23 @@ struct LDM_hashEntry { offset_t offset; }; -typedef struct LDM_hashTable { - U32 numEntries; - U32 minimumTagMask; // TODO: what if tag == offset? - - // Maximum number of elements in the table. - U32 limit; - +// TODO: move to its own file. +struct LDM_hashTable { + U32 size; LDM_hashEntry *entries; -} LDM_hashTable; +}; + +LDM_hashEntry *HASH_getHash( + const LDM_hashTable *table, const hash_t hash) { + return &(table->entries[hash]); +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *HASH_getHash(table, hash) = entry; +} + -// TODO: Add offset histogram by powers of two // TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { @@ -74,7 +80,9 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashEntry *hashTable; + LDM_hashTable hashTable; + +// LDM_hashEntry *hashTable; // LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; @@ -93,18 +101,19 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; -void LDM_outputHashTableOccupancy( - const LDM_hashEntry *hashTable, U32 hashTableSize) { + + +void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) { U32 i = 0; U32 ctr = 0; - for (; i < hashTableSize; i++) { - if (hashTable[i].offset == 0) { + for (; i < 
hashTable->size; i++) { + if (HASH_getHash(hashTable, i)->offset == 0) { ctr++; } } printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - hashTableSize, ctr, - 100.0 * (double)(ctr) / (double)hashTableSize); + hashTable->size, ctr, + 100.0 * (double)(ctr) / (double)hashTable->size); } // TODO: This can be done more efficiently (but it is not that important as it @@ -120,13 +129,14 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { - int i = 0; + U32 i = 0; int buckets[32] = { 0 }; printf("\n"); printf("Hash table histogram\n"); - for (; i < LDM_HASHTABLESIZE_U32; i++) { - int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + for (; i < cctx->hashTable.size; i++) { + int offset = (cctx->ip - cctx->ibase) - + HASH_getHash(&cctx->hashTable, i)->offset; buckets[intLog2(offset)]++; } @@ -135,7 +145,7 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { printf("2^%*d: %10u %6.3f%%\n", 2, i, buckets[i], 100.0 * (double) buckets[i] / - (double) LDM_HASHTABLESIZE_U32); + (double) cctx->hashTable.size); } printf("\n"); } @@ -305,7 +315,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = cctx->hashTable[hash].offset; + offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -317,7 +327,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; - cctx->hashTable[hash] = entry; + HASH_insert(&cctx->hashTable, hash, entry); } cctx->lastPosHashed = cctx->ip; @@ -362,7 +372,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { * Returns the position of the entry at hashTable[hash]. */ static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { - return cctx->hashTable[hash].offset + cctx->ibase; + return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase; } U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, @@ -389,6 +399,11 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +static void LDM_initializeHashTable(LDM_hashTable *table) { + table->size = LDM_HASHTABLESIZE_U32; + table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +} + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -408,7 +423,9 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); + + LDM_initializeHashTable(&cctx->hashTable); +// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); // memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; @@ -424,6 +441,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->DEBUG_setNextHash = 0; } +void LDM_destroyCCtx(LDM_CCtx *cctx) { + free((cctx->hashTable).entries); +} + /** * Finds the "best" match. 
* @@ -594,10 +615,14 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(&cctx.hashTable); #endif - return cctx.op - cctx.obase; + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } } struct LDM_DCtx { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 87444359..8c3aa4e6 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -26,6 +26,7 @@ typedef U32 offset_t; typedef U32 hash_t; typedef struct LDM_hashEntry LDM_hashEntry; +typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -62,17 +63,23 @@ size_t LDM_compress(const void *src, size_t srcSize, /** * Initialize the compression context. + * + * Allocates memory for the hash table. */ void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Frees up memory allocating in initializeCCtx + */ +void LDM_destroyCCtx(LDM_CCtx *cctx); + /** * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, - U32 hashTableSize); +void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable); /** * Prints the distribution of offsets in the hash table. 
diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile index dee686bc..cff78644 100644 --- a/contrib/long_distance_matching/versions/v0.5/Makefile +++ b/contrib/long_distance_matching/versions/v0.5/Makefile @@ -9,7 +9,7 @@ # This Makefile presumes libzstd is installed, using `sudo make install` -CPPFLAGS+= -I../../../../lib/common +CPPFLAGS+= -I../../lib/common CFLAGS ?= -O3 DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ @@ -27,7 +27,7 @@ default: all all: main-ldm -main-ldm : ldm.c main-ldm.c +main-ldm : ldm.h ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c index b8e8c63b..06c97bc4 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/ldm.c @@ -15,7 +15,7 @@ #define RUN_MASK ((1U<>= 1) { + ret++; + } + return ret; +} + +// TODO: Maybe we would eventually prefer to have linear rather than +// exponential buckets. +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { + int i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < LDM_HASHTABLESIZE_U32; i++) { + int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) LDM_HASHTABLESIZE_U32); + } + printf("\n"); +} + void LDM_printCompressStats(const LDM_compressStats *stats) { + int i = 0; printf("=====================\n"); printf("Compression statistics\n"); //TODO: compute percentage matched? 
+ printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length: %u, %llu\n", stats->numMatches, stats->totalMatchLength); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); - printf("avg literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); printf("avg offset length: %.1f\n", ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u %u\n", + printf("min offset, max offset: %u, %u\n", stats->minOffset, stats->maxOffset); + + printf("\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + + for (; i <= intLog2(stats->maxOffset); i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches); + } + printf("\n"); + + printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); + printf("=====================\n"); + } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { @@ -145,7 +212,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { * of the hash table. 
*/ static hash_t checksumToHash(U32 sum) { - return ((sum * 2654435761U) >> ((32)-LDM_HASHLOG)); + return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -341,8 +408,12 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); + cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); +// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + cctx->lastPosHashed = NULL; @@ -353,6 +424,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->DEBUG_setNextHash = 0; } +void LDM_destroyCCtx(LDM_CCtx *cctx) { + free(cctx->hashTable); +} + /** * Finds the "best" match. * @@ -379,7 +454,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { *match = getPositionOnHash(cctx, h); putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > WINDOW_SIZE || + } while (cctx->ip - *match > LDM_WINDOW_SIZE || !LDM_isValidMatch(cctx->ip, *match)); setNextHash(cctx); return 0; @@ -443,24 +518,19 @@ void LDM_outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; + const BYTE *match; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); - // TODO: loop condition is not accurate. - while (1) { - const BYTE *match; - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - if (LDM_findBestMatch(&cctx, &match) != 0) { - goto _last_literals; - } + /** + * Find a match. + * If no more matches can be found (i.e. 
the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. + */ + while (LDM_findBestMatch(&cctx, &match) == 0) { #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif @@ -485,6 +555,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, cctx.ihashLimit); + LDM_outputBlock(&cctx, literalLength, offset, matchLength); + #ifdef COMPUTE_STATS cctx.stats.totalLiteralLength += literalLength; cctx.stats.totalOffset += offset; @@ -493,8 +565,8 @@ size_t LDM_compress(const void *src, size_t srcSize, offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; + cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; #endif - LDM_outputBlock(&cctx, literalLength, offset, matchLength); // Move ip to end of block, inserting hashes at each position. cctx.nextIp = cctx.ip + cctx.step; @@ -514,20 +586,26 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.anchor = cctx.ip; LDM_updateLastHashFromNextHash(&cctx); } -_last_literals: + + // LDM_outputHashTableOffsetHistogram(&cctx); + /* Encode the last literals (no more matches). 
*/ { - const size_t lastRun = (size_t)(cctx.iend - cctx.anchor); + const size_t lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashtableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); + LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); #endif - return (cctx.op - (const BYTE *)cctx.obase); + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } } struct LDM_DCtx { @@ -611,7 +689,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize, // TODO: implement and test hash function void LDM_test(void) { - } /* diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.h b/contrib/long_distance_matching/versions/v0.5/ldm.h index 5da3c3b9..70cda8b8 100644 --- a/contrib/long_distance_matching/versions/v0.5/ldm.h +++ b/contrib/long_distance_matching/versions/v0.5/ldm.h @@ -11,16 +11,17 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. -#define LDM_MEMORY_USAGE 22 +#define LDM_MEMORY_USAGE 16 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define WINDOW_SIZE (1 << 25) +#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 8 -#define LDM_HASH_LENGTH 8 +#define LDM_MIN_MATCH_LENGTH 4 +#define LDM_HASH_LENGTH 4 typedef U32 offset_t; typedef U32 hash_t; @@ -61,18 +62,33 @@ size_t LDM_compress(const void *src, size_t srcSize, /** * Initialize the compression context. + * + * Allocates memory for the hash table. 
*/ void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize); +/** + * Frees up memory allocating in initializeCCtx + */ +void LDM_destroyCCtx(LDM_CCtx *cctx); + /** * Prints the percentage of the hash table occupied (where occupied is defined * as the entry being non-zero). */ -void LDM_outputHashtableOccupancy(const LDM_hashEntry *hashTable, +void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, U32 hashTableSize); +/** + * Prints the distribution of offsets in the hash table. + * + * The offsets are defined as the distance of the hash table entry from the + * current input position of the cctx. + */ +void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); + /** * Outputs compression statistics to stdout. */ diff --git a/contrib/long_distance_matching/versions/v0.5/main-ldm.c b/contrib/long_distance_matching/versions/v0.5/main-ldm.c index 40afef8c..ea6375ba 100644 --- a/contrib/long_distance_matching/versions/v0.5/main-ldm.c +++ b/contrib/long_distance_matching/versions/v0.5/main-ldm.c @@ -13,7 +13,7 @@ #include "zstd.h" #define DEBUG -//#define TEST +#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. 
From ca300ce6e0004a447327d19a2ed879923e8c8baa Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 14 Jul 2017 17:17:00 -0700 Subject: [PATCH 36/62] Decouple hash table from compression function --- contrib/long_distance_matching/Makefile | 2 +- contrib/long_distance_matching/basic_table.c | 56 +++++++++++ contrib/long_distance_matching/ldm.c | 93 +++++++------------ contrib/long_distance_matching/ldm.h | 13 --- .../long_distance_matching/ldm_hashtable.h | 36 +++++++ 5 files changed, 127 insertions(+), 73 deletions(-) create mode 100644 contrib/long_distance_matching/basic_table.c create mode 100644 contrib/long_distance_matching/ldm_hashtable.h diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index cff78644..0d4dea06 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,7 +27,7 @@ default: all all: main-ldm -main-ldm : ldm.h ldm.c main-ldm.c +main-ldm : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c new file mode 100644 index 00000000..007086fe --- /dev/null +++ b/contrib/long_distance_matching/basic_table.c @@ -0,0 +1,56 @@ +#include +#include + +#include "ldm_hashtable.h" + +struct LDM_hashTable { + U32 size; + LDM_hashEntry *entries; +}; + +LDM_hashTable *HASH_createTable(U32 size) { + LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + table->size = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + return table; +} + +void HASH_initializeTable(LDM_hashTable *table, U32 size) { + table->size = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); +} + + +LDM_hashEntry *HASH_getEntryFromHash( + const LDM_hashTable *table, const hash_t hash) { + return &(table->entries[hash]); +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + 
*HASH_getEntryFromHash(table, hash) = entry; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->size; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { + U32 i = 0; + U32 ctr = 0; + for (; i < HASH_getSize(hashTable); i++) { + if (HASH_getEntryFromHash(hashTable, i)->offset == 0) { + ctr++; + } + } + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + HASH_getSize(hashTable), ctr, + 100.0 * (double)(ctr) / (double)HASH_getSize(hashTable)); +} + + diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 08cb856c..32da40f8 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,11 +4,13 @@ #include #include -#include "ldm.h" - // Insert every (HASH_ONLY_EVERY + 1) into the hash table. #define HASH_ONLY_EVERY 0 +#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) + #define ML_BITS 4 #define ML_MASK ((1U<entries[hash]); -} - -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - *HASH_getHash(table, hash) = entry; -} - +#include "ldm_hashtable.h" // TODO: Scanning speed // TODO: Memory usage @@ -54,6 +39,8 @@ struct LDM_compressStats { U32 numCollisions; U32 numHashInserts; +// U64 numInvalidHashes, numValidHashes; // tmp + U32 offsetHistogram[32]; }; @@ -80,9 +67,7 @@ struct LDM_CCtx { LDM_compressStats stats; /* Compression statistics */ - LDM_hashTable hashTable; - -// LDM_hashEntry *hashTable; + LDM_hashTable *hashTable; // LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; @@ -101,21 +86,6 @@ struct LDM_CCtx { const BYTE *DEBUG_setNextHash; }; - - -void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable) { - U32 i = 0; - U32 ctr = 0; - for (; i < hashTable->size; i++) { - if (HASH_getHash(hashTable, 
i)->offset == 0) { - ctr++; - } - } - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - hashTable->size, ctr, - 100.0 * (double)(ctr) / (double)hashTable->size); -} - // TODO: This can be done more efficiently (but it is not that important as it // is only used for computing stats). static int intLog2(U32 x) { @@ -128,15 +98,15 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. -void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { +void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { U32 i = 0; int buckets[32] = { 0 }; printf("\n"); printf("Hash table histogram\n"); - for (; i < cctx->hashTable.size; i++) { + for (; i < HASH_getSize(cctx->hashTable); i++) { int offset = (cctx->ip - cctx->ibase) - - HASH_getHash(&cctx->hashTable, i)->offset; + HASH_getEntryFromHash(cctx->hashTable, i)->offset; buckets[intLog2(offset)]++; } @@ -145,7 +115,7 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { printf("2^%*d: %10u %6.3f%%\n", 2, i, buckets[i], 100.0 * (double) buckets[i] / - (double) cctx->hashTable.size); + (double) HASH_getSize(cctx->hashTable)); } printf("\n"); } @@ -181,7 +151,10 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { } printf("\n"); - + /* + printf("Num invalid hashes, num valid hashes, %llu %llu\n", + stats->numInvalidHashes, stats->numValidHashes); + */ printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 
@@ -315,7 +288,7 @@ static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { #ifdef COMPUTE_STATS if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = HASH_getHash(&cctx->hashTable, hash)->offset; + U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { cctx->stats.numCollisions++; @@ -327,7 +300,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; - HASH_insert(&cctx->hashTable, hash, entry); + HASH_insert(cctx->hashTable, hash, entry); } cctx->lastPosHashed = cctx->ip; @@ -371,9 +344,11 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { /** * Returns the position of the entry at hashTable[hash]. */ -static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { - return HASH_getHash(&cctx->hashTable, hash)->offset + cctx->ibase; +/* +static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) { + return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase; } +*/ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { @@ -399,11 +374,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } -static void LDM_initializeHashTable(LDM_hashTable *table) { - table->size = LDM_HASHTABLESIZE_U32; - table->entries = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); -} - void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -423,8 +393,10 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); + + //HASH_initializeTable(cctx->hashTable, 
LDM_HASHTABLESIZE_U32); - LDM_initializeHashTable(&cctx->hashTable); // calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); // memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; @@ -442,7 +414,7 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, } void LDM_destroyCCtx(LDM_CCtx *cctx) { - free((cctx->hashTable).entries); + HASH_destroyTable(cctx->hashTable); } /** @@ -458,6 +430,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { do { hash_t h; U32 sum; + LDM_hashEntry *entry; setNextHash(cctx); h = cctx->nextHash; sum = cctx->nextSum; @@ -468,7 +441,9 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - *match = getPositionOnHash(cctx, h); + entry = HASH_getEntryFromHash(cctx->hashTable, h); + *match = entry->offset + cctx->ibase; + putHashOfCurrentPositionFromHash(cctx, h, sum); } while (cctx->ip - *match > LDM_WINDOW_SIZE || @@ -604,7 +579,7 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // LDM_outputHashTableOffsetHistogram(&cctx); + // HASH_outputTableOffsetHistogram(&cctx); /* Encode the last literals (no more matches). */ { @@ -615,7 +590,7 @@ size_t LDM_compress(const void *src, size_t srcSize, #ifdef COMPUTE_STATS LDM_printCompressStats(&cctx.stats); - LDM_outputHashTableOccupancy(&cctx.hashTable); + HASH_outputTableOccupancy(cctx.hashTable); #endif { diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 8c3aa4e6..18b64e37 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -12,9 +12,6 @@ // Defines the size of the hash table. 
#define LDM_MEMORY_USAGE 16 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_WINDOW_SIZE_LOG 25 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) @@ -23,10 +20,6 @@ #define LDM_MIN_MATCH_LENGTH 4 #define LDM_HASH_LENGTH 4 -typedef U32 offset_t; -typedef U32 hash_t; -typedef struct LDM_hashEntry LDM_hashEntry; -typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -75,12 +68,6 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, */ void LDM_destroyCCtx(LDM_CCtx *cctx); -/** - * Prints the percentage of the hash table occupied (where occupied is defined - * as the entry being non-zero). - */ -void LDM_outputHashTableOccupancy(const LDM_hashTable *hashTable); - /** * Prints the distribution of offsets in the hash table. * diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h new file mode 100644 index 00000000..690c47a1 --- /dev/null +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -0,0 +1,36 @@ +#ifndef LDM_HASHTABLE_H +#define LDM_HASHTABLE_H + +#include "mem.h" + +typedef U32 hash_t; + +typedef struct LDM_hashEntry { + U32 offset; +} LDM_hashEntry; + +typedef struct LDM_hashTable LDM_hashTable; + +// TODO: rename functions +// TODO: comments + +LDM_hashTable *HASH_createTable(U32 size); + +LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, + const hash_t hash); + +void HASH_insert(LDM_hashTable *table, const hash_t hash, + const LDM_hashEntry entry); + +U32 HASH_getSize(const LDM_hashTable *table); + +void HASH_destroyTable(LDM_hashTable *table); + +/** + * Prints the percentage of the hash table occupied (where occupied is defined + * as the entry being non-zero). 
+ */ +void HASH_outputTableOccupancy(const LDM_hashTable *hashTable); + + +#endif /* LDM_HASHTABLE_H */ From 4bb42b02c190c606176332ee2a1f7cb540b194ab Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 17 Jul 2017 11:53:54 -0700 Subject: [PATCH 37/62] Add basic chaining table --- contrib/long_distance_matching/Makefile | 10 +- contrib/long_distance_matching/basic_table.c | 19 ++-- .../long_distance_matching/chaining_table.c | 92 +++++++++++++++++++ contrib/long_distance_matching/ldm.c | 40 +++++--- contrib/long_distance_matching/ldm.h | 4 +- .../long_distance_matching/ldm_hashtable.h | 6 +- 6 files changed, 144 insertions(+), 27 deletions(-) create mode 100644 contrib/long_distance_matching/chaining_table.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 0d4dea06..3159df75 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,13 +25,17 @@ LDFLAGS += -lzstd default: all -all: main-ldm +all: main-basic main-chaining -main-ldm : basic_table.c ldm.c main-ldm.c +main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-chaining : chaining_table.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + + clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm + main-basic main-chaining @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 007086fe..c6a5040e 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -2,16 +2,19 @@ #include #include "ldm_hashtable.h" +#include "mem.h" struct LDM_hashTable { U32 size; LDM_hashEntry *entries; + const BYTE *offsetBase; }; -LDM_hashTable *HASH_createTable(U32 size) { +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); table->size = size; table->entries = 
calloc(size, sizeof(LDM_hashEntry)); + table->offsetBase = offsetBase; return table; } @@ -20,15 +23,19 @@ void HASH_initializeTable(LDM_hashTable *table, U32 size) { table->entries = calloc(size, sizeof(LDM_hashEntry)); } +LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + hash; +} LDM_hashEntry *HASH_getEntryFromHash( - const LDM_hashTable *table, const hash_t hash) { - return &(table->entries[hash]); + const LDM_hashTable *table, const hash_t hash, const U32 checksum) { + (void)checksum; + return getBucket(table, hash); } void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { - *HASH_getEntryFromHash(table, hash) = entry; + *getBucket(table, hash) = entry; } U32 HASH_getSize(const LDM_hashTable *table) { @@ -44,7 +51,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { U32 i = 0; U32 ctr = 0; for (; i < HASH_getSize(hashTable); i++) { - if (HASH_getEntryFromHash(hashTable, i)->offset == 0) { + if (getBucket(hashTable, i)->offset == 0) { ctr++; } } @@ -52,5 +59,3 @@ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { HASH_getSize(hashTable), ctr, 100.0 * (double)(ctr) / (double)HASH_getSize(hashTable)); } - - diff --git a/contrib/long_distance_matching/chaining_table.c b/contrib/long_distance_matching/chaining_table.c new file mode 100644 index 00000000..226f7822 --- /dev/null +++ b/contrib/long_distance_matching/chaining_table.c @@ -0,0 +1,92 @@ +#include +#include + +#include "ldm_hashtable.h" +#include "mem.h" + +//TODO: move def somewhere else. +//TODO: memory usage is currently no longer LDM_MEMORY_USAGE. +// refactor code to scale the number of elements appropriately. + +// Number of elements per hash bucket. +#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now +#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) + +struct LDM_hashTable { + U32 size; + LDM_hashEntry *entries; // 1-D array for now. 
+ + // Position corresponding to offset=0 in LDM_hashEntry. + const BYTE *offsetBase; + BYTE *bucketOffsets; // Pointer to current insert position. + // Last insert was at bucketOffsets - 1? +}; + +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { + LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + table->size = size; + table->entries = calloc(size * HASH_BUCKET_SIZE, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size, sizeof(BYTE)); + table->offsetBase = offsetBase; + return table; +} + +static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + (hash << HASH_BUCKET_SIZE_LOG); +} + +/* +static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, + const hash_t hash) { + LDM_hashEntry *bucket = getBucket(table, hash); + BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1); + return bucket + offset; +} +*/ + +LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum) { + // Loop through bucket. + // TODO: in order of recency??? 
+ LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + for(; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + if (cur->checksum == checksum) { + return cur; + } + } + return NULL; +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->size * HASH_BUCKET_SIZE; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table->bucketOffsets); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *table) { + U32 ctr = 0; + LDM_hashEntry *cur = table->entries; + LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE); + for (; cur < end; ++cur) { + if (cur->offset == 0) { + ctr++; + } + } + + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + HASH_getSize(table), ctr, + 100.0 * (double)(ctr) / (double)HASH_getSize(table)); +} diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 32da40f8..3cb82ea6 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -20,9 +20,8 @@ #define CHECKSUM_CHAR_OFFSET 10 //#define RUN_CHECKS //#define LDM_DEBUG -// -#include "ldm.h" +#include "ldm.h" #include "ldm_hashtable.h" // TODO: Scanning speed @@ -98,6 +97,7 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. 
+/** void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { U32 i = 0; int buckets[32] = { 0 }; @@ -119,6 +119,7 @@ void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { } printf("\n"); } +*/ void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; @@ -127,9 +128,11 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { //TODO: compute percentage matched? printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length: %u, %llu\n", + printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", stats->numMatches, - stats->totalMatchLength); + stats->totalMatchLength, + 100.0 * (double)stats->totalMatchLength / + (double)(stats->totalMatchLength + stats->totalLiteralLength)); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); printf("avg literal length, total literalLength: %.1f, %llu\n", @@ -155,11 +158,13 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("Num invalid hashes, num valid hashes, %llu %llu\n", stats->numInvalidHashes, stats->numValidHashes); */ + /* printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); + */ printf("=====================\n"); } @@ -173,6 +178,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { */ //TODO: This seems to be faster for some reason? 
+ U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; @@ -286,8 +292,9 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { + /* #ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) { U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { @@ -295,11 +302,13 @@ static void putHashOfCurrentPositionFromHash( } } #endif +*/ // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase , + MEM_read32(cctx->ip) }; HASH_insert(cctx->hashTable, hash, entry); } @@ -393,7 +402,7 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32, cctx->ibase); //HASH_initializeTable(cctx->hashTable, LDM_HASHTABLESIZE_U32); @@ -425,12 +434,13 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + + LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; do { hash_t h; U32 sum; - LDM_hashEntry *entry; setNextHash(cctx); h = cctx->nextHash; sum = cctx->nextSum; @@ -441,13 +451,17 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - entry = HASH_getEntryFromHash(cctx->hashTable, h); - *match = entry->offset + cctx->ibase; + entry = HASH_getEntryFromHash(cctx->hashTable, h, MEM_read32(cctx->ip)); + + if (entry != NULL) { + *match = entry->offset + cctx->ibase; + 
} putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > LDM_WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); + } while (entry == NULL || + (cctx->ip - *match > LDM_WINDOW_SIZE || + !LDM_isValidMatch(cctx->ip, *match))); setNextHash(cctx); return 0; } @@ -510,7 +524,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - const BYTE *match; + const BYTE *match = NULL; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 18b64e37..6325d1b1 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -17,8 +17,8 @@ #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 4 -#define LDM_HASH_LENGTH 4 +#define LDM_MIN_MATCH_LENGTH 1024 +#define LDM_HASH_LENGTH 1024 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 690c47a1..92add96f 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -7,6 +7,7 @@ typedef U32 hash_t; typedef struct LDM_hashEntry { U32 offset; + U32 checksum; // Not needed? 
} LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; @@ -14,10 +15,11 @@ typedef struct LDM_hashTable LDM_hashTable; // TODO: rename functions // TODO: comments -LDM_hashTable *HASH_createTable(U32 size); +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, - const hash_t hash); + const hash_t hash, + const U32 checksum); void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry); From 15a041adbf8b59bc88838fe07297d8399319cec0 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 17 Jul 2017 15:16:58 -0700 Subject: [PATCH 38/62] Add function to get valid entries only from table --- contrib/long_distance_matching/Makefile | 6 +-- contrib/long_distance_matching/basic_table.c | 17 ++++++ ...aining_table.c => circular_buffer_table.c} | 21 +++++++- contrib/long_distance_matching/ldm.c | 54 ++++++------------- contrib/long_distance_matching/ldm.h | 11 ++-- .../long_distance_matching/ldm_hashtable.h | 9 +++- 6 files changed, 70 insertions(+), 48 deletions(-) rename contrib/long_distance_matching/{chaining_table.c => circular_buffer_table.c} (79%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 3159df75..47085022 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,17 +25,17 @@ LDFLAGS += -lzstd default: all -all: main-basic main-chaining +all: main-basic main-circular-buffer main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-chaining : chaining_table.c ldm.c main-ldm.c +main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-chaining + main-basic main-circular-buffer @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c 
b/contrib/long_distance_matching/basic_table.c index c6a5040e..859bf061 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -27,12 +27,29 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + hash; } + LDM_hashEntry *HASH_getEntryFromHash( const LDM_hashTable *table, const hash_t hash, const U32 checksum) { (void)checksum; return getBucket(table, hash); } +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + LDM_hashEntry *entry = getBucket(table, hash); + (void)checksum; + if ((*isValid)(pIn, entry->offset + table->offsetBase)) { + return entry; + } else { + return NULL; + } +} + + + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/chaining_table.c b/contrib/long_distance_matching/circular_buffer_table.c similarity index 79% rename from contrib/long_distance_matching/chaining_table.c rename to contrib/long_distance_matching/circular_buffer_table.c index 226f7822..f45f945c 100644 --- a/contrib/long_distance_matching/chaining_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -9,7 +9,7 @@ // refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. 
-#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) struct LDM_hashTable { @@ -44,6 +44,25 @@ static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, } */ +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + // TODO: in order of recency? + for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + // CHeck checksum for faster check. + if (cur->checksum == checksum && + (*isValid)(pIn, cur->offset + table->offsetBase)) { + return cur; + } + } + return NULL; +} + + LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 3cb82ea6..bf54842f 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -5,7 +5,7 @@ #include // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 0 +#define HASH_ONLY_EVERY 31 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) @@ -38,8 +38,6 @@ struct LDM_compressStats { U32 numCollisions; U32 numHashInserts; -// U64 numInvalidHashes, numValidHashes; // tmp - U32 offsetHistogram[32]; }; @@ -153,45 +151,25 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); - - /* - printf("Num invalid hashes, num valid hashes, %llu %llu\n", - stats->numInvalidHashes, stats->numValidHashes); - */ - /* - printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", - stats->numCollisions, stats->numHashInserts, - stats->numHashInserts == 0 ? 
- 1.0 : (100.0 * (double)stats->numCollisions) / - (double)stats->numHashInserts); - */ printf("=====================\n"); } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { - /* - if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) { - return 1; - } - return 0; - */ - - //TODO: This seems to be faster for some reason? - U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; - for (; lengthLeft >= 8; lengthLeft -= 8) { - if (MEM_read64(curIn) != MEM_read64(curMatch)) { + if (pIn - pMatch > LDM_WINDOW_SIZE) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { return 0; } - curIn += 8; - curMatch += 8; - } - if (lengthLeft > 0) { - return (MEM_read32(curIn) == MEM_read32(curMatch)); + curIn += 4; + curMatch += 4; } return 1; } @@ -307,8 +285,11 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + /** const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; + */ + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); } @@ -438,7 +419,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; - do { + while (entry == NULL) { hash_t h; U32 sum; setNextHash(cctx); @@ -451,17 +432,14 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - entry = HASH_getEntryFromHash(cctx->hashTable, h, MEM_read32(cctx->ip)); + entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, + &LDM_isValidMatch); if (entry != NULL) { *match = entry->offset + cctx->ibase; } - putHashOfCurrentPositionFromHash(cctx, h, sum); - - } while (entry == NULL || - (cctx->ip - *match > LDM_WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match))); + } setNextHash(cctx); return 0; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6325d1b1..6d97bd56 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,14 +11,14 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. -#define LDM_MEMORY_USAGE 16 +#define LDM_MEMORY_USAGE 20 -#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE_LOG 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 1024 -#define LDM_HASH_LENGTH 1024 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_LENGTH 64 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -82,7 +82,8 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); void LDM_printCompressStats(const LDM_compressStats *stats); /** * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the - * LDM_MIN_MATCH_LENGTH bytes from match. 
+ * LDM_MIN_MATCH_LENGTH bytes from match and also if + * pIn - pMatch <= LDM_WINDOW_SIZE. * * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. * diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 92add96f..88d19ae2 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -7,7 +7,7 @@ typedef U32 hash_t; typedef struct LDM_hashEntry { U32 offset; - U32 checksum; // Not needed? + U32 checksum; } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; @@ -17,10 +17,17 @@ typedef struct LDM_hashTable LDM_hashTable; LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); +//TODO: unneeded? LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum); +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry); From a00e406231d2cb7fbeffd2a65d1a1044b0d7bcde Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 17 Jul 2017 15:17:32 -0700 Subject: [PATCH 39/62] Remove version archive --- .../versions/v0.3/Makefile | 30 - .../versions/v0.3/README | 3 - .../versions/v0.3/ldm.c | 464 ------------ .../versions/v0.3/ldm.h | 19 - .../versions/v0.3/main-ldm.c | 479 ------------ .../versions/v0.3/util.c | 64 -- .../versions/v0.3/util.h | 23 - .../versions/v0.5/Makefile | 37 - .../versions/v0.5/README | 5 - .../versions/v0.5/ldm.c | 710 ------------------ .../versions/v0.5/ldm.h | 159 ---- .../versions/v0.5/main-ldm.c | 270 ------- 12 files changed, 2263 deletions(-) delete mode 100644 contrib/long_distance_matching/versions/v0.3/Makefile delete mode 100644 contrib/long_distance_matching/versions/v0.3/README delete mode 100644 contrib/long_distance_matching/versions/v0.3/ldm.c delete mode 
100644 contrib/long_distance_matching/versions/v0.3/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v0.3/main-ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.3/util.c delete mode 100644 contrib/long_distance_matching/versions/v0.3/util.h delete mode 100644 contrib/long_distance_matching/versions/v0.5/Makefile delete mode 100644 contrib/long_distance_matching/versions/v0.5/README delete mode 100644 contrib/long_distance_matching/versions/v0.5/ldm.c delete mode 100644 contrib/long_distance_matching/versions/v0.5/ldm.h delete mode 100644 contrib/long_distance_matching/versions/v0.5/main-ldm.c diff --git a/contrib/long_distance_matching/versions/v0.3/Makefile b/contrib/long_distance_matching/versions/v0.3/Makefile deleted file mode 100644 index e5153970..00000000 --- a/contrib/long_distance_matching/versions/v0.3/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# This Makefile presumes libzstd is installed, using `sudo make install` - -CFLAGS ?= -O3 -DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ - -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ - -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ - -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ - -Wredundant-decls -CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) -FLAGS = $(CPPFLAGS) $(CFLAGS) - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: main-ldm - -#main : ldm.c main.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-ldm : util.c ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm - @echo Cleaning completed - diff --git a/contrib/long_distance_matching/versions/v0.3/README b/contrib/long_distance_matching/versions/v0.3/README deleted file mode 100644 index 8699562e..00000000 --- a/contrib/long_distance_matching/versions/v0.3/README +++ /dev/null @@ -1,3 +0,0 @@ -This version uses simple lz4-style compression: -- A 
4-byte hash is inserted into the hash table for every position. -- Hash table replacement policy: direct overwrite. diff --git a/contrib/long_distance_matching/versions/v0.3/ldm.c b/contrib/long_distance_matching/versions/v0.3/ldm.c deleted file mode 100644 index 1dedf5c3..00000000 --- a/contrib/long_distance_matching/versions/v0.3/ldm.c +++ /dev/null @@ -1,464 +0,0 @@ -#include -#include -#include -#include - - -#include "ldm.h" -#include "util.h" - -#define HASH_EVERY 1 - -#define LDM_MEMORY_USAGE 16 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASH_SIZE_U32 (1 << (LDM_HASHLOG)) - -#define LDM_OFFSET_SIZE 4 - -#define WINDOW_SIZE (1 << 20) -#define MAX_WINDOW_SIZE 31 -#define HASH_SIZE 4 -#define LDM_HASH_LENGTH 4 -#define MINMATCH 4 - -#define ML_BITS 4 -#define ML_MASK ((1U<numMatches); - printf("Average match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("Average literal length: %.1f\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches); - printf("Average offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("=====================\n"); -} - -typedef struct LDM_CCtx { - size_t isize; /* Input size */ - size_t maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. 
- const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - const BYTE *nextIp; - hash_t nextHash; /* Hash corresponding to nextIp */ - - unsigned step; -} LDM_CCtx; - -#ifdef LDM_ROLLING_HASH -/** - * Convert a sum computed from LDM_getRollingHash to a hash value in the range - * of the hash table. - */ -static hash_t LDM_sumToHash(U32 sum) { - return sum % (LDM_HASHTABLESIZE >> 2); -// return sum & (LDM_HASHTABLESIZE - 1); -} - -static U32 LDM_getRollingHash(const char *data, U32 len) { - U32 i; - U32 s1, s2; - const schar *buf = (const schar *)data; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3]; - } - for(; i < len; i++) { - s1 += buf[i]; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -static hash_t LDM_hashPosition(const void * const p) { - return LDM_sumToHash(LDM_getRollingHash((const char *)p, LDM_HASH_LENGTH)); -} - -typedef struct LDM_sumStruct { - U16 s1, s2; -} LDM_sumStruct; - -static void LDM_getRollingHashParts(U32 sum, LDM_sumStruct *sumStruct) { - sumStruct->s1 = sum & 0xffff; - sumStruct->s2 = sum >> 16; -} - -#else -static hash_t LDM_hash(U32 sequence) { - return ((sequence * 2654435761U) >> ((32)-LDM_HASHLOG)); -} - -static hash_t LDM_hashPosition(const void * const p) { - return LDM_hash(LDM_read32(p)); -} -#endif - -/* -static hash_t LDM_hash5(U64 sequence) { - static const U64 prime5bytes = 889523592379ULL; - static const U64 prime8bytes = 11400714785074694791ULL; - const U32 hashLog = LDM_HASHLOG; - if (LDM_isLittleEndian()) - return (((sequence << 24) * 
prime5bytes) >> (64 - hashLog)); - else - return (((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -} -*/ - -static void LDM_putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash) { - if (((cctx->ip - cctx->ibase) & HASH_EVERY) != HASH_EVERY) { - return; - } - (cctx->hashTable)[hash] = (LDM_hashEntry){ (hash_t)(cctx->ip - cctx->ibase) }; - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; -} - -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - hash_t hash = LDM_hashPosition(cctx->ip); - LDM_putHashOfCurrentPositionFromHash(cctx, hash); -} - -static const BYTE *LDM_get_position_on_hash( - hash_t h, void *tableBase, const BYTE *srcBase) { - const LDM_hashEntry * const hashTable = (LDM_hashEntry *)tableBase; - return hashTable[h].offset + srcBase; -} - -static BYTE LDM_read_byte(const void *memPtr) { - BYTE val; - memcpy(&val, memPtr, 1); - return val; -} - -static unsigned LDM_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = LDM_read_byte(pMatch) ^ LDM_read_byte(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (unsigned)(pIn - pStart); - } - return (unsigned)(pIn - pStart); -} - -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize) { - const U32 *ip = (const U32 *)src; - *compressSize = *ip++; - *decompressSize = *ip; -} - -static void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - HASH_SIZE; - cctx->imatchLimit = cctx->iend - MINMATCH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); - - 
cctx->lastPosHashed = NULL; - cctx->nextIp = NULL; - - cctx->step = 1; -} - -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->nextIp = cctx->ip; - - do { - hash_t const h = cctx->nextHash; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->nextIp > cctx->imatchLimit) { - return 1; - } - - *match = LDM_get_position_on_hash(h, cctx->hashTable, cctx->ibase); - - cctx->nextHash = LDM_hashPosition(cctx->nextIp); - LDM_putHashOfCurrentPositionFromHash(cctx, h); - } while (cctx->ip - *match > WINDOW_SIZE || - LDM_read64(*match) != LDM_read64(cctx->ip)); - return 0; -} - -// TODO: srcSize and maxDstSize is unused -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - cctx.ip++; - cctx.nextHash = LDM_hashPosition(cctx.ip); - - // TODO: loop condition is not accurate. - while (1) { - const BYTE *match; - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - if (LDM_findBestMatch(&cctx, &match) != 0) { - goto _last_literals; - } - - cctx.stats.numMatches++; - - /** - * Catch up: look back to extend the match backwards from the found match. - */ - while (cctx.ip > cctx.anchor && match > cctx.ibase && - cctx.ip[-1] == match[-1]) { - cctx.ip--; - match--; - } - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. 
- */ - { - unsigned const literalLength = (unsigned)(cctx.ip - cctx.anchor); - unsigned const offset = cctx.ip - match; - unsigned const matchLength = LDM_count( - cctx.ip + MINMATCH, match + MINMATCH, cctx.ihashLimit); - BYTE *token = cctx.op++; - - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + MINMATCH; - - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *token = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)len; - } else { - *token = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx.op, cctx.anchor, literalLength); - cctx.op += literalLength; - - /* Encode the offset. */ - LDM_write32(cctx.op, offset); - cctx.op += LDM_OFFSET_SIZE; - - /* Encode match length */ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *token += ML_MASK; - matchLengthRemaining -= ML_MASK; - LDM_write32(cctx.op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx.op += 4; - LDM_write32(cctx.op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx.op += matchLengthRemaining / 255; - *(cctx.op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *token += (BYTE)(matchLength); - } - - /* Update input pointer, inserting hashes into hash table along the - * way. - */ - while (cctx.ip < cctx.anchor + MINMATCH + matchLength + literalLength) { - LDM_putHashOfCurrentPosition(&cctx); - cctx.ip++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_putHashOfCurrentPosition(&cctx); - cctx.nextHash = LDM_hashPosition(++cctx.ip); - } -_last_literals: - /* Encode the last literals (no more matches). 
*/ - { - size_t const lastRun = (size_t)(cctx.iend - cctx.anchor); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *(cctx.op)++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255; accumulator -= 255) { - *(cctx.op)++ = 255; - } - *(cctx.op)++ = (BYTE)accumulator; - } else { - *(cctx.op)++ = (BYTE)(lastRun << ML_BITS); - } - memcpy(cctx.op, cctx.anchor, lastRun); - cctx.op += lastRun; - } - LDM_printCompressStats(&cctx.stats); - return (cctx.op - (const BYTE *)cctx.obase); -} - -typedef struct LDM_DCtx { - size_t compressSize; - size_t maxDecompressSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -} LDM_DCtx; - -static void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - dctx->compressSize = compressSize; - dctx->maxDecompressSize = maxDecompressSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressSize; - -} - -size_t LDM_decompress(const void *src, size_t compressSize, - void *dst, size_t maxDecompressSize) { - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressSize, dst, maxDecompressSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - unsigned const token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy literals. 
*/ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = LDM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += MINMATCH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now - while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - - diff --git a/contrib/long_distance_matching/versions/v0.3/ldm.h b/contrib/long_distance_matching/versions/v0.3/ldm.h deleted file mode 100644 index 287d444d..00000000 --- a/contrib/long_distance_matching/versions/v0.3/ldm.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#define LDM_COMPRESS_SIZE 4 -#define LDM_DECOMPRESS_SIZE 4 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) - -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -size_t LDM_decompress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -void LDM_readHeader(const void *src, size_t *compressSize, - size_t *decompressSize); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.3/main-ldm.c b/contrib/long_distance_matching/versions/v0.3/main-ldm.c deleted file mode 100644 index 724d735d..00000000 --- a/contrib/long_distance_matching/versions/v0.3/main-ldm.c +++ /dev/null @@ -1,479 +0,0 @@ -// TODO: file size must fit into a U32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" - -// #define BUF_SIZE 16*1024 // Block size -#define DEBUG - -//#define ZSTD - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. 
- */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t maxCompressSize, compressSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - maxCompressSize = statbuf.st_size + LDM_HEADER_SIZE; - - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? */ - if (lseek(fdout, maxCompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, maxCompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - compressSize = ZSTD_compress(dst, statbuf.st_size, - src, statbuf.st_size, 1); -#else - compressSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, statbuf.st_size); - - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressSize, 4); - memcpy(dst + 4, &(statbuf.st_size), 4); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressSize); - printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif -#endif - - // Truncate file to compressSize. 
- ftruncate(fdout, compressSize); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressSize, oname, - (double)compressSize / (statbuf.st_size) * 100); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t compressSize, decompressSize, outSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - LDM_readHeader(src, &compressSize, &decompressSize); - -#ifdef DEBUG - printf("Size, compressSize, decompressSize: %zu %zu %zu\n", - (size_t)statbuf.st_size, compressSize, decompressSize); -#endif - - /* Go to the location corresponding to the last byte. 
*/ - if (lseek(fdout, decompressSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - -#ifdef ZSTD - outSize = ZSTD_decompress(dst, decomrpessed_size, - src + LDM_HEADER_SIZE, - statbuf.st_size - LDM_HEADER_SIZE); -#else - outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressSize); - - printf("Ret size out: %zu\n", outSize); - #endif - ftruncate(fdout, outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. - */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. 
*/ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - { - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - - /* Compress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* Decompress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - /* verify */ - verify(inpFilename, decFilename); - return 0; -} - - -#if 0 -static size_t compress_file(FILE *in, FILE *out, size_t *size_in, - size_t *size_out) { - char *src, *buf = NULL; - size_t r = 1; - size_t size, n, k, count_in = 0, count_out = 0, offset, frame_size = 0; - - src = malloc(BUF_SIZE); - if (!src) { - printf("Not enough memory\n"); - goto cleanup; 
- } - - size = BUF_SIZE + LDM_HEADER_SIZE; - buf = malloc(size); - if (!buf) { - printf("Not enough memory\n"); - goto cleanup; - } - - - for (;;) { - k = fread(src, 1, BUF_SIZE, in); - if (k == 0) - break; - count_in += k; - - n = LDM_compress(src, buf, k, BUF_SIZE); - - // n = k; - // offset += n; - offset = k; - count_out += k; - -// k = fwrite(src, 1, offset, out); - - k = fwrite(buf, 1, offset, out); - if (k < offset) { - if (ferror(out)) - printf("Write failed\n"); - else - printf("Short write\n"); - goto cleanup; - } - - } - *size_in = count_in; - *size_out = count_out; - r = 0; - cleanup: - free(src); - free(buf); - return r; -} - -static size_t decompress_file(FILE *in, FILE *out) { - void *src = malloc(BUF_SIZE); - void *dst = NULL; - size_t dst_capacity = BUF_SIZE; - size_t ret = 1; - size_t bytes_written = 0; - - if (!src) { - perror("decompress_file(src)"); - goto cleanup; - } - - while (ret != 0) { - /* Load more input */ - size_t src_size = fread(src, 1, BUF_SIZE, in); - void *src_ptr = src; - void *src_end = src_ptr + src_size; - if (src_size == 0 || ferror(in)) { - printf("(TODO): Decompress: not enough input or error reading file\n"); - //TODO - ret = 0; - goto cleanup; - } - - /* Allocate destination buffer if it hasn't been allocated already */ - if (!dst) { - dst = malloc(dst_capacity); - if (!dst) { - perror("decompress_file(dst)"); - goto cleanup; - } - } - - // TODO - - /* Decompress: - * Continue while there is more input to read. 
- */ - while (src_ptr != src_end && ret != 0) { - // size_t dst_size = src_size; - size_t dst_size = LDM_decompress(src, dst, src_size, dst_capacity); - size_t written = fwrite(dst, 1, dst_size, out); -// printf("Writing %zu bytes\n", dst_size); - bytes_written += dst_size; - if (written != dst_size) { - printf("Decompress: Failed to write to file\n"); - goto cleanup; - } - src_ptr += src_size; - src_size = src_end - src_ptr; - } - - /* Update input */ - - } - - printf("Wrote %zu bytes\n", bytes_written); - - cleanup: - free(src); - free(dst); - - return ret; -} - -int main2(int argc, char *argv[]) { - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Please specify input filename\n"); - return 0; - } - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - /* compress */ - { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *outFp = fopen(ldmFilename, "wb"); - size_t sizeIn = 0; - size_t sizeOut = 0; - size_t ret; - printf("compress : %s -> %s\n", inpFilename, ldmFilename); - ret = compress_file(inpFp, outFp, &sizeIn, &sizeOut); - if (ret) { - printf("compress : failed with code %zu\n", ret); - return ret; - } - printf("%s: %zu → %zu bytes, %.1f%%\n", - inpFilename, sizeIn, sizeOut, - (double)sizeOut / sizeIn * 100); - printf("compress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* decompress */ - { - FILE *inpFp = fopen(ldmFilename, "rb"); - FILE *outFp = fopen(decFilename, "wb"); - size_t ret; - - printf("decompress : %s -> %s\n", ldmFilename, decFilename); - ret = decompress_file(inpFp, outFp); - if (ret) { - printf("decompress : failed with code %zu\n", ret); - return ret; - } - printf("decompress : done\n"); - - fclose(outFp); - fclose(inpFp); - } - - /* verify */ 
- { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - - fclose(decFp); - fclose(inpFp); - } - return 0; -} -#endif - diff --git a/contrib/long_distance_matching/versions/v0.3/util.c b/contrib/long_distance_matching/versions/v0.3/util.c deleted file mode 100644 index 9ea4ca1e..00000000 --- a/contrib/long_distance_matching/versions/v0.3/util.c +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include - -#include "util.h" - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; - -unsigned LDM_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; - return one.c[0]; -} - -U16 LDM_read16(const void *memPtr) { - U16 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -U16 LDM_readLE16(const void *memPtr) { - if (LDM_isLittleEndian()) { - return LDM_read16(memPtr); - } else { - const BYTE *p = (const BYTE *)memPtr; - return (U16)((U16)p[0] + (p[1] << 8)); - } -} - -void LDM_write16(void *memPtr, U16 value){ - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_write32(void *memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); -} - -void LDM_writeLE16(void *memPtr, U16 value) { - if (LDM_isLittleEndian()) { - LDM_write16(memPtr, value); - } else { - BYTE* p = (BYTE *)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -U32 LDM_read32(const void *ptr) { - return *(const U32 *)ptr; -} - -U64 LDM_read64(const void *ptr) { - return *(const U64 *)ptr; -} - -void LDM_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); -} - - diff --git a/contrib/long_distance_matching/versions/v0.3/util.h b/contrib/long_distance_matching/versions/v0.3/util.h deleted file mode 100644 index 90726412..00000000 --- 
a/contrib/long_distance_matching/versions/v0.3/util.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef LDM_UTIL_H -#define LDM_UTIL_H - -unsigned LDM_isLittleEndian(void); - -uint16_t LDM_read16(const void *memPtr); - -uint16_t LDM_readLE16(const void *memPtr); - -void LDM_write16(void *memPtr, uint16_t value); - -void LDM_write32(void *memPtr, uint32_t value); - -void LDM_writeLE16(void *memPtr, uint16_t value); - -uint32_t LDM_read32(const void *ptr); - -uint64_t LDM_read64(const void *ptr); - -void LDM_copy8(void *dst, const void *src); - - -#endif /* LDM_UTIL_H */ diff --git a/contrib/long_distance_matching/versions/v0.5/Makefile b/contrib/long_distance_matching/versions/v0.5/Makefile deleted file mode 100644 index cff78644..00000000 --- a/contrib/long_distance_matching/versions/v0.5/Makefile +++ /dev/null @@ -1,37 +0,0 @@ -# ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. 
-# ################################################################ - -# This Makefile presumes libzstd is installed, using `sudo make install` - -CPPFLAGS+= -I../../lib/common -CFLAGS ?= -O3 -DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ - -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ - -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ - -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ - -Wredundant-decls -CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) -FLAGS = $(CPPFLAGS) $(CFLAGS) - -LDFLAGS += -lzstd - -.PHONY: default all clean - -default: all - -all: main-ldm - -main-ldm : ldm.h ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -clean: - @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm - @echo Cleaning completed - diff --git a/contrib/long_distance_matching/versions/v0.5/README b/contrib/long_distance_matching/versions/v0.5/README deleted file mode 100644 index 7901ae76..00000000 --- a/contrib/long_distance_matching/versions/v0.5/README +++ /dev/null @@ -1,5 +0,0 @@ -This version uses simple lz4-style compression with a rolling hash. -- A rolling checksum based on rsync's Adler-32 style checksum is used. -- The checksum is hashed using lz4's hash function. -- Hash table replacement policy: direct overwrite. -- The length of input to the hash function can be set with LDM_HASH_LENGTH. diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.c b/contrib/long_distance_matching/versions/v0.5/ldm.c deleted file mode 100644 index 06c97bc4..00000000 --- a/contrib/long_distance_matching/versions/v0.5/ldm.c +++ /dev/null @@ -1,710 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ldm.h" - -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 0 - -#define ML_BITS 4 -#define ML_MASK ((1U<>= 1) { - ret++; - } - return ret; -} - -// TODO: Maybe we would eventually prefer to have linear rather than -// exponential buckets. 
-void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx) { - int i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < LDM_HASHTABLESIZE_U32; i++) { - int offset = (cctx->ip - cctx->ibase) - cctx->hashTable[i].offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) LDM_HASHTABLESIZE_U32); - } - printf("\n"); -} - -void LDM_printCompressStats(const LDM_compressStats *stats) { - int i = 0; - printf("=====================\n"); - printf("Compression statistics\n"); - //TODO: compute percentage matched? - printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", - stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length: %u, %llu\n", - stats->numMatches, - stats->totalMatchLength); - printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("avg literal length, total literalLength: %.1f, %llu\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches, - stats->totalLiteralLength); - printf("avg offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u, %u\n", - stats->minOffset, stats->maxOffset); - - printf("\n"); - printf("offset histogram: offset, num matches, %% of matches\n"); - - for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches); - } - printf("\n"); - - - printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", - stats->numCollisions, stats->numHashInserts, - stats->numHashInserts == 0 ? 
- 1.0 : (100.0 * (double)stats->numCollisions) / - (double)stats->numHashInserts); - printf("=====================\n"); - -} - -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { - /* - if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) { - return 1; - } - return 0; - */ - - //TODO: This seems to be faster for some reason? - U32 lengthLeft = LDM_MIN_MATCH_LENGTH; - const BYTE *curIn = pIn; - const BYTE *curMatch = pMatch; - - for (; lengthLeft >= 8; lengthLeft -= 8) { - if (MEM_read64(curIn) != MEM_read64(curMatch)) { - return 0; - } - curIn += 8; - curMatch += 8; - } - if (lengthLeft > 0) { - return (MEM_read32(curIn) == MEM_read32(curMatch)); - } - return 1; -} - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -static hash_t checksumToHash(U32 sum) { - return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); -} - -/** - * Computes a checksum based on rsync's checksum. - * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static U32 getChecksum(const BYTE *buf, U32 len) { - U32 i; - U32 s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. 
- */ -static U32 updateChecksum(U32 sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed - * based on cctx->lastSum and cctx->lastPosHashed. - * - * This uses a rolling hash and requires that the last position hashed - * corresponds to cctx->nextIp - step. - */ -static void setNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - U32 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); - cctx->nextSum = updateChecksum( - cctx->lastSum, LDM_HASH_LENGTH, - cctx->lastPosHashed[0], - cctx->lastPosHashed[LDM_HASH_LENGTH]); - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = checksumToHash(cctx->nextSum); - -#ifdef RUN_CHECKS - check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); - } - - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif -} - -static void putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 sum) { -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { - offset_t offset = cctx->hashTable[hash].offset; - cctx->stats.numHashInserts++; - if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - cctx->stats.numCollisions++; - } - } -#endif - - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. 
- // Note: this works only when cctx->step is 1. - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; - cctx->hashTable[hash] = entry; - } - - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; - cctx->lastSum = sum; -} - -/** - * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed - * fields from the "next" fields. - * - * This requires that cctx->ip == cctx->nextPosHashed. - */ -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", - cctx->ip - cctx->ibase); - } -#endif - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); -} - -/** - * Insert hash of the current position into the hash table. - */ -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); - hash_t hash = checksumToHash(sum); - -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", - cctx->ip - cctx->ibase); - } -#endif - - putHashOfCurrentPositionFromHash(cctx, hash, sum); -} - -/** - * Returns the position of the entry at hashTable[hash]. 
- */ -static const BYTE *getPositionOnHash(LDM_CCtx *cctx, hash_t hash) { - return cctx->hashTable[hash].offset + cctx->ibase; -} - -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = (*pMatch) ^ *(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (U32)(pIn - pStart); - } - return (U32)(pIn - pStart); -} - -void LDM_readHeader(const void *src, U64 *compressedSize, - U64 *decompressedSize) { - const BYTE *ip = (const BYTE *)src; - *compressedSize = MEM_readLE64(ip); - ip += sizeof(U64); - *decompressedSize = MEM_readLE64(ip); - // ip += sizeof(U64); -} - -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); -// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); - cctx->stats.minOffset = UINT_MAX; - cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; - cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - - - cctx->lastPosHashed = NULL; - - cctx->step = 1; // Fixed to be 1 for now. Changing may break things. - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - cctx->DEBUG_setNextHash = 0; -} - -void LDM_destroyCCtx(LDM_CCtx *cctx) { - free(cctx->hashTable); -} - -/** - * Finds the "best" match. - * - * Returns 0 if successful and 1 otherwise (i.e. no match can be found - * in the remaining input that is long enough). 
- * - */ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { - cctx->nextIp = cctx->ip + cctx->step; - - do { - hash_t h; - U32 sum; - setNextHash(cctx); - h = cctx->nextHash; - sum = cctx->nextSum; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - *match = getPositionOnHash(cctx, h); - putHashOfCurrentPositionFromHash(cctx, h, sum); - - } while (cctx->ip - *match > LDM_WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); - setNextHash(cctx); - return 0; -} - -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *pToken = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *pToken = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; -} - -void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, - const U32 offset, - const U32 matchLength) { - BYTE *pToken = cctx->op++; - - /* Encode the literal length and literals. */ - LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); - - /* Encode the offset. */ - MEM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode the match length. */ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *pToken += ML_MASK; - matchLengthRemaining -= ML_MASK; - MEM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - MEM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *pToken += (BYTE)(matchLength); - } -} - -// TODO: maxDstSize is unused. 
This function may seg fault when writing -// beyond the size of dst, as it does not check maxDstSize. Writing to -// a buffer and performing checks is a possible solution. -// -// This is based upon lz4. -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - const BYTE *match; - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - while (LDM_findBestMatch(&cctx, &match) == 0) { -#ifdef COMPUTE_STATS - cctx.stats.numMatches++; -#endif - - /** - * Catch up: look back to extend the match backwards from the found match. - */ - while (cctx.ip > cctx.anchor && match > cctx.ibase && - cctx.ip[-1] == match[-1]) { - cctx.ip--; - match--; - } - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. - */ - { - const U32 literalLength = cctx.ip - cctx.anchor; - const U32 offset = cctx.ip - match; - const U32 matchLength = LDM_countMatchLength( - cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, - cctx.ihashLimit); - - LDM_outputBlock(&cctx, literalLength, offset, matchLength); - -#ifdef COMPUTE_STATS - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; - cctx.stats.minOffset = - offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; - cctx.stats.maxOffset = - offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; - cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; -#endif - - // Move ip to end of block, inserting hashes at each position. 
- cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + - matchLength + literalLength) { - if (cctx.ip > cctx.lastPosHashed) { - // TODO: Simplify. - LDM_updateLastHashFromNextHash(&cctx); - setNextHash(&cctx); - } - cctx.ip++; - cctx.nextIp++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); - } - - // LDM_outputHashTableOffsetHistogram(&cctx); - - /* Encode the last literals (no more matches). */ - { - const size_t lastRun = cctx.iend - cctx.anchor; - BYTE *pToken = cctx.op++; - LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); - } - -#ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats); - LDM_outputHashTableOccupancy(cctx.hashTable, LDM_HASHTABLESIZE_U32); -#endif - - { - const size_t ret = cctx.op - cctx.obase; - LDM_destroyCCtx(&cctx); - return ret; - } -} - -struct LDM_DCtx { - size_t compressedSize; - size_t maxDecompressedSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -}; - -void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - dctx->compressedSize = compressedSize; - dctx->maxDecompressedSize = maxDecompressedSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressedSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressedSize; -} - -size_t LDM_decompress(const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. 
*/ - const unsigned token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy the literals. */ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = MEM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += LDM_MIN_MATCH_LENGTH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now. - while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - -// TODO: implement and test hash function -void LDM_test(void) { -} - -/* -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - const BYTE *ip = (const BYTE *)src + 1125; - U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); - U32 sum2; - ++ip; - for (; ip < (const BYTE *)src + 1125 + 100; ip++) { - sum2 = updateChecksum(sum, LDM_HASH_LENGTH, - ip[-1], ip[LDM_HASH_LENGTH - 1]); - sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); - printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); - } -} -*/ - - diff --git a/contrib/long_distance_matching/versions/v0.5/ldm.h b/contrib/long_distance_matching/versions/v0.5/ldm.h deleted file mode 100644 index 70cda8b8..00000000 --- a/contrib/long_distance_matching/versions/v0.5/ldm.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef LDM_H -#define LDM_H - -#include /* size_t */ - -#include "mem.h" // from /lib/common/mem.h - -#define LDM_COMPRESS_SIZE 8 -#define LDM_DECOMPRESS_SIZE 8 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) -#define LDM_OFFSET_SIZE 4 - -// Defines the size of the hash table. 
-#define LDM_MEMORY_USAGE 16 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) - -#define LDM_WINDOW_SIZE_LOG 25 -#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) - -//These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 4 -#define LDM_HASH_LENGTH 4 - -typedef U32 offset_t; -typedef U32 hash_t; -typedef struct LDM_hashEntry LDM_hashEntry; -typedef struct LDM_compressStats LDM_compressStats; -typedef struct LDM_CCtx LDM_CCtx; -typedef struct LDM_DCtx LDM_DCtx; - -/** - * Compresses src into dst. - * - * NB: This currently ignores maxDstSize and assumes enough space is available. - * - * Block format (see lz4 documentation for more information): - * github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md - * - * A block is composed of sequences. Each sequence begins with a token, which - * is a one-byte value separated into two 4-bit fields. - * - * The first field uses the four high bits of the token and encodes the literal - * length. If the field value is 0, there is no literal. If it is 15, - * additional bytes are added (each ranging from 0 to 255) to the previous - * value to produce a total length. - * - * Following the token and optional length bytes are the literals. - * - * Next are the 4 bytes representing the offset of the match (2 in lz4), - * representing the position to copy the literals. - * - * The lower four bits of the token encode the match length. With additional - * bytes added similarly to the additional literal length bytes after the offset. - * - * The last sequence is incomplete and stops right after the lieterals. - * - */ -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -/** - * Initialize the compression context. - * - * Allocates memory for the hash table. 
- */ -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -/** - * Frees up memory allocating in initializeCCtx - */ -void LDM_destroyCCtx(LDM_CCtx *cctx); - -/** - * Prints the percentage of the hash table occupied (where occupied is defined - * as the entry being non-zero). - */ -void LDM_outputHashTableOccupancy(const LDM_hashEntry *hashTable, - U32 hashTableSize); - -/** - * Prints the distribution of offsets in the hash table. - * - * The offsets are defined as the distance of the hash table entry from the - * current input position of the cctx. - */ -void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); - -/** - * Outputs compression statistics to stdout. - */ -void LDM_printCompressStats(const LDM_compressStats *stats); -/** - * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the - * LDM_MIN_MATCH_LENGTH bytes from match. - * - * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. - * - * Return 1 if valid, 0 otherwise. - */ -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); - -/** - * Counts the number of bytes that match from pIn and pMatch, - * up to pInLimit. - */ -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit); - -/** - * Encode the literal length followed by the literals. - * - * The literal length is written to the upper four bits of pToken, with - * additional bytes written to the output as needed (see lz4). - * - * This is followed by literalLength bytes corresponding to the literals. - */ -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength); - -/** - * Write current block (literals, literal length, match offset, - * match length). - */ -void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, - const U32 offset, - const U32 matchLength); - -/** - * Decompresses src into dst. - * - * Note: assumes src does not have a header. 
- */ -size_t LDM_decompress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize); - -/** - * Initialize the decompression context. - */ -void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize); - -/** - * Reads the header from src and writes the compressed size and - * decompressed size into compressedSize and decompressedSize respectively. - * - * NB: LDM_compress and LDM_decompress currently do not add/read headers. - */ -void LDM_readHeader(const void *src, U64 *compressedSize, - U64 *decompressedSize); - -void LDM_test(void); - -#endif /* LDM_H */ diff --git a/contrib/long_distance_matching/versions/v0.5/main-ldm.c b/contrib/long_distance_matching/versions/v0.5/main-ldm.c deleted file mode 100644 index ea6375ba..00000000 --- a/contrib/long_distance_matching/versions/v0.5/main-ldm.c +++ /dev/null @@ -1,270 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "ldm.h" -#include "zstd.h" - -#define DEBUG -#define TEST - -/* Compress file given by fname and output to oname. - * Returns 0 if successful, error code otherwise. - * - * TODO: This currently seg faults if the compressed size is > the decompress - * size due to the mmapping and output file size allocated to be the input size. - * The compress function should check before writing or buffer writes. - */ -static int compress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - size_t maxCompressedSize, compressedSize; - - /* Open the input file. */ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. 
*/ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - maxCompressedSize = statbuf.st_size + LDM_HEADER_SIZE; - - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? */ - if (lseek(fdout, maxCompressedSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, maxCompressedSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - compressedSize = LDM_HEADER_SIZE + - LDM_compress(src, statbuf.st_size, - dst + LDM_HEADER_SIZE, maxCompressedSize); - - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressedSize, 8); - memcpy(dst + 8, &(statbuf.st_size), 8); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressedSize); - printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif - - // Truncate file to compressedSize. - ftruncate(fdout, compressedSize); - - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, - (double)compressedSize / (statbuf.st_size) * 100); - - // Close files. - close(fdin); - close(fdout); - return 0; -} - -/* Decompress file compressed using LDM_compress. - * The input file should have the LDM_HEADER followed by payload. - * Returns 0 if succesful, and an error code otherwise. - */ -static int decompress(const char *fname, const char *oname) { - int fdin, fdout; - struct stat statbuf; - char *src, *dst; - U64 compressedSize, decompressedSize; - size_t outSize; - - /* Open the input file. 
*/ - if ((fdin = open(fname, O_RDONLY)) < 0) { - perror("Error in file opening"); - return 1; - } - - /* Open the output file. */ - if ((fdout = open(oname, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) { - perror("Can't create output file"); - return 1; - } - - /* Find the size of the input file. */ - if (fstat (fdin, &statbuf) < 0) { - perror("Fstat error"); - return 1; - } - - /* mmap the input file. */ - if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) - == (caddr_t) - 1) { - perror("mmap error for input"); - return 1; - } - - /* Read the header. */ - LDM_readHeader(src, &compressedSize, &decompressedSize); - - /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressedSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } - - /* mmap the output file */ - if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE, - MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { - perror("mmap error for output"); - return 1; - } - - outSize = LDM_decompress( - src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, - dst, decompressedSize); - - printf("Ret size out: %zu\n", outSize); - ftruncate(fdout, outSize); - - close(fdin); - close(fdout); - return 0; -} - -/* Compare two files. - * Returns 0 iff they are the same. - */ -static int compare(FILE *fp0, FILE *fp1) { - int result = 0; - while (result == 0) { - char b0[1024]; - char b1[1024]; - const size_t r0 = fread(b0, 1, sizeof(b0), fp0); - const size_t r1 = fread(b1, 1, sizeof(b1), fp1); - - result = (int)r0 - (int)r1; - - if (0 == r0 || 0 == r1) break; - - if (0 == result) result = memcmp(b0, b1, r0); - } - return result; -} - -/* Verify the input file is the same as the decompressed file. 
*/ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); - - printf("verify : %s <-> %s\n", inpFilename, decFilename); - { - const int cmp = compare(inpFp, decFp); - if(0 == cmp) { - printf("verify : OK\n"); - } else { - printf("verify : NG\n"); - } - } - - fclose(decFp); - fclose(inpFp); -} - -int main(int argc, const char *argv[]) { - const char * const exeName = argv[0]; - char inpFilename[256] = { 0 }; - char ldmFilename[256] = { 0 }; - char decFilename[256] = { 0 }; - - if (argc < 2) { - printf("Wrong arguments\n"); - printf("Usage:\n"); - printf("%s FILE\n", exeName); - return 1; - } - - snprintf(inpFilename, 256, "%s", argv[1]); - snprintf(ldmFilename, 256, "%s.ldm", argv[1]); - snprintf(decFilename, 256, "%s.ldm.dec", argv[1]); - - printf("inp = [%s]\n", inpFilename); - printf("ldm = [%s]\n", ldmFilename); - printf("dec = [%s]\n", decFilename); - - - /* Compress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total compress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - - /* Decompress */ - { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); - if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); - return 1; - } - gettimeofday(&tv2, NULL); - printf("Total decompress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); - } - /* verify */ - verify(inpFilename, decFilename); - -#ifdef TEST - LDM_test(); -#endif - return 0; -} From fc41a8796493063c26103ca7ed32561dd2e649d0 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 17 Jul 2017 18:13:09 -0700 Subject: [PATCH 40/62] Experiment with using a lag when hashing --- contrib/long_distance_matching/Makefile | 6 +- 
contrib/long_distance_matching/basic_table.c | 6 +- .../circular_buffer_table.c | 20 +++--- contrib/long_distance_matching/ldm.c | 71 ++++++++++++------- contrib/long_distance_matching/ldm.h | 4 +- .../long_distance_matching/ldm_hashtable.h | 32 +++++++-- contrib/long_distance_matching/main-ldm.c | 1 - 7 files changed, 88 insertions(+), 52 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 47085022..df439015 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-basic main-circular-buffer +all: main-basic main-circular-buffer main-lag main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,9 +33,11 @@ main-basic : basic_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-lag: lag_table.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer + main-basic main-circular-buffer main-lag @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 859bf061..893a4caf 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -27,7 +27,6 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + hash; } - LDM_hashEntry *HASH_getEntryFromHash( const LDM_hashTable *table, const hash_t hash, const U32 checksum) { (void)checksum; @@ -43,13 +42,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, (void)checksum; if ((*isValid)(pIn, entry->offset + table->offsetBase)) { return entry; - } else { - return NULL; } + return NULL; } - - void HASH_insert(LDM_hashTable *table, const hash_t hash, const 
LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index f45f945c..b578d2bf 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -9,7 +9,7 @@ // refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. -#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) struct LDM_hashTable { @@ -19,6 +19,7 @@ struct LDM_hashTable { // Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; BYTE *bucketOffsets; // Pointer to current insert position. + // Last insert was at bucketOffsets - 1? }; @@ -35,15 +36,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } -/* -static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, - const hash_t hash) { - LDM_hashEntry *bucket = getBucket(table, hash); - BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1); - return bucket + offset; -} -*/ - LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, @@ -53,7 +45,12 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *cur = bucket; // TODO: in order of recency? for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - // CHeck checksum for faster check. + /* + if (cur->checksum == 0 && cur->offset == 0) { + return NULL; + } + */ + // Check checksum for faster check. 
if (cur->checksum == checksum && (*isValid)(pIn, cur->offset + table->offsetBase)) { return cur; @@ -62,7 +59,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } - LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index bf54842f..dedbf79a 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -5,7 +5,7 @@ #include // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 31 +#define HASH_ONLY_EVERY 15 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) @@ -18,6 +18,10 @@ #define COMPUTE_STATS #define CHECKSUM_CHAR_OFFSET 10 + +#define LAG 0 + +//#define HASH_CHECK //#define RUN_CHECKS //#define LDM_DEBUG @@ -79,6 +83,10 @@ struct LDM_CCtx { unsigned step; // ip step, should be 1. + const BYTE *lagIp; + hash_t lagHash; + U32 lagSum; + // DEBUG const BYTE *DEBUG_setNextHash; }; @@ -253,6 +261,17 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); +#if LAG + if (cctx->ip - cctx->ibase > LAG) { +// printf("LAG %zu\n", cctx->ip - cctx->lagIp); + cctx->lagSum = updateChecksum( + cctx->lagSum, LDM_HASH_LENGTH, + cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); + cctx->lagIp++; + cctx->lagHash = checksumToHash(cctx->lagSum); + } +#endif + #ifdef RUN_CHECKS check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); @@ -270,18 +289,6 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { - /* -#ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) { - U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; - cctx->stats.numHashInserts++; - if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { - 
cctx->stats.numCollisions++; - } - } -#endif -*/ - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { @@ -289,8 +296,19 @@ static void putHashOfCurrentPositionFromHash( const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; */ +#if LAG + // TODO: off by 1, but whatever + if (cctx->lagIp - cctx->ibase > 0) { + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; + HASH_insert(cctx->hashTable, cctx->lagHash, entry); + } else { + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); + } +#else const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); +#endif } cctx->lastPosHashed = cctx->ip; @@ -331,15 +349,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } -/** - * Returns the position of the entry at hashTable[hash]. 
- */ -/* -static const BYTE *getPositionOnHash(const LDM_CCtx *cctx, const hash_t hash) { - return HASH_getEntryFromHash(cctx->hashTable, hash)->offset + cctx->ibase; -} -*/ - U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; @@ -431,12 +440,20 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { if (cctx->ip > cctx->imatchLimit) { return 1; } - +#ifdef HASH_CHECK + entry = HASH_getEntryFromHash(cctx->hashTable, h, sum); +#else entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, &LDM_isValidMatch); +#endif if (entry != NULL) { *match = entry->offset + cctx->ibase; +#ifdef HASH_CHECK + if (!LDM_isValidMatch(cctx->ip, *match)) { + entry = NULL; + } +#endif } putHashOfCurrentPositionFromHash(cctx, h, sum); } @@ -508,6 +525,12 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); +#if LAG + cctx.lagIp = cctx.ip; + cctx.lagHash = cctx.lastHash; + cctx.lagSum = cctx.lastSum; +#endif + /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -575,7 +598,7 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Encode the last literals (no more matches). */ { - const size_t lastRun = cctx.iend - cctx.anchor; + const U32 lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6d97bd56..6d7c4af2 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -10,8 +10,8 @@ #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_OFFSET_SIZE 4 -// Defines the size of the hash table. -#define LDM_MEMORY_USAGE 20 +// Defines the size of the hash table (currently the number of elements). 
+#define LDM_MEMORY_USAGE 12 #define LDM_WINDOW_SIZE_LOG 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 88d19ae2..83a9ed27 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -3,34 +3,54 @@ #include "mem.h" +// TODO: clean up comments + typedef U32 hash_t; typedef struct LDM_hashEntry { - U32 offset; + U32 offset; // TODO: Replace with pointer? U32 checksum; } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; -// TODO: rename functions -// TODO: comments - +/** + * Create a hash table with size hash buckets. + * LDM_hashEntry.offset is added to offsetBase to calculate pMatch in + * HASH_getValidEntry. + */ LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); -//TODO: unneeded? +/** + * Returns an LDM_hashEntry from the table that matches the checksum. + * Returns NULL if one does not exist. + */ LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum); +/** + * Gets a valid entry that matches the checksum. A valid entry is defined by + * *isValid. + * + * The function finds an entry matching the checksum, computes pMatch as + * offset + table.offsetBase, and calls isValid. + */ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); +/** + * Insert an LDM_hashEntry into the bucket corresponding to hash. + */ void HASH_insert(LDM_hashTable *table, const hash_t hash, - const LDM_hashEntry entry); + const LDM_hashEntry entry); +/** + * Return the number of distinct hash buckets. 
+ */ U32 HASH_getSize(const LDM_hashTable *table); void HASH_destroyTable(LDM_hashTable *table); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index ea6375ba..a379d3a6 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -163,7 +163,6 @@ static int decompress(const char *fname, const char *oname) { outSize = LDM_decompress( src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressedSize); - printf("Ret size out: %zu\n", outSize); ftruncate(fdout, outSize); From 19258f51c1d4e9b0e10ae0488e457887b4c383cb Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 18 Jul 2017 14:25:39 -0700 Subject: [PATCH 41/62] Make the meaning of LDM_MEMORY_USAGE consistent across tables --- contrib/long_distance_matching/Makefile | 7 +-- contrib/long_distance_matching/basic_table.c | 7 +++ .../circular_buffer_table.c | 34 +++++++------ contrib/long_distance_matching/ldm.c | 50 ++++++++++++------- contrib/long_distance_matching/ldm.h | 21 +++++--- .../long_distance_matching/ldm_hashtable.h | 3 +- contrib/long_distance_matching/main-ldm.c | 25 +++++++--- 7 files changed, 94 insertions(+), 53 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index df439015..131638fd 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-basic main-circular-buffer main-lag +all: main-basic main-circular-buffer main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,11 +33,8 @@ main-basic : basic_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-lag: lag_table.c ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic 
main-circular-buffer main-lag + main-basic main-circular-buffer @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 893a4caf..8b3588e8 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -1,9 +1,12 @@ #include #include +#include "ldm.h" #include "ldm_hashtable.h" #include "mem.h" +#define LDM_HASHLOG ((LDM_MEMORY_USAGE) - 4) + struct LDM_hashTable { U32 size; LDM_hashEntry *entries; @@ -46,6 +49,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index b578d2bf..bc7503f1 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -1,33 +1,36 @@ #include #include +#include "ldm.h" #include "ldm_hashtable.h" #include "mem.h" //TODO: move def somewhere else. -//TODO: memory usage is currently no longer LDM_MEMORY_USAGE. -// refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. +// HASH_BUCKET_SIZE_LOG defined in ldm.h #define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) + struct LDM_hashTable { - U32 size; + U32 size; // Number of buckets + U32 maxEntries; // Rename... LDM_hashEntry *entries; // 1-D array for now. // Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; BYTE *bucketOffsets; // Pointer to current insert position. - // Last insert was at bucketOffsets - 1? 
}; LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->size = size; - table->entries = calloc(size * HASH_BUCKET_SIZE, sizeof(LDM_hashEntry)); - table->bucketOffsets = calloc(size, sizeof(BYTE)); + table->size = size >> HASH_BUCKET_SIZE_LOG; + table->maxEntries = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); table->offsetBase = offsetBase; return table; } @@ -45,11 +48,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *cur = bucket; // TODO: in order of recency? for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - /* - if (cur->checksum == 0 && cur->offset == 0) { - return NULL; - } - */ // Check checksum for faster check. if (cur->checksum == checksum && (*isValid)(pIn, cur->offset + table->offsetBase)) { @@ -59,6 +57,11 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} + + LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { @@ -82,7 +85,7 @@ void HASH_insert(LDM_hashTable *table, } U32 HASH_getSize(const LDM_hashTable *table) { - return table->size * HASH_BUCKET_SIZE; + return table->size; } void HASH_destroyTable(LDM_hashTable *table) { @@ -101,7 +104,8 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { } } + printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - HASH_getSize(table), ctr, - 100.0 * (double)(ctr) / (double)HASH_getSize(table)); + table->maxEntries, ctr, + 100.0 * (double)(ctr) / table->maxEntries); } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index dedbf79a..4d8ca40b 100644 --- a/contrib/long_distance_matching/ldm.c +++ 
b/contrib/long_distance_matching/ldm.c @@ -4,12 +4,16 @@ #include #include -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 15 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +//#define LDM_HASH_ENTRY_SIZE 4 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 4) + +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + #define ML_BITS 4 #define ML_MASK ((1U<> (32 - LDM_HASHLOG)); + return HASH_hashU32(sum); +// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -261,9 +266,9 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); -#if LAG - if (cctx->ip - cctx->ibase > LAG) { -// printf("LAG %zu\n", cctx->ip - cctx->lagIp); +#if LDM_LAG +// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); + if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); @@ -296,7 +301,7 @@ static void putHashOfCurrentPositionFromHash( const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; */ -#if LAG +#if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; @@ -364,6 +369,18 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, return (U32)(pIn - pStart); } +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); + printf("Min match, hash length: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY: %d\n", HASH_ONLY_EVERY); + 
printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize) { const BYTE *ip = (const BYTE *)src; @@ -392,12 +409,8 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32, cctx->ibase); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase); - //HASH_initializeTable(cctx->hashTable, LDM_HASHTABLESIZE_U32); - -// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); -// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; @@ -520,17 +533,19 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; +// printf("TST: %d\n", LDM_WINDOW_SIZE / LDM_HASHTABLESIZE_U64); + printf("HASH LOG: %d\n", HASH_ONLY_EVERY_LOG); + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); -#if LAG +#if LDM_LAG cctx.lagIp = cctx.ip; cctx.lagHash = cctx.lastHash; cctx.lagSum = cctx.lastSum; #endif - /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -542,6 +557,7 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.numMatches++; #endif +// printf("HERE %zu\n", cctx.ip - cctx.ibase); /** * Catch up: look back to extend the match backwards from the found match. 
*/ diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6d7c4af2..2d4ff9cf 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -10,15 +10,20 @@ #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_OFFSET_SIZE 4 -// Defines the size of the hash table (currently the number of elements). -#define LDM_MEMORY_USAGE 12 +// Defines the size of the hash table. +// Currently this should be less than WINDOW_SIZE_LOG + 4? +#define LDM_MEMORY_USAGE 24 -#define LDM_WINDOW_SIZE_LOG 30 +//#define LDM_LAG (1 << 23) +//#define LDM_LAG (1 << 20) +#define LDM_LAG 0 + +#define LDM_WINDOW_SIZE_LOG 28 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) -//These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 +//These should be multiples of four (and perhaps set to the same values?). +#define LDM_MIN_MATCH_LENGTH 512 +#define LDM_HASH_LENGTH 512 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -48,7 +53,7 @@ typedef struct LDM_DCtx LDM_DCtx; * The lower four bits of the token encode the match length. With additional * bytes added similarly to the additional literal length bytes after the offset. * - * The last sequence is incomplete and stops right after the lieterals. + * The last sequence is incomplete and stops right after the literals. 
* */ size_t LDM_compress(const void *src, size_t srcSize, @@ -142,6 +147,8 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize); +void LDM_outputConfiguration(void); + void LDM_test(void); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 83a9ed27..4fef6621 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -42,6 +42,8 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const BYTE *pIn, int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); +hash_t HASH_hashU32(U32 value); + /** * Insert an LDM_hashEntry into the bucket corresponding to hash. */ @@ -61,5 +63,4 @@ void HASH_destroyTable(LDM_hashTable *table); */ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable); - #endif /* LDM_HASHTABLE_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index a379d3a6..a43ec000 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -18,7 +18,7 @@ /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * - * TODO: This currently seg faults if the compressed size is > the decompress + * TODO: This might seg fault if the compressed size is > the decompress * size due to the mmapping and output file size allocated to be the input size. * The compress function should check before writing or buffer writes. */ @@ -28,6 +28,8 @@ static int compress(const char *fname, const char *oname) { char *src, *dst; size_t maxCompressedSize, compressedSize; + struct timeval tv1, tv2; + /* Open the input file. 
*/ if ((fdin = open(fname, O_RDONLY)) < 0) { perror("Error in file opening"); @@ -46,7 +48,10 @@ static int compress(const char *fname, const char *oname) { return 1; } - maxCompressedSize = statbuf.st_size + LDM_HEADER_SIZE; + maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); + // Handle case where compressed size is > decompressed size. + // The compress function should check before writing or buffer writes. + maxCompressedSize += statbuf.st_size / 255; /* Go to the location corresponding to the last byte. */ /* TODO: fallocate? */ @@ -74,10 +79,12 @@ static int compress(const char *fname, const char *oname) { perror("mmap error for output"); return 1; } + gettimeofday(&tv1, NULL); compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, maxCompressedSize); + gettimeofday(&tv2, NULL); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 @@ -96,6 +103,14 @@ static int compress(const char *fname, const char *oname) { (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, (double)compressedSize / (statbuf.st_size) * 100); + printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec), + ((double)statbuf.st_size / (double) (1 << 20)) / + ((double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec))); + + // Close files. 
close(fdin); close(fdout); @@ -234,16 +249,10 @@ int main(int argc, const char *argv[]) { /* Compress */ { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); if (compress(inpFilename, ldmFilename)) { printf("Compress error"); return 1; } - gettimeofday(&tv2, NULL); - printf("Total compress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); } /* Decompress */ From 1fa223859fb7f53d5a03c8b30dfd74d40898b05f Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 18 Jul 2017 18:05:10 -0700 Subject: [PATCH 42/62] Switch to using ZSTD_count instead of function pointer --- contrib/long_distance_matching/basic_table.c | 28 +++- .../circular_buffer_table.c | 151 +++++++++++++++++- contrib/long_distance_matching/ldm.c | 22 +-- contrib/long_distance_matching/ldm.h | 9 +- .../long_distance_matching/ldm_hashtable.h | 4 +- 5 files changed, 194 insertions(+), 20 deletions(-) diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 8b3588e8..6c12b508 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -36,14 +36,38 @@ LDM_hashEntry *HASH_getEntryFromHash( return getBucket(table, hash); } +static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, + U32 minMatchLength, U32 maxWindowSize) { + U32 lengthLeft = minMatchLength; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; + + if (pIn - pMatch > maxWindowSize) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { + return 0; + } + curIn += 4; + curMatch += 4; + } + return 1; +} + LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, - int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + const BYTE *pEnd, + U32 minMatchLength, + U32 maxWindowSize) { LDM_hashEntry *entry = getBucket(table, hash); (void)checksum; - if 
((*isValid)(pIn, entry->offset + table->offsetBase)) { + (void)pEnd; + if (isValidMatch(pIn, entry->offset + table->offsetBase, + minMatchLength, maxWindowSize)) { return entry; } return NULL; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index bc7503f1..653d9e51 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -9,11 +9,14 @@ // Number of elements per hash bucket. // HASH_BUCKET_SIZE_LOG defined in ldm.h -#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +// TODO: rename. Number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) +//#define TMP_ZSTDTOGGLE + struct LDM_hashTable { U32 size; // Number of buckets U32 maxEntries; // Rename... @@ -39,20 +42,162 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } +#ifdef TMP_ZSTDTOGGLE +static unsigned ZSTD_NbCommonBytes (register size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + 
return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + +// From lib/compress/zstd_compress.c +static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *const pInLimit) { + const BYTE * const pStart = pIn; + const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); + + while (pIn < pInLoopLimit) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { + pIn += sizeof(size_t); + pMatch += sizeof(size_t); + continue; + } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } + + if (MEM_64bits()) { + if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + } + if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) { + pIn++; + } + return (size_t)(pIn - pStart); +} 
+ +#else + +static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, + U32 minMatchLength, U32 maxWindowSize) { + U32 lengthLeft = minMatchLength; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; + + if (pIn - pMatch > maxWindowSize) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { + return 0; + } + curIn += 4; + curMatch += 4; + } + return 1; +} + +#endif // TMP_ZSTDTOGGLE + LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, - int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + const BYTE *pEnd, + U32 minMatchLength, + U32 maxWindowSize) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; // TODO: in order of recency? for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { // Check checksum for faster check. + const BYTE *pMatch = cur->offset + table->offsetBase; +#ifdef TMP_ZSTDTOGGLE + if (cur->checksum == checksum && pIn - pMatch <= maxWindowSize) { + U32 matchLength = ZSTD_count(pIn, pMatch, pEnd); + if (matchLength >= minMatchLength) { + return cur; + } + } +#else + (void)pEnd; + (void)minMatchLength; + (void)maxWindowSize; + if (cur->checksum == checksum && - (*isValid)(pIn, cur->offset + table->offsetBase)) { + isValidMatch(pIn, pMatch, minMatchLength, maxWindowSize)) { return cur; } +#endif } return NULL; } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 4d8ca40b..56b22d28 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -91,6 +91,7 @@ struct LDM_CCtx { hash_t lagHash; U32 lagSum; + U64 numHashInserts; // DEBUG const BYTE *DEBUG_setNextHash; }; @@ -164,7 +165,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { } printf("\n"); printf("=====================\n"); - } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { @@ -376,7 +376,7 @@ void LDM_outputConfiguration(void) { 
printf("Min match, hash length: %d, %d\n", LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY: %d\n", HASH_ONLY_EVERY); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); printf("LDM_LAG %d\n", LDM_LAG); printf("=====================\n"); } @@ -456,8 +456,10 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { #ifdef HASH_CHECK entry = HASH_getEntryFromHash(cctx->hashTable, h, sum); #else - entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, - &LDM_isValidMatch); + entry = HASH_getValidEntry(cctx->hashTable, h, sum, + cctx->ip, cctx->iend, + LDM_MIN_MATCH_LENGTH, + LDM_WINDOW_SIZE); #endif if (entry != NULL) { @@ -534,9 +536,10 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_CCtx cctx; const BYTE *match = NULL; // printf("TST: %d\n", LDM_WINDOW_SIZE / LDM_HASHTABLESIZE_U64); - printf("HASH LOG: %d\n", HASH_ONLY_EVERY_LOG); +// printf("HASH LOG: %d\n", HASH_ONLY_EVERY_LOG); LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + LDM_outputConfiguration(); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); @@ -553,11 +556,10 @@ size_t LDM_compress(const void *src, size_t srcSize, * and encode the final literals. */ while (LDM_findBestMatch(&cctx, &match) == 0) { + U32 backwardsMatchLen = 0; #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif - -// printf("HERE %zu\n", cctx.ip - cctx.ibase); /** * Catch up: look back to extend the match backwards from the found match. 
*/ @@ -565,6 +567,7 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.ip[-1] == match[-1]) { cctx.ip--; match--; + backwardsMatchLen++; } /** @@ -575,8 +578,9 @@ size_t LDM_compress(const void *src, size_t srcSize, const U32 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; const U32 matchLength = LDM_countMatchLength( - cctx.ip + LDM_MIN_MATCH_LENGTH, match + LDM_MIN_MATCH_LENGTH, - cctx.ihashLimit); + cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, + match + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, + cctx.ihashLimit) + backwardsMatchLen; LDM_outputBlock(&cctx, literalLength, offset, matchLength); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 2d4ff9cf..735435e8 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -12,18 +12,17 @@ // Defines the size of the hash table. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 24 +#define LDM_MEMORY_USAGE 23 -//#define LDM_LAG (1 << 23) //#define LDM_LAG (1 << 20) -#define LDM_LAG 0 +#define LDM_LAG (0) #define LDM_WINDOW_SIZE_LOG 28 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same values?). 
-#define LDM_MIN_MATCH_LENGTH 512 -#define LDM_HASH_LENGTH 512 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_LENGTH 64 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 4fef6621..7566751d 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -40,7 +40,9 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, - int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); + const BYTE *pEnd, + U32 minMatchLength, + U32 maxWindowSize); hash_t HASH_hashU32(U32 value); From 4352e09cb002873f3c2eec5d79eddeefca28160f Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 18 Jul 2017 18:35:25 -0700 Subject: [PATCH 43/62] Avoid recounting match lengths with ZSTD_count --- contrib/long_distance_matching/basic_table.c | 6 +++++- .../circular_buffer_table.c | 13 +++++++------ contrib/long_distance_matching/ldm.c | 18 +++++++++++++----- contrib/long_distance_matching/ldm_hashtable.h | 5 +++-- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 6c12b508..30c548d2 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -62,12 +62,16 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, U32 minMatchLength, - U32 maxWindowSize) { + U32 maxWindowSize, + U32 *matchLength) { LDM_hashEntry *entry = getBucket(table, hash); (void)checksum; (void)pEnd; + (void)matchLength; + // TODO: Count the entire forward match length rather than check if valid. 
if (isValidMatch(pIn, entry->offset + table->offsetBase, minMatchLength, maxWindowSize)) { + return entry; } return NULL; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 653d9e51..104d1b33 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -15,17 +15,16 @@ // TODO: rename. Number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) -//#define TMP_ZSTDTOGGLE +#define TMP_ZSTDTOGGLE struct LDM_hashTable { U32 size; // Number of buckets U32 maxEntries; // Rename... LDM_hashEntry *entries; // 1-D array for now. + BYTE *bucketOffsets; // Pointer to current insert position. // Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; - BYTE *bucketOffsets; // Pointer to current insert position. - // Last insert was at bucketOffsets - 1? }; LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { @@ -174,7 +173,8 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, U32 minMatchLength, - U32 maxWindowSize) { + U32 maxWindowSize, + U32 *matchLength) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; // TODO: in order of recency? 
@@ -183,8 +183,9 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const BYTE *pMatch = cur->offset + table->offsetBase; #ifdef TMP_ZSTDTOGGLE if (cur->checksum == checksum && pIn - pMatch <= maxWindowSize) { - U32 matchLength = ZSTD_count(pIn, pMatch, pEnd); - if (matchLength >= minMatchLength) { + U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); + if (forwardMatchLength >= minMatchLength) { + *matchLength = forwardMatchLength; return cur; } } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 56b22d28..1512ab8c 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -28,6 +28,7 @@ //#define HASH_CHECK //#define RUN_CHECKS +//#define TMP_RECOMPUTE_LENGTHS #include "ldm.h" #include "ldm_hashtable.h" @@ -435,8 +436,10 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * Returns 0 if successful and 1 otherwise (i.e. no match can be found * in the remaining input that is long enough). * + * matchLength contains the forward length of the match. 
*/ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, + U32 *matchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -459,7 +462,8 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, cctx->iend, LDM_MIN_MATCH_LENGTH, - LDM_WINDOW_SIZE); + LDM_WINDOW_SIZE, + matchLength); #endif if (entry != NULL) { @@ -535,8 +539,7 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; -// printf("TST: %d\n", LDM_WINDOW_SIZE / LDM_HASHTABLESIZE_U64); -// printf("HASH LOG: %d\n", HASH_ONLY_EVERY_LOG); + U32 forwardMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); @@ -555,7 +558,7 @@ size_t LDM_compress(const void *src, size_t srcSize, * is less than the minimum match length), then stop searching for matches * and encode the final literals. 
*/ - while (LDM_findBestMatch(&cctx, &match) == 0) { + while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength) == 0) { U32 backwardsMatchLen = 0; #ifdef COMPUTE_STATS cctx.stats.numMatches++; @@ -577,10 +580,15 @@ size_t LDM_compress(const void *src, size_t srcSize, { const U32 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; +#ifdef TMP_RECOMPUTE_LENGTHS const U32 matchLength = LDM_countMatchLength( cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, match + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, cctx.ihashLimit) + backwardsMatchLen; +#else + const U32 matchLength = forwardMatchLength + backwardsMatchLen - + LDM_MIN_MATCH_LENGTH; +#endif LDM_outputBlock(&cctx, literalLength, offset, matchLength); diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 7566751d..2ea159f7 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -41,8 +41,9 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const U32 checksum, const BYTE *pIn, const BYTE *pEnd, - U32 minMatchLength, - U32 maxWindowSize); + const U32 minMatchLength, + const U32 maxWindowSize, + U32 *matchLength); hash_t HASH_hashU32(U32 value); From 030264ca51814d3bef8debcedace65f56779f691 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 19 Jul 2017 14:14:26 -0700 Subject: [PATCH 44/62] Experiment with integrating ZSTD_count with findBestMatch --- contrib/long_distance_matching/Makefile | 12 +- .../circular_buffer_table.c | 117 ++- contrib/long_distance_matching/ldm.c | 37 +- contrib/long_distance_matching/ldm.h | 9 +- .../long_distance_matching/ldm_hashtable.h | 9 +- .../long_distance_matching/ldm_with_table.c | 959 ++++++++++++++++++ 6 files changed, 1093 insertions(+), 50 deletions(-) create mode 100644 contrib/long_distance_matching/ldm_with_table.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile 
index 131638fd..3aa3f8bd 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,16 +25,20 @@ LDFLAGS += -lzstd default: all -all: main-basic main-circular-buffer +all: main-circular-buffer main-integrated -main-basic : basic_table.c ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +#main-basic : basic_table.c ldm.c main-ldm.c +# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-integrated: ldm_with_table.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + + clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer + main-basic main-circular-buffer main-integrated @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 104d1b33..9b7ad088 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -14,8 +14,8 @@ // TODO: rename. Number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) - -#define TMP_ZSTDTOGGLE +#define ZSTD_SKIP +//#define TMP_TST struct LDM_hashTable { U32 size; // Number of buckets @@ -25,15 +25,20 @@ struct LDM_hashTable { // Position corresponding to offset=0 in LDM_hashEntry. 
const BYTE *offsetBase; + U32 minMatchLength; + U32 maxWindowSize; }; -LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, + U32 minMatchLength, U32 maxWindowSize) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); table->size = size >> HASH_BUCKET_SIZE_LOG; table->maxEntries = size; table->entries = calloc(size, sizeof(LDM_hashEntry)); table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); table->offsetBase = offsetBase; + table->minMatchLength = minMatchLength; + table->maxWindowSize = maxWindowSize; return table; } @@ -41,7 +46,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } -#ifdef TMP_ZSTDTOGGLE +#if TMP_ZSTDTOGGLE static unsigned ZSTD_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { @@ -143,10 +148,85 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, return (size_t)(pIn - pStart); } +U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { + U32 matchLength = 0; + while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + const BYTE *pEnd, + U32 *matchLength, + U32 *backwardsMatchLength, + const BYTE *pAnchor) { + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + LDM_hashEntry *bestEntry = NULL; + U32 bestMatchLength = 0; + U32 forwardMatch = 0; + U32 backwardMatch = 0; +#ifdef TMP_TST + U32 numBetter = 0; +#endif + for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + // Check checksum for faster check. 
+ const BYTE *pMatch = cur->offset + table->offsetBase; + if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { + U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); + U32 backwardMatchLength, totalMatchLength; + if (forwardMatchLength < table->minMatchLength) { + continue; + } + backwardMatchLength = + countBackwardsMatch(pIn, pAnchor, cur->offset + table->offsetBase, + table->offsetBase); + + totalMatchLength = forwardMatchLength + backwardMatchLength; + + if (totalMatchLength >= bestMatchLength) { + bestMatchLength = totalMatchLength; + forwardMatch = forwardMatchLength; + backwardMatch = backwardMatchLength; + bestEntry = cur; +#ifdef TMP_TST + numBetter++; +#endif + +#ifdef ZSTD_SKIP + *matchLength = forwardMatchLength; + *backwardsMatchLength = backwardMatchLength; + + return cur; +#endif +// *matchLength = forwardMatchLength; +// return cur; + } + } + } + if (bestEntry != NULL && bestMatchLength > table->minMatchLength) { +#ifdef TMP_TST + printf("Num better %u\n", numBetter - 1); +#endif + *matchLength = forwardMatch; + *backwardsMatchLength = backwardMatch; + return bestEntry; + } + return NULL; +} + #else static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, U32 minMatchLength, U32 maxWindowSize) { + printf("HERE\n"); U32 lengthLeft = minMatchLength; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; @@ -165,44 +245,33 @@ static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, return 1; } -#endif // TMP_ZSTDTOGGLE - +//TODO: clean up function call. This is not at all decoupled from LDM. LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, const BYTE *pEnd, - U32 minMatchLength, - U32 maxWindowSize, - U32 *matchLength) { + U32 *matchLength, + U32 *backwardsMatchLength, + const BYTE *pAnchor) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; - // TODO: in order of recency? 
- for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + (void)matchLength; + (void)backwardsMatchLength; + (void)pAnchor; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { // Check checksum for faster check. const BYTE *pMatch = cur->offset + table->offsetBase; -#ifdef TMP_ZSTDTOGGLE - if (cur->checksum == checksum && pIn - pMatch <= maxWindowSize) { - U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); - if (forwardMatchLength >= minMatchLength) { - *matchLength = forwardMatchLength; - return cur; - } - } -#else (void)pEnd; - (void)minMatchLength; - (void)maxWindowSize; if (cur->checksum == checksum && - isValidMatch(pIn, pMatch, minMatchLength, maxWindowSize)) { + isValidMatch(pIn, pMatch, table->minMatchLength, table->maxWindowSize)) { return cur; } -#endif } return NULL; } +#endif hash_t HASH_hashU32(U32 value) { return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 1512ab8c..a116af70 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -24,8 +24,6 @@ #define OUTPUT_CONFIGURATION #define CHECKSUM_CHAR_OFFSET 10 -//#define LDM_LAG 0 - //#define HASH_CHECK //#define RUN_CHECKS //#define TMP_RECOMPUTE_LENGTHS @@ -410,7 +408,8 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase, + LDM_MIN_MATCH_LENGTH, LDM_WINDOW_SIZE); cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; @@ -439,7 +438,7 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * matchLength contains the forward length of the match. 
*/ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *matchLength) { + U32 *matchLength, U32 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -461,9 +460,8 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, #else entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, cctx->iend, - LDM_MIN_MATCH_LENGTH, - LDM_WINDOW_SIZE, - matchLength); + matchLength, backwardMatchLength, + cctx->anchor); #endif if (entry != NULL) { @@ -540,6 +538,7 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_CCtx cctx; const BYTE *match = NULL; U32 forwardMatchLength = 0; + U32 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); @@ -558,11 +557,14 @@ size_t LDM_compress(const void *src, size_t srcSize, * is less than the minimum match length), then stop searching for matches * and encode the final literals. */ - while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength) == 0) { - U32 backwardsMatchLen = 0; + while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, + &backwardsMatchLength) == 0) { #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif + +#if TMP_RECOMPUTE_LENGTHS + backwardsMatchLength = 0; /** * Catch up: look back to extend the match backwards from the found match. 
*/ @@ -570,8 +572,12 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.ip[-1] == match[-1]) { cctx.ip--; match--; - backwardsMatchLen++; + backwardsMatchLength++; } +#else + cctx.ip -= backwardsMatchLength; + match -= backwardsMatchLength; +#endif /** * Write current block (literals, literal length, match offset, match @@ -580,13 +586,14 @@ size_t LDM_compress(const void *src, size_t srcSize, { const U32 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; -#ifdef TMP_RECOMPUTE_LENGTHS +#if TMP_RECOMPUTE_LENGTHS const U32 matchLength = LDM_countMatchLength( - cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, - match + LDM_MIN_MATCH_LENGTH + backwardsMatchLen, - cctx.ihashLimit) + backwardsMatchLen; + cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLength, + match + LDM_MIN_MATCH_LENGTH + backwardsMatchLength, + cctx.ihashLimit) + backwardsMatchLength; #else - const U32 matchLength = forwardMatchLength + backwardsMatchLen - + const U32 matchLength = forwardMatchLength + + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; #endif diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 735435e8..2396227d 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -20,9 +20,12 @@ #define LDM_WINDOW_SIZE_LOG 28 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) -//These should be multiples of four (and perhaps set to the same values?). -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 +//These should be multiples of four (and perhaps set to the same value?). 
+#define LDM_MIN_MATCH_LENGTH 1024 +#define LDM_HASH_LENGTH 1024 + +#define TMP_ZSTDTOGGLE 1 +#define TMP_RECOMPUTE_LENGTHS (!(TMP_ZSTDTOGGLE)) typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 2ea159f7..51d82525 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -19,7 +19,8 @@ typedef struct LDM_hashTable LDM_hashTable; * LDM_hashEntry.offset is added to offsetBase to calculate pMatch in * HASH_getValidEntry. */ -LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, + U32 minMatchLength, U32 maxWindowSize); /** * Returns an LDM_hashEntry from the table that matches the checksum. @@ -41,9 +42,9 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const U32 checksum, const BYTE *pIn, const BYTE *pEnd, - const U32 minMatchLength, - const U32 maxWindowSize, - U32 *matchLength); + U32 *matchLength, + U32 *backwardsMatchLength, + const BYTE *pAnchor); hash_t HASH_hashU32(U32 value); diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c new file mode 100644 index 00000000..68a33d0f --- /dev/null +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -0,0 +1,959 @@ +#include +#include +#include +#include +#include + +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +//#define LDM_HASH_ENTRY_SIZE 4 +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 4) + +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + +/* Hash table stuff. 
*/ +#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now +#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) + +#define ML_BITS 4 +#define ML_MASK ((1U<size = size >> HASH_BUCKET_SIZE_LOG; + table->maxEntries = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); + return table; +} + +static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + (hash << HASH_BUCKET_SIZE_LOG); +} + + + +static unsigned ZSTD_NbCommonBytes (register size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clzll(val) >> 3); 
+# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + +// From lib/compress/zstd_compress.c +static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *const pInLimit) { + const BYTE * const pStart = pIn; + const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); + + while (pIn < pInLoopLimit) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { + pIn += sizeof(size_t); + pMatch += sizeof(size_t); + continue; + } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } + + if (MEM_64bits()) { + if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + } + if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) { + pIn++; + } + return (size_t)(pIn - pStart); +} + +U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { + U32 matchLength = 0; + while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +LDM_hashEntry *HASH_getValidEntry(const LDM_CCtx *cctx, + const hash_t hash, + const U32 checksum, + U32 *matchLength, + U32 *backwardsMatchLength) { + LDM_hashTable *table = cctx->hashTable; + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; 
+ LDM_hashEntry *bestEntry = NULL; + U32 bestMatchLength = 0; + for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + // Check checksum for faster check. + const BYTE *pMatch = cur->offset + cctx->ibase; + + if (cur->checksum == checksum && + cctx->ip - pMatch <= LDM_WINDOW_SIZE) { + U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); + U32 backwardMatchLength, totalMatchLength; + + // For speed. + if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { + continue; + } + + backwardMatchLength = + countBackwardsMatch(cctx->ip, cctx->anchor, + cur->offset + cctx->ibase, + cctx->ibase); + + totalMatchLength = forwardMatchLength + backwardMatchLength; + + if (totalMatchLength >= bestMatchLength && + totalMatchLength >= LDM_MIN_MATCH_LENGTH) { + bestMatchLength = totalMatchLength; + *matchLength = forwardMatchLength; + *backwardsMatchLength = backwardMatchLength; + + bestEntry = cur; +#ifdef ZSTD_SKIP + return cur; +#endif + } + } + } + if (bestEntry != NULL && bestMatchLength > LDM_MIN_MATCH_LENGTH) { + return bestEntry; + } + return NULL; +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->size; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table->bucketOffsets); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *table) { + U32 ctr = 0; + LDM_hashEntry *cur = table->entries; + LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE); + for (; cur < end; ++cur) { + if (cur->offset == 0) { + ctr++; + } + } + + printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE); + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + table->maxEntries, ctr, + 100.0 * (double)(ctr) / table->maxEntries); +} + + +// 
TODO: This can be done more efficiently (but it is not that important as it +// is only used for computing stats). +static int intLog2(U32 x) { + int ret = 0; + while (x >>= 1) { + ret++; + } + return ret; +} + +// TODO: Maybe we would eventually prefer to have linear rather than +// exponential buckets. +/** +void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { + U32 i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < HASH_getSize(cctx->hashTable); i++) { + int offset = (cctx->ip - cctx->ibase) - + HASH_getEntryFromHash(cctx->hashTable, i)->offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) HASH_getSize(cctx->hashTable)); + } + printf("\n"); +} +*/ + +void LDM_printCompressStats(const LDM_compressStats *stats) { + int i = 0; + printf("=====================\n"); + printf("Compression statistics\n"); + //TODO: compute percentage matched? 
+ printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); + printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", + stats->numMatches, + stats->totalMatchLength, + 100.0 * (double)stats->totalMatchLength / + (double)(stats->totalMatchLength + stats->totalLiteralLength)); + printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); + printf("avg offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("min offset, max offset: %u, %u\n", + stats->minOffset, stats->maxOffset); + + printf("\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + + for (; i <= intLog2(stats->maxOffset); i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches); + } + printf("\n"); + printf("=====================\n"); +} + +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { + U32 lengthLeft = LDM_MIN_MATCH_LENGTH; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; + + if (pIn - pMatch > LDM_WINDOW_SIZE) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { + return 0; + } + curIn += 4; + curMatch += 4; + } + return 1; +} + +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} + +/** + * Convert a sum computed from getChecksum to a hash value in the range + * of the hash table. + */ +static hash_t checksumToHash(U32 sum) { + return HASH_hashU32(sum); +// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); +} + +/** + * Computes a checksum based on rsync's checksum. 
+ * + * a(k,l) = \sum_{i = k}^l x_i (mod M) + * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) + * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) + */ +static U32 getChecksum(const BYTE *buf, U32 len) { + U32 i; + U32 s1, s2; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]) + + (10 * CHECKSUM_CHAR_OFFSET); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + + + (4 * CHECKSUM_CHAR_OFFSET); + + } + for(; i < len; i++) { + s1 += buf[i] + CHECKSUM_CHAR_OFFSET; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} + +/** + * Update a checksum computed from getChecksum(data, len). + * + * The checksum can be updated along its ends as follows: + * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) + * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) + * + * Thus toRemove should correspond to data[0]. + */ +static U32 updateChecksum(U32 sum, U32 len, + BYTE toRemove, BYTE toAdd) { + U32 s1 = (sum & 0xffff) - toRemove + toAdd; + U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; + + return (s1 & 0xffff) + (s2 << 16); +} + +/** + * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed + * based on cctx->lastSum and cctx->lastPosHashed. + * + * This uses a rolling hash and requires that the last position hashed + * corresponds to cctx->nextIp - step. 
+ */ +static void setNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + U32 check; + if ((cctx->nextIp - cctx->ibase != 1) && + (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { + printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, + cctx->DEBUG_setNextHash - cctx->ibase); + } + + cctx->DEBUG_setNextHash = cctx->nextIp; +#endif + +// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = updateChecksum( + cctx->lastSum, LDM_HASH_LENGTH, + cctx->lastPosHashed[0], + cctx->lastPosHashed[LDM_HASH_LENGTH]); + cctx->nextPosHashed = cctx->nextIp; + cctx->nextHash = checksumToHash(cctx->nextSum); + +#if LDM_LAG +// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); + if (cctx->ip - cctx->ibase > LDM_LAG) { + cctx->lagSum = updateChecksum( + cctx->lagSum, LDM_HASH_LENGTH, + cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); + cctx->lagIp++; + cctx->lagHash = checksumToHash(cctx->lagSum); + } +#endif + +#ifdef RUN_CHECKS + check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); + + if (check != cctx->nextSum) { + printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); + } + + if ((cctx->nextIp - cctx->lastPosHashed) != 1) { + printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", + cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, + cctx->ip - cctx->ibase); + } +#endif +} + +static void putHashOfCurrentPositionFromHash( + LDM_CCtx *cctx, hash_t hash, U32 sum) { + // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. + // Note: this works only when cctx->step is 1. 
+ if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + /** + const LDM_hashEntry entry = { cctx->ip - cctx->ibase , + MEM_read32(cctx->ip) }; + */ +#if LDM_LAG + // TODO: off by 1, but whatever + if (cctx->lagIp - cctx->ibase > 0) { + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; + HASH_insert(cctx->hashTable, cctx->lagHash, entry); + } else { + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); + } +#else + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); +#endif + } + + cctx->lastPosHashed = cctx->ip; + cctx->lastHash = hash; + cctx->lastSum = sum; +} + +/** + * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed + * fields from the "next" fields. + * + * This requires that cctx->ip == cctx->nextPosHashed. + */ +static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + if (cctx->ip != cctx->nextPosHashed) { + printf("CHECK failed: updateLastHashFromNextHash %zu\n", + cctx->ip - cctx->ibase); + } +#endif + putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); +} + +/** + * Insert hash of the current position into the hash table. 
+ */ +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); + hash_t hash = checksumToHash(sum); + +#ifdef RUN_CHECKS + if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { + printf("CHECK failed: putHashOfCurrentPosition %zu\n", + cctx->ip - cctx->ibase); + } +#endif + + putHashOfCurrentPositionFromHash(cctx, hash, sum); +} + +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = (*pMatch) ^ *(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (U32)(pIn - pStart); + } + return (U32)(pIn - pStart); +} + +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); + printf("Min match, hash length: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize) { + const BYTE *ip = (const BYTE *)src; + *compressedSize = MEM_readLE64(ip); + ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip); + // ip += sizeof(U64); +} + +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; + cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)dst; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); + + 
cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + + + cctx->lastPosHashed = NULL; + + cctx->step = 1; // Fixed to be 1 for now. Changing may break things. + cctx->nextIp = cctx->ip + cctx->step; + cctx->nextPosHashed = 0; + + cctx->DEBUG_setNextHash = 0; +} + +void LDM_destroyCCtx(LDM_CCtx *cctx) { + HASH_destroyTable(cctx->hashTable); +} + +/** + * Finds the "best" match. + * + * Returns 0 if successful and 1 otherwise (i.e. no match can be found + * in the remaining input that is long enough). + * + * matchLength contains the forward length of the match. + */ +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, + U32 *matchLength, U32 *backwardMatchLength) { + + LDM_hashEntry *entry = NULL; + cctx->nextIp = cctx->ip + cctx->step; + + while (entry == NULL) { + hash_t h; + U32 sum; + setNextHash(cctx); + h = cctx->nextHash; + sum = cctx->nextSum; + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->ip > cctx->imatchLimit) { + return 1; + } + + entry = HASH_getValidEntry(cctx, h, sum, + matchLength, backwardMatchLength); + + if (entry != NULL) { + *match = entry->offset + cctx->ibase; + } + putHashOfCurrentPositionFromHash(cctx, h, sum); + } + setNextHash(cctx); + return 0; +} + +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *pToken = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx->op)++ = 255; + } + *(cctx->op)++ = (BYTE)len; + } else { + *pToken = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. 
*/ + memcpy(cctx->op, cctx->anchor, literalLength); + cctx->op += literalLength; +} + +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength) { + BYTE *pToken = cctx->op++; + + /* Encode the literal length and literals. */ + LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); + + /* Encode the offset. */ + MEM_write32(cctx->op, offset); + cctx->op += LDM_OFFSET_SIZE; + + /* Encode the match length. */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *pToken += ML_MASK; + matchLengthRemaining -= ML_MASK; + MEM_write32(cctx->op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx->op += 4; + MEM_write32(cctx->op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx->op += matchLengthRemaining / 255; + *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *pToken += (BYTE)(matchLength); + } +} + +// TODO: maxDstSize is unused. This function may seg fault when writing +// beyond the size of dst, as it does not check maxDstSize. Writing to +// a buffer and performing checks is a possible solution. +// +// This is based upon lz4. +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + LDM_CCtx cctx; + const BYTE *match = NULL; + U32 forwardMatchLength = 0; + U32 backwardsMatchLength = 0; + + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + LDM_outputConfiguration(); + + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + +#if LDM_LAG + cctx.lagIp = cctx.ip; + cctx.lagHash = cctx.lastHash; + cctx.lagSum = cctx.lastSum; +#endif + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. 
+ */ + while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, + &backwardsMatchLength) == 0) { +#ifdef COMPUTE_STATS + cctx.stats.numMatches++; +#endif + + cctx.ip -= backwardsMatchLength; + match -= backwardsMatchLength; + + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. + */ + { + const U32 literalLength = cctx.ip - cctx.anchor; + const U32 offset = cctx.ip - match; + const U32 matchLength = forwardMatchLength + + backwardsMatchLength - + LDM_MIN_MATCH_LENGTH; + + LDM_outputBlock(&cctx, literalLength, offset, matchLength); + +#ifdef COMPUTE_STATS + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; + cctx.stats.minOffset = + offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; + cctx.stats.maxOffset = + offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; + cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; +#endif + + // Move ip to end of block, inserting hashes at each position. + cctx.nextIp = cctx.ip + cctx.step; + while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + + matchLength + literalLength) { + if (cctx.ip > cctx.lastPosHashed) { + // TODO: Simplify. + LDM_updateLastHashFromNextHash(&cctx); + setNextHash(&cctx); + } + cctx.ip++; + cctx.nextIp++; + } + } + + // Set start of next block to current input pointer. + cctx.anchor = cctx.ip; + LDM_updateLastHashFromNextHash(&cctx); + } + + // HASH_outputTableOffsetHistogram(&cctx); + + /* Encode the last literals (no more matches). 
*/ + { + const U32 lastRun = cctx.iend - cctx.anchor; + BYTE *pToken = cctx.op++; + LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); + } + +#ifdef COMPUTE_STATS + LDM_printCompressStats(&cctx.stats); + HASH_outputTableOccupancy(cctx.hashTable); +#endif + + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } +} + +struct LDM_DCtx { + size_t compressedSize; + size_t maxDecompressedSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +}; + +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + dctx->compressedSize = compressedSize; + dctx->maxDecompressedSize = maxDecompressedSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressedSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressedSize; +} + +size_t LDM_decompress(const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + const unsigned token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy the literals. */ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = MEM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. 
*/ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += LDM_MIN_MATCH_LENGTH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now. + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + +// TODO: implement and test hash function +void LDM_test(void) { +} + +/* +void LDM_test(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + const BYTE *ip = (const BYTE *)src + 1125; + U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); + U32 sum2; + ++ip; + for (; ip < (const BYTE *)src + 1125 + 100; ip++) { + sum2 = updateChecksum(sum, LDM_HASH_LENGTH, + ip[-1], ip[LDM_HASH_LENGTH - 1]); + sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); + printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); + } +} +*/ + + From 2427a154cb6a5af622bdbe679f4b4c5b906b4821 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 19 Jul 2017 16:56:28 -0700 Subject: [PATCH 45/62] Minor refactoring --- contrib/long_distance_matching/basic_table.c | 109 -------------- .../circular_buffer_table.c | 137 ++++-------------- contrib/long_distance_matching/ldm.c | 42 +----- contrib/long_distance_matching/ldm.h | 12 +- .../long_distance_matching/ldm_hashtable.h | 36 +---- .../long_distance_matching/ldm_with_table.c | 84 ++++++----- contrib/long_distance_matching/main-ldm.c | 36 +---- 7 files changed, 102 insertions(+), 354 deletions(-) delete mode 100644 contrib/long_distance_matching/basic_table.c diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c deleted file mode 100644 index 30c548d2..00000000 --- a/contrib/long_distance_matching/basic_table.c +++ /dev/null @@ -1,109 +0,0 @@ -#include -#include - -#include "ldm.h" -#include "ldm_hashtable.h" -#include "mem.h" - -#define LDM_HASHLOG ((LDM_MEMORY_USAGE) - 4) - -struct 
LDM_hashTable { - U32 size; - LDM_hashEntry *entries; - const BYTE *offsetBase; -}; - -LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { - LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->size = size; - table->entries = calloc(size, sizeof(LDM_hashEntry)); - table->offsetBase = offsetBase; - return table; -} - -void HASH_initializeTable(LDM_hashTable *table, U32 size) { - table->size = size; - table->entries = calloc(size, sizeof(LDM_hashEntry)); -} - -LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { - return table->entries + hash; -} - -LDM_hashEntry *HASH_getEntryFromHash( - const LDM_hashTable *table, const hash_t hash, const U32 checksum) { - (void)checksum; - return getBucket(table, hash); -} - -static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, - U32 minMatchLength, U32 maxWindowSize) { - U32 lengthLeft = minMatchLength; - const BYTE *curIn = pIn; - const BYTE *curMatch = pMatch; - - if (pIn - pMatch > maxWindowSize) { - return 0; - } - - for (; lengthLeft >= 4; lengthLeft -= 4) { - if (MEM_read32(curIn) != MEM_read32(curMatch)) { - return 0; - } - curIn += 4; - curMatch += 4; - } - return 1; -} - -LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - U32 minMatchLength, - U32 maxWindowSize, - U32 *matchLength) { - LDM_hashEntry *entry = getBucket(table, hash); - (void)checksum; - (void)pEnd; - (void)matchLength; - // TODO: Count the entire forward match length rather than check if valid. 
- if (isValidMatch(pIn, entry->offset + table->offsetBase, - minMatchLength, maxWindowSize)) { - - return entry; - } - return NULL; -} - -hash_t HASH_hashU32(U32 value) { - return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); -} - -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - *getBucket(table, hash) = entry; -} - -U32 HASH_getSize(const LDM_hashTable *table) { - return table->size; -} - -void HASH_destroyTable(LDM_hashTable *table) { - free(table->entries); - free(table); -} - -void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { - U32 i = 0; - U32 ctr = 0; - for (; i < HASH_getSize(hashTable); i++) { - if (getBucket(hashTable, i)->offset == 0) { - ctr++; - } - } - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - HASH_getSize(hashTable), ctr, - 100.0 * (double)(ctr) / (double)HASH_getSize(hashTable)); -} diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 9b7ad088..9429fbcd 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -5,22 +5,19 @@ #include "ldm_hashtable.h" #include "mem.h" -//TODO: move def somewhere else. // Number of elements per hash bucket. // HASH_BUCKET_SIZE_LOG defined in ldm.h -#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) // TODO: rename. Number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) -#define ZSTD_SKIP -//#define TMP_TST +//#define ZSTD_SKIP struct LDM_hashTable { - U32 size; // Number of buckets - U32 maxEntries; // Rename... - LDM_hashEntry *entries; // 1-D array for now. + U32 numBuckets; + U32 numEntries; + LDM_hashEntry *entries; BYTE *bucketOffsets; // Pointer to current insert position. // Position corresponding to offset=0 in LDM_hashEntry. 
@@ -32,8 +29,8 @@ struct LDM_hashTable { LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, U32 minMatchLength, U32 maxWindowSize) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->size = size >> HASH_BUCKET_SIZE_LOG; - table->maxEntries = size; + table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; + table->numEntries = size; table->entries = calloc(size, sizeof(LDM_hashEntry)); table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); table->offsetBase = offsetBase; @@ -46,7 +43,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } -#if TMP_ZSTDTOGGLE static unsigned ZSTD_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { @@ -159,26 +155,22 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, return matchLength; } -LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - U32 *matchLength, - U32 *backwardsMatchLength, - const BYTE *pAnchor) { +LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + const BYTE *pEnd, + const BYTE *pAnchor, + U32 *pForwardMatchLength, + U32 *pBackwardMatchLength) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; U32 bestMatchLength = 0; - U32 forwardMatch = 0; - U32 backwardMatch = 0; -#ifdef TMP_TST - U32 numBetter = 0; -#endif for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - // Check checksum for faster check. const BYTE *pMatch = cur->offset + table->offsetBase; + + // Check checksum for faster check. 
if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U32 backwardMatchLength, totalMatchLength; @@ -193,105 +185,27 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, if (totalMatchLength >= bestMatchLength) { bestMatchLength = totalMatchLength; - forwardMatch = forwardMatchLength; - backwardMatch = backwardMatchLength; + *pForwardMatchLength = forwardMatchLength; + *pBackwardMatchLength = backwardMatchLength; + bestEntry = cur; -#ifdef TMP_TST - numBetter++; -#endif #ifdef ZSTD_SKIP - *matchLength = forwardMatchLength; - *backwardsMatchLength = backwardMatchLength; - return cur; #endif -// *matchLength = forwardMatchLength; -// return cur; } } } - if (bestEntry != NULL && bestMatchLength > table->minMatchLength) { -#ifdef TMP_TST - printf("Num better %u\n", numBetter - 1); -#endif - *matchLength = forwardMatch; - *backwardsMatchLength = backwardMatch; + if (bestEntry != NULL) { return bestEntry; } return NULL; } -#else - -static int isValidMatch(const BYTE *pIn, const BYTE *pMatch, - U32 minMatchLength, U32 maxWindowSize) { - printf("HERE\n"); - U32 lengthLeft = minMatchLength; - const BYTE *curIn = pIn; - const BYTE *curMatch = pMatch; - - if (pIn - pMatch > maxWindowSize) { - return 0; - } - - for (; lengthLeft >= 4; lengthLeft -= 4) { - if (MEM_read32(curIn) != MEM_read32(curMatch)) { - return 0; - } - curIn += 4; - curMatch += 4; - } - return 1; -} - -//TODO: clean up function call. This is not at all decoupled from LDM. -LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - U32 *matchLength, - U32 *backwardsMatchLength, - const BYTE *pAnchor) { - LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur = bucket; - (void)matchLength; - (void)backwardsMatchLength; - (void)pAnchor; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - // Check checksum for faster check. 
- const BYTE *pMatch = cur->offset + table->offsetBase; - (void)pEnd; - - if (cur->checksum == checksum && - isValidMatch(pIn, pMatch, table->minMatchLength, table->maxWindowSize)) { - return cur; - } - } - return NULL; -} - -#endif hash_t HASH_hashU32(U32 value) { return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); } - -LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum) { - // Loop through bucket. - // TODO: in order of recency??? - LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur = bucket; - for(; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - if (cur->checksum == checksum) { - return cur; - } - } - return NULL; -} - void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; @@ -300,7 +214,7 @@ void HASH_insert(LDM_hashTable *table, } U32 HASH_getSize(const LDM_hashTable *table) { - return table->size; + return table->numBuckets; } void HASH_destroyTable(LDM_hashTable *table) { @@ -312,15 +226,16 @@ void HASH_destroyTable(LDM_hashTable *table) { void HASH_outputTableOccupancy(const LDM_hashTable *table) { U32 ctr = 0; LDM_hashEntry *cur = table->entries; - LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE); + LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); for (; cur < end; ++cur) { if (cur->offset == 0) { ctr++; } } - printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE); + printf("Num buckets, bucket size: %d, %d\n", + table->numBuckets, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - table->maxEntries, ctr, - 100.0 * (double)(ctr) / table->maxEntries); + table->numEntries, ctr, + 100.0 * (double)(ctr) / table->numEntries); } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index a116af70..6e9addf7 100644 --- a/contrib/long_distance_matching/ldm.c 
+++ b/contrib/long_distance_matching/ldm.c @@ -14,7 +14,6 @@ #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - #define ML_BITS 4 #define ML_MASK ((1U<windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", @@ -191,7 +188,6 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { */ static hash_t checksumToHash(U32 sum) { return HASH_hashU32(sum); -// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -455,22 +451,14 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (cctx->ip > cctx->imatchLimit) { return 1; } -#ifdef HASH_CHECK - entry = HASH_getEntryFromHash(cctx->hashTable, h, sum); -#else - entry = HASH_getValidEntry(cctx->hashTable, h, sum, - cctx->ip, cctx->iend, - matchLength, backwardMatchLength, - cctx->anchor); -#endif + + entry = HASH_getBestEntry(cctx->hashTable, h, sum, + cctx->ip, cctx->iend, + cctx->anchor, + matchLength, backwardMatchLength); if (entry != NULL) { *match = entry->offset + cctx->ibase; -#ifdef HASH_CHECK - if (!LDM_isValidMatch(cctx->ip, *match)) { - entry = NULL; - } -#endif } putHashOfCurrentPositionFromHash(cctx, h, sum); } @@ -563,21 +551,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.numMatches++; #endif -#if TMP_RECOMPUTE_LENGTHS - backwardsMatchLength = 0; - /** - * Catch up: look back to extend the match backwards from the found match. 
- */ - while (cctx.ip > cctx.anchor && match > cctx.ibase && - cctx.ip[-1] == match[-1]) { - cctx.ip--; - match--; - backwardsMatchLength++; - } -#else cctx.ip -= backwardsMatchLength; match -= backwardsMatchLength; -#endif /** * Write current block (literals, literal length, match offset, match @@ -586,16 +561,9 @@ size_t LDM_compress(const void *src, size_t srcSize, { const U32 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; -#if TMP_RECOMPUTE_LENGTHS - const U32 matchLength = LDM_countMatchLength( - cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLength, - match + LDM_MIN_MATCH_LENGTH + backwardsMatchLength, - cctx.ihashLimit) + backwardsMatchLength; -#else const U32 matchLength = forwardMatchLength + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; -#endif LDM_outputBlock(&cctx, literalLength, offset, matchLength); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 2396227d..1d5b2f13 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,21 +11,21 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. +// Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? #define LDM_MEMORY_USAGE 23 +#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now -//#define LDM_LAG (1 << 20) -#define LDM_LAG (0) +// Defines the lag in inserting elements into the hash table. +#define LDM_LAG 0 #define LDM_WINDOW_SIZE_LOG 28 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same value?). 
-#define LDM_MIN_MATCH_LENGTH 1024 -#define LDM_HASH_LENGTH 1024 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_LENGTH 64 -#define TMP_ZSTDTOGGLE 1 -#define TMP_RECOMPUTE_LENGTHS (!(TMP_ZSTDTOGGLE)) typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 51d82525..df9dcd78 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -14,37 +14,17 @@ typedef struct LDM_hashEntry { typedef struct LDM_hashTable LDM_hashTable; -/** - * Create a hash table with size hash buckets. - * LDM_hashEntry.offset is added to offsetBase to calculate pMatch in - * HASH_getValidEntry. - */ LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, U32 minMatchLength, U32 maxWindowSize); -/** - * Returns an LDM_hashEntry from the table that matches the checksum. - * Returns NULL if one does not exist. - */ -LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum); - -/** - * Gets a valid entry that matches the checksum. A valid entry is defined by - * *isValid. - * - * The function finds an entry matching the checksum, computes pMatch as - * offset + table.offsetBase, and calls isValid. 
- */ -LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - U32 *matchLength, - U32 *backwardsMatchLength, - const BYTE *pAnchor); +LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + const BYTE *pEnd, + const BYTE *pAnchor, + U32 *matchLength, + U32 *backwardsMatchLength); hash_t HASH_hashU32(U32 value); diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c index 68a33d0f..5919d588 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -4,6 +4,8 @@ #include #include +#include "ldm.h" + #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) //#define LDM_HASH_ENTRY_SIZE 4 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) @@ -14,7 +16,6 @@ #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) /* Hash table stuff. */ -#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) @@ -32,18 +33,15 @@ //#define RUN_CHECKS -#include "ldm.h" - /* Hash table stuff */ typedef U32 hash_t; typedef struct LDM_hashEntry { - U32 offset; // TODO: Replace with pointer? + U32 offset; U32 checksum; } LDM_hashEntry; -// TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; @@ -110,18 +108,22 @@ struct LDM_CCtx { }; struct LDM_hashTable { - U32 size; // Number of buckets - U32 maxEntries; // Rename... - LDM_hashEntry *entries; // 1-D array for now. + U32 numBuckets; // Number of buckets + U32 numEntries; // Rename... + LDM_hashEntry *entries; BYTE *bucketOffsets; // Position corresponding to offset=0 in LDM_hashEntry. }; +/** + * Create a hash table that can contain size elements. 
+ * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. + */ LDM_hashTable *HASH_createTable(U32 size) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->size = size >> HASH_BUCKET_SIZE_LOG; - table->maxEntries = size; + table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; + table->numEntries = size; table->entries = calloc(size, sizeof(LDM_hashEntry)); table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); return table; @@ -131,10 +133,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } - - -static unsigned ZSTD_NbCommonBytes (register size_t val) -{ +static unsigned ZSTD_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { if (MEM_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) @@ -234,6 +233,11 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, return (size_t)(pIn - pStart); } +/** + * Count number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch > pBaes and pIn > pAnchor. + */ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { U32 matchLength = 0; @@ -245,20 +249,32 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, return matchLength; } -LDM_hashEntry *HASH_getValidEntry(const LDM_CCtx *cctx, - const hash_t hash, - const U32 checksum, - U32 *matchLength, - U32 *backwardsMatchLength) { +/** + * Returns a pointer to the entry in the hash table matching the hash and + * checksum with the "longest match length" as defined below. The forward and + * backward match lengths are written to *pForwardMatchLength and + * *pBackwardMatchLength. + * + * The match length is defined based on cctx->ip and the entry's offset. + * The forward match is computed from cctx->ip and entry->offset + cctx->ibase. 
+ * The backward match is computed backwards from cctx->ip and + * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH. + * + */ +LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, + const hash_t hash, + const U32 checksum, + U32 *pForwardMatchLength, + U32 *pBackwardMatchLength) { LDM_hashTable *table = cctx->hashTable; LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; U32 bestMatchLength = 0; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - // Check checksum for faster check. const BYTE *pMatch = cur->offset + cctx->ibase; + // Check checksum for faster check. if (cur->checksum == checksum && cctx->ip - pMatch <= LDM_WINDOW_SIZE) { U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); @@ -279,8 +295,8 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_CCtx *cctx, if (totalMatchLength >= bestMatchLength && totalMatchLength >= LDM_MIN_MATCH_LENGTH) { bestMatchLength = totalMatchLength; - *matchLength = forwardMatchLength; - *backwardsMatchLength = backwardMatchLength; + *pForwardMatchLength = forwardMatchLength; + *pBackwardMatchLength = backwardMatchLength; bestEntry = cur; #ifdef ZSTD_SKIP @@ -303,7 +319,7 @@ void HASH_insert(LDM_hashTable *table, } U32 HASH_getSize(const LDM_hashTable *table) { - return table->size; + return table->numBuckets; } void HASH_destroyTable(LDM_hashTable *table) { @@ -315,20 +331,20 @@ void HASH_destroyTable(LDM_hashTable *table) { void HASH_outputTableOccupancy(const LDM_hashTable *table) { U32 ctr = 0; LDM_hashEntry *cur = table->entries; - LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE); + LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); for (; cur < end; ++cur) { if (cur->offset == 0) { ctr++; } } - printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE); + printf("Num buckets, bucket size: %d, %d\n", + table->numBuckets, HASH_BUCKET_SIZE); printf("Hash table size, 
empty slots, %% empty: %u, %u, %.3f\n", - table->maxEntries, ctr, - 100.0 * (double)(ctr) / table->maxEntries); + table->numEntries, ctr, + 100.0 * (double)(ctr) / table->numEntries); } - // TODO: This can be done more efficiently (but it is not that important as it // is only used for computing stats). static int intLog2(U32 x) { @@ -339,7 +355,7 @@ static int intLog2(U32 x) { return ret; } -// TODO: Maybe we would eventually prefer to have linear rather than +// Maybe we would eventually prefer to have linear rather than // exponential buckets. /** void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { @@ -369,7 +385,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); printf("Compression statistics\n"); - //TODO: compute percentage matched? printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", stats->windowSizeLog, stats->hashTableSizeLog); printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", @@ -429,7 +444,6 @@ hash_t HASH_hashU32(U32 value) { */ static hash_t checksumToHash(U32 sum) { return HASH_hashU32(sum); -// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -672,10 +686,10 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * Returns 0 if successful and 1 otherwise (i.e. no match can be found * in the remaining input that is long enough). * - * matchLength contains the forward length of the match. + * forwardMatchLength contains the forward length of the match. 
*/ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *matchLength, U32 *backwardMatchLength) { + U32 *forwardMatchLength, U32 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -693,8 +707,8 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, return 1; } - entry = HASH_getValidEntry(cctx, h, sum, - matchLength, backwardMatchLength); + entry = HASH_getBestEntry(cctx, h, sum, + forwardMatchLength, backwardMatchLength); if (entry != NULL) { *match = entry->offset + cctx->ibase; diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index a43ec000..96db0c22 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -29,6 +29,7 @@ static int compress(const char *fname, const char *oname) { size_t maxCompressedSize, compressedSize; struct timeval tv1, tv2; + double timeTaken; /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { @@ -53,18 +54,7 @@ static int compress(const char *fname, const char *oname) { // The compress function should check before writing or buffer writes. maxCompressedSize += statbuf.st_size / 255; - /* Go to the location corresponding to the last byte. */ - /* TODO: fallocate? */ - if (lseek(fdout, maxCompressedSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* Write a dummy byte at the last location. */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } + ftruncate(fdout, maxCompressedSize); /* mmap the input file. 
*/ if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0)) @@ -103,12 +93,12 @@ static int compress(const char *fname, const char *oname) { (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, (double)compressedSize / (statbuf.st_size) * 100); + timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec), + printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec), - ((double)statbuf.st_size / (double) (1 << 20)) / - ((double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec))); + timeTaken, + ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken); // Close files. @@ -156,17 +146,7 @@ static int decompress(const char *fname, const char *oname) { /* Read the header. */ LDM_readHeader(src, &compressedSize, &decompressedSize); - /* Go to the location corresponding to the last byte. */ - if (lseek(fdout, decompressedSize - 1, SEEK_SET) == -1) { - perror("lseek error"); - return 1; - } - - /* write a dummy byte at the last location */ - if (write(fdout, "", 1) != 1) { - perror("write error"); - return 1; - } + ftruncate(fdout, decompressedSize); /* mmap the output file */ if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE, From 13a01ffb27f58d24dae8dc56c1ef75f1743cca27 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 19 Jul 2017 17:24:09 -0700 Subject: [PATCH 46/62] Fix off-by-one in size calculations --- contrib/long_distance_matching/circular_buffer_table.c | 3 ++- contrib/long_distance_matching/ldm.c | 6 +++--- contrib/long_distance_matching/ldm.h | 2 +- contrib/long_distance_matching/ldm_with_table.c | 8 ++++---- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 9429fbcd..ad7ae9e1 100644 --- 
a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -11,7 +11,8 @@ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) // TODO: rename. Number of hash buckets. -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) +// TODO: Link to HASH_ENTRY_SIZE_LOG +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-3-(HASH_BUCKET_SIZE_LOG)) //#define ZSTD_SKIP struct LDM_hashTable { diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 6e9addf7..9ffbab48 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -6,12 +6,12 @@ #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -//#define LDM_HASH_ENTRY_SIZE 4 +#define LDM_HASH_ENTRY_SIZE_LOG 3 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 4) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) #define ML_BITS 4 diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 1d5b2f13..04b6410c 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -13,7 +13,7 @@ // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 23 +#define LDM_MEMORY_USAGE 22 #define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. 
diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c index 5919d588..813ead6a 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -7,17 +7,17 @@ #include "ldm.h" #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -//#define LDM_HASH_ENTRY_SIZE 4 +#define LDM_HASH_ENTRY_SIZE_LOG 3 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 4) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) /* Hash table stuff. */ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) #define ML_BITS 4 #define ML_MASK ((1U< Date: Thu, 20 Jul 2017 16:50:06 -0700 Subject: [PATCH 47/62] Experiment with 64-bit hash and checksum --- contrib/long_distance_matching/Makefile | 8 +- .../circular_buffer_table.c | 7 +- contrib/long_distance_matching/ldm.c | 65 +- contrib/long_distance_matching/ldm.h | 3 +- .../long_distance_matching/ldm_hashtable.h | 2 + contrib/long_distance_matching/ldm_hf_test.c | 1048 +++++++++++++++++ .../long_distance_matching/ldm_with_table.c | 26 +- contrib/long_distance_matching/main-ldm.c | 12 +- 8 files changed, 1096 insertions(+), 75 deletions(-) create mode 100644 contrib/long_distance_matching/ldm_hf_test.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 3aa3f8bd..5119f464 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ 
LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated +all: main-circular-buffer main-integrated main-hf #main-basic : basic_table.c ldm.c main-ldm.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,12 +33,14 @@ all: main-circular-buffer main-integrated main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-hf: ldm_hf_test.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + main-integrated: ldm_with_table.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-integrated + main-basic main-circular-buffer main-integrated main-hf @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index ad7ae9e1..66107e06 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -5,14 +5,16 @@ #include "ldm_hashtable.h" #include "mem.h" - // Number of elements per hash bucket. // HASH_BUCKET_SIZE_LOG defined in ldm.h #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) + + // TODO: rename. Number of hash buckets. 
// TODO: Link to HASH_ENTRY_SIZE_LOG -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-3-(HASH_BUCKET_SIZE_LOG)) + //#define ZSTD_SKIP struct LDM_hashTable { @@ -175,6 +177,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U32 backwardMatchLength, totalMatchLength; + if (forwardMatchLength < table->minMatchLength) { continue; } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 9ffbab48..ab2de7c1 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,14 +4,15 @@ #include #include +#include "ldm.h" +#include "ldm_hashtable.h" #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASH_ENTRY_SIZE_LOG 3 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) #define ML_BITS 4 @@ -26,8 +27,7 @@ //#define RUN_CHECKS //#define TMP_RECOMPUTE_LENGTHS -#include "ldm.h" -#include "ldm_hashtable.h" +typedef U32 checksum_t; // TODO: Scanning speed // TODO: Memory usage @@ -71,22 +71,22 @@ struct LDM_CCtx { LDM_hashTable *hashTable; -// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ - U32 lastSum; + checksum_t lastSum; const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextPosHashed; hash_t nextHash; /* Hash corresponding to nextPosHashed */ - U32 nextSum; + checksum_t nextSum; + + unsigned step; // ip step, should be 1. 
const BYTE *lagIp; hash_t lagHash; - U32 lagSum; + checksum_t lagSum; U64 numHashInserts; // DEBUG @@ -191,15 +191,15 @@ static hash_t checksumToHash(U32 sum) { } /** - * Computes a checksum based on rsync's checksum. + * Computes a 32-bit checksum based on rsync's checksum. * * a(k,l) = \sum_{i = k}^l x_i (mod M) * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) */ -static U32 getChecksum(const BYTE *buf, U32 len) { +static checksum_t getChecksum(const BYTE *buf, U32 len) { U32 i; - U32 s1, s2; + checksum_t s1, s2; s1 = s2 = 0; for (i = 0; i < (len - 4); i += 4) { @@ -226,8 +226,8 @@ static U32 getChecksum(const BYTE *buf, U32 len) { * * Thus toRemove should correspond to data[0]. */ -static U32 updateChecksum(U32 sum, U32 len, - BYTE toRemove, BYTE toAdd) { +static checksum_t updateChecksum(checksum_t sum, U32 len, + BYTE toRemove, BYTE toAdd) { U32 s1 = (sum & 0xffff) - toRemove + toAdd; U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; @@ -262,7 +262,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextHash = checksumToHash(cctx->nextSum); #if LDM_LAG -// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, @@ -288,32 +287,28 @@ static void setNextHash(LDM_CCtx *cctx) { } static void putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 sum) { + LDM_CCtx *cctx, hash_t hash, U32 checksum) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - /** - const LDM_hashEntry entry = { cctx->ip - cctx->ibase , - MEM_read32(cctx->ip) }; - */ #if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; HASH_insert(cctx->hashTable, cctx->lagHash, entry); } else { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; HASH_insert(cctx->hashTable, hash, entry); } #else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; HASH_insert(cctx->hashTable, hash, entry); #endif } cctx->lastPosHashed = cctx->ip; cctx->lastHash = hash; - cctx->lastSum = sum; + cctx->lastSum = checksum; } /** @@ -336,7 +331,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { * Insert hash of the current position into the hash table. */ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); + checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); hash_t hash = checksumToHash(sum); #ifdef RUN_CHECKS @@ -441,7 +436,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, while (entry == NULL) { hash_t h; - U32 sum; + checksum_t sum; setNextHash(cctx); h = cctx->nextHash; sum = cctx->nextSum; @@ -698,23 +693,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize, } // TODO: implement and test hash function -void LDM_test(void) { +void LDM_test(const BYTE *src) { + (void)src; } -/* -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - const BYTE *ip = (const BYTE *)src + 1125; - U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); - U32 sum2; - ++ip; - for (; ip < (const BYTE *)src + 1125 + 100; ip++) { - sum2 = updateChecksum(sum, LDM_HASH_LENGTH, - ip[-1], ip[LDM_HASH_LENGTH - 1]); - sum = 
getChecksum((const char *)ip, LDM_HASH_LENGTH); - printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); - } -} -*/ - - diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 04b6410c..c420e60c 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -31,6 +31,7 @@ typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; + /** * Compresses src into dst. * @@ -151,6 +152,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize, void LDM_outputConfiguration(void); -void LDM_test(void); +void LDM_test(const BYTE *src); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index df9dcd78..9d5ba0e2 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -3,6 +3,8 @@ #include "mem.h" +#define LDM_HASH_ENTRY_SIZE_LOG 3 + // TODO: clean up comments typedef U32 hash_t; diff --git a/contrib/long_distance_matching/ldm_hf_test.c b/contrib/long_distance_matching/ldm_hf_test.c new file mode 100644 index 00000000..63be82d1 --- /dev/null +++ b/contrib/long_distance_matching/ldm_hf_test.c @@ -0,0 +1,1048 @@ +#include +#include +#include +#include +#include + +#include "ldm.h" + +#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASH_ENTRY_SIZE_LOG 3 +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) + +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + +/* Hash table stuff. 
*/ +#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) + +#define ML_BITS 4 +#define ML_MASK ((1U<> HASH_BUCKET_SIZE_LOG. + */ +LDM_hashTable *HASH_createTable(U32 size) { + LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; + table->numEntries = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); + return table; +} + +static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + (hash << HASH_BUCKET_SIZE_LOG); +} + +static unsigned ZSTD_NbCommonBytes (register size_t val) { + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanReverse64( &r, val 
); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + +// From lib/compress/zstd_compress.c +static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *const pInLimit) { + const BYTE * const pStart = pIn; + const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); + + while (pIn < pInLoopLimit) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { + pIn += sizeof(size_t); + pMatch += sizeof(size_t); + continue; + } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } + + if (MEM_64bits()) { + if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + } + if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) { + pIn++; + } + return (size_t)(pIn - pStart); +} + +/** + * Count number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch > pBaes and pIn > pAnchor. 
+ */ +U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { + U32 matchLength = 0; + while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +/** + * Returns a pointer to the entry in the hash table matching the hash and + * checksum with the "longest match length" as defined below. The forward and + * backward match lengths are written to *pForwardMatchLength and + * *pBackwardMatchLength. + * + * The match length is defined based on cctx->ip and the entry's offset. + * The forward match is computed from cctx->ip and entry->offset + cctx->ibase. + * The backward match is computed backwards from cctx->ip and + * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH. + * + */ +LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, + const hash_t hash, + const U32 checksum, + U32 *pForwardMatchLength, + U32 *pBackwardMatchLength) { + LDM_hashTable *table = cctx->hashTable; + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + LDM_hashEntry *bestEntry = NULL; + U32 bestMatchLength = 0; + for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + const BYTE *pMatch = cur->offset + cctx->ibase; + + // Check checksum for faster check. + if (cur->checksum == checksum && + cctx->ip - pMatch <= LDM_WINDOW_SIZE) { + U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); + U32 backwardMatchLength, totalMatchLength; + + // For speed. 
+ if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { + continue; + } + + backwardMatchLength = + countBackwardsMatch(cctx->ip, cctx->anchor, + cur->offset + cctx->ibase, + cctx->ibase); + + totalMatchLength = forwardMatchLength + backwardMatchLength; + + if (totalMatchLength >= bestMatchLength) { + bestMatchLength = totalMatchLength; + *pForwardMatchLength = forwardMatchLength; + *pBackwardMatchLength = backwardMatchLength; + + bestEntry = cur; +#ifdef ZSTD_SKIP + return cur; +#endif + } + } + } + if (bestEntry != NULL) { + return bestEntry; + } + return NULL; +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->numBuckets; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table->bucketOffsets); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *table) { + U32 ctr = 0; + LDM_hashEntry *cur = table->entries; + LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); + for (; cur < end; ++cur) { + if (cur->offset == 0) { + ctr++; + } + } + + printf("Num buckets, bucket size: %d, %d\n", + table->numBuckets, HASH_BUCKET_SIZE); + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + table->numEntries, ctr, + 100.0 * (double)(ctr) / table->numEntries); +} + +// TODO: This can be done more efficiently (but it is not that important as it +// is only used for computing stats). +static int intLog2(U32 x) { + int ret = 0; + while (x >>= 1) { + ret++; + } + return ret; +} + +// Maybe we would eventually prefer to have linear rather than +// exponential buckets. 
+/** +void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { + U32 i = 0; + int buckets[32] = { 0 }; + + printf("\n"); + printf("Hash table histogram\n"); + for (; i < HASH_getSize(cctx->hashTable); i++) { + int offset = (cctx->ip - cctx->ibase) - + HASH_getEntryFromHash(cctx->hashTable, i)->offset; + buckets[intLog2(offset)]++; + } + + i = 0; + for (; i < 32; i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + buckets[i], + 100.0 * (double) buckets[i] / + (double) HASH_getSize(cctx->hashTable)); + } + printf("\n"); +} +*/ + +void LDM_printCompressStats(const LDM_compressStats *stats) { + int i = 0; + printf("=====================\n"); + printf("Compression statistics\n"); + printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", + stats->windowSizeLog, stats->hashTableSizeLog); + printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", + stats->numMatches, + stats->totalMatchLength, + 100.0 * (double)stats->totalMatchLength / + (double)(stats->totalMatchLength + stats->totalLiteralLength)); + printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / + (double)stats->numMatches); + printf("avg literal length, total literalLength: %.1f, %llu\n", + ((double)stats->totalLiteralLength) / (double)stats->numMatches, + stats->totalLiteralLength); + printf("avg offset length: %.1f\n", + ((double)stats->totalOffset) / (double)stats->numMatches); + printf("min offset, max offset: %u, %u\n", + stats->minOffset, stats->maxOffset); + + printf("\n"); + printf("offset histogram: offset, num matches, %% of matches\n"); + + for (; i <= intLog2(stats->maxOffset); i++) { + printf("2^%*d: %10u %6.3f%%\n", 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches); + } + printf("\n"); + printf("=====================\n"); +} + +int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { + U32 lengthLeft = LDM_MIN_MATCH_LENGTH; + const BYTE *curIn = pIn; + const BYTE *curMatch = pMatch; + + if (pIn - 
pMatch > LDM_WINDOW_SIZE) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { + return 0; + } + curIn += 4; + curMatch += 4; + } + return 1; +} + +#if 0 +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} +#endif + +/** + * Convert a sum computed from getChecksum to a hash value in the range + * of the hash table. + */ +#if 0 +static hash_t checksumToHash(U32 sum) { + return HASH_hashU32(sum); +} +#endif + +// Upper LDM_HASH_LOG bits. +static hash_t checksumToHash(U64 sum) { + return sum >> (64 - LDM_HASHLOG); +} + +// 32 bits after LDM_HASH_LOG bits. +static U32 checksumFromHfHash(U64 hfHash) { + return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; +} + +#if 0 +/** + * Computes a checksum based on rsync's checksum. + * + * a(k,l) = \sum_{i = k}^l x_i (mod M) + * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) + * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) + */ +static U32 getChecksum(const BYTE *buf, U32 len) { + U32 i; + U32 s1, s2; + + s1 = s2 = 0; + for (i = 0; i < (len - 4); i += 4) { + s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + + (2 * buf[i + 2]) + (buf[i + 3]) + + (10 * CHECKSUM_CHAR_OFFSET); + s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + + + (4 * CHECKSUM_CHAR_OFFSET); + + } + for(; i < len; i++) { + s1 += buf[i] + CHECKSUM_CHAR_OFFSET; + s2 += s1; + } + return (s1 & 0xffff) + (s2 << 16); +} +#endif + +static U64 getChecksum(const BYTE *buf, U32 len) { + static const U64 prime8bytes = 11400714785074694791ULL; + +// static const U64 prime8bytes = 5; + U64 ret = 0; + U32 i; + for (i = 0; i < len; i++) { + ret *= prime8bytes; + ret += buf[i] + CHECKSUM_CHAR_OFFSET; +// printf("HERE %llu\n", ret); + } + return ret; + +} + +#if 0 +/** + * Update a checksum computed from getChecksum(data, len). 
+ * + * The checksum can be updated along its ends as follows: + * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) + * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) + * + * Thus toRemove should correspond to data[0]. + */ +static U32 updateChecksum(U32 sum, U32 len, + BYTE toRemove, BYTE toAdd) { + U32 s1 = (sum & 0xffff) - toRemove + toAdd; + U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; + + return (s1 & 0xffff) + (s2 << 16); +} +#endif + +static U64 ipow(U64 base, U64 exp) { + U64 ret = 1; + while (exp) { + if (exp & 1) { + ret *= base; + } + exp >>= 1; + base *= base; + } + return ret; +} + +static U64 updateChecksum(U64 sum, U32 len, + BYTE toRemove, BYTE toAdd) { + // TODO: deduplicate. + static const U64 prime8bytes = 11400714785074694791ULL; +// static const U64 prime8bytes = 5; + sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * + ipow(prime8bytes, len - 1)); + sum *= prime8bytes; + sum += toAdd + CHECKSUM_CHAR_OFFSET; + return sum; +} + +/** + * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed + * based on cctx->lastSum and cctx->lastPosHashed. + * + * This uses a rolling hash and requires that the last position hashed + * corresponds to cctx->nextIp - step. 
+ */ +static void setNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + U32 check; + if ((cctx->nextIp - cctx->ibase != 1) && + (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { + printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, + cctx->DEBUG_setNextHash - cctx->ibase); + } + + cctx->DEBUG_setNextHash = cctx->nextIp; +#endif + +// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); + cctx->nextSum = updateChecksum( + cctx->lastSum, LDM_HASH_LENGTH, + cctx->lastPosHashed[0], + cctx->lastPosHashed[LDM_HASH_LENGTH]); + cctx->nextPosHashed = cctx->nextIp; +#if 0 + cctx->nextHash = checksumToHash(cctx->nextSum); +#endif + + +#if LDM_LAG +// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); + if (cctx->ip - cctx->ibase > LDM_LAG) { + cctx->lagSum = updateChecksum( + cctx->lagSum, LDM_HASH_LENGTH, + cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); + cctx->lagIp++; +#if 0 + cctx->lagHash = checksumToHash(cctx->lagSum); +#endif + } +#endif + +#ifdef RUN_CHECKS + check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); + + if (check != cctx->nextSum) { + printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); + } + + if ((cctx->nextIp - cctx->lastPosHashed) != 1) { + printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", + cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, + cctx->ip - cctx->ibase); + } +#endif +} + +static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { + // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. + // Note: this works only when cctx->step is 1. 
+ U32 hash = checksumToHash(hfHash); + U32 sum = checksumFromHfHash(hfHash); +// printf("TMP %u %u %llu\n", hash, sum, hfHash); + + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + +#if LDM_LAG + // TODO: off by 1, but whatever + if (cctx->lagIp - cctx->ibase > 0) { + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; + HASH_insert(cctx->hashTable, cctx->lagHash, entry); + } else { + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); + } +#else + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); +#endif + } + + cctx->lastPosHashed = cctx->ip; +#if 0 + cctx->lastHash = hash; +#endif + cctx->lastSum = hfHash; +} + +/** + * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed + * fields from the "next" fields. + * + * This requires that cctx->ip == cctx->nextPosHashed. + */ +static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { +#ifdef RUN_CHECKS + if (cctx->ip != cctx->nextPosHashed) { + printf("CHECK failed: updateLastHashFromNextHash %zu\n", + cctx->ip - cctx->ibase); + } +#endif +#if 0 + putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); +#endif + putHashOfCurrentPositionFromHash(cctx, cctx->nextSum); +} + +/** + * Insert hash of the current position into the hash table. 
+ */ +static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { + U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); +#if 0 + hash_t hash = checksumToHash(sum); +#endif + +#ifdef RUN_CHECKS + if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { + printf("CHECK failed: putHashOfCurrentPosition %zu\n", + cctx->ip - cctx->ibase); + } +#endif +#if 0 + putHashOfCurrentPositionFromHash(cctx, hash, sum); +#endif + putHashOfCurrentPositionFromHash(cctx, sum); +} + +U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) { + const BYTE * const pStart = pIn; + while (pIn < pInLimit - 1) { + BYTE const diff = (*pMatch) ^ *(pIn); + if (!diff) { + pIn++; + pMatch++; + continue; + } + return (U32)(pIn - pStart); + } + return (U32)(pIn - pStart); +} + +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); + printf("Min match, hash length: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize) { + const BYTE *ip = (const BYTE *)src; + *compressedSize = MEM_readLE64(ip); + ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip); + // ip += sizeof(U64); +} + +void LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + cctx->isize = srcSize; + cctx->maxOSize = maxDstSize; + + cctx->ibase = (const BYTE *)src; + cctx->ip = cctx->ibase; + cctx->iend = cctx->ibase + srcSize; + + cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; + cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; + + cctx->obase = (BYTE *)dst; + cctx->op = (BYTE *)dst; + + cctx->anchor = cctx->ibase; + + memset(&(cctx->stats), 0, sizeof(cctx->stats)); 
+ cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); + + cctx->stats.minOffset = UINT_MAX; + cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; + cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; + + + cctx->lastPosHashed = NULL; + + cctx->step = 1; // Fixed to be 1 for now. Changing may break things. + cctx->nextIp = cctx->ip + cctx->step; + cctx->nextPosHashed = 0; + + cctx->DEBUG_setNextHash = 0; +} + +void LDM_destroyCCtx(LDM_CCtx *cctx) { + HASH_destroyTable(cctx->hashTable); +} + +/** + * Finds the "best" match. + * + * Returns 0 if successful and 1 otherwise (i.e. no match can be found + * in the remaining input that is long enough). + * + * forwardMatchLength contains the forward length of the match. + */ +static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, + U32 *forwardMatchLength, U32 *backwardMatchLength) { + + LDM_hashEntry *entry = NULL; + cctx->nextIp = cctx->ip + cctx->step; + + while (entry == NULL) { + hash_t h; + U64 hash; + U32 sum; + setNextHash(cctx); +#if 0 + h = cctx->nextHash; + sum = cctx->nextSum; +#endif + hash = cctx->nextSum; + h = checksumToHash(hash); + sum = checksumFromHfHash(hash); + + cctx->ip = cctx->nextIp; + cctx->nextIp += cctx->step; + + if (cctx->ip > cctx->imatchLimit) { + return 1; + } + + entry = HASH_getBestEntry(cctx, h, sum, + forwardMatchLength, backwardMatchLength); + + if (entry != NULL) { + *match = entry->offset + cctx->ibase; + } + putHashOfCurrentPositionFromHash(cctx, hash); + } + setNextHash(cctx); + return 0; +} + +void LDM_encodeLiteralLengthAndLiterals( + LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + /* Encode the literal length. */ + if (literalLength >= RUN_MASK) { + int len = (int)literalLength - RUN_MASK; + *pToken = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) { + *(cctx->op)++ = 255; + } + *(cctx->op)++ = (BYTE)len; + } else { + *pToken = (BYTE)(literalLength << ML_BITS); + } + + /* Encode the literals. 
*/ + memcpy(cctx->op, cctx->anchor, literalLength); + cctx->op += literalLength; +} + +void LDM_outputBlock(LDM_CCtx *cctx, + const U32 literalLength, + const U32 offset, + const U32 matchLength) { + BYTE *pToken = cctx->op++; + + /* Encode the literal length and literals. */ + LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); + + /* Encode the offset. */ + MEM_write32(cctx->op, offset); + cctx->op += LDM_OFFSET_SIZE; + + /* Encode the match length. */ + if (matchLength >= ML_MASK) { + unsigned matchLengthRemaining = matchLength; + *pToken += ML_MASK; + matchLengthRemaining -= ML_MASK; + MEM_write32(cctx->op, 0xFFFFFFFF); + while (matchLengthRemaining >= 4*0xFF) { + cctx->op += 4; + MEM_write32(cctx->op, 0xffffffff); + matchLengthRemaining -= 4*0xFF; + } + cctx->op += matchLengthRemaining / 255; + *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); + } else { + *pToken += (BYTE)(matchLength); + } +} + +// TODO: maxDstSize is unused. This function may seg fault when writing +// beyond the size of dst, as it does not check maxDstSize. Writing to +// a buffer and performing checks is a possible solution. +// +// This is based upon lz4. +size_t LDM_compress(const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { + LDM_CCtx cctx; + const BYTE *match = NULL; + U32 forwardMatchLength = 0; + U32 backwardsMatchLength = 0; + + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + LDM_outputConfiguration(); + + /* Hash the first position and put it into the hash table. */ + LDM_putHashOfCurrentPosition(&cctx); + +#if LDM_LAG + cctx.lagIp = cctx.ip; +// cctx.lagHash = cctx.lastHash; + cctx.lagSum = cctx.lastSum; +#endif + /** + * Find a match. + * If no more matches can be found (i.e. the length of the remaining input + * is less than the minimum match length), then stop searching for matches + * and encode the final literals. 
+ */ + while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, + &backwardsMatchLength) == 0) { +#ifdef COMPUTE_STATS + cctx.stats.numMatches++; +#endif + + cctx.ip -= backwardsMatchLength; + match -= backwardsMatchLength; + + /** + * Write current block (literals, literal length, match offset, match + * length) and update pointers and hashes. + */ + { + const U32 literalLength = cctx.ip - cctx.anchor; + const U32 offset = cctx.ip - match; + const U32 matchLength = forwardMatchLength + + backwardsMatchLength - + LDM_MIN_MATCH_LENGTH; + + LDM_outputBlock(&cctx, literalLength, offset, matchLength); + +#ifdef COMPUTE_STATS + cctx.stats.totalLiteralLength += literalLength; + cctx.stats.totalOffset += offset; + cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; + cctx.stats.minOffset = + offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; + cctx.stats.maxOffset = + offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; + cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; +#endif + + // Move ip to end of block, inserting hashes at each position. + cctx.nextIp = cctx.ip + cctx.step; + while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + + matchLength + literalLength) { + if (cctx.ip > cctx.lastPosHashed) { + // TODO: Simplify. + LDM_updateLastHashFromNextHash(&cctx); + setNextHash(&cctx); + } + cctx.ip++; + cctx.nextIp++; + } + } + + // Set start of next block to current input pointer. + cctx.anchor = cctx.ip; + LDM_updateLastHashFromNextHash(&cctx); + } + + // HASH_outputTableOffsetHistogram(&cctx); + + /* Encode the last literals (no more matches). 
*/ + { + const U32 lastRun = cctx.iend - cctx.anchor; + BYTE *pToken = cctx.op++; + LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); + } + +#ifdef COMPUTE_STATS + LDM_printCompressStats(&cctx.stats); + HASH_outputTableOccupancy(cctx.hashTable); +#endif + + { + const size_t ret = cctx.op - cctx.obase; + LDM_destroyCCtx(&cctx); + return ret; + } +} + +struct LDM_DCtx { + size_t compressedSize; + size_t maxDecompressedSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +}; + +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + dctx->compressedSize = compressedSize; + dctx->maxDecompressedSize = maxDecompressedSize; + + dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressedSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressedSize; +} + +size_t LDM_decompress(const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + const unsigned token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy the literals. */ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = MEM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. 
*/ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += LDM_MIN_MATCH_LENGTH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now. + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + +// TODO: implement and test hash function +void LDM_test(const BYTE *src) { + const U32 diff = 100; + const BYTE *pCur = src + diff; + U64 checksum = getChecksum(pCur, LDM_HASH_LENGTH); + + for (; pCur < src + diff + 60; ++pCur) { + U64 nextSum = getChecksum(pCur + 1, LDM_HASH_LENGTH); + U64 updateSum = updateChecksum(checksum, LDM_HASH_LENGTH, + pCur[0], pCur[LDM_HASH_LENGTH]); + checksum = nextSum; + printf("%llu %llu\n", nextSum, updateSum); + } +} + + diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c index 813ead6a..c727616a 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -29,7 +29,7 @@ #define CHECKSUM_CHAR_OFFSET 10 // Take first match only. 
-#define ZSTD_SKIP +//#define ZSTD_SKIP //#define RUN_CHECKS @@ -292,8 +292,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, totalMatchLength = forwardMatchLength + backwardMatchLength; - if (totalMatchLength >= bestMatchLength && - totalMatchLength >= LDM_MIN_MATCH_LENGTH) { + if (totalMatchLength >= bestMatchLength) { bestMatchLength = totalMatchLength; *pForwardMatchLength = forwardMatchLength; *pBackwardMatchLength = backwardMatchLength; @@ -305,7 +304,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, } } } - if (bestEntry != NULL && bestMatchLength > LDM_MIN_MATCH_LENGTH) { + if (bestEntry != NULL) { return bestEntry; } return NULL; @@ -951,23 +950,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize, } // TODO: implement and test hash function -void LDM_test(void) { +void LDM_test(const BYTE *src) { + (void)src; } -/* -void LDM_test(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - const BYTE *ip = (const BYTE *)src + 1125; - U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); - U32 sum2; - ++ip; - for (; ip < (const BYTE *)src + 1125 + 100; ip++) { - sum2 = updateChecksum(sum, LDM_HASH_LENGTH, - ip[-1], ip[LDM_HASH_LENGTH - 1]); - sum = getChecksum((const char *)ip, LDM_HASH_LENGTH); - printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2); - } -} -*/ - diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 96db0c22..3582d5a2 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -13,13 +13,13 @@ #include "zstd.h" #define DEBUG -#define TEST +//#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * * TODO: This might seg fault if the compressed size is > the decompress - * size due to the mmapping and output file size allocated to be the input size. 
+ * size due to the mmapping and output file size allocated to be the input size * The compress function should check before writing or buffer writes. */ static int compress(const char *fname, const char *oname) { @@ -69,6 +69,11 @@ static int compress(const char *fname, const char *oname) { perror("mmap error for output"); return 1; } + +#ifdef TEST + LDM_test((const BYTE *)src); +#endif + gettimeofday(&tv1, NULL); compressedSize = LDM_HEADER_SIZE + @@ -251,8 +256,5 @@ int main(int argc, const char *argv[]) { /* verify */ verify(inpFilename, decFilename); -#ifdef TEST - LDM_test(); -#endif return 0; } From 0b8fb1703b39ae1f07aea1eaac46b97372d9238d Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 20 Jul 2017 16:51:01 -0700 Subject: [PATCH 48/62] Experiment with 64-bit hash insertion policy --- contrib/long_distance_matching/Makefile | 6 +-- .../circular_buffer_table.c | 10 ++--- contrib/long_distance_matching/ldm.c | 10 ++--- contrib/long_distance_matching/ldm.h | 8 ++-- .../{ldm_hf_test.c => ldm_64_hash.c} | 38 ++++++++++--------- .../long_distance_matching/ldm_hashtable.h | 4 +- .../long_distance_matching/ldm_with_table.c | 12 +++--- contrib/long_distance_matching/main-ldm.c | 7 ++-- 8 files changed, 48 insertions(+), 47 deletions(-) rename contrib/long_distance_matching/{ldm_hf_test.c => ldm_64_hash.c} (97%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 5119f464..9dc33fae 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated main-hf +all: main-circular-buffer main-integrated main-64 #main-basic : basic_table.c ldm.c main-ldm.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,7 +33,7 @@ all: main-circular-buffer main-integrated main-hf main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-hf: 
ldm_hf_test.c main-ldm.c +main-64: ldm_64_hash.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-integrated: ldm_with_table.c main-ldm.c @@ -41,6 +41,6 @@ main-integrated: ldm_with_table.c main-ldm.c clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-integrated main-hf + main-basic main-circular-buffer main-integrated main-64 @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index 66107e06..fb6c19d2 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -164,19 +164,19 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U32 *pForwardMatchLength, - U32 *pBackwardMatchLength) { + U64 *pForwardMatchLength, + U64 *pBackwardMatchLength) { LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; - U32 bestMatchLength = 0; + U64 bestMatchLength = 0; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + table->offsetBase; // Check checksum for faster check. if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { - U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); - U32 backwardMatchLength, totalMatchLength; + U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); + U64 backwardMatchLength, totalMatchLength; if (forwardMatchLength < table->minMatchLength) { continue; diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index ab2de7c1..b018c475 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -429,7 +429,7 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * matchLength contains the forward length of the match. 
*/ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *matchLength, U32 *backwardMatchLength) { + U64 *matchLength, U64 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -462,7 +462,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { int len = (int)literalLength - RUN_MASK; @@ -481,9 +481,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. */ @@ -495,7 +495,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, /* Encode the match length. */ if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; + U64 matchLengthRemaining = matchLength; *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; MEM_write32(cctx->op, 0xFFFFFFFF); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index c420e60c..83cd3623 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -14,7 +14,7 @@ // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? #define LDM_MEMORY_USAGE 22 -#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 @@ -115,16 +115,16 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, * This is followed by literalLength bytes corresponding to the literals. 
*/ void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength); + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength); /** * Write current block (literals, literal length, match offset, * match length). */ void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength); + const U64 matchLength); /** * Decompresses src into dst. diff --git a/contrib/long_distance_matching/ldm_hf_test.c b/contrib/long_distance_matching/ldm_64_hash.c similarity index 97% rename from contrib/long_distance_matching/ldm_hf_test.c rename to contrib/long_distance_matching/ldm_64_hash.c index 63be82d1..a72c283f 100644 --- a/contrib/long_distance_matching/ldm_hf_test.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -241,9 +241,9 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, * * We count only bytes where pMatch > pBaes and pIn > pAnchor. */ -U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, +U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { - U32 matchLength = 0; + U64 matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -267,8 +267,8 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, const hash_t hash, const U32 checksum, - U32 *pForwardMatchLength, - U32 *pBackwardMatchLength) { + U64 *pForwardMatchLength, + U64 *pBackwardMatchLength) { LDM_hashTable *table = cctx->hashTable; LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; @@ -541,7 +541,7 @@ static U64 updateChecksum(U64 sum, U32 len, BYTE toRemove, BYTE toAdd) { // TODO: deduplicate. 
static const U64 prime8bytes = 11400714785074694791ULL; -// static const U64 prime8bytes = 5; + sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -696,11 +696,12 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, void LDM_outputConfiguration(void) { printf("=====================\n"); printf("Configuration\n"); - printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); - printf("Min match, hash length: %d, %d\n", + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); printf("LDM_LAG %d\n", LDM_LAG); printf("=====================\n"); } @@ -762,7 +763,7 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * forwardMatchLength contains the forward length of the match. */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *forwardMatchLength, U32 *backwardMatchLength) { + U64 *forwardMatchLength, U64 *backwardMatchLength) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; @@ -800,10 +801,10 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. 
*/ if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; + U64 len = (U64)literalLength - RUN_MASK; *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; @@ -819,9 +820,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. */ @@ -833,7 +834,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, /* Encode the match length. */ if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; + U64 matchLengthRemaining = matchLength; *pToken += ML_MASK; matchLengthRemaining -= ML_MASK; MEM_write32(cctx->op, 0xFFFFFFFF); @@ -858,8 +859,8 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; - U32 forwardMatchLength = 0; - U32 backwardsMatchLength = 0; + U64 forwardMatchLength = 0; + U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); @@ -892,9 +893,9 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - const U32 literalLength = cctx.ip - cctx.anchor; + const U64 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + + const U64 matchLength = forwardMatchLength + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; @@ -934,7 +935,7 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Encode the last literals (no more matches). 
*/ { - const U32 lastRun = cctx.iend - cctx.anchor; + const U64 lastRun = cctx.iend - cctx.anchor; BYTE *pToken = cctx.op++; LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); } @@ -979,6 +980,7 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, size_t LDM_decompress(const void *src, size_t compressedSize, void *dst, size_t maxDecompressedSize) { + LDM_DCtx dctx; LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 9d5ba0e2..d59f401e 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -25,8 +25,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U32 *matchLength, - U32 *backwardsMatchLength); + U64 *matchLength, + U64 *backwardsMatchLength); hash_t HASH_hashU32(U32 value); diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_with_table.c index c727616a..babfdf3f 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_with_table.c @@ -719,10 +719,10 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U32 literalLength) { + LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { /* Encode the literal length. */ if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; + U64 len = (U64)literalLength - RUN_MASK; *pToken = (RUN_MASK << ML_BITS); for (; len >= 255; len -= 255) { *(cctx->op)++ = 255; @@ -738,9 +738,9 @@ void LDM_encodeLiteralLengthAndLiterals( } void LDM_outputBlock(LDM_CCtx *cctx, - const U32 literalLength, + const U64 literalLength, const U32 offset, - const U32 matchLength) { + const U64 matchLength) { BYTE *pToken = cctx->op++; /* Encode the literal length and literals. 
*/ @@ -811,9 +811,9 @@ size_t LDM_compress(const void *src, size_t srcSize, * length) and update pointers and hashes. */ { - const U32 literalLength = cctx.ip - cctx.anchor; + const U64 literalLength = cctx.ip - cctx.anchor; const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + + const U64 matchLength = forwardMatchLength + backwardsMatchLength - LDM_MIN_MATCH_LENGTH; diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 3582d5a2..b6788c67 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -94,8 +94,8 @@ static int compress(const char *fname, const char *oname) { // Truncate file to compressedSize. ftruncate(fdout, compressedSize); - printf("%25s : %6u -> %7u - %s (%.1f%%)\n", fname, - (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, + printf("%25s : %10lu -> %10lu - %s (%.1f%%)\n", fname, + (size_t)statbuf.st_size, (size_t)compressedSize, oname, (double)compressedSize / (statbuf.st_size) * 100); timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + @@ -164,7 +164,7 @@ static int decompress(const char *fname, const char *oname) { src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressedSize); printf("Ret size out: %zu\n", outSize); - ftruncate(fdout, outSize); +// ftruncate(fdout, decompressedSize); close(fdin); close(fdout); @@ -231,7 +231,6 @@ int main(int argc, const char *argv[]) { printf("ldm = [%s]\n", ldmFilename); printf("dec = [%s]\n", decFilename); - /* Compress */ { if (compress(inpFilename, ldmFilename)) { From 1a188fe864c67673621b0b20a9911e3d23ef935a Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Fri, 21 Jul 2017 10:44:39 -0700 Subject: [PATCH 49/62] Fix overflow bug when calculating hash --- contrib/long_distance_matching/Makefile | 4 +- contrib/long_distance_matching/ldm.c | 4 +- contrib/long_distance_matching/ldm.h | 11 +- contrib/long_distance_matching/ldm_64_hash.c | 172 
++++--------------- contrib/long_distance_matching/main-ldm.c | 5 +- 5 files changed, 46 insertions(+), 150 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 9dc33fae..16844297 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated main-64 +all: main-circular-buffer main-integrated main-64 #main-basic : basic_table.c ldm.c main-ldm.c # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -41,6 +41,6 @@ main-integrated: ldm_with_table.c main-ldm.c clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-integrated main-64 + main-circular-buffer main-integrated main-64 @echo Cleaning completed diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index b018c475..bfaff1f5 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -520,8 +520,8 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; - U32 forwardMatchLength = 0; - U32 backwardsMatchLength = 0; + U64 forwardMatchLength = 0; + U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_outputConfiguration(); diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 83cd3623..e2f78697 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -14,7 +14,7 @@ // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? #define LDM_MEMORY_USAGE 22 -#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. 
#define LDM_LAG 0 @@ -26,7 +26,8 @@ #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_LENGTH 64 - +#define TMP_EVICTION +#define TMP_TAG_INSERT typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; @@ -99,12 +100,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats); */ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); -/** - * Counts the number of bytes that match from pIn and pMatch, - * up to pInLimit. - */ -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit); /** * Encode the literal length followed by the literals. diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index a72c283f..7a813534 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -89,21 +89,16 @@ struct LDM_CCtx { const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ - U32 lastSum; + U64 lastSum; const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextPosHashed; U64 nextSum; -// hash_t nextHash; /* Hash corresponding to nextPosHashed */ -// U32 nextSum; - unsigned step; // ip step, should be 1. const BYTE *lagIp; U64 lagSum; -// hash_t lagHash; -// U32 lagSum; U64 numHashInserts; // DEBUG @@ -273,15 +268,15 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; - U32 bestMatchLength = 0; + U64 bestMatchLength = 0; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + cctx->ibase; // Check checksum for faster check. 
if (cur->checksum == checksum && cctx->ip - pMatch <= LDM_WINDOW_SIZE) { - U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); - U32 backwardMatchLength, totalMatchLength; + U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); + U64 backwardMatchLength, totalMatchLength; // For speed. if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { @@ -313,6 +308,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, return NULL; } +#ifdef TMP_EVICTION + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; @@ -320,6 +317,17 @@ void HASH_insert(LDM_hashTable *table, table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } +#else + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} +#endif // TMP_EVICTION + + U32 HASH_getSize(const LDM_hashTable *table) { return table->numBuckets; } @@ -349,7 +357,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { // TODO: This can be done more efficiently (but it is not that important as it // is only used for computing stats). -static int intLog2(U32 x) { +static int intLog2(U64 x) { int ret = 0; while (x >>= 1) { ret++; @@ -357,32 +365,6 @@ static int intLog2(U32 x) { return ret; } -// Maybe we would eventually prefer to have linear rather than -// exponential buckets. 
-/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -436,22 +418,6 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { return 1; } -#if 0 -hash_t HASH_hashU32(U32 value) { - return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); -} -#endif - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -#if 0 -static hash_t checksumToHash(U32 sum) { - return HASH_hashU32(sum); -} -#endif - // Upper LDM_HASH_LOG bits. static hash_t checksumToHash(U64 sum) { return sum >> (64 - LDM_HASHLOG); @@ -462,69 +428,19 @@ static U32 checksumFromHfHash(U64 hfHash) { return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -#if 0 -/** - * Computes a checksum based on rsync's checksum. 
- * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static U32 getChecksum(const BYTE *buf, U32 len) { - U32 i; - U32 s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} -#endif - static U64 getChecksum(const BYTE *buf, U32 len) { static const U64 prime8bytes = 11400714785074694791ULL; -// static const U64 prime8bytes = 5; U64 ret = 0; U32 i; for (i = 0; i < len; i++) { ret *= prime8bytes; ret += buf[i] + CHECKSUM_CHAR_OFFSET; -// printf("HERE %llu\n", ret); } return ret; } -#if 0 -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. - */ -static U32 updateChecksum(U32 sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} -#endif - static U64 ipow(U64 base, U64 exp) { U64 ret = 1; while (exp) { @@ -542,6 +458,8 @@ static U64 updateChecksum(U64 sum, U32 len, // TODO: deduplicate. static const U64 prime8bytes = 11400714785074694791ULL; + // TODO: relying on compiler optimization here. + // The exponential can be calculated explicitly. 
sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -558,7 +476,7 @@ static U64 updateChecksum(U64 sum, U32 len, */ static void setNextHash(LDM_CCtx *cctx) { #ifdef RUN_CHECKS - U32 check; + U64 check; if ((cctx->nextIp - cctx->ibase != 1) && (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, @@ -568,16 +486,11 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#if 0 - cctx->nextHash = checksumToHash(cctx->nextSum); -#endif - #if LDM_LAG // printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); @@ -586,9 +499,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lagSum, LDM_HASH_LENGTH, cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); cctx->lagIp++; -#if 0 - cctx->lagHash = checksumToHash(cctx->lagSum); -#endif } #endif @@ -596,7 +506,7 @@ static void setNextHash(LDM_CCtx *cctx) { check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); + printf("CHECK: setNextHash failed %llu %llu\n", check, cctx->nextSum); } if ((cctx->nextIp - cctx->lastPosHashed) != 1) { @@ -610,8 +520,6 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
- U32 hash = checksumToHash(hfHash); - U32 sum = checksumFromHfHash(hfHash); // printf("TMP %u %u %llu\n", hash, sum, hfHash); if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { @@ -619,22 +527,26 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { #if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; - HASH_insert(cctx->hashTable, cctx->lagHash, entry); + U32 hash = checksumToHash(cctx->lagSum); + U32 sum = checksumFromHfHash(cctx->lagSum); + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum }; + HASH_insert(cctx->hashTable, hash, entry); } else { + U32 hash = checksumToHash(hfHash); + U32 sum = checksumFromHfHash(hfHash); + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); } #else + U32 hash = checksumToHash(hfHash); + U32 sum = checksumFromHfHash(hfHash); const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); #endif } cctx->lastPosHashed = cctx->ip; -#if 0 - cctx->lastHash = hash; -#endif cctx->lastSum = hfHash; } @@ -650,9 +562,6 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { printf("CHECK failed: updateLastHashFromNextHash %zu\n", cctx->ip - cctx->ibase); } -#endif -#if 0 - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); #endif putHashOfCurrentPositionFromHash(cctx, cctx->nextSum); } @@ -661,10 +570,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { * Insert hash of the current position into the hash table. 
*/ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); -#if 0 - hash_t hash = checksumToHash(sum); -#endif + U64 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); #ifdef RUN_CHECKS if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { @@ -672,13 +578,11 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { cctx->ip - cctx->ibase); } #endif -#if 0 - putHashOfCurrentPositionFromHash(cctx, hash, sum); -#endif + putHashOfCurrentPositionFromHash(cctx, sum); } -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, +U64 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; while (pIn < pInLimit - 1) { @@ -688,9 +592,9 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, pMatch++; continue; } - return (U32)(pIn - pStart); + return (U64)(pIn - pStart); } - return (U32)(pIn - pStart); + return (U64)(pIn - pStart); } void LDM_outputConfiguration(void) { @@ -773,10 +677,6 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U64 hash; U32 sum; setNextHash(cctx); -#if 0 - h = cctx->nextHash; - sum = cctx->nextSum; -#endif hash = cctx->nextSum; h = checksumToHash(hash); sum = checksumFromHfHash(hash); diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index b6788c67..9769f10e 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -94,9 +94,10 @@ static int compress(const char *fname, const char *oname) { // Truncate file to compressedSize. 
ftruncate(fdout, compressedSize); - printf("%25s : %10lu -> %10lu - %s (%.1f%%)\n", fname, + printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, (size_t)statbuf.st_size, (size_t)compressedSize, oname, - (double)compressedSize / (statbuf.st_size) * 100); + (statbuf.st_size) / (double)compressedSize, + (double)compressedSize / (double)(statbuf.st_size) * 100.0); timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec), From eb16da647d38df0f2180c9c335ddadc559624e85 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 10:18:58 -0700 Subject: [PATCH 50/62] Minor clean up --- contrib/long_distance_matching/ldm.h | 13 +- contrib/long_distance_matching/ldm_64_hash.c | 199 +++++++++++++++++-- 2 files changed, 188 insertions(+), 24 deletions(-) diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index e2f78697..840824c4 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -13,21 +13,26 @@ // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 22 -#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now +#define LDM_MEMORY_USAGE 24 +#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 -#define LDM_WINDOW_SIZE_LOG 28 +#define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same value?). #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_LENGTH 64 -#define TMP_EVICTION +// Experimental. 
+//:w +//#define TMP_EVICTION #define TMP_TAG_INSERT +//#define TMP_SIMPLE_LOWER +//#define TMP_FORCE_HASH_ONLY + typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index 7a813534..bdbdd199 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -12,7 +12,11 @@ #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) +#ifdef TMP_FORCE_HASH_ONLY + #define HASH_ONLY_EVERY_LOG 7 +#else + #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) +#endif #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) /* Hash table stuff. */ @@ -26,12 +30,15 @@ #define COMPUTE_STATS #define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 10 +#define CHECKSUM_CHAR_OFFSET 1 // Take first match only. //#define ZSTD_SKIP //#define RUN_CHECKS +// +// +static const U64 prime8bytes = 11400714785074694791ULL; /* Hash table stuff */ @@ -56,6 +63,14 @@ struct LDM_compressStats { U32 numHashInserts; U32 offsetHistogram[32]; + + U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; + U64 TMP_totalHashCount; + + U64 TMP_totalInWindow; + U64 TMP_totalInserts; + + U64 TMP_matchCount; }; typedef struct LDM_hashTable LDM_hashTable; @@ -311,10 +326,80 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, #ifdef TMP_EVICTION void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { + const hash_t hash, const LDM_hashEntry entry, + LDM_CCtx *cctx) { + // Overwrite based on part of checksum. 
+ /* + LDM_hashEntry *toOverwrite = + getBucket(table, hash) + table->bucketOffsets[hash]; + const BYTE *pMatch = toOverwrite->offset + cctx->ibase; + if (toOverwrite->offset != 0 && + cctx->ip - pMatch <= LDM_WINDOW_SIZE) { + cctx->stats.TMP_totalInWindow++; + } + + cctx->stats.TMP_totalInserts++; + *(toOverwrite) = entry; + */ + + /* + int i; + LDM_hashEntry *bucket = getBucket(table, hash); + for (i = 0; i < HASH_BUCKET_SIZE; i++) { + if (bucket[i].checksum == entry.checksum) { + bucket[i] = entry; + cctx->stats.TMP_matchCount++; + return; + } + } + */ + + // Find entry beyond window size, replace. Else, random. + int i; + LDM_hashEntry *bucket = getBucket(table, hash); + for (i = 0; i < HASH_BUCKET_SIZE; i++) { + if (cctx->ip - cctx->ibase - bucket[i].offset > LDM_WINDOW_SIZE) { + bucket[i] = entry; + return; + } + } + + i = rand() & (HASH_BUCKET_SIZE - 1); + *(bucket + i) = entry; + + + /** + * Sliding buffer style pointer + * Keep old entry as temporary. If the old entry is outside the window, + * overwrite and we are done. + * + * Backwards (insert at x): + * x, a, b b, c c c c, d d d d d d d d + * x, d d d d d d d d, c c c c, b b, a + * + * Else, find something to evict. + * If old entry has more ones, it takes + * the next spot. <-- reversed order? + * + * If window size > LDM_WINDOW_SIZE, + * overwrite, + * + * Insert forwards. If > tag, keep. Else evict. + * + * + * + * + */ + + + /* *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; + */ + +// U16 mask = entry.checksum & (HASH_BUCKET_SIZE - 1); +// *(getBucket(table, hash) + mask) = entry; } #else @@ -348,8 +433,9 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { } } - printf("Num buckets, bucket size: %d, %d\n", - table->numBuckets, HASH_BUCKET_SIZE); + // TODO: repeat numBuckets as a check for now. 
+ printf("Num buckets, bucket size: %d (2^%d), %d\n", + table->numBuckets, LDM_HASHLOG, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", table->numEntries, ctr, 100.0 * (double)(ctr) / table->numEntries); @@ -396,6 +482,24 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); +#ifdef TMP_TAG_INSERT +/* + printf("Lower bit distribution\n"); + for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) { + printf("%5d %5llu %6.3f\n", i, stats->TMP_hashCount[i], + 100.0 * (double) stats->TMP_hashCount[i] / + (double) stats->TMP_totalHashCount); + } +*/ +#endif + +#ifdef TMP_EVICTION + printf("Evicted something in window: %llu %6.3f\n", + stats->TMP_totalInWindow, + 100.0 * (double)stats->TMP_totalInWindow / + (double)stats->TMP_totalInserts); + printf("Match count: %llu\n", stats->TMP_matchCount); +#endif printf("=====================\n"); } @@ -418,7 +522,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { return 1; } -// Upper LDM_HASH_LOG bits. +// Upper LDM_HASHLOG bits. static hash_t checksumToHash(U64 sum) { return sum >> (64 - LDM_HASHLOG); } @@ -428,9 +532,30 @@ static U32 checksumFromHfHash(U64 hfHash) { return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -static U64 getChecksum(const BYTE *buf, U32 len) { - static const U64 prime8bytes = 11400714785074694791ULL; +#ifdef TMP_TAG_INSERT +static U32 lowerBitsFromHfHash(U64 hfHash) { + // The number of bits used so far is LDM_HASHLOG + 32. + // So there are 32 - LDM_HASHLOG bits left. + // Occasional hashing requires HASH_ONLY_EVERY_LOG bits. + // So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits + // allowing for reuse of bits. +#ifdef TMP_SIMPLE_LOWER + return hfHash & HASH_ONLY_EVERY; +#else + if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) { + return hfHash & HASH_ONLY_EVERY; + } else { + // Otherwise shift by (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG) bits first. 
+ return (hfHash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) & + HASH_ONLY_EVERY; + } +#endif +} +#endif + + +static U64 getChecksum(const BYTE *buf, U32 len) { U64 ret = 0; U32 i; for (i = 0; i < len; i++) { @@ -455,11 +580,8 @@ static U64 ipow(U64 base, U64 exp) { static U64 updateChecksum(U64 sum, U32 len, BYTE toRemove, BYTE toAdd) { - // TODO: deduplicate. - static const U64 prime8bytes = 11400714785074694791ULL; - // TODO: relying on compiler optimization here. - // The exponential can be calculated explicitly. + // The exponential can (should?) be calculated explicitly. sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -492,6 +614,14 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; +#ifdef TMP_TAG_INSERT + { + U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextSum); + cctx->stats.TMP_totalHashCount++; + cctx->stats.TMP_hashCount[hashEveryMask]++; + } +#endif + #if LDM_LAG // printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); if (cctx->ip - cctx->ibase > LDM_LAG) { @@ -520,31 +650,48 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. -// printf("TMP %u %u %llu\n", hash, sum, hfHash); - - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - #if LDM_LAG - // TODO: off by 1, but whatever + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + // TODO: off by 1, but whatever. 
if (cctx->lagIp - cctx->ibase > 0) { U32 hash = checksumToHash(cctx->lagSum); U32 sum = checksumFromHfHash(cctx->lagSum); const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum }; +#ifdef TMP_EVICTION + HASH_insert(cctx->hashTable, hash, entry, cctx); +#else HASH_insert(cctx->hashTable, hash, entry); +#endif } else { U32 hash = checksumToHash(hfHash); U32 sum = checksumFromHfHash(hfHash); const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; - HASH_insert(cctx->hashTable, hash, entry); - } +#ifdef TMP_EVICTION + HASH_insert(cctx->hashTable, hash, entry, cctx); #else + HASH_insert(cctx->hashTable, hash, entry); +#endif + } + } +#else +#ifdef TMP_TAG_INSERT + U32 hashEveryMask = lowerBitsFromHfHash(hfHash); + // TODO: look at stats. + if (hashEveryMask == HASH_ONLY_EVERY) { +#else + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { +#endif U32 hash = checksumToHash(hfHash); U32 sum = checksumFromHfHash(hfHash); const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; +#ifdef TMP_EVICTION + HASH_insert(cctx->hashTable, hash, entry, cctx); +#else HASH_insert(cctx->hashTable, hash, entry); #endif } +#endif cctx->lastPosHashed = cctx->ip; cctx->lastSum = hfHash; @@ -676,10 +823,16 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, hash_t h; U64 hash; U32 sum; +#ifdef TMP_TAG_INSERT + U32 hashEveryMask; +#endif setNextHash(cctx); hash = cctx->nextSum; h = checksumToHash(hash); sum = checksumFromHfHash(hash); +#ifdef TMP_TAG_INSERT + hashEveryMask = lowerBitsFromHfHash(hash); +#endif cctx->ip = cctx->nextIp; cctx->nextIp += cctx->step; @@ -687,9 +840,15 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (cctx->ip > cctx->imatchLimit) { return 1; } - +#ifdef TMP_TAG_INSERT + if (hashEveryMask == HASH_ONLY_EVERY) { + entry = HASH_getBestEntry(cctx, h, sum, + forwardMatchLength, backwardMatchLength); + } +#else entry = HASH_getBestEntry(cctx, h, sum, forwardMatchLength, backwardMatchLength); +#endif if 
(entry != NULL) { *match = entry->offset + cctx->ibase; From 8ed92201024e889333a9ac89dd10b188d28d8647 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 12:05:43 -0700 Subject: [PATCH 51/62] Experiment with eviction policies and minor code cleanup --- contrib/long_distance_matching/Makefile | 7 +- .../circular_buffer_table.c | 43 +++++++----- contrib/long_distance_matching/ldm.c | 51 ++++----------- contrib/long_distance_matching/ldm.h | 44 ++++++------- contrib/long_distance_matching/ldm_64_hash.c | 32 ++++----- .../long_distance_matching/ldm_hashtable.h | 49 ++++++++++++-- .../{ldm_with_table.c => ldm_integrated.c} | 65 +++---------------- contrib/long_distance_matching/main-ldm.c | 17 ++--- 8 files changed, 139 insertions(+), 169 deletions(-) rename contrib/long_distance_matching/{ldm_with_table.c => ldm_integrated.c} (94%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 16844297..c8129f67 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,20 +27,17 @@ default: all all: main-circular-buffer main-integrated main-64 -#main-basic : basic_table.c ldm.c main-ldm.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-64: ldm_64_hash.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-integrated: ldm_with_table.c main-ldm.c +main-integrated: ldm_integrated.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-circular-buffer main-integrated main-64 + main-circular-buffer main-64 main-integrated @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index fb6c19d2..92ffc55b 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ 
b/contrib/long_distance_matching/circular_buffer_table.c @@ -5,26 +5,24 @@ #include "ldm_hashtable.h" #include "mem.h" -// Number of elements per hash bucket. -// HASH_BUCKET_SIZE_LOG defined in ldm.h +// THe number of elements per hash bucket. +// HASH_BUCKET_SIZE_LOG is defined in ldm.h. #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) + +// The number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) - - -// TODO: rename. Number of hash buckets. -// TODO: Link to HASH_ENTRY_SIZE_LOG - +// If ZSTD_SKIP is defined, then the first entry is returned in HASH_getBestEntry +// (without looking at other entries in the bucket). //#define ZSTD_SKIP struct LDM_hashTable { - U32 numBuckets; - U32 numEntries; + U32 numBuckets; // The number of buckets. + U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. LDM_hashEntry *entries; - BYTE *bucketOffsets; // Pointer to current insert position. + BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. - // Position corresponding to offset=0 in LDM_hashEntry. - const BYTE *offsetBase; + const BYTE *offsetBase; // Corresponds to offset=0 in LDM_hashEntry. U32 minMatchLength; U32 maxWindowSize; }; @@ -46,6 +44,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } +// From lib/compress/zstd_compress.c static unsigned ZSTD_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { @@ -114,7 +113,11 @@ static unsigned ZSTD_NbCommonBytes (register size_t val) } } } -// From lib/compress/zstd_compress.c +/** + * From lib/compress/zstd_compress.c + * Returns the number of bytes (consecutively) in common between pIn and pMatch + * up to pInLimit. 
+ */ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *const pInLimit) { const BYTE * const pStart = pIn; @@ -147,9 +150,14 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, return (size_t)(pIn - pStart); } -U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { - U32 matchLength = 0; +/** + * Returns the number of bytes in common between pIn and pMatch, + * counting backwards, with pIn having a lower limit of pAnchor and + * pMatch having a lower limit of pBase. + */ +static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { + size_t matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -178,6 +186,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U64 backwardMatchLength, totalMatchLength; + // Only take matches where the forwardMatchLength is large enough + // for speed. if (forwardMatchLength < table->minMatchLength) { continue; } @@ -212,6 +222,7 @@ hash_t HASH_hashU32(U32 value) { void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { + // Circular buffer. 
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index bfaff1f5..a5594ff6 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -29,7 +29,6 @@ typedef U32 checksum_t; -// TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; @@ -40,9 +39,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; }; @@ -80,15 +76,12 @@ struct LDM_CCtx { hash_t nextHash; /* Hash corresponding to nextPosHashed */ checksum_t nextSum; - - unsigned step; // ip step, should be 1. const BYTE *lagIp; hash_t lagHash; checksum_t lagSum; - U64 numHashInserts; // DEBUG const BYTE *DEBUG_setNextHash; }; @@ -103,32 +96,6 @@ static int intLog2(U32 x) { return ret; } -// TODO: Maybe we would eventually prefer to have linear rather than -// exponential buckets. 
-/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -163,7 +130,8 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); } -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { +/* +static int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; @@ -181,6 +149,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { } return 1; } +*/ /** * Convert a sum computed from getChecksum to a hash value in the range @@ -253,7 +222,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], @@ -292,7 +260,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { #if LDM_LAG - // TODO: off by 1, but whatever + // Off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; HASH_insert(cctx->hashTable, cctx->lagHash, entry); @@ -344,6 +312,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } +/* U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; @@ -358,6 +327,7 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, } return (U32)(pIn - pStart); } +*/ void LDM_outputConfiguration(void) { printf("=====================\n"); @@ -380,6 +350,12 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -592,8 +568,6 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // HASH_outputTableOffsetHistogram(&cctx); - /* Encode the last literals (no more matches). 
*/ { const U32 lastRun = cctx.iend - cctx.anchor; @@ -692,7 +666,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize, return dctx.op - (BYTE *)dst; } -// TODO: implement and test hash function void LDM_test(const BYTE *src) { (void)src; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 840824c4..adbe35bf 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -1,20 +1,24 @@ #ifndef LDM_H #define LDM_H -#include /* size_t */ - #include "mem.h" // from /lib/common/mem.h -#define LDM_COMPRESS_SIZE 8 -#define LDM_DECOMPRESS_SIZE 8 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +// The number of bytes storing the compressed and decompressed size +// in the header. +#define LDM_COMPRESSED_SIZE 8 +#define LDM_DECOMPRESSED_SIZE 8 +#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE)) + +// THe number of bytes storing the offset. #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 24 -#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now +#define LDM_MEMORY_USAGE 23 + +// The number of entries in a hash bucket. +#define HASH_BUCKET_SIZE_LOG 0 // The maximum is 4 for now. // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 @@ -23,11 +27,10 @@ #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same value?). -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 +#define LDM_MIN_MATCH_LENGTH 16 +#define LDM_HASH_LENGTH 16 // Experimental. -//:w //#define TMP_EVICTION #define TMP_TAG_INSERT //#define TMP_SIMPLE_LOWER @@ -37,7 +40,6 @@ typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; - /** * Compresses src into dst. 
* @@ -94,17 +96,6 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); * Outputs compression statistics to stdout. */ void LDM_printCompressStats(const LDM_compressStats *stats); -/** - * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the - * LDM_MIN_MATCH_LENGTH bytes from match and also if - * pIn - pMatch <= LDM_WINDOW_SIZE. - * - * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. - * - * Return 1 if valid, 0 otherwise. - */ -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); - /** * Encode the literal length followed by the literals. @@ -150,6 +141,15 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize); +/** + * Write the compressed and decompressed size. + */ +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize); + +/** + * Output the configuration used. + */ void LDM_outputConfiguration(void); void LDM_test(const BYTE *src); diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index bdbdd199..d0080efd 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -36,8 +36,7 @@ //#define ZSTD_SKIP //#define RUN_CHECKS -// -// + static const U64 prime8bytes = 11400714785074694791ULL; /* Hash table stuff */ @@ -49,7 +48,6 @@ typedef struct LDM_hashEntry { U32 checksum; } LDM_hashEntry; -// TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; U32 numMatches; @@ -59,9 +57,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; @@ -115,20 +110,19 @@ struct LDM_CCtx { const BYTE *lagIp; U64 lagSum; - U64 numHashInserts; // DEBUG const BYTE *DEBUG_setNextHash; }; struct LDM_hashTable { - U32 numBuckets; // Number of buckets - U32 numEntries; // Rename... 
- LDM_hashEntry *entries; + U32 numBuckets; // The number of buckets. + U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. - BYTE *bucketOffsets; - // Position corresponding to offset=0 in LDM_hashEntry. + LDM_hashEntry *entries; + BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. }; + /** * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. @@ -251,9 +245,9 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, * * We count only bytes where pMatch > pBaes and pIn > pAnchor. */ -U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, +size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { - U64 matchLength = 0; + size_T matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -293,7 +287,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); U64 backwardMatchLength, totalMatchLength; - // For speed. + // Only take matches where the forward match length is large enough + // for speed. if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { continue; } @@ -766,6 +761,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index d59f401e..6093197d 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -1,37 +1,73 @@ +/** + * A "hash" table used in LDM compression. 
+ * + * This is not exactly a hash table in the sense that inserted entries + * are not guaranteed to remain in the hash table. + */ + #ifndef LDM_HASHTABLE_H #define LDM_HASHTABLE_H #include "mem.h" +// The log size of LDM_hashEntry in bytes. #define LDM_HASH_ENTRY_SIZE_LOG 3 -// TODO: clean up comments - typedef U32 hash_t; typedef struct LDM_hashEntry { - U32 offset; // TODO: Replace with pointer? - U32 checksum; + U32 offset; // Represents the offset of the entry from offsetBase. + U32 checksum; // A checksum to select entries with the same hash value. } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; +/** + * Create a table that can contain size elements. This does not necessarily + * correspond to the number of hash buckets. The number of hash buckets + * is size / (1 << HASH_BUCKET_SIZE_LOG) + * + * minMatchLength is the minimum match length required in HASH_getBestEntry. + * + * maxWindowSize is the maximum distance from pIn in HASH_getBestEntry. + * The window is defined to be (pIn - offsetBase - offset). + */ LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, U32 minMatchLength, U32 maxWindowSize); +/** + * Return the "best" entry from the table with the same hash and checksum. + * + * pIn: a pointer to the current input position. + * pEnd: a pointer to the maximum input position. + * pAnchor: a pointer to the minimum input position. + * + * This function computes the forward and backward match length from pIn + * and writes it to forwardMatchLength and backwardsMatchLength. + * + * E.g. for the two strings "aaabbbb" "aaabbbb" with pIn and the + * entry pointing at the first "b", the forward match length would be + * four (representing the "b" matches) and the backward match length would + * three (representing the "a" matches before the pointer). 
+ */ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U64 *matchLength, + U64 *forwardMatchLength, U64 *backwardsMatchLength); +/** + * Return a hash of the value. + */ hash_t HASH_hashU32(U32 value); /** * Insert an LDM_hashEntry into the bucket corresponding to hash. + * + * An entry may be evicted in the process. */ void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry); @@ -41,6 +77,9 @@ void HASH_insert(LDM_hashTable *table, const hash_t hash, */ U32 HASH_getSize(const LDM_hashTable *table); +/** + * Destroy the table. + */ void HASH_destroyTable(LDM_hashTable *table); /** diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_integrated.c similarity index 94% rename from contrib/long_distance_matching/ldm_with_table.c rename to contrib/long_distance_matching/ldm_integrated.c index babfdf3f..7733d4e9 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_integrated.c @@ -33,8 +33,6 @@ //#define RUN_CHECKS -/* Hash table stuff */ - typedef U32 hash_t; typedef struct LDM_hashEntry { @@ -42,7 +40,6 @@ typedef struct LDM_hashEntry { U32 checksum; } LDM_hashEntry; -// TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; U32 numMatches; @@ -52,9 +49,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; }; @@ -85,8 +79,6 @@ struct LDM_CCtx { LDM_hashTable *hashTable; -// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ U32 lastSum; @@ -109,11 +101,10 @@ struct LDM_CCtx { struct LDM_hashTable { U32 numBuckets; // Number of buckets - U32 numEntries; // Rename... 
+ U32 numEntries; LDM_hashEntry *entries; BYTE *bucketOffsets; - // Position corresponding to offset=0 in LDM_hashEntry. }; /** @@ -354,32 +345,6 @@ static int intLog2(U32 x) { return ret; } -// Maybe we would eventually prefer to have linear rather than -// exponential buckets. -/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -508,7 +473,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], @@ -517,7 +481,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextHash = checksumToHash(cctx->nextSum); #if LDM_LAG -// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, @@ -547,10 +510,6 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - /** - const LDM_hashEntry entry = { cctx->ip - cctx->ibase , - MEM_read32(cctx->ip) }; - */ #if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { @@ -604,21 +563,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = (*pMatch) ^ *(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (U32)(pIn - pStart); - } - return (U32)(pIn - pStart); -} - void LDM_outputConfiguration(void) { printf("=====================\n"); printf("Configuration\n"); @@ -640,6 +584,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 9769f10e..232c14a2 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -12,13 +12,12 @@ #include "ldm.h" #include "zstd.h" -#define DEBUG //#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * - * TODO: This might seg fault if the compressed size is > the decompress + * This might seg fault if the compressed size is > the decompress * size due to the mmapping and output file size allocated to be the input size * The compress function should check before writing or buffer writes. 
*/ @@ -31,6 +30,7 @@ static int compress(const char *fname, const char *oname) { struct timeval tv1, tv2; double timeTaken; + /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { perror("Error in file opening"); @@ -50,6 +50,7 @@ static int compress(const char *fname, const char *oname) { } maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); + // Handle case where compressed size is > decompressed size. // The compress function should check before writing or buffer writes. maxCompressedSize += statbuf.st_size / 255; @@ -79,21 +80,17 @@ static int compress(const char *fname, const char *oname) { compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, maxCompressedSize); + gettimeofday(&tv2, NULL); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressedSize, 8); - memcpy(dst + 8, &(statbuf.st_size), 8); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressedSize); - printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif + LDM_writeHeader(dst, compressedSize, statbuf.st_size); // Truncate file to compressedSize. 
ftruncate(fdout, compressedSize); + printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, (size_t)statbuf.st_size, (size_t)compressedSize, oname, (statbuf.st_size) / (double)compressedSize, @@ -102,7 +99,7 @@ static int compress(const char *fname, const char *oname) { timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec), - printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", + printf("Total compress time = %.3f seconds, Average scanning speed: %.3f MB/s\n", timeTaken, ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken); From 6eefa3291195e3fc7f592484125ff3bc44ba3a50 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 12:40:59 -0700 Subject: [PATCH 52/62] Deduplicate code --- contrib/long_distance_matching/Makefile | 6 +- contrib/long_distance_matching/ldm.c | 156 -------------- contrib/long_distance_matching/ldm.h | 16 +- contrib/long_distance_matching/ldm_64_hash.c | 198 ++---------------- contrib/long_distance_matching/ldm_common.c | 113 ++++++++++ .../long_distance_matching/ldm_integrated.c | 116 ---------- 6 files changed, 153 insertions(+), 452 deletions(-) create mode 100644 contrib/long_distance_matching/ldm_common.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index c8129f67..e1c31112 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,13 +27,13 @@ default: all all: main-circular-buffer main-integrated main-64 -main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c +main-circular-buffer: ldm_common.c circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-64: ldm_64_hash.c main-ldm.c +main-64: ldm_common.c ldm_64_hash.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-integrated: ldm_integrated.c main-ldm.c +main-integrated: ldm_common.c ldm_integrated.c main-ldm.c $(CC) $(CPPFLAGS) 
$(CFLAGS) $^ $(LDFLAGS) -o $@ clean: diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index a5594ff6..9d3eda32 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -8,28 +8,16 @@ #include "ldm_hashtable.h" #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U< LDM_WINDOW_SIZE) { - return 0; - } - - for (; lengthLeft >= 4; lengthLeft -= 4) { - if (MEM_read32(curIn) != MEM_read32(curMatch)) { - return 0; - } - curIn += 4; - curMatch += 4; - } - return 1; -} -*/ - /** * Convert a sum computed from getChecksum to a hash value in the range * of the hash table. 
@@ -312,50 +279,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } -/* -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = (*pMatch) ^ *(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (U32)(pIn - pStart); - } - return (U32)(pIn - pStart); -} -*/ - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); - printf("Min match, hash length: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - -void LDM_readHeader(const void *src, U64 *compressedSize, - U64 *decompressedSize) { - const BYTE *ip = (const BYTE *)src; - *compressedSize = MEM_readLE64(ip); - ip += sizeof(U64); - *decompressedSize = MEM_readLE64(ip); - // ip += sizeof(U64); -} - -void LDM_writeHeader(void *memPtr, U64 compressedSize, - U64 decompressedSize) { - MEM_write64(memPtr, compressedSize); - MEM_write64((BYTE *)memPtr + 8, decompressedSize); -} - void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -587,85 +510,6 @@ size_t LDM_compress(const void *src, size_t srcSize, } } -struct LDM_DCtx { - size_t compressedSize; - size_t maxDecompressedSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -}; - -void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - dctx->compressedSize = 
compressedSize; - dctx->maxDecompressedSize = maxDecompressedSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressedSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressedSize; -} - -size_t LDM_decompress(const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - const unsigned token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy the literals. */ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = MEM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += LDM_MIN_MATCH_LENGTH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now. - while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - void LDM_test(const BYTE *src) { (void)src; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index adbe35bf..3078fb8c 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -9,6 +9,11 @@ #define LDM_DECOMPRESSED_SIZE 8 #define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE)) +#define ML_BITS 4 +#define ML_MASK ((1U<> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. 
-#ifdef TMP_FORCE_HASH_ONLY - #define HASH_ONLY_EVERY_LOG 7 -#else - #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) -#endif -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - /* Hash table stuff. */ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define ML_BITS 4 -#define ML_MASK ((1U< pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -319,7 +304,6 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, } #ifdef TMP_EVICTION - void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry, LDM_CCtx *cctx) { @@ -381,9 +365,6 @@ void HASH_insert(LDM_hashTable *table, * * Insert forwards. If > tag, keep. Else evict. * - * - * - * */ @@ -428,7 +409,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { } } - // TODO: repeat numBuckets as a check for now. + // The number of buckets is repeated as a check for now. printf("Num buckets, bucket size: %d (2^%d), %d\n", table->numBuckets, LDM_HASHLOG, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", @@ -498,31 +479,16 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); } -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { - U32 lengthLeft = LDM_MIN_MATCH_LENGTH; - const BYTE *curIn = pIn; - const BYTE *curMatch = pMatch; - - if (pIn - pMatch > LDM_WINDOW_SIZE) { - return 0; - } - - for (; lengthLeft >= 4; lengthLeft -= 4) { - if (MEM_read32(curIn) != MEM_read32(curMatch)) { - return 0; - } - curIn += 4; - curMatch += 4; - } - return 1; -} - -// Upper LDM_HASHLOG bits. +/** + * Return the upper (most significant) LDM_HASHLOG bits. + */ static hash_t checksumToHash(U64 sum) { return sum >> (64 - LDM_HASHLOG); } -// 32 bits after LDM_HASH_LOG bits. +/** + * Return the 32 bits after the upper LDM_HASHLOG bits. 
+ */ static U32 checksumFromHfHash(U64 hfHash) { return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } @@ -534,9 +500,6 @@ static U32 lowerBitsFromHfHash(U64 hfHash) { // Occasional hashing requires HASH_ONLY_EVERY_LOG bits. // So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits // allowing for reuse of bits. -#ifdef TMP_SIMPLE_LOWER - return hfHash & HASH_ONLY_EVERY; -#else if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) { return hfHash & HASH_ONLY_EVERY; } else { @@ -544,12 +507,20 @@ static U32 lowerBitsFromHfHash(U64 hfHash) { return (hfHash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) & HASH_ONLY_EVERY; } -#endif } #endif - - +/** + * Get a 64-bit hash using the first len bytes from buf. + * + * Giving bytes s = s_1, s_2, ... s_k, the hash is defined to be + * H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0) + * + * where the constant a is defined to be prime8bytes. + * + * The implementation adds an offset to each byte, so + * H(s) = (s_1 + CHECKSUM_CHAR_OFFSET)*(a^(k-1)) + ... + */ static U64 getChecksum(const BYTE *buf, U32 len) { U64 ret = 0; U32 i; @@ -575,8 +546,8 @@ static U64 ipow(U64 base, U64 exp) { static U64 updateChecksum(U64 sum, U32 len, BYTE toRemove, BYTE toAdd) { - // TODO: relying on compiler optimization here. - // The exponential can (should?) be calculated explicitly. + // TODO: this relies on compiler optimization. + // The exponential can be calculated explicitly as len is constant. sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); sum *= prime8bytes; @@ -618,7 +589,6 @@ static void setNextHash(LDM_CCtx *cctx) { #endif #if LDM_LAG -// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, @@ -647,7 +617,7 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { // Note: this works only when cctx->step is 1. 
#if LDM_LAG if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - // TODO: off by 1, but whatever. + // TODO: Off by one, but not important. if (cctx->lagIp - cctx->ibase > 0) { U32 hash = checksumToHash(cctx->lagSum); U32 sum = checksumFromHfHash(cctx->lagSum); @@ -724,50 +694,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, sum); } -U64 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = (*pMatch) ^ *(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (U64)(pIn - pStart); - } - return (U64)(pIn - pStart); -} - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - -void LDM_readHeader(const void *src, U64 *compressedSize, - U64 *decompressedSize) { - const BYTE *ip = (const BYTE *)src; - *compressedSize = MEM_readLE64(ip); - ip += sizeof(U64); - *decompressedSize = MEM_readLE64(ip); - // ip += sizeof(U64); -} - -void LDM_writeHeader(void *memPtr, U64 compressedSize, - U64 decompressedSize) { - MEM_write64(memPtr, compressedSize); - MEM_write64((BYTE *)memPtr + 8, decompressedSize); -} - - void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -1013,86 +939,6 @@ size_t LDM_compress(const void *src, size_t srcSize, } } -struct LDM_DCtx { - size_t compressedSize; - size_t maxDecompressedSize; - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* 
Current input position */ - const BYTE *iend; /* End of source */ - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Current output position */ - const BYTE *oend; /* End of output */ -}; - -void LDM_initializeDCtx(LDM_DCtx *dctx, - const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - dctx->compressedSize = compressedSize; - dctx->maxDecompressedSize = maxDecompressedSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressedSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressedSize; -} - -size_t LDM_decompress(const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - const unsigned token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy the literals. */ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = MEM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += LDM_MIN_MATCH_LENGTH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now. 
- while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - // TODO: implement and test hash function void LDM_test(const BYTE *src) { const U32 diff = 100; diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c new file mode 100644 index 00000000..673959db --- /dev/null +++ b/contrib/long_distance_matching/ldm_common.c @@ -0,0 +1,113 @@ +#include + +#include "ldm.h" + +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + +void LDM_readHeader(const void *src, U64 *compressedSize, + U64 *decompressedSize) { + const BYTE *ip = (const BYTE *)src; + *compressedSize = MEM_readLE64(ip); + ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip); + // ip += sizeof(U64); +} + +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + +struct LDM_DCtx { + size_t compressedSize; + size_t maxDecompressedSize; + + const BYTE *ibase; /* Base of input */ + const BYTE *ip; /* Current input position */ + const BYTE *iend; /* End of source */ + + const BYTE *obase; /* Base of output */ + BYTE *op; /* Current output position */ + const BYTE *oend; /* End of output */ +}; + +void LDM_initializeDCtx(LDM_DCtx *dctx, + const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + dctx->compressedSize = compressedSize; + dctx->maxDecompressedSize = maxDecompressedSize; + + 
dctx->ibase = src; + dctx->ip = (const BYTE *)src; + dctx->iend = dctx->ip + dctx->compressedSize; + dctx->op = dst; + dctx->oend = dctx->op + dctx->maxDecompressedSize; +} + +size_t LDM_decompress(const void *src, size_t compressedSize, + void *dst, size_t maxDecompressedSize) { + + LDM_DCtx dctx; + LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); + + while (dctx.ip < dctx.iend) { + BYTE *cpy; + const BYTE *match; + size_t length, offset; + + /* Get the literal length. */ + const unsigned token = *(dctx.ip)++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + + /* Copy the literals. */ + cpy = dctx.op + length; + memcpy(dctx.op, dctx.ip, length); + dctx.ip += length; + dctx.op = cpy; + + //TODO : dynamic offset size + offset = MEM_read32(dctx.ip); + dctx.ip += LDM_OFFSET_SIZE; + match = dctx.op - offset; + + /* Get the match length. */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *(dctx.ip)++; + length += s; + } while (s == 255); + } + length += LDM_MIN_MATCH_LENGTH; + + /* Copy match. */ + cpy = dctx.op + length; + + // Inefficient for now. + while (match < cpy - offset && dctx.op < dctx.oend) { + *(dctx.op)++ = *match++; + } + } + return dctx.op - (BYTE *)dst; +} + + diff --git a/contrib/long_distance_matching/ldm_integrated.c b/contrib/long_distance_matching/ldm_integrated.c index 7733d4e9..d51c1e9d 100644 --- a/contrib/long_distance_matching/ldm_integrated.c +++ b/contrib/long_distance_matching/ldm_integrated.c @@ -11,19 +11,10 @@ #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - /* Hash table stuff. 
*/ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define ML_BITS 4 -#define ML_MASK ((1U<compressedSize = compressedSize; - dctx->maxDecompressedSize = maxDecompressedSize; - - dctx->ibase = src; - dctx->ip = (const BYTE *)src; - dctx->iend = dctx->ip + dctx->compressedSize; - dctx->op = dst; - dctx->oend = dctx->op + dctx->maxDecompressedSize; -} - -size_t LDM_decompress(const void *src, size_t compressedSize, - void *dst, size_t maxDecompressedSize) { - LDM_DCtx dctx; - LDM_initializeDCtx(&dctx, src, compressedSize, dst, maxDecompressedSize); - - while (dctx.ip < dctx.iend) { - BYTE *cpy; - const BYTE *match; - size_t length, offset; - - /* Get the literal length. */ - const unsigned token = *(dctx.ip)++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - - /* Copy the literals. */ - cpy = dctx.op + length; - memcpy(dctx.op, dctx.ip, length); - dctx.ip += length; - dctx.op = cpy; - - //TODO : dynamic offset size - offset = MEM_read32(dctx.ip); - dctx.ip += LDM_OFFSET_SIZE; - match = dctx.op - offset; - - /* Get the match length. */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned s; - do { - s = *(dctx.ip)++; - length += s; - } while (s == 255); - } - length += LDM_MIN_MATCH_LENGTH; - - /* Copy match. */ - cpy = dctx.op + length; - - // Inefficient for now. 
- while (match < cpy - offset && dctx.op < dctx.oend) { - *(dctx.op)++ = *match++; - } - } - return dctx.op - (BYTE *)dst; -} - // TODO: implement and test hash function void LDM_test(const BYTE *src) { (void)src; From 08a6e9a141025f4d41177fc77495c32b138d6e54 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 13:22:00 -0700 Subject: [PATCH 53/62] Minor code cleanup --- contrib/long_distance_matching/ldm_64_hash.c | 291 +++++++++--------- contrib/long_distance_matching/ldm_common.c | 2 - .../long_distance_matching/ldm_integrated.c | 2 +- 3 files changed, 146 insertions(+), 149 deletions(-) diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index 95865f70..06ddf520 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -15,7 +15,7 @@ #define COMPUTE_STATS #define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 1 +#define HASH_CHAR_OFFSET 10 // Take first match only. //#define ZSTD_SKIP @@ -24,8 +24,7 @@ static const U64 prime8bytes = 11400714785074694791ULL; -/* Hash table stuff */ - +// Type of the small hash used to index into the hash table. 
typedef U32 hash_t; typedef struct LDM_hashEntry { @@ -41,7 +40,6 @@ struct LDM_compressStats { U64 totalOffset; U32 minOffset, maxOffset; - U32 offsetHistogram[32]; U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; @@ -56,8 +54,8 @@ struct LDM_compressStats { typedef struct LDM_hashTable LDM_hashTable; struct LDM_CCtx { - U64 isize; /* Input size */ - U64 maxOSize; /* Maximum output size */ + size_t isize; /* Input size */ + size_t maxOSize; /* Maximum output size */ const BYTE *ibase; /* Base of input */ const BYTE *ip; /* Current input position */ @@ -80,23 +78,21 @@ struct LDM_CCtx { LDM_hashTable *hashTable; -// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - U64 lastSum; + U64 lastHash; - const BYTE *nextIp; // TODO: this is redundant (ip + step) + const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextPosHashed; - U64 nextSum; + U64 nextHash; unsigned step; // ip step, should be 1. const BYTE *lagIp; - U64 lagSum; + U64 lagHash; - // DEBUG +#ifdef RUN_CHECKS const BYTE *DEBUG_setNextHash; +#endif }; struct LDM_hashTable { @@ -107,7 +103,6 @@ struct LDM_hashTable { BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. }; - /** * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. 
@@ -126,70 +121,74 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { } static unsigned ZSTD_NbCommonBytes (register size_t val) { - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[ + ((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[ + ((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + 
_BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + /* calculate this way due to compiler complaining in 32-bits mode */ + const unsigned n32 = sizeof(size_t)*4; + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; # endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 
3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } } // From lib/compress/zstd_compress.c @@ -230,8 +229,8 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, * * We count only bytes where pMatch > pBaes and pIn > pAnchor. */ -size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { +static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { size_t matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; @@ -482,29 +481,29 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { /** * Return the upper (most significant) LDM_HASHLOG bits. */ -static hash_t checksumToHash(U64 sum) { - return sum >> (64 - LDM_HASHLOG); +static hash_t getSmallHash(U64 hash) { + return hash >> (64 - LDM_HASHLOG); } /** * Return the 32 bits after the upper LDM_HASHLOG bits. */ -static U32 checksumFromHfHash(U64 hfHash) { - return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; +static U32 getChecksum(U64 hash) { + return (hash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } #ifdef TMP_TAG_INSERT -static U32 lowerBitsFromHfHash(U64 hfHash) { +static U32 lowerBitsFromHfHash(U64 hash) { // The number of bits used so far is LDM_HASHLOG + 32. // So there are 32 - LDM_HASHLOG bits left. // Occasional hashing requires HASH_ONLY_EVERY_LOG bits. // So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits // allowing for reuse of bits. if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) { - return hfHash & HASH_ONLY_EVERY; + return hash & HASH_ONLY_EVERY; } else { // Otherwise shift by (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG) bits first. 
- return (hfHash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) & + return (hash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) & HASH_ONLY_EVERY; } } @@ -519,14 +518,14 @@ static U32 lowerBitsFromHfHash(U64 hfHash) { * where the constant a is defined to be prime8bytes. * * The implementation adds an offset to each byte, so - * H(s) = (s_1 + CHECKSUM_CHAR_OFFSET)*(a^(k-1)) + ... + * H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ... */ -static U64 getChecksum(const BYTE *buf, U32 len) { +static U64 getHash(const BYTE *buf, U32 len) { U64 ret = 0; U32 i; for (i = 0; i < len; i++) { ret *= prime8bytes; - ret += buf[i] + CHECKSUM_CHAR_OFFSET; + ret += buf[i] + HASH_CHAR_OFFSET; } return ret; @@ -544,20 +543,20 @@ static U64 ipow(U64 base, U64 exp) { return ret; } -static U64 updateChecksum(U64 sum, U32 len, - BYTE toRemove, BYTE toAdd) { +static U64 updateHash(U64 hash, U32 len, + BYTE toRemove, BYTE toAdd) { // TODO: this relies on compiler optimization. // The exponential can be calculated explicitly as len is constant. - sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) * + hash -= ((toRemove + HASH_CHAR_OFFSET) * ipow(prime8bytes, len - 1)); - sum *= prime8bytes; - sum += toAdd + CHECKSUM_CHAR_OFFSET; - return sum; + hash *= prime8bytes; + hash += toAdd + HASH_CHAR_OFFSET; + return hash; } /** - * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed - * based on cctx->lastSum and cctx->lastPosHashed. + * Update cctx->nextHash and cctx->nextPosHashed + * based on cctx->lastHash and cctx->lastPosHashed. * * This uses a rolling hash and requires that the last position hashed * corresponds to cctx->nextIp - step. 
@@ -574,15 +573,15 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif - cctx->nextSum = updateChecksum( - cctx->lastSum, LDM_HASH_LENGTH, + cctx->nextHash = updateHash( + cctx->lastHash, LDM_HASH_LENGTH, cctx->lastPosHashed[0], cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; #ifdef TMP_TAG_INSERT { - U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextSum); + U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextHash); cctx->stats.TMP_totalHashCount++; cctx->stats.TMP_hashCount[hashEveryMask]++; } @@ -590,18 +589,18 @@ static void setNextHash(LDM_CCtx *cctx) { #if LDM_LAG if (cctx->ip - cctx->ibase > LDM_LAG) { - cctx->lagSum = updateChecksum( - cctx->lagSum, LDM_HASH_LENGTH, + cctx->lagHash = updateHash( + cctx->lagHash, LDM_HASH_LENGTH, cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); cctx->lagIp++; } #endif #ifdef RUN_CHECKS - check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); + check = getHash(cctx->nextIp, LDM_HASH_LENGTH); - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %llu %llu\n", check, cctx->nextSum); + if (check != cctx->nextHash) { + printf("CHECK: setNextHash failed %llu %llu\n", check, cctx->nextHash); } if ((cctx->nextIp - cctx->lastPosHashed) != 1) { @@ -612,58 +611,57 @@ static void setNextHash(LDM_CCtx *cctx) { #endif } -static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) { +static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. #if LDM_LAG if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { // TODO: Off by one, but not important. 
if (cctx->lagIp - cctx->ibase > 0) { - U32 hash = checksumToHash(cctx->lagSum); - U32 sum = checksumFromHfHash(cctx->lagSum); - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum }; + U32 smallHash = getSmallHash(cctx->lagHash); + U32 checksum = getChecksum(cctx->lagHash); + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; #ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, hash, entry, cctx); + HASH_insert(cctx->hashTable, smallHash, entry, cctx); #else - HASH_insert(cctx->hashTable, hash, entry); + HASH_insert(cctx->hashTable, smallHash, entry); #endif } else { - U32 hash = checksumToHash(hfHash); - U32 sum = checksumFromHfHash(hfHash); + U32 smallHash = getSmallHash(hash); + U32 checksum = getChecksum(hash); - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; #ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, hash, entry, cctx); + HASH_insert(cctx->hashTable, smallHash, entry, cctx); #else - HASH_insert(cctx->hashTable, hash, entry); + HASH_insert(cctx->hashTable, smallHash, entry); #endif } } #else #ifdef TMP_TAG_INSERT - U32 hashEveryMask = lowerBitsFromHfHash(hfHash); - // TODO: look at stats. 
+ U32 hashEveryMask = lowerBitsFromHfHash(hash); if (hashEveryMask == HASH_ONLY_EVERY) { #else if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { #endif - U32 hash = checksumToHash(hfHash); - U32 sum = checksumFromHfHash(hfHash); - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; + U32 smallHash = getSmallHash(hash); + U32 checksum = getChecksum(hash); + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; #ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, hash, entry, cctx); + HASH_insert(cctx->hashTable, smallHash, entry, cctx); #else - HASH_insert(cctx->hashTable, hash, entry); + HASH_insert(cctx->hashTable, smallHash, entry); #endif } #endif cctx->lastPosHashed = cctx->ip; - cctx->lastSum = hfHash; + cctx->lastHash = hash; } /** - * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed + * Copy over the cctx->lastHash, and cctx->lastPosHashed * fields from the "next" fields. * * This requires that cctx->ip == cctx->nextPosHashed. @@ -675,14 +673,14 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { cctx->ip - cctx->ibase); } #endif - putHashOfCurrentPositionFromHash(cctx, cctx->nextSum); + putHashOfCurrentPositionFromHash(cctx, cctx->nextHash); } /** * Insert hash of the current position into the hash table. 
*/ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U64 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); + U64 hash = getHash(cctx->ip, LDM_HASH_LENGTH); #ifdef RUN_CHECKS if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { @@ -691,7 +689,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { } #endif - putHashOfCurrentPositionFromHash(cctx, sum); + putHashOfCurrentPositionFromHash(cctx, hash); } void LDM_initializeCCtx(LDM_CCtx *cctx, @@ -726,7 +724,9 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->nextIp = cctx->ip + cctx->step; cctx->nextPosHashed = 0; +#ifdef RUN_CHECKS cctx->DEBUG_setNextHash = 0; +#endif } void LDM_destroyCCtx(LDM_CCtx *cctx) { @@ -748,16 +748,16 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, cctx->nextIp = cctx->ip + cctx->step; while (entry == NULL) { - hash_t h; U64 hash; - U32 sum; + hash_t smallHash; + U32 checksum; #ifdef TMP_TAG_INSERT U32 hashEveryMask; #endif setNextHash(cctx); - hash = cctx->nextSum; - h = checksumToHash(hash); - sum = checksumFromHfHash(hash); + hash = cctx->nextHash; + smallHash = getSmallHash(hash); + checksum = getChecksum(hash); #ifdef TMP_TAG_INSERT hashEveryMask = lowerBitsFromHfHash(hash); #endif @@ -770,11 +770,11 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } #ifdef TMP_TAG_INSERT if (hashEveryMask == HASH_ONLY_EVERY) { - entry = HASH_getBestEntry(cctx, h, sum, + entry = HASH_getBestEntry(cctx, smallHash, checksum, forwardMatchLength, backwardMatchLength); } #else - entry = HASH_getBestEntry(cctx, h, sum, + entry = HASH_getBestEntry(cctx, smallHash, checksum, forwardMatchLength, backwardMatchLength); #endif @@ -850,15 +850,16 @@ size_t LDM_compress(const void *src, size_t srcSize, U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); +#ifdef OUTPUT_CONFIGURATION LDM_outputConfiguration(); +#endif /* Hash the first position and put it into the hash table. 
*/ LDM_putHashOfCurrentPosition(&cctx); #if LDM_LAG cctx.lagIp = cctx.ip; -// cctx.lagHash = cctx.lastHash; - cctx.lagSum = cctx.lastSum; + cctx.lagHash = cctx.lastHash; #endif /** * Find a match. @@ -918,8 +919,6 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // HASH_outputTableOffsetHistogram(&cctx); - /* Encode the last literals (no more matches). */ { const U64 lastRun = cctx.iend - cctx.anchor; @@ -943,14 +942,14 @@ size_t LDM_compress(const void *src, size_t srcSize, void LDM_test(const BYTE *src) { const U32 diff = 100; const BYTE *pCur = src + diff; - U64 checksum = getChecksum(pCur, LDM_HASH_LENGTH); + U64 hash = getHash(pCur, LDM_HASH_LENGTH); for (; pCur < src + diff + 60; ++pCur) { - U64 nextSum = getChecksum(pCur + 1, LDM_HASH_LENGTH); - U64 updateSum = updateChecksum(checksum, LDM_HASH_LENGTH, - pCur[0], pCur[LDM_HASH_LENGTH]); - checksum = nextSum; - printf("%llu %llu\n", nextSum, updateSum); + U64 nextHash = getHash(pCur + 1, LDM_HASH_LENGTH); + U64 updatedHash = updateHash(hash, LDM_HASH_LENGTH, + pCur[0], pCur[LDM_HASH_LENGTH]); + hash = nextHash; + printf("%llu %llu\n", nextHash, updatedHash); } } diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c index 673959db..1aa664f0 100644 --- a/contrib/long_distance_matching/ldm_common.c +++ b/contrib/long_distance_matching/ldm_common.c @@ -109,5 +109,3 @@ size_t LDM_decompress(const void *src, size_t compressedSize, } return dctx.op - (BYTE *)dst; } - - diff --git a/contrib/long_distance_matching/ldm_integrated.c b/contrib/long_distance_matching/ldm_integrated.c index d51c1e9d..6440c009 100644 --- a/contrib/long_distance_matching/ldm_integrated.c +++ b/contrib/long_distance_matching/ldm_integrated.c @@ -17,7 +17,7 @@ #define COMPUTE_STATS #define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 10 +#define CHECKSUM_CHAR_OFFSET 1 // Take first match only. 
//#define ZSTD_SKIP From 0295a27133e68f61ce398006a080353453cb2b40 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 15:26:44 -0700 Subject: [PATCH 54/62] Experiment with not using a checksum --- contrib/long_distance_matching/ldm.c | 23 ++++ contrib/long_distance_matching/ldm.h | 20 +--- contrib/long_distance_matching/ldm_64_hash.c | 104 +++++++++++++++--- contrib/long_distance_matching/ldm_common.c | 13 --- .../long_distance_matching/ldm_integrated.c | 37 ++++++- 5 files changed, 151 insertions(+), 46 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 9d3eda32..fae35f9e 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -10,6 +10,14 @@ #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) +#define LDM_HASH_ENTRY_SIZE_LOG 3 + +//#define HASH_ONLY_EVERY_LOG 7 +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) + +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + + #define COMPUTE_STATS #define OUTPUT_CONFIGURATION #define CHECKSUM_CHAR_OFFSET 10 @@ -510,6 +518,21 @@ size_t LDM_compress(const void *src, size_t srcSize, } } +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + + + void LDM_test(const BYTE *src) { (void)src; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 3078fb8c..e1a005e3 100644 --- a/contrib/long_distance_matching/ldm.h 
+++ b/contrib/long_distance_matching/ldm.h @@ -32,23 +32,15 @@ #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same value?). -#define LDM_MIN_MATCH_LENGTH 16 -#define LDM_HASH_LENGTH 16 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_LENGTH 64 // Experimental. -//#define TMP_EVICTION -#define TMP_TAG_INSERT -//#define TMP_FORCE_HASH_ONLY +//#define TMP_EVICTION // Experiment with eviction policies. +#define TMP_TAG_INSERT // Insertion policy based on hash. -#define LDM_HASH_ENTRY_SIZE_LOG 3 - -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#ifdef TMP_FORCE_HASH_ONLY - #define HASH_ONLY_EVERY_LOG 7 -#else - #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) -#endif -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) +#define USE_CHECKSUM 1 +//#define USE_CHECKSUM (HASH_BUCKET_SIZE_LOG) typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index 06ddf520..c74d71ff 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -7,9 +7,20 @@ #include "ldm.h" #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) -/* Hash table stuff. 
*/ +#if USE_CHECKSUM + #define LDM_HASH_ENTRY_SIZE_LOG 3 +#else + #define LDM_HASH_ENTRY_SIZE_LOG 2 +#endif + +//#define HASH_ONLY_EVERY_LOG 7 +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) + +#define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1) + #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) @@ -27,10 +38,16 @@ static const U64 prime8bytes = 11400714785074694791ULL; // Type of the small hash used to index into the hash table. typedef U32 hash_t; +#if USE_CHECKSUM typedef struct LDM_hashEntry { U32 offset; U32 checksum; } LDM_hashEntry; +#else +typedef struct LDM_hashEntry { + U32 offset; +} LDM_hashEntry; +#endif struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; @@ -39,6 +56,8 @@ struct LDM_compressStats { U64 totalLiteralLength; U64 totalOffset; + U32 matchLengthHistogram[32]; + U32 minOffset, maxOffset; U32 offsetHistogram[32]; @@ -262,12 +281,19 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, LDM_hashEntry *cur = bucket; LDM_hashEntry *bestEntry = NULL; U64 bestMatchLength = 0; +#if !(USE_CHECKSUM) + (void)checksum; +#endif for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + cctx->ibase; // Check checksum for faster check. 
+#if USE_CHECKSUM if (cur->checksum == checksum && cctx->ip - pMatch <= LDM_WINDOW_SIZE) { +#else + if (cctx->ip - pMatch <= LDM_WINDOW_SIZE) { +#endif U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); U64 backwardMatchLength, totalMatchLength; @@ -448,12 +474,18 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { stats->minOffset, stats->maxOffset); printf("\n"); - printf("offset histogram: offset, num matches, %% of matches\n"); + printf("offset histogram | match length histogram\n"); + printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n"); for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, + printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", + 2, i, stats->offsetHistogram[i], 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches, + 2, i, + stats->matchLengthHistogram[i], + 100.0 * (double) stats->matchLengthHistogram[i] / (double) stats->numMatches); } printf("\n"); @@ -619,23 +651,32 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { // TODO: Off by one, but not important. 
if (cctx->lagIp - cctx->ibase > 0) { U32 smallHash = getSmallHash(cctx->lagHash); + +# if USE_CHECKSUM U32 checksum = getChecksum(cctx->lagHash); const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; -#ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else - HASH_insert(cctx->hashTable, smallHash, entry); -#endif - } else { - U32 smallHash = getSmallHash(hash); - U32 checksum = getChecksum(hash); +# else + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; +# endif - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; -#ifdef TMP_EVICTION +# ifdef TMP_EVICTION HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else +# else HASH_insert(cctx->hashTable, smallHash, entry); -#endif +# endif + } else { +# if USE_CHECKSUM + U32 checksum = getChecksum(hash); + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; +# else + const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; +# endif + +# ifdef TMP_EVICTION + HASH_insert(cctx->hashTable, smallHash, entry, cctx); +# else + HASH_insert(cctx->hashTable, smallHash, entry); +# endif } } #else @@ -646,8 +687,12 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { #endif U32 smallHash = getSmallHash(hash); +#if USE_CHECKSUM U32 checksum = getChecksum(hash); const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; +#else + const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; +#endif #ifdef TMP_EVICTION HASH_insert(cctx->hashTable, smallHash, entry, cctx); #else @@ -711,8 +756,11 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); +#if USE_CHECKSUM cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); - +#else + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); +#endif cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; 
cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; @@ -755,6 +803,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U32 hashEveryMask; #endif setNextHash(cctx); + hash = cctx->nextHash; smallHash = getSmallHash(hash); checksum = getChecksum(hash); @@ -770,6 +819,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, } #ifdef TMP_TAG_INSERT if (hashEveryMask == HASH_ONLY_EVERY) { + entry = HASH_getBestEntry(cctx, smallHash, checksum, forwardMatchLength, backwardMatchLength); } @@ -781,7 +831,9 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (entry != NULL) { *match = entry->offset + cctx->ibase; } + putHashOfCurrentPositionFromHash(cctx, hash); + } setNextHash(cctx); return 0; @@ -850,6 +902,7 @@ size_t LDM_compress(const void *src, size_t srcSize, U64 backwardsMatchLength = 0; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + #ifdef OUTPUT_CONFIGURATION LDM_outputConfiguration(); #endif @@ -869,6 +922,7 @@ size_t LDM_compress(const void *src, size_t srcSize, */ while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, &backwardsMatchLength) == 0) { + #ifdef COMPUTE_STATS cctx.stats.numMatches++; #endif @@ -898,6 +952,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; + cctx.stats.matchLengthHistogram[ + (U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++; #endif // Move ip to end of block, inserting hashes at each position. 
@@ -938,6 +994,22 @@ size_t LDM_compress(const void *src, size_t srcSize, } } +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("USE_CHECKSUM %d\n", USE_CHECKSUM); + printf("=====================\n"); +} + + + // TODO: implement and test hash function void LDM_test(const BYTE *src) { const U32 diff = 100; diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c index 1aa664f0..1953656e 100644 --- a/contrib/long_distance_matching/ldm_common.c +++ b/contrib/long_distance_matching/ldm_common.c @@ -2,19 +2,6 @@ #include "ldm.h" -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize) { const BYTE *ip = (const BYTE *)src; diff --git a/contrib/long_distance_matching/ldm_integrated.c b/contrib/long_distance_matching/ldm_integrated.c index 6440c009..b80a5c01 100644 --- a/contrib/long_distance_matching/ldm_integrated.c +++ b/contrib/long_distance_matching/ldm_integrated.c @@ -7,10 +7,16 @@ #include "ldm.h" #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define 
LDM_HASH_ENTRY_SIZE_LOG 3 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) +#define LDM_HASH_ENTRY_SIZE_LOG 3 +//#define HASH_ONLY_EVERY_LOG 7 +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) + +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + + /* Hash table stuff. */ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) @@ -38,6 +44,8 @@ struct LDM_compressStats { U64 totalLiteralLength; U64 totalOffset; + U32 matchLengthHistogram[32]; + U32 minOffset, maxOffset; U32 offsetHistogram[32]; @@ -358,12 +366,18 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { stats->minOffset, stats->maxOffset); printf("\n"); - printf("offset histogram: offset, num matches, %% of matches\n"); + printf("offset histogram | match length histogram\n"); + printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n"); for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, + printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", + 2, i, stats->offsetHistogram[i], 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches, + 2, i, + stats->matchLengthHistogram[i], + 100.0 * (double) stats->matchLengthHistogram[i] / (double) stats->numMatches); } printf("\n"); @@ -742,6 +756,8 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.maxOffset = offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; + cctx.stats.matchLengthHistogram[ + (U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++; #endif // Move ip to end of block, inserting hashes at each position. 
@@ -784,6 +800,21 @@ size_t LDM_compress(const void *src, size_t srcSize, } } +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); + printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); + printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + + + // TODO: implement and test hash function void LDM_test(const BYTE *src) { (void)src; From 629c30011880ed0f21d454993d53db7aae0cfdbc Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 25 Jul 2017 15:17:36 -0700 Subject: [PATCH 55/62] Rename and remove unneeded files --- contrib/long_distance_matching/Makefile | 13 +- .../circular_buffer_table.c | 256 ------------------ contrib/long_distance_matching/ldm.h | 14 +- .../{ldm_integrated.c => ldm_hash32.c} | 0 .../{ldm_64_hash.c => ldm_hash64.c} | 65 +++-- .../long_distance_matching/ldm_hashtable.h | 91 ------- .../{main-ldm.c => main.c} | 15 +- 7 files changed, 65 insertions(+), 389 deletions(-) delete mode 100644 contrib/long_distance_matching/circular_buffer_table.c rename contrib/long_distance_matching/{ldm_integrated.c => ldm_hash32.c} (100%) rename contrib/long_distance_matching/{ldm_64_hash.c => ldm_hash64.c} (96%) delete mode 100644 contrib/long_distance_matching/ldm_hashtable.h rename contrib/long_distance_matching/{main-ldm.c => main.c} (95%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index e1c31112..292ce851 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -1,5 +1,5 @@ # ################################################################ -# Copyright (c) 2016-present, Yann Collet, Facebook, Inc. 
+# Copyright (c) 2016-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -25,19 +25,16 @@ LDFLAGS += -lzstd default: all -all: main-circular-buffer main-integrated main-64 +all: main-hash32 main-hash64 -main-circular-buffer: ldm_common.c circular_buffer_table.c ldm.c main-ldm.c +main-hash64: ldm_common.c ldm_hash64.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-64: ldm_common.c ldm_64_hash.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-integrated: ldm_common.c ldm_integrated.c main-ldm.c +main-hash32: ldm_common.c ldm_hash32.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-circular-buffer main-64 main-integrated + main-hash64 main-hash32 @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c deleted file mode 100644 index 92ffc55b..00000000 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ /dev/null @@ -1,256 +0,0 @@ -#include -#include - -#include "ldm.h" -#include "ldm_hashtable.h" -#include "mem.h" - -// THe number of elements per hash bucket. -// HASH_BUCKET_SIZE_LOG is defined in ldm.h. -#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) - -// The number of hash buckets. -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) - -// If ZSTD_SKIP is defined, then the first entry is returned in HASH_getBestEntry -// (without looking at other entries in the bucket). -//#define ZSTD_SKIP - -struct LDM_hashTable { - U32 numBuckets; // The number of buckets. - U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. - LDM_hashEntry *entries; - BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. - - const BYTE *offsetBase; // Corresponds to offset=0 in LDM_hashEntry. 
- U32 minMatchLength; - U32 maxWindowSize; -}; - -LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, - U32 minMatchLength, U32 maxWindowSize) { - LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; - table->numEntries = size; - table->entries = calloc(size, sizeof(LDM_hashEntry)); - table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); - table->offsetBase = offsetBase; - table->minMatchLength = minMatchLength; - table->maxWindowSize = maxWindowSize; - return table; -} - -static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { - return table->entries + (hash << HASH_BUCKET_SIZE_LOG); -} - -// From lib/compress/zstd_compress.c -static unsigned ZSTD_NbCommonBytes (register size_t val) -{ - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - 
unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} - -/** - * From lib/compress/zstd_compress.c - * Returns the number of bytes (consecutively) in common between pIn and pMatch - * up to pInLimit. - */ -static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *const pInLimit) { - const BYTE * const pStart = pIn; - const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); - - while (pIn < pInLoopLimit) { - size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); - if (!diff) { - pIn += sizeof(size_t); - pMatch += sizeof(size_t); - continue; - } - pIn += ZSTD_NbCommonBytes(diff); - return (size_t)(pIn - pStart); - } - - if (MEM_64bits()) { - if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { - pIn += 4; - pMatch += 4; - } - } - if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { - pIn += 2; - pMatch += 2; - } - if ((pIn < pInLimit) && (*pMatch == *pIn)) { - pIn++; - } - return (size_t)(pIn - pStart); -} - -/** - * Returns the number of bytes in common between pIn and pMatch, - * counting backwards, with pIn having a lower limit of pAnchor and - * pMatch having a lower limit of pBase. 
- */ -static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { - size_t matchLength = 0; - while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { - pIn--; - pMatch--; - matchLength++; - } - return matchLength; -} - -LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - const BYTE *pAnchor, - U64 *pForwardMatchLength, - U64 *pBackwardMatchLength) { - LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur = bucket; - LDM_hashEntry *bestEntry = NULL; - U64 bestMatchLength = 0; - for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - const BYTE *pMatch = cur->offset + table->offsetBase; - - // Check checksum for faster check. - if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { - U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); - U64 backwardMatchLength, totalMatchLength; - - // Only take matches where the forwardMatchLength is large enough - // for speed. - if (forwardMatchLength < table->minMatchLength) { - continue; - } - backwardMatchLength = - countBackwardsMatch(pIn, pAnchor, cur->offset + table->offsetBase, - table->offsetBase); - - totalMatchLength = forwardMatchLength + backwardMatchLength; - - if (totalMatchLength >= bestMatchLength) { - bestMatchLength = totalMatchLength; - *pForwardMatchLength = forwardMatchLength; - *pBackwardMatchLength = backwardMatchLength; - - bestEntry = cur; - -#ifdef ZSTD_SKIP - return cur; -#endif - } - } - } - if (bestEntry != NULL) { - return bestEntry; - } - return NULL; -} - -hash_t HASH_hashU32(U32 value) { - return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); -} - -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - // Circular buffer. 
- *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; - table->bucketOffsets[hash]++; - table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; -} - -U32 HASH_getSize(const LDM_hashTable *table) { - return table->numBuckets; -} - -void HASH_destroyTable(LDM_hashTable *table) { - free(table->entries); - free(table->bucketOffsets); - free(table); -} - -void HASH_outputTableOccupancy(const LDM_hashTable *table) { - U32 ctr = 0; - LDM_hashEntry *cur = table->entries; - LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); - for (; cur < end; ++cur) { - if (cur->offset == 0) { - ctr++; - } - } - - printf("Num buckets, bucket size: %d, %d\n", - table->numBuckets, HASH_BUCKET_SIZE); - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - table->numEntries, ctr, - 100.0 * (double)(ctr) / table->numEntries); -} diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index e1a005e3..f9ad383e 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -17,17 +17,22 @@ // THe number of bytes storing the offset. #define LDM_OFFSET_SIZE 4 +// ============================================================================= +// User parameters. +// ============================================================================= + // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 23 +#define LDM_MEMORY_USAGE 25 // The number of entries in a hash bucket. -#define HASH_BUCKET_SIZE_LOG 0 // The maximum is 4 for now. +#define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 +// The maximum window size. #define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) @@ -37,10 +42,11 @@ // Experimental. 
//#define TMP_EVICTION // Experiment with eviction policies. -#define TMP_TAG_INSERT // Insertion policy based on hash. +#define INSERT_BY_TAG // Insertion policy based on hash. #define USE_CHECKSUM 1 -//#define USE_CHECKSUM (HASH_BUCKET_SIZE_LOG) + +// ============================================================================= typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_integrated.c b/contrib/long_distance_matching/ldm_hash32.c similarity index 100% rename from contrib/long_distance_matching/ldm_integrated.c rename to contrib/long_distance_matching/ldm_hash32.c diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_hash64.c similarity index 96% rename from contrib/long_distance_matching/ldm_64_hash.c rename to contrib/long_distance_matching/ldm_hash64.c index c74d71ff..e51ac57d 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_hash64.c @@ -489,7 +489,7 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG /* printf("Lower bit distribution\n"); for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) { @@ -524,7 +524,7 @@ static U32 getChecksum(U64 hash) { return (hash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG static U32 lowerBitsFromHfHash(U64 hash) { // The number of bits used so far is LDM_HASHLOG + 32. // So there are 32 - LDM_HASHLOG bits left. 
@@ -611,7 +611,7 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG { U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextHash); cctx->stats.TMP_totalHashCount++; @@ -647,9 +647,13 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. #if LDM_LAG - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - // TODO: Off by one, but not important. - if (cctx->lagIp - cctx->ibase > 0) { + if (cctx -> lagIp - cctx->ibase > 0) { +#ifdef INSERT_BY_TAG + U32 hashEveryMask = lowerBitsFromHfHash(cctx->lagHash); + if (hashEveryMask == HASH_ONLY_EVERY) { +#else + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { +#endif U32 smallHash = getSmallHash(cctx->lagHash); # if USE_CHECKSUM @@ -664,23 +668,32 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { # else HASH_insert(cctx->hashTable, smallHash, entry); # endif - } else { -# if USE_CHECKSUM - U32 checksum = getChecksum(hash); - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; -# else - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; -# endif + } + } else { +#ifdef INSERT_BY_TAG + U32 hashEveryMask = lowerBitsFromHfHash(hash); + if (hashEveryMask == HASH_ONLY_EVERY) { +#else + if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { +#endif + U32 smallHash = getSmallHash(hash); -# ifdef TMP_EVICTION +#if USE_CHECKSUM + U32 checksum = getChecksum(hash); + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; +#else + const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; +#endif + +#ifdef TMP_EVICTION HASH_insert(cctx->hashTable, smallHash, entry, cctx); -# else +#else HASH_insert(cctx->hashTable, smallHash, entry); -# endif +#endif } } #else -#ifdef TMP_TAG_INSERT +#ifdef 
INSERT_BY_TAG U32 hashEveryMask = lowerBitsFromHfHash(hash); if (hashEveryMask == HASH_ONLY_EVERY) { #else @@ -799,7 +812,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U64 hash; hash_t smallHash; U32 checksum; -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG U32 hashEveryMask; #endif setNextHash(cctx); @@ -807,7 +820,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, hash = cctx->nextHash; smallHash = getSmallHash(hash); checksum = getChecksum(hash); -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG hashEveryMask = lowerBitsFromHfHash(hash); #endif @@ -817,7 +830,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (cctx->ip > cctx->imatchLimit) { return 1; } -#ifdef TMP_TAG_INSERT +#ifdef INSERT_BY_TAG if (hashEveryMask == HASH_ONLY_EVERY) { entry = HASH_getBestEntry(cctx, smallHash, checksum, @@ -1003,13 +1016,17 @@ void LDM_outputConfiguration(void) { printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("USE_CHECKSUM %d\n", USE_CHECKSUM); + printf("LDM_LAG: %d\n", LDM_LAG); + printf("USE_CHECKSUM: %d\n", USE_CHECKSUM); +#ifdef INSERT_BY_TAG + printf("INSERT_BY_TAG: %d\n", 1); +#else + printf("INSERT_BY_TAG: %d\n", 0); +#endif + printf("HASH_CHAR_OFFSET: %d\n", HASH_CHAR_OFFSET); printf("=====================\n"); } - - // TODO: implement and test hash function void LDM_test(const BYTE *src) { const U32 diff = 100; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h deleted file mode 100644 index 6093197d..00000000 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - * A "hash" table used in LDM compression. - * - * This is not exactly a hash table in the sense that inserted entries - * are not guaranteed to remain in the hash table. 
- */ - -#ifndef LDM_HASHTABLE_H -#define LDM_HASHTABLE_H - -#include "mem.h" - -// The log size of LDM_hashEntry in bytes. -#define LDM_HASH_ENTRY_SIZE_LOG 3 - -typedef U32 hash_t; - -typedef struct LDM_hashEntry { - U32 offset; // Represents the offset of the entry from offsetBase. - U32 checksum; // A checksum to select entries with the same hash value. -} LDM_hashEntry; - -typedef struct LDM_hashTable LDM_hashTable; - -/** - * Create a table that can contain size elements. This does not necessarily - * correspond to the number of hash buckets. The number of hash buckets - * is size / (1 << HASH_BUCKET_SIZE_LOG) - * - * minMatchLength is the minimum match length required in HASH_getBestEntry. - * - * maxWindowSize is the maximum distance from pIn in HASH_getBestEntry. - * The window is defined to be (pIn - offsetBase - offset). - */ -LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, - U32 minMatchLength, U32 maxWindowSize); - -/** - * Return the "best" entry from the table with the same hash and checksum. - * - * pIn: a pointer to the current input position. - * pEnd: a pointer to the maximum input position. - * pAnchor: a pointer to the minimum input position. - * - * This function computes the forward and backward match length from pIn - * and writes it to forwardMatchLength and backwardsMatchLength. - * - * E.g. for the two strings "aaabbbb" "aaabbbb" with pIn and the - * entry pointing at the first "b", the forward match length would be - * four (representing the "b" matches) and the backward match length would - * three (representing the "a" matches before the pointer). - */ -LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, - const hash_t hash, - const U32 checksum, - const BYTE *pIn, - const BYTE *pEnd, - const BYTE *pAnchor, - U64 *forwardMatchLength, - U64 *backwardsMatchLength); - -/** - * Return a hash of the value. - */ -hash_t HASH_hashU32(U32 value); - -/** - * Insert an LDM_hashEntry into the bucket corresponding to hash. 
- * - * An entry may be evicted in the process. - */ -void HASH_insert(LDM_hashTable *table, const hash_t hash, - const LDM_hashEntry entry); - -/** - * Return the number of distinct hash buckets. - */ -U32 HASH_getSize(const LDM_hashTable *table); - -/** - * Destroy the table. - */ -void HASH_destroyTable(LDM_hashTable *table); - -/** - * Prints the percentage of the hash table occupied (where occupied is defined - * as the entry being non-zero). - */ -void HASH_outputTableOccupancy(const LDM_hashTable *hashTable); - -#endif /* LDM_HASHTABLE_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main.c similarity index 95% rename from contrib/long_distance_matching/main-ldm.c rename to contrib/long_distance_matching/main.c index 232c14a2..cee5edba 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main.c @@ -12,7 +12,7 @@ #include "ldm.h" #include "zstd.h" -//#define TEST +//#define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. @@ -91,9 +91,10 @@ static int compress(const char *fname, const char *oname) { ftruncate(fdout, compressedSize); - printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, - (size_t)statbuf.st_size, (size_t)compressedSize, oname, - (statbuf.st_size) / (double)compressedSize, + printf("%25s : %10lu -> %10lu - %s \n", fname, + (size_t)statbuf.st_size, (size_t)compressedSize, oname); + printf("Compression ratio: %.2fx --- %.1f%%\n", + (double)statbuf.st_size / (double)compressedSize, (double)compressedSize / (double)(statbuf.st_size) * 100.0); timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + @@ -110,6 +111,7 @@ static int compress(const char *fname, const char *oname) { return 0; } +#ifdef DECOMPRESS /* Decompress file compressed using LDM_compress. * The input file should have the LDM_HEADER followed by payload. * Returns 0 if succesful, and an error code otherwise. 
@@ -162,7 +164,6 @@ static int decompress(const char *fname, const char *oname) { src + LDM_HEADER_SIZE, statbuf.st_size - LDM_HEADER_SIZE, dst, decompressedSize); printf("Ret size out: %zu\n", outSize); -// ftruncate(fdout, decompressedSize); close(fdin); close(fdout); @@ -207,6 +208,7 @@ static void verify(const char *inpFilename, const char *decFilename) { fclose(decFp); fclose(inpFp); } +#endif int main(int argc, const char *argv[]) { const char * const exeName = argv[0]; @@ -237,6 +239,7 @@ int main(int argc, const char *argv[]) { } } +#ifdef DECOMPRESS_AND_VERIFY /* Decompress */ { struct timeval tv1, tv2; @@ -252,6 +255,6 @@ int main(int argc, const char *argv[]) { } /* verify */ verify(inpFilename, decFilename); - +#endif return 0; } From e9161637b28f3b9fb26398d5e85d74e8959ca2c8 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 25 Jul 2017 18:13:27 -0700 Subject: [PATCH 56/62] Allow parameters to be modified from a separate file --- contrib/long_distance_matching/Makefile | 8 +- contrib/long_distance_matching/ldm.c | 539 -------------------- contrib/long_distance_matching/ldm.h | 30 +- contrib/long_distance_matching/ldm_common.c | 5 +- contrib/long_distance_matching/ldm_hash32.c | 12 +- contrib/long_distance_matching/ldm_hash64.c | 164 +----- contrib/long_distance_matching/ldm_params.h | 10 + contrib/long_distance_matching/main.c | 8 +- 8 files changed, 47 insertions(+), 729 deletions(-) delete mode 100644 contrib/long_distance_matching/ldm.c create mode 100644 contrib/long_distance_matching/ldm_params.h diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 292ce851..b1fd3a1e 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,16 +25,16 @@ LDFLAGS += -lzstd default: all -all: main-hash32 main-hash64 +all: main-64 main-integrated -main-hash64: ldm_common.c ldm_hash64.c main.c +main-64: ldm_common.c ldm_hash64.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ 
$(LDFLAGS) -o $@ -main-hash32: ldm_common.c ldm_hash32.c main.c +main-integrated: ldm_common.c ldm_hash32.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-hash64 main-hash32 + main-hash64 main-hash32 main-64 main-integrated @echo Cleaning completed diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c deleted file mode 100644 index fae35f9e..00000000 --- a/contrib/long_distance_matching/ldm.c +++ /dev/null @@ -1,539 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ldm.h" -#include "ldm_hashtable.h" - -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) - -#define LDM_HASH_ENTRY_SIZE_LOG 3 - -//#define HASH_ONLY_EVERY_LOG 7 -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) - -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - - -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 10 - -//#define RUN_CHECKS - -typedef U32 checksum_t; - -struct LDM_compressStats { - U32 windowSizeLog, hashTableSizeLog; - U32 numMatches; - U64 totalMatchLength; - U64 totalLiteralLength; - U64 totalOffset; - - U32 minOffset, maxOffset; - - U32 offsetHistogram[32]; -}; - -struct LDM_CCtx { - U64 isize; /* Input size */ - U64 maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. 
- const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashTable *hashTable; - - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - checksum_t lastSum; - - const BYTE *nextIp; // TODO: this is redundant (ip + step) - const BYTE *nextPosHashed; - hash_t nextHash; /* Hash corresponding to nextPosHashed */ - checksum_t nextSum; - - unsigned step; // ip step, should be 1. - - const BYTE *lagIp; - hash_t lagHash; - checksum_t lagSum; - - // DEBUG - const BYTE *DEBUG_setNextHash; -}; - -// TODO: This can be done more efficiently (but it is not that important as it -// is only used for computing stats). -static int intLog2(U32 x) { - int ret = 0; - while (x >>= 1) { - ret++; - } - return ret; -} - -void LDM_printCompressStats(const LDM_compressStats *stats) { - int i = 0; - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", - stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", - stats->numMatches, - stats->totalMatchLength, - 100.0 * (double)stats->totalMatchLength / - (double)(stats->totalMatchLength + stats->totalLiteralLength)); - printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("avg literal length, total literalLength: %.1f, %llu\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches, - stats->totalLiteralLength); - printf("avg offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u, %u\n", - stats->minOffset, stats->maxOffset); - - printf("\n"); - printf("offset histogram: offset, num matches, %% of matches\n"); - - for (; i <= 
intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches); - } - printf("\n"); - printf("=====================\n"); -} - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -static hash_t checksumToHash(U32 sum) { - return HASH_hashU32(sum); -} - -/** - * Computes a 32-bit checksum based on rsync's checksum. - * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static checksum_t getChecksum(const BYTE *buf, U32 len) { - U32 i; - checksum_t s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. - */ -static checksum_t updateChecksum(checksum_t sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed - * based on cctx->lastSum and cctx->lastPosHashed. - * - * This uses a rolling hash and requires that the last position hashed - * corresponds to cctx->nextIp - step. 
- */ -static void setNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - U32 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - - cctx->nextSum = updateChecksum( - cctx->lastSum, LDM_HASH_LENGTH, - cctx->lastPosHashed[0], - cctx->lastPosHashed[LDM_HASH_LENGTH]); - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = checksumToHash(cctx->nextSum); - -#if LDM_LAG - if (cctx->ip - cctx->ibase > LDM_LAG) { - cctx->lagSum = updateChecksum( - cctx->lagSum, LDM_HASH_LENGTH, - cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); - cctx->lagIp++; - cctx->lagHash = checksumToHash(cctx->lagSum); - } -#endif - -#ifdef RUN_CHECKS - check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); - } - - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif -} - -static void putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 checksum) { - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. - // Note: this works only when cctx->step is 1. 
- if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#if LDM_LAG - // Off by 1, but whatever - if (cctx->lagIp - cctx->ibase > 0) { - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; - HASH_insert(cctx->hashTable, cctx->lagHash, entry); - } else { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; - HASH_insert(cctx->hashTable, hash, entry); - } -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; - HASH_insert(cctx->hashTable, hash, entry); -#endif - } - - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; - cctx->lastSum = checksum; -} - -/** - * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed - * fields from the "next" fields. - * - * This requires that cctx->ip == cctx->nextPosHashed. - */ -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", - cctx->ip - cctx->ibase); - } -#endif - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); -} - -/** - * Insert hash of the current position into the hash table. 
- */ -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); - hash_t hash = checksumToHash(sum); - -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", - cctx->ip - cctx->ibase); - } -#endif - - putHashOfCurrentPositionFromHash(cctx, hash, sum); -} - -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase, - LDM_MIN_MATCH_LENGTH, LDM_WINDOW_SIZE); - - cctx->stats.minOffset = UINT_MAX; - cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; - cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - - - cctx->lastPosHashed = NULL; - - cctx->step = 1; // Fixed to be 1 for now. Changing may break things. - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - cctx->DEBUG_setNextHash = 0; -} - -void LDM_destroyCCtx(LDM_CCtx *cctx) { - HASH_destroyTable(cctx->hashTable); -} - -/** - * Finds the "best" match. - * - * Returns 0 if successful and 1 otherwise (i.e. no match can be found - * in the remaining input that is long enough). - * - * matchLength contains the forward length of the match. 
- */ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U64 *matchLength, U64 *backwardMatchLength) { - - LDM_hashEntry *entry = NULL; - cctx->nextIp = cctx->ip + cctx->step; - - while (entry == NULL) { - hash_t h; - checksum_t sum; - setNextHash(cctx); - h = cctx->nextHash; - sum = cctx->nextSum; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - entry = HASH_getBestEntry(cctx->hashTable, h, sum, - cctx->ip, cctx->iend, - cctx->anchor, - matchLength, backwardMatchLength); - - if (entry != NULL) { - *match = entry->offset + cctx->ibase; - } - putHashOfCurrentPositionFromHash(cctx, h, sum); - } - setNextHash(cctx); - return 0; -} - -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *pToken = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *pToken = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; -} - -void LDM_outputBlock(LDM_CCtx *cctx, - const U64 literalLength, - const U32 offset, - const U64 matchLength) { - BYTE *pToken = cctx->op++; - - /* Encode the literal length and literals. */ - LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); - - /* Encode the offset. */ - MEM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode the match length. 
*/ - if (matchLength >= ML_MASK) { - U64 matchLengthRemaining = matchLength; - *pToken += ML_MASK; - matchLengthRemaining -= ML_MASK; - MEM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - MEM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *pToken += (BYTE)(matchLength); - } -} - -// TODO: maxDstSize is unused. This function may seg fault when writing -// beyond the size of dst, as it does not check maxDstSize. Writing to -// a buffer and performing checks is a possible solution. -// -// This is based upon lz4. -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - const BYTE *match = NULL; - U64 forwardMatchLength = 0; - U64 backwardsMatchLength = 0; - - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - LDM_outputConfiguration(); - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - -#if LDM_LAG - cctx.lagIp = cctx.ip; - cctx.lagHash = cctx.lastHash; - cctx.lagSum = cctx.lastSum; -#endif - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, - &backwardsMatchLength) == 0) { -#ifdef COMPUTE_STATS - cctx.stats.numMatches++; -#endif - - cctx.ip -= backwardsMatchLength; - match -= backwardsMatchLength; - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. 
- */ - { - const U32 literalLength = cctx.ip - cctx.anchor; - const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + - backwardsMatchLength - - LDM_MIN_MATCH_LENGTH; - - LDM_outputBlock(&cctx, literalLength, offset, matchLength); - -#ifdef COMPUTE_STATS - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; - cctx.stats.minOffset = - offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; - cctx.stats.maxOffset = - offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; - cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; -#endif - - // Move ip to end of block, inserting hashes at each position. - cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + - matchLength + literalLength) { - if (cctx.ip > cctx.lastPosHashed) { - // TODO: Simplify. - LDM_updateLastHashFromNextHash(&cctx); - setNextHash(&cctx); - } - cctx.ip++; - cctx.nextIp++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); - } - - /* Encode the last literals (no more matches). 
*/ - { - const U32 lastRun = cctx.iend - cctx.anchor; - BYTE *pToken = cctx.op++; - LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); - } - -#ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats); - HASH_outputTableOccupancy(cctx.hashTable); -#endif - - { - const size_t ret = cctx.op - cctx.obase; - LDM_destroyCCtx(&cctx); - return ret; - } -} - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - - - -void LDM_test(const BYTE *src) { - (void)src; -} - diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index f9ad383e..b87a57bc 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -2,6 +2,7 @@ #define LDM_H #include "mem.h" // from /lib/common/mem.h +#include "ldm_params.h" // The number of bytes storing the compressed and decompressed size // in the header. @@ -18,35 +19,38 @@ #define LDM_OFFSET_SIZE 4 // ============================================================================= -// User parameters. +// Modify parameters in ldm_params.h if "ldm_params.h" is included. // ============================================================================= +#ifndef LDM_PARAMS_H // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 25 + #define LDM_MEMORY_USAGE 25 // The number of entries in a hash bucket. -#define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. 
+ #define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. // Defines the lag in inserting elements into the hash table. -#define LDM_LAG 0 + #define LDM_LAG 0 // The maximum window size. -#define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 -#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) + #define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 //These should be multiples of four (and perhaps set to the same value?). -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 + #define LDM_MIN_MATCH_LENGTH 64 -// Experimental. -//#define TMP_EVICTION // Experiment with eviction policies. -#define INSERT_BY_TAG // Insertion policy based on hash. + #define INSERT_BY_TAG 1 // Insertion policy based on hash. -#define USE_CHECKSUM 1 + #define USE_CHECKSUM 1 +#endif // ============================================================================= +#define COMPUTE_STATS +#define OUTPUT_CONFIGURATION + +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) +#define LDM_HASH_LENGTH LDM_MIN_MATCH_LENGTH typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -164,6 +168,4 @@ void LDM_writeHeader(void *memPtr, U64 compressedSize, */ void LDM_outputConfiguration(void); -void LDM_test(const BYTE *src); - #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c index 1953656e..26b716a1 100644 --- a/contrib/long_distance_matching/ldm_common.c +++ b/contrib/long_distance_matching/ldm_common.c @@ -70,7 +70,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize, dctx.ip += length; dctx.op = cpy; - //TODO : dynamic offset size + //TODO: dynamic offset size? + /* Decode the offset. */ offset = MEM_read32(dctx.ip); dctx.ip += LDM_OFFSET_SIZE; match = dctx.op - offset; @@ -89,7 +90,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize, /* Copy match. */ cpy = dctx.op + length; - // Inefficient for now. + // TODO: this can be made more efficient.
while (match < cpy - offset && dctx.op < dctx.oend) { *(dctx.op)++ = *match++; } diff --git a/contrib/long_distance_matching/ldm_hash32.c b/contrib/long_distance_matching/ldm_hash32.c index b80a5c01..94fa5e92 100644 --- a/contrib/long_distance_matching/ldm_hash32.c +++ b/contrib/long_distance_matching/ldm_hash32.c @@ -21,9 +21,7 @@ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 1 +#define CHECKSUM_CHAR_OFFSET 10 // Take first match only. //#define ZSTD_SKIP @@ -779,8 +777,6 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // HASH_outputTableOffsetHistogram(&cctx); - /* Encode the last literals (no more matches). */ { const U32 lastRun = cctx.iend - cctx.anchor; @@ -815,9 +811,3 @@ void LDM_outputConfiguration(void) { -// TODO: implement and test hash function -void LDM_test(const BYTE *src) { - (void)src; -} - - diff --git a/contrib/long_distance_matching/ldm_hash64.c b/contrib/long_distance_matching/ldm_hash64.c index e51ac57d..884f7b72 100644 --- a/contrib/long_distance_matching/ldm_hash64.c +++ b/contrib/long_distance_matching/ldm_hash64.c @@ -24,8 +24,6 @@ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION #define HASH_CHAR_OFFSET 10 // Take first match only. 
@@ -63,11 +61,6 @@ struct LDM_compressStats { U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; U64 TMP_totalHashCount; - - U64 TMP_totalInWindow; - U64 TMP_totalInserts; - - U64 TMP_matchCount; }; typedef struct LDM_hashTable LDM_hashTable; @@ -328,91 +321,12 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, return NULL; } -#ifdef TMP_EVICTION -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry, - LDM_CCtx *cctx) { - // Overwrite based on part of checksum. - /* - LDM_hashEntry *toOverwrite = - getBucket(table, hash) + table->bucketOffsets[hash]; - const BYTE *pMatch = toOverwrite->offset + cctx->ibase; - if (toOverwrite->offset != 0 && - cctx->ip - pMatch <= LDM_WINDOW_SIZE) { - cctx->stats.TMP_totalInWindow++; - } - - cctx->stats.TMP_totalInserts++; - *(toOverwrite) = entry; - */ - - /* - int i; - LDM_hashEntry *bucket = getBucket(table, hash); - for (i = 0; i < HASH_BUCKET_SIZE; i++) { - if (bucket[i].checksum == entry.checksum) { - bucket[i] = entry; - cctx->stats.TMP_matchCount++; - return; - } - } - */ - - // Find entry beyond window size, replace. Else, random. - int i; - LDM_hashEntry *bucket = getBucket(table, hash); - for (i = 0; i < HASH_BUCKET_SIZE; i++) { - if (cctx->ip - cctx->ibase - bucket[i].offset > LDM_WINDOW_SIZE) { - bucket[i] = entry; - return; - } - } - - i = rand() & (HASH_BUCKET_SIZE - 1); - *(bucket + i) = entry; - - - /** - * Sliding buffer style pointer - * Keep old entry as temporary. If the old entry is outside the window, - * overwrite and we are done. - * - * Backwards (insert at x): - * x, a, b b, c c c c, d d d d d d d d - * x, d d d d d d d d, c c c c, b b, a - * - * Else, find something to evict. - * If old entry has more ones, it takes - * the next spot. <-- reversed order? - * - * If window size > LDM_WINDOW_SIZE, - * overwrite, - * - * Insert forwards. If > tag, keep. Else evict. 
- * - */ - - - /* - *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; - table->bucketOffsets[hash]++; - table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; - */ - -// U16 mask = entry.checksum & (HASH_BUCKET_SIZE - 1); -// *(getBucket(table, hash) + mask) = entry; -} - -#else - void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } -#endif // TMP_EVICTION - U32 HASH_getSize(const LDM_hashTable *table) { return table->numBuckets; @@ -489,7 +403,7 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG /* printf("Lower bit distribution\n"); for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) { @@ -500,13 +414,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { */ #endif -#ifdef TMP_EVICTION - printf("Evicted something in window: %llu %6.3f\n", - stats->TMP_totalInWindow, - 100.0 * (double)stats->TMP_totalInWindow / - (double)stats->TMP_totalInserts); - printf("Match count: %llu\n", stats->TMP_matchCount); -#endif printf("=====================\n"); } @@ -524,7 +431,7 @@ static U32 getChecksum(U64 hash) { return (hash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG static U32 lowerBitsFromHfHash(U64 hash) { // The number of bits used so far is LDM_HASHLOG + 32. // So there are 32 - LDM_HASHLOG bits left. @@ -611,7 +518,7 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG { U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextHash); cctx->stats.TMP_totalHashCount++; @@ -648,7 +555,7 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { // Note: this works only when cctx->step is 1. 
#if LDM_LAG if (cctx -> lagIp - cctx->ibase > 0) { -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG U32 hashEveryMask = lowerBitsFromHfHash(cctx->lagHash); if (hashEveryMask == HASH_ONLY_EVERY) { #else @@ -663,14 +570,11 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; # endif -# ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -# else HASH_insert(cctx->hashTable, smallHash, entry); -# endif } } else { -#ifdef INSERT_BY_TAG +#endif // LDM_LAG +#if INSERT_BY_TAG U32 hashEveryMask = lowerBitsFromHfHash(hash); if (hashEveryMask == HASH_ONLY_EVERY) { #else @@ -684,33 +588,9 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { #else const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; #endif - -#ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else HASH_insert(cctx->hashTable, smallHash, entry); -#endif } - } -#else -#ifdef INSERT_BY_TAG - U32 hashEveryMask = lowerBitsFromHfHash(hash); - if (hashEveryMask == HASH_ONLY_EVERY) { -#else - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#endif - U32 smallHash = getSmallHash(hash); -#if USE_CHECKSUM - U32 checksum = getChecksum(hash); - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; -#endif -#ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else - HASH_insert(cctx->hashTable, smallHash, entry); -#endif +#if LDM_LAG } #endif @@ -812,7 +692,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U64 hash; hash_t smallHash; U32 checksum; -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG U32 hashEveryMask; #endif setNextHash(cctx); @@ -820,7 +700,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, hash = cctx->nextHash; smallHash = getSmallHash(hash); checksum = getChecksum(hash); -#ifdef INSERT_BY_TAG +#if 
INSERT_BY_TAG hashEveryMask = lowerBitsFromHfHash(hash); #endif @@ -830,7 +710,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (cctx->ip > cctx->imatchLimit) { return 1; } -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG if (hashEveryMask == HASH_ONLY_EVERY) { entry = HASH_getBestEntry(cctx, smallHash, checksum, @@ -923,10 +803,8 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); -#if LDM_LAG cctx.lagIp = cctx.ip; cctx.lagHash = cctx.lastHash; -#endif /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -1018,28 +896,8 @@ void LDM_outputConfiguration(void) { printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); printf("LDM_LAG: %d\n", LDM_LAG); printf("USE_CHECKSUM: %d\n", USE_CHECKSUM); -#ifdef INSERT_BY_TAG - printf("INSERT_BY_TAG: %d\n", 1); -#else - printf("INSERT_BY_TAG: %d\n", 0); -#endif + printf("INSERT_BY_TAG: %d\n", INSERT_BY_TAG); printf("HASH_CHAR_OFFSET: %d\n", HASH_CHAR_OFFSET); printf("=====================\n"); } -// TODO: implement and test hash function -void LDM_test(const BYTE *src) { - const U32 diff = 100; - const BYTE *pCur = src + diff; - U64 hash = getHash(pCur, LDM_HASH_LENGTH); - - for (; pCur < src + diff + 60; ++pCur) { - U64 nextHash = getHash(pCur + 1, LDM_HASH_LENGTH); - U64 updatedHash = updateHash(hash, LDM_HASH_LENGTH, - pCur[0], pCur[LDM_HASH_LENGTH]); - hash = nextHash; - printf("%llu %llu\n", nextHash, updatedHash); - } -} - - diff --git a/contrib/long_distance_matching/ldm_params.h b/contrib/long_distance_matching/ldm_params.h new file mode 100644 index 00000000..0fcd30bd --- /dev/null +++ b/contrib/long_distance_matching/ldm_params.h @@ -0,0 +1,10 @@ +#ifndef LDM_PARAMS_H +#define LDM_PARAMS_H +#define LDM_MEMORY_USAGE 23 +#define HASH_BUCKET_SIZE_LOG 3 +#define LDM_LAG 0 +#define LDM_WINDOW_SIZE_LOG 28 +#define LDM_MIN_MATCH_LENGTH 64 +#define INSERT_BY_TAG 
1 +#define USE_CHECKSUM 1 +#endif diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index cee5edba..bdd385ce 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -12,7 +12,7 @@ #include "ldm.h" #include "zstd.h" -//#define DECOMPRESS_AND_VERIFY +#define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. @@ -71,10 +71,6 @@ static int compress(const char *fname, const char *oname) { return 1; } -#ifdef TEST - LDM_test((const BYTE *)src); -#endif - gettimeofday(&tv1, NULL); compressedSize = LDM_HEADER_SIZE + @@ -111,7 +107,7 @@ static int compress(const char *fname, const char *oname) { return 0; } -#ifdef DECOMPRESS +#ifdef DECOMPRESS_AND_VERIFY /* Decompress file compressed using LDM_compress. * The input file should have the LDM_HEADER followed by payload. * Returns 0 if succesful, and an error code otherwise. From 40759bade90d1ecb1bb52b55e2c7a3399e625998 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 26 Jul 2017 13:18:53 -0700 Subject: [PATCH 57/62] Add README and clean up code --- contrib/long_distance_matching/Makefile | 11 +- contrib/long_distance_matching/README.md | 39 + .../{ldm_hash64.c => ldm.c} | 109 +-- contrib/long_distance_matching/ldm.h | 87 +- contrib/long_distance_matching/ldm_hash32.c | 813 ------------------ contrib/long_distance_matching/ldm_params.h | 4 +- contrib/long_distance_matching/main.c | 13 +- 7 files changed, 134 insertions(+), 942 deletions(-) create mode 100644 contrib/long_distance_matching/README.md rename contrib/long_distance_matching/{ldm_hash64.c => ldm.c} (90%) delete mode 100644 contrib/long_distance_matching/ldm_hash32.c diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index b1fd3a1e..8bc7ac47 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,16 +25,13 @@ LDFLAGS 
+= -lzstd default: all -all: main-64 main-integrated - -main-64: ldm_common.c ldm_hash64.c main.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -main-integrated: ldm_common.c ldm_hash32.c main.c +all: ldm + +ldm: ldm_common.c ldm.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-hash64 main-hash32 main-64 main-integrated + ldm @echo Cleaning completed diff --git a/contrib/long_distance_matching/README.md b/contrib/long_distance_matching/README.md new file mode 100644 index 00000000..d9cb0895 --- /dev/null +++ b/contrib/long_distance_matching/README.md @@ -0,0 +1,39 @@ +This is a compression algorithm focused on finding long distance matches. + +It is based upon lz4 and uses nearly the same block format (github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md). The number of bytes to encode the offset is four instead of two in lz4 to reflect the longer distance matching. The block format is described in `ldm.h`. + +### Build + +Run `make`. + +### Compressing a file + +`ldm <filename>` + +Decompression and verification can be enabled by defining `DECOMPRESS_AND_VERIFY` in `main.c`. +The output file names are as follows: +- `<filename>.ldm` : compressed file +- `<filename>.ldm.dec` : decompressed file + +### Parameters + +There are various parameters that can be tuned. These parameters can be tuned in `ldm.h` or, alternatively if `ldm_params.h` is included, in `ldm_params.h` (for easier configuration). + +The parameters are as follows and must all be defined: +- `LDM_MEMORY_USAGE` : the log (base 2) of the memory usage of the underlying hash table in bytes. +- `HASH_BUCKET_SIZE_LOG` : the log size of each bucket in the hash table (used in collision resolution). +- `LDM_LAG` : the lag (in bytes) in inserting entries into the hash table. +- `LDM_WINDOW_SIZE_LOG` : the log maximum window size when searching for matches. +- `LDM_MIN_MATCH_LENGTH` : the minimum match length.
+- `INSERT_BY_TAG` : insert entries into the hash table as a function of the hash. This increases speed by reducing the number of hash table lookups and match comparisons. Certain hashes will never be inserted. +- `USE_CHECKSUM` : store a checksum with the hash table entries for faster comparison. This halves the number of entries the hash table can contain. + +### Compression statistics + +Compression statistics (and the configuration) can be enabled/disabled via `COMPUTE_STATS` and `OUTPUT_CONFIGURATION` in `ldm.h`. + + + + + + diff --git a/contrib/long_distance_matching/ldm_hash64.c b/contrib/long_distance_matching/ldm.c similarity index 90% rename from contrib/long_distance_matching/ldm_hash64.c rename to contrib/long_distance_matching/ldm.c index 884f7b72..9a843838 100644 --- a/contrib/long_distance_matching/ldm_hash64.c +++ b/contrib/long_distance_matching/ldm.c @@ -16,21 +16,21 @@ #define LDM_HASH_ENTRY_SIZE_LOG 2 #endif +// Force the "probability" of insertion to be some value. +// An entry is inserted into the table once every HASH_ONLY_EVERY + 1 positions "on average". + //#define HASH_ONLY_EVERY_LOG 7 #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) - #define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) +#define NUM_HASH_BUCKETS_LOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) #define HASH_CHAR_OFFSET 10 -// Take first match only. +// Take the first match in the hash bucket only. //#define ZSTD_SKIP -//#define RUN_CHECKS - static const U64 prime8bytes = 11400714785074694791ULL; // Type of the small hash used to index into the hash table.
@@ -101,10 +101,6 @@ struct LDM_CCtx { const BYTE *lagIp; U64 lagHash; - -#ifdef RUN_CHECKS - const BYTE *DEBUG_setNextHash; -#endif }; struct LDM_hashTable { @@ -119,7 +115,7 @@ struct LDM_hashTable { * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. */ -LDM_hashTable *HASH_createTable(U32 size) { +static LDM_hashTable *HASH_createTable(U32 size) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; table->numEntries = size; @@ -239,7 +235,7 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, /** * Count number of bytes that match backwards before pIn and pMatch. * - * We count only bytes where pMatch > pBaes and pIn > pAnchor. + * We count only bytes where pMatch > pBase and pIn > pAnchor. */ static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { @@ -262,13 +258,12 @@ static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, * The forward match is computed from cctx->ip and entry->offset + cctx->ibase. * The backward match is computed backwards from cctx->ip and * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH. - * */ -LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, - const hash_t hash, - const U32 checksum, - U64 *pForwardMatchLength, - U64 *pBackwardMatchLength) { +static LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, + const hash_t hash, + const U32 checksum, + U64 *pForwardMatchLength, + U64 *pBackwardMatchLength) { LDM_hashTable *table = cctx->hashTable; LDM_hashEntry *bucket = getBucket(table, hash); LDM_hashEntry *cur = bucket; @@ -321,24 +316,24 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, return NULL; } -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { +/** + * Insert an entry into the hash table. 
The table uses a "circular buffer", + * with the oldest entry overwritten. + */ +static void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } -U32 HASH_getSize(const LDM_hashTable *table) { - return table->numBuckets; -} - -void HASH_destroyTable(LDM_hashTable *table) { +static void HASH_destroyTable(LDM_hashTable *table) { free(table->entries); free(table->bucketOffsets); free(table); } -void HASH_outputTableOccupancy(const LDM_hashTable *table) { +static void HASH_outputTableOccupancy(const LDM_hashTable *table) { U32 ctr = 0; LDM_hashEntry *cur = table->entries; LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); @@ -350,7 +345,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { // The number of buckets is repeated as a check for now. printf("Num buckets, bucket size: %d (2^%d), %d\n", - table->numBuckets, LDM_HASHLOG, HASH_BUCKET_SIZE); + table->numBuckets, NUM_HASH_BUCKETS_LOG, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", table->numEntries, ctr, 100.0 * (double)(ctr) / table->numEntries); @@ -418,31 +413,32 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { } /** - * Return the upper (most significant) LDM_HASHLOG bits. + * Return the upper (most significant) NUM_HASH_BUCKETS_LOG bits. */ static hash_t getSmallHash(U64 hash) { - return hash >> (64 - LDM_HASHLOG); + return hash >> (64 - NUM_HASH_BUCKETS_LOG); } /** - * Return the 32 bits after the upper LDM_HASHLOG bits. + * Return the 32 bits after the upper NUM_HASH_BUCKETS_LOG bits. 
*/ static U32 getChecksum(U64 hash) { - return (hash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; + return (hash >> (64 - 32 - NUM_HASH_BUCKETS_LOG)) & 0xFFFFFFFF; } #if INSERT_BY_TAG static U32 lowerBitsFromHfHash(U64 hash) { - // The number of bits used so far is LDM_HASHLOG + 32. - // So there are 32 - LDM_HASHLOG bits left. + // The number of bits used so far is NUM_HASH_BUCKETS_LOG + 32. + // So there are 32 - NUM_HASH_BUCKETS_LOG bits left. // Occasional hashing requires HASH_ONLY_EVERY_LOG bits. // So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits // allowing for reuse of bits. - if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) { + if (32 - NUM_HASH_BUCKETS_LOG < HASH_ONLY_EVERY_LOG) { return hash & HASH_ONLY_EVERY; } else { - // Otherwise shift by (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG) bits first. - return (hash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) & + // Otherwise shift by + // (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG) bits first. + return (hash >> (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG)) & HASH_ONLY_EVERY; } } @@ -501,17 +497,6 @@ static U64 updateHash(U64 hash, U32 len, * corresponds to cctx->nextIp - step. */ static void setNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - U64 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - cctx->nextHash = updateHash( cctx->lastHash, LDM_HASH_LENGTH, cctx->lastPosHashed[0], @@ -534,20 +519,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lagIp++; } #endif - -#ifdef RUN_CHECKS - check = getHash(cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextHash) { - printf("CHECK: setNextHash failed %llu %llu\n", check, cctx->nextHash); - } - - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. 
%zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif } static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { @@ -605,12 +576,6 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { * This requires that cctx->ip == cctx->nextPosHashed. */ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", - cctx->ip - cctx->ibase); - } -#endif putHashOfCurrentPositionFromHash(cctx, cctx->nextHash); } @@ -620,13 +585,6 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { U64 hash = getHash(cctx->ip, LDM_HASH_LENGTH); -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", - cctx->ip - cctx->ibase); - } -#endif - putHashOfCurrentPositionFromHash(cctx, hash); } @@ -664,10 +622,6 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->step = 1; // Fixed to be 1 for now. Changing may break things. cctx->nextIp = cctx->ip + cctx->step; cctx->nextPosHashed = 0; - -#ifdef RUN_CHECKS - cctx->DEBUG_setNextHash = 0; -#endif } void LDM_destroyCCtx(LDM_CCtx *cctx) { @@ -805,6 +759,7 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.lagIp = cctx.ip; cctx.lagHash = cctx.lastHash; + /** * Find a match. * If no more matches can be found (i.e. 
the length of the remaining input diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index b87a57bc..38d24015 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -2,7 +2,52 @@ #define LDM_H #include "mem.h" // from /lib/common/mem.h -#include "ldm_params.h" + +// #include "ldm_params.h" + +// ============================================================================= +// Modify the parameters in ldm_params.h if "ldm_params.h" is included. +// Otherwise, modify the parameters here. +// ============================================================================= + +#ifndef LDM_PARAMS_H + // Defines the size of the hash table. + // Note that this is not the number of buckets. + // Currently this should be less than WINDOW_SIZE_LOG + 4. + #define LDM_MEMORY_USAGE 23 + + // The number of entries in a hash bucket. + #define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. + + // Defines the lag in inserting elements into the hash table. + #define LDM_LAG 0 + + // The maximum window size when searching for matches. + // The maximum value is 30. + #define LDM_WINDOW_SIZE_LOG 28 + + // The minimum match length. + // This should be a multiple of four. + #define LDM_MIN_MATCH_LENGTH 64 + + // If INSERT_BY_TAG, insert entries into the hash table as a function of the + // hash. Certain hashes will not be inserted. + // + // Otherwise, insert as a function of the position. + #define INSERT_BY_TAG 1 + + // Store a checksum with the hash table entries for faster comparison. + // This halves the number of entries the hash table can contain. + #define USE_CHECKSUM 1 +#endif + +// Output compression statistics. +#define COMPUTE_STATS + +// Output the configuration. +#define OUTPUT_CONFIGURATION + +// ============================================================================= // The number of bytes storing the compressed and decompressed size // in the header. 
@@ -15,40 +60,9 @@ #define RUN_BITS (8-ML_BITS) #define RUN_MASK ((1U< -#include -#include -#include -#include - -#include "ldm.h" - -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) - -#define LDM_HASH_ENTRY_SIZE_LOG 3 -//#define HASH_ONLY_EVERY_LOG 7 -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) - -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - - -/* Hash table stuff. */ -#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) -#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) - -#define CHECKSUM_CHAR_OFFSET 10 - -// Take first match only. -//#define ZSTD_SKIP - -//#define RUN_CHECKS - -typedef U32 hash_t; - -typedef struct LDM_hashEntry { - U32 offset; - U32 checksum; -} LDM_hashEntry; - -struct LDM_compressStats { - U32 windowSizeLog, hashTableSizeLog; - U32 numMatches; - U64 totalMatchLength; - U64 totalLiteralLength; - U64 totalOffset; - - U32 matchLengthHistogram[32]; - - U32 minOffset, maxOffset; - - U32 offsetHistogram[32]; -}; - -typedef struct LDM_hashTable LDM_hashTable; - -struct LDM_CCtx { - U64 isize; /* Input size */ - U64 maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. 
- const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashTable *hashTable; - - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - U32 lastSum; - - const BYTE *nextIp; // TODO: this is redundant (ip + step) - const BYTE *nextPosHashed; - hash_t nextHash; /* Hash corresponding to nextPosHashed */ - U32 nextSum; - - unsigned step; // ip step, should be 1. - - const BYTE *lagIp; - hash_t lagHash; - U32 lagSum; - - U64 numHashInserts; - // DEBUG - const BYTE *DEBUG_setNextHash; -}; - -struct LDM_hashTable { - U32 numBuckets; // Number of buckets - U32 numEntries; - LDM_hashEntry *entries; - - BYTE *bucketOffsets; -}; - -/** - * Create a hash table that can contain size elements. - * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. 
- */ -LDM_hashTable *HASH_createTable(U32 size) { - LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; - table->numEntries = size; - table->entries = calloc(size, sizeof(LDM_hashEntry)); - table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); - return table; -} - -static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { - return table->entries + (hash << HASH_BUCKET_SIZE_LOG); -} - -static unsigned ZSTD_NbCommonBytes (register size_t val) { - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler 
complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} - -// From lib/compress/zstd_compress.c -static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, - const BYTE *const pInLimit) { - const BYTE * const pStart = pIn; - const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1); - - while (pIn < pInLoopLimit) { - size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); - if (!diff) { - pIn += sizeof(size_t); - pMatch += sizeof(size_t); - continue; - } - pIn += ZSTD_NbCommonBytes(diff); - return (size_t)(pIn - pStart); - } - - if (MEM_64bits()) { - if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { - pIn += 4; - pMatch += 4; - } - } - if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { - pIn += 2; - pMatch += 2; - } - if ((pIn < pInLimit) && (*pMatch == *pIn)) { - pIn++; - } - return (size_t)(pIn - pStart); -} - -/** - * Count number of bytes that match backwards before pIn and pMatch. - * - * We count only bytes where pMatch > pBaes and pIn > pAnchor. - */ -U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { - U32 matchLength = 0; - while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { - pIn--; - pMatch--; - matchLength++; - } - return matchLength; -} - -/** - * Returns a pointer to the entry in the hash table matching the hash and - * checksum with the "longest match length" as defined below. 
The forward and - * backward match lengths are written to *pForwardMatchLength and - * *pBackwardMatchLength. - * - * The match length is defined based on cctx->ip and the entry's offset. - * The forward match is computed from cctx->ip and entry->offset + cctx->ibase. - * The backward match is computed backwards from cctx->ip and - * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH. - * - */ -LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, - const hash_t hash, - const U32 checksum, - U32 *pForwardMatchLength, - U32 *pBackwardMatchLength) { - LDM_hashTable *table = cctx->hashTable; - LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur = bucket; - LDM_hashEntry *bestEntry = NULL; - U32 bestMatchLength = 0; - for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - const BYTE *pMatch = cur->offset + cctx->ibase; - - // Check checksum for faster check. - if (cur->checksum == checksum && - cctx->ip - pMatch <= LDM_WINDOW_SIZE) { - U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); - U32 backwardMatchLength, totalMatchLength; - - // For speed. 
- if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { - continue; - } - - backwardMatchLength = - countBackwardsMatch(cctx->ip, cctx->anchor, - cur->offset + cctx->ibase, - cctx->ibase); - - totalMatchLength = forwardMatchLength + backwardMatchLength; - - if (totalMatchLength >= bestMatchLength) { - bestMatchLength = totalMatchLength; - *pForwardMatchLength = forwardMatchLength; - *pBackwardMatchLength = backwardMatchLength; - - bestEntry = cur; -#ifdef ZSTD_SKIP - return cur; -#endif - } - } - } - if (bestEntry != NULL) { - return bestEntry; - } - return NULL; -} - -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry) { - *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; - table->bucketOffsets[hash]++; - table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; -} - -U32 HASH_getSize(const LDM_hashTable *table) { - return table->numBuckets; -} - -void HASH_destroyTable(LDM_hashTable *table) { - free(table->entries); - free(table->bucketOffsets); - free(table); -} - -void HASH_outputTableOccupancy(const LDM_hashTable *table) { - U32 ctr = 0; - LDM_hashEntry *cur = table->entries; - LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE); - for (; cur < end; ++cur) { - if (cur->offset == 0) { - ctr++; - } - } - - printf("Num buckets, bucket size: %d, %d\n", - table->numBuckets, HASH_BUCKET_SIZE); - printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - table->numEntries, ctr, - 100.0 * (double)(ctr) / table->numEntries); -} - -// TODO: This can be done more efficiently (but it is not that important as it -// is only used for computing stats). 
-static int intLog2(U32 x) { - int ret = 0; - while (x >>= 1) { - ret++; - } - return ret; -} - -void LDM_printCompressStats(const LDM_compressStats *stats) { - int i = 0; - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", - stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", - stats->numMatches, - stats->totalMatchLength, - 100.0 * (double)stats->totalMatchLength / - (double)(stats->totalMatchLength + stats->totalLiteralLength)); - printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("avg literal length, total literalLength: %.1f, %llu\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches, - stats->totalLiteralLength); - printf("avg offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u, %u\n", - stats->minOffset, stats->maxOffset); - - printf("\n"); - printf("offset histogram | match length histogram\n"); - printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n"); - - for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", - 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches, - 2, i, - stats->matchLengthHistogram[i], - 100.0 * (double) stats->matchLengthHistogram[i] / - (double) stats->numMatches); - } - printf("\n"); - printf("=====================\n"); -} - -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { - U32 lengthLeft = LDM_MIN_MATCH_LENGTH; - const BYTE *curIn = pIn; - const BYTE *curMatch = pMatch; - - if (pIn - pMatch > LDM_WINDOW_SIZE) { - return 0; - } - - for (; lengthLeft >= 4; lengthLeft -= 4) { - if (MEM_read32(curIn) != MEM_read32(curMatch)) { - return 0; - } - curIn += 4; - curMatch += 4; - } - return 1; -} - -hash_t 
HASH_hashU32(U32 value) { - return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); -} - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -static hash_t checksumToHash(U32 sum) { - return HASH_hashU32(sum); -} - -/** - * Computes a checksum based on rsync's checksum. - * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static U32 getChecksum(const BYTE *buf, U32 len) { - U32 i; - U32 s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. - */ -static U32 updateChecksum(U32 sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed - * based on cctx->lastSum and cctx->lastPosHashed. - * - * This uses a rolling hash and requires that the last position hashed - * corresponds to cctx->nextIp - step. 
- */ -static void setNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - U32 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - - cctx->nextSum = updateChecksum( - cctx->lastSum, LDM_HASH_LENGTH, - cctx->lastPosHashed[0], - cctx->lastPosHashed[LDM_HASH_LENGTH]); - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = checksumToHash(cctx->nextSum); - -#if LDM_LAG - if (cctx->ip - cctx->ibase > LDM_LAG) { - cctx->lagSum = updateChecksum( - cctx->lagSum, LDM_HASH_LENGTH, - cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); - cctx->lagIp++; - cctx->lagHash = checksumToHash(cctx->lagSum); - } -#endif - -#ifdef RUN_CHECKS - check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); - } - - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif -} - -static void putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 sum) { - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. - // Note: this works only when cctx->step is 1. 
- if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#if LDM_LAG - // TODO: off by 1, but whatever - if (cctx->lagIp - cctx->ibase > 0) { - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; - HASH_insert(cctx->hashTable, cctx->lagHash, entry); - } else { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; - HASH_insert(cctx->hashTable, hash, entry); - } -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; - HASH_insert(cctx->hashTable, hash, entry); -#endif - } - - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; - cctx->lastSum = sum; -} - -/** - * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed - * fields from the "next" fields. - * - * This requires that cctx->ip == cctx->nextPosHashed. - */ -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", - cctx->ip - cctx->ibase); - } -#endif - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); -} - -/** - * Insert hash of the current position into the hash table. 
- */ -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); - hash_t hash = checksumToHash(sum); - -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", - cctx->ip - cctx->ibase); - } -#endif - - putHashOfCurrentPositionFromHash(cctx, hash, sum); -} - -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); - - cctx->stats.minOffset = UINT_MAX; - cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; - cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - - - cctx->lastPosHashed = NULL; - - cctx->step = 1; // Fixed to be 1 for now. Changing may break things. - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - cctx->DEBUG_setNextHash = 0; -} - -void LDM_destroyCCtx(LDM_CCtx *cctx) { - HASH_destroyTable(cctx->hashTable); -} - -/** - * Finds the "best" match. - * - * Returns 0 if successful and 1 otherwise (i.e. no match can be found - * in the remaining input that is long enough). - * - * forwardMatchLength contains the forward length of the match. 
- */ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U32 *forwardMatchLength, U32 *backwardMatchLength) { - - LDM_hashEntry *entry = NULL; - cctx->nextIp = cctx->ip + cctx->step; - - while (entry == NULL) { - hash_t h; - U32 sum; - setNextHash(cctx); - h = cctx->nextHash; - sum = cctx->nextSum; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - entry = HASH_getBestEntry(cctx, h, sum, - forwardMatchLength, backwardMatchLength); - - if (entry != NULL) { - *match = entry->offset + cctx->ibase; - } - putHashOfCurrentPositionFromHash(cctx, h, sum); - } - setNextHash(cctx); - return 0; -} - -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - U64 len = (U64)literalLength - RUN_MASK; - *pToken = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *pToken = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; -} - -void LDM_outputBlock(LDM_CCtx *cctx, - const U64 literalLength, - const U32 offset, - const U64 matchLength) { - BYTE *pToken = cctx->op++; - - /* Encode the literal length and literals. */ - LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); - - /* Encode the offset. */ - MEM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode the match length. 
*/ - if (matchLength >= ML_MASK) { - unsigned matchLengthRemaining = matchLength; - *pToken += ML_MASK; - matchLengthRemaining -= ML_MASK; - MEM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - MEM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *pToken += (BYTE)(matchLength); - } -} - -// TODO: maxDstSize is unused. This function may seg fault when writing -// beyond the size of dst, as it does not check maxDstSize. Writing to -// a buffer and performing checks is a possible solution. -// -// This is based upon lz4. -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - const BYTE *match = NULL; - U32 forwardMatchLength = 0; - U32 backwardsMatchLength = 0; - - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - LDM_outputConfiguration(); - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - -#if LDM_LAG - cctx.lagIp = cctx.ip; - cctx.lagHash = cctx.lastHash; - cctx.lagSum = cctx.lastSum; -#endif - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, - &backwardsMatchLength) == 0) { -#ifdef COMPUTE_STATS - cctx.stats.numMatches++; -#endif - - cctx.ip -= backwardsMatchLength; - match -= backwardsMatchLength; - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. 
- */ - { - const U64 literalLength = cctx.ip - cctx.anchor; - const U32 offset = cctx.ip - match; - const U64 matchLength = forwardMatchLength + - backwardsMatchLength - - LDM_MIN_MATCH_LENGTH; - - LDM_outputBlock(&cctx, literalLength, offset, matchLength); - -#ifdef COMPUTE_STATS - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; - cctx.stats.minOffset = - offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; - cctx.stats.maxOffset = - offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; - cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; - cctx.stats.matchLengthHistogram[ - (U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++; -#endif - - // Move ip to end of block, inserting hashes at each position. - cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + - matchLength + literalLength) { - if (cctx.ip > cctx.lastPosHashed) { - // TODO: Simplify. - LDM_updateLastHashFromNextHash(&cctx); - setNextHash(&cctx); - } - cctx.ip++; - cctx.nextIp++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); - } - - /* Encode the last literals (no more matches). 
*/ - { - const U32 lastRun = cctx.iend - cctx.anchor; - BYTE *pToken = cctx.op++; - LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); - } - -#ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats); - HASH_outputTableOccupancy(cctx.hashTable); -#endif - - { - const size_t ret = cctx.op - cctx.obase; - LDM_destroyCCtx(&cctx); - return ret; - } -} - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - - - diff --git a/contrib/long_distance_matching/ldm_params.h b/contrib/long_distance_matching/ldm_params.h index 0fcd30bd..a541581b 100644 --- a/contrib/long_distance_matching/ldm_params.h +++ b/contrib/long_distance_matching/ldm_params.h @@ -1,5 +1,6 @@ #ifndef LDM_PARAMS_H #define LDM_PARAMS_H + #define LDM_MEMORY_USAGE 23 #define HASH_BUCKET_SIZE_LOG 3 #define LDM_LAG 0 @@ -7,4 +8,5 @@ #define LDM_MIN_MATCH_LENGTH 64 #define INSERT_BY_TAG 1 #define USE_CHECKSUM 1 -#endif + +#endif // LDM_PARAMS_H diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index bdd385ce..d55e01d3 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -12,11 +12,13 @@ #include "ldm.h" #include "zstd.h" -#define DECOMPRESS_AND_VERIFY +// #define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * + * This adds a header from LDM_writeHeader to the beginning of the output. 
+ * * This might seg fault if the compressed size is > the decompress * size due to the mmapping and output file size allocated to be the input size * The compress function should check before writing or buffer writes. @@ -52,7 +54,7 @@ static int compress(const char *fname, const char *oname) { maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); // Handle case where compressed size is > decompressed size. - // The compress function should check before writing or buffer writes. + // TODO: The compress function should check before writing or buffer writes. maxCompressedSize += statbuf.st_size / 255; ftruncate(fdout, maxCompressedSize); @@ -64,7 +66,7 @@ static int compress(const char *fname, const char *oname) { return 1; } - /* mmap the output file */ + /* mmap the output file. */ if ((dst = mmap(0, maxCompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0)) == (caddr_t) - 1) { perror("mmap error for output"); @@ -79,14 +81,12 @@ static int compress(const char *fname, const char *oname) { gettimeofday(&tv2, NULL); - // Write compress and decompress size to header - // TODO: should depend on LDM_DECOMPRESS_SIZE write32 + // Write the header. LDM_writeHeader(dst, compressedSize, statbuf.st_size); // Truncate file to compressedSize. ftruncate(fdout, compressedSize); - printf("%25s : %10lu -> %10lu - %s \n", fname, (size_t)statbuf.st_size, (size_t)compressedSize, oname); printf("Compression ratio: %.2fx --- %.1f%%\n", @@ -100,7 +100,6 @@ static int compress(const char *fname, const char *oname) { timeTaken, ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken); - // Close files. 
close(fdin); close(fdout); From 9eaf3d22d0818fc977b7df6af4ce78654613a97e Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Wed, 26 Jul 2017 16:43:25 -0700 Subject: [PATCH 58/62] Allow HASH_ONLY_EVERY_LOG to be configured in ldm.h --- contrib/long_distance_matching/Makefile | 2 +- contrib/long_distance_matching/ldm.c | 6 +++--- contrib/long_distance_matching/ldm.h | 11 +++++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 8bc7ac47..4193cb32 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -32,6 +32,6 @@ ldm: ldm_common.c ldm.c main.c clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - ldm + ldm @echo Cleaning completed diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 9a843838..25bf5c83 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -16,11 +16,11 @@ #define LDM_HASH_ENTRY_SIZE_LOG 2 #endif -// Force the "probability" of insertion to be some value. // Entries are inserted into the table HASH_ONLY_EVERY + 1 times "on average". 
+#ifndef HASH_ONLY_EVERY_LOG + #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) +#endif -//#define HASH_ONLY_EVERY_LOG 7 -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) #define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 38d24015..af35130e 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -3,7 +3,7 @@ #include "mem.h" // from /lib/common/mem.h -// #include "ldm_params.h" +//#include "ldm_params.h" // ============================================================================= // Modify the parameters in ldm_params.h if "ldm_params.h" is included. @@ -23,7 +23,7 @@ #define LDM_LAG 0 // The maximum window size when searching for matches. - // The maximum value is 30. + // The maximum value is 30 #define LDM_WINDOW_SIZE_LOG 28 // The minimum match length. @@ -47,6 +47,13 @@ // Output the configuration. #define OUTPUT_CONFIGURATION +// If defined, forces the probability of insertion to be approximately +// one per (1 << HASH_ONLY_EVERY_LOG). If not defined, the probability will be +// calculated based on the memory usage and window size for "even" insertion +// throughout the window. 
+ +// #define HASH_ONLY_EVERY_LOG 8 + // ============================================================================= // The number of bytes storing the compressed and decompressed size From c105f605e66bd3bd985ceeaa660e7159e4d16298 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 27 Jul 2017 11:11:35 -0700 Subject: [PATCH 59/62] Update README --- contrib/long_distance_matching/README.md | 75 ++++++++++++++++++++++-- contrib/long_distance_matching/ldm.c | 22 ------- contrib/long_distance_matching/ldm.h | 3 + 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/contrib/long_distance_matching/README.md b/contrib/long_distance_matching/README.md index d9cb0895..e67bba71 100644 --- a/contrib/long_distance_matching/README.md +++ b/contrib/long_distance_matching/README.md @@ -28,12 +28,75 @@ The parameters are as follows and must all be defined: - `INSERT_BY_TAG` : insert entries into the hash table as a function of the hash. This increases speed by reducing the number of hash table lookups and match comparisons. Certain hashes will never be inserted. - `USE_CHECKSUM` : store a checksum with the hash table entries for faster comparison. This halves the number of entries the hash table can contain. +The optional parameter `HASH_ONLY_EVERY_LOG` is the log inverse frequency of insertion into the hash table. That is, an entry is inserted approximately every `1 << HASH_ONLY_EVERY_LOG` times. If this parameter is not defined, the value is computed as a function of the window size and memory usage to approximate a even coverage of the window. + + +### Benchmark + +Below is a comparison of various compression methods on a tar of four versions of llvm (versions `3.9.0`, `3.9.1`, `4.0.0`, `4.0.1`) with a total size of `727900160` B. 
+ +| Method | Size | Ratio | +|:---|---:|---:| +|lrzip -p 32 -n -w 1 | `369968714` | `1.97`| +|ldm | `209391361` | `3.48`| +|lz4 | `189954338` | `3.83`| +|lrzip -p 32 -l -w 1 | `163940343` | `4.44`| +|zstd -1 | `126080293` | `5.77`| +|lrzip -p 32 -n | `124821009` | `5.83`| +|lrzip -p 32 -n -w 1 & zstd -1 | `120317909` | `6.05`| +|zstd -3 -o | `115290952` | `6.31`| +|lrzip -p 32 -g -L 9 -w 1 | `107168979` | `6.79`| +|zstd -6 -o | `102772098` | `7.08`| +|zstd -T16 -9 | `98040470` | `7.42`| +|lrzip -p 32 -n -w 1 & zstd -T32 -19 | `88050289` | `8.27`| +|zstd -T32 -19 | `83626098` | `8.70`| +|lrzip -p 32 -n & zstd -1 | `36335117` | `20.03`| +|ldm & zstd -6 | `32856232` | `22.15`| +|lrzip -p 32 -g -L 9 | `32243594` | `22.58`| +|lrzip -p 32 -n & zstd -6 | `30954572` | `23.52`| +|lrzip -p 32 -n & zstd -T32 -19 | `26472064` | `27.50`| + +The method marked `ldm` was run with the following parameters: + +| Parameter | Value | +|:---|---:| +| `LDM_MEMORY_USAGE` | `23`| +|`HASH_BUCKET_SIZE_LOG` | `3`| +|`LDM_LAG` | `0`| +|`LDM_WINDOW_SIZE_LOG` | `28`| +|`LDM_MIN_MATCH_LENGTH`| `64`| +|`INSERT_BY_TAG` | `1`| +|`USE_CHECKSUM` | `1`| + +The compression speed was `220.5 MB/s`. + +### Parameter selection + +Below is a brief discussion of the effects of the parameters on the speed and compression ratio. + +#### Speed + +A large bottleneck in terms of speed is finding the matches and comparing to see if they are greater than the minimum match length. Generally: +- The fewer matches found (or the lower the percentage of the literals matched), the slower the algorithm will behave. +- Increasing `HASH_ONLY_EVERY_LOG` results in fewer inserts and, if `INSERT_BY_TAG` is set, fewer lookups in the table. This has a large effect on speed, as well as compression ratio. +- If `HASH_ONLY_EVERY_LOG` is not set, its value is calculated based on `LDM_WINDOW_SIZE_LOG` and `LDM_MEMORY_USAGE`. 
Increasing `LDM_WINDOW_SIZE_LOG` has the effect of increasing `HASH_ONLY_EVERY_LOG` and increasing `LDM_MEMORY_USAGE` decreases `HASH_ONLY_EVERY_LOG`. +- `USE_CHECKSUM` generally improves speed with hash table lookups. + +#### Compression ratio + +The compression ratio is highly correlated with the coverage of matches. As a long distance matcher, the algorithm was designed to "optimize" for long distance matches outside the zstd compression window. The compression ratio after recompressing the output of the long-distance matcher with zstd was a more important signal in development than the raw compression ratio itself. + +Generally, increasing `LDM_MEMORY_USAGE` will improve the compression ratio. However when using the default computed value of `HASH_ONLY_EVERY_LOG`, this increases the frequency of insertion and lookup in the table and thus may result in a decrease in speed. + +Below is a table showing the speed and compression ratio when compressing the llvm tar (as described above) using different settings for `LDM_MEMORY_USAGE`. The other parameters were the same as used in the benchmark above. + +| `LDM_MEMORY_USAGE` | Ratio | Speed (MB/s) | Ratio after zstd -6 | +|---:| ---: | ---: | ---: | +| `18` | `1.85` | `232.4` | `10.92` | +| `21` | `2.79` | `233.9` | `15.92` | +| `23` | `3.48` | `220.5` | `18.29` | +| `25` | `4.56` | `140.8` | `19.21` | + ### Compression statistics Compression statistics (and the configuration) can be enabled/disabled via `COMPUTE_STATS` and `OUTPUT_CONFIGURATION` in `ldm.h`. 
- - - - - - diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 25bf5c83..ff9d94d0 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -58,9 +58,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; U32 offsetHistogram[32]; - - U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; - U64 TMP_totalHashCount; }; typedef struct LDM_hashTable LDM_hashTable; @@ -398,17 +395,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); -#if INSERT_BY_TAG -/* - printf("Lower bit distribution\n"); - for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) { - printf("%5d %5llu %6.3f\n", i, stats->TMP_hashCount[i], - 100.0 * (double) stats->TMP_hashCount[i] / - (double) stats->TMP_totalHashCount); - } -*/ -#endif - printf("=====================\n"); } @@ -503,14 +489,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#if INSERT_BY_TAG - { - U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextHash); - cctx->stats.TMP_totalHashCount++; - cctx->stats.TMP_hashCount[hashEveryMask]++; - } -#endif - #if LDM_LAG if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagHash = updateHash( diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index af35130e..456ec5aa 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -71,6 +71,9 @@ #define LDM_OFFSET_SIZE 4 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) + +// TODO: Match lengths that are too small do not use the hash table efficiently. +// There should be a minimum hash length given the hash table size. 
#define LDM_HASH_LENGTH LDM_MIN_MATCH_LENGTH typedef struct LDM_compressStats LDM_compressStats; From 627621839cf39310793fc5c2358985929497d42b Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 27 Jul 2017 15:37:37 -0700 Subject: [PATCH 60/62] Add checks in initialization code --- contrib/long_distance_matching/ldm.c | 33 ++++++++++++++++----- contrib/long_distance_matching/ldm.h | 9 ++++-- contrib/long_distance_matching/ldm_common.c | 20 +++++++++---- contrib/long_distance_matching/main.c | 18 ++++++++--- 4 files changed, 61 insertions(+), 19 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index ff9d94d0..c2cdb21e 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -111,13 +111,25 @@ struct LDM_hashTable { /** * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. + * + * Returns NULL if table creation failed. */ static LDM_hashTable *HASH_createTable(U32 size) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + if (!table) return NULL; + table->numBuckets = size >> HASH_BUCKET_SIZE_LOG; table->numEntries = size; table->entries = calloc(size, sizeof(LDM_hashEntry)); table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); + + if (!table->entries || !table->bucketOffsets) { + free(table->bucketOffsets); + free(table->entries); + free(table); + return NULL; + } + return table; } @@ -566,9 +578,9 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash); } -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { +size_t LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize) { cctx->isize = srcSize; cctx->maxOSize = maxDstSize; @@ -590,16 +602,20 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, #else cctx->hashTable = 
HASH_createTable(LDM_HASHTABLESIZE_U32); #endif + + if (!cctx->hashTable) return 1; + cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - cctx->lastPosHashed = NULL; cctx->step = 1; // Fixed to be 1 for now. Changing may break things. cctx->nextIp = cctx->ip + cctx->step; cctx->nextPosHashed = 0; + + return 0; } void LDM_destroyCCtx(LDM_CCtx *cctx) { @@ -726,7 +742,10 @@ size_t LDM_compress(const void *src, size_t srcSize, U64 forwardMatchLength = 0; U64 backwardsMatchLength = 0; - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); + if (LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize)) { + // Initialization failed. + return 0; + } #ifdef OUTPUT_CONFIGURATION LDM_outputConfiguration(); @@ -744,8 +763,8 @@ size_t LDM_compress(const void *src, size_t srcSize, * is less than the minimum match length), then stop searching for matches * and encode the final literals. */ - while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, - &backwardsMatchLength) == 0) { + while (!LDM_findBestMatch(&cctx, &match, &forwardMatchLength, + &backwardsMatchLength)) { #ifdef COMPUTE_STATS cctx.stats.numMatches++; diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 456ec5aa..4adadbd0 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -82,6 +82,7 @@ typedef struct LDM_DCtx LDM_DCtx; /** * Compresses src into dst. + * Returns the compressed size if successful, 0 otherwise. * * NB: This currently ignores maxDstSize and assumes enough space is available. * @@ -113,10 +114,12 @@ size_t LDM_compress(const void *src, size_t srcSize, * Initialize the compression context. * * Allocates memory for the hash table. + * + * Returns 0 if successful, 1 otherwise. 
*/ -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize); +size_t LDM_initializeCCtx(LDM_CCtx *cctx, + const void *src, size_t srcSize, + void *dst, size_t maxDstSize); /** * Frees up memory allocated in LDM_initializeCCtx(). diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c index 26b716a1..8b34f8ad 100644 --- a/contrib/long_distance_matching/ldm_common.c +++ b/contrib/long_distance_matching/ldm_common.c @@ -2,19 +2,29 @@ #include "ldm.h" +/** + * This function reads the header at the beginning of src and writes + * the compressed and decompressed size to compressedSize and + * decompressedSize. + * + * The header consists of 16 bytes: 8 bytes each in little-endian format + * of the compressed size and the decompressed size. + */ void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize) { const BYTE *ip = (const BYTE *)src; *compressedSize = MEM_readLE64(ip); - ip += sizeof(U64); - *decompressedSize = MEM_readLE64(ip); - // ip += sizeof(U64); + *decompressedSize = MEM_readLE64(ip + 8); } +/** + * Writes the 16-byte header (8-bytes each of the compressedSize and + * decompressedSize in little-endian format) to memPtr. + */ void LDM_writeHeader(void *memPtr, U64 compressedSize, U64 decompressedSize) { - MEM_write64(memPtr, compressedSize); - MEM_write64((BYTE *)memPtr + 8, decompressedSize); + MEM_writeLE64(memPtr, compressedSize); + MEM_writeLE64((BYTE *)memPtr + 8, decompressedSize); } struct LDM_DCtx { diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index d55e01d3..72af5404 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -12,7 +12,7 @@ #include "ldm.h" #include "zstd.h" -// #define DECOMPRESS_AND_VERIFY +#define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. 
@@ -186,9 +186,18 @@ static int compare(FILE *fp0, FILE *fp1) { } /* Verify the input file is the same as the decompressed file. */ -static void verify(const char *inpFilename, const char *decFilename) { - FILE *inpFp = fopen(inpFilename, "rb"); - FILE *decFp = fopen(decFilename, "rb"); +static int verify(const char *inpFilename, const char *decFilename) { + FILE *inpFp, *decFp; + + if ((inpFp = fopen(inpFilename, "rb")) == NULL) { + perror("Could not open input file\n"); + return 1; + } + + if ((decFp = fopen(decFilename, "rb")) == NULL) { + perror("Could not open decompressed file\n"); + return 1; + } printf("verify : %s <-> %s\n", inpFilename, decFilename); { @@ -202,6 +211,7 @@ static void verify(const char *inpFilename, const char *decFilename) { fclose(decFp); fclose(inpFp); + return 0; } #endif From 1294a4a897d696e9a1f999f465527de31479596d Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 27 Jul 2017 15:49:46 -0700 Subject: [PATCH 61/62] Fix typo --- contrib/long_distance_matching/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/long_distance_matching/README.md b/contrib/long_distance_matching/README.md index e67bba71..771a6c3c 100644 --- a/contrib/long_distance_matching/README.md +++ b/contrib/long_distance_matching/README.md @@ -1,6 +1,6 @@ This is a compression algorithm focused on finding long distance matches. -It is based upon lz4 and uses nearly the same block format (github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md). The number of bytes to encode the offset is four instead of two in lz4 to reflect the longer distance matching. The block format is descriped in `ldm.h`. +It is based upon lz4 and uses nearly the same block format (github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md). The number of bytes to encode the offset is four instead of two in lz4 to reflect the longer distance matching. The block format is described in `ldm.h`. 
### Build @@ -28,7 +28,7 @@ The parameters are as follows and must all be defined: - `INSERT_BY_TAG` : insert entries into the hash table as a function of the hash. This increases speed by reducing the number of hash table lookups and match comparisons. Certain hashes will never be inserted. - `USE_CHECKSUM` : store a checksum with the hash table entries for faster comparison. This halves the number of entries the hash table can contain. -The optional parameter `HASH_ONLY_EVERY_LOG` is the log inverse frequency of insertion into the hash table. That is, an entry is inserted approximately every `1 << HASH_ONLY_EVERY_LOG` times. If this parameter is not defined, the value is computed as a function of the window size and memory usage to approximate a even coverage of the window. +The optional parameter `HASH_ONLY_EVERY_LOG` is the log inverse frequency of insertion into the hash table. That is, an entry is inserted approximately every `1 << HASH_ONLY_EVERY_LOG` times. If this parameter is not defined, the value is computed as a function of the window size and memory usage to approximate an even coverage of the window. ### Benchmark From 8fae41c412d99c8069ea1bd98ffcf50e2c99cd1e Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Thu, 27 Jul 2017 17:14:05 -0700 Subject: [PATCH 62/62] Return error code in verify() and minor code cleanup --- contrib/long_distance_matching/ldm.c | 50 ++++++++++++++------------- contrib/long_distance_matching/main.c | 12 ++++--- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index c2cdb21e..4dccd0bf 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -108,6 +108,12 @@ struct LDM_hashTable { BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. 
}; +static void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table->bucketOffsets); + free(table); +} + /** * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. @@ -124,9 +130,7 @@ static LDM_hashTable *HASH_createTable(U32 size) { table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); if (!table->entries || !table->bucketOffsets) { - free(table->bucketOffsets); - free(table->entries); - free(table); + HASH_destroyTable(table); return NULL; } @@ -275,13 +279,13 @@ static LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, U64 *pBackwardMatchLength) { LDM_hashTable *table = cctx->hashTable; LDM_hashEntry *bucket = getBucket(table, hash); - LDM_hashEntry *cur = bucket; + LDM_hashEntry *cur; LDM_hashEntry *bestEntry = NULL; U64 bestMatchLength = 0; #if !(USE_CHECKSUM) (void)checksum; #endif - for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + for (cur = bucket; cur < bucket + HASH_BUCKET_SIZE; ++cur) { const BYTE *pMatch = cur->offset + cctx->ibase; // Check checksum for faster check. @@ -336,12 +340,6 @@ static void HASH_insert(LDM_hashTable *table, table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } -static void HASH_destroyTable(LDM_hashTable *table) { - free(table->entries); - free(table->bucketOffsets); - free(table); -} - static void HASH_outputTableOccupancy(const LDM_hashTable *table) { U32 ctr = 0; LDM_hashEntry *cur = table->entries; @@ -360,8 +358,9 @@ static void HASH_outputTableOccupancy(const LDM_hashTable *table) { 100.0 * (double)(ctr) / table->numEntries); } -// TODO: This can be done more efficiently (but it is not that important as it -// is only used for computing stats). +// TODO: This can be done more efficiently, for example by using builtin +// functions (but it is not that important as it is only used for computing +// stats). 
static int intLog2(U64 x) { int ret = 0; while (x >>= 1) { @@ -371,7 +370,6 @@ static int intLog2(U64 x) { } void LDM_printCompressStats(const LDM_compressStats *stats) { - int i = 0; printf("=====================\n"); printf("Compression statistics\n"); printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", @@ -395,16 +393,20 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("offset histogram | match length histogram\n"); printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n"); - for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", - 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches, - 2, i, - stats->matchLengthHistogram[i], - 100.0 * (double) stats->matchLengthHistogram[i] / - (double) stats->numMatches); + { + int i; + int logMaxOffset = intLog2(stats->maxOffset); + for (i = 0; i <= logMaxOffset; i++) { + printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n", + 2, i, + stats->offsetHistogram[i], + 100.0 * (double) stats->offsetHistogram[i] / + (double) stats->numMatches, + 2, i, + stats->matchLengthHistogram[i], + 100.0 * (double) stats->matchLengthHistogram[i] / + (double) stats->numMatches); + } } printf("\n"); printf("=====================\n"); diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index 72af5404..7c7086a5 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -12,7 +12,7 @@ #include "ldm.h" #include "zstd.h" -#define DECOMPRESS_AND_VERIFY +// #define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. 
@@ -206,6 +206,7 @@ static int verify(const char *inpFilename, const char *decFilename) { printf("verify : OK\n"); } else { printf("verify : NG\n"); + return 1; } } @@ -239,7 +240,7 @@ int main(int argc, const char *argv[]) { /* Compress */ { if (compress(inpFilename, ldmFilename)) { - printf("Compress error"); + printf("Compress error\n"); return 1; } } @@ -250,7 +251,7 @@ int main(int argc, const char *argv[]) { struct timeval tv1, tv2; gettimeofday(&tv1, NULL); if (decompress(ldmFilename, decFilename)) { - printf("Decompress error"); + printf("Decompress error\n"); return 1; } gettimeofday(&tv2, NULL); @@ -259,7 +260,10 @@ int main(int argc, const char *argv[]) { (double) (tv2.tv_sec - tv1.tv_sec)); } /* verify */ - verify(inpFilename, decFilename); + if (verify(inpFilename, decFilename)) { + printf("Verification error\n"); + return 1; + } #endif return 0; }