Add dictionary random access example
This commit is contained in:
parent
bd88e4007b
commit
94917c9a04
1
examples/.gitignore
vendored
1
examples/.gitignore
vendored
@ -1,6 +1,7 @@
|
||||
/Makefile.lz4*
|
||||
/printVersion
|
||||
/doubleBuffer
|
||||
/dictionaryRandomAccess
|
||||
/ringBuffer
|
||||
/ringBufferHC
|
||||
/lineCompress
|
||||
|
@ -48,7 +48,7 @@ endif
|
||||
|
||||
default: all
|
||||
|
||||
all: printVersion doubleBuffer ringBuffer ringBufferHC lineCompress frameCompress
|
||||
all: printVersion doubleBuffer dictionaryRandomAccess ringBuffer ringBufferHC lineCompress frameCompress
|
||||
|
||||
printVersion: $(LZ4DIR)/lz4.c printVersion.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||
@ -56,6 +56,9 @@ printVersion: $(LZ4DIR)/lz4.c printVersion.c
|
||||
doubleBuffer: $(LZ4DIR)/lz4.c blockStreaming_doubleBuffer.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||
|
||||
dictionaryRandomAccess: $(LZ4DIR)/lz4.c dictionaryRandomAccess.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||
|
||||
ringBuffer : $(LZ4DIR)/lz4.c blockStreaming_ringBuffer.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||
|
||||
@ -66,7 +69,7 @@ lineCompress: $(LZ4DIR)/lz4.c blockStreaming_lineByLine.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||
|
||||
frameCompress: frameCompress.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT) -L$(LZ4DIR) -llz4
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT) $(LZ4DIR)/liblz4.a
|
||||
|
||||
compressFunctions: $(LZ4DIR)/lz4.c compress_functions.c
|
||||
$(CC) $(FLAGS) $^ -o $@$(EXT) -lrt
|
||||
@ -77,15 +80,16 @@ simpleBuffer: $(LZ4DIR)/lz4.c simple_buffer.c
|
||||
test : all
|
||||
./printVersion$(EXT)
|
||||
./doubleBuffer$(EXT) $(TESTFILE)
|
||||
./dictionaryRandomAccess$(EXT) $(TESTFILE) $(TESTFILE) 1100 1400
|
||||
./ringBuffer$(EXT) $(TESTFILE)
|
||||
./ringBufferHC$(EXT) $(TESTFILE)
|
||||
./lineCompress$(EXT) $(TESTFILE)
|
||||
LD_LIBRARY_PATH=$(LZ4DIR) ./frameCompress$(EXT) $(TESTFILE)
|
||||
./frameCompress$(EXT) $(TESTFILE)
|
||||
$(LZ4) -vt $(TESTFILE).lz4
|
||||
|
||||
clean:
|
||||
@rm -f core *.o *.dec *-0 *-9 *-8192 *.lz4s *.lz4 \
|
||||
printVersion$(EXT) doubleBuffer$(EXT) ringBuffer$(EXT) ringBufferHC$(EXT) \
|
||||
lineCompress$(EXT) frameCompress$(EXT) compressFunctions$(EXT) simpleBuffer$(EXT)
|
||||
printVersion$(EXT) doubleBuffer$(EXT) dictionaryRandomAccess$(EXT) \
|
||||
ringBuffer$(EXT) ringBufferHC$(EXT) lineCompress$(EXT) frameCompress$(EXT) \
|
||||
compressFunctions$(EXT) simpleBuffer$(EXT)
|
||||
@echo Cleaning completed
|
||||
|
||||
|
@ -8,3 +8,4 @@ All examples are GPL-v2 licensed.
|
||||
- Examples
|
||||
- [Double Buffer](blockStreaming_doubleBuffer.md)
|
||||
- [Line by Line Text Compression](blockStreaming_lineByLine.md)
|
||||
- [Dictionary Random Access](dictionaryRandomAccess.md)
|
||||
|
280
examples/dictionaryRandomAccess.c
Normal file
280
examples/dictionaryRandomAccess.c
Normal file
@ -0,0 +1,280 @@
|
||||
// LZ4 API example : Dictionary Random Access
|
||||
|
||||
#ifdef _MSC_VER /* Visual Studio */
|
||||
# define _CRT_SECURE_NO_WARNINGS
|
||||
# define snprintf sprintf_s
|
||||
#endif
|
||||
#include "lz4.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Smaller of x and y. The whole expansion is parenthesized so the macro can be
 * embedded in a larger expression (e.g. `MIN(a, b) * 2`). Each argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
|
||||
/* Sizing constants for this example. */
enum {
    MAX_BLOCKS       = 1024,  /* Capacity of the block offset arrays; kept small for simplicity */
    DICTIONARY_BYTES = 1024,  /* At most 1 KiB of dictionary is loaded */
    BLOCK_BYTES      = 1024   /* Each block holds 1 KiB of uncompressed data */
};
|
||||
|
||||
/**
 * Signature bytes written at the very start of the compressed file.
 * "TEST" is a weak choice because the word is common in ASCII text, but a
 * file format should always carry some marker that identifies and versions it.
 * The array holds exactly four bytes; there is no terminating NUL.
 */
const char kTestMagic[4] = { 'T', 'E', 'S', 'T' };
|
||||
|
||||
|
||||
/* Write one int to fp in the host's in-memory representation.
 * Terminates the process with exit status 10 if the write fails. */
void write_int(FILE* fp, int i) {
    if (fwrite(&i, sizeof i, 1, fp) != 1) {
        exit(10);
    }
}
|
||||
|
||||
/* Write arrayBytes raw bytes from array to fp.
 * Terminates the process with exit status 11 on a short write. */
void write_bin(FILE* fp, const void* array, size_t arrayBytes) {
    const size_t done = fwrite(array, 1, arrayBytes, fp);
    if (done == arrayBytes) {
        return;
    }
    exit(11);
}
|
||||
|
||||
/* Read one int (host representation) from fp into *i.
 * Terminates the process with exit status 12 if a full int cannot be read. */
void read_int(FILE* fp, int* i) {
    if (fread(i, sizeof *i, 1, fp) != 1) {
        exit(12);
    }
}
|
||||
|
||||
/* Read up to arrayBytes raw bytes from fp into array and return how many were
 * actually read; a short count (including 0) at end of file is not an error.
 * Terminates the process with exit status 12 if the stream reports an error. */
size_t read_bin(FILE* fp, void* array, size_t arrayBytes) {
    const size_t got = fread(array, 1, arrayBytes, fp);
    if (ferror(fp) != 0) {
        exit(12);
    }
    return got;
}
|
||||
|
||||
/* Reposition fp with fseek(fp, offset, origin).
 * Terminates the process with exit status 14 if the seek fails. */
void seek_bin(FILE* fp, long offset, int origin) {
    const int failed = fseek(fp, offset, origin);
    if (failed) {
        exit(14);
    }
}
|
||||
|
||||
|
||||
void test_compress(FILE* outFp, FILE* inpFp, void *dict, int dictSize)
|
||||
{
|
||||
LZ4_stream_t lz4Stream_body;
|
||||
LZ4_stream_t* lz4Stream = &lz4Stream_body;
|
||||
|
||||
char inpBuf[BLOCK_BYTES];
|
||||
int offsets[MAX_BLOCKS];
|
||||
int *offsetsEnd = offsets;
|
||||
|
||||
|
||||
LZ4_resetStream(lz4Stream);
|
||||
|
||||
/* Write header magic */
|
||||
write_bin(outFp, kTestMagic, sizeof(kTestMagic));
|
||||
|
||||
*offsetsEnd++ = sizeof(kTestMagic);
|
||||
/* Write compressed data blocks. Each block contains BLOCK_BYTES of plain
|
||||
data except possibly the last. */
|
||||
for(;;) {
|
||||
const int inpBytes = (int) read_bin(inpFp, inpBuf, BLOCK_BYTES);
|
||||
if(0 == inpBytes) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* Forget previously compressed data and load the dictionary */
|
||||
LZ4_loadDict(lz4Stream, dict, dictSize);
|
||||
{
|
||||
char cmpBuf[LZ4_COMPRESSBOUND(BLOCK_BYTES)];
|
||||
const int cmpBytes = LZ4_compress_fast_continue(
|
||||
lz4Stream, inpBuf, cmpBuf, inpBytes, sizeof(cmpBuf), 1);
|
||||
if(cmpBytes <= 0) { exit(1); }
|
||||
write_bin(outFp, cmpBuf, (size_t)cmpBytes);
|
||||
/* Keep track of the offsets */
|
||||
*offsetsEnd = *(offsetsEnd - 1) + cmpBytes;
|
||||
++offsetsEnd;
|
||||
}
|
||||
if (offsetsEnd - offsets > MAX_BLOCKS) { exit(2); }
|
||||
}
|
||||
/* Write the tailing jump table */
|
||||
{
|
||||
int *ptr = offsets;
|
||||
while (ptr != offsetsEnd) {
|
||||
write_int(outFp, *ptr++);
|
||||
}
|
||||
write_int(outFp, offsetsEnd - offsets);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_decompress(FILE* outFp, FILE* inpFp, void *dict, int dictSize, int offset, int length)
|
||||
{
|
||||
LZ4_streamDecode_t lz4StreamDecode_body;
|
||||
LZ4_streamDecode_t* lz4StreamDecode = &lz4StreamDecode_body;
|
||||
|
||||
/* The blocks [currentBlock, endBlock) contain the data we want */
|
||||
int currentBlock = offset / BLOCK_BYTES;
|
||||
int endBlock = ((offset + length - 1) / BLOCK_BYTES) + 1;
|
||||
|
||||
char decBuf[BLOCK_BYTES];
|
||||
int offsets[MAX_BLOCKS];
|
||||
|
||||
/* Special cases */
|
||||
if (length == 0) { return; }
|
||||
|
||||
/* Read the magic bytes */
|
||||
{
|
||||
char magic[sizeof(kTestMagic)];
|
||||
size_t read = read_bin(inpFp, magic, sizeof(magic));
|
||||
if (read != sizeof(magic)) { exit(1); }
|
||||
if (memcmp(kTestMagic, magic, sizeof(magic))) { exit(2); }
|
||||
}
|
||||
|
||||
/* Read the offsets tail */
|
||||
{
|
||||
int numOffsets;
|
||||
int block;
|
||||
int *offsetsPtr = offsets;
|
||||
seek_bin(inpFp, -4, SEEK_END);
|
||||
read_int(inpFp, &numOffsets);
|
||||
if (numOffsets <= endBlock) { exit(3); }
|
||||
seek_bin(inpFp, -4 * (numOffsets + 1), SEEK_END);
|
||||
for (block = 0; block <= endBlock; ++block) {
|
||||
read_int(inpFp, offsetsPtr++);
|
||||
}
|
||||
}
|
||||
/* Seek to the first block to read */
|
||||
seek_bin(inpFp, offsets[currentBlock], SEEK_SET);
|
||||
offset = offset % BLOCK_BYTES;
|
||||
|
||||
/* Start decoding */
|
||||
for(; currentBlock < endBlock; ++currentBlock) {
|
||||
char cmpBuf[LZ4_COMPRESSBOUND(BLOCK_BYTES)];
|
||||
/* The difference in offsets is the size of the block */
|
||||
int cmpBytes = offsets[currentBlock + 1] - offsets[currentBlock];
|
||||
{
|
||||
const size_t read = read_bin(inpFp, cmpBuf, (size_t)cmpBytes);
|
||||
if(read != (size_t)cmpBytes) { exit(4); }
|
||||
}
|
||||
|
||||
/* Load the dictionary */
|
||||
LZ4_setStreamDecode(lz4StreamDecode, dict, dictSize);
|
||||
{
|
||||
const int decBytes = LZ4_decompress_safe_continue(
|
||||
lz4StreamDecode, cmpBuf, decBuf, cmpBytes, BLOCK_BYTES);
|
||||
if(decBytes <= 0) { exit(5); }
|
||||
{
|
||||
/* Write out the part of the data we care about */
|
||||
int blockLength = MIN(length, (decBytes - offset));
|
||||
write_bin(outFp, decBuf + offset, (size_t)blockLength);
|
||||
offset = 0;
|
||||
length -= blockLength;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Compare up to `length` bytes read from the current positions of fp0 and fp1.
 *
 * Returns 0 when both streams deliver the same bytes (or length is zero or
 * negative). Otherwise returns a non-zero value: the difference in byte counts
 * when one stream ends before the other, or the memcmp() result of the first
 * differing chunk.
 */
int compare(FILE* fp0, FILE* fp1, int length)
{
    int result = 0;

    /* `length > 0` also guards against a negative length being converted to a
       huge size_t read request that would overrun the chunk buffers. */
    while (0 == result && length > 0) {
        char b0[4096];
        char b1[4096];
        const size_t want =
            (length < (int)sizeof(b0)) ? (size_t)length : sizeof(b0);
        const size_t r0 = read_bin(fp0, b0, want);
        const size_t r1 = read_bin(fp1, b1, want);

        result = (int) r0 - (int) r1;

        /* Either stream is exhausted: `result` already tells whether the
           other one still had data. */
        if (0 == r0 || 0 == r1) {
            break;
        }
        if (0 == result) {
            result = memcmp(b0, b1, r0);
        }
        length -= (int) r0;
    }

    return result;
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char inpFilename[256] = { 0 };
|
||||
char lz4Filename[256] = { 0 };
|
||||
char decFilename[256] = { 0 };
|
||||
char dictFilename[256] = { 0 };
|
||||
int offset;
|
||||
int length;
|
||||
char dict[DICTIONARY_BYTES];
|
||||
int dictSize;
|
||||
|
||||
if(argc < 5) {
|
||||
printf("Usage: %s input dictionary offset length", argv[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
snprintf(inpFilename, 256, "%s", argv[1]);
|
||||
snprintf(lz4Filename, 256, "%s.lz4s-%d", argv[1], BLOCK_BYTES);
|
||||
snprintf(decFilename, 256, "%s.lz4s-%d.dec", argv[1], BLOCK_BYTES);
|
||||
snprintf(dictFilename, 256, "%s", argv[2]);
|
||||
offset = atoi(argv[3]);
|
||||
length = atoi(argv[4]);
|
||||
|
||||
printf("inp = [%s]\n", inpFilename);
|
||||
printf("lz4 = [%s]\n", lz4Filename);
|
||||
printf("dec = [%s]\n", decFilename);
|
||||
printf("dict = [%s]\n", dictFilename);
|
||||
printf("offset = [%d]\n", offset);
|
||||
printf("length = [%d]\n", length);
|
||||
|
||||
/* Load dictionary */
|
||||
{
|
||||
FILE* dictFp = fopen(dictFilename, "rb");
|
||||
dictSize = (int)read_bin(dictFp, dict, DICTIONARY_BYTES);
|
||||
fclose(dictFp);
|
||||
}
|
||||
|
||||
/* compress */
|
||||
{
|
||||
FILE* inpFp = fopen(inpFilename, "rb");
|
||||
FILE* outFp = fopen(lz4Filename, "wb");
|
||||
|
||||
printf("compress : %s -> %s\n", inpFilename, lz4Filename);
|
||||
test_compress(outFp, inpFp, dict, dictSize);
|
||||
printf("compress : done\n");
|
||||
|
||||
fclose(outFp);
|
||||
fclose(inpFp);
|
||||
}
|
||||
|
||||
/* decompress */
|
||||
{
|
||||
FILE* inpFp = fopen(lz4Filename, "rb");
|
||||
FILE* outFp = fopen(decFilename, "wb");
|
||||
|
||||
printf("decompress : %s -> %s\n", lz4Filename, decFilename);
|
||||
test_decompress(outFp, inpFp, dict, DICTIONARY_BYTES, offset, length);
|
||||
printf("decompress : done\n");
|
||||
|
||||
fclose(outFp);
|
||||
fclose(inpFp);
|
||||
}
|
||||
|
||||
/* verify */
|
||||
{
|
||||
FILE* inpFp = fopen(inpFilename, "rb");
|
||||
FILE* decFp = fopen(decFilename, "rb");
|
||||
seek_bin(inpFp, offset, SEEK_SET);
|
||||
|
||||
printf("verify : %s <-> %s\n", inpFilename, decFilename);
|
||||
const int cmp = compare(inpFp, decFp, length);
|
||||
if(0 == cmp) {
|
||||
printf("verify : OK\n");
|
||||
} else {
|
||||
printf("verify : NG\n");
|
||||
}
|
||||
|
||||
fclose(decFp);
|
||||
fclose(inpFp);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
67
examples/dictionaryRandomAccess.md
Normal file
67
examples/dictionaryRandomAccess.md
Normal file
@ -0,0 +1,67 @@
|
||||
# LZ4 API Example : Dictionary Random Access
|
||||
|
||||
`dictionaryRandomAccess.c` is an LZ4 API example that implements dictionary compression and random access decompression.
|
||||
|
||||
Please note that the output file is not compatible with lz4frame and is platform dependent.
|
||||
|
||||
|
||||
## What's the point of this example ?
|
||||
|
||||
- Dictionary-based compression for homogeneous files.
|
||||
- Random access to compressed blocks.
|
||||
|
||||
|
||||
## How the compression works
|
||||
|
||||
Reads the dictionary from a file, and uses it as the history for each block.
|
||||
This allows each block to be independent, but maintains compression ratio.
|
||||
|
||||
```
|
||||
Dictionary
|
||||
+
|
||||
|
|
||||
v
|
||||
+---------+
|
||||
| Block#1 |
|
||||
+----+----+
|
||||
|
|
||||
v
|
||||
{Out#1}
|
||||
|
||||
|
||||
Dictionary
|
||||
+
|
||||
|
|
||||
v
|
||||
+---------+
|
||||
| Block#2 |
|
||||
+----+----+
|
||||
|
|
||||
v
|
||||
{Out#2}
|
||||
```
|
||||
|
||||
After writing the magic bytes `TEST` and then the compressed blocks, write out the jump table.
|
||||
The last 4 bytes are an integer containing the number of offsets in the jump table, which is one more than the number of blocks in the stream.
|
||||
If there are `N` blocks, then just before the last 4 bytes is `N + 1` 4 byte integers containing the offsets at the beginning and end of each block.
|
||||
Let `Offset#K` be the total number of bytes written after writing out `Block#K` *including* the magic bytes for simplicity.
|
||||
|
||||
```
|
||||
+------+---------+ +---------+---+----------+ +----------+-----+
|
||||
| TEST | Block#1 | ... | Block#N | 4 | Offset#1 | ... | Offset#N | N+1 |
|
||||
+------+---------+ +---------+---+----------+ +----------+-----+
|
||||
```
|
||||
|
||||
## How the decompression works
|
||||
|
||||
Decompression works in the reverse order.
|
||||
|
||||
- Seek to the last 4 bytes of the file and read the number of offsets.
|
||||
- Read each offset into an array.
|
||||
- Seek to the first block containing data we want to read.
|
||||
We know where to look because we know each block contains a fixed amount of uncompressed data, except possibly the last.
|
||||
- Decompress it and write what data we need from it to the file.
|
||||
- Read the next block.
|
||||
- Decompress it and write that page to the file.
|
||||
|
||||
Continue this procedure until all the required data has been read.
|
Loading…
Reference in New Issue
Block a user