Merge pull request #1760 from bimbashrestha/extract_sequences_api
Adding api for extracting sequences from seqstore
This commit is contained in:
commit
fb77afc626
@ -2265,6 +2265,77 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
|
||||
return ZSTDbss_compress;
|
||||
}
|
||||
|
||||
/* ZSTD_copyBlockSequences() :
 * Convert the sequences of the seqStore returned by ZSTD_getSeqStore(zc)
 * into ZSTD_Sequence entries, written into the collector's output array
 * starting at index seqCollector.seqIndex.
 * seqCollector.seqIndex is then advanced by the number of sequences copied.
 * Note : matchPos is derived from a position counter that restarts at 0
 * on every call.
 */
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
{
    const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
    const seqDef* seqs = seqStore->sequencesStart;
    size_t seqsSize = (size_t)(seqStore->sequences - seqs);

    ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
    size_t i; size_t position; int repIdx;

    assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
    /* every entry written below must fit into the collector's array */
    assert(seqsSize <= zc->seqCollector.maxSequences - zc->seqCollector.seqIndex);
    for (i = 0, position = 0; i < seqsSize; ++i) {
        outSeqs[i].offset = seqs[i].offset;
        outSeqs[i].litLength = seqs[i].litLength;
        outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH;
        /* ZSTD_Sequence documents rep as 0 for non-repeat sequences :
         * set it explicitly, the output array may be uninitialized */
        outSeqs[i].rep = 0;

        /* one sequence of the block may carry a length extended by 0x10000 :
         * longLengthID 1 applies to litLength, 2 applies to matchLength */
        if (i == seqStore->longLengthPos) {
            if (seqStore->longLengthID == 1) {
                outSeqs[i].litLength += 0x10000;
            } else if (seqStore->longLengthID == 2) {
                outSeqs[i].matchLength += 0x10000;
            }
        }

        if (outSeqs[i].offset <= ZSTD_REP_NUM) {
            /* repeat code : look the offset up in an earlier output sequence */
            outSeqs[i].rep = outSeqs[i].offset;
            repIdx = (unsigned int)i - outSeqs[i].offset;

            if (outSeqs[i].litLength == 0) {
                /* with no literals, the repeat code designates a different entry */
                if (outSeqs[i].offset < 3) {
                    --repIdx;
                } else {
                    repIdx = (unsigned int)i - 1;
                }
                ++outSeqs[i].rep;
            }
            assert(repIdx >= -3);
            /* a negative index points before the first sequence copied by this call :
             * fall back to repStartValue */
            outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
            if (outSeqs[i].rep == 4) {
                --outSeqs[i].offset;
            }
        } else {
            outSeqs[i].offset -= ZSTD_REP_NUM;
        }

        position += outSeqs[i].litLength;
        outSeqs[i].matchPos = (unsigned int)position;
        position += outSeqs[i].matchLength;
    }
    zc->seqCollector.seqIndex += seqsSize;
}
|
||||
|
||||
/* ZSTD_getSequences() :
 * Run ZSTD_compress2() on `src` with sequence collection enabled, so that
 * sequences are stored into `outSeqs` (capacity : `outSeqsSize` entries).
 * The compressed output goes into a temporary buffer which is freed
 * before returning.
 * @return : number of sequences collected,
 *           or an error code if the temporary buffer cannot be allocated
 *           or if ZSTD_compress2() fails.
 */
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
    size_t outSeqsSize, const void* src, size_t srcSize)
{
    const size_t dstCapacity = ZSTD_compressBound(srcSize);
    void* const dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem);
    size_t cSize;

    RETURN_ERROR_IF(dst == NULL, memory_allocation);

    zc->seqCollector.collectSequences = 1;
    zc->seqCollector.seqStart = outSeqs;
    zc->seqCollector.seqIndex = 0;
    zc->seqCollector.maxSequences = outSeqsSize;

    cSize = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);

    /* leave collection mode, so that later compressions with this cctx
     * encode their blocks again */
    zc->seqCollector.collectSequences = 0;
    ZSTD_free(dst, ZSTD_defaultCMem);

    /* report a failed compression instead of a sequence count */
    FORWARD_IF_ERROR(cSize);
    return zc->seqCollector.seqIndex;
}
|
||||
|
||||
/* Returns true if the given block is a RLE block */
|
||||
static int ZSTD_isRLE(const BYTE *ip, size_t length) {
|
||||
size_t i;
|
||||
@ -2296,6 +2367,11 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
|
||||
if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
|
||||
}
|
||||
|
||||
if (zc->seqCollector.collectSequences) {
|
||||
ZSTD_copyBlockSequences(zc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* encode sequences and literals */
|
||||
cSize = ZSTD_compressSequences(&zc->seqStore,
|
||||
&zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
|
||||
@ -2360,7 +2436,6 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*! ZSTD_compress_frameChunk() :
|
||||
* Compress a chunk of data into one or multiple blocks.
|
||||
* All blocks will be terminated, all input will be consumed.
|
||||
@ -2405,7 +2480,6 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
|
||||
op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
|
||||
ip, blockSize, 1 /* frame */);
|
||||
FORWARD_IF_ERROR(cSize);
|
||||
|
||||
if (cSize == 0) { /* block is not compressible */
|
||||
cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
|
||||
FORWARD_IF_ERROR(cSize);
|
||||
|
@ -193,6 +193,13 @@ typedef struct {
|
||||
size_t capacity; /* The capacity starting from `seq` pointer */
|
||||
} rawSeqStore_t;
|
||||
|
||||
typedef struct {
|
||||
int collectSequences;
|
||||
ZSTD_Sequence* seqStart;
|
||||
size_t seqIndex;
|
||||
size_t maxSequences;
|
||||
} SeqCollector;
|
||||
|
||||
struct ZSTD_CCtx_params_s {
|
||||
ZSTD_format_e format;
|
||||
ZSTD_compressionParameters cParams;
|
||||
@ -240,6 +247,7 @@ struct ZSTD_CCtx_s {
|
||||
XXH64_state_t xxhState;
|
||||
ZSTD_customMem customMem;
|
||||
size_t staticSize;
|
||||
SeqCollector seqCollector;
|
||||
int isFirstBlock;
|
||||
|
||||
seqStore_t seqStore; /* sequences storage ptrs */
|
||||
|
27
lib/zstd.h
27
lib/zstd.h
@ -1077,6 +1077,24 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
|
||||
|
||||
typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
|
||||
|
||||
typedef struct {
    unsigned matchPos;      /* Match pos in dst */

    /* If seqDef.offset > 3, then this is seqDef.offset - 3
     * If seqDef.offset <= 3, then this is the corresponding repeat offset
     * But if seqDef.offset < 3 and litLength == 0, this is the
     *   repeat offset before the corresponding repeat offset
     * And if seqDef.offset == 3 and litLength == 0, this is the
     *   most recent repeat offset - 1
     */
    unsigned offset;

    unsigned litLength;     /* Literal length */
    unsigned matchLength;   /* Match length */

    /* 0 when seq not rep and seqDef.offset otherwise
     * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
     */
    unsigned rep;
} ZSTD_Sequence;
|
||||
|
||||
typedef struct {
|
||||
unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
|
||||
unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
|
||||
@ -1215,6 +1233,15 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS
|
||||
* or an error code (if srcSize is too small) */
|
||||
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
|
||||
|
||||
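/*! ZSTD_getSequences() :
 * Extract sequences from the sequence store.
 * zc can be used to insert custom compression params.
 * This function invokes ZSTD_compress2() on src.
 * outSeqs must be able to hold outSeqsSize sequences.
 * @return : number of sequences extracted,
 *           or an error code (testable with ZSTD_isError()),
 *           for example when the temporary output buffer cannot be allocated
 */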
|
||||
ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
|
||||
size_t outSeqsSize, const void* src, size_t srcSize);
|
||||
|
||||
|
||||
/***************************************
|
||||
* Memory management
|
||||
|
@ -304,6 +304,28 @@ static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
|
||||
|
||||
#endif
|
||||
|
||||
/* FUZ_decodeSequences() :
 * Rebuild `size` bytes into `dst` from `seqs`.
 * For each decoded sequence, litLength bytes are copied from `src`,
 * then matchLength bytes are copied from `offset` bytes back in `dst`.
 * `src` is advanced past match bytes too, so it stays aligned with `dst`.
 * The last entry of `seqs` is not decoded : whatever remains after the
 * other sequences is copied straight from `src`.
 */
static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize, BYTE* src, size_t size)
{
    size_t i;
    size_t j;
    /* `i + 1 < seqsSize` rather than `i < seqsSize - 1` :
     * the subtraction wraps around when seqsSize == 0 */
    for (i = 0; i + 1 < seqsSize; ++i) {
        const BYTE* match;
        assert(dst + seqs[i].litLength + seqs[i].matchLength < dst + size);
        assert(src + seqs[i].litLength + seqs[i].matchLength < src + size);

        memcpy(dst, src, seqs[i].litLength);
        dst += seqs[i].litLength;
        src += seqs[i].litLength;
        size -= seqs[i].litLength;

        /* forward byte-by-byte copy : the match may overlap the bytes being written */
        match = dst - seqs[i].offset;
        for (j = 0; j < seqs[i].matchLength; ++j)
            dst[j] = match[j];
        dst += seqs[i].matchLength;
        src += seqs[i].matchLength;
        size -= seqs[i].matchLength;
    }
    memcpy(dst, src, size);
}
|
||||
|
||||
/*=============================================
|
||||
* Unit tests
|
||||
=============================================*/
|
||||
@ -1960,6 +1982,33 @@ static int basicUnitTests(U32 const seed, double compressibility)
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
}
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences decode from sequences test : ", testNb++);
|
||||
{
|
||||
size_t srcSize = 100 KB;
|
||||
BYTE* src = (BYTE*)CNBuffer;
|
||||
BYTE* decoded = (BYTE*)compressedBuffer;
|
||||
|
||||
ZSTD_CCtx* cctx = ZSTD_createCCtx();
|
||||
ZSTD_Sequence* seqs = (ZSTD_Sequence*)malloc(srcSize * sizeof(ZSTD_Sequence));
|
||||
size_t seqsSize;
|
||||
|
||||
if (seqs == NULL) goto _output_error;
|
||||
assert(cctx != NULL);
|
||||
|
||||
/* Populate src with random data */
|
||||
RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed);
|
||||
|
||||
/* get the sequences */
|
||||
seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize);
|
||||
|
||||
/* "decode" and compare the sequences */
|
||||
FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize);
|
||||
assert(!memcmp(CNBuffer, compressedBuffer, srcSize));
|
||||
|
||||
ZSTD_freeCCtx(cctx);
|
||||
free(seqs);
|
||||
}
|
||||
|
||||
/* Multiple blocks of zeros test */
|
||||
#define LONGZEROSLENGTH 1000000 /* 1MB of zeros */
|
||||
DISPLAYLEVEL(3, "test%3i : compress %u zeroes : ", testNb++, LONGZEROSLENGTH);
|
||||
@ -1972,7 +2021,6 @@ static int basicUnitTests(U32 const seed, double compressibility)
|
||||
if (r != LONGZEROSLENGTH) goto _output_error; }
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
|
||||
/* All zeroes test (test bug #137) */
|
||||
#define ZEROESLENGTH 100
|
||||
DISPLAYLEVEL(3, "test%3i : compress %u zeroes : ", testNb++, ZEROESLENGTH);
|
||||
|
Loading…
Reference in New Issue
Block a user