From 7397d0102f582705f755eca894a70ae37448f588 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:15:53 -0500 Subject: [PATCH 01/16] Add new enum for different sequence formats for ingestion/extraction --- lib/zstd.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/zstd.h b/lib/zstd.h index c1fb55cf..608ebd2f 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1297,14 +1297,27 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS * or an error code (if srcSize is too small) */ ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +typedef enum { + ZSTD_sf_blockDelimiters = 0, + ZSTD_sf_noBlockDelimiters = 1, +} ZSTD_sequenceFormat_e; + /*! ZSTD_getSequences() : * Extract sequences from the sequence store. - * Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals. + * If invoked with ZSTD_sf_blockDelimiters, block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * + * If invoked with ZSTD_sf_noBlockDelimiters, sequences will still be generated + * on a per-block basis, but any last literals of a block will be merged into the + * last literals of the first sequence in the next block with the exception of the + * final segment of last literals. As such, the final generated result has no + * explicit representation of block boundaries. * * zc can be used to insert custom compression params. 
* This function invokes ZSTD_compress2 * @return : number of sequences extracted */ + ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize); From 33279326097bad2a75d4b0ede4cd23b547050d63 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:17:59 -0500 Subject: [PATCH 02/16] Update ZSTD_getSequences function signature --- lib/compress/zstd_compress.c | 2 +- lib/zstd.h | 2 +- tests/fuzzer.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 97219dee..cf274d7b 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2550,7 +2550,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) } size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize) + size_t outSeqsSize, const void* src, size_t srcSize, ZSTD_sequenceFormat_e format) { const size_t dstCapacity = ZSTD_compressBound(srcSize); void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); diff --git a/lib/zstd.h b/lib/zstd.h index 608ebd2f..3809a033 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1319,7 +1319,7 @@ typedef enum { */ ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize); + size_t outSeqsSize, const void* src, size_t srcSize, ZSTD_sequenceFormat_e format); /*************************************** diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 755c13bd..2557cd4e 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -2718,7 +2718,7 @@ static int basicUnitTests(U32 const seed, double compressibility) RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); /* get the sequences */ - seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize); + seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_blockDelimiters); /* "decode" and compare the 
sequences */ FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize); From 435a3a04287548676b385d2c8bca6abab2509efe Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:19:26 -0500 Subject: [PATCH 03/16] Update seqCollector definition --- lib/compress/zstd_compress.c | 1 + lib/compress/zstd_compress_internal.h | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index cf274d7b..12b5a32e 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2562,6 +2562,7 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, seqCollector.seqStart = outSeqs; seqCollector.seqIndex = 0; seqCollector.maxSequences = outSeqsSize; + seqCollector.format = format; zc->seqCollector = seqCollector; ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 3ff318d5..e11f1dbd 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -206,6 +206,7 @@ typedef struct { ZSTD_Sequence* seqStart; size_t seqIndex; size_t maxSequences; + ZSTD_sequenceFormat_e format; } SeqCollector; struct ZSTD_CCtx_params_s { From a36fdada57342eea750d37f7003f8e5d3f3061ad Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:46:52 -0500 Subject: [PATCH 04/16] Add algorithm to remove all delimiters --- lib/compress/zstd_compress.c | 18 ++++++++++++++++++ lib/zstd.h | 6 +++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 12b5a32e..1f274622 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2567,6 +2567,24 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ZSTD_customFree(dst, ZSTD_defaultCMem); + + if (format == ZSTD_sf_noBlockDelimiters) { + /* Merge the dummy block delimiters */ + 
size_t i = 0; + size_t totalSeqs = zc->seqCollector.seqIndex; + for (; i < totalSeqs; ++i) { + if (seqCollector.seqStart[i].offset == 0 && seqCollector.seqStart[i].matchLength == 0) { + /* Merge the block boundary or last literals */ + if (i != totalSeqs-1) { + /* Add last literals to next sequence, then "delete" this sequence */ + seqCollector.seqStart[i+1].litLength += seqCollector.seqStart[i].litLength; + memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_sequence)); + } + totalSeqs--; + } + } + zc->seqCollector.seqIndex = totalSeqs; + } return zc->seqCollector.seqIndex; } diff --git a/lib/zstd.h b/lib/zstd.h index 3809a033..4158c4ac 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1309,9 +1309,9 @@ typedef enum { * * If invoked with ZSTD_sf_noBlockDelimiters, sequences will still be generated * on a per-block basis, but any last literals of a block will be merged into the - * last literals of the first sequence in the next block with the exception of the - * final segment of last literals. As such, the final generated result has no - * explicit representation of block boundaries. + * last literals of the first sequence in the next block. + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. * * zc can be used to insert custom compression params. 
* This function invokes ZSTD_compress2 From e8501e00b8df6bab69838a7bf40f4f9f479a288f Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:58:18 -0500 Subject: [PATCH 05/16] Fix incorrect index increment in merge algorithm --- lib/compress/zstd_compress.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 1f274622..7357a70a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2569,22 +2569,24 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, ZSTD_customFree(dst, ZSTD_defaultCMem); if (format == ZSTD_sf_noBlockDelimiters) { - /* Merge the dummy block delimiters */ size_t i = 0; size_t totalSeqs = zc->seqCollector.seqIndex; - for (; i < totalSeqs; ++i) { + while(i < totalSeqs) { if (seqCollector.seqStart[i].offset == 0 && seqCollector.seqStart[i].matchLength == 0) { /* Merge the block boundary or last literals */ if (i != totalSeqs-1) { /* Add last literals to next sequence, then "delete" this sequence */ seqCollector.seqStart[i+1].litLength += seqCollector.seqStart[i].litLength; - memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_sequence)); + memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_Sequence)); } totalSeqs--; + } else { + ++i; } } zc->seqCollector.seqIndex = totalSeqs; } + return zc->seqCollector.seqIndex; } From e6178f837f7ea421056662ba71e8b3b88ae0cccd Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 10:59:06 -0500 Subject: [PATCH 06/16] Revert unnecessary seqCollector adjustment --- lib/compress/zstd_compress.c | 3 +-- lib/compress/zstd_compress_internal.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 7357a70a..e92f3e37 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2562,7 +2562,6 @@ size_t 
ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, seqCollector.seqStart = outSeqs; seqCollector.seqIndex = 0; seqCollector.maxSequences = outSeqsSize; - seqCollector.format = format; zc->seqCollector = seqCollector; ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); @@ -2586,7 +2585,7 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, } zc->seqCollector.seqIndex = totalSeqs; } - + return zc->seqCollector.seqIndex; } diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index e11f1dbd..3ff318d5 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -206,7 +206,6 @@ typedef struct { ZSTD_Sequence* seqStart; size_t seqIndex; size_t maxSequences; - ZSTD_sequenceFormat_e format; } SeqCollector; struct ZSTD_CCtx_params_s { From 9102f30dbf6aeddc7f5366df02cce9dc7552a666 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 11:30:31 -0500 Subject: [PATCH 07/16] Update unit test --- tests/fuzzer.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 2557cd4e..d8cda510 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -305,13 +305,17 @@ static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part) #endif -static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize, BYTE* src, size_t size) +static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize, + BYTE* src, size_t size, ZSTD_sequenceFormat_e format) { size_t i; size_t j; for(i = 0; i < seqsSize; ++i) { assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size); assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size); + if (format == ZSTD_sf_noBlockDelimiters) { + assert(seqs[i].matchLength != 0 || seqs[i].offset != 0); + } memcpy(dst, src, seqs[i].litLength); dst += seqs[i].litLength; @@ -326,6 +330,9 @@ static void FUZ_decodeSequences(BYTE* dst, 
ZSTD_Sequence* seqs, size_t seqsSize, size -= seqs[i].matchLength; } } + if (format == ZSTD_sf_noBlockDelimiters) { + memcpy(dst, src, size); + } } /*============================================= @@ -2703,7 +2710,7 @@ static int basicUnitTests(U32 const seed, double compressibility) DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences decode from sequences test : ", testNb++); { - size_t srcSize = 100 KB; + size_t srcSize = 150 KB; BYTE* src = (BYTE*)CNBuffer; BYTE* decoded = (BYTE*)compressedBuffer; @@ -2715,13 +2722,16 @@ static int basicUnitTests(U32 const seed, double compressibility) assert(cctx != NULL); /* Populate src with random data */ - RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); + RDG_genBuffer(CNBuffer, srcSize, 0.03, 0., seed); - /* get the sequences */ + /* Test with block delimiters roundtrip */ seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_blockDelimiters); + FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_blockDelimiters); + assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); - /* "decode" and compare the sequences */ - FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize); + /* Test no block delimiters roundtrip */ + seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_noBlockDelimiters); + FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_noBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); ZSTD_freeCCtx(cctx); From d4d0346b40d255930da5cf23dcb39530abfcafbc Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 11:32:56 -0500 Subject: [PATCH 08/16] Update name of enum, clarify documentation --- lib/compress/zstd_compress.c | 1 + lib/zstd.h | 8 ++++---- tests/fuzzer.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index e92f3e37..6ffef2b9 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2505,6 +2505,7 @@ 
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) for (i = 0; i < seqStoreSeqSize; ++i) { outSeqs[i].litLength = seqStoreSeqs[i].litLength; outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; + outSeqs[i].rep = 0; if (i == seqStore->longLengthPos) { if (seqStore->longLengthID == 1) { diff --git a/lib/zstd.h b/lib/zstd.h index 4158c4ac..4fd50043 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1298,16 +1298,16 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); typedef enum { - ZSTD_sf_blockDelimiters = 0, - ZSTD_sf_noBlockDelimiters = 1, + ZSTD_sf_explicitBlockDelimiters = 0, /* Representation of ZSTD_Sequence contains explicit block delimiters */ + ZSTD_sf_noBlockDelimiters = 1, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ } ZSTD_sequenceFormat_e; /*! ZSTD_getSequences() : * Extract sequences from the sequence store. - * If invoked with ZSTD_sf_blockDelimiters, block will end with a dummy sequence + * If invoked with ZSTD_sf_explicitBlockDelimiters, each block will end with a dummy sequence * with offset == 0, matchLength == 0, and litLength == length of last literals. * - * If invoked with ZSTD_sf_noBlockDelimiters, sequences will still be generated + * If invoked with ZSTD_sf_noBlockDelimiters, sequences will still be internally generated * on a per-block basis, but any last literals of a block will be merged into the * last literals of the first sequence in the next block. 
* As such, the final generated result has no explicit representation of block boundaries, diff --git a/tests/fuzzer.c b/tests/fuzzer.c index d8cda510..1e204d06 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -2725,8 +2725,8 @@ static int basicUnitTests(U32 const seed, double compressibility) RDG_genBuffer(CNBuffer, srcSize, 0.03, 0., seed); /* Test with block delimiters roundtrip */ - seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_blockDelimiters); - FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_blockDelimiters); + seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); + FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); /* Test no block delimiters roundtrip */ From c54a25b666c2423c34c8fa46a33ee0e86af95f65 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 11:35:27 -0500 Subject: [PATCH 09/16] Revert compressibility change --- tests/fuzzer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 1e204d06..f31ba5e0 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -2722,7 +2722,7 @@ static int basicUnitTests(U32 const seed, double compressibility) assert(cctx != NULL); /* Populate src with random data */ - RDG_genBuffer(CNBuffer, srcSize, 0.03, 0., seed); + RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); /* Test with block delimiters roundtrip */ seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); From 3434049c1fe6abdafe9f0fd2a4ce95930de445d5 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 11:43:19 -0500 Subject: [PATCH 10/16] Use ZSTD_memmove() instead of memmove() --- lib/compress/zstd_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 
6ffef2b9..7571261a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2577,7 +2577,7 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, if (i != totalSeqs-1) { /* Add last literals to next sequence, then "delete" this sequence */ seqCollector.seqStart[i+1].litLength += seqCollector.seqStart[i].litLength; - memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_Sequence)); + ZSTD_memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_Sequence)); } totalSeqs--; } else { From 3c9b43da1d6962682c47d226478f00149c1400af Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 11:53:04 -0500 Subject: [PATCH 11/16] Remove trailing comma --- lib/zstd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/zstd.h b/lib/zstd.h index 4fd50043..2d086fc3 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1298,8 +1298,8 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); typedef enum { - ZSTD_sf_explicitBlockDelimiters = 0, /* Representation of ZSTD_Sequence contains explicit block delimiters */ - ZSTD_sf_noBlockDelimiters = 1, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters, /* Representation of ZSTD_Sequence contains explicit block delimiters */ + ZSTD_sf_noBlockDelimiters /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ } ZSTD_sequenceFormat_e; /*! 
ZSTD_getSequences() : From f782cac3d4b93739b918bc20c3d458febbe81a58 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Mon, 2 Nov 2020 16:59:16 -0500 Subject: [PATCH 12/16] Change block delimiter removing to linear time approach --- lib/compress/zstd_compress.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 7571261a..928f8fef 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2569,22 +2569,22 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, ZSTD_customFree(dst, ZSTD_defaultCMem); if (format == ZSTD_sf_noBlockDelimiters) { - size_t i = 0; - size_t totalSeqs = zc->seqCollector.seqIndex; - while(i < totalSeqs) { - if (seqCollector.seqStart[i].offset == 0 && seqCollector.seqStart[i].matchLength == 0) { - /* Merge the block boundary or last literals */ - if (i != totalSeqs-1) { - /* Add last literals to next sequence, then "delete" this sequence */ - seqCollector.seqStart[i+1].litLength += seqCollector.seqStart[i].litLength; - ZSTD_memmove(seqCollector.seqStart+i, seqCollector.seqStart+i+1, (totalSeqs-i-1)*sizeof(ZSTD_Sequence)); + /* Remove all block delimiters and append them to the next sequence's literals + * and do not emit last literals at all + */ + size_t in = 0; + size_t out = 0; + for (; in < zc->seqCollector.seqIndex; ++in) { + if (seqCollector.seqStart[in].offset == 0 && seqCollector.seqStart[in].matchLength == 0) { + if (in != zc->seqCollector.seqIndex - 1) { + seqCollector.seqStart[in+1].litLength += seqCollector.seqStart[in].litLength; } - totalSeqs--; } else { - ++i; + seqCollector.seqStart[out] = seqCollector.seqStart[in]; + ++out; } } - zc->seqCollector.seqIndex = totalSeqs; + zc->seqCollector.seqIndex = out; } return zc->seqCollector.seqIndex; From 261ea69661c2d67e5e1e92f948cfd04bbddbbe03 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Fri, 6 Nov 2020 10:52:34 -0500 Subject: [PATCH 13/16] Add new 
mergeGeneratedSequences() function --- lib/zstd.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/lib/zstd.h b/lib/zstd.h index 2d086fc3..3c777cf8 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1304,23 +1304,29 @@ typedef enum { /*! ZSTD_getSequences() : * Extract sequences from the sequence store. - * If invoked with ZSTD_sf_explicitBlockDelimiters, each block will end with a dummy sequence - * with offset == 0, matchLength == 0, and litLength == length of last literals. * - * If invoked with ZSTD_sf_noBlockDelimiters, sequences will still be internally generated - * on a per-block basis, but any last literals of a block will be merged into the - * last literals of the first sequence in the next block. - * As such, the final generated result has no explicit representation of block boundaries, - * and the final last literals segment is not represented in the sequences. + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. * * zc can be used to insert custom compression params. * This function invokes ZSTD_compress2 - * @return : number of sequences extracted + * @return : number of sequences generated */ ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize, ZSTD_sequenceFormat_e format); + size_t outSeqsSize, const void* src, size_t srcSize); +/*! ZSTD_mergeGeneratedSequences() : + * Convert an array of ZSTD_Sequence in the representation specified in ZSTD_getSequences() + * and merge all "dummy" sequences that represent last literals and block boundaries. + * + * Any last literals in the block will be merged into the literals of the next sequence. + * + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. 
+ * @return : number of sequences in final result + */ +ZSTDLIB_API size_t ZSTD_mergeGeneratedSequences(ZSTD_Sequence* sequences, size_t seqsSize); /*************************************** * Memory management From 51abd58208fa82efc54e76f30d6508f962071fdb Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Fri, 6 Nov 2020 10:53:22 -0500 Subject: [PATCH 14/16] Rename getSequences() to generateSequences() --- lib/compress/zstd_compress.c | 2 +- lib/zstd.h | 10 +++++----- tests/fuzzer.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 928f8fef..61eb5234 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2550,7 +2550,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) zc->seqCollector.seqIndex += seqStoreSeqSize; } -size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize, ZSTD_sequenceFormat_e format) { const size_t dstCapacity = ZSTD_compressBound(srcSize); diff --git a/lib/zstd.h b/lib/zstd.h index 3c777cf8..8f39945b 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1149,7 +1149,7 @@ typedef struct { * rep == 2 --> offset == repeat_offset_3 * rep == 3 --> offset == repeat_offset_1 - 1 * - * Note: This field is optional. ZSTD_getSequences() will calculate the value of + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of * 'rep', but repeat offsets do not necessarily need to be calculated from an external * sequence provider's perspective. */ @@ -1302,7 +1302,7 @@ typedef enum { ZSTD_sf_noBlockDelimiters /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ } ZSTD_sequenceFormat_e; -/*! ZSTD_getSequences() : +/*! ZSTD_generateSequences() : * Extract sequences from the sequence store. 
* * Each block will end with a dummy sequence @@ -1313,11 +1313,11 @@ typedef enum { * @return : number of sequences generated */ -ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize); /*! ZSTD_mergeGeneratedSequences() : - * Convert an array of ZSTD_Sequence in the representation specified in ZSTD_getSequences() + * Convert an array of ZSTD_Sequence in the representation specified in ZSTD_generateSequences() * and merge all "dummy" sequences that represent last literals and block boundaries. * * Any last literals in the block will be merged into the literals of the next sequence. diff --git a/tests/fuzzer.c b/tests/fuzzer.c index f31ba5e0..4552089d 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -2708,7 +2708,7 @@ static int basicUnitTests(U32 const seed, double compressibility) DISPLAYLEVEL(3, "OK \n"); } - DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences decode from sequences test : ", testNb++); + DISPLAYLEVEL(3, "test%3i : ZSTD_generateSequences decode from sequences test : ", testNb++); { size_t srcSize = 150 KB; BYTE* src = (BYTE*)CNBuffer; @@ -2725,12 +2725,12 @@ static int basicUnitTests(U32 const seed, double compressibility) RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); /* Test with block delimiters roundtrip */ - seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); + seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); /* Test no block delimiters roundtrip */ - seqsSize = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_noBlockDelimiters); + seqsSize = 
ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_noBlockDelimiters); FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_noBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); From 779df995c634141ceed2ec3ec7b670de77e182e0 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Fri, 6 Nov 2020 10:55:46 -0500 Subject: [PATCH 15/16] Implement mergeGeneratedSequences() --- lib/compress/zstd_compress.c | 38 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 61eb5234..238e6d7a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2551,7 +2551,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) } size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize, ZSTD_sequenceFormat_e format) + size_t outSeqsSize, const void* src, size_t srcSize) { const size_t dstCapacity = ZSTD_compressBound(srcSize); void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); @@ -2567,29 +2567,25 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ZSTD_customFree(dst, ZSTD_defaultCMem); - - if (format == ZSTD_sf_noBlockDelimiters) { - /* Remove all block delimiters and append them to the next sequence's literals - * and do not emit last literals at all - */ - size_t in = 0; - size_t out = 0; - for (; in < zc->seqCollector.seqIndex; ++in) { - if (seqCollector.seqStart[in].offset == 0 && seqCollector.seqStart[in].matchLength == 0) { - if (in != zc->seqCollector.seqIndex - 1) { - seqCollector.seqStart[in+1].litLength += seqCollector.seqStart[in].litLength; - } - } else { - seqCollector.seqStart[out] = seqCollector.seqStart[in]; - ++out; - } - } - zc->seqCollector.seqIndex = out; - } - return zc->seqCollector.seqIndex; } +size_t ZSTD_mergeGeneratedSequences(ZSTD_Sequence* 
sequences, size_t seqsSize) { + size_t in = 0; + size_t out = 0; + for (; in < seqsSize; ++in) { + if (sequences[in].offset == 0 && sequences[in].matchLength == 0) { + if (in != seqsSize - 1) { + sequences[in+1].litLength += sequences[in].litLength; + } + } else { + sequences[out] = sequences[in]; + ++out; + } + } + return out; +} + /* Returns true if the given block is a RLE block */ static int ZSTD_isRLE(const BYTE *ip, size_t length) { size_t i; From 7d1dea070c77b4b2e141aeb6deede8c7807fc121 Mon Sep 17 00:00:00 2001 From: senhuang42 Date: Fri, 6 Nov 2020 10:56:56 -0500 Subject: [PATCH 16/16] Update unit tests --- lib/compress/zstd_compress.c | 2 +- lib/zstd.h | 16 ++++++++-------- tests/fuzzer.c | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 238e6d7a..00486fd1 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2570,7 +2570,7 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, return zc->seqCollector.seqIndex; } -size_t ZSTD_mergeGeneratedSequences(ZSTD_Sequence* sequences, size_t seqsSize) { +size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) { size_t in = 0; size_t out = 0; for (; in < seqsSize; ++in) { diff --git a/lib/zstd.h b/lib/zstd.h index 8f39945b..7256c7af 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1303,10 +1303,12 @@ typedef enum { } ZSTD_sequenceFormat_e; /*! ZSTD_generateSequences() : - * Extract sequences from the sequence store. + * Generate sequences using ZSTD_compress2, given a source buffer. * * Each block will end with a dummy sequence * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. * * zc can be used to insert custom compression params. 
 * This function invokes ZSTD_compress2 @@ -1316,17 +1318,15 @@ typedef enum { ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize); -/*! ZSTD_mergeGeneratedSequences() : - * Convert an array of ZSTD_Sequence in the representation specified in ZSTD_generateSequences() - * and merge all "dummy" sequences that represent last literals and block boundaries. - * - * Any last literals in the block will be merged into the literals of the next sequence. +/*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals + * by merging them into the literals of the next sequence. * * As such, the final generated result has no explicit representation of block boundaries, * and the final last literals segment is not represented in the sequences. - * @return : number of sequences in final result + * @return : number of sequences left after merging */ -ZSTDLIB_API size_t ZSTD_mergeGeneratedSequences(ZSTD_Sequence* sequences, size_t seqsSize); +ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*************************************** * Memory management diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 4552089d..05341032 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -2725,12 +2725,12 @@ static int basicUnitTests(U32 const seed, double compressibility) RDG_genBuffer(CNBuffer, srcSize, compressibility, 0., seed); /* Test with block delimiters roundtrip */ - seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); + seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize); FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize)); /* Test no block delimiters roundtrip */ - seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize, 
ZSTD_sf_noBlockDelimiters); + seqsSize = ZSTD_mergeBlockDelimiters(seqs, seqsSize); FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_noBlockDelimiters); assert(!memcmp(CNBuffer, compressedBuffer, srcSize));