Merge pull request #236 from Cyan4973/dev

v0.7.2
This commit is contained in:
Yann Collet 2016-07-03 21:06:42 +02:00 committed by GitHub
commit 2b61f74b1e
21 changed files with 838 additions and 225 deletions

View File

@ -168,6 +168,9 @@ bmix32test: clean
bmi32test: clean
CFLAGS="-O3 -mbmi -m32 -Werror" $(MAKE) -C $(PRGDIR) test
staticAnalyze: clean
CPPFLAGS=-g scan-build --status-bugs -v $(MAKE) all
endif

5
NEWS
View File

@ -1,3 +1,8 @@
v0.7.2
fixed : ZSTD_decompressBlock() using multiple consecutive blocks. Reported by Greg Slazinski
fixed : potential segfault on very large files (many gigabytes). Reported by Chip Turner.
fixed : CLI displays system error message when destination file cannot be created (#231). Reported by Chip Turner.
v0.7.1
fixed : ZBUFF_compressEnd() called multiple times with too small `dst` buffer, reported by Christophe Chevalier
fixed : dictBuilder fails if first sample is too small, reported by Руслан Ковалёв

View File

@ -44,10 +44,8 @@ extern "C" {
/* ***************************************************************
* Compiler specifics
*****************************************************************/
/*!
* ZSTD_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL
*/
/* ZSTD_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL */
#if defined(_WIN32) && defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
# define ZSTDLIB_API __declspec(dllexport)
#else
@ -103,8 +101,8 @@ ZSTDLIB_API size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCap
* @return : nb of bytes still present into internal buffer (0 if it's empty)
* or an error code, which can be tested using ZBUFF_isError().
*
* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedCInSize / ZBUFF_recommendedCOutSize
* input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, it improves latency to use this value (skipped buffering).
* Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize()
* input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency)
* output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
* By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
* **************************************************/

View File

@ -61,7 +61,7 @@ extern "C" {
***************************************/
#define ZSTD_VERSION_MAJOR 0
#define ZSTD_VERSION_MINOR 7
#define ZSTD_VERSION_RELEASE 1
#define ZSTD_VERSION_RELEASE 2
#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
#define ZSTD_QUOTE(str) #str
@ -200,7 +200,6 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
/*--- Dependency ---*/
#include "mem.h" /* U32 */
/*--- Constants ---*/
#define ZSTD_MAGICNUMBER 0xFD2FB527 /* v0.7 */
#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U
@ -270,16 +269,21 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictS
ZSTDLIB_API unsigned ZSTD_maxCLevel (void);
/*! ZSTD_getParams() :
* same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of a `ZSTD_compressionParameters`.
* All fields of `ZSTD_frameParameters` are set to default (0) */
ZSTD_parameters ZSTD_getParams(int compressionLevel, U64 srcSize, size_t dictSize);
/*! ZSTD_getCParams() :
* @return ZSTD_compressionParameters structure for a selected compression level and srcSize.
* `srcSize` value is optional, select 0 if not known */
ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, U64 srcSize, size_t dictSize);
/*! ZSTD_checkParams() :
/*! ZSTD_checkCParams() :
* Ensure param values remain within authorized range */
ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
/*! ZSTD_adjustParams() :
/*! ZSTD_adjustCParams() :
* optimize params for a given `srcSize` and `dictSize`.
* both values are optional, select `0` if unknown. */
ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, U64 srcSize, size_t dictSize);
@ -408,6 +412,7 @@ ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t ds
A few rules to respect :
- Uncompressed block size must be <= ZSTD_BLOCKSIZE_MAX (128 KB)
+ If you need to compress more, it's recommended to use ZSTD_compress() instead, since frame metadata costs become negligible.
- Compressing or decompressing requires a context structure
+ Use ZSTD_createCCtx() and ZSTD_createDCtx()
- It is necessary to init context before starting

View File

@ -51,7 +51,7 @@
/*-*************************************
* Common constants
***************************************/
#define ZSTD_OPT_DEBUG 0 // 3 = compression stats; 5 = check encoded sequences; 9 = full logs
#define ZSTD_OPT_DEBUG 0 /* 3 = compression stats; 5 = check encoded sequences; 9 = full logs */
#include <stdio.h>
#if defined(ZSTD_OPT_DEBUG) && ZSTD_OPT_DEBUG>=9
#define ZSTD_LOG_PARSER(...) printf(__VA_ARGS__)
@ -233,6 +233,6 @@ int ZSTD_isSkipFrame(ZSTD_DCtx* dctx);
/* custom memory allocation functions */
void* ZSTD_defaultAllocFunction(void* opaque, size_t size);
void ZSTD_defaultFreeFunction(void* opaque, void* address);
static ZSTD_customMem const defaultCustomMem = { ZSTD_defaultAllocFunction, ZSTD_defaultFreeFunction, NULL };
static const ZSTD_customMem defaultCustomMem = { ZSTD_defaultAllocFunction, ZSTD_defaultFreeFunction, NULL };
#endif /* ZSTD_CCOMMON_H_MODULE */

View File

@ -170,9 +170,7 @@ size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
{
ZSTD_parameters params;
memset(&params, 0, sizeof(params));
params.cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
return ZBUFF_compressInit_advanced(zbc, dict, dictSize, params, 0);
}

View File

@ -427,21 +427,8 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
*/
/* Frame descriptor
/* Frame header :
// old
1 byte - Alloc :
bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN (see zstd_internal.h)
bit 4 : reserved for windowLog (must be zero)
bit 5 : reserved (must be zero)
bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
1 byte - checker :
bit 0-1 : dictID (0, 1, 2 or 4 bytes)
bit 2-7 : reserved (must be zero)
// new
1 byte - FrameHeaderDescription :
bit 0-1 : dictID (0, 1, 2 or 4 bytes)
bit 2-4 : reserved (must be zero)
@ -453,24 +440,24 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
bit 0-2 : octal Fractional (1/8th)
bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB)
Optional : content size (0, 1, 2, 4 or 8 bytes)
0 : unknown
1 : 0-255 bytes
2 : 256 - 65535+256
8 : up to 16 exa
Optional : dictID (0, 1, 2 or 4 bytes)
Automatic adaptation
0 : no dictID
1 : 1 - 255
2 : 256 - 65535
4 : all other values
Optional : content size (0, 1, 2, 4 or 8 bytes)
0 : unknown
1 : 0-255 bytes
2 : 256 - 65535+256
8 : up to 16 exa
*/
/* Block format description
Block = Literal Section - Sequences Section
Block = Literals Section - Sequences Section
Prerequisite : size of (compressed) block, maximum size of regenerated data
1) Literal Section
@ -478,7 +465,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
1.1) Header : 1-5 bytes
flags: 2 bits
00 compressed by Huff0
01 unused
01 repeat
10 is Raw (uncompressed)
11 is Rle
Note : using 01 => Huff0 with precomputed table ?
@ -514,7 +501,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
else => 5 bytes (2-2-18-18)
big endian convention
1- CTable available (stored into workspace ?)
1- CTable available (stored into workspace)
2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
@ -936,7 +923,7 @@ _check_compressibility:
`offsetCode` : distance to match, or 0 == repCode.
`matchCode` : matchLength - MINMATCH
*/
MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, size_t offsetCode, size_t matchCode)
MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t matchCode)
{
#if 0 /* for debug */
static const BYTE* g_start = NULL;
@ -957,7 +944,7 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
*seqStorePtr->litLength++ = (U16)litLength;
/* match offset */
*(seqStorePtr->offset++) = (U32)offsetCode + 1;
*(seqStorePtr->offset++) = offsetCode + 1;
/* match Length */
if (matchCode>0xFFFF) { seqStorePtr->longLengthID = 2; seqStorePtr->longLengthPos = (U32)(seqStorePtr->matchLength - seqStorePtr->matchLengthStart); }
@ -1063,7 +1050,7 @@ static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE
***************************************/
static const U32 prime3bytes = 506832829U;
static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
static size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); }
MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
static const U32 prime4bytes = 2654435761U;
static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
@ -1129,13 +1116,14 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
const BYTE* const lowest = base + lowestIndex;
const BYTE* const iend = istart + srcSize;
const BYTE* const ilimit = iend - 8;
size_t offset_1=cctx->rep[0], offset_2=cctx->rep[1];
U32 offset_1=cctx->rep[0], offset_2=cctx->rep[1];
U32 offsetSaved = 0;
/* init */
ip += (ip==lowest);
{ U32 const maxRep = (U32)(ip-lowest);
if (offset_1 > maxRep) offset_1 = 0;
if (offset_2 > maxRep) offset_2 = 0;
if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
}
/* Main Search Loop */
@ -1152,13 +1140,13 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
ip++;
ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
} else {
size_t offset;
U32 offset;
if ( (matchIndex <= lowestIndex) || (MEM_read32(match) != MEM_read32(ip)) ) {
ip += ((ip-anchor) >> g_searchStrength) + 1;
continue;
}
mLength = ZSTD_count(ip+EQUAL_READ32, match+EQUAL_READ32, iend) + EQUAL_READ32;
offset = ip-match;
offset = (U32)(ip-match);
while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
offset_2 = offset_1;
offset_1 = offset;
@ -1180,7 +1168,7 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
& (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
/* store sequence */
size_t const rLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_2, iend) + EQUAL_READ32;
{ size_t const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
{ U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip-base);
ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
ip += rLength;
@ -1189,8 +1177,8 @@ void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
} } }
/* save reps for next block */
cctx->savedRep[0] = offset_1 ? (U32)offset_1 : (U32)(iend - base) + 1;
cctx->savedRep[1] = offset_2 ? (U32)offset_2 : (U32)(iend - base) + 1;
cctx->savedRep[0] = offset_1 ? offset_1 : offsetSaved;
cctx->savedRep[1] = offset_2 ? offset_2 : offsetSaved;
/* Last Literals */
{ size_t const lastLLSize = iend - anchor;
@ -1364,17 +1352,19 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
const U32 windowLow = zc->lowLimit;
U32 matchEndIdx = current+8;
size_t bestLength = 8;
#ifdef ZSTD_C_PREDICT
U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
predictedSmall += (predictedSmall>0);
predictedLarge += (predictedLarge>0);
#endif /* ZSTD_C_PREDICT */
hashTable[h] = current; /* Update Hash Table */
while (nbCompares-- && (matchIndex > windowLow)) {
U32* nextPtr = bt + 2*(matchIndex & btMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
#if 0 /* note : can create issues when hlog small <= 11 */
#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */
const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */
if (matchIndex == predictedSmall) {
/* no need to check length, result known */
@ -1731,17 +1721,15 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
size_t* offsetPtr,
U32 maxNbAttempts, U32 matchLengthSearch);
searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
U32 rep[ZSTD_REP_INIT];
U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset=0;
/* init */
ip += (ip==base);
ctx->nextToUpdate3 = ctx->nextToUpdate;
{ U32 i;
U32 const maxRep = (U32)(ip-base);
for (i=0; i<ZSTD_REP_INIT; i++) {
rep[i]=ctx->rep[i];
if (rep[i]>maxRep) rep[i]=0;
} }
{ U32 const maxRep = (U32)(ip-base);
if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
}
/* Match Loop */
while (ip < ilimit) {
@ -1750,9 +1738,9 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
const BYTE* start=ip+1;
/* check repCode */
if ((rep[0]>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - rep[0]))) {
if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) {
/* repcode : we take it */
matchLength = ZSTD_count(ip+1+EQUAL_READ32, ip+1+EQUAL_READ32-rep[0], iend) + EQUAL_READ32;
matchLength = ZSTD_count(ip+1+EQUAL_READ32, ip+1+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
if (depth==0) goto _storeSequence;
}
@ -1772,8 +1760,8 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
if (depth>=1)
while (ip<ilimit) {
ip ++;
if ((offset) && ((rep[0]>0) & (MEM_read32(ip) == MEM_read32(ip - rep[0])))) {
size_t const mlRep = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-rep[0], iend) + EQUAL_READ32;
if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
size_t const mlRep = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
int const gain2 = (int)(mlRep * 3);
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
if ((mlRep >= EQUAL_READ32) && (gain2 > gain1))
@ -1791,8 +1779,8 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
/* let's find an even better one */
if ((depth==2) && (ip<ilimit)) {
ip ++;
if ((offset) && ((rep[0]>0) & (MEM_read32(ip) == MEM_read32(ip - rep[0])))) {
size_t const ml2 = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-rep[0], iend) + EQUAL_READ32;
if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
size_t const ml2 = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
int const gain2 = (int)(ml2 * 4);
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
if ((ml2 >= EQUAL_READ32) && (gain2 > gain1))
@ -1813,23 +1801,23 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
if (offset) {
while ((start>anchor) && (start>base+offset-ZSTD_REP_MOVE) && (start[-1] == start[-1-offset+ZSTD_REP_MOVE])) /* only search for offset within prefix */
{ start--; matchLength++; }
rep[1] = rep[0]; rep[0] = (U32)(offset - ZSTD_REP_MOVE);
offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
}
/* store sequence */
_storeSequence:
{ size_t const litLength = start - anchor;
ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, matchLength-MINMATCH);
ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
anchor = ip = start + matchLength;
}
/* check immediate repcode */
while ( (ip <= ilimit)
&& ((rep[1]>0)
& (MEM_read32(ip) == MEM_read32(ip - rep[1])) )) {
&& ((offset_2>0)
& (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
/* store sequence */
matchLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-rep[1], iend) + EQUAL_READ32;
offset = rep[1]; rep[1] = rep[0]; rep[0] = (U32)offset; /* swap repcodes */
matchLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_2, iend) + EQUAL_READ32;
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
ip += matchLength;
anchor = ip;
@ -1837,11 +1825,8 @@ _storeSequence:
} }
/* Save reps for next block */
{ int i;
for (i=0; i<ZSTD_REP_NUM; i++) {
if (!rep[i]) rep[i] = (U32)(iend - ctx->base) + 1; /* in case some zero are left */
ctx->savedRep[i] = rep[i];
} }
ctx->savedRep[0] = offset_1 ? offset_1 : savedOffset;
ctx->savedRep[1] = offset_2 ? offset_2 : savedOffset;
/* Last Literals */
{ size_t const lastLLSize = iend - anchor;
@ -1900,10 +1885,9 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
U32 maxNbAttempts, U32 matchLengthSearch);
searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
/* init */
U32 rep[ZSTD_REP_INIT];
{ U32 i; for (i=0; i<ZSTD_REP_INIT; i++) rep[i]=ctx->rep[i]; }
U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
/* init */
ctx->nextToUpdate3 = ctx->nextToUpdate;
ip += (ip == prefixStart);
@ -1915,7 +1899,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
U32 current = (U32)(ip-base);
/* check repCode */
{ const U32 repIndex = (U32)(current+1 - rep[0]);
{ const U32 repIndex = (U32)(current+1 - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
@ -1945,7 +1929,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
current++;
/* check repCode */
if (offset) {
const U32 repIndex = (U32)(current - rep[0]);
const U32 repIndex = (U32)(current - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
@ -1975,7 +1959,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
current++;
/* check repCode */
if (offset) {
const U32 repIndex = (U32)(current - rep[0]);
const U32 repIndex = (U32)(current - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
@ -2007,19 +1991,19 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
rep[1] = rep[0]; rep[0] = (U32)(offset - ZSTD_REP_MOVE);
offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
}
/* store sequence */
_storeSequence:
{ size_t const litLength = start - anchor;
ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, matchLength-MINMATCH);
ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
anchor = ip = start + matchLength;
}
/* check immediate repcode */
while (ip <= ilimit) {
const U32 repIndex = (U32)((ip-base) - rep[1]);
const U32 repIndex = (U32)((ip-base) - offset_2);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
@ -2027,7 +2011,7 @@ _storeSequence:
/* repcode detected we should take it */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
matchLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
offset = rep[1]; rep[1] = rep[0]; rep[0] = (U32)offset; /* swap offset history */
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
ip += matchLength;
anchor = ip;
@ -2037,7 +2021,7 @@ _storeSequence:
} }
/* Save reps for next block */
ctx->savedRep[0] = rep[0]; ctx->savedRep[1] = rep[1]; ctx->savedRep[2] = rep[2];
ctx->savedRep[0] = offset_1; ctx->savedRep[1] = offset_2;
/* Last Literals */
{ size_t const lastLLSize = iend - anchor;
@ -2068,18 +2052,27 @@ static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src,
}
/* The optimal parser */
#include "zstd_opt.h"
static void ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
#ifdef ZSTD_OPT_H_91842398743
ZSTD_compressBlock_opt_generic(ctx, src, srcSize);
#else
(void)ctx; (void)src; (void)srcSize;
return;
#endif
}
static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
#ifdef ZSTD_OPT_H_91842398743
ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize);
#else
(void)ctx; (void)src; (void)srcSize;
return;
#endif
}
@ -2426,9 +2419,7 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
{
ZSTD_parameters params;
memset(&params, 0, sizeof(params));
params.cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
ZSTD_LOG_BLOCK("%p: ZSTD_compressBegin_usingDict compressionLevel=%d\n", cctx->base, compressionLevel);
return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0);
}
@ -2538,11 +2529,9 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel)
{
ZSTD_parameters params;
memset(&params, 0, sizeof(params));
ZSTD_LOG_BLOCK("%p: ZSTD_compress_usingDict srcSize=%d dictSize=%d compressionLevel=%d\n", ctx->base, (int)srcSize, (int)dictSize, compressionLevel);
params.cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize);
ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, dictSize);
params.fParams.contentSizeFlag = 1;
ZSTD_LOG_BLOCK("%p: ZSTD_compress_usingDict srcSize=%d dictSize=%d compressionLevel=%d\n", ctx->base, (int)srcSize, (int)dictSize, compressionLevel);
return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
}
@ -2577,7 +2566,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, ZSTD_pa
if (!customMem.customAlloc && !customMem.customFree)
customMem = defaultCustomMem;
if (!customMem.customAlloc || !customMem.customFree)
if (!customMem.customAlloc || !customMem.customFree) /* can't have 1/2 custom alloc/free as NULL */
return NULL;
{ ZSTD_CDict* const cdict = (ZSTD_CDict*) customMem.customAlloc(customMem.opaque, sizeof(*cdict));
@ -2772,3 +2761,14 @@ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, U64 srcSize, si
cp = ZSTD_adjustCParams(cp, srcSize, dictSize);
return cp;
}
/*! ZSTD_getParams() :
* same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object instead of a `ZSTD_compressionParameters`.
* All fields of `ZSTD_frameParameters` are set to default (0) */
ZSTD_parameters ZSTD_getParams(int compressionLevel, U64 srcSize, size_t dictSize) {
ZSTD_parameters params;
ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize);
memset(&params, 0, sizeof(params));
params.cParams = cParams;
return params;
}

View File

@ -34,6 +34,10 @@
/* Note : this file is intended to be included within zstd_compress.c */
#ifndef ZSTD_OPT_H_91842398743
#define ZSTD_OPT_H_91842398743
#define ZSTD_FREQ_DIV 5
/*-*************************************
@ -110,7 +114,7 @@ FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t* ssPtr, U32 litLength, const BY
/* literals */
if (ssPtr->cachedLiterals == literals) {
U32 additional = litLength - ssPtr->cachedLitLength;
U32 const additional = litLength - ssPtr->cachedLitLength;
const BYTE* literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength;
price = ssPtr->cachedPrice + additional * ssPtr->log2litSum;
for (u=0; u < additional; u++)
@ -150,7 +154,7 @@ FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t* ssPtr, U32 litLength, const BY
FORCE_INLINE U32 ZSTD_getPrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
{
/* offset */
BYTE offCode = (BYTE)ZSTD_highbit32(offset+1);
BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
U32 price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode]+1);
/* match Length */
@ -196,7 +200,7 @@ MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const B
}
/* match offset */
{ BYTE offCode = (BYTE)ZSTD_highbit32(offset+1);
{ BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
seqStorePtr->offCodeSum++;
seqStorePtr->offCodeFreq[offCode]++;
}
@ -232,7 +236,6 @@ MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const B
/* Update hashTable3 up to ip (excluded)
Assumption : always within prefix (ie. not within extDict) */
FORCE_INLINE
@ -1039,3 +1042,5 @@ _storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */
seqStorePtr->lit += lastLLSize;
}
}
#endif /* ZSTD_OPT_H_91842398743 */

View File

@ -173,7 +173,7 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
if (ZSTD_isError(hSize)) return hSize;
if (toLoad > (size_t)(iend-ip)) { /* not enough input to load full header */
memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
zbd->lhSize += iend-ip; ip = iend; notDone = 0;
zbd->lhSize += iend-ip;
*dstCapacityPtr = 0;
return (hSize - zbd->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
}

View File

@ -207,20 +207,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
*/
/* Frame descriptor
/* Frame Header :
// old
1 byte - Alloc :
bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN (see zstd_internal.h)
bit 4 : reserved for windowLog (must be zero)
bit 5 : reserved (must be zero)
bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
1 byte - checker :
bit 0-1 : dictID (0, 1, 2 or 4 bytes)
bit 2-7 : reserved (must be zero)
// new
1 byte - FrameHeaderDescription :
bit 0-1 : dictID (0, 1, 2 or 4 bytes)
bit 2 : checksumFlag
@ -454,16 +442,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
{
const BYTE* const istart = (const BYTE*) src;
litBlockType_t lbt;
if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
lbt = (litBlockType_t)(istart[0]>> 6);
switch(lbt)
switch((litBlockType_t)(istart[0]>> 6))
{
case lbt_huffman:
{ size_t litSize, litCSize, singleStream=0;
U32 lhSize = ((istart[0]) >> 4) & 3;
U32 lhSize = (istart[0] >> 4) & 3;
if (srcSize < 5) return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
switch(lhSize)
{
@ -930,8 +916,11 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize)
{
size_t dSize;
ZSTD_checkContinuity(dctx, dst);
return ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
dctx->previousDstEnd = (char*)dst + dSize;
return dSize;
}

View File

@ -826,7 +826,6 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
MEM_writeLE32(dstPtr+4, repStartValue[1]);
MEM_writeLE32(dstPtr+8, repStartValue[2]);
#endif
dstPtr += 12;
eSize += 12;
_cleanup:
@ -906,6 +905,7 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
}
#define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
#define EXIT(e) { dictSize = ERROR(e); goto _cleanup; }
/*! ZDICT_trainFromBuffer_unsafe() :
* `samplesBuffer` must be followed by noisy guard band.
* @return : size of dictionary.
@ -923,12 +923,12 @@ size_t ZDICT_trainFromBuffer_unsafe(
size_t dictSize = 0;
/* checks */
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) EXIT(dstSize_tooSmall);
if (!dictList) return ERROR(memory_allocation);
/* init */
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
if (sBuffSize < DIB_MINSAMPLESSIZE) return 0; /* not enough source to create dictionary */
if (sBuffSize < DIB_MINSAMPLESSIZE) EXIT(no_error); /* not enough source to create dictionary */
ZDICT_initDictItem(dictList);
g_displayLevel = params.notificationLevel;
if (selectivity==0) selectivity = g_selectivity_default;
@ -948,9 +948,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
DISPLAYLEVEL(3, "list %u best segments \n", nb);
for (u=1; u<=nb; u++) {
U32 p = dictList[u].pos;
U32 l = dictList[u].length;
U32 d = MIN(40, l);
U32 const p = dictList[u].pos;
U32 const l = dictList[u].length;
U32 const d = MIN(40, l);
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
u, l, p, dictList[u].savings);
ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
@ -966,7 +966,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
for (u=1; u<dictList->pos; u++) {
U32 l = dictList[u].length;
ptr -= l;
if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
if (ptr<(BYTE*)dictBuffer) EXIT(GENERIC); /* should not happen */
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
} }
@ -983,7 +983,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
params);
}
/* clean up */
_cleanup :
free(dictList);
return dictSize;
}

1
programs/.gitignore vendored
View File

@ -43,6 +43,7 @@ _*
tmp*
*.zst
result
out
# fuzzer
afl

View File

@ -148,16 +148,14 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
size_t const maxCompressedSize = ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024); /* add some room for safety */
void* const compressedBuffer = malloc(maxCompressedSize);
void* const resultBuffer = malloc(srcSize);
ZSTD_CCtx* refCtx = ZSTD_createCCtx();
ZSTD_CCtx* ctx = ZSTD_createCCtx();
ZSTD_DCtx* refDCtx = ZSTD_createDCtx();
ZSTD_DCtx* dctx = ZSTD_createDCtx();
U32 nbBlocks;
UTIL_time_t ticksPerSecond;
/* checks */
if (!compressedBuffer || !resultBuffer || !blockTable || !refCtx || !ctx || !refDCtx || !dctx)
EXM_THROW(31, "not enough memory");
if (!compressedBuffer || !resultBuffer || !blockTable || !ctx || !dctx)
EXM_THROW(31, "allocation error : not enough memory");
/* init */
if (strlen(displayName)>17) displayName += strlen(displayName)-17; /* can only display 17 characters */
@ -217,8 +215,11 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
UTIL_waitForNextTick(ticksPerSecond);
UTIL_getTime(&clockStart);
{ U32 nbLoops = 0;
ZSTD_CDict* cdict = ZSTD_createCDict(dictBuffer, dictBufferSize, cLevel);
{ size_t const refSrcSize = (nbBlocks == 1) ? srcSize : 0;
ZSTD_parameters const zparams = ZSTD_getParams(cLevel, refSrcSize, dictBufferSize);
ZSTD_customMem const cmem = { NULL, NULL, NULL };
U32 nbLoops = 0;
ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, zparams, cmem);
if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict() allocation failure");
do {
U32 blockNb;
@ -227,7 +228,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
blockTable[blockNb].cPtr, blockTable[blockNb].cRoom,
blockTable[blockNb].srcPtr,blockTable[blockNb].srcSize,
cdict);
if (ZSTD_isError(rSize)) EXM_THROW(1, "ZSTD_compress_usingPreparedCCtx() failed : %s", ZSTD_getErrorName(rSize));
if (ZSTD_isError(rSize)) EXM_THROW(1, "ZSTD_compress_usingCDict() failed : %s", ZSTD_getErrorName(rSize));
blockTable[blockNb].cSize = rSize;
}
nbLoops++;
@ -264,7 +265,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
blockTable[blockNb].cPtr, blockTable[blockNb].cSize,
ddict);
if (ZSTD_isError(regenSize)) {
DISPLAY("ZSTD_decompress_usingPreparedDCtx() failed on block %u : %s \n",
DISPLAY("ZSTD_decompress_usingDDict() failed on block %u : %s \n",
blockNb, ZSTD_getErrorName(regenSize));
clockLoop = 0; /* force immediate test end */
break;
@ -321,9 +322,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
free(blockTable);
free(compressedBuffer);
free(resultBuffer);
ZSTD_freeCCtx(refCtx);
ZSTD_freeCCtx(ctx);
ZSTD_freeDCtx(refDCtx);
ZSTD_freeDCtx(dctx);
return 0;
}

View File

@ -23,12 +23,19 @@
- source repository : https://github.com/Cyan4973/zstd
*/
/* *************************************
* Compiler Options
***************************************/
#define _CRT_SECURE_NO_WARNINGS /* removes Visual warning on strerror() */
/*-************************************
* Includes
**************************************/
#include <stdlib.h> /* malloc */
#include <stdio.h> /* FILE, fwrite, fprintf */
#include <string.h> /* memcpy */
#include <errno.h> /* errno */
#include "mem.h" /* U32 */
@ -104,7 +111,7 @@ static BYTE RDG_genChar(U32* seed, const BYTE* ldt)
U32 const id = RDG_rand(seed) & LTMASK;
//TRACE(" %u : \n", id);
//TRACE(" %4u [%4u] ; val : %4u \n", id, id&255, ldt[id]);
return (ldt[id]); /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with 0.0. Checked : table is fully initialized */
return ldt[id]; /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with P==0.0. Checked : table is fully initialized */
}
@ -115,8 +122,7 @@ static U32 RDG_rand15Bits (unsigned* seedPtr)
static U32 RDG_randLength(unsigned* seedPtr)
{
if (RDG_rand(seedPtr) & 7)
return (RDG_rand(seedPtr) & 0xF);
if (RDG_rand(seedPtr) & 7) return (RDG_rand(seedPtr) & 0xF); /* small length */
return (RDG_rand(seedPtr) & 0x1FF) + 0xF;
}
@ -185,10 +191,10 @@ void RDG_genStdout(unsigned long long size, double matchProba, double litProba,
size_t const stdDictSize = 32 KB;
BYTE* const buff = (BYTE*)malloc(stdDictSize + stdBlockSize);
U64 total = 0;
BYTE ldt[LTSIZE];
BYTE ldt[LTSIZE]; /* literals distribution table */
/* init */
if (buff==NULL) { fprintf(stdout, "not enough memory\n"); exit(1); }
if (buff==NULL) { fprintf(stderr, "datagen: error: %s \n", strerror(errno)); exit(1); }
if (litProba<=0.0) litProba = matchProba / 4.5;
memset(ldt, '0', sizeof(ldt));
RDG_fillLiteralDistrib(ldt, litProba);

View File

@ -42,12 +42,13 @@
* Compiler Options
***************************************/
#define _POSIX_SOURCE 1 /* enable %llu on Windows */
#define _CRT_SECURE_NO_WARNINGS /* removes Visual warning on strerror() */
/*-*************************************
* Includes
***************************************/
#include "util.h" /* Compiler options, UTIL_GetFileSize */
#include "util.h" /* Compiler options, UTIL_GetFileSize, _LARGEFILE64_SOURCE */
#include <stdio.h> /* fprintf, fopen, fread, _fileno, stdin, stdout */
#include <stdlib.h> /* malloc, free */
#include <string.h> /* strcmp, strlen */
@ -58,7 +59,6 @@
#include "fileio.h"
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
#include "zstd.h"
#include "zstd_internal.h" /* MIN, KB, MB */
#define ZBUFF_STATIC_LINKING_ONLY
#include "zbuff.h"
@ -84,6 +84,10 @@
/*-*************************************
* Constants
***************************************/
#define KB *(1<<10)
#define MB *(1<<20)
#define GB *(1U<<30)
#define _1BIT 0x01
#define _2BITS 0x03
#define _3BITS 0x07
@ -113,21 +117,17 @@ static U32 g_displayLevel = 2; /* 0 : no display; 1: errors; 2 : + result
void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
if ((FIO_GetMilliSpan(g_time) > refreshRate) || (g_displayLevel>=4)) \
if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
if (g_displayLevel>=4) fflush(stdout); } }
static const unsigned refreshRate = 150;
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
static unsigned FIO_GetMilliSpan(clock_t nPrevious)
{
clock_t const nCurrent = clock();
return (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
}
#define MIN(a,b) ((a) < (b) ? (a) : (b))
/*-*************************************
* Local Parameters
* Local Parameters - Not thread safe
***************************************/
static U32 g_overwrite = 0;
void FIO_overwriteMode(void) { g_overwrite=1; }
@ -175,7 +175,7 @@ static FILE* FIO_openSrcFile(const char* srcFileName)
f = fopen(srcFileName, "rb");
}
if ( f==NULL ) DISPLAYLEVEL(1, "zstd: %s: No such file\n", srcFileName);
if ( f==NULL ) DISPLAYLEVEL(1, "zstd: %s: %s \n", srcFileName, strerror(errno));
return f;
}
@ -201,18 +201,20 @@ static FILE* FIO_openDstFile(const char* dstFileName)
if (g_displayLevel <= 1) {
/* No interaction possible */
DISPLAY("zstd: %s already exists; not overwritten \n", dstFileName);
return 0;
return NULL;
}
DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
{ int ch = getchar();
if ((ch!='Y') && (ch!='y')) {
DISPLAY(" not overwritten \n");
return 0;
return NULL;
}
while ((ch!=EOF) && (ch!='\n')) ch = getchar(); /* flush rest of input line */
} } }
f = fopen( dstFileName, "wb" );
}
if (f==NULL) DISPLAYLEVEL(1, "zstd: %s: %s\n", dstFileName, strerror(errno));
return f;
}
@ -233,18 +235,18 @@ static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
DISPLAYLEVEL(4,"Loading %s as dictionary \n", fileName);
fileHandle = fopen(fileName, "rb");
if (fileHandle==0) EXM_THROW(31, "Error opening file %s", fileName);
if (fileHandle==0) EXM_THROW(31, "zstd: %s: %s", fileName, strerror(errno));
fileSize = UTIL_getFileSize(fileName);
if (fileSize > MAX_DICT_SIZE) {
int seekResult;
if (fileSize > 1 GB) EXM_THROW(32, "Dictionary file %s is too large", fileName); /* avoid extreme cases */
DISPLAYLEVEL(2,"Dictionary %s is too large : using last %u bytes only \n", fileName, MAX_DICT_SIZE);
seekResult = fseek(fileHandle, (long int)(fileSize-MAX_DICT_SIZE), SEEK_SET); /* use end of file */
if (seekResult != 0) EXM_THROW(33, "Error seeking into file %s", fileName);
if (seekResult != 0) EXM_THROW(33, "zstd: %s: %s", fileName, strerror(errno));
fileSize = MAX_DICT_SIZE;
}
*bufferPtr = (BYTE*)malloc((size_t)fileSize);
if (*bufferPtr==NULL) EXM_THROW(34, "Allocation error : not enough memory for dictBuffer");
*bufferPtr = malloc((size_t)fileSize);
if (*bufferPtr==NULL) EXM_THROW(34, "zstd: %s", strerror(errno));
{ size_t const readSize = fread(*bufferPtr, 1, (size_t)fileSize, fileHandle);
if (readSize!=fileSize) EXM_THROW(35, "Error reading dictionary file %s", fileName); }
fclose(fileHandle);
@ -273,14 +275,14 @@ static cRess_t FIO_createCResources(const char* dictFileName)
cRess_t ress;
ress.ctx = ZBUFF_createCCtx();
if (ress.ctx == NULL) EXM_THROW(30, "Allocation error : can't create ZBUFF context");
if (ress.ctx == NULL) EXM_THROW(30, "zstd: allocation error : can't create ZBUFF context");
/* Allocate Memory */
ress.srcBufferSize = ZBUFF_recommendedCInSize();
ress.srcBuffer = malloc(ress.srcBufferSize);
ress.dstBufferSize = ZBUFF_recommendedCOutSize();
ress.dstBuffer = malloc(ress.dstBufferSize);
if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(31, "Allocation error : not enough memory");
if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(31, "zstd: allocation error : not enough memory");
/* dictionary */
ress.dictBufferSize = FIO_loadFile(&(ress.dictBuffer), dictFileName);
@ -295,7 +297,7 @@ static void FIO_freeCResources(cRess_t ress)
free(ress.dstBuffer);
free(ress.dictBuffer);
errorCode = ZBUFF_freeCCtx(ress.ctx);
if (ZBUFF_isError(errorCode)) EXM_THROW(38, "Error : can't release ZBUFF context resource : %s", ZBUFF_getErrorName(errorCode));
if (ZBUFF_isError(errorCode)) EXM_THROW(38, "zstd: error : can't release ZBUFF context resource : %s", ZBUFF_getErrorName(errorCode));
}
@ -315,9 +317,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
U64 const fileSize = UTIL_getFileSize(srcFileName);
/* init */
{ ZSTD_parameters params;
memset(&params, 0, sizeof(params));
params.cParams = ZSTD_getCParams(cLevel, fileSize, ress.dictBufferSize);
{ ZSTD_parameters params = ZSTD_getParams(cLevel, fileSize, ress.dictBufferSize);
params.fParams.contentSizeFlag = 1;
params.fParams.checksumFlag = g_checksumFlag;
params.fParams.noDictIDFlag = !g_dictIDFlag;
@ -375,8 +375,8 @@ static int FIO_compressFilename_internal(cRess_t ress,
}
/*! FIO_compressFilename_internal() :
* same as FIO_compressFilename_extRess(), with ress.destFile already opened (typically stdout)
/*! FIO_compressFilename_srcFile() :
* note : ress.destFile already opened
* @return : 0 : compression completed correctly,
* 1 : missing or pb opening srcFileName
*/
@ -417,7 +417,7 @@ static int FIO_compressFilename_dstFile(cRess_t ress,
result = FIO_compressFilename_srcFile(ress, dstFileName, srcFileName, cLevel);
if (fclose(ress.dstFile)) EXM_THROW(28, "Write error : cannot properly close %s", dstFileName);
if (fclose(ress.dstFile)) { DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno)); result=1; }
if (result!=0) remove(dstFileName); /* remove operation artefact */
return result;
}
@ -429,13 +429,13 @@ int FIO_compressFilename(const char* dstFileName, const char* srcFileName,
clock_t const start = clock();
cRess_t const ress = FIO_createCResources(dictFileName);
int const issueWithSrcFile = FIO_compressFilename_dstFile(ress, dstFileName, srcFileName, compressionLevel);
FIO_freeCResources(ress);
int const result = FIO_compressFilename_dstFile(ress, dstFileName, srcFileName, compressionLevel);
{ double const seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
double const seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
DISPLAYLEVEL(4, "Completed in %.2f sec \n", seconds);
}
return issueWithSrcFile;
FIO_freeCResources(ress);
return result;
}

View File

@ -40,7 +40,7 @@
#include <sys/timeb.h> /* timeb */
#include <string.h> /* strcmp */
#include <time.h> /* clock_t */
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressContinue */
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressContinue, ZSTD_compressBlock */
#include "zstd.h" /* ZSTD_VERSION_STRING, ZSTD_getErrorCode */
#include "zdict.h" /* ZDICT_trainFromBuffer */
#include "datagen.h" /* RDG_genBuffer */
@ -109,9 +109,9 @@ static unsigned FUZ_highbit32(U32 v32)
}
#define CHECKTEST(var, fn) size_t const var = fn; if (ZSTD_isError(var)) goto _output_error
#define CHECK(fn) { CHECKTEST(err, fn); }
#define CHECKPLUS(var, fn, more) { CHECKTEST(var, fn); more; }
#define CHECK_V(var, fn) size_t const var = fn; if (ZSTD_isError(var)) goto _output_error
#define CHECK(fn) { CHECK_V(err, fn); }
#define CHECKPLUS(var, fn, more) { CHECK_V(var, fn); more; }
static int basicUnitTests(U32 seed, double compressibility)
{
size_t const CNBuffSize = 5 MB;
@ -216,10 +216,8 @@ static int basicUnitTests(U32 seed, double compressibility)
DISPLAYLEVEL(4, "test%3i : check content size on duplicated context : ", testNb++);
{ size_t const testSize = CNBuffSize / 3;
{ ZSTD_compressionParameters const cPar = ZSTD_getCParams(2, testSize, dictSize);
ZSTD_frameParameters const fPar = { 1 , 0 , 0 };
ZSTD_parameters p;
p.cParams = cPar; p.fParams = fPar;
{ ZSTD_parameters p = ZSTD_getParams(2, testSize, dictSize);
p.fParams.contentSizeFlag = 1;
CHECK( ZSTD_compressBegin_advanced(ctxOrig, CNBuffer, dictSize, p, testSize-1) );
}
CHECK( ZSTD_copyCCtx(ctxDuplicated, ctxOrig) );
@ -277,10 +275,8 @@ static int basicUnitTests(U32 seed, double compressibility)
DISPLAYLEVEL(4, "OK \n");
DISPLAYLEVEL(4, "test%3i : compress without dictID : ", testNb++);
{ ZSTD_frameParameters const fParams = { 0 /*contentSize*/, 0 /*checksum*/, 1 /*NoDictID*/ };
ZSTD_compressionParameters const cParams = ZSTD_getCParams(3, CNBuffSize, dictSize);
ZSTD_parameters p;
p.cParams = cParams; p.fParams = fParams;
{ ZSTD_parameters p = ZSTD_getParams(3, CNBuffSize, dictSize);
p.fParams.noDictIDFlag = 1;
cSize = ZSTD_compress_advanced(cctx, compressedBuffer, ZSTD_compressBound(CNBuffSize),
CNBuffer, CNBuffSize,
dictBuffer, dictSize, p);
@ -320,6 +316,7 @@ static int basicUnitTests(U32 seed, double compressibility)
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
static const size_t blockSize = 100 KB;
static const size_t dictSize = 16 KB;
size_t cSize2;
/* basic block compression */
DISPLAYLEVEL(4, "test%3i : Block compression test : ", testNb++);
@ -330,7 +327,7 @@ static int basicUnitTests(U32 seed, double compressibility)
DISPLAYLEVEL(4, "test%3i : Block decompression test : ", testNb++);
CHECK( ZSTD_decompressBegin(dctx) );
{ CHECKTEST(r, ZSTD_decompressBlock(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize) );
{ CHECK_V(r, ZSTD_decompressBlock(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize) );
if (r != blockSize) goto _output_error; }
DISPLAYLEVEL(4, "OK \n");
@ -339,11 +336,15 @@ static int basicUnitTests(U32 seed, double compressibility)
CHECK( ZSTD_compressBegin_usingDict(cctx, CNBuffer, dictSize, 5) );
cSize = ZSTD_compressBlock(cctx, compressedBuffer, ZSTD_compressBound(blockSize), (char*)CNBuffer+dictSize, blockSize);
if (ZSTD_isError(cSize)) goto _output_error;
cSize2 = ZSTD_compressBlock(cctx, (char*)compressedBuffer+cSize, ZSTD_compressBound(blockSize), (char*)CNBuffer+dictSize+blockSize, blockSize);
if (ZSTD_isError(cSize2)) goto _output_error;
DISPLAYLEVEL(4, "OK \n");
DISPLAYLEVEL(4, "test%3i : Dictionary Block decompression test : ", testNb++);
CHECK( ZSTD_decompressBegin_usingDict(dctx, CNBuffer, dictSize) );
{ CHECKTEST( r, ZSTD_decompressBlock(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize) );
{ CHECK_V( r, ZSTD_decompressBlock(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize) );
if (r != blockSize) goto _output_error; }
{ CHECK_V( r, ZSTD_decompressBlock(dctx, (char*)decodedBuffer+blockSize, CNBuffSize, (char*)compressedBuffer+cSize, cSize2) );
if (r != blockSize) goto _output_error; }
DISPLAYLEVEL(4, "OK \n");
@ -361,7 +362,7 @@ static int basicUnitTests(U32 seed, double compressibility)
sampleSize += 96 KB;
cSize = ZSTD_compress(compressedBuffer, ZSTD_compressBound(sampleSize), CNBuffer, sampleSize, 1);
if (ZSTD_isError(cSize)) goto _output_error;
{ CHECKTEST(regenSize, ZSTD_decompress(decodedBuffer, sampleSize, compressedBuffer, cSize));
{ CHECK_V(regenSize, ZSTD_decompress(decodedBuffer, sampleSize, compressedBuffer, cSize));
if (regenSize!=sampleSize) goto _output_error; }
DISPLAYLEVEL(4, "OK \n");
}
@ -370,12 +371,12 @@ static int basicUnitTests(U32 seed, double compressibility)
#define ZEROESLENGTH 100
DISPLAYLEVEL(4, "test%3i : compress %u zeroes : ", testNb++, ZEROESLENGTH);
memset(CNBuffer, 0, ZEROESLENGTH);
{ CHECKTEST(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(ZEROESLENGTH), CNBuffer, ZEROESLENGTH, 1) );
{ CHECK_V(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(ZEROESLENGTH), CNBuffer, ZEROESLENGTH, 1) );
cSize = r; }
DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/ZEROESLENGTH*100);
DISPLAYLEVEL(4, "test%3i : decompress %u zeroes : ", testNb++, ZEROESLENGTH);
{ CHECKTEST(r, ZSTD_decompress(decodedBuffer, ZEROESLENGTH, compressedBuffer, cSize) );
{ CHECK_V(r, ZSTD_decompress(decodedBuffer, ZEROESLENGTH, compressedBuffer, cSize) );
if (r != ZEROESLENGTH) goto _output_error; }
DISPLAYLEVEL(4, "OK \n");
@ -403,13 +404,13 @@ static int basicUnitTests(U32 seed, double compressibility)
((BYTE*)CNBuffer)[i+2] = _3BytesSeqs[id][2];
} }}
DISPLAYLEVEL(4, "test%3i : compress lots 3-bytes sequences : ", testNb++);
{ CHECKTEST(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(_3BYTESTESTLENGTH),
{ CHECK_V(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(_3BYTESTESTLENGTH),
CNBuffer, _3BYTESTESTLENGTH, 19) );
cSize = r; }
DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/_3BYTESTESTLENGTH*100);
DISPLAYLEVEL(4, "test%3i : decompress lots 3-bytes sequence : ", testNb++);
{ CHECKTEST(r, ZSTD_decompress(decodedBuffer, _3BYTESTESTLENGTH, compressedBuffer, cSize) );
{ CHECK_V(r, ZSTD_decompress(decodedBuffer, _3BYTESTESTLENGTH, compressedBuffer, cSize) );
if (r != _3BYTESTESTLENGTH) goto _output_error; }
DISPLAYLEVEL(4, "OK \n");

View File

@ -381,13 +381,9 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compres
{ size_t const dictStart = FUZ_rand(&lseed) % (srcBufferSize - dictSize);
dict = srcBuffer + dictStart;
}
{ ZSTD_compressionParameters const cPar = ZSTD_getCParams(cLevel, 0, dictSize);
U32 const checksum = FUZ_rand(&lseed) & 1;
U32 const noDictIDFlag = FUZ_rand(&lseed) & 1;
ZSTD_frameParameters const fPar = { 0, checksum, noDictIDFlag };
ZSTD_parameters params;
params.cParams = cPar;
params.fParams = fPar;
{ ZSTD_parameters params = ZSTD_getParams(cLevel, 0, dictSize);
params.fParams.checksumFlag = FUZ_rand(&lseed) & 1;
params.fParams.noDictIDFlag = FUZ_rand(&lseed) & 1;
{ size_t const initError = ZBUFF_compressInit_advanced(zc, dict, dictSize, params, 0);
CHECK (ZBUFF_isError(initError),"init error : %s", ZBUFF_getErrorName(initError));
} } }

View File

@ -33,7 +33,8 @@ It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages.
It also features a very fast decoder, with speed > 500 MB/s per core.
\fBzstd\fR command line is generally similar to gzip, but features the following differences :
- Original files are preserved
- Source files are preserved by default
It's possible to remove them automatically by using \fB--rm\fR command
- By default, when compressing a single file, \fBzstd\fR displays progress notifications and result summary.
Use \fB-q\fR to turn them off
@ -57,6 +58,19 @@ It also features a very fast decoder, with speed > 500 MB/s per core.
.BR \-f ", " --force
overwrite output without prompting
.TP
.BR \-c ", " --stdout
force write to standard output, even if it is the console
.TP
.BR \--rm
remove source file(s) after successful compression or decompression
.TP
.BR \-k ", " --keep
keep source file(s) after successful compression or decompression.
This is the default behavior.
.TP
.BR \-r
operate recursively on directories
.TP
.BR \-h/\-H ", " --help
display help/long help and exit
.TP
@ -69,9 +83,6 @@ It also features a very fast decoder, with speed > 500 MB/s per core.
.BR \-q ", " --quiet
suppress warnings and notifications; specify twice to suppress errors too
.TP
.BR \-c ", " --stdout
force write to standard output, even if it is the console
.TP
.BR \-C ", " --check
add integrity check computed from uncompressed data
.TP

View File

@ -115,6 +115,7 @@ static int usage(const char* programName)
DISPLAY( " -D file: use `file` as Dictionary \n");
DISPLAY( " -o file: result stored into `file` (only if 1 input file) \n");
DISPLAY( " -f : overwrite output without prompting \n");
DISPLAY( "--rm : remove source file(s) after successful de/compression \n");
DISPLAY( " -h/-H : display help/long help and exit\n");
return 0;
}
@ -132,7 +133,6 @@ static int usage_advanced(const char* programName)
#ifdef UTIL_HAS_CREATEFILELIST
DISPLAY( " -r : operate recursively on directories\n");
#endif
DISPLAY( "--rm : remove source files after successful de/compression \n");
#ifndef ZSTD_NOCOMPRESS
DISPLAY( "--ultra : enable ultra modes (requires more memory to decompress)\n");
DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
@ -181,7 +181,7 @@ static void waitEnter(void)
/*! readU32FromChar() :
@return : unsigned integer value reach from input in `char` format
Will also modify `*stringPtr`, advancing it to position where it stopped reading.
Note : this function can overflow if result > MAX_UNIT */
Note : this function can overflow if result > MAX_UINT */
static unsigned readU32FromChar(const char** stringPtr)
{
unsigned result = 0;
@ -254,7 +254,7 @@ int main(int argCount, const char** argv)
if (!strcmp(argument, "--help")) { displayOut=stdout; CLEAN_RETURN(usage_advanced(programName)); }
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; displayLevel=1; continue; }
if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; displayLevel-=(displayLevel==2); continue; }
if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; }
if (!strcmp(argument, "--check")) { FIO_setChecksumFlag(2); continue; }
if (!strcmp(argument, "--no-check")) { FIO_setChecksumFlag(0); continue; }
@ -265,13 +265,17 @@ int main(int argCount, const char** argv)
if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; continue; }
if (!strcmp(argument, "--keep")) { continue; } /* does nothing, since preserving input is default; for gzip/xz compatibility */
if (!strcmp(argument, "--keep")) { FIO_setRemoveSrcFile(0); continue; }
if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
/* '-' means stdin/stdout */
if (!strcmp(argument, "-")){
if (!filenameIdx) { filenameIdx=1, filenameTable[0]=stdinmark; outFileName=stdoutmark; continue; }
}
if (!filenameIdx) {
filenameIdx=1, filenameTable[0]=stdinmark;
outFileName=stdoutmark;
displayLevel-=(displayLevel==2);
continue;
} }
/* Decode commands (note : aggregated commands are allowed) */
if (argument[0]=='-') {
@ -300,7 +304,7 @@ int main(int argCount, const char** argv)
case 'd': decode=1; argument++; break;
/* Force stdout, even if stdout==console */
case 'c': forceStdout=1; outFileName=stdoutmark; displayLevel=1; argument++; break;
case 'c': forceStdout=1; outFileName=stdoutmark; displayLevel-=(displayLevel==2); argument++; break;
/* Use file content as dictionary */
case 'D': nextEntryIsDictionary = 1; argument++; break;
@ -314,8 +318,8 @@ int main(int argCount, const char** argv)
/* Quiet mode */
case 'q': displayLevel--; argument++; break;
/* keep source file (default anyway, so useless; for gzip/xz compatibility) */
case 'k': argument++; break;
/* keep source file (default); for gzip/xz compatibility */
case 'k': FIO_setRemoveSrcFile(0); argument++; break;
/* Checksum */
case 'C': argument++; FIO_setChecksumFlag(2); break;

View File

@ -7,3 +7,4 @@ The following projects are included with the zstd distribution:
- cmake - CMake project contributed by Artyom Dymchenko
- VS2008 - Visual Studio 2008 project
- VS2010 - Visual Studio 2010 project (which also works well with Visual Studio 2012, 2013, 2015)
- build - command line scripts prepared for Visual Studio compilation without IDE

591
zstd_compression_format.md Normal file
View File

@ -0,0 +1,591 @@
Zstandard Compression Format
============================
### Notices
Copyright (c) 2016 Yann Collet
Permission is granted to copy and distribute this document
for any purpose and without charge,
including translations into other languages
and incorporation into compilations,
provided that the copyright notice and this notice are preserved,
and that any substantive changes or deletions from the original
are clearly marked.
Distribution of this document is unlimited.
### Version
0.0.1 (30/06/2016 - Work in progress - unfinished)
Introduction
------------
The purpose of this document is to define a lossless compressed data format,
that is independent of CPU type, operating system,
file system and character set, suitable for
File compression, Pipe and streaming compression
using the [Zstandard algorithm](http://www.zstandard.org).
The data can be produced or consumed,
even for an arbitrarily long sequentially presented input data stream,
using only an a priori bounded amount of intermediate storage,
and hence can be used in data communications.
The format uses the Zstandard compression method,
and optional [xxHash-64 checksum method](http://www.xxhash.org),
for detection of data corruption.
The data format defined by this specification
does not attempt to allow random access to compressed data.
This specification is intended for use by implementers of software
to compress data into Zstandard format and/or decompress data from Zstandard format.
The text of the specification assumes a basic background in programming
at the level of bits and other primitive data representations.
Unless otherwise indicated below,
a compliant compressor must produce data sets
that conform to the specifications presented here.
It doesnt need to support all options though.
A compliant decompressor must be able to decompress
at least one working set of parameters
that conforms to the specifications presented here.
It may also ignore informative fields, such as checksum.
Whenever it does not support a parameter defined in the compressed stream,
it must produce a non-ambiguous error code and associated error message
explaining which parameter is unsupported.
Definitions
-----------
A content compressed by Zstandard is transformed into a Zstandard __frame__.
Multiple frames can be appended into a single file or stream.
A frame is totally independent, has a defined beginning and end,
and a set of parameters which tells the decoder how to decompress it.
A frame encapsulates one or multiple __blocks__.
Each block can be compressed or not,
and has a guaranteed maximum content size, which depends on frame parameters.
Unlike frames, each block depends on previous blocks for proper decoding.
However, each block can be decompressed without waiting for its successor,
allowing streaming operations.
General Structure of Zstandard Frame format
-------------------------------------------
| MagicNb | F. Header | Block | (More blocks) | EndMark |
|:-------:|:----------:| ----- | ------------- | ------- |
| 4 bytes | 2-14 bytes | | | 3 bytes |
__Magic Number__
4 Bytes, Little endian format.
Value : 0xFD2FB527
__Frame Header__
2 to 14 Bytes, to be detailed in the next part.
__Data Blocks__
To be detailed later on.
Thats where compressed data is stored.
__EndMark__
The flow of blocks ends when the last block header brings an _end signal_ .
This last block header may optionally host a __Content Checksum__ .
__Content Checksum__
Content Checksum verify that frame content has been regenrated correctly.
The content checksum is the result
of [xxh64() hash function](https://www.xxHash.com)
digesting the original (decoded) data as input, and a seed of zero.
Bits from 11 to 32 (included) are extracted to form a 22 bits checksum
stored into the last block header.
```
contentChecksum = (XXH64(content, size, 0) >> 11) & (1<<22)-1);
```
Content checksum is only present when its associated flag
is set in the frame descriptor.
Its usage is optional.
__Frame Concatenation__
In some circumstances, it may be required to append multiple frames,
for example in order to add new data to an existing compressed file
without re-framing it.
In such case, each frame brings its own set of descriptor flags.
Each frame is considered independent.
The only relation between frames is their sequential order.
The ability to decode multiple concatenated frames
within a single stream or file is left outside of this specification.
As an example, the reference `zstd` command line utility is able
to decode all concatenated frames in their sequential order,
delivering the final decompressed result as if it was a single content.
Frame Header
-------------
| FHD | (WD) | (Content Size) | (dictID) |
| ------- | --------- |:--------------:| --------- |
| 1 byte | 0-1 byte | 0 - 8 bytes | 0-4 bytes |
Frame header has a variable size, which uses a minimum of 2 bytes,
and up to 14 bytes depending on optional parameters.
__FHD byte__ (Frame Header Descriptor)
The first Header's byte is called the Frame Header Descriptor.
It tells which other fields are present.
Decoding this byte is enough to get the full size of the Frame Header.
| BitNb | 7-6 | 5 | 4 | 3 | 2 | 1-0 |
| ------- | ------ | ------- | ------ | -------- | -------- | -------- |
|FieldName| FCSize | Segment | Unused | Reserved | Checksum | dictID |
In this table, bit 7 is highest bit, while bit 0 is lowest.
__Frame Content Size flag__
This is a 2-bits flag (`= FHD >> 6`),
specifying if decompressed data size is provided within the header.
| Value | 0 | 1 | 2 | 3 |
| ------- | --- | --- | --- | --- |
|FieldSize| 0-1 | 2 | 4 | 8 |
Value 0 has a double meaning :
it either means `0` (size not provided) _if_ the `WD` byte is present,
or it means `1` byte (size <= 255 bytes).
__Single Segment__
If this flag is set,
data shall be regenerated within a single continuous memory segment.
In which case, `WD` byte __is not present__,
but `Frame Content Size` field necessarily is.
As a consequence, the decoder must allocate a memory segment
of size `>= Frame Content Size`.
In order to preserve the decoder from unreasonable memory requirement,
a decoder can refuse a compressed frame
which requests a memory size beyond decoder's authorized range.
For broader compatibility, decoders are recommended to support
memory sizes of 8 MB at least.
However, this is merely a recommendation,
and each decoder is free to support higher or lower limits,
depending on local limitations.
__Unused bit__
The value of this bit is unimportant
and not interpreted by a decoder compliant with this specification version.
It may be used in a future revision,
to signal a property which is not required to properly decode the frame.
__Reserved bit__
This bit is reserved for some future feature.
Its value _must be zero_.
A decoder compliant with this specification version must ensure it is not set.
This bit may be used in a future revision,
to signal a feature that must be interpreted in order to decode the frame.
__Content checksum flag__
If this flag is set, a content checksum will be present into the EndMark.
The checksum is a 22 bits value extracted from the XXH64() of data.
See __Content Checksum__ .
__Dictionary ID flag__
This is a 2-bits flag (`= FHD & 3`),
telling if a dictionary ID is provided within the header
| Value | 0 | 1 | 2 | 3 |
| ------- | --- | --- | --- | --- |
|FieldSize| 0 | 1 | 2 | 4 |
__WD byte__ (Window Descriptor)
Provides guarantees on maximum back-reference distance
that will be present within compressed data.
This information is useful for decoders to allocate enough memory.
| BitNb | 7-3 | 0-2 |
| --------- | -------- | -------- |
| FieldName | Exponent | Mantissa |
Maximum distance is given by the following formulae :
```
windowLog = 10 + Exponent;
windowBase = 1 << windowLog;
windowAdd = (windowBase / 8) * Mantissa;
windowSize = windowBase + windowAdd;
```
The minimum window size is 1 KB.
The maximum size is (15*(2^38))-1 bytes, which is almost 1.875 TB.
To properly decode compressed data,
a decoder will need to allocate a buffer of at least `windowSize` bytes.
Note that `WD` byte is optional. It's not present in `single segment` mode.
In which case, the maximum back-reference distance is the content size itself,
which can be any value from 1 to 2^64-1 bytes (16 EB).
In order to preserve decoder from unreasonable memory requirements,
a decoder can refuse a compressed frame
which requests a memory size beyond decoder's authorized range.
For better interoperability,
decoders are recommended to be compatible with window sizes of 8 MB.
Encoders are recommended to not request more than 8 MB.
It's merely a recommendation though,
decoders are free to support larger or lower limits,
depending on local limitations.
__Frame Content Size__
This is the original (uncompressed) size.
This information is optional, and only present if associated flag is set.
Content size is provided using 1, 2, 4 or 8 Bytes.
Format is Little endian.
| Field Size | Range |
| ---------- | ---------- |
| 0 | 0 |
| 1 | 0 - 255 |
| 2 | 256 - 65791|
| 4 | 0 - 2^32-1 |
| 8 | 0 - 2^64-1 |
When field size is 1, 4 or 8 bytes, the value is read directly.
When field size is 2, _an offset of 256 is added_.
It's allowed to represent a small size (ex: `18`) using the 8-bytes variant.
A size of `0` means `content size is unknown`.
In which case, the `WD` byte will necessarily be present,
and becomes the only hint to determine memory allocation.
In order to preserve decoder from unreasonable memory requirement,
a decoder can refuse a compressed frame
which requests a memory size beyond decoder's authorized range.
__Dictionary ID__
This is a variable size field, which contains a single ID.
It checks if the correct dictionary is used for decoding.
Note that this field is optional. If it's not present,
it's up to the caller to make sure it uses the correct dictionary.
Field size depends on __Dictionary ID flag__.
1 byte can represent an ID 0-255.
2 bytes can represent an ID 0-65535.
4 bytes can represent an ID 0-(2^32-1).
It's allowed to represent a small ID (for example `13`)
with a large 4-bytes dictionary ID, losing some efficiency in the process.
Data Blocks
-----------
| B. Header | data |
|:---------:| ------ |
| 3 bytes | |
__Block Header__
This field uses 3-bytes, format is __big-endian__.
The 2 highest bits represent the `block type`,
while the remaining 22 bits represent the (compressed) block size.
There are 4 block types :
| Value | 0 | 1 | 2 | 3 |
| ---------- | ---------- | --- | --- | ------- |
| Block Type | Compressed | Raw | RLE | EndMark |
- Compressed : this is a Zstandard compressed block,
detailed in a later part of this specification.
"block size" is the compressed size.
Decompressed size is unknown,
but its maximum possible value is guaranteed (see below)
- Raw : this is an uncompressed block.
"block size" is the number of bytes to read and copy.
- RLE : this is a single byte, repeated N times.
In which case, "block size" is the size to regenerate,
while the "compressed" block is just 1 byte (the byte to repeat).
- EndMark : this is not a block. Signal the end of the frame.
The rest of the field may be optionally filled by a checksum
(see frame checksum).
Block sizes must respect a few rules :
- In compressed mode, compressed size if always strictly `< contentSize`.
- Block decompressed size is necessarily <= maximum back-reference distance .
- Block decompressed size is necessarily <= 128 KB
__Data__
Where the actual data to decode stands.
It might be compressed or not, depending on previous field indications.
A data block is not necessarily "full" :
since an arbitrary “flush” may happen anytime,
block content can be any size, up to Block Maximum Size.
Block Maximum Size is the smallest of :
- Max back-reference distance
- 128 KB
Skippable Frames
----------------
| Magic Number | Frame Size | User Data |
|:------------:|:----------:| --------- |
| 4 bytes | 4 bytes | |
Skippable frames allow the insertion of user-defined data
into a flow of concatenated frames.
Its design is pretty straightforward,
with the sole objective to allow the decoder to quickly skip
over user-defined data and continue decoding.
Skippable frames defined in this specification are compatible with LZ4 ones.
__Magic Number__ :
4 Bytes, Little endian format.
Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F.
All 16 values are valid to identify a skippable frame.
__Frame Size__ :
This is the size, in bytes, of the following User Data
(without including the magic number nor the size field itself).
4 Bytes, Little endian format, unsigned 32-bits.
This means User Data cant be bigger than (2^32-1) Bytes.
__User Data__ :
User Data can be anything. Data will just be skipped by the decoder.
Compressed block format
-----------------------
This specification details the content of a _compressed block_.
A compressed block has a size, which must be known in order to decode it.
It also has a guaranteed maximum regenerated size,
in order to properly allocate destination buffer.
See "Frame format" for more details.
A compressed block consists of 2 sections :
- Literals section
- Sequences section
### Prerequisite
For proper decoding, a compressed block requires access to following elements :
- Previous decoded blocks, up to a distance of `windowSize`,
or all previous blocks in the same frame "single segment" mode.
- List of "recent offsets" from previous compressed block.
### Compressed Literals
Literals are compressed using order-0 huffman compression.
During sequence phase, literals will be entangled with match copy operations.
All literals are regrouped in the first part of the block.
They can be decoded first, and then copied during sequence operations,
or they can be decoded on the flow, as needed by sequences.
| Header | (Tree Description) | Stream1 | (Stream2) | (Stream3) | (Stream4) |
| ------ | ------------------ | ------- | --------- | --------- | --------- |
Literals can be compressed, or uncompressed.
When compressed, an optional tree description can be present,
followed by 1 or 4 streams.
#### Block Literal Header
Header is in charge of describing precisely how literals are packed.
It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes,
using big-endian convention.
| BlockType | sizes format | (compressed size) | regenerated size |
| --------- | ------------ | ----------------- | ---------------- |
| 2 bits | 1 - 2 bits | 0 - 18 bits | 5 - 20 bits |
__Block Type__ :
This is a 2-bits field, describing 4 different block types :
| Value | 0 | 1 | 2 | 3 |
| ---------- | ---------- | ------ | --- | ------- |
| Block Type | Compressed | Repeat | Raw | RLE |
- Compressed : This is a standard huffman-compressed block,
starting with a huffman tree description.
See details below.
- Repeat Stats : This is a huffman-compressed block,
using huffman tree from previous huffman-compressed block.
Huffman tree description will be skipped.
Compressed stream is equivalent to "compressed" block type.
- Raw : Literals are stored uncompressed.
- RLE : Literals consist of a single byte value repeated N times.
__Sizes format__ :
Sizes format are divided into 2 families :
- For compressed block, it requires to decode both the compressed size
and the decompressed size. It will also decode the number of streams.
- For Raw or RLE blocks, it's enough to decode the size to regenerate.
For values spanning several bytes, convention is Big-endian.
__Sizes format for Raw or RLE block__ :
- Value : 0x : Regenerated size uses 5 bits (0-31).
Total literal header size is 1 byte.
`size = h[0] & 31;`
- Value : 10 : Regenerated size uses 12 bits (0-4095).
Total literal header size is 2 bytes.
`size = ((h[0] & 15) << 8) + h[1];`
- Value : 11 : Regenerated size uses 20 bits (0-1048575).
Total literal header size is 2 bytes.
`size = ((h[0] & 15) << 16) + (h[1]<<8) + h[2];`
Note : it's allowed to represent a short value (ex : `13`)
using a long format, accepting the reduced compacity.
__Sizes format for Compressed Block__ :
Note : also applicable to "repeat-stats" blocks.
- Value : 00 : 4 streams
Compressed and regenerated sizes use 10 bits (0-1023)
Total literal header size is 3 bytes
- Value : 01 : _Single stream_
Compressed and regenerated sizes use 10 bits (0-1023)
Total literal header size is 3 bytes
- Value : 10 : 4 streams
Compressed and regenerated sizes use 14 bits (0-16383)
Total literal header size is 4 bytes
- Value : 10 : 4 streams
Compressed and regenerated sizes use 18 bits (0-262143)
Total literal header size is 5 bytes
Compressed and regenerated size fields follow big endian convention.
#### Huffman Tree description
This section is only present when block type is _compressed_ (`0`).
It describes the different leaf nodes of the huffman tree,
and their relative weights.
##### Representation
All byte values from zero (included) to last present one (excluded)
are represented by `weight` values, from 0 to `maxBits`.
Transformation from `weight` to `nbBits` follows this formulae :
`nbBits = weight ? maxBits + 1 - weight : 0;` .
The last symbol's weight is deduced from previously decoded ones,
by completing to the nearest power of 2.
This power of 2 gives `maxBits`, the depth of the current tree.
__Example__ :
Let's presume the following huffman tree must be described :
| Value | 0 | 1 | 2 | 3 | 4 | 5 |
| ------ | - | - | - | - | - | - |
| nbBits | 1 | 2 | 3 | 0 | 4 | 4 |
The tree depth is 4, since its smallest element uses 4 bits.
Value `5` will not be listed, nor will values above `5`.
Values from `0` to `4` will be listed using `weight` instead of `nbBits`.
Weight formula is : `weight = nbBits ? maxBits + 1 - nbBits : 0;`
It gives the following serie of weights :
| weight | 4 | 3 | 2 | 0 | 1 |
| ------ | - | - | - | - | - |
| Value | 0 | 1 | 2 | 3 | 4 |
The decoder will do the inverse operation :
having collected weights of symbols from `0` to `4`,
it knows the last symbol, `5`, is present with a non-zero weight.
The weight of `5` can be deduced by joining to the nearest power of 2.
Sum of 2^(weight-1) (excluding 0) is :
8 + 4 + 2 + 0 + 1 = 15
Nearest power of 2 is 16.
Therefore, `maxBits = 4` and `weight[5] = 1`.
It can then proceed to transform back weights into nbBits :
`weight = nbBits ? maxBits + 1 - nbBits : 0;` .
##### Huffman Tree header
This is a single byte value (0-255), which tells how to decode the tree.
- if headerByte >= 242 : this is one of 14 pre-defined weight distributions :
+ 242 : 1x1 (+ 1x1)
+ 243 : 2x1 (+ 1x2)
+ 244 : 3x1 (+ 1x1)
+ 245 : 4x1 (+ 1x4)
+ 246 : 7x1 (+ 1x1)
+ 247 : 8x1 (+ 1x8)
+ 248 : 15x1 (+ 1x1)
+ 249 : 16x1 (+ 1x16)
+ 250 : 31x1 (+ 1x1)
+ 251 : 32x1 (+ 1x32)
+ 252 : 63x1 (+ 1x1)
+ 253 : 64x1 (+ 1x64)
+ 254 :127x1 (+ 1x1)
+ 255 :128x1 (+ 1x128)
- if headerByte >= 128 : this is a direct representation,
where each weight is written directly as a 4 bits field (0-15).
The full representation occupies (nbSymbols+1/2) bytes,
meaning it uses a last full byte even if nbSymbols is odd.
`nbSymbols = headerByte - 127;`
- if headerByte < 128 :
the serie of weights is compressed by FSE.
The length of the compressed serie is `headerByte` (0-127).
##### FSE (Finite State Entropy) compression of huffman weights
The serie of weights is compressed using standard FSE compression.
It's a single bitstream with 2 interleaved states,
using a single distribution table.
To decode an FSE bitstream, it is necessary to know its compressed size.
Compressed size is provided by `headerByte`.
It's also necessary to know its maximum decompressed size.
In this case, it's `255`, since literal values range from `0` to `255`,
and the last symbol value is not represented.
An FSE bitstream starts by a header, describing probabilities distribution.
Result will create a Decoding Table.
It is necessary to know the maximum accuracy of distribution
to properly allocate space for the Table.
For a list of huffman weights, this maximum is 8 bits.
FSE header and bitstreams are described in a separated chapter.
##### Conversion from weights to huffman prefix codes
Version changes
---------------