From 2fa99048444ac00e9f0ca20c0e28ec833603363f Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 1 Jul 2016 20:55:28 +0200 Subject: [PATCH] update specification and comments --- lib/common/zstd_internal.h | 2 +- lib/compress/zstd_compress.c | 33 +-- lib/decompress/zstd_decompress.c | 20 +- ...me_format.md => zstd_compression_format.md | 239 ++++++++++++++---- 4 files changed, 200 insertions(+), 94 deletions(-) rename zstd_frame_format.md => zstd_compression_format.md (52%) diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index 7989e6ac..43cbc9a3 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -51,7 +51,7 @@ /*-************************************* * Common constants ***************************************/ -#define ZSTD_OPT_DEBUG 0 // 3 = compression stats; 5 = check encoded sequences; 9 = full logs +#define ZSTD_OPT_DEBUG 0 /* 3 = compression stats; 5 = check encoded sequences; 9 = full logs */ #include #if defined(ZSTD_OPT_DEBUG) && ZSTD_OPT_DEBUG>=9 #define ZSTD_LOG_PARSER(...) 
printf(__VA_ARGS__) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 91c8d5e5..07d88022 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -427,21 +427,8 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue) */ -/* Frame descriptor +/* Frame header : - // old - 1 byte - Alloc : - bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN (see zstd_internal.h) - bit 4 : reserved for windowLog (must be zero) - bit 5 : reserved (must be zero) - bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes - - 1 byte - checker : - bit 0-1 : dictID (0, 1, 2 or 4 bytes) - bit 2-7 : reserved (must be zero) - - - // new 1 byte - FrameHeaderDescription : bit 0-1 : dictID (0, 1, 2 or 4 bytes) bit 2-4 : reserved (must be zero) @@ -453,24 +440,24 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue) bit 0-2 : octal Fractional (1/8th) bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB) + Optional : content size (0, 1, 2, 4 or 8 bytes) + 0 : unknown + 1 : 0-255 bytes + 2 : 256 - 65535+256 + 8 : up to 16 exa + Optional : dictID (0, 1, 2 or 4 bytes) Automatic adaptation 0 : no dictID 1 : 1 - 255 2 : 256 - 65535 4 : all other values - - Optional : content size (0, 1, 2, 4 or 8 bytes) - 0 : unknown - 1 : 0-255 bytes - 2 : 256 - 65535+256 - 8 : up to 16 exa */ /* Block format description - Block = Literal Section - Sequences Section + Block = Literals Section - Sequences Section Prerequisite : size of (compressed) block, maximum size of regenerated data 1) Literal Section @@ -478,7 +465,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue) 1.1) Header : 1-5 bytes flags: 2 bits 00 compressed by Huff0 - 01 unused + 01 repeat 10 is Raw (uncompressed) 11 is Rle Note : using 01 => Huff0 with precomputed table ? 
@@ -514,7 +501,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue) else => 5 bytes (2-2-18-18) big endian convention - 1- CTable available (stored into workspace ?) + 1- CTable available (stored into workspace) 2- Small input (fast heuristic ? Full comparison ? depend on clevel ?) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 84f64dc8..001a19ae 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -207,20 +207,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) */ -/* Frame descriptor +/* Frame Header : - // old - 1 byte - Alloc : - bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN (see zstd_internal.h) - bit 4 : reserved for windowLog (must be zero) - bit 5 : reserved (must be zero) - bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes - - 1 byte - checker : - bit 0-1 : dictID (0, 1, 2 or 4 bytes) - bit 2-7 : reserved (must be zero) - - // new 1 byte - FrameHeaderDescription : bit 0-1 : dictID (0, 1, 2 or 4 bytes) bit 2 : checksumFlag @@ -454,16 +442,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ { const BYTE* const istart = (const BYTE*) src; - litBlockType_t lbt; if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected); - lbt = (litBlockType_t)(istart[0]>> 6); - switch(lbt) + switch((litBlockType_t)(istart[0]>> 6)) { case lbt_huffman: { size_t litSize, litCSize, singleStream=0; - U32 lhSize = ((istart[0]) >> 4) & 3; + U32 lhSize = (istart[0] >> 4) & 3; if (srcSize < 5) return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */ switch(lhSize) { diff --git a/zstd_frame_format.md b/zstd_compression_format.md similarity index 52% rename from zstd_frame_format.md rename to zstd_compression_format.md index 61b5aef5..b203dd10 100644 --- a/zstd_frame_format.md +++ b/zstd_compression_format.md @@ -1,5 
+1,5 @@ -Zstandard Frame Format Description -================================== +Zstandard Compression Format Description +======================================== ### Notices @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.1.0 (30/06/2016) +0.1.0 (30/06/2016 - unfinished) Introduction @@ -53,17 +53,32 @@ A compliant decompressor must be able to decompress at least one working set of parameters that conforms to the specifications presented here. It may also ignore informative fields, such as checksum. -Whenever it does not support a specific parameter within the compressed stream, -it must produce a non-ambiguous error code -and associated error message explaining which parameter is unsupported. +Whenever it does not support a parameter defined in the compressed stream, +it must produce a non-ambiguous error code and associated error message +explaining which parameter is unsupported. + + +Definitions +----------- +A content compressed by Zstandard is transformed into a Zstandard __frame__. +Multiple frames can be appended into a single file or stream. +A frame is totally independent, has a defined beginning and end, +and a set of parameters which tells the decoder how to decompress it. + +A frame encapsulates one or multiple __blocks__. +Each block can be compressed or not, +and has a guaranteed maximum content size, which depends on frame parameters. +Unlike frames, each block depends on previous blocks for proper decoding. +However, each block can be decompressed without waiting for its successor, +allowing streaming operations. General Structure of Zstandard Frame format ------------------------------------------- -| MagicNb | F. Header | Block | (...) | EndMark | -|:-------:|:----------:| ----- | ----- | ------- | -| 4 bytes | 2-14 bytes | | | 3 bytes | +| MagicNb | F. 
Header | Block | (More blocks) | EndMark | +|:-------:|:----------:| ----- | ------------- | ------- | +| 4 bytes | 2-14 bytes | | | 3 bytes | __Magic Number__ @@ -73,7 +88,6 @@ Value : 0xFD2FB527 __Frame Header__ 2 to 14 Bytes, to be detailed in the next part. -Most important part of the spec. __Data Blocks__ @@ -87,11 +101,11 @@ This last block header may optionally host a __Content Checksum__ . __Content Checksum__ -Content Checksum verify that the full content has been decoded correctly. +Content Checksum verifies that frame content has been regenerated correctly. The content checksum is the result of [xxh64() hash function](https://www.xxHash.com) digesting the original (decoded) data as input, and a seed of zero. -Bits from 11 to 32 (included) are extracted to form the 22 bits checksum +Bits from 11 to 32 (included) are extracted to form a 22 bits checksum stored into the last block header. ``` contentChecksum = (XXH64(content, size, 0) >> 11) & (1<<22)-1); @@ -114,52 +128,64 @@ The ability to decode multiple concatenated frames within a single stream or file is left outside of this specification. As an example, the reference `zstd` command line utility is able to decode all concatenated frames in their sequential order, -presenting the final decompressed result as if it was a single frame. +delivering the final decompressed result as if it was a single content. Frame Header ----------------- +------------- | FHD | (WD) | (Content Size) | (dictID) | | ------- | --------- |:--------------:| --------- | | 1 byte | 0-1 byte | 0 - 8 bytes | 0-4 bytes | -Frame header uses a minimum of 2 bytes, +Frame header has a variable size, which uses a minimum of 2 bytes, and up to 14 bytes depending on optional parameters. __FHD byte__ (Frame Header Descriptor) +The first Header's byte is called the Frame Header Descriptor. +It tells which other fields are present. +Decoding this byte is enough to get the full size of the Frame Header. 
+ | BitNb | 7-6 | 5 | 4 | 3 | 2 | 1-0 | | ------- | ------ | ------- | ------ | -------- | -------- | -------- | |FieldName| FCSize | Segment | Unused | Reserved | Checksum | dictID | -In the table, bit 7 is highest bit, while bit 0 is lowest. +In this table, bit 7 is highest bit, while bit 0 is lowest. __Frame Content Size flag__ This is a 2-bits flag (`= FHD >> 6`), -telling if original data size is provided within the header +specifying if decompressed data size is provided within the header. | Value | 0 | 1 | 2 | 3 | | ------- | --- | --- | --- | --- | |FieldSize| 0-1 | 2 | 4 | 8 | -Value 0 is special : it means `0` (data size not provided) -_if_ the `WD` byte is present. -Otherwise, it means `1` byte (data size <= 255 bytes). +Value 0 has a double meaning : +it either means `0` (size not provided) _if_ the `WD` byte is present, +or it means `1` byte (size <= 255 bytes). __Single Segment__ If this flag is set, data shall be regenerated within a single continuous memory segment. -In which case, `WD` byte is not present, +In which case, `WD` byte __is not present__, but `Frame Content Size` field necessarily is. -The size of the memory segment must be at least `>= Frame Content Size`. -In order to preserve decoder from unreasonable memory requirement, +As a consequence, the decoder must allocate a memory segment +of size `>= Frame Content Size`. + +In order to preserve the decoder from unreasonable memory requirement, a decoder can refuse a compressed frame which requests a memory size beyond decoder's authorized range. +For broader compatibility, decoders are recommended to support +memory sizes of 8 MB at least. +However, this is merely a recommendation, +and each decoder is free to support higher or lower limits, +depending on local limitations. + __Unused bit__ The value of this bit is unimportant @@ -170,7 +196,7 @@ to signal a property which is not required to properly decode the frame. __Reserved bit__ This bit is reserved for some future feature. 
-Its value must be zero. +Its value _must be zero_. A decoder compliant with this specification version must ensure it is not set. This bit may be used in a future revision, to signal a feature that must be interpreted in order to decode the frame. @@ -193,7 +219,7 @@ telling if a dictionary ID is provided within the header __WD byte__ (Window Descriptor) Provides guarantees on maximum back-reference distance -that will be used within compressed data. +that will be present within compressed data. This information is useful for decoders to allocate enough memory. | BitNb | 7-3 | 0-2 | @@ -208,16 +234,25 @@ windowAdd = (windowBase / 8) * Mantissa; windowSize = windowBase + windowAdd; ``` The minimum window size is 1 KB. -The maximum value is (15*(2^38))-1 bytes, which is almost 1.875 TB. +The maximum size is (15*(2^38))-1 bytes, which is almost 1.875 TB. -`WD` byte is optional. It's not present in `single segment` mode. -In which case, the maximum back-reference distance is the content size itself, which can be any value from 1 to 2^64-1 bytes (16 EB). +To properly decode compressed data, +a decoder will need to allocate a buffer of at least `windowSize` bytes. + +Note that `WD` byte is optional. It's not present in `single segment` mode. +In which case, the maximum back-reference distance is the content size itself, +which can be any value from 1 to 2^64-1 bytes (16 EB). In order to preserve decoder from unreasonable memory requirements, a decoder can refuse a compressed frame which requests a memory size beyond decoder's authorized range. -For better interoperability, decoders are recommended to be compatible with window sizes up to 8 MB. Encoders are recommended to not request more than 8 MB. It's just a recommendation, decoders are free to accept or refuse larger or lower values. +For better interoperability, +decoders are recommended to be compatible with window sizes of 8 MB. +Encoders are recommended to not request more than 8 MB. 
+It's merely a recommendation though, +decoders are free to support larger or lower limits, +depending on local limitations. __Frame Content Size__ @@ -235,11 +270,11 @@ Format is Little endian. | 8 | 0 - 2^64-1 | When field size is 1, 4 or 8 bytes, the value is read directly. -When field size is 2, an offset of 256 is added. -It's possible to represent a small size of `18` using the 8-bytes variant. -A size of `0` means `data size is unknown`. -In which case, the `WD` byte will be the only hint -to determine memory allocation. +When field size is 2, _an offset of 256 is added_. +It's allowed to represent a small size (ex: `18`) using the 8-bytes variant. +A size of `0` means `content size is unknown`. +In which case, the `WD` byte will necessarily be present, +and becomes the only hint to determine memory allocation. In order to preserve decoder from unreasonable memory requirement, a decoder can refuse a compressed frame @@ -257,7 +292,8 @@ Field size depends on __Dictionary ID flag__. 2 bytes can represent an ID 0-65535. 4 bytes can represent an ID 0-(2^32-1). -It's possible to represent a small ID (for example `13`) with a large 4-bytes dictionary ID, losing some efficiency in the process. +It's allowed to represent a small ID (for example `13`) +with a large 4-bytes dictionary ID, losing some efficiency in the process. Data Blocks @@ -270,10 +306,10 @@ Data Blocks __Block Header__ -This field uses 3-bytes, format is big-endian. +This field uses 3-bytes, format is __big-endian__. The 2 highest bits represent the `block type`, -while the remaining 22 bits represent the block size. +while the remaining 22 bits represent the (compressed) block size. There are 4 block types : @@ -281,24 +317,24 @@ There are 4 block types : | ---------- | ---------- | --- | --- | ------- | | Block Type | Compressed | Raw | RLE | EndMark | -- Compressed : this is a compressed block, - following Zstandard's block format specification. - The "block size" is the compressed size. 
+- Compressed : this is a Zstandard compressed block, + detailed in a later part of this specification. + "block size" is the compressed size. Decompressed size is unknown, - but its maximum possible value is guaranteed (see later) + but its maximum possible value is guaranteed (see below) - Raw : this is an uncompressed block. "block size" is the number of bytes to read and copy. - RLE : this is a single byte, repeated N times. - In which case, the size of the "compressed" block is always 1, - and the "block size" is the size to regenerate. + In which case, "block size" is the size to regenerate, + while the "compressed" block is just 1 byte (the byte to repeat). - EndMark : this is not a block. Signal the end of the frame. The rest of the field may be optionally filled by a checksum (see frame checksum). -Block Size shall never be larger than Block Maximum Size. -Block Maximum Size is the smallest of : -- Max back-reference distance -- 128 KB +Block sizes must respect a few rules : +- In compressed mode, compressed size is always strictly `< contentSize`. +- Block decompressed size is necessarily <= maximum back-reference distance . +- Block decompressed size is necessarily <= 128 KB __Data__ @@ -306,8 +342,8 @@ __Data__ Where the actual data to decode stands. It might be compressed or not, depending on previous field indications. A data block is not necessarily "full" : -an arbitrary “flush” may happen anytime. Any block can be “partially filled”. -Therefore, data can have any size, up to Block Maximum Size. +since an arbitrary “flush” may happen anytime, +block content can be any size, up to Block Maximum Size. Block Maximum Size is the smallest of : - Max back-reference distance - 128 KB @@ -329,25 +365,122 @@ over user-defined data and continue decoding. Skippable frames defined in this specification are compatible with LZ4 ones. -__Magic Number__ +__Magic Number__ : 4 Bytes, Little endian format. 
Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F. All 16 values are valid to identify a skippable frame. -__Frame Size__ +__Frame Size__ : This is the size, in bytes, of the following User Data (without including the magic number nor the size field itself). 4 Bytes, Little endian format, unsigned 32-bits. This means User Data can’t be bigger than (2^32-1) Bytes. -__User Data__ +__User Data__ : User Data can be anything. Data will just be skipped by the decoder. +Compressed block format +----------------------- +This specification details the content of a _compressed block_. +A compressed block has a size, which must be known in order to decode it. +It also has a guaranteed maximum regenerated size, +in order to properly allocate destination buffer. +See "Frame format" for more details. + +A compressed block consists of 2 sections : +- Literals section +- Sequences section + +### Compressed Literals + +Literals are compressed using order-0 huffman compression. +During sequence phase, literals will be entangled with match copy operations. +All literals are regrouped in the first part of the block. +They can be decoded first, and then copied during sequence operations, +or they can be decoded on the flow, as needed by sequences. + +| Header | (Tree Description) | Stream1 | (Stream2) | (Stream3) | (Stream4) | +| ------ | ------------------ | ------- | --------- | --------- | --------- | + +Literals can be compressed, or uncompressed. +When compressed, an optional tree description can be present, +followed by 1 or 4 streams. + +#### Block Literal Header + +Header is in charge of describing precisely how literals are packed. +It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes, +using big-endian convention. 
+ +| BlockType | sizes format | (compressed size) | regenerated size | +| --------- | ------------ | ----------------- | ---------------- | +| 2 bits | 1 - 2 bits | 0 - 18 bits | 5 - 20 bits | + +__Block Type__ : + +This is a 2-bits field, describing 4 different block types : + +| Value | 0 | 1 | 2 | 3 | +| ---------- | ---------- | ------ | --- | ------- | +| Block Type | Compressed | Repeat | Raw | RLE | + +- Compressed : This is a standard huffman-compressed block, + starting with a huffman tree description. + See details below. +- Repeat Stats : This is a huffman-compressed block, + using huffman tree from previous huffman-compressed block. + Huffman tree description will be skipped. + Compressed stream is equivalent to "compressed" block type. +- Raw : Literals are stored uncompressed. +- RLE : Literals consist of a single byte value repeated N times. + +__Sizes format__ : + +Sizes format are divided into 2 families : + +- For compressed block, it requires to decode both the compressed size + and the decompressed size. It will also decode the number of streams. +- For Raw or RLE blocks, it's enough to decode the size to regenerate. + +For values spanning several bytes, convention is Big-endian. + +__Sizes format for Raw or RLE block__ : + +- Value : 0x : Regenerated size uses 5 bits (0-31). + Total literal header size is 1 byte. + `size = h[0] & 31;` +- Value : 10 : Regenerated size uses 12 bits (0-4095). + Total literal header size is 2 bytes. + `size = ((h[0] & 15) << 8) + h[1];` +- Value : 11 : Regenerated size uses 20 bits (0-1048575). + Total literal header size is 3 bytes. + `size = ((h[0] & 15) << 16) + (h[1]<<8) + h[2];` + +Note : it's allowed to represent a short value (ex : `13`) +using a long format, accepting the reduced compactness. + +__Sizes format for Compressed Block__ : + +Note : also applicable to "repeat-stats" blocks. 
+- Value : 00 : 4 streams + Compressed and regenerated sizes use 10 bits (0-1023) + Total literal header size is 3 bytes +- Value : 01 : _Single stream_ + Compressed and regenerated sizes use 10 bits (0-1023) + Total literal header size is 3 bytes +- Value : 10 : 4 streams + Compressed and regenerated sizes use 14 bits (0-16383) + Total literal header size is 4 bytes +- Value : 11 : 4 streams + Compressed and regenerated sizes use 18 bits (0-262143) + Total literal header size is 5 bytes + + + Version changes --------------- - 0.1 : initial release