From 2fa99048444ac00e9f0ca20c0e28ec833603363f Mon Sep 17 00:00:00 2001
From: Yann Collet <yann.collet.73@gmail.com>
Date: Fri, 1 Jul 2016 20:55:28 +0200
Subject: [PATCH] update specification and comments

---
 lib/common/zstd_internal.h                    |   2 +-
 lib/compress/zstd_compress.c                  |  33 +--
 lib/decompress/zstd_decompress.c              |  20 +-
 ...me_format.md => zstd_compression_format.md | 239 ++++++++++++++----
 4 files changed, 200 insertions(+), 94 deletions(-)
 rename zstd_frame_format.md => zstd_compression_format.md (52%)

diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 7989e6ac..43cbc9a3 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -51,7 +51,7 @@
 /*-*************************************
 *  Common constants
 ***************************************/
-#define ZSTD_OPT_DEBUG 0     // 3 = compression stats;  5 = check encoded sequences;  9 = full logs
+#define ZSTD_OPT_DEBUG 0     /* 3 = compression stats;  5 = check encoded sequences;  9 = full logs */
 #include <stdio.h>
 #if defined(ZSTD_OPT_DEBUG) && ZSTD_OPT_DEBUG>=9
     #define ZSTD_LOG_PARSER(...) printf(__VA_ARGS__)
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 91c8d5e5..07d88022 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -427,21 +427,8 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
 */
 
 
-/* Frame descriptor
+/* Frame header :
 
-    // old
-   1 byte - Alloc :
-   bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
-   bit 4   : reserved for windowLog (must be zero)
-   bit 5   : reserved (must be zero)
-   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
-
-   1 byte - checker :
-   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
-   bit 2-7 : reserved (must be zero)
-
-
-    // new
    1 byte - FrameHeaderDescription :
    bit 0-1 : dictID (0, 1, 2 or 4 bytes)
    bit 2-4 : reserved (must be zero)
@@ -453,24 +440,24 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
    bit 0-2 : octal Fractional (1/8th)
    bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB)
 
+   Optional : content size (0, 1, 2, 4 or 8 bytes)
+   0 : unknown
+   1 : 0-255 bytes
+   2 : 256 - 65535+256
+   8 : up to 16 exa
+
    Optional : dictID (0, 1, 2 or 4 bytes)
    Automatic adaptation
    0 : no dictID
    1 : 1 - 255
    2 : 256 - 65535
    4 : all other values
-
-   Optional : content size (0, 1, 2, 4 or 8 bytes)
-   0 : unknown
-   1 : 0-255 bytes
-   2 : 256 - 65535+256
-   8 : up to 16 exa
 */
 
 
 /* Block format description
 
-   Block = Literal Section - Sequences Section
+   Block = Literals Section - Sequences Section
    Prerequisite : size of (compressed) block, maximum size of regenerated data
 
    1) Literal Section
@@ -478,7 +465,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
    1.1) Header : 1-5 bytes
         flags: 2 bits
             00 compressed by Huff0
-            01 unused
+            01 repeat
             10 is Raw (uncompressed)
             11 is Rle
             Note : using 01 => Huff0 with precomputed table ?
@@ -514,7 +501,7 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
             else           => 5 bytes (2-2-18-18)
             big endian convention
 
-        1- CTable available (stored into workspace ?)
+        1- CTable available (stored into workspace)
         2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
 
 
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 84f64dc8..001a19ae 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -207,20 +207,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
 */
 
 
-/* Frame descriptor
+/* Frame Header :
 
-    // old
-   1 byte - Alloc :
-   bit 0-3 : windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
-   bit 4   : reserved for windowLog (must be zero)
-   bit 5   : reserved (must be zero)
-   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
-
-   1 byte - checker :
-   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
-   bit 2-7 : reserved (must be zero)
-
-    // new
    1 byte - FrameHeaderDescription :
    bit 0-1 : dictID (0, 1, 2 or 4 bytes)
    bit 2   : checksumFlag
@@ -454,16 +442,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
     const BYTE* const istart = (const BYTE*) src;
-    litBlockType_t lbt;
 
     if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
-    lbt = (litBlockType_t)(istart[0]>> 6);
 
-    switch(lbt)
+    switch((litBlockType_t)(istart[0]>> 6))
     {
     case lbt_huffman:
         {   size_t litSize, litCSize, singleStream=0;
-            U32 lhSize = ((istart[0]) >> 4) & 3;
+            U32 lhSize = (istart[0] >> 4) & 3;
             if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
             switch(lhSize)
             {
diff --git a/zstd_frame_format.md b/zstd_compression_format.md
similarity index 52%
rename from zstd_frame_format.md
rename to zstd_compression_format.md
index 61b5aef5..b203dd10 100644
--- a/zstd_frame_format.md
+++ b/zstd_compression_format.md
@@ -1,5 +1,5 @@
-Zstandard Frame Format Description
-==================================
+Zstandard Compression Format Description
+========================================
 
 ### Notices
 
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
 
 ### Version
 
-0.1.0 (30/06/2016)
+0.1.0 (30/06/2016 - unfinished)
 
 
 Introduction
@@ -53,17 +53,32 @@ A compliant decompressor must be able to decompress
 at least one working set of parameters
 that conforms to the specifications presented here.
 It may also ignore informative fields, such as checksum.
-Whenever it does not support a specific parameter within the compressed stream,
-it must produce a non-ambiguous error code
-and associated error message explaining which parameter is unsupported.
+Whenever it does not support a parameter defined in the compressed stream,
+it must produce a non-ambiguous error code and associated error message
+explaining which parameter is unsupported.
+
+
+Definitions
+-----------
+A content compressed by Zstandard is transformed into a Zstandard __frame__.
+Multiple frames can be appended into a single file or stream.
+A frame is totally independent, has a defined beginning and end,
+and a set of parameters which tells the decoder how to decompress it.
+
+A frame encapsulates one or multiple __blocks__.
+Each block can be compressed or not,
+and has a guaranteed maximum content size, which depends on frame parameters.
+Unlike frames, each block depends on previous blocks for proper decoding.
+However, each block can be decompressed without waiting for its successor,
+allowing streaming operations.
 
 
 General Structure of Zstandard Frame format
 -------------------------------------------
 
-| MagicNb |  F. Header | Block | (...) | EndMark |
-|:-------:|:----------:| ----- | ----- | ------- |
-| 4 bytes | 2-14 bytes |       |       | 3 bytes |
+| MagicNb |  F. Header | Block | (More blocks) | EndMark |
+|:-------:|:----------:| ----- | ------------- | ------- |
+| 4 bytes | 2-14 bytes |       |               | 3 bytes |
 
 __Magic Number__
 
@@ -73,7 +88,6 @@ Value : 0xFD2FB527
 __Frame Header__
 
 2 to 14 Bytes, to be detailed in the next part.
-Most important part of the spec.
 
 __Data Blocks__
 
@@ -87,11 +101,11 @@ This last block header may optionally host a __Content Checksum__ .
 
 __Content Checksum__
 
-Content Checksum verify that the full content has been decoded correctly.
+Content Checksum verifies that frame content has been regenerated correctly.
 The content checksum is the result
 of [xxh64() hash function](https://www.xxHash.com)
 digesting the original (decoded) data as input, and a seed of zero.
-Bits from 11 to 32 (included) are extracted to form the 22 bits checksum
+Bits from 11 to 32 (included) are extracted to form a 22 bits checksum
 stored into the last block header.
 ```
 contentChecksum = (XXH64(content, size, 0) >> 11) & (1<<22)-1);
@@ -114,52 +128,64 @@ The ability to decode multiple concatenated frames
 within a single stream or file is left outside of this specification.
 As an example, the reference `zstd` command line utility is able
 to decode all concatenated frames in their sequential order,
-presenting the final decompressed result as if it was a single frame.
+delivering the final decompressed result as if it was a single content.
 
 
 Frame Header
-----------------
+-------------
 
 | FHD     | (WD)      | (Content Size) | (dictID)  |
 | ------- | --------- |:--------------:| --------- |
 | 1 byte  | 0-1 byte  |  0 - 8 bytes   | 0-4 bytes |
 
-Frame header uses a minimum of 2 bytes,
+Frame header has a variable size, which uses a minimum of 2 bytes,
 and up to 14 bytes depending on optional parameters.
 
 __FHD byte__ (Frame Header Descriptor)
 
+The first byte of the header is called the Frame Header Descriptor.
+It tells which other fields are present.
+Decoding this byte is enough to get the full size of the Frame Header.
+
 |  BitNb  |   7-6  |    5    |   4    |    3     |    2     |    1-0   |
 | ------- | ------ | ------- | ------ | -------- | -------- | -------- |
 |FieldName| FCSize | Segment | Unused | Reserved | Checksum |  dictID  |
 
-In the table, bit 7 is highest bit, while bit 0 is lowest.
+In this table, bit 7 is highest bit, while bit 0 is lowest.
 
 __Frame Content Size flag__
 
 This is a 2-bits flag (`= FHD >> 6`),
-telling if original data size is provided within the header
+specifying if decompressed data size is provided within the header.
 
 |  Value  |  0  |  1  |  2  |  3  |
 | ------- | --- | --- | --- | --- |
 |FieldSize| 0-1 |  2  |  4  |  8  |
 
-Value 0 is special : it means `0` (data size not provided)
-_if_ the `WD` byte is present.
-Otherwise, it means `1` byte (data size <= 255 bytes).
+Value 0 has a double meaning :
+it either means `0` (size not provided) _if_ the `WD` byte is present,
+or it means `1` byte (size <= 255 bytes).
 
 __Single Segment__
 
 If this flag is set,
 data shall be regenerated within a single continuous memory segment.
-In which case, `WD` byte is not present,
+In which case, `WD` byte __is not present__,
 but `Frame Content Size` field necessarily is.
-The size of the memory segment must be at least `>= Frame Content Size`.
 
-In order to preserve decoder from unreasonable memory requirement,
+As a consequence, the decoder must allocate a memory segment
+of size `>= Frame Content Size`.
+
+In order to preserve the decoder from unreasonable memory requirement,
 a decoder can refuse a compressed frame
 which requests a memory size beyond decoder's authorized range.
 
+For broader compatibility, decoders are recommended to support
+memory sizes of 8 MB at least.
+However, this is merely a recommendation,
+and each decoder is free to support higher or lower limits,
+depending on local limitations.
+
 __Unused bit__
 
 The value of this bit is unimportant
@@ -170,7 +196,7 @@ to signal a property which is not required to properly decode the frame.
 __Reserved bit__
 
 This bit is reserved for some future feature.
-Its value must be zero.
+Its value _must be zero_.
 A decoder compliant with this specification version must ensure it is not set.
 This bit may be used in a future revision,
 to signal a feature that must be interpreted in order to decode the frame.
@@ -193,7 +219,7 @@ telling if a dictionary ID is provided within the header
 __WD byte__ (Window Descriptor)
 
 Provides guarantees on maximum back-reference distance
-that will be used within compressed data.
+that will be present within compressed data.
 This information is useful for decoders to allocate enough memory.
 
 |   BitNb   |    7-3   |    0-2   |
@@ -208,16 +234,25 @@ windowAdd = (windowBase / 8) * Mantissa;
 windowSize = windowBase + windowAdd;
 ```
 The minimum window size is 1 KB.
-The maximum value is (15*(2^38))-1 bytes, which is almost 1.875 TB.
+The maximum size is (15*(2^38))-1 bytes, which is almost 1.875 TB.
 
-`WD` byte is optional. It's not present in `single segment` mode.
-In which case, the maximum back-reference distance is the content size itself, which can be any value from 1 to 2^64-1 bytes (16 EB).
+To properly decode compressed data,
+a decoder will need to allocate a buffer of at least `windowSize` bytes.
+
+Note that `WD` byte is optional. It's not present in `single segment` mode.
+In which case, the maximum back-reference distance is the content size itself,
+which can be any value from 1 to 2^64-1 bytes (16 EB).
 
 In order to preserve decoder from unreasonable memory requirements,
 a decoder can refuse a compressed frame
 which requests a memory size beyond decoder's authorized range.
 
-For better interoperability, decoders are recommended to be compatible with window sizes up to 8 MB. Encoders are recommended to not request more than 8 MB. It's just a recommendation, decoders are free to accept or refuse larger or lower values.
+For better interoperability,
+decoders are recommended to be compatible with window sizes of 8 MB.
+Encoders are recommended to not request more than 8 MB.
+It's merely a recommendation though,
+decoders are free to support larger or lower limits,
+depending on local limitations.
 
 __Frame Content Size__
 
@@ -235,11 +270,11 @@ Format is Little endian.
 |     8      | 0 - 2^64-1 |
 
 When field size is 1, 4 or 8 bytes, the value is read directly.
-When field size is 2, an offset of 256 is added.
-It's possible to represent a small size of `18` using the 8-bytes variant.
-A size of `0` means `data size is unknown`.
-In which case, the `WD` byte will be the only hint
-to determine memory allocation.
+When field size is 2, _an offset of 256 is added_.
+It's allowed to represent a small size (ex: `18`) using the 8-bytes variant.
+A size of `0` means `content size is unknown`.
+In which case, the `WD` byte will necessarily be present,
+and becomes the only hint to determine memory allocation.
 
 In order to preserve decoder from unreasonable memory requirement,
 a decoder can refuse a compressed frame
@@ -257,7 +292,8 @@ Field size depends on __Dictionary ID flag__.
 2 bytes can represent an ID 0-65535.
 4 bytes can represent an ID 0-(2^32-1).
 
-It's possible to represent a small ID (for example `13`) with a large 4-bytes dictionary ID, losing some efficiency in the process.
+It's allowed to represent a small ID (for example `13`)
+with a large 4-bytes dictionary ID, losing some efficiency in the process.
 
 
 Data Blocks
@@ -270,10 +306,10 @@ Data Blocks
 
 __Block Header__
 
-This field uses 3-bytes, format is big-endian.
+This field uses 3-bytes, format is __big-endian__.
 
 The 2 highest bits represent the `block type`,
-while the remaining 22 bits represent the block size.
+while the remaining 22 bits represent the (compressed) block size.
 
 There are 4 block types :
 
@@ -281,24 +317,24 @@ There are 4 block types :
 | ---------- | ---------- | --- | --- | ------- |
 | Block Type | Compressed | Raw | RLE | EndMark |
 
-- Compressed : this is a compressed block,
-  following Zstandard's block format specification.
-  The "block size" is the compressed size.
+- Compressed : this is a Zstandard compressed block,
+  detailed in a later part of this specification.
+  "block size" is the compressed size.
   Decompressed size is unknown,
-  but its maximum possible value is guaranteed (see later)
+  but its maximum possible value is guaranteed (see below)
 - Raw : this is an uncompressed block.
   "block size" is the number of bytes to read and copy.
 - RLE : this is a single byte, repeated N times.
-  In which case, the size of the "compressed" block is always 1,
-  and the "block size" is the size to regenerate.
+  In which case, "block size" is the size to regenerate,
+  while the "compressed" block is just 1 byte (the byte to repeat).
 - EndMark : this is not a block. Signal the end of the frame.
   The rest of the field may be optionally filled by a checksum
   (see frame checksum).
 
-Block Size shall never be larger than Block Maximum Size.
-Block Maximum Size is the smallest of :
-- Max back-reference distance
-- 128 KB
+Block sizes must respect a few rules :
+- In compressed mode, compressed size is always strictly `< contentSize`.
+- Block decompressed size is necessarily <= maximum back-reference distance.
+- Block decompressed size is necessarily <= 128 KB
 
 
 __Data__
@@ -306,8 +342,8 @@ __Data__
 Where the actual data to decode stands.
 It might be compressed or not, depending on previous field indications.
 A data block is not necessarily "full" :
-an arbitrary “flush” may happen anytime. Any block can be “partially filled”.
-Therefore, data can have any size, up to Block Maximum Size.
+since an arbitrary “flush” may happen anytime,
+block content can be any size, up to Block Maximum Size.
 Block Maximum Size is the smallest of :
 - Max back-reference distance
 - 128 KB
@@ -329,25 +365,122 @@ over user-defined data and continue decoding.
 Skippable frames defined in this specification are compatible with LZ4 ones.
 
 
-__Magic Number__
+__Magic Number__ :
 
 4 Bytes, Little endian format.
 Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F.
 All 16 values are valid to identify a skippable frame.
 
-__Frame Size__
+__Frame Size__ :
 
 This is the size, in bytes, of the following User Data
 (without including the magic number nor the size field itself).
 4 Bytes, Little endian format, unsigned 32-bits.
 This means User Data can’t be bigger than (2^32-1) Bytes.
 
-__User Data__
+__User Data__ :
 
 User Data can be anything. Data will just be skipped by the decoder.
 
 
+Compressed block format
+-----------------------
+This specification details the content of a _compressed block_.
+A compressed block has a size, which must be known in order to decode it.
+It also has a guaranteed maximum regenerated size,
+in order to properly allocate destination buffer.
+See "Frame format" for more details.
+
+A compressed block consists of 2 sections :
+- Literals section
+- Sequences section
+
+### Compressed Literals
+
+Literals are compressed using order-0 huffman compression.
+During sequence phase, literals will be entangled with match copy operations.
+All literals are regrouped in the first part of the block.
+They can be decoded first, and then copied during sequence operations,
+or they can be decoded on the fly, as needed by sequences.
+
+| Header | (Tree Description) | Stream1 | (Stream2) | (Stream3) | (Stream4) |
+| ------ | ------------------ | ------- | --------- | --------- | --------- |
+
+Literals can be compressed, or uncompressed.
+When compressed, an optional tree description can be present,
+followed by 1 or 4 streams.
+
+#### Block Literal Header
+
+Header is in charge of describing precisely how literals are packed.
+It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes,
+using big-endian convention.
+
+| BlockType | sizes format | (compressed size) | regenerated size |
+| --------- | ------------ | ----------------- | ---------------- |
+|   2 bits  |  1 - 2 bits  |    0 - 18 bits    |    5 - 20 bits   |
+
+__Block Type__ :
+
+This is a 2-bits field, describing 4 different block types :
+
+|    Value   |      0     |    1   |  2  |    3    |
+| ---------- | ---------- | ------ | --- | ------- |
+| Block Type | Compressed | Repeat | Raw |   RLE   |
+
+- Compressed : This is a standard huffman-compressed block,
+               starting with a huffman tree description.
+               See details below.
+- Repeat Stats : This is a huffman-compressed block,
+               using huffman tree from previous huffman-compressed block.
+               Huffman tree description will be skipped.
+               Compressed stream is equivalent to "compressed" block type.
+- Raw : Literals are stored uncompressed.
+- RLE : Literals consist of a single byte value repeated N times.
+
+__Sizes format__ :
+
+Sizes formats are divided into 2 families :
+
+- For compressed blocks, the decoder must read both the compressed size
+  and the decompressed size. It also decodes the number of streams.
+- For Raw or RLE blocks, it's enough to decode the size to regenerate.
+
+For values spanning several bytes, convention is Big-endian.
+
+__Sizes format for Raw or RLE block__ :
+
+- Value : 0x : Regenerated size uses 5 bits (0-31).
+               Total literal header size is 1 byte.
+               `size = h[0] & 31;`
+- Value : 10 : Regenerated size uses 12 bits (0-4095).
+               Total literal header size is 2 bytes.
+               `size = ((h[0] & 15) << 8) + h[1];`
+- Value : 11 : Regenerated size uses 20 bits (0-1048575).
+               Total literal header size is 3 bytes.
+               `size = ((h[0] & 15) << 16) + (h[1]<<8) + h[2];`
+
+Note : it's allowed to represent a short value (ex : `13`)
+using a long format, accepting the reduced compactness.
+
+__Sizes format for Compressed Block__ :
+
+Note : also applicable to "repeat-stats" blocks.
+- Value : 00 : 4 streams
+               Compressed and regenerated sizes use 10 bits (0-1023)
+               Total literal header size is 3 bytes
+- Value : 01 : _Single stream_
+               Compressed and regenerated sizes use 10 bits (0-1023)
+               Total literal header size is 3 bytes
+- Value : 10 : 4 streams
+               Compressed and regenerated sizes use 14 bits (0-16383)
+               Total literal header size is 4 bytes
+- Value : 11 : 4 streams
+               Compressed and regenerated sizes use 18 bits (0-262143)
+               Total literal header size is 5 bytes
+
+
+
 Version changes
 ---------------
-
 0.1 : initial release