From e8c6bb1e42f52f3f9beed6167530584498813fec Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 26 Jul 2015 00:23:57 +0100 Subject: [PATCH 01/21] Integrated huff0 (breaking format change) --- lib/fse.c | 1193 +++++++++++++++++++++++++++++++++++----------- lib/fse.h | 30 +- lib/fse_static.h | 32 +- lib/zstd.c | 76 ++- programs/bench.c | 17 +- 5 files changed, 1043 insertions(+), 305 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index b0318f15..5390e9cd 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -52,11 +52,22 @@ /**************************************************************** -* Generic function type & suffix (C template emulation) +* template functions type & suffix ****************************************************************/ #define FSE_FUNCTION_TYPE BYTE #define FSE_FUNCTION_EXTENSION + +/**************************************************************** +* Byte symbol type +****************************************************************/ +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + #endif /* !FSE_COMMONDEFS_ONLY */ @@ -116,6 +127,11 @@ typedef signed long long S64; /**************************************************************** * Memory I/O *****************************************************************/ +static unsigned FSE_32bits(void) +{ + return sizeof(void*)==4; +} + static unsigned FSE_isLittleEndian(void) { const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ @@ -197,7 +213,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64) static size_t FSE_readLEST(const void* memPtr) { - if (sizeof(size_t)==4) + if (FSE_32bits()) return (size_t)FSE_readLE32(memPtr); else return (size_t)FSE_readLE64(memPtr); @@ -205,7 +221,7 @@ static size_t FSE_readLEST(const void* memPtr) static void FSE_writeLEST(void* memPtr, size_t val) { - if (sizeof(size_t)==4) + if (FSE_32bits()) FSE_writeLE32(memPtr, (U32)val); else 
FSE_writeLE64(memPtr, (U64)val); @@ -273,6 +289,298 @@ FORCE_INLINE unsigned FSE_highbit32 (register U32 val) } +/**************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +# error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +# error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X,Y) X##Y +#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) +#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + + +/* Function templates */ +size_t FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) +(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned safe) +{ + const FSE_FUNCTION_TYPE* ip = source; + const FSE_FUNCTION_TYPE* const iend = ip+sourceSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max=0; + int s; + + U32 Counting1[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; + U32 Counting2[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; + U32 Counting3[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; + U32 Counting4[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; + + /* safety checks */ + if (!sourceSize) + { + memset(count, 0, (maxSymbolValue + 1) * sizeof(FSE_FUNCTION_TYPE)); + *maxSymbolValuePtr = 0; + return 0; + } + if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC; /* maxSymbolValue too large : unsupported */ + if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE; /* 0 == default */ + + if ((safe) || (sizeof(FSE_FUNCTION_TYPE)>1)) + { + /* check input values, to avoid count table overflow */ + while (ip < iend-3) + { + if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++; + if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; 
Counting2[*ip++]++; + if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting3[*ip++]++; + if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting4[*ip++]++; + } + } + else + { + U32 cached = FSE_read32(ip); ip += 4; + while (ip < iend-15) + { + U32 c = cached; cached = FSE_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = FSE_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = FSE_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = FSE_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + } + ip-=4; + } + + /* finish last symbols */ + while (ipmaxSymbolValue)) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++; } + + for (s=0; s<=(int)maxSymbolValue; s++) + { + count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; + if (count[s] > max) max = count[s]; + } + + while (!count[maxSymbolValue]) maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + return (size_t)max; +} + +/* hidden fast variant (unsafe) */ +size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION) +(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize) +{ + return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 0); +} + +size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION) +(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize) +{ + if ((sizeof(FSE_FUNCTION_TYPE)==1) && (*maxSymbolValuePtr >= 255)) + { + *maxSymbolValuePtr = 255; + return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, 
maxSymbolValuePtr, source, sourceSize, 0); + } + return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 1); +} + + +static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; } + +size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION) +(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + const unsigned tableSize = 1 << tableLog; + const unsigned tableMask = tableSize - 1; + U16* tableU16 = ( (U16*) ct) + 2; + FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) (((U32*)ct) + 1 + (tableLog ? tableSize>>1 : 1) ); + const unsigned step = FSE_tableStep(tableSize); + unsigned cumul[FSE_MAX_SYMBOL_VALUE+2]; + U32 position = 0; + FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* init not necessary, but analyzer complain about it */ + U32 highThreshold = tableSize-1; + unsigned symbol; + unsigned i; + + /* header */ + tableU16[-2] = (U16) tableLog; + tableU16[-1] = (U16) maxSymbolValue; + + /* For explanations on how to distribute symbol values over the table : + * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + /* symbol start positions */ + cumul[0] = 0; + for (i=1; i<=maxSymbolValue+1; i++) + { + if (normalizedCounter[i-1]==-1) /* Low prob symbol */ + { + cumul[i] = cumul[i-1] + 1; + tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(i-1); + } + else + cumul[i] = cumul[i-1] + normalizedCounter[i-1]; + } + cumul[maxSymbolValue+1] = tableSize+1; + + /* Spread symbols */ + for (symbol=0; symbol<=maxSymbolValue; symbol++) + { + int nbOccurences; + for (nbOccurences=0; nbOccurences highThreshold) position = (position + step) & tableMask; /* Lowprob area */ + } + } + + if (position!=0) return (size_t)-FSE_ERROR_GENERIC; /* Must have gone through all positions */ + + /* Build table */ + for (i=0; i FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; + return 
(FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +} + +void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (FSE_DTable* dt) +{ + free(dt); +} + + +size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION) +(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + U32* const base32 = (U32*)dt; + FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1); + const U32 tableSize = 1 << tableLog; + const U32 tableMask = tableSize-1; + const U32 step = FSE_tableStep(tableSize); + U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + U32 position = 0; + U32 highThreshold = tableSize-1; + const S16 largeLimit= (S16)(1 << (tableLog-1)); + U32 noLarge = 1; + U32 s; + + /* Sanity Checks */ + if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_maxSymbolValue_tooLarge; + if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge; + + /* Init, lay down lowprob symbols */ + base32[0] = tableLog; + for (s=0; s<=maxSymbolValue; s++) + { + if (normalizedCounter[s]==-1) + { + tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s; + symbolNext[s] = 1; + } + else + { + if (normalizedCounter[s] >= largeLimit) noLarge=0; + symbolNext[s] = normalizedCounter[s]; + } + } + + /* Spread symbols */ + for (s=0; s<=maxSymbolValue; s++) + { + int i; + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } + } + + if (position!=0) return (size_t)-FSE_ERROR_GENERIC; /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + + /* Build Decoding table */ + { + U32 i; + for (i=0; i (size_t)(-FSE_ERROR_maxCode)); } @@ -302,8 +610,6 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) return maxSymbolValue ? 
maxHeaderSize : FSE_MAX_HEADERSIZE; } -#ifndef __clang_analyzer__ /* clang static analyzer has difficulties with this function : seems to believe normalizedCounter is uninitialized */ - static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, unsigned safeWrite) @@ -395,11 +701,10 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, out[1] = (BYTE)(bitStream>>8); out+= (bitCount+7) /8; - if (charnum > maxSymbolValue + 1) return (size_t)-FSE_ERROR_GENERIC; /* Too many symbols written (a bit too late?) */ + if (charnum > maxSymbolValue + 1) return (size_t)-FSE_ERROR_GENERIC; return (out-ostart); } -#endif // __clang_analyzer__ size_t FSE_writeNCount (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) @@ -868,6 +1173,12 @@ void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct) statePtr->stateLog = tableLog; } +void FSE_addBitsFast(FSE_CStream_t* bitC, size_t value, unsigned nbBits) /* only use if upper bits are clean 0 */ +{ + bitC->bitContainer |= value << bitC->bitPos; + bitC->bitPos += nbBits; +} + void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits) { static const unsigned mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF }; /* up to 25 bits */ @@ -1025,7 +1336,6 @@ size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize return op-ostart; } - size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize) { return FSE_compress2(dst, dstSize, src, (U32)srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG); @@ -1035,14 +1345,6 @@ size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize) /********************************************************* * 
Decompression (Byte symbols) *********************************************************/ -typedef struct -{ - U16 newState; - BYTE symbol; - BYTE nbBits; -} FSE_decode_t; /* size == U32 */ - - size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue) { U32* const base32 = (U32*)dt; @@ -1129,6 +1431,29 @@ size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSiz } +/* FSE_lookBits + * Provides next n bits from the bitContainer. + * bitContainer is not modified (bits are still present for next read/look) + * On 32-bits, maxNbBits==25 + * On 64-bits, maxNbBits==57 + * return : value extracted. + */ +static size_t FSE_lookBits(FSE_DStream_t* bitD, U32 nbBits) +{ + return ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(bitD->bitContainer)*8)-1))) >> 1) >> (((sizeof(bitD->bitContainer)*8)-1)-nbBits); +} + +static size_t FSE_lookBitsFast(FSE_DStream_t* bitD, U32 nbBits) /* only if nbBits >= 1 !! */ +{ + return (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(bitD->bitContainer)*8)-nbBits); +} + +static void FSE_skipBits(FSE_DStream_t* bitD, U32 nbBits) +{ + bitD->bitsConsumed += nbBits; +} + + /* FSE_readBits * Read next n bits from the bitContainer. * On 32-bits, don't read more than maxNbBits==25 @@ -1138,41 +1463,45 @@ size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSiz */ size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits) { - size_t value = ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(size_t)*8)-1))) >> 1) >> (((sizeof(size_t)*8)-1)-nbBits); - bitD->bitsConsumed += nbBits; + size_t value = FSE_lookBits(bitD, nbBits); + FSE_skipBits(bitD, nbBits); return value; } size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits) /* only if nbBits >= 1 !! 
*/ { - size_t value = (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(size_t)*8)-nbBits); - bitD->bitsConsumed += nbBits; + size_t value = FSE_lookBitsFast(bitD, nbBits); + FSE_skipBits(bitD, nbBits); return value; } unsigned FSE_reloadDStream(FSE_DStream_t* bitD) { - if (bitD->ptr >= bitD->start + sizeof(size_t)) + if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { bitD->ptr -= bitD->bitsConsumed >> 3; bitD->bitsConsumed &= 7; bitD->bitContainer = FSE_readLEST(bitD->ptr); - return 0; + return FSE_DStream_unfinished; } if (bitD->ptr == bitD->start) { - if (bitD->bitsConsumed < sizeof(size_t)*8) return 1; - if (bitD->bitsConsumed == sizeof(size_t)*8) return 2; - return 3; + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_partiallyFilled; + if (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8) return FSE_DStream_completed; + return FSE_DStream_tooFar; } { U32 nbBytes = bitD->bitsConsumed >> 3; + U32 result = FSE_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) + { nbBytes = (U32)(bitD->ptr - bitD->start); /* note : necessarily ptr > start */ + result = FSE_DStream_partiallyFilled; + } bitD->ptr -= nbBytes; bitD->bitsConsumed -= nbBytes*8; bitD->bitContainer = FSE_readLEST(bitD->ptr); /* note : necessarily srcSize > sizeof(bitD) */ - return (bitD->ptr == bitD->start); + return result; } } @@ -1212,7 +1541,7 @@ BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD) unsigned FSE_endOfDStream(const FSE_DStream_t* bitD) { - return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(size_t)*8)); + return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8)); } unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) @@ -1245,7 +1574,7 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic( /* 2 symbols per loop */ - while (!FSE_reloadDStream(&bitD) && (op= 1; Ends at exactly 2 */ + /* note : FSE_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at 
exactly FSE_DStream_completed */ while (1) { - if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) ) + if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) ) break; *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); - if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) ) + if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) ) break; *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); @@ -1323,290 +1652,608 @@ size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSr } -#endif /* FSE_COMMONDEFS_ONLY */ -/* - 2nd part of the file - designed to be included - for type-specific functions (template emulation in C) - Objective is to write these functions only once, for improved maintenance -*/ +/********************************************************* +* Huff0 : Huffman block compression +*********************************************************/ +#define HUF_ABSOLUTEMAX_TABLELOG 16 +#define HUF_MAX_TABLELOG 13 +#define HUF_DEFAULT_TABLELOG 12 +#define HUF_MAX_SYMBOL_VALUE 255 -/* safety checks */ -#ifndef FSE_FUNCTION_EXTENSION -# error "FSE_FUNCTION_EXTENSION must be defined" -#endif -#ifndef FSE_FUNCTION_TYPE -# error "FSE_FUNCTION_TYPE must be defined" -#endif +typedef struct HUF_CElt_s { + U16 val; + BYTE nbBits; +} HUF_CElt ; -/* Function names */ -#define FSE_CAT(X,Y) X##Y -#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) -#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; -/* Function templates */ -size_t FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) -(unsigned* count, unsigned* 
maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned safe) +#define HUF_HEADERLOG 8 +size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, U32 maxSymbolValue, U32 huffLog) { - const FSE_FUNCTION_TYPE* ip = source; - const FSE_FUNCTION_TYPE* const iend = ip+sourceSize; - unsigned maxSymbolValue = *maxSymbolValuePtr; - unsigned max=0; - int s; + BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; + U32 n; + BYTE* op = (BYTE*)dst; + size_t size; - U32 Counting1[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; - U32 Counting2[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; - U32 Counting3[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; - U32 Counting4[FSE_MAX_SYMBOL_VALUE+1] = { 0 }; + // check conditions + if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE + 1) + return (size_t)-FSE_ERROR_GENERIC; - /* safety checks */ - if (!sourceSize) - { - memset(count, 0, (maxSymbolValue + 1) * sizeof(FSE_FUNCTION_TYPE)); - *maxSymbolValuePtr = 0; - return 0; - } - if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC; /* maxSymbolValue too large : unsupported */ - if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE; /* 0 == default */ + for (n=0; n1)) - { - /* check input values, to avoid count table overflow */ - while (ip < iend-3) - { - if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++; - if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting2[*ip++]++; - if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting3[*ip++]++; - if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting4[*ip++]++; - } - } - else - { - U32 cached = FSE_read32(ip); ip += 4; - while (ip < iend-15) - { - U32 c = cached; cached = FSE_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = cached; cached = FSE_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = 
cached; cached = FSE_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - c = cached; cached = FSE_read32(ip); ip += 4; - Counting1[(BYTE) c ]++; - Counting2[(BYTE)(c>>8) ]++; - Counting3[(BYTE)(c>>16)]++; - Counting4[ c>>24 ]++; - } - ip-=4; - } + size = FSE_compress(op+1, maxDstSize-1, huffWeight, maxSymbolValue); // don't need last symbol stat : implied + if (FSE_isError(size)) return size; + if (size >= 128) return (size_t)-FSE_ERROR_GENERIC; + if (size <= 1) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented - /* finish last symbols */ - while (ipmaxSymbolValue)) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++; } - - for (s=0; s<=(int)maxSymbolValue; s++) - { - count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; - if (count[s] > max) max = count[s]; - } - - while (!count[maxSymbolValue]) maxSymbolValue--; - *maxSymbolValuePtr = maxSymbolValue; - return (size_t)max; -} - -/* hidden fast variant (unsafe) */ -size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION) -(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize) -{ - return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 0); -} - -size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION) -(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize) -{ - if ((sizeof(FSE_FUNCTION_TYPE)==1) && (*maxSymbolValuePtr >= 255)) - { - *maxSymbolValuePtr = 255; - return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 0); - } - return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, maxSymbolValuePtr, source, sourceSize, 1); + op[0] = (BYTE)size; + return size+1; } -static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; } - -size_t 
FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION) -(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) { - const unsigned tableSize = 1 << tableLog; - const unsigned tableMask = tableSize - 1; - U16* tableU16 = ( (U16*) ct) + 2; - FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) (((U32*)ct) + 1 + (tableLog ? tableSize>>1 : 1) ); - const unsigned step = FSE_tableStep(tableSize); - unsigned cumul[FSE_MAX_SYMBOL_VALUE+2]; - U32 position = 0; - FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE] = {0}; /* should not be necessary, but analyzer complain without it, and performance loss is negligible with it */ - U32 highThreshold = tableSize-1; - unsigned symbol; - unsigned i; + int totalCost = 0; + const U32 largestBits = huffNode[lastNonNull].nbBits; - /* header */ - tableU16[-2] = (U16) tableLog; - tableU16[-1] = (U16) maxSymbolValue; + // early exit : all is fine + if (largestBits <= maxNbBits) return largestBits; - /* For explanations on how to distribute symbol values over the table : - * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ - - /* symbol start positions */ - cumul[0] = 0; - for (i=1; i<=maxSymbolValue+1; i++) + // now we have a few too large elements (at least >= 2) { - if (normalizedCounter[i-1]==-1) /* Low prob symbol */ + const U32 baseCost = 1 << (largestBits - maxNbBits); + U32 n = lastNonNull; + + while (huffNode[n].nbBits > maxNbBits) { - cumul[i] = cumul[i-1] + 1; - tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(i-1); + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)maxNbBits; + n --; } - else - cumul[i] = cumul[i-1] + normalizedCounter[i-1]; - } - cumul[maxSymbolValue+1] = tableSize+1; - /* Spread symbols */ - for (symbol=0; symbol<=maxSymbolValue; symbol++) - { - int nbOccurences; - for (nbOccurences=0; 
nbOccurences>= (largestBits - maxNbBits); /* note : totalCost necessarily multiple of baseCost */ + + // repay cost + while (huffNode[n].nbBits == maxNbBits) n--; // n at last of rank (maxNbBits-1) + { - tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol; - position = (position + step) & tableMask; - while (position > highThreshold) position = (position + step) & tableMask; /* Lowprob area */ - } - } - - if (position!=0) return (size_t)-FSE_ERROR_GENERIC; /* Must have gone through all positions */ - - /* Build table */ - for (i=0; i= 0; pos--) { - case 0: - break; - case -1: - case 1: - symbolTT[s].minBitsOut = (BYTE)tableLog; - symbolTT[s].deltaFindState = total - 1; - total ++; - symbolTT[s].maxState = (U16)( (tableSize*2) - 1); /* ensures state <= maxState */ - break; - default : - symbolTT[s].minBitsOut = (BYTE)( (tableLog-1) - FSE_highbit32 (normalizedCounter[s]-1) ); - symbolTT[s].deltaFindState = total - normalizedCounter[s]; - total += normalizedCounter[s]; - symbolTT[s].maxState = (U16)( (normalizedCounter[s] << (symbolTT[s].minBitsOut+1)) - 1); + if (huffNode[pos].nbBits >= currentNbBits) continue; + currentNbBits = huffNode[pos].nbBits; + rankLast[maxNbBits-currentNbBits] = pos; + } + + while (totalCost > 0) + { + U32 nBitsToDecrease = FSE_highbit32(totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) + { + U32 highPos = rankLast[nBitsToDecrease]; + U32 lowPos = rankLast[nBitsToDecrease-1]; + if (highPos == noOne) continue; + if (lowPos == noOne) break; + { + U32 highTotal = huffNode[highPos].count; + U32 lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) break; + } + } + while (rankLast[nBitsToDecrease] == noOne) + nBitsToDecrease ++; // In some rare cases, no more rank 1 left => overshoot to closest + totalCost -= 1 << (nBitsToDecrease-1); + if (rankLast[nBitsToDecrease-1] == noOne) + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; // now there is one elt + huffNode[rankLast[nBitsToDecrease]].nbBits ++; + 
rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noOne; // rank list emptied + } + while (totalCost < 0) // Sometimes, cost correction overshoot + { + if (rankLast[1] == noOne) /* special case, no weight 1, let's find it back at n */ + { + while (huffNode[n].nbBits == maxNbBits) n--; + huffNode[n+1].nbBits--; + rankLast[1] = n+1; + totalCost++; + continue; + } + huffNode[ rankLast[1] + 1 ].nbBits--; + rankLast[1]++; + totalCost ++; + } + } + } + + return maxNbBits; +} + + +typedef struct { + U32 base; + U32 current; +} rankPos; + +static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue) +{ + rankPos rank[32]; + U32 n; + + memset(rank, 0, sizeof(rank)); + for (n=0; n<=maxSymbolValue; n++) + { + U32 r = FSE_highbit32(count[n] + 1); + rank[r].base ++; + } + for (n=30; n>0; n--) rank[n-1].base += rank[n].base; + for (n=0; n<32; n++) rank[n].current = rank[n].base; + for (n=0; n<=maxSymbolValue; n++) + { + U32 c = count[n]; + U32 r = FSE_highbit32(c+1) + 1; + U32 pos = rank[r].current++; + while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) huffNode[pos]=huffNode[pos-1], pos--; + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } +} + + +#define STARTNODE (HUF_MAX_SYMBOL_VALUE+1) +size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits) +{ + nodeElt huffNode0[2*HUF_MAX_SYMBOL_VALUE+1 +1]; + nodeElt* huffNode = huffNode0 + 1; + U32 n, nonNullRank; + int lowS, lowN; + U16 nodeNb = STARTNODE; + U32 nodeRoot; + + // check + if (maxSymbolValue > 255) return (size_t)-FSE_ERROR_GENERIC; + memset(huffNode0, 0, sizeof(huffNode0)); + + // sort, decreasing order + HUF_sort(huffNode, count, maxSymbolValue); + + // init for parents + nonNullRank = maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; + lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb; + huffNode[nodeNb].count = 
huffNode[lowS].count + huffNode[lowS-1].count; + huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb; + nodeNb++; lowS-=2; + for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30); + huffNode0[0].count = (U32)(1U<<31); + + // create parents + while (nodeNb <= nodeRoot) + { + U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; + huffNode[n1].parent = huffNode[n2].parent = nodeNb; + nodeNb++; + } + + // distribute weights (unlimited tree height) + huffNode[nodeRoot].nbBits = 0; + for (n=nodeRoot-1; n>=STARTNODE; n--) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + + // enforce maxTableLog + maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits); + + // fill result into tree (val, nbBits) + { + U16 nbPerRank[HUF_ABSOLUTEMAX_TABLELOG+1] = {0}; + U16 valPerRank[HUF_ABSOLUTEMAX_TABLELOG+1]; + if (maxNbBits > HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; // check + for (n=0; n<=nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + { + // determine stating value per rank + U16 min = 0; + for (n=maxNbBits; n>0; n--) + { + valPerRank[n] = min; // get starting value within each rank + min += nbPerRank[n]; + min >>= 1; } } + for (n=0; n<=maxSymbolValue; n++) + tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; // push nbBits per symbol, symbol order + for (n=0; n<=maxSymbolValue; n++) + tree[n].val = valPerRank[tree[n].nbBits]++; // assign value within rank, symbol order } - return 0; + return maxNbBits; } -#define FSE_DECODE_TYPE FSE_TYPE_NAME(FSE_decode_t, FSE_FUNCTION_EXTENSION) - -FSE_DTable* FSE_FUNCTION_NAME(FSE_createDTable, FSE_FUNCTION_EXTENSION) (unsigned tableLog) +static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* 
src, size_t srcSize, HUF_CElt* CTable) { - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); -} + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = (BYTE*) ostart; + U16* jumpTable = (U16*) dst; + size_t n, streamSize; + FSE_CStream_t bitC; -void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (FSE_DTable* dt) -{ - free(dt); -} + /* init */ + (void)dstSize; /* objective : ensure it fits into dstBuffer (Todo) */ + op += 6; /* jump Table -- could be optimized by delta / deviation */ + FSE_initCStream(&bitC, op); +#define FSE_FLUSHBITS_32(stream) \ + if (FSE_32bits()) FSE_flushBits(stream) -size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION) -(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) -{ - U32* const base32 = (U32*)dt; - FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1); - const U32 tableSize = 1 << tableLog; - const U32 tableMask = tableSize-1; - const U32 step = FSE_tableStep(tableSize); - U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; - U32 position = 0; - U32 highThreshold = tableSize-1; - const S16 largeLimit= (S16)(1 << (tableLog-1)); - U32 noLarge = 1; - U32 s; - - /* Sanity Checks */ - if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_maxSymbolValue_tooLarge; - if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge; - - /* Init, lay down lowprob symbols */ - base32[0] = tableLog; - for (s=0; s<=maxSymbolValue; s++) + n = srcSize & ~15; // mod 16 + switch (srcSize & 15) { - if (normalizedCounter[s]==-1) - { - tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s; - symbolNext[s] = 1; - } - else - { - if (normalizedCounter[s] >= largeLimit) noLarge=0; - symbolNext[s] = normalizedCounter[s]; - } + case 15: FSE_addBitsFast(&bitC, CTable[ip[n+14]].val, CTable[ip[n+14]].nbBits); + 
FSE_FLUSHBITS_32(&bitC); + case 14: FSE_addBitsFast(&bitC, CTable[ip[n+13]].val, CTable[ip[n+13]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 13: FSE_addBitsFast(&bitC, CTable[ip[n+12]].val, CTable[ip[n+12]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 12: FSE_addBitsFast(&bitC, CTable[ip[n+11]].val, CTable[ip[n+11]].nbBits); + FSE_flushBits(&bitC); + case 11: FSE_addBitsFast(&bitC, CTable[ip[n+10]].val, CTable[ip[n+10]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 10: FSE_addBitsFast(&bitC, CTable[ip[n+9]].val, CTable[ip[n+9]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 9 : FSE_addBitsFast(&bitC, CTable[ip[n+8]].val, CTable[ip[n+8]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 8 : FSE_addBitsFast(&bitC, CTable[ip[n+7]].val, CTable[ip[n+7]].nbBits); + FSE_flushBits(&bitC); + case 7 : FSE_addBitsFast(&bitC, CTable[ip[n+6]].val, CTable[ip[n+6]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 6 : FSE_addBitsFast(&bitC, CTable[ip[n+5]].val, CTable[ip[n+5]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 5 : FSE_addBitsFast(&bitC, CTable[ip[n+4]].val, CTable[ip[n+4]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 4 : FSE_addBitsFast(&bitC, CTable[ip[n+3]].val, CTable[ip[n+3]].nbBits); + FSE_flushBits(&bitC); + case 3 : FSE_addBitsFast(&bitC, CTable[ip[n+2]].val, CTable[ip[n+2]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 2 : FSE_addBitsFast(&bitC, CTable[ip[n+1]].val, CTable[ip[n+1]].nbBits); + FSE_FLUSHBITS_32(&bitC); + case 1 : FSE_addBitsFast(&bitC, CTable[ip[n+0]].val, CTable[ip[n+0]].nbBits); + FSE_flushBits(&bitC); + case 0 : + default: ; } - /* Spread symbols */ - for (s=0; s<=maxSymbolValue; s++) + for (; n>0; n-=16) { - int i; - for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ - } + FSE_addBitsFast(&bitC, CTable[ip[n- 4]].val, CTable[ip[n- 4]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n- 8]].val, CTable[ip[n- 8]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-12]].val, 
CTable[ip[n-12]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-16]].val, CTable[ip[n-16]].nbBits); + FSE_flushBits(&bitC); + } + streamSize = FSE_closeCStream(&bitC); + jumpTable[0] = (U16)streamSize; + op += streamSize; + + FSE_initCStream(&bitC, op); + n = srcSize & ~15; // mod 16 + for (; n>0; n-=16) + { + FSE_addBitsFast(&bitC, CTable[ip[n- 3]].val, CTable[ip[n- 3]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n- 7]].val, CTable[ip[n- 7]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-11]].val, CTable[ip[n-11]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-15]].val, CTable[ip[n-15]].nbBits); + FSE_flushBits(&bitC); + } + streamSize = FSE_closeCStream(&bitC); + jumpTable[1] = (U16)streamSize; + op += streamSize; + + FSE_initCStream(&bitC, op); + n = srcSize & ~15; // mod 16 + for (; n>0; n-=16) + { + FSE_addBitsFast(&bitC, CTable[ip[n- 2]].val, CTable[ip[n- 2]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n- 6]].val, CTable[ip[n- 6]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-10]].val, CTable[ip[n-10]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-14]].val, CTable[ip[n-14]].nbBits); + FSE_flushBits(&bitC); + } + streamSize = FSE_closeCStream(&bitC); + jumpTable[2] = (U16)streamSize; + op += streamSize; + + FSE_initCStream(&bitC, op); + n = srcSize & ~15; // mod 16 + for (; n>0; n-=16) + { + FSE_addBitsFast(&bitC, CTable[ip[n- 1]].val, CTable[ip[n- 1]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n- 5]].val, CTable[ip[n- 5]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n- 9]].val, CTable[ip[n- 9]].nbBits); + FSE_FLUSHBITS_32(&bitC); + FSE_addBitsFast(&bitC, CTable[ip[n-13]].val, CTable[ip[n-13]].nbBits); + FSE_flushBits(&bitC); + } + streamSize = FSE_closeCStream(&bitC); + op += streamSize; + + return op-ostart; +} + + +size_t 
HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + BYTE* const oend = ostart + dstSize; + + U32 count[HUF_MAX_SYMBOL_VALUE+1]; + HUF_CElt CTable[HUF_MAX_SYMBOL_VALUE+1]; + size_t errorCode; + + /* early out */ + if (dstSize < FSE_compressBound(srcSize)) return (size_t)-FSE_ERROR_dstSize_tooSmall; + if (srcSize <= 1) return srcSize; /* Uncompressed or RLE */ + if (!maxSymbolValue) maxSymbolValue = HUF_MAX_SYMBOL_VALUE; + if (!huffLog) huffLog = HUF_DEFAULT_TABLELOG; + + /* Scan input and build symbol stats */ + errorCode = FSE_count (count, &maxSymbolValue, (const BYTE*)src, srcSize); + if (FSE_isError(errorCode)) return errorCode; + if (errorCode == srcSize) return 1; + if (errorCode < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */ + + /* Build Huffman Tree */ + errorCode = HUF_buildCTable (CTable, count, maxSymbolValue, huffLog); + if (FSE_isError(errorCode)) return errorCode; + huffLog = (U32)errorCode; + + /* Write table description header */ + errorCode = HUF_writeCTable (op, dstSize, CTable, maxSymbolValue, huffLog); /* don't write last symbol, implied */ + if (FSE_isError(errorCode)) return errorCode; + op += errorCode; + + /* Compress */ + op += HUF_compress_usingCTable(op, oend - op, src, srcSize, CTable); + + /* check compressibility */ + if ((size_t)(op-ostart) >= srcSize-1) + return 0; + + return op-ostart; +} + +size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize) +{ + return HUF_compress2(dst, maxDstSize, src, (U32)srcSize, 255, HUF_DEFAULT_TABLELOG); +} + + +/********************************************************* +* Huff0 : Huffman block decompression +*********************************************************/ +typedef struct { + BYTE byte; + BYTE nbBits; +} HUF_DElt; + +size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) +{ + BYTE 
huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; + U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1] = {0}; + U32 weightTotal = 0; + U32 maxBits; + const BYTE* ip = (const BYTE*) src; + size_t iSize = ip[0]; + size_t oSize; + U32 n; + U32 nextRankStart; + HUF_DElt* const dt = (HUF_DElt*)(DTable + 1); + + FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16)); // if compilation fails here, assertion is false + if (iSize >= 128) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented + if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + + oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize); // max 255 values stored, last is implied + if (FSE_isError(oSize)) return oSize; + + // stats on weights + for (n=0; n> 1; } - if (position!=0) return (size_t)-FSE_ERROR_GENERIC; /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + // get last symbol weight(implied) + maxBits = FSE_highbit32(weightTotal) + 1; + if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_GENERIC; // DTable is too small + DTable[0] = (U16)maxBits; + { + U32 total = 1 << maxBits; + U32 rest = total - weightTotal; + U32 verif = 1 << FSE_highbit32(rest); + if (verif != rest) return (size_t)-FSE_ERROR_GENERIC; // last value must be a clean power of 2 + huffWeight[oSize] = (BYTE)(FSE_highbit32(rest) + 1); + rankVal[huffWeight[oSize]]++; + } - /* Build Decoding table */ + // Prepare ranks + nextRankStart = 0; + for (n=1; n<=maxBits; n++) + { + U32 current = nextRankStart; + nextRankStart += (rankVal[n] << (n-1)); + rankVal[n] = current; + } + + // fill table + for (n=0; n<=oSize; n++) { U32 i; - for (i=0; i> 1; + HUF_DElt D; + D.byte = (BYTE)n; D.nbBits = (BYTE)(maxBits + 1 - w); + for (i = rankVal[w]; i < rankVal[w] + length; i++) + dt[i] = D; + rankVal[w] += length; } - return noLarge; + return iSize+1; } + +/* +#define HUF_DECODE_SYMBOL(n, Dstream) \ + val = FSE_lookBitsFast(&Dstream, dtLog); \ + c = dt[val].byte; \ + FSE_skipBits(&Dstream, dt[val].nbBits); 
\ + op[n] = c; +*/ + +static void HUF_decodeSymbol(BYTE* ptr, FSE_DStream_t* Dstream, const HUF_DElt* dt, U32 dtLog) +{ + size_t val = FSE_lookBitsFast(Dstream, dtLog); + BYTE c = dt[val].byte; + FSE_skipBits(Dstream, dt[val].nbBits); + *ptr = c; +} + +static size_t HUF_decompress_usingDTable( + void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const U16* DTable) +{ + BYTE* const ostart = (BYTE*) dst; + BYTE* op = ostart; + BYTE* const omax = op + maxDstSize; + BYTE* const olimit = omax-15; + + const HUF_DElt* const dt = (const HUF_DElt*)(DTable+1); + const U32 dtLog = DTable[0]; + size_t errorCode; + U32 reloadStatus; + + /* Init */ + + const U16* jumpTable = (const U16*)cSrc; + const size_t length1 = jumpTable[0]; + const size_t length2 = jumpTable[1]; + const size_t length3 = jumpTable[2]; + const size_t length4 = cSrcSize - 6 - length1 - length2 - length3; // check coherency !! + const char* const start1 = (const char*)(cSrc) + 6; + const char* const start2 = start1 + length1; + const char* const start3 = start2 + length2; + const char* const start4 = start3 + length3; + FSE_DStream_t bitD1, bitD2, bitD3, bitD4; + + errorCode = FSE_initDStream(&bitD1, start1, length1); + if (FSE_isError(errorCode)) return errorCode; + errorCode = FSE_initDStream(&bitD2, start2, length2); + if (FSE_isError(errorCode)) return errorCode; + errorCode = FSE_initDStream(&bitD3, start3, length3); + if (FSE_isError(errorCode)) return errorCode; + errorCode = FSE_initDStream(&bitD4, start4, length4); + if (FSE_isError(errorCode)) return errorCode; + + reloadStatus=FSE_reloadDStream(&bitD2); + + /* 16 symbols per loop */ + for ( ; (reloadStatus= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + ip += errorCode; + cSrcSize -= errorCode; + + return HUF_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, DTable); +} + + +#endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/fse.h b/lib/fse.h index df95d258..cbef4def 100644 --- a/lib/fse.h +++ b/lib/fse.h @@ -70,7 +70,33 
@@ FSE_decompress(): ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!! Why ? : making this distinction requires a header. - FSE library doesn't manage headers, which are intentionally left to the user layer. + Header management is intentionally delegated to the user layer, which can better manage special cases. +*/ + + +/****************************************** +* Huff0 simple functions +******************************************/ +size_t HUF_compress (void* dst, size_t dstSize, const void* src, size_t srcSize); +size_t HUF_decompress(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize); +/* +HUF_compress(): + Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. + 'dst' buffer must be already allocated, and sized to handle worst case situations. + Worst case size evaluation is provided by FSE_compressBound(). + return : size of compressed data + Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. + if FSE_isError(return), it's an error code. + +HUF_decompress(): + Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize', + into already allocated destination buffer 'dst', of size 'maxDstSize'. + return : size of regenerated data (<= maxDstSize) + or an error code, which can be tested using FSE_isError() + + ** Important ** : HUF_decompress() doesn't decompress non-compressible nor RLE data !!! 
*/ @@ -98,6 +124,8 @@ FSE_compress2(): */ size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); + /****************************************** * FSE detailed API diff --git a/lib/fse_static.h b/lib/fse_static.h index f085d1f5..472d88c8 100644 --- a/lib/fse_static.h +++ b/lib/fse_static.h @@ -50,10 +50,16 @@ extern "C" { ******************************************/ #define FSE_MAX_HEADERSIZE 512 #define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE) /* Macro can be useful for static allocation */ -/* You can statically allocate a CTable as a table of unsigned using below macro */ + +/* You can statically allocate CTable/DTable as a table of unsigned using below macro */ #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) #define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<= 2 + FSE_reloadDStream(&DStream) >= FSE_DStream_completed When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. 
Checking if DStream has reached its end is performed by : diff --git a/lib/zstd.c b/lib/zstd.c index 5e4b2fa5..8fc3282f 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -124,7 +124,7 @@ typedef unsigned long long U64; /******************************************************** * Constants *********************************************************/ -static const U32 ZSTD_magicNumber = 0xFD2FB51C; /* Initial (limited) frame format */ +static const U32 ZSTD_magicNumber = 0xFD2FB51D; /* 2nd magic number (huff0) */ #define HASH_LOG (ZSTD_MEMORY_USAGE - 2) #define HASH_TABLESIZE (1 << HASH_LOG) @@ -512,7 +512,7 @@ static size_t ZSTD_compressRle (void* dst, size_t maxDstSize, const void* src, s /* Build header */ ostart[0] = (BYTE)(srcSize>>16); ostart[1] = (BYTE)(srcSize>>8); - ostart[2] = (BYTE)srcSize; + ostart[2] = (BYTE) srcSize; ostart[0] += (BYTE)(bt_rle<<6); return ZSTD_blockHeaderSize+1; @@ -527,9 +527,9 @@ static size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* sr memcpy(ostart + ZSTD_blockHeaderSize, src, srcSize); /* Build header */ - ostart[0] = (BYTE)(srcSize>>16); - ostart[1] = (BYTE)(srcSize>>8); - ostart[2] = (BYTE)srcSize; + ostart[0] = (BYTE)(srcSize>>16); + ostart[1] = (BYTE)(srcSize>>8); + ostart[2] = (BYTE) srcSize; ostart[0] += (BYTE)(bt_raw<<6); /* is a raw (uncompressed) block */ return ZSTD_blockHeaderSize+srcSize; @@ -537,7 +537,7 @@ static size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* sr /* return : size of CStream in bits */ -static size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize, +size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const FSE_CTable* CTable) { @@ -603,6 +603,33 @@ size_t ZSTD_minGain(size_t srcSize) static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, const void* src, size_t srcSize) { + const size_t minGain = ZSTD_minGain(srcSize); + +#if 1 +#define LHSIZE 5 + BYTE* const ostart = (BYTE*)dst; + size_t hsize = 
HUF_compress(ostart+LHSIZE, dstSize-LHSIZE, src, srcSize); + if (hsize<2) return hsize; /* special cases */ + if (hsize >= srcSize - minGain) return 0; + + hsize += 2; /* work around vs fixed 3-bytes header */ + + /* Build header */ + { + ostart[0] = (BYTE)(bt_compressed<<6); /* is a block, is compressed */ + ostart[0] += (BYTE)(hsize>>16); + ostart[1] = (BYTE)(hsize>>8); + ostart[2] = (BYTE)(hsize>>0); + ostart[0] += (BYTE)((srcSize>>16)<<3); + ostart[3] = (BYTE)(srcSize>>8); + ostart[4] = (BYTE)(srcSize>>0); + } + + hsize -= 2; + return hsize+LHSIZE; + +#else + const BYTE* const istart = (const BYTE*) src; const BYTE* ip = istart; @@ -616,7 +643,6 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, S16 norm[256]; U32 CTable[ FSE_CTABLE_SIZE_U32(LitFSELog, 256) ]; size_t errorCode; - const size_t minGain = ZSTD_minGain(srcSize); /* early out */ if (dstSize < FSE_compressBound(srcSize)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; @@ -658,6 +684,8 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, } return op-ostart; + +#endif // 1 } @@ -698,6 +726,7 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, { size_t cSize; size_t litSize = op_lit - op_lit_start; + if (litSize <= LITERAL_NOENTROPY) cSize = ZSTD_noCompressBlock (op, maxDstSize, op_lit_start, litSize); else { @@ -1304,7 +1333,7 @@ FORCE_INLINE size_t ZSTD_decompressLiterals_usingDTable_generic( return (size_t)-ZSTD_ERROR_GENERIC; } -static size_t ZSTD_decompressLiterals_usingDTable( +size_t ZSTD_decompressLiterals_usingDTable( void* const dst, size_t maxDstSize, const void* src, size_t srcSize, const FSE_DTable* DTable, U32 fast) @@ -1313,9 +1342,27 @@ static size_t ZSTD_decompressLiterals_usingDTable( return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 0); } -static size_t ZSTD_decompressLiterals(void* ctx, void* dst, size_t maxDstSize, +static size_t ZSTD_decompressLiterals(void* ctx, + void* dst, size_t maxDstSize, 
const void* src, size_t srcSize) { +#if 1 + + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + maxDstSize; + const BYTE* ip = (const BYTE*)src; + size_t errorCode; + size_t litSize = ip[1] + (ip[0]<<8); + litSize += ((ip[-3] >> 3) & 7) << 16; // mmmmh.... + op = oend - litSize; + + (void)ctx; + errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2); + if (FSE_isError(errorCode)) + return errorCode; + return litSize; + +#else /* assumed : blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; short norm[256]; @@ -1337,6 +1384,7 @@ static size_t ZSTD_decompressLiterals(void* ctx, void* dst, size_t maxDstSize, fastMode = (U32)errorCode; return ZSTD_decompressLiterals_usingDTable (dst, maxDstSize, ip, srcSize, DTable, fastMode); +#endif // 1 } @@ -1491,8 +1539,8 @@ FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize const BYTE* dumps; const BYTE* litPtr; const BYTE* litEnd; - const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ - const size_t dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ + const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ + const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ FSE_DTable* DTableML = (FSE_DTable*)ctx; FSE_DTable* DTableLL = DTableML + FSE_DTABLE_SIZE_U32(MLFSELog); FSE_DTable* DTableOffb = DTableLL + FSE_DTABLE_SIZE_U32(LLFSELog); @@ -1516,7 +1564,7 @@ FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize litEnd = ip - lastLLSize; ip += errorCode; - /* decompression */ + /* LZ decompression */ { FSE_DStream_t DStream; FSE_DState_t stateLL, stateOffb, stateML; @@ -1600,7 +1648,7 @@ _another_round: if (offset < 8) { - const size_t dec64 = dec64table[offset]; + const int dec64 = dec64table[offset]; op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; @@ -1670,7 +1718,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const ip += ZSTD_blockHeaderSize; remainingSize -= ZSTD_blockHeaderSize; 
- if (ip+blockSize > iend) + if (blockSize > remainingSize) return (size_t)-ZSTD_ERROR_wrongSrcSize; switch(blockProperties.blockType) diff --git a/programs/bench.c b/programs/bench.c index 31a43f32..5fdf0109 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -297,34 +297,37 @@ static int BMK_benchMem(void* srcBuffer, size_t srcSize, char* fileName, int cLe milliTime = BMK_GetMilliStart(); while (BMK_GetMilliStart() == milliTime); milliTime = BMK_GetMilliStart(); - while (BMK_GetMilliSpan(milliTime) < TIMELOOP) + for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++) { - ZSTD_decompress(resultBuffer, srcSize, compressedBuffer, cSize); - nbLoops++; + size_t result = ZSTD_decompress(resultBuffer, srcSize, compressedBuffer, cSize); + if (ZSTD_isError(result)) + { + DISPLAY("\n!!! Decompression error !!! %s !\n", ZSTD_getErrorName(result)); + break; + } } milliTime = BMK_GetMilliSpan(milliTime); if ((double)milliTime < fastestD*nbLoops) fastestD = (double)milliTime / nbLoops; DISPLAY("%1i-%-14.14s : %9i -> %9i (%5.2f%%),%7.1f MB/s ,%7.1f MB/s\r", loopNb, fileName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.); -#endif /* CRC Checking */ crcCheck = XXH64(resultBuffer, srcSize, 0); if (crcOrig!=crcCheck) { - unsigned i = 0; + unsigned i; DISPLAY("\n!!! WARNING !!! 
%14s : Invalid Checksum : %x != %x\n", fileName, (unsigned)crcOrig, (unsigned)crcCheck); - while (i Date: Mon, 27 Jul 2015 19:34:27 +0100 Subject: [PATCH 02/21] Updated fse (faster huff0 32-bits decoding) --- lib/fse.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index 5390e9cd..b6215b73 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -1656,10 +1656,13 @@ size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSr /********************************************************* * Huff0 : Huffman block compression *********************************************************/ -#define HUF_ABSOLUTEMAX_TABLELOG 16 -#define HUF_MAX_TABLELOG 13 -#define HUF_DEFAULT_TABLELOG 12 #define HUF_MAX_SYMBOL_VALUE 255 +#define HUF_DEFAULT_TABLELOG 12 /* used by default, when not specified */ +#define HUF_MAX_TABLELOG 12 /* max possible tableLog; for allocation purpose; can be modified */ +#define HUF_ABSOLUTEMAX_TABLELOG 16 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code is unsupported */ +#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG) +# error "HUF_MAX_TABLELOG is too large !" 
+#endif typedef struct HUF_CElt_s { U16 val; @@ -2192,26 +2195,33 @@ static size_t HUF_decompress_usingDTable( for ( ; (reloadStatus12)) FSE_reloadDStream(&Dstream) + +#define HUF_DECODE_SYMBOL_2(n, Dstream) \ HUF_decodeSymbol(op+n, &Dstream, dt, dtLog); \ if (FSE_32bits()) FSE_reloadDStream(&Dstream) - HUF_DECODE_SYMBOL( 0, bitD1); - HUF_DECODE_SYMBOL( 1, bitD2); - HUF_DECODE_SYMBOL( 2, bitD3); - HUF_DECODE_SYMBOL( 3, bitD4); - HUF_DECODE_SYMBOL( 4, bitD1); - HUF_DECODE_SYMBOL( 5, bitD2); - HUF_DECODE_SYMBOL( 6, bitD3); - HUF_DECODE_SYMBOL( 7, bitD4); - HUF_DECODE_SYMBOL( 8, bitD1); - HUF_DECODE_SYMBOL( 9, bitD2); - HUF_DECODE_SYMBOL(10, bitD3); - HUF_DECODE_SYMBOL(11, bitD4); - HUF_DECODE_SYMBOL(12, bitD1); - HUF_DECODE_SYMBOL(13, bitD2); - HUF_DECODE_SYMBOL(14, bitD3); - HUF_DECODE_SYMBOL(15, bitD4); + HUF_DECODE_SYMBOL_1( 0, bitD1); + HUF_DECODE_SYMBOL_1( 1, bitD2); + HUF_DECODE_SYMBOL_1( 2, bitD3); + HUF_DECODE_SYMBOL_1( 3, bitD4); + HUF_DECODE_SYMBOL_2( 4, bitD1); + HUF_DECODE_SYMBOL_2( 5, bitD2); + HUF_DECODE_SYMBOL_2( 6, bitD3); + HUF_DECODE_SYMBOL_2( 7, bitD4); + HUF_DECODE_SYMBOL_1( 8, bitD1); + HUF_DECODE_SYMBOL_1( 9, bitD2); + HUF_DECODE_SYMBOL_1(10, bitD3); + HUF_DECODE_SYMBOL_1(11, bitD4); + HUF_DECODE_SYMBOL_0(12, bitD1); + HUF_DECODE_SYMBOL_0(13, bitD2); + HUF_DECODE_SYMBOL_0(14, bitD3); + HUF_DECODE_SYMBOL_0(15, bitD4); } if (reloadStatus!=FSE_DStream_completed) /* not complete : some bitStream might be 0 (unfinished) */ @@ -2227,7 +2237,7 @@ static size_t HUF_decompress_usingDTable( bitTail.start = start1; for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op Date: Mon, 27 Jul 2015 20:19:21 +0100 Subject: [PATCH 03/21] Fixed some minor sanitizer warnings within huff0 --- lib/fse.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index b6215b73..8088711b 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -138,6 +138,38 @@ static unsigned 
FSE_isLittleEndian(void) return one.c[0]; } +static U16 FSE_read16(const void* memPtr) +{ + U16 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U16 FSE_readLE16(const void* memPtr) +{ + if (FSE_isLittleEndian()) + return FSE_read16(memPtr); + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +static void FSE_writeLE16(void* memPtr, U16 val) +{ + if (FSE_isLittleEndian()) + { + memcpy(memPtr, &val, sizeof(val)); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + static U32 FSE_read32(const void* memPtr) { U32 val32; @@ -1958,7 +1990,7 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); - jumpTable[0] = (U16)streamSize; + FSE_writeLE16(jumpTable, (U16)streamSize); op += streamSize; FSE_initCStream(&bitC, op); @@ -1975,7 +2007,7 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); - jumpTable[1] = (U16)streamSize; + FSE_writeLE16(jumpTable+1, (U16)streamSize); op += streamSize; FSE_initCStream(&bitC, op); @@ -1992,7 +2024,7 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); - jumpTable[2] = (U16)streamSize; + FSE_writeLE16(jumpTable+2, (U16)streamSize); op += streamSize; FSE_initCStream(&bitC, op); @@ -2170,9 +2202,9 @@ static size_t HUF_decompress_usingDTable( /* Init */ const U16* jumpTable = (const U16*)cSrc; - const size_t length1 = jumpTable[0]; - const size_t length2 = jumpTable[1]; - const size_t length3 = jumpTable[2]; + const size_t length1 = FSE_readLE16(jumpTable); + const size_t length2 = FSE_readLE16(jumpTable+1); + const size_t length3 = FSE_readLE16(jumpTable+2); const size_t length4 = cSrcSize - 6 - length1 - length2 - length3; // check coherency !! 
const char* const start1 = (const char*)(cSrc) + 6; const char* const start2 = start1 + length1; From 77c82b680b78580c90a6d109765211a97f068aa1 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 2 Aug 2015 01:19:09 +0100 Subject: [PATCH 04/21] updated fse --- lib/fse.c | 306 +++++++++++++++++++++++------------------------ lib/fse_static.h | 2 +- 2 files changed, 151 insertions(+), 157 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index 8088711b..4bf587d8 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -286,11 +286,9 @@ static void FSE_writeLEST(void* memPtr, size_t val) ****************************************************************/ typedef struct { - int deltaFindState; - U16 maxState; - BYTE minBitsOut; - /* one byte padding ; total 8 bytes */ -} FSE_symbolCompressionTransform; + int deltaFindState; + U32 deltaNbBits; +} FSE_symbolCompressionTransform; /* total 8 bytes */ typedef U32 CTable_max_t[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; @@ -509,20 +507,22 @@ size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION) { switch (normalizedCounter[s]) { - case 0: + case 0: break; case -1: - case 1: - symbolTT[s].minBitsOut = (BYTE)tableLog; + case 1: + symbolTT[s].deltaNbBits = tableLog * (1 << 16); symbolTT[s].deltaFindState = total - 1; total ++; - symbolTT[s].maxState = (U16)( (tableSize*2) - 1); /* ensures state <= maxState */ break; default : - symbolTT[s].minBitsOut = (BYTE)( (tableLog-1) - FSE_highbit32 (normalizedCounter[s]-1) ); - symbolTT[s].deltaFindState = total - normalizedCounter[s]; - total += normalizedCounter[s]; - symbolTT[s].maxState = (U16)( (normalizedCounter[s] << (symbolTT[s].minBitsOut+1)) - 1); + { + U32 maxBitsOut = tableLog - FSE_highbit32 (normalizedCounter[s]-1); + U32 minStatePlus = normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = total - normalizedCounter[s]; + 
total += normalizedCounter[s]; + } } } } @@ -914,50 +914,6 @@ int FSE_compareRankT(const void* r1, const void* r2) } -#if 0 -static size_t FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* count, U32 maxSymbolValue) -{ - rank_t rank[FSE_MAX_SYMBOL_VALUE+2]; - U32 s; - - /* Init */ - for (s=0; s<=maxSymbolValue; s++) - { - rank[s].id = s; - rank[s].count = count[s]; - if (norm[s] <= 1) rank[s].count = 0; - } - rank[maxSymbolValue+1].id = 0; - rank[maxSymbolValue+1].count = 0; /* ensures comparison ends here in worst case */ - - /* Sort according to count */ - qsort(rank, maxSymbolValue+1, sizeof(rank_t), FSE_compareRankT); - - while(pointsToRemove) - { - int newRank = 1; - rank_t savedR; - if (norm[rank[0].id] == 1) - return (size_t)-FSE_ERROR_GENERIC; - norm[rank[0].id]--; - pointsToRemove--; - rank[0].count -= (rank[0].count + 6) >> 3; - if (norm[rank[0].id] == 1) - rank[0].count=0; - savedR = rank[0]; - while (rank[newRank].count > savedR.count) - { - rank[newRank-1] = rank[newRank]; - newRank++; - } - rank[newRank-1] = savedR; - } - - return 0; -} - -#else - /* Secondary normalization method. To be used when primary method fails. 
*/ @@ -1048,7 +1004,6 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, return 0; } -#endif size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, @@ -1153,9 +1108,8 @@ size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) /* Build Symbol Transformation Table */ for (s=0; s<=maxSymbolValue; s++) { - symbolTT[s].minBitsOut = (BYTE)nbBits; + symbolTT[s].deltaNbBits = nbBits << 16; symbolTT[s].deltaFindState = s-1; - symbolTT[s].maxState = (U16)( (tableSize*2) - 1); /* ensures state <= maxState */ } return 0; @@ -1165,7 +1119,6 @@ size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) /* fake FSE_CTable, for rle (100% always same symbol) input */ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue) { - const unsigned tableSize = 1; U16* tableU16 = ( (U16*) ct) + 2; FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)ct + 2); @@ -1179,9 +1132,8 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue) /* Build Symbol Transformation Table */ { - symbolTT[symbolValue].minBitsOut = 0; + symbolTT[symbolValue].deltaNbBits = 0; symbolTT[symbolValue].deltaFindState = 0; - symbolTT[symbolValue].maxState = (U16)(2*tableSize-1); /* ensures state <= maxState */ } return 0; @@ -1218,12 +1170,11 @@ void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits) bitC->bitPos += nbBits; } -void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, BYTE symbol) +void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol) { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; const U16* const stateTable = (const U16*)(statePtr->stateTable); - int nbBitsOut = symbolTT.minBitsOut; - nbBitsOut -= (int)((symbolTT.maxState - statePtr->value) >> 31); + U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); FSE_addBits(bitC, statePtr->value, nbBitsOut); statePtr->value = 
stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } @@ -1286,7 +1237,7 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize, } /* join to mod 4 */ - if ((sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) /* test bit 2 */ + if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) /* test bit 2 */ { FSE_encodeSymbol(&bitC, &CState2, *--ip); FSE_encodeSymbol(&bitC, &CState1, *--ip); @@ -1298,12 +1249,12 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize, { FSE_encodeSymbol(&bitC, &CState2, *--ip); - if (sizeof(size_t)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */ + if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */ FSE_flushBits(&bitC); FSE_encodeSymbol(&bitC, &CState1, *--ip); - if (sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 ) /* this test must be static */ + if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) /* this test must be static */ { FSE_encodeSymbol(&bitC, &CState2, *--ip); FSE_encodeSymbol(&bitC, &CState1, *--ip); @@ -1604,22 +1555,27 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic( FSE_initDState(&state1, &bitD, dt); FSE_initDState(&state2, &bitD, dt); +#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD) - /* 2 symbols per loop */ - while ((FSE_reloadDStream(&bitD)==FSE_DStream_unfinished) && (op sizeof(size_t)*8) /* This test must be static */ + if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ FSE_reloadDStream(&bitD); - *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); + op[1] = FSE_GETSYMBOL(&state2); - if (FSE_MAX_TABLELOG*4+7 < sizeof(size_t)*8) /* This test must be static */ - { - *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); - *op++ = fast ? 
FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); - } + if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + { if (FSE_reloadDStream(&bitD) > FSE_DStream_unfinished) { op+=2; break; } } + + op[2] = FSE_GETSYMBOL(&state1); + + if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + FSE_reloadDStream(&bitD); + + op[3] = FSE_GETSYMBOL(&state2); } /* tail */ @@ -1629,16 +1585,16 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic( if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) ) break; - *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); + *op++ = FSE_GETSYMBOL(&state1); if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) ) break; - *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); + *op++ = FSE_GETSYMBOL(&state2); } /* end ? */ - if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) ) + if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2)) return op-ostart; if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* dst buffer is full, but cSrc unfinished */ @@ -1691,7 +1647,7 @@ size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSr #define HUF_MAX_SYMBOL_VALUE 255 #define HUF_DEFAULT_TABLELOG 12 /* used by default, when not specified */ #define HUF_MAX_TABLELOG 12 /* max possible tableLog; for allocation purpose; can be modified */ -#define HUF_ABSOLUTEMAX_TABLELOG 16 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code is unsupported */ +#define HUF_ABSOLUTEMAX_TABLELOG 16 /* absolute limit of HUF_MAX_TABLELOG. 
Beyond that value, code does not work */ #if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG) # error "HUF_MAX_TABLELOG is too large !" #endif @@ -1709,7 +1665,6 @@ typedef struct nodeElt_s { } nodeElt; -#define HUF_HEADERLOG 8 size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, U32 maxSymbolValue, U32 huffLog) { BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; @@ -1726,9 +1681,24 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, U32 size = FSE_compress(op+1, maxDstSize-1, huffWeight, maxSymbolValue); // don't need last symbol stat : implied if (FSE_isError(size)) return size; - if (size >= 128) return (size_t)-FSE_ERROR_GENERIC; - if (size <= 1) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented + if (size >= 128) return (size_t)-FSE_ERROR_GENERIC; // should never happen, since maxSymbolValue <= 255 + if ((size <= 1) || (size >= maxSymbolValue/2)) + { + if (maxSymbolValue > 64) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented (not possible) + if (size==1) // RLE + { + op[0] = (BYTE)(128 /*special case*/ + 64 /* RLE */ + (maxSymbolValue-1)); + op[1] = huffWeight[0]; + return 2; + } + // Not compressible + op[0] = (BYTE)(128 /*special case*/ + 0 /* Not Compressible */ + (maxSymbolValue-1)); + for (n=0; n 255) return (size_t)-FSE_ERROR_GENERIC; + if (maxNbBits == 0) maxNbBits = HUF_DEFAULT_TABLELOG; + if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC; memset(huffNode0, 0, sizeof(huffNode0)); // sort, decreasing order @@ -1923,6 +1894,16 @@ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U3 return maxNbBits; } +static void HUF_encodeSymbol(FSE_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +{ + FSE_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +#define FSE_FLUSHBITS_1(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*2+7) FSE_flushBits(stream) + +#define FSE_FLUSHBITS_2(stream) \ + 
if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*4+7) FSE_flushBits(stream) static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, HUF_CElt* CTable) { @@ -1938,41 +1919,38 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr op += 6; /* jump Table -- could be optimized by delta / deviation */ FSE_initCStream(&bitC, op); -#define FSE_FLUSHBITS_32(stream) \ - if (FSE_32bits()) FSE_flushBits(stream) - n = srcSize & ~15; // mod 16 switch (srcSize & 15) { - case 15: FSE_addBitsFast(&bitC, CTable[ip[n+14]].val, CTable[ip[n+14]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 14: FSE_addBitsFast(&bitC, CTable[ip[n+13]].val, CTable[ip[n+13]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 13: FSE_addBitsFast(&bitC, CTable[ip[n+12]].val, CTable[ip[n+12]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 12: FSE_addBitsFast(&bitC, CTable[ip[n+11]].val, CTable[ip[n+11]].nbBits); + case 15: HUF_encodeSymbol(&bitC, ip[n+14], CTable); + FSE_FLUSHBITS_1(&bitC); + case 14: HUF_encodeSymbol(&bitC, ip[n+13], CTable); + FSE_FLUSHBITS_2(&bitC); + case 13: HUF_encodeSymbol(&bitC, ip[n+12], CTable); + FSE_FLUSHBITS_1(&bitC); + case 12: HUF_encodeSymbol(&bitC, ip[n+11], CTable); FSE_flushBits(&bitC); - case 11: FSE_addBitsFast(&bitC, CTable[ip[n+10]].val, CTable[ip[n+10]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 10: FSE_addBitsFast(&bitC, CTable[ip[n+9]].val, CTable[ip[n+9]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 9 : FSE_addBitsFast(&bitC, CTable[ip[n+8]].val, CTable[ip[n+8]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 8 : FSE_addBitsFast(&bitC, CTable[ip[n+7]].val, CTable[ip[n+7]].nbBits); + case 11: HUF_encodeSymbol(&bitC, ip[n+10], CTable); + FSE_FLUSHBITS_1(&bitC); + case 10: HUF_encodeSymbol(&bitC, ip[n+ 9], CTable); + FSE_FLUSHBITS_2(&bitC); + case 9 : HUF_encodeSymbol(&bitC, ip[n+ 8], CTable); + FSE_FLUSHBITS_1(&bitC); + case 8 : HUF_encodeSymbol(&bitC, ip[n+ 7], CTable); FSE_flushBits(&bitC); - case 7 : 
FSE_addBitsFast(&bitC, CTable[ip[n+6]].val, CTable[ip[n+6]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 6 : FSE_addBitsFast(&bitC, CTable[ip[n+5]].val, CTable[ip[n+5]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 5 : FSE_addBitsFast(&bitC, CTable[ip[n+4]].val, CTable[ip[n+4]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 4 : FSE_addBitsFast(&bitC, CTable[ip[n+3]].val, CTable[ip[n+3]].nbBits); + case 7 : HUF_encodeSymbol(&bitC, ip[n+ 6], CTable); + FSE_FLUSHBITS_1(&bitC); + case 6 : HUF_encodeSymbol(&bitC, ip[n+ 5], CTable); + FSE_FLUSHBITS_2(&bitC); + case 5 : HUF_encodeSymbol(&bitC, ip[n+ 4], CTable); + FSE_FLUSHBITS_1(&bitC); + case 4 : HUF_encodeSymbol(&bitC, ip[n+ 3], CTable); FSE_flushBits(&bitC); - case 3 : FSE_addBitsFast(&bitC, CTable[ip[n+2]].val, CTable[ip[n+2]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 2 : FSE_addBitsFast(&bitC, CTable[ip[n+1]].val, CTable[ip[n+1]].nbBits); - FSE_FLUSHBITS_32(&bitC); - case 1 : FSE_addBitsFast(&bitC, CTable[ip[n+0]].val, CTable[ip[n+0]].nbBits); + case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); + FSE_FLUSHBITS_2(&bitC); + case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); + FSE_FLUSHBITS_1(&bitC); + case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); FSE_flushBits(&bitC); case 0 : default: ; @@ -1980,13 +1958,13 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr for (; n>0; n-=16) { - FSE_addBitsFast(&bitC, CTable[ip[n- 4]].val, CTable[ip[n- 4]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n- 8]].val, CTable[ip[n- 8]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-12]].val, CTable[ip[n-12]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-16]].val, CTable[ip[n-16]].nbBits); + HUF_encodeSymbol(&bitC, ip[n- 4], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 8], CTable); + FSE_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n-12], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, 
ip[n-16], CTable); FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); @@ -1997,13 +1975,13 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr n = srcSize & ~15; // mod 16 for (; n>0; n-=16) { - FSE_addBitsFast(&bitC, CTable[ip[n- 3]].val, CTable[ip[n- 3]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n- 7]].val, CTable[ip[n- 7]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-11]].val, CTable[ip[n-11]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-15]].val, CTable[ip[n-15]].nbBits); + HUF_encodeSymbol(&bitC, ip[n- 3], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 7], CTable); + FSE_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n-11], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n-15], CTable); FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); @@ -2014,13 +1992,13 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr n = srcSize & ~15; // mod 16 for (; n>0; n-=16) { - FSE_addBitsFast(&bitC, CTable[ip[n- 2]].val, CTable[ip[n- 2]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n- 6]].val, CTable[ip[n- 6]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-10]].val, CTable[ip[n-10]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-14]].val, CTable[ip[n-14]].nbBits); + HUF_encodeSymbol(&bitC, ip[n- 2], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 6], CTable); + FSE_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n-10], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n-14], CTable); FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); @@ -2031,13 +2009,13 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr n = srcSize & ~15; // mod 16 for (; n>0; n-=16) { - FSE_addBitsFast(&bitC, CTable[ip[n- 1]].val, CTable[ip[n- 
1]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n- 5]].val, CTable[ip[n- 5]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n- 9]].val, CTable[ip[n- 9]].nbBits); - FSE_FLUSHBITS_32(&bitC); - FSE_addBitsFast(&bitC, CTable[ip[n-13]].val, CTable[ip[n-13]].nbBits); + HUF_encodeSymbol(&bitC, ip[n- 1], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 5], CTable); + FSE_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 9], CTable); + FSE_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n-13], CTable); FSE_flushBits(&bitC); } streamSize = FSE_closeCStream(&bitC); @@ -2117,11 +2095,35 @@ size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) HUF_DElt* const dt = (HUF_DElt*)(DTable + 1); FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16)); // if compilation fails here, assertion is false - if (iSize >= 128) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented - if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + if (iSize >= 128) // special case + { + if (iSize >= (128+64)) // RLE + { + if (srcSize < 2) return (size_t)-FSE_ERROR_srcSize_wrong; + oSize = (iSize & 63) + 1; + memset(huffWeight, ip[1], oSize); + iSize = 1; + } + else // Incompressible + { + oSize = (iSize & 63) + 1; + iSize = ((oSize+1)/2); + if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + ip += 1; + for (n=0; n> 4); + huffWeight[n+1] = (ip[n/2] & 15); + } + } + } + else // normal case, header compressed with FSE + { + if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize); // max 255 values stored, last is implied + if (FSE_isError(oSize)) return oSize; + } - oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize); // max 255 values stored, last is implied - if (FSE_isError(oSize)) return oSize; // stats on weights for (n=0; n Date: Fri, 7 Aug 2015 15:21:00 +0100 Subject: [PATCH 05/21] 
Updated FSE lib --- lib/fse.c | 334 +++++++++++++++++++++++++++++------------------ lib/fse.h | 102 +++++++-------- lib/fse_static.h | 59 +++++---- lib/zstd.c | 95 +++----------- 4 files changed, 310 insertions(+), 280 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index 4bf587d8..63684fc0 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -496,7 +496,7 @@ size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION) for (i=0; i FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge; /* Init, lay down lowprob symbols */ - base32[0] = tableLog; + DTableH[0].tableLog = (U16)tableLog; for (s=0; s<=maxSymbolValue; s++) { if (normalizedCounter[s]==-1) @@ -606,7 +610,8 @@ size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION) } } - return noLarge; + DTableH->fastMode = (U16)noLarge; + return 0; } @@ -639,7 +644,7 @@ static short FSE_abs(short a) size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1; - return maxSymbolValue ? maxHeaderSize : FSE_MAX_HEADERSIZE; + return maxSymbolValue ? 
maxHeaderSize : FSE_NCOUNTBOUND; } static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, @@ -679,7 +684,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, { start+=24; bitStream += 0xFFFFU << bitCount; - if ((!safeWrite) && (out > oend-2)) return (size_t)-FSE_ERROR_GENERIC; /* Buffer overflow */ + if ((!safeWrite) && (out > oend-2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE) bitStream; out[1] = (BYTE)(bitStream>>8); out+=2; @@ -695,7 +700,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, bitCount += 2; if (bitCount>16) { - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC; /* Buffer overflow */ + if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; @@ -718,7 +723,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, } if (bitCount>16) { - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC; /* Buffer overflow */ + if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; @@ -728,7 +733,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, } /* flush remaining bitStream */ - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC; /* Buffer overflow */ + if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out+= (bitCount+7) /8; @@ -899,21 +904,6 @@ unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS } -typedef struct -{ - U32 id; - U32 count; -} rank_t; - -int FSE_compareRankT(const void* r1, const void* r2) -{ - const rank_t* R1 = (const rank_t*)r1; - const 
rank_t* R2 = (const rank_t*)r2; - - return 2 * (R1->count < R2->count) - 1; -} - - /* Secondary normalization method. To be used when primary method fails. */ @@ -1140,12 +1130,15 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue) } -void FSE_initCStream(FSE_CStream_t* bitC, void* start) +size_t FSE_initCStream(FSE_CStream_t* bitC, void* start, size_t maxSize) { + if (maxSize < 8) return (size_t)-FSE_ERROR_dstSize_tooSmall; bitC->bitContainer = 0; - bitC->bitPos = 0; /* reserved for unusedBits */ + bitC->bitPos = 0; bitC->startPtr = (char*)start; bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + maxSize - sizeof(bitC->ptr); + return 0; } void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct) @@ -1179,13 +1172,26 @@ void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol) statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } +void FSE_flushBitsFast(FSE_CStream_t* bitC) /* only if dst buffer is large enough ( >= FSE_compressBound()) */ +{ + size_t nbBytes = bitC->bitPos >> 3; + FSE_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + void FSE_flushBits(FSE_CStream_t* bitC) { size_t nbBytes = bitC->bitPos >> 3; FSE_writeLEST(bitC->ptr, bitC->bitContainer); - bitC->bitPos &= 7; bitC->ptr += nbBytes; - bitC->bitContainer >>= nbBytes*8; + if (bitC->ptr <= bitC->endPtr) + { + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; + return; + } } void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* statePtr) @@ -1199,9 +1205,12 @@ size_t FSE_closeCStream(FSE_CStream_t* bitC) { char* endPtr; - FSE_addBits(bitC, 1, 1); + FSE_addBitsFast(bitC, 1, 1); FSE_flushBits(bitC); + if (bitC->bitPos > 7) /* still some data to flush => too close to buffer's end */ + return 0; /* not compressible */ + endPtr = bitC->ptr; endPtr += bitC->bitPos > 0; @@ -1209,31 +1218,34 @@ size_t FSE_closeCStream(FSE_CStream_t* 
bitC) } -size_t FSE_compress_usingCTable (void* dst, size_t dstSize, +static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize, const void* src, size_t srcSize, - const FSE_CTable* ct) + const FSE_CTable* ct, const unsigned fast) { const BYTE* const istart = (const BYTE*) src; const BYTE* ip; const BYTE* const iend = istart + srcSize; + size_t errorCode; FSE_CStream_t bitC; FSE_CState_t CState1, CState2; /* init */ - (void)dstSize; /* objective : ensure it fits into dstBuffer (Todo) */ - FSE_initCStream(&bitC, dst); + errorCode = FSE_initCStream(&bitC, dst, dstSize); + if (FSE_isError(errorCode)) return 0; FSE_initCState(&CState1, ct); CState2 = CState1; ip=iend; +#define FSE_FLUSHBITS(s) (fast ? FSE_flushBitsFast(s) : FSE_flushBits(s)) + /* join to even */ if (srcSize & 1) { FSE_encodeSymbol(&bitC, &CState1, *--ip); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } /* join to mod 4 */ @@ -1241,16 +1253,16 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize, { FSE_encodeSymbol(&bitC, &CState2, *--ip); FSE_encodeSymbol(&bitC, &CState1, *--ip); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } /* 2 or 4 encoding per loop */ - while (ip>istart) + for ( ; ip>istart ; ) { FSE_encodeSymbol(&bitC, &CState2, *--ip); if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */ - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); FSE_encodeSymbol(&bitC, &CState1, *--ip); @@ -1260,7 +1272,7 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize, FSE_encodeSymbol(&bitC, &CState1, *--ip); } - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } FSE_flushCState(&bitC, &CState2); @@ -1268,10 +1280,21 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize, return FSE_closeCStream(&bitC); } +size_t FSE_compress_usingCTable (void* dst, size_t dstSize, + const void* src, size_t srcSize, + const FSE_CTable* ct) +{ + const unsigned fast = (dstSize >= FSE_BLOCKBOUND(srcSize)); + + if (fast) + return 
FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1); + else + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0); +} + size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); } - size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog) { const BYTE* const istart = (const BYTE*) src; @@ -1286,9 +1309,8 @@ size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize CTable_max_t ct; size_t errorCode; - /* early out */ - if (dstSize < FSE_compressBound(srcSize)) return (size_t)-FSE_ERROR_dstSize_tooSmall; - if (srcSize <= 1) return srcSize; /* Uncompressed or RLE */ + /* init conditions */ + if (srcSize <= 1) return 0; /* Uncompressible */ if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE; if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG; @@ -1303,14 +1325,16 @@ size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize if (FSE_isError(errorCode)) return errorCode; /* Write table description header */ - errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog); + errorCode = FSE_writeNCount (op, oend-op, norm, maxSymbolValue, tableLog); if (FSE_isError(errorCode)) return errorCode; op += errorCode; /* Compress */ errorCode = FSE_buildCTable (ct, norm, maxSymbolValue, tableLog); if (FSE_isError(errorCode)) return errorCode; - op += FSE_compress_usingCTable(op, oend - op, ip, srcSize, ct); + errorCode = FSE_compress_usingCTable(op, oend - op, ip, srcSize, ct); + if (errorCode == 0) return 0; /* not enough space for compressed data */ + op += errorCode; /* check compressibility */ if ( (size_t)(op-ostart) >= srcSize-1 ) @@ -1330,10 +1354,11 @@ size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize) *********************************************************/ size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue) { - U32* const base32 = 
(U32*)dt; - FSE_decode_t* const cell = (FSE_decode_t*)(base32 + 1); + FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)dt; + FSE_decode_t* const cell = (FSE_decode_t*)(dt + 1); /* because dt is unsigned */ - base32[0] = 0; + DTableH->tableLog = 0; + DTableH->fastMode = 0; cell->newState = 0; cell->symbol = symbolValue; @@ -1345,8 +1370,8 @@ size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue) size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) { - U32* const base32 = (U32*)dt; - FSE_decode_t* dinfo = (FSE_decode_t*)(base32 + 1); + FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)dt; + FSE_decode_t* const dinfo = (FSE_decode_t*)(dt + 1); /* because dt is unsigned */ const unsigned tableSize = 1 << nbBits; const unsigned tableMask = tableSize - 1; const unsigned maxSymbolValue = tableMask; @@ -1356,7 +1381,8 @@ size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC; /* min size */ /* Build Decoding Table */ - base32[0] = nbBits; + DTableH->tableLog = (U16)nbBits; + DTableH->fastMode = 1; for (s=0; s<=maxSymbolValue; s++) { dinfo[s].newState = 0; @@ -1469,7 +1495,7 @@ unsigned FSE_reloadDStream(FSE_DStream_t* bitD) } if (bitD->ptr == bitD->start) { - if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_partiallyFilled; + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_endOfBuffer; if (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8) return FSE_DStream_completed; return FSE_DStream_tooFar; } @@ -1479,7 +1505,7 @@ unsigned FSE_reloadDStream(FSE_DStream_t* bitD) if (bitD->ptr - nbBytes < bitD->start) { nbBytes = (U32)(bitD->ptr - bitD->start); /* note : necessarily ptr > start */ - result = FSE_DStream_partiallyFilled; + result = FSE_DStream_endOfBuffer; } bitD->ptr -= nbBytes; bitD->bitsConsumed -= nbBytes*8; @@ -1491,10 +1517,10 @@ unsigned FSE_reloadDStream(FSE_DStream_t* bitD) void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* 
bitD, const FSE_DTable* dt) { - const U32* const base32 = (const U32*)dt; - DStatePtr->state = FSE_readBits(bitD, base32[0]); + const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)dt; + DStatePtr->state = FSE_readBits(bitD, DTableH->tableLog); FSE_reloadDStream(bitD); - DStatePtr->table = base32 + 1; + DStatePtr->table = dt + 1; } BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD) @@ -1536,7 +1562,7 @@ unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) FORCE_INLINE size_t FSE_decompress_usingDTable_generic( void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, - const FSE_DTable* dt, unsigned fast) + const FSE_DTable* dt, const unsigned fast) { BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; @@ -1605,8 +1631,11 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic( size_t FSE_decompress_usingDTable(void* dst, size_t originalSize, const void* cSrc, size_t cSrcSize, - const FSE_DTable* dt, size_t fastMode) + const FSE_DTable* dt) { + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)dt; + const U32 fastMode = DTableH->fastMode; + /* select fast mode (static) */ if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); @@ -1621,7 +1650,7 @@ size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSr DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; - size_t errorCode, fastMode; + size_t errorCode; if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong; /* too small input size */ @@ -1632,11 +1661,11 @@ size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSr ip += errorCode; cSrcSize -= errorCode; - fastMode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog); - if (FSE_isError(fastMode)) return 
fastMode; + errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog); + if (FSE_isError(errorCode)) return errorCode; /* always return, even if it is an error code */ - return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt, fastMode); + return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt); } @@ -1664,41 +1693,69 @@ typedef struct nodeElt_s { BYTE nbBits; } nodeElt; - +/* HUF_writeCTable() : + return : size of saved CTable */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, U32 maxSymbolValue, U32 huffLog) { + BYTE bitsToWeight[HUF_ABSOLUTEMAX_TABLELOG + 1]; BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; U32 n; BYTE* op = (BYTE*)dst; size_t size; - // check conditions + /* check conditions */ if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE + 1) return (size_t)-FSE_ERROR_GENERIC; + /* convert to weight */ + bitsToWeight[0] = 0; + for (n=1; n<=huffLog; n++) + bitsToWeight[n] = (BYTE)(huffLog + 1 - n); for (n=0; n= 128) return (size_t)-FSE_ERROR_GENERIC; // should never happen, since maxSymbolValue <= 255 + if (size >= 128) return (size_t)-FSE_ERROR_GENERIC; /* should never happen, since maxSymbolValue <= 255 */ if ((size <= 1) || (size >= maxSymbolValue/2)) { - if (maxSymbolValue > 64) return (size_t)-FSE_ERROR_GENERIC; // special case, not implemented (not possible) - if (size==1) // RLE + if (size==1) /* RLE */ { - op[0] = (BYTE)(128 /*special case*/ + 64 /* RLE */ + (maxSymbolValue-1)); - op[1] = huffWeight[0]; - return 2; + /* only possible case : serie of 1 (because there are at least 2) */ + /* can only be 2^n or (2^n-1), otherwise not an huffman tree */ + BYTE code; + switch(maxSymbolValue) + { + case 1: code = 0; break; + case 2: code = 1; break; + case 3: code = 2; break; + case 4: code = 3; break; + case 7: code = 4; break; + case 8: code = 5; break; + case 15: code = 6; break; + case 16: code = 7; break; + case 31: code = 8; break; + case 32: code = 9; break; + case 63: code = 10; break; + case 
64: code = 11; break; + case 127: code = 12; break; + case 128: code = 13; break; + default : return (size_t)-FSE_ERROR_corruptionDetected; + } + op[0] = (BYTE)(255-13 + code); + return 1; } - // Not compressible + /* Not compressible */ + if (maxSymbolValue > (241-128)) return (size_t)-FSE_ERROR_GENERIC; /* not implemented (not possible with current format) */ + if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* not enough space within dst buffer */ op[0] = (BYTE)(128 /*special case*/ + 0 /* Not Compressible */ + (maxSymbolValue-1)); + huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause issue in final combination */ for (n=0; n= 2) @@ -1900,24 +1957,27 @@ static void HUF_encodeSymbol(FSE_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* } #define FSE_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*2+7) FSE_flushBits(stream) + if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*2+7) FSE_FLUSHBITS(stream) #define FSE_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*4+7) FSE_flushBits(stream) + if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*4+7) FSE_FLUSHBITS(stream) -static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, HUF_CElt* CTable) +size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, HUF_CElt* CTable) { const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* op = (BYTE*) ostart; + BYTE* const oend = ostart + dstSize; U16* jumpTable = (U16*) dst; size_t n, streamSize; + const unsigned fast = (dstSize >= HUF_BLOCKBOUND(srcSize)); + size_t errorCode; FSE_CStream_t bitC; /* init */ - (void)dstSize; /* objective : ensure it fits into dstBuffer (Todo) */ op += 6; /* jump Table -- could be optimized by delta / deviation */ - FSE_initCStream(&bitC, op); + errorCode = FSE_initCStream(&bitC, op, dstSize); + if (FSE_isError(errorCode)) return 0; 
n = srcSize & ~15; // mod 16 switch (srcSize & 15) @@ -1929,7 +1989,7 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr case 13: HUF_encodeSymbol(&bitC, ip[n+12], CTable); FSE_FLUSHBITS_1(&bitC); case 12: HUF_encodeSymbol(&bitC, ip[n+11], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); case 11: HUF_encodeSymbol(&bitC, ip[n+10], CTable); FSE_FLUSHBITS_1(&bitC); case 10: HUF_encodeSymbol(&bitC, ip[n+ 9], CTable); @@ -1937,7 +1997,7 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr case 9 : HUF_encodeSymbol(&bitC, ip[n+ 8], CTable); FSE_FLUSHBITS_1(&bitC); case 8 : HUF_encodeSymbol(&bitC, ip[n+ 7], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); case 7 : HUF_encodeSymbol(&bitC, ip[n+ 6], CTable); FSE_FLUSHBITS_1(&bitC); case 6 : HUF_encodeSymbol(&bitC, ip[n+ 5], CTable); @@ -1945,13 +2005,13 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr case 5 : HUF_encodeSymbol(&bitC, ip[n+ 4], CTable); FSE_FLUSHBITS_1(&bitC); case 4 : HUF_encodeSymbol(&bitC, ip[n+ 3], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); FSE_FLUSHBITS_2(&bitC); case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); FSE_FLUSHBITS_1(&bitC); case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); case 0 : default: ; } @@ -1965,13 +2025,15 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr HUF_encodeSymbol(&bitC, ip[n-12], CTable); FSE_FLUSHBITS_1(&bitC); HUF_encodeSymbol(&bitC, ip[n-16], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } streamSize = FSE_closeCStream(&bitC); + if (streamSize==0) return 0; /* not enough space within dst buffer == uncompressible */ FSE_writeLE16(jumpTable, (U16)streamSize); op += streamSize; - FSE_initCStream(&bitC, op); + errorCode = FSE_initCStream(&bitC, op, oend-op); + if (FSE_isError(errorCode)) return 0; n = 
srcSize & ~15; // mod 16 for (; n>0; n-=16) { @@ -1982,13 +2044,15 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr HUF_encodeSymbol(&bitC, ip[n-11], CTable); FSE_FLUSHBITS_1(&bitC); HUF_encodeSymbol(&bitC, ip[n-15], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } streamSize = FSE_closeCStream(&bitC); + if (streamSize==0) return 0; /* not enough space within dst buffer == uncompressible */ FSE_writeLE16(jumpTable+1, (U16)streamSize); op += streamSize; - FSE_initCStream(&bitC, op); + errorCode = FSE_initCStream(&bitC, op, oend-op); + if (FSE_isError(errorCode)) return 0; n = srcSize & ~15; // mod 16 for (; n>0; n-=16) { @@ -1999,13 +2063,15 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr HUF_encodeSymbol(&bitC, ip[n-10], CTable); FSE_FLUSHBITS_1(&bitC); HUF_encodeSymbol(&bitC, ip[n-14], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } streamSize = FSE_closeCStream(&bitC); + if (streamSize==0) return 0; /* not enough space within dst buffer == uncompressible */ FSE_writeLE16(jumpTable+2, (U16)streamSize); op += streamSize; - FSE_initCStream(&bitC, op); + errorCode = FSE_initCStream(&bitC, op, oend-op); + if (FSE_isError(errorCode)) return 0; n = srcSize & ~15; // mod 16 for (; n>0; n-=16) { @@ -2016,9 +2082,10 @@ static size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* sr HUF_encodeSymbol(&bitC, ip[n- 9], CTable); FSE_FLUSHBITS_1(&bitC); HUF_encodeSymbol(&bitC, ip[n-13], CTable); - FSE_flushBits(&bitC); + FSE_FLUSHBITS(&bitC); } streamSize = FSE_closeCStream(&bitC); + if (streamSize==0) return 0; /* not enough space within dst buffer == uncompressible */ op += streamSize; return op-ostart; @@ -2036,7 +2103,6 @@ size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize size_t errorCode; /* early out */ - if (dstSize < FSE_compressBound(srcSize)) return (size_t)-FSE_ERROR_dstSize_tooSmall; if (srcSize <= 1) return srcSize; /* 
Uncompressed or RLE */ if (!maxSymbolValue) maxSymbolValue = HUF_MAX_SYMBOL_VALUE; if (!huffLog) huffLog = HUF_DEFAULT_TABLELOG; @@ -2062,7 +2128,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize /* check compressibility */ if ((size_t)(op-ostart) >= srcSize-1) - return 0; + return op-ostart; return op-ostart; } @@ -2084,8 +2150,8 @@ typedef struct { size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) { BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; - U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1] = {0}; - U32 weightTotal = 0; + U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1]; /* large enough for values from 0 to 16 */ + U32 weightTotal; U32 maxBits; const BYTE* ip = (const BYTE*) src; size_t iSize = ip[0]; @@ -2094,58 +2160,65 @@ size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) U32 nextRankStart; HUF_DElt* const dt = (HUF_DElt*)(DTable + 1); - FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16)); // if compilation fails here, assertion is false - if (iSize >= 128) // special case + FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16)); /* if compilation fails here, assertion is false */ + //memset(huffWeight, 0, sizeof(huffWeight)); /* should not be necessary, but some analyzer complain ... 
*/ + if (iSize >= 128) /* special header */ { - if (iSize >= (128+64)) // RLE + if (iSize >= (242)) /* RLE */ { - if (srcSize < 2) return (size_t)-FSE_ERROR_srcSize_wrong; - oSize = (iSize & 63) + 1; - memset(huffWeight, ip[1], oSize); - iSize = 1; + static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 }; + oSize = l[iSize-242]; + memset(huffWeight, 1, oSize); + iSize = 0; } - else // Incompressible + else /* Incompressible */ { - oSize = (iSize & 63) + 1; + oSize = iSize - 127; iSize = ((oSize+1)/2); if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; ip += 1; for (n=0; n> 4); - huffWeight[n+1] = (ip[n/2] & 15); + huffWeight[n] = ip[n/2] >> 4; + huffWeight[n+1] = ip[n/2] & 15; } } } - else // normal case, header compressed with FSE + else /* header compressed with FSE (normal case) */ { if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong; - oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize); // max 255 values stored, last is implied + oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize); /* max 255 values decoded, last one is implied */ if (FSE_isError(oSize)) return oSize; } - - // stats on weights + /* collect weight stats */ + memset(rankVal, 0, sizeof(rankVal)); + weightTotal = 0; for (n=0; n= HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_corruptionDetected; rankVal[huffWeight[n]]++; weightTotal += (1 << huffWeight[n]) >> 1; } - // get last symbol weight(implied) + /* get last non-null symbol weight (implied, total must be 2^n) */ maxBits = FSE_highbit32(weightTotal) + 1; - if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_GENERIC; // DTable is too small + if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_tableLog_tooLarge; /* DTable is too small */ DTable[0] = (U16)maxBits; { U32 total = 1 << maxBits; U32 rest = total - weightTotal; U32 verif = 1 << FSE_highbit32(rest); - if (verif != rest) return (size_t)-FSE_ERROR_GENERIC; // last value must be a clean power of 2 - 
huffWeight[oSize] = (BYTE)(FSE_highbit32(rest) + 1); - rankVal[huffWeight[oSize]]++; + U32 lastWeight = FSE_highbit32(rest) + 1; + if (verif != rest) return (size_t)-FSE_ERROR_corruptionDetected; /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankVal[lastWeight]++; } - // Prepare ranks + /* check tree construction validity */ + if ((rankVal[1] < 2) || (rankVal[1] & 1)) return (size_t)-FSE_ERROR_corruptionDetected; /* by construction : at least 2 elts of rank 1, must be even */ + + /* Prepare ranks */ nextRankStart = 0; for (n=1; n<=maxBits; n++) { @@ -2154,12 +2227,12 @@ size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) rankVal[n] = current; } - // fill table + /* fill DTable */ for (n=0; n<=oSize; n++) { - U32 i; const U32 w = huffWeight[n]; const U32 length = (1 << w) >> 1; + U32 i; HUF_DElt D; D.byte = (BYTE)n; D.nbBits = (BYTE)(maxBits + 1 - w); for (i = rankVal[w]; i < rankVal[w] + length; i++) @@ -2170,15 +2243,16 @@ size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize) return iSize+1; } -static void HUF_decodeSymbol(BYTE* ptr, FSE_DStream_t* Dstream, const HUF_DElt* dt, U32 dtLog) + +static BYTE HUF_decodeSymbol(FSE_DStream_t* Dstream, const HUF_DElt* dt, const U32 dtLog) { - size_t val = FSE_lookBitsFast(Dstream, dtLog); - BYTE c = dt[val].byte; + const size_t val = FSE_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + const BYTE c = dt[val].byte; FSE_skipBits(Dstream, dt[val].nbBits); - *ptr = c; + return c; } -static size_t HUF_decompress_usingDTable( +static size_t HUF_decompress_usingDTable( /* -3% slower when non static */ void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const U16* DTable) @@ -2206,6 +2280,8 @@ static size_t HUF_decompress_usingDTable( const char* const start4 = start3 + length3; FSE_DStream_t bitD1, bitD2, bitD3, bitD4; + if (length1+length2+length3+6 >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong; + errorCode = FSE_initDStream(&bitD1, 
start1, length1); if (FSE_isError(errorCode)) return errorCode; errorCode = FSE_initDStream(&bitD2, start2, length2); @@ -2222,14 +2298,14 @@ static size_t HUF_decompress_usingDTable( op+=16, reloadStatus = FSE_reloadDStream(&bitD2) | FSE_reloadDStream(&bitD3) | FSE_reloadDStream(&bitD4), FSE_reloadDStream(&bitD1)) { #define HUF_DECODE_SYMBOL_0(n, Dstream) \ - HUF_decodeSymbol(op+n, &Dstream, dt, dtLog); + op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); #define HUF_DECODE_SYMBOL_1(n, Dstream) \ - HUF_decodeSymbol(op+n, &Dstream, dt, dtLog); \ + op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \ if (FSE_32bits() && (HUF_MAX_TABLELOG>12)) FSE_reloadDStream(&Dstream) #define HUF_DECODE_SYMBOL_2(n, Dstream) \ - HUF_decodeSymbol(op+n, &Dstream, dt, dtLog); \ + op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \ if (FSE_32bits()) FSE_reloadDStream(&Dstream) HUF_DECODE_SYMBOL_1( 0, bitD1); @@ -2250,7 +2326,7 @@ static size_t HUF_decompress_usingDTable( HUF_DECODE_SYMBOL_0(15, bitD4); } - if (reloadStatus!=FSE_DStream_completed) /* not complete : some bitStream might be 0 (unfinished) */ + if (reloadStatus!=FSE_DStream_completed) /* not complete : some bitStream might be FSE_DStream_unfinished */ return (size_t)-FSE_ERROR_corruptionDetected; /* tail */ @@ -2259,7 +2335,7 @@ static size_t HUF_decompress_usingDTable( FSE_DStream_t bitTail; bitTail.ptr = bitD1.ptr; bitTail.bitsConsumed = bitD1.bitsConsumed; - bitTail.bitContainer = bitD1.bitContainer; // required in case FSE_DStream_partiallyFilled + bitTail.bitContainer = bitD1.bitContainer; // required in case of FSE_DStream_endOfBuffer bitTail.start = start1; for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op Nothing is stored within cSrc !!! - if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. - if FSE_isError(return), it's an error code. + 'dst' buffer must be already allocated. 
Compression runs faster is maxDstSize >= FSE_compressBound(srcSize) + return : size of compressed data (<= maxDstSize) + Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. + if FSE_isError(return), compression failed (more details using FSE_getErrorName()) FSE_decompress(): Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', @@ -77,18 +76,18 @@ FSE_decompress(): /****************************************** * Huff0 simple functions ******************************************/ -size_t HUF_compress (void* dst, size_t dstSize, const void* src, size_t srcSize); +size_t HUF_compress(void* dst, size_t maxDstSize, + const void* src, size_t srcSize); size_t HUF_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize); /* HUF_compress(): Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. - 'dst' buffer must be already allocated, and sized to handle worst case situations. - Worst case size evaluation is provided by FSE_compressBound(). - return : size of compressed data - Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! + 'dst' buffer must be already allocated. Compression runs faster is maxDstSize >= HUF_compressBound(srcSize) + return : size of compressed data (<= maxDstSize) + Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. - if FSE_isError(return), it's an error code. + if FSE_isError(return), compression failed (more details using FSE_getErrorName()) HUF_decompress(): Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize', @@ -134,18 +133,18 @@ size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize FSE_compress() does the following: 1. 
count symbol occurrence from source[] into table count[] 2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) -3. save normalized counters to memory buffer using writeHeader() +3. save normalized counters to memory buffer using writeNCount() 4. build encoding table 'CTable' from normalized counters 5. encode the data stream using encoding table 'CTable' FSE_decompress() does the following: -1. read normalized counters with readHeader() +1. read normalized counters with readNCount() 2. build decoding table 'DTable' from normalized counters 3. decode the data stream using decoding table 'DTable' -The following API allows to trigger specific sub-functions for advanced tasks. +The following API allows targeting specific sub-functions for advanced tasks. For example, it's possible to compress several blocks using the same 'CTable', -or to save and provide normalized distribution using one's own method. +or to save and provide normalized distribution using external method. */ /* *** COMPRESSION *** */ @@ -191,8 +190,8 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized /* Constructor and Destructor of type FSE_CTable -Not that its size depends on parameters 'tableLog' and 'maxSymbolValue' */ -typedef unsigned FSE_CTable; /* don't allocate that. It's just a way to be more restrictive than void */ + Note that its size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. 
It's just a way to be more restrictive than void* */ FSE_CTable* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue); void FSE_freeCTable (FSE_CTable* ct); @@ -201,30 +200,32 @@ FSE_buildCTable(): Builds 'ct', which must be already allocated, using FSE_createCTable() return : 0 or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); /* FSE_compress_usingCTable(): Compress 'src' using 'ct' into 'dst' which must be already allocated - return : size of compressed data + return : size of compressed data (<= maxDstSize) + or 0 if compressed data could not fit into 'dst' or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const FSE_CTable* ct); +size_t FSE_compress_usingCTable (void* dst, size_t maxDstSize, const void* src, size_t srcSize, const FSE_CTable* ct); /* Tutorial : ---------- -The first step is to count all symbols. FSE_count() provides one quick way to do this job. +The first step is to count all symbols. FSE_count() does this job very fast. Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. 'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). The next step is to normalize the frequencies. 
FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. -It also guarantees a minimum of 1 to any Symbol which frequency is >= 1. -You can use input 'tableLog'==0 to mean "use default tableLog value". -If you are unsure of which tableLog value to use, you can optionally call FSE_optimalTableLog(), +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). The result of FSE_normalizeCount() will be saved into a table, @@ -232,23 +233,23 @@ called 'normalizedCounter', which is a table of signed short. 'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. The return value is tableLog if everything proceeded as expected. It is 0 if there is a single symbol within distribution. -If there is an error(typically, invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). -'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeHeader(). -'header' buffer must be already allocated. +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. For guaranteed success, buffer size must be at least FSE_headerBound(). -The result of the function is the number of bytes written into 'header'. -If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()) (for example, buffer size too small). +The result of the function is the number of bytes written into 'buffer'. 
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). 'normalizedCounter' can then be used to create the compression table 'CTable'. -The space required by 'CTable' must be already allocated. Its size is provided by FSE_sizeof_CTable(). -'CTable' must be aligned of 4 bytes boundaries. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). You can then use FSE_buildCTable() to fill 'CTable'. -In both cases, if there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). 'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' -The function returns the size of compressed data (without header). +The function returns the size of compressed data (without header), necessarily <= maxDstSize. +If it returns '0', compressed data could not fit into 'dst'. If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). */ @@ -265,26 +266,25 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, un /* Constructor and Destructor of type FSE_DTable -Note that its size depends on parameters 'tableLog' */ -typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void */ + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. 
It's just a way to be more restrictive than void* */ FSE_DTable* FSE_createDTable(unsigned tableLog); void FSE_freeDTable(FSE_DTable* dt); /* FSE_buildDTable(): Builds 'dt', which must be already allocated, using FSE_createDTable() - return : 1 if 'dt' is compatible with fast mode, 0 otherwise, + return : 0, or an errorCode, which can be tested using FSE_isError() */ size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); /* FSE_decompress_usingDTable(): - Decompress compressed source 'cSrc' of size 'cSrcSize' - using 'dt' into 'dst' which must be already allocated. - Use fastMode==1 only if authorized by result of FSE_buildDTable(). + Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt' + into 'dst' which must be already allocated. return : size of regenerated data (necessarily <= maxDstSize) or an errorCode, which can be tested using FSE_isError() */ -size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt, size_t fastMode); +size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); /* Tutorial : @@ -294,26 +294,24 @@ Tutorial : If block is a single repeated byte, use memset() instead ) The first step is to obtain the normalized frequencies of symbols. -This can be performed by reading a header with FSE_readHeader(). -'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of short. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. In practice, that means it's necessary to know 'maxSymbolValue' beforehand, or size the table to handle worst case situations (typically 256). -FSE_readHeader will provide 'tableLog' and 'maxSymbolValue' stored into the header. 
-The result of FSE_readHeader() is the number of bytes read from 'header'. -Note that 'headerSize' must be at least 4 bytes, even if useful information is less than that. +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. If there is an error, the function will return an error code, which can be tested using FSE_isError(). -The next step is to create the decompression tables 'FSE_DTable' from 'normalizedCounter'. +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. This is performed by the function FSE_buildDTable(). The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). -The function will return 1 if FSE_DTable is compatible with fastMode, 0 otherwise. If there is an error, the function will return an error code, which can be tested using FSE_isError(). 'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable(). -Only trigger fastMode if it was authorized by the result of FSE_buildDTable(), otherwise decompression will fail. -cSrcSize must be correct, otherwise decompression will fail. -FSE_decompress_usingDTable() result will tell how many bytes were regenerated. -If there is an error, the function will return an error code, which can be tested using FSE_isError(). +'cSrcSize' must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=maxDstSize). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). 
(ex: dst buffer too small) */ diff --git a/lib/fse_static.h b/lib/fse_static.h index 8deb7615..84e704c6 100644 --- a/lib/fse_static.h +++ b/lib/fse_static.h @@ -48,14 +48,21 @@ extern "C" { /****************************************** * Static allocation ******************************************/ -#define FSE_MAX_HEADERSIZE 512 -#define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE) /* Macro can be useful for static allocation */ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) (size + (size>>7)) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ -/* You can statically allocate CTable/DTable as a table of unsigned using below macro */ +/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */ #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) #define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<>8) + 8) /* only true if pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* You can statically allocate Huff0 DTable as a table of unsigned short using below macro */ #define HUF_DTABLE_SIZE_U16(maxTableLog) (1 + (1<= FSE_DStream_completed @@ -251,7 +262,7 @@ After each decoded symbol, check if DStream is fully consumed using this simple When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. Checking if DStream has reached its end is performed by : FSE_endOfDStream(&DStream); -Check also the states. There might be some entropy left there, able to decode some high probability (>50%) symbol. +Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible. 
FSE_endOfDState(&DState); */ @@ -263,7 +274,7 @@ size_t FSE_readBitsFast(FSE_DStream_t* bitD, unsigned nbBits); /* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */ unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD); -/* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */ +/* faster, but works only if allways nbBits >= 1 (otherwise, result will be corrupted) */ #if defined (__cplusplus) diff --git a/lib/zstd.c b/lib/zstd.c index 8fc3282f..afa44e33 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -536,64 +536,6 @@ static size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* sr } -/* return : size of CStream in bits */ -size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize, - const void* src, size_t srcSize, - const FSE_CTable* CTable) -{ - const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; - const BYTE* const iend = istart + srcSize; - FSE_CStream_t bitC; - FSE_CState_t CState1, CState2; - - /* init */ - (void)dstSize; // objective : ensure it fits into dstBuffer (Todo) - FSE_initCStream(&bitC, dst); - FSE_initCState(&CState1, CTable); - CState2 = CState1; - - /* Note : at this stage, srcSize > LITERALS_NOENTROPY (checked by ZSTD_compressLiterals()) */ - // join to mod 2 - if (srcSize & 1) - { - FSE_encodeSymbol(&bitC, &CState1, *ip++); - FSE_flushBits(&bitC); - } - - // join to mod 4 - if ((sizeof(size_t)*8 > LitFSELog*4+7 ) && (srcSize & 2)) // test bit 2 - { - FSE_encodeSymbol(&bitC, &CState2, *ip++); - FSE_encodeSymbol(&bitC, &CState1, *ip++); - FSE_flushBits(&bitC); - } - - // 2 or 4 encoding per loop - while (ip LitFSELog*4+7 ) // this test must be static - { - FSE_encodeSymbol(&bitC, &CState2, *ip++); - FSE_encodeSymbol(&bitC, &CState1, *ip++); - } - - FSE_flushBits(&bitC); - } - - FSE_flushCState(&bitC, &CState2); - FSE_flushCState(&bitC, &CState1); - return FSE_closeCStream(&bitC); -} - - size_t ZSTD_minGain(size_t srcSize) { 
return (srcSize >> 6) + 1; @@ -693,7 +635,6 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, const seqStore_t* seqStorePtr, size_t lastLLSize, size_t srcSize) { - FSE_CStream_t blockStream; U32 count[256]; S16 norm[256]; size_t mostFrequent; @@ -710,8 +651,9 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, const U32* op_offset = seqStorePtr->offset; const BYTE* op_matchLength = seqStorePtr->matchLength; const size_t nbSeq = op_litLength - op_litLength_start; - BYTE* op; - BYTE* offsetBits_start = seqStorePtr->offCodeStart; + BYTE* op = dst; + BYTE* const oend = dst + maxDstSize; + BYTE* const offsetBits_start = seqStorePtr->offCodeStart; BYTE* offsetBitsPtr = offsetBits_start; const size_t minGain = ZSTD_minGain(srcSize); const size_t maxCSize = srcSize - minGain; @@ -719,10 +661,8 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, const size_t maxLSize = maxCSize > minSeqSize ? maxCSize - minSeqSize : 0; BYTE* seqHead; - /* init */ - op = dst; - /* Encode literals */ + /* Compress literals */ { size_t cSize; size_t litSize = op_lit - op_lit_start; @@ -768,7 +708,7 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, op += dumpsLength; } - /* Encoding table of Literal Lengths */ + /* CTable for Literal Lengths */ max = MaxLL; mostFrequent = FSE_countFast(count, &max, seqStorePtr->litLengthStart, nbSeq); if ((mostFrequent == nbSeq) && (nbSeq > 2)) @@ -786,14 +726,14 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, { tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog); + op += FSE_writeNCount(op, oend-op, norm, max, tableLog); FSE_buildCTable(CTable_LitLength, norm, max, tableLog); LLtype = bt_compressed; } - /* Encoding table of Offsets */ + /* CTable for Offsets codes */ { - /* create OffsetBits */ + /* create Offset codes */ size_t i; const U32* const 
op_offset_start = seqStorePtr->offsetStart; max = MaxOff; @@ -820,12 +760,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, { tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog); + op += FSE_writeNCount(op, oend-op, norm, max, tableLog); FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog); Offtype = bt_compressed; } - /* Encoding Table of MatchLengths */ + /* CTable for MatchLengths */ max = MaxML; mostFrequent = FSE_countFast(count, &max, seqStorePtr->matchLengthStart, nbSeq); if ((mostFrequent == nbSeq) && (nbSeq > 2)) @@ -843,20 +783,23 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, { tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog); + op += FSE_writeNCount(op, oend-op, norm, max, tableLog); FSE_buildCTable(CTable_MatchLength, norm, max, tableLog); MLtype = bt_compressed; } seqHead[0] += (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); - /* Encoding */ + /* Encoding Sequences */ { + size_t streamSize, errorCode; + FSE_CStream_t blockStream; FSE_CState_t stateMatchLength; FSE_CState_t stateOffsetBits; FSE_CState_t stateLitLength; - FSE_initCStream(&blockStream, op); + errorCode = FSE_initCStream(&blockStream, op, oend-op); + if (FSE_isError(errorCode)) return 0; /* not enough space remaining */ FSE_initCState(&stateMatchLength, CTable_MatchLength); FSE_initCState(&stateOffsetBits, CTable_OffsetBits); FSE_initCState(&stateLitLength, CTable_LitLength); @@ -880,9 +823,11 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, FSE_flushCState(&blockStream, &stateMatchLength); FSE_flushCState(&blockStream, &stateOffsetBits); FSE_flushCState(&blockStream, &stateLitLength); - } - op += FSE_closeCStream(&blockStream); + streamSize = FSE_closeCStream(&blockStream); + if 
(streamSize==0) return 0; /* not enough space */ + op += streamSize; + } /* check compressibility */ if ((size_t)(op-dst) >= maxCSize) return 0; From 96f3b2c0f76015a3f37228370096f333d86338db Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 7 Aug 2015 15:50:42 +0100 Subject: [PATCH 06/21] zstd_compress() no longer requires maxDstSize >= ZSTD_compressBound(srcSize) --- lib/zstd.c | 61 ------------------------------------------------------ lib/zstd.h | 8 +++---- 2 files changed, 4 insertions(+), 65 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index afa44e33..181eaf8f 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -547,7 +547,6 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, { const size_t minGain = ZSTD_minGain(srcSize); -#if 1 #define LHSIZE 5 BYTE* const ostart = (BYTE*)dst; size_t hsize = HUF_compress(ostart+LHSIZE, dstSize-LHSIZE, src, srcSize); @@ -569,65 +568,6 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, hsize -= 2; return hsize+LHSIZE; - -#else - - const BYTE* const istart = (const BYTE*) src; - const BYTE* ip = istart; - - BYTE* const ostart = (BYTE*) dst; - BYTE* op = ostart + ZSTD_blockHeaderSize; - BYTE* const oend = ostart + dstSize; - - U32 maxSymbolValue = 256; - U32 tableLog = LitFSELog; - U32 count[256]; - S16 norm[256]; - U32 CTable[ FSE_CTABLE_SIZE_U32(LitFSELog, 256) ]; - size_t errorCode; - - /* early out */ - if (dstSize < FSE_compressBound(srcSize)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; - - /* Scan input and build symbol stats */ - errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize); - if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; - if (errorCode == srcSize) return 1; - if (errorCode < (srcSize >> 6)) return 0; /* cheap heuristic : probably not compressible enough */ - - tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue); - errorCode = (int)FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue); - if (FSE_isError(errorCode)) return 
(size_t)-ZSTD_ERROR_GENERIC; - - /* Write table description header */ - errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog); - if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; - op += errorCode; - - /* Compress */ - errorCode = FSE_buildCTable (CTable, norm, maxSymbolValue, tableLog); - if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; - errorCode = ZSTD_compressLiterals_usingCTable(op, oend - op, ip, srcSize, CTable); - if (ZSTD_isError(errorCode)) return errorCode; - op += errorCode; - - /* check compressibility */ - if ( (size_t)(op-ostart) >= srcSize-minGain) - return 0; - - /* Build header */ - { - size_t totalSize; - totalSize = op - ostart - ZSTD_blockHeaderSize; - ostart[0] = (BYTE)(totalSize>>16); - ostart[1] = (BYTE)(totalSize>>8); - ostart[2] = (BYTE)totalSize; - ostart[0] += (BYTE)(bt_compressed<<6); /* is a block, is compressed */ - } - - return op-ostart; - -#endif // 1 } @@ -1066,7 +1006,6 @@ size_t ZSTD_compressContinue(ZSTD_Cctx* cctx, void* dst, size_t maxDstSize, con const U32 updateRate = 2 * BLOCKSIZE; /* Init */ - if (maxDstSize < ZSTD_compressBound(srcSize) - 4 /* frame header size*/) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; if (ctx->base==NULL) ctx->base = (const BYTE*)src, ctx->current=0, ctx->nextUpdate = g_maxDistance; if (src != ctx->base + ctx->current) /* not contiguous */ diff --git a/lib/zstd.h b/lib/zstd.h index 50ee72c4..1e20b4f1 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -46,8 +46,8 @@ extern "C" { * Version **************************************/ #define ZSTD_VERSION_MAJOR 0 /* for breaking interface changes */ -#define ZSTD_VERSION_MINOR 0 /* for new (non-breaking) interface capabilities */ -#define ZSTD_VERSION_RELEASE 2 /* for tweaks, bug-fixes, or development */ +#define ZSTD_VERSION_MINOR 1 /* for new (non-breaking) interface capabilities */ +#define ZSTD_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ #define ZSTD_VERSION_NUMBER 
(ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) unsigned ZSTD_versionNumber (void); @@ -64,8 +64,8 @@ size_t ZSTD_decompress( void* dst, size_t maxOriginalSize, /* ZSTD_compress() : Compresses 'srcSize' bytes from buffer 'src' into buffer 'dst', of maximum size 'dstSize'. - Destination buffer should be sized to handle worst cases situations (input data not compressible). - Worst case size evaluation is provided by function ZSTD_compressBound(). + Destination buffer must be already allocated. + Compression runs faster if maxDstSize >= ZSTD_compressBound(srcSize). return : the number of bytes written into buffer 'dst' or an error code if it fails (which can be tested using ZSTD_isError()) From e9853b2cdb69a2e6f462947030af94a28191aa0e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 7 Aug 2015 19:07:32 +0100 Subject: [PATCH 07/21] Fixed : ZSTD_compress() can attempt compression on a too small buffer --- Makefile | 2 +- lib/fse.c | 22 +++++++++++----------- lib/zstd.c | 7 +++---- programs/Makefile | 4 ++-- programs/fuzzer.c | 18 ++++++++++++++---- 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 8049649d..6591e26d 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ # ################################################################ # Version number -export VERSION=0.0.2 +export VERSION=0.1.0 export RELEASE=r$(VERSION) DESTDIR?= diff --git a/lib/fse.c b/lib/fse.c index 63684fc0..a82baa41 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -1132,7 +1132,7 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue) size_t FSE_initCStream(FSE_CStream_t* bitC, void* start, size_t maxSize) { - if (maxSize < 8) return (size_t)-FSE_ERROR_dstSize_tooSmall; + if (maxSize < sizeof(bitC->ptr)) return (size_t)-FSE_ERROR_dstSize_tooSmall; bitC->bitContainer = 0; bitC->bitPos = 0; bitC->startPtr = (char*)start; @@ -1186,12 +1186,9 @@ void FSE_flushBits(FSE_CStream_t* bitC) size_t nbBytes = bitC->bitPos >> 3; 
FSE_writeLEST(bitC->ptr, bitC->bitContainer); bitC->ptr += nbBytes; - if (bitC->ptr <= bitC->endPtr) - { - bitC->bitPos &= 7; - bitC->bitContainer >>= nbBytes*8; - return; - } + if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; } void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* statePtr) @@ -1208,7 +1205,7 @@ size_t FSE_closeCStream(FSE_CStream_t* bitC) FSE_addBitsFast(bitC, 1, 1); FSE_flushBits(bitC); - if (bitC->bitPos > 7) /* still some data to flush => too close to buffer's end */ + if (bitC->ptr >= bitC->endPtr) /* too close to buffer's end */ return 0; /* not compressible */ endPtr = bitC->ptr; @@ -1887,7 +1884,7 @@ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U3 U16 nodeNb = STARTNODE; U32 nodeRoot; - // check + /* safety checks */ if (maxNbBits == 0) maxNbBits = HUF_DEFAULT_TABLELOG; if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC; memset(huffNode0, 0, sizeof(huffNode0)); @@ -1976,7 +1973,7 @@ size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size /* init */ op += 6; /* jump Table -- could be optimized by delta / deviation */ - errorCode = FSE_initCStream(&bitC, op, dstSize); + errorCode = FSE_initCStream(&bitC, op, oend-op); if (FSE_isError(errorCode)) return 0; n = srcSize & ~15; // mod 16 @@ -2124,7 +2121,10 @@ size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize op += errorCode; /* Compress */ - op += HUF_compress_usingCTable(op, oend - op, src, srcSize, CTable); + errorCode = HUF_compress_usingCTable(op, oend - op, src, srcSize, CTable); + if (FSE_isError(errorCode)) return errorCode; + if (errorCode==0) return 0; + op += errorCode; /* check compressibility */ if ((size_t)(op-ostart) >= srcSize-1) diff --git a/lib/zstd.c b/lib/zstd.c index 181eaf8f..35474b12 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -912,7 +912,7 @@ static size_t ZSTD_compressBlock(void* cctx, 
void* dst, size_t maxDstSize, const } -size_t ZSTD_compressBegin(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize) +size_t ZSTD_compressBegin(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize) { /* Sanity check */ if (maxDstSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; @@ -1081,7 +1081,6 @@ size_t ZSTD_compressEnd(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize) static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize) { BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; /* Header */ @@ -1094,7 +1093,7 @@ static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, co /* Compression */ { - size_t cSize = ZSTD_compressContinue(ctx, op, oend-op, src, srcSize); + size_t cSize = ZSTD_compressContinue(ctx, op, maxDstSize, src, srcSize); if (ZSTD_isError(cSize)) return cSize; op += cSize; maxDstSize -= cSize; @@ -1102,7 +1101,7 @@ static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, co /* Close frame */ { - size_t endSize = ZSTD_compressEnd(ctx, op, oend-op); + size_t endSize = ZSTD_compressEnd(ctx, op, maxDstSize); if(ZSTD_isError(endSize)) return endSize; op += endSize; } diff --git a/programs/Makefile b/programs/Makefile index 20205b90..1f7bdd5f 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -30,7 +30,7 @@ # fullbench32: Same as fullbench, but forced to compile in 32-bits mode # ########################################################################## -RELEASE?= v0.0.2 +RELEASE?= v0.1.0 DESTDIR?= PREFIX ?= /usr @@ -61,7 +61,7 @@ default: zstd all: zstd zstd32 fullbench fullbench32 fuzzer fuzzer32 datagen -zstd: $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c +zstd : $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c $(CC) $(FLAGS) $^ -o $@$(EXT) zstd32: $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c diff --git a/programs/fuzzer.c b/programs/fuzzer.c index db4bc65f..658a0cc6 100644 
--- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -323,7 +323,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit U32 result = 0; U32 testNb = 0; U32 coreSeed = seed, lseed = 0; - (void)startTest; (void)compressibility; /* allocation */ srcBuffer = (BYTE*)malloc (srcBufferSize); @@ -332,7 +331,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit CHECK (!srcBuffer || !dstBuffer || !cBuffer, "Not enough memory, fuzzer tests cancelled"); /* Create initial sample */ - FUZ_generateSynthetic(srcBuffer, srcBufferSize, 0.50, &coreSeed); + FUZ_generateSynthetic(srcBuffer, srcBufferSize, compressibility, &coreSeed); /* catch up testNb */ for (testNb=0; testNb < startTest; testNb++) @@ -356,10 +355,20 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit sampleStart = FUZ_rand(&lseed) % (srcBufferSize - sampleSize); crcOrig = XXH64(srcBuffer + sampleStart, sampleSize, 0); - /* compression tests*/ + /* compression test */ cSize = ZSTD_compress(cBuffer, cBufferSize, srcBuffer + sampleStart, sampleSize); CHECK(ZSTD_isError(cSize), "ZSTD_compress failed"); + /* compression failure test */ + { + size_t errorCode; + void* dBufferTooSmall = malloc(cSize-1); /* valgrind should catch overflows */ + if (dBufferTooSmall==NULL) { DISPLAY("not enough memory !"); exit(1); } + errorCode = ZSTD_compress(dBufferTooSmall, cSize-1, srcBuffer + sampleStart, sampleSize); + CHECK(!ZSTD_isError(errorCode), "ZSTD_compress should have failed ! (buffer too small)"); + free(dBufferTooSmall); + } + /* decompression tests*/ dSupSize = (FUZ_rand(&lseed) & 1) ? 
0 : (FUZ_rand(&lseed) & 31) + 1; dSize = ZSTD_decompress(dstBuffer, sampleSize + dSupSize, cBuffer, cSize); @@ -393,8 +402,9 @@ int FUZ_usage(char* programName) DISPLAY( " -i# : Nb of tests (default:%u) \n", nbTestsDefault); DISPLAY( " -s# : Select seed (default:prompt user)\n"); DISPLAY( " -t# : Select starting test number (default:0)\n"); - DISPLAY( " -p# : Select compressibility in %% (default:%i%%)\n", FUZ_COMPRESSIBILITY_DEFAULT); + DISPLAY( " -P# : Select compressibility in %% (default:%i%%)\n", FUZ_COMPRESSIBILITY_DEFAULT); DISPLAY( " -v : verbose\n"); + DISPLAY( " -p : pause at the end\n"); DISPLAY( " -h : display help and exit\n"); return 0; } From 56213d89f91e41411c5550429e7a56b6b2b36627 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 7 Aug 2015 20:15:27 +0100 Subject: [PATCH 08/21] updated benchmark --- README.md | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) mode change 100755 => 100644 README.md diff --git a/README.md b/README.md old mode 100755 new mode 100644 index d53bb38f..c5bc0059 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio (which LZMA and ZPAQ cover) nor extreme speeds (which LZ4 covers). + **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio nor extreme speeds.s It is provided as a BSD-license package, hosted on Github. @@ -7,40 +7,42 @@ It is provided as a BSD-license package, hosted on Github. 
|master | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=master)](https://travis-ci.org/Cyan4973/zstd) | |dev | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=dev)](https://travis-ci.org/Cyan4973/zstd) | -For a taste of its performance, here are a few benchmark numbers, completed on a Core i5-4300U @ 1.9 GHz, using [fsbench 0.14.3](http://encode.ru/threads/1371-Filesystem-benchmark?p=34029&viewfull=1#post34029), an open-source benchmark program by m^2. +For a taste of its performance, here are a few benchmark numbers, completed on a Core i7-5600U @ 2.6 GHz, using [fsbench 0.14.3](http://encode.ru/threads/1371-Filesystem-benchmark?p=34029&viewfull=1#post34029), an open-source benchmark program by m^2. -|Name | Ratio | C.speed | D.speed | -|---------------|-------|---------|---------| -| | | MB/s | MB/s | -| [zlib 1.2.8 -6](http://www.zlib.net/)| 3.099 | 18 | 275 | -| **zstd** |**2.872**|**201**|**498** | -| [zlib 1.2.8 -1](http://www.zlib.net/)| 2.730 | 58 | 250 | -| [LZ4 HC r127](https://github.com/Cyan4973/lz4)| 2.720 | 26 | 1720 | -| QuickLZ 1.5.1b6|2.237 | 323 | 373 | -| LZO 2.06 | 2.106 | 351 | 510 | -| Snappy 1.1.0 | 2.091 | 238 | 964 | -| [LZ4 r127](https://github.com/Cyan4973/lz4)| 2.084 | 370 | 1590 | -| LZF 3.6 | 2.077 | 220 | 502 | +|Name | Ratio | C.speed | D.speed | +|----------------|-------|--------:|--------:| +| | | MB/s | MB/s | +| [zlib 1.2.8] -6| 3.099 | 21 | 320 | +| **zstd** |**2.871**|**250**|**650** | +| [zlib 1.2.8] -1| 2.730 | 70 | 300 | +| [LZ4] HC r131 | 2.720 | 25 | 2100 | +| QuickLZ 1.5.1b6| 2.237 | 370 | 415 | +| LZO 2.06 | 2.106 | 400 | 580 | +| Snappy 1.1.0 | 2.091 | 330 | 1100 | +| [LZ4] r131 | 2.084 | 450 | 2100 | +| LZF 3.6 | 2.077 | 200 | 560 | + +[zlib 1.2.8]:http://www.zlib.net/ +[LZ4]:http://www.lz4.org/ An interesting feature of zstd is that it can qualify as both a reasonably strong compressor and a fast one. -Zstd delivers high decompression speed, at around ~500 MB/s per core. 
+Zstd delivers high decompression speed, at more than >600 MB/s per core. Obviously, your exact mileage will vary depending on your target system. -Zstd compression speed, on the other hand, can be configured to fit different situations. -The first, fast, derivative offers ~200 MB/s per core, which is suitable for a few real-time scenarios. -But similar to [LZ4](https://github.com/Cyan4973/lz4), zstd can offer derivatives trading compression time for compression ratio, while keeping decompression properties intact. "Offline compression", where compression time is of little importance because the content is only compressed once and decompressed many times, is therefore within the scope. +Zstd compression speed will be configurable to fit different situations. +The first version offered is the fast one, at ~250 MB/s per core, which is suitable for a few real-time scenarios. +But similar to [LZ4], zstd can offer derivatives trading compression time for compression ratio, keeping decompression properties intact. "Offline compression", where compression time is of little importance because the content is only compressed once and decompressed many times, is therefore within scope. Note that high compression derivatives still have to be developed. -It's a complex area which will certainly benefit the contributions from a few experts. +It's a complex area which will require time and benefit from contributions. Another property zstd is developed for is configurable memory requirement, with the objective to fit into low-memory configurations, or servers handling many connections in parallel. -Zstd entropy stage is provided by [FSE (Finite State Entropy)](https://github.com/Cyan4973/FiniteStateEntropy). +Zstd entropy stage is provided by [Huff0 and FSE, of Finite State Entrop library](https://github.com/Cyan4973/FiniteStateEntropy). -Zstd development is starting. So consider current results merely as early ones. 
The implementation will gradually evolve and improve overtime, especially during this first year. This is a phase which will depend a lot on user feedback, since these feedback will be key in deciding next priorities or features to add. +Zstd is still considered experimental at this stage. Specifically, it doesn't guarantee yet that its current stream/file format will remain supported in future versions of the library. Therefore, only use Zstd in environments where you can control the availability of the decompression library. "Stable" status, including official documented format format and long-term support commitment, is projected sometimes early 2016. -The "master" branch is reserved for stable release and betas. -The "dev" branch is the one where all contributions will be merged. If you plan to propose a patch, please commit into the "dev" branch. Direct commit to "master" are not permitted. -Feature branches will also exist, typically to introduce new requirements, and be temporarily available for testing before merge into "dev" branch. +### Branch Policy +The "dev" branch is the one where all contributions will be merged before reaching "master". If you plan to propose a patch, please commit into the "dev" branch or its own feature branch. Direct commit to "master" are not permitted. 
From 8a46c159c5308845e6945b779f3d2cc8038ce4b8 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 8 Aug 2015 02:16:11 +0100 Subject: [PATCH 09/21] minor compress sequence refactor --- lib/zstd.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index 35474b12..2c89c2bb 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -158,6 +158,8 @@ static const U32 g_searchStrength = 8; #define MLFSELog 10 #define LLFSELog 10 #define OffFSELog 9 +#define MAX(a,b) ((a)<(b)?(b):(a)) +#define MaxSeq MAX(MaxLL, MaxML) #define LITERAL_NOENTROPY 63 #define COMMAND_NOENTROPY 7 /* to remove */ @@ -575,26 +577,25 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, const seqStore_t* seqStorePtr, size_t lastLLSize, size_t srcSize) { - U32 count[256]; - S16 norm[256]; + U32 count[MaxSeq]; + S16 norm[MaxSeq]; size_t mostFrequent; U32 max = 255; U32 tableLog = 11; U32 CTable_LitLength [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL )]; U32 CTable_OffsetBits [FSE_CTABLE_SIZE_U32(OffFSELog,MaxOff)]; U32 CTable_MatchLength[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML )]; - U32 LLtype, Offtype, MLtype; + U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const BYTE* const op_lit_start = seqStorePtr->litStart; const BYTE* op_lit = seqStorePtr->lit; - const BYTE* const op_litLength_start = seqStorePtr->litLengthStart; + const BYTE* const llTable = seqStorePtr->litLengthStart; const BYTE* op_litLength = seqStorePtr->litLength; - const U32* op_offset = seqStorePtr->offset; - const BYTE* op_matchLength = seqStorePtr->matchLength; - const size_t nbSeq = op_litLength - op_litLength_start; + const BYTE* const mlTable = seqStorePtr->matchLengthStart; + const U32* const offsetTable = seqStorePtr->offsetStart; + BYTE* const offCodeTable = seqStorePtr->offCodeStart; BYTE* op = dst; BYTE* const oend = dst + maxDstSize; - BYTE* const offsetBits_start = seqStorePtr->offCodeStart; - BYTE* offsetBitsPtr = offsetBits_start; + 
const size_t nbSeq = op_litLength - llTable; const size_t minGain = ZSTD_minGain(srcSize); const size_t maxCSize = srcSize - minGain; const size_t minSeqSize = 1 /*lastL*/ + 2 /*dHead*/ + 2 /*dumpsIn*/ + 5 /*SeqHead*/ + 3 /*SeqIn*/ + 1 /*margin*/ + ZSTD_blockHeaderSize; @@ -622,13 +623,11 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, op += cSize; } - /* Encode Sequences */ - - /* seqHeader */ + /* Sequences Header */ op += ZSTD_writeProgressive(op, lastLLSize); seqHead = op; - /* dumps */ + /* dumps : contains too large lengths */ { size_t dumpsLength = seqStorePtr->dumps - seqStorePtr->dumpsStart; if (dumpsLength < 512) @@ -675,19 +674,17 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, { /* create Offset codes */ size_t i; - const U32* const op_offset_start = seqStorePtr->offsetStart; max = MaxOff; for (i=0; i 2)) { - *op++ = *offsetBits_start; + *op++ = *offCodeTable; FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); Offtype = bt_rle; } @@ -737,6 +734,7 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, FSE_CState_t stateMatchLength; FSE_CState_t stateOffsetBits; FSE_CState_t stateLitLength; + int i; errorCode = FSE_initCStream(&blockStream, op, oend-op); if (FSE_isError(errorCode)) return 0; /* not enough space remaining */ @@ -744,13 +742,13 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, FSE_initCState(&stateOffsetBits, CTable_OffsetBits); FSE_initCState(&stateLitLength, CTable_LitLength); - while (op_litLength > op_litLength_start) + for (i=nbSeq-1; i>=0; i--) { - BYTE matchLength = *(--op_matchLength); - U32 offset = *(--op_offset); - BYTE offCode = *(--offsetBitsPtr); /* 32b*/ /* 64b*/ + BYTE matchLength = mlTable[i]; + U32 offset = offsetTable[i]; + BYTE offCode = offCodeTable[i]; /* 32b*/ /* 64b*/ U32 nbBits = (offCode-1) * (!!offCode); - BYTE litLength = *(--op_litLength); /* (7)*/ /* (7)*/ + BYTE litLength = llTable[i]; /* (7)*/ /* (7)*/ 
FSE_encodeSymbol(&blockStream, &stateMatchLength, matchLength); /* 17 */ /* 17 */ if (ZSTD_32bits()) FSE_flushBits(&blockStream); /* 7 */ FSE_addBits(&blockStream, offset, nbBits); /* 32 */ /* 42 */ From f90314610d0fbd36785a6dfb0e7a71304c70e56e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 8 Aug 2015 02:54:26 +0100 Subject: [PATCH 10/21] Minor decompression simplification --- lib/zstd.c | 115 +++++------------------------------------------------ 1 file changed, 10 insertions(+), 105 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index 2c89c2bb..747e7325 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -1153,82 +1153,10 @@ static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const voi } -/* force inline : 'fast' really needs to be evaluated at compile time */ -FORCE_INLINE size_t ZSTD_decompressLiterals_usingDTable_generic( - void* const dst, size_t maxDstSize, - const void* src, size_t srcSize, - const FSE_DTable* DTable, U32 fast) -{ - BYTE* op = (BYTE*) dst; - BYTE* const olimit = op; - BYTE* const oend = op + maxDstSize; - FSE_DStream_t bitD; - FSE_DState_t state1, state2; - size_t errorCode; - - /* Init */ - errorCode = FSE_initDStream(&bitD, src, srcSize); - if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; - - FSE_initDState(&state1, &bitD, DTable); - FSE_initDState(&state2, &bitD, DTable); - op = oend; - - /* 2-4 symbols per loop */ - while (!FSE_reloadDStream(&bitD) && (op>olimit+3)) - { - *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); - - if (LitFSELog*2+7 > sizeof(size_t)*8) /* This test must be static */ - FSE_reloadDStream(&bitD); - - *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); - - if (LitFSELog*4+7 < sizeof(size_t)*8) /* This test must be static */ - { - *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); - *--op = fast ? 
FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); - } - } - - /* tail */ - while (1) - { - if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state1) && FSE_endOfDStream(&bitD)) ) - break; - - *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD); - - if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state2) && FSE_endOfDStream(&bitD)) ) - break; - - *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD); - } - - /* end ? */ - if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) ) - return oend-op; - - if (op==olimit) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* dst buffer is full, but cSrc unfinished */ - - return (size_t)-ZSTD_ERROR_GENERIC; -} - -size_t ZSTD_decompressLiterals_usingDTable( - void* const dst, size_t maxDstSize, - const void* src, size_t srcSize, - const FSE_DTable* DTable, U32 fast) -{ - if (fast) return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 1); - return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 0); -} - static size_t ZSTD_decompressLiterals(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize) { -#if 1 - BYTE* op = (BYTE*)dst; BYTE* const oend = op + maxDstSize; const BYTE* ip = (const BYTE*)src; @@ -1242,30 +1170,6 @@ static size_t ZSTD_decompressLiterals(void* ctx, if (FSE_isError(errorCode)) return errorCode; return litSize; - -#else - /* assumed : blockType == blockCompressed */ - const BYTE* ip = (const BYTE*)src; - short norm[256]; - FSE_DTable* DTable = (FSE_DTable*)ctx; - U32 maxSymbolValue = 255; - U32 tableLog; - U32 fastMode; - size_t errorCode; - - if (srcSize < 2) return (size_t)-ZSTD_ERROR_wrongLBlockSize; /* too small input size */ - - errorCode = FSE_readNCount (norm, &maxSymbolValue, &tableLog, ip, srcSize); - if (FSE_isError(errorCode)) return 
(size_t)-ZSTD_ERROR_GENERIC; - ip += errorCode; - srcSize -= errorCode; - - errorCode = FSE_buildDTable (DTable, norm, maxSymbolValue, tableLog); - if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; - fastMode = (U32)errorCode; - - return ZSTD_decompressLiterals_usingDTable (dst, maxDstSize, ip, srcSize, DTable, fastMode); -#endif // 1 } @@ -1428,7 +1332,7 @@ FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize /* blockType == blockCompressed, srcSize is trusted */ - /* literal sub-block */ + /* Decode literals sub-block */ errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, src, srcSize); if (ZSTD_isError(errorCode)) return errorCode; ip += errorCode; @@ -1516,15 +1420,15 @@ _another_round: /* copy Match */ { BYTE* const endMatch = op + matchLength; - size_t qutt=0; + size_t qutt=12; U64 saved[2]; + const U32 overlapRisk = (((size_t)(litPtr - endMatch)) < 12); /* save beginning of literal sequence, in case of write overlap */ - if ((size_t)(litPtr - endMatch) < 12) + if (overlapRisk) { - qutt = endMatch + 12 - litPtr; - if ((litPtr + qutt) > oend) qutt = oend-litPtr; - memcpy(saved, litPtr, qutt); + if ((endMatch + qutt) > oend) qutt = oend-endMatch; + memcpy(saved, endMatch, qutt); } if (offset < 8) @@ -1554,8 +1458,9 @@ _another_round: op = endMatch; - if ((size_t)(litPtr - endMatch) < 12) - memcpy(endMatch + (litPtr - endMatch), saved, qutt); /* required as litPtr is const ptr */ + /* restore, in case of overlap */ + if (overlapRisk) + memcpy(endMatch, saved, qutt); } } @@ -1611,7 +1516,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const errorCode = ZSTD_copyUncompressedBlock(op, oend-op, ip, blockSize); break; case bt_rle : - return (size_t)-ZSTD_ERROR_GENERIC; /* not yet handled */ + return (size_t)-ZSTD_ERROR_GENERIC; /* not yet supported */ break; case bt_end : /* end of frame */ From bd8f4e0e5cd4d8c13112ee78dfb6cb0ff7af9e80 Mon Sep 17 00:00:00 2001 From: Yann Collet 
Date: Mon, 10 Aug 2015 18:02:52 +0100 Subject: [PATCH 11/21] added : .gitignore generated binaries --- programs/.gitignore | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 programs/.gitignore diff --git a/programs/.gitignore b/programs/.gitignore new file mode 100644 index 00000000..021e8937 --- /dev/null +++ b/programs/.gitignore @@ -0,0 +1,31 @@ +# local binary (Makefile) +zstd +zstd32 +fullbench +fullbench32 +fuzzer +fuzzer32 +datagen + +# Object files +*.o +*.ko + +# Libraries +*.lib +*.a + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app + +# Visual solution files +*.suo +*.user From f4ce8913a33bc82a1bcfaafd3440b0ed69b966a6 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 11 Aug 2015 14:18:45 +0100 Subject: [PATCH 12/21] Added more strigent tests : compresson into too small buffer --- Makefile | 1 + README.md | 4 +- lib/zstd.c | 387 ++++++++++++++++++++++++++++++++++--------- lib/zstd_static.h | 6 +- programs/fullbench.c | 5 +- programs/fuzzer.c | 22 +-- 6 files changed, 328 insertions(+), 97 deletions(-) diff --git a/Makefile b/Makefile index 6591e26d..c778cb72 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,7 @@ prg-travis: @cd $(PRGDIR); $(MAKE) -e $(ZSTD_TRAVIS_CI_ENV) clangtest: clean + clang -v $(MAKE) all CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" gpptest: clean diff --git a/README.md b/README.md index c5bc0059..86d34190 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,13 @@ For a taste of its performance, here are a few benchmark numbers, completed on a |----------------|-------|--------:|--------:| | | | MB/s | MB/s | | [zlib 1.2.8] -6| 3.099 | 21 | 320 | -| **zstd** |**2.871**|**250**|**650** | +| **zstd** |**2.871**|**255**| **628** | | [zlib 1.2.8] -1| 2.730 | 70 | 300 | | [LZ4] HC r131 | 2.720 | 25 | 2100 | | QuickLZ 1.5.1b6| 2.237 | 370 | 415 | | LZO 2.06 | 2.106 | 400 | 580 | | Snappy 1.1.0 | 2.091 | 330 | 1100 | -| 
[LZ4] r131 | 2.084 | 450 | 2100 | +| [LZ4] r131 | 2.101 | 450 | 2100 | | LZF 3.6 | 2.077 | 200 | 560 | [zlib 1.2.8]:http://www.zlib.net/ diff --git a/lib/zstd.c b/lib/zstd.c index 747e7325..c0945c25 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -124,7 +124,7 @@ typedef unsigned long long U64; /******************************************************** * Constants *********************************************************/ -static const U32 ZSTD_magicNumber = 0xFD2FB51D; /* 2nd magic number (huff0) */ +static const U32 ZSTD_magicNumber = 0xFD2FB51E; /* 3rd version : seqNb header */ #define HASH_LOG (ZSTD_MEMORY_USAGE - 2) #define HASH_TABLESIZE (1 << HASH_LOG) @@ -202,6 +202,27 @@ static void ZSTD_wildcopy(void* dst, const void* src, size_t length) while (op < oend) COPY8(op, ip); } +static U16 ZSTD_readLE16(const void* memPtr) +{ + if (ZSTD_isLittleEndian()) return ZSTD_read16(memPtr); + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + ((U16)p[1]<<8)); + } +} + +static void ZSTD_writeLE16(void* memPtr, U16 val) +{ + if (ZSTD_isLittleEndian()) memcpy(memPtr, &val, sizeof(val)); + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + static U32 ZSTD_readLE32(const void* memPtr) { if (ZSTD_isLittleEndian()) @@ -244,40 +265,6 @@ static void ZSTD_writeBE32(void* memPtr, U32 value) p[3] = (BYTE)(value>>0); } -static size_t ZSTD_writeProgressive(void* ptr, size_t value) -{ - BYTE* const bStart = (BYTE* const)ptr; - BYTE* byte = bStart; - - do - { - BYTE l = value & 127; - value >>= 7; - if (value) l += 128; - *byte++ = l; - } while (value); - - return byte - bStart; -} - - -static size_t ZSTD_readProgressive(size_t* result, const void* ptr) -{ - const BYTE* const bStart = (const BYTE* const)ptr; - const BYTE* byte = bStart; - size_t r = 0; - U32 shift = 0; - - do - { - r += (*byte & 127) << shift; - shift += 7; - } while (*byte++ & 128); - - *result = r; - return byte - bStart; -} - 
/************************************** * Local structures @@ -548,10 +535,13 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, const void* src, size_t srcSize) { const size_t minGain = ZSTD_minGain(srcSize); - -#define LHSIZE 5 BYTE* const ostart = (BYTE*)dst; - size_t hsize = HUF_compress(ostart+LHSIZE, dstSize-LHSIZE, src, srcSize); + size_t hsize; + static const size_t LHSIZE = 5; + + if (dstSize < LHSIZE+1) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* not enough space for compression */ + + hsize = HUF_compress(ostart+LHSIZE, dstSize-LHSIZE, src, srcSize); if (hsize<2) return hsize; /* special cases */ if (hsize >= srcSize - minGain) return 0; @@ -575,10 +565,10 @@ static size_t ZSTD_compressLiterals (void* dst, size_t dstSize, static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, const seqStore_t* seqStorePtr, - size_t lastLLSize, size_t srcSize) + size_t srcSize) { - U32 count[MaxSeq]; - S16 norm[MaxSeq]; + U32 count[MaxSeq+1]; + S16 norm[MaxSeq+1]; size_t mostFrequent; U32 max = 255; U32 tableLog = 11; @@ -624,7 +614,9 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, } /* Sequences Header */ - op += ZSTD_writeProgressive(op, lastLLSize); + if ((oend-op) < 2+3+6) /* nbSeq + dumpsLength + 3*rleCTable*/ + return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; + ZSTD_writeLE16(op, (U16)nbSeq); op+=2; seqHead = op; /* dumps : contains too large lengths */ @@ -643,6 +635,7 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, op[2] = (BYTE)(dumpsLength); op += 3; } + if ((size_t)(oend-op) < dumpsLength+6) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; memcpy(op, seqStorePtr->dumpsStart, dumpsLength); op += dumpsLength; } @@ -663,9 +656,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, } else { + size_t NCountSize; tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, oend-op, norm, max, 
tableLog); + NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC; + op += NCountSize; FSE_buildCTable(CTable_LitLength, norm, max, tableLog); LLtype = bt_compressed; } @@ -695,9 +691,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, } else { + size_t NCountSize; tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, oend-op, norm, max, tableLog); + NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC; + op += NCountSize; FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog); Offtype = bt_compressed; } @@ -718,9 +717,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, } else { + size_t NCountSize; tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); FSE_normalizeCount(norm, tableLog, count, nbSeq, max); - op += FSE_writeNCount(op, oend-op, norm, max, tableLog); + NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC; + op += NCountSize; FSE_buildCTable(CTable_MatchLength, norm, max, tableLog); MLtype = bt_compressed; } @@ -737,12 +739,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, int i; errorCode = FSE_initCStream(&blockStream, op, oend-op); - if (FSE_isError(errorCode)) return 0; /* not enough space remaining */ + if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* not enough space remaining */ FSE_initCState(&stateMatchLength, CTable_MatchLength); FSE_initCState(&stateOffsetBits, CTable_OffsetBits); FSE_initCState(&stateLitLength, CTable_LitLength); - for (i=nbSeq-1; i>=0; i--) + for (i=(int)nbSeq-1; i>=0; i--) { BYTE matchLength = mlTable[i]; U32 offset = offsetTable[i]; @@ -763,7 
+765,7 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize, FSE_flushCState(&blockStream, &stateLitLength); streamSize = FSE_closeCStream(&blockStream); - if (streamSize==0) return 0; /* not enough space */ + if (streamSize==0) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* not enough space */ op += streamSize; } @@ -864,7 +866,6 @@ static size_t ZSTD_compressBlock(void* cctx, void* dst, size_t maxDstSize, const const BYTE* const ilimit = iend - 16; size_t prevOffset=0, offset=0; - size_t lastLLSize; /* init */ @@ -900,13 +901,15 @@ static size_t ZSTD_compressBlock(void* cctx, void* dst, size_t maxDstSize, const } /* Last Literals */ - lastLLSize = iend - anchor; - memcpy(seqStorePtr->lit, anchor, lastLLSize); - seqStorePtr->lit += lastLLSize; + { + size_t lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } /* Finale compression stage */ return ZSTD_compressSequences((BYTE*)dst, maxDstSize, - seqStorePtr, lastLLSize, srcSize); + seqStorePtr, srcSize); } @@ -1008,9 +1011,9 @@ size_t ZSTD_compressContinue(ZSTD_Cctx* cctx, void* dst, size_t maxDstSize, con ctx->base = (const BYTE*)src, ctx->current=0, ctx->nextUpdate = g_maxDistance; if (src != ctx->base + ctx->current) /* not contiguous */ { - ZSTD_resetCCtx(ctx); - ctx->base = (const BYTE*)src; - ctx->current = 0; + ZSTD_resetCCtx(ctx); + ctx->base = (const BYTE*)src; + ctx->current = 0; } ctx->current += (U32)srcSize; @@ -1020,8 +1023,11 @@ size_t ZSTD_compressContinue(ZSTD_Cctx* cctx, void* dst, size_t maxDstSize, con size_t blockSize = BLOCKSIZE; if (blockSize > srcSize) blockSize = srcSize; + if (maxDstSize < 2*ZSTD_blockHeaderSize+1) /* one RLE block + endMark */ + return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; + /* update hash table */ - if (g_maxDistance <= BLOCKSIZE) /* static test => all blocks are independent */ + if (g_maxDistance <= BLOCKSIZE) /* static test ; yes == blocks are independent */ { ZSTD_resetCCtx(ctx); 
ctx->base = ip; @@ -1120,7 +1126,6 @@ size_t ZSTD_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSi } - /************************************************************** * Decompression code **************************************************************/ @@ -1131,7 +1136,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp BYTE headerFlags; U32 cSize; - if (srcSize < 3) return (size_t)-ZSTD_ERROR_wrongSrcSize; + if (srcSize < 3) return (size_t)-ZSTD_ERROR_SrcSize; headerFlags = *in; cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16); @@ -1167,15 +1172,14 @@ static size_t ZSTD_decompressLiterals(void* ctx, (void)ctx; errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2); - if (FSE_isError(errorCode)) - return errorCode; + if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; return litSize; } size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, - const BYTE** litPtr, + const BYTE** litStart, size_t* litSize, const void* src, size_t srcSize) { const BYTE* const istart = (const BYTE* const)src; @@ -1186,25 +1190,27 @@ size_t ZSTD_decodeLiteralsBlock(void* ctx, size_t litcSize = ZSTD_getcBlockSize(src, srcSize, &litbp); if (ZSTD_isError(litcSize)) return litcSize; - if (litcSize > srcSize - ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_wrongLBlockSize; + if (litcSize > srcSize - ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize; ip += ZSTD_blockHeaderSize; switch(litbp.blockType) { - case bt_raw: *litPtr = ip; ip+= litcSize; break; + case bt_raw: *litStart = ip; ip += litcSize; *litSize = litcSize; break; case bt_rle: { size_t rleSize = litbp.origSize; memset(oend - rleSize, *ip, rleSize); - *litPtr = oend - rleSize; + *litStart = oend - rleSize; + *litSize = rleSize; ip++; break; } case bt_compressed: { - size_t litSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize); - if (ZSTD_isError(litSize)) return litSize; - *litPtr = oend - litSize; + size_t decodedLitSize = 
ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize); + if (ZSTD_isError(decodedLitSize)) return decodedLitSize; + *litStart = oend - decodedLitSize; + *litSize = decodedLitSize; ip += litcSize; break; } @@ -1216,7 +1222,7 @@ size_t ZSTD_decodeLiteralsBlock(void* ctx, } -size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, +size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize) { @@ -1228,7 +1234,7 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, size_t dumpsLength; /* SeqHead */ - ip += ZSTD_readProgressive(lastLLPtr, ip); + *nbSeq = ZSTD_readLE16(ip); ip+=2; LLtype = *ip >> 6; Offtype = (*ip >> 4) & 3; MLtype = (*ip >> 2) & 3; @@ -1309,8 +1315,228 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, } -#define ZSTD_prefetch(p) { const BYTE pByte = *(volatile const BYTE*)p; } +typedef struct { + size_t litLength; + size_t offset; + size_t matchLength; +} seq_t; +typedef struct { + FSE_DStream_t DStream; + FSE_DState_t stateLL; + FSE_DState_t stateOffb; + FSE_DState_t stateML; + seq_t seq; + size_t prevOffset; + const BYTE* dumps; +} seqState_t; + + +static void ZSTD_decodeSequence(seqState_t* seqState) +{ + size_t litLength; + size_t prevOffset; + size_t offset; + size_t matchLength; + const BYTE* dumps = seqState->dumps; + + /* Literal length */ + litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream)); + prevOffset = litLength ? 
seqState->seq.offset : seqState->prevOffset; + seqState->prevOffset = seqState->seq.offset; + if (litLength == MaxLL) + { + U32 add = *dumps++; + if (add < 255) litLength += add; + else + { + litLength = ZSTD_readLE32(dumps) & 0xFFFFFF; + dumps += 3; + } + } + + /* Offset */ + { + U32 offsetCode, nbBits; + offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream)); + if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream)); + nbBits = offsetCode - 1; + if (offsetCode==0) nbBits = 0; /* cmove */ + offset = ((size_t)1 << nbBits) + FSE_readBits(&(seqState->DStream), nbBits); + if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream)); + if (offsetCode==0) offset = prevOffset; + } + + /* MatchLength */ + matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream)); + if (matchLength == MaxML) + { + U32 add = *dumps++; + if (add < 255) matchLength += add; + else + { + matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF; /* no pb : dumps is always followed by seq tables > 1 byte */ + dumps += 3; + } + } + matchLength += MINMATCH; + + /* save result */ + seqState->seq.litLength = litLength; + seqState->seq.offset = offset; + seqState->seq.matchLength = matchLength; + seqState->dumps = dumps; +} + + +static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, BYTE* const oend) +{ + static const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ + static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ + const BYTE* const ostart = op; + + /* copy Literals */ + const BYTE* const litEnd = *litPtr + sequence.litLength; /* possible overflow at op + litLength ? */ + if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8)) + memmove(op, *litPtr, sequence.litLength); /* overwrite risk */ + else + ZSTD_wildcopy(op, *litPtr, sequence.litLength); + op += sequence.litLength; + *litPtr = litEnd; + + /* copy Match */ + { + const BYTE* match = op - sequence.offset; /* possible underflow at op - offset ? 
*/ + BYTE* const endMatch = op + sequence.matchLength; /* possible overflow at op + matchLength ? */ + size_t qutt=12; + U64 saved[2]; + const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12); + + /* save beginning of literal sequence, in case of write overlap */ + if (overlapRisk) + { + if ((endMatch + qutt) > oend) qutt = oend-endMatch; + memcpy(saved, endMatch, qutt); + } + + if (sequence.offset < 8) + { + const int dec64 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op+4, match); + match -= dec64; + } else { ZSTD_copy8(op, match); } + + if (endMatch > oend-12) + { + if (op < oend-16) + { + ZSTD_wildcopy(op+8, match+8, (oend-8) - (op+8)); + match += (oend-8) - op; + op = oend-8; + } + while (op0) ; ) + { + nbSeq--; + ZSTD_decodeSequence(&seqState); + op += ZSTD_execSequence(op, seqState.seq, &litPtr, oend); + } + + /* check if reached exact end */ + if (FSE_reloadDStream(&(seqState.DStream)) > FSE_DStream_completed) return (size_t)-ZSTD_ERROR_corruption; /* requested too much : data is corrupted */ + if (nbSeq<0) return (size_t)-ZSTD_ERROR_corruption; /* requested too many sequences : data is corrupted */ + + /* last literal segment */ + { + size_t lastLLSize = litEnd - litPtr; + if (op != litPtr) memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + + +static size_t ZSTD_decompressBlock( + void* ctx, + void* dst, size_t maxDstSize, + const void* src, size_t srcSize) +{ + /* blockType == blockCompressed, srcSize is trusted */ + const BYTE* ip = (const BYTE*)src; + const BYTE* litPtr; + size_t litSize; + size_t errorCode; + + /* Decode literals sub-block */ + errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, &litSize, src, srcSize); + if (ZSTD_isError(errorCode)) return errorCode; + ip += errorCode; + srcSize -= errorCode; + + return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, 
srcSize, litPtr, litSize); +} + + +#if 0 FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize) { @@ -1349,7 +1575,7 @@ FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize litEnd = ip - lastLLSize; ip += errorCode; - /* LZ decompression */ + /* LZ Sequences */ { FSE_DStream_t DStream; FSE_DState_t stateLL, stateOffb, stateML; @@ -1476,6 +1702,7 @@ _another_round: return op-ostart; } +#endif static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize) @@ -1490,12 +1717,13 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const size_t errorCode=0; blockProperties_t blockProperties; - /* Header */ - if (srcSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_wrongSrcSize; + /* Frame Header */ + if (srcSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize; magicNumber = ZSTD_readBE32(src); - if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber; + if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_MagicNumber; ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize; + /* Loop on each block */ while (1) { size_t blockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties); @@ -1504,8 +1732,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const ip += ZSTD_blockHeaderSize; remainingSize -= ZSTD_blockHeaderSize; - if (blockSize > remainingSize) - return (size_t)-ZSTD_ERROR_wrongSrcSize; + if (blockSize > remainingSize) return (size_t)-ZSTD_ERROR_SrcSize; switch(blockProperties.blockType) { @@ -1520,7 +1747,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const break; case bt_end : /* end of frame */ - if (remainingSize) return (size_t)-ZSTD_ERROR_wrongSrcSize; + if (remainingSize) return (size_t)-ZSTD_ERROR_SrcSize; break; default: return (size_t)-ZSTD_ERROR_GENERIC; @@ -1583,14 +1810,14 
@@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co dctx_t* ctx = (dctx_t*)dctx; /* Sanity check */ - if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_wrongSrcSize; + if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_SrcSize; /* Decompress : frame header */ if (ctx->phase == 0) { /* Check frame magic header */ U32 magicNumber = ZSTD_readBE32(src); - if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber; + if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_MagicNumber; ctx->phase = 1; ctx->expected = ZSTD_blockHeaderSize; return 0; diff --git a/lib/zstd_static.h b/lib/zstd_static.h index a059288f..1baa47d3 100644 --- a/lib/zstd_static.h +++ b/lib/zstd_static.h @@ -74,9 +74,9 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co **************************************/ #define ZSTD_LIST_ERRORS(ITEM) \ ITEM(ZSTD_OK_NoError) ITEM(ZSTD_ERROR_GENERIC) \ - ITEM(ZSTD_ERROR_wrongMagicNumber) \ - ITEM(ZSTD_ERROR_wrongSrcSize) ITEM(ZSTD_ERROR_maxDstSize_tooSmall) \ - ITEM(ZSTD_ERROR_wrongLBlockSize) \ + ITEM(ZSTD_ERROR_MagicNumber) \ + ITEM(ZSTD_ERROR_SrcSize) ITEM(ZSTD_ERROR_maxDstSize_tooSmall) \ + ITEM(ZSTD_ERROR_corruption) \ ITEM(ZSTD_ERROR_maxCode) #define ZSTD_GENERATE_ENUM(ENUM) ENUM, diff --git a/programs/fullbench.c b/programs/fullbench.c index b7ecc8db..668e2990 100644 --- a/programs/fullbench.c +++ b/programs/fullbench.c @@ -229,7 +229,6 @@ typedef struct static size_t g_cSize = 0; extern size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr); -extern size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, const BYTE** litPtr, const void* src, size_t srcSize); extern size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize); @@ -245,12 +244,14 @@ size_t local_ZSTD_decompress(void* dst, size_t 
dstSize, void* buff2, const void* return ZSTD_decompress(dst, dstSize, buff2, g_cSize); } +extern size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, const BYTE** litStart, size_t* litSize, const void* src, size_t srcSize); size_t local_ZSTD_decodeLiteralsBlock(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize) { U32 ctx[1<<12]; const BYTE* ll; + size_t llSize; (void)src; (void)srcSize; - ZSTD_decodeLiteralsBlock(ctx, dst, dstSize, &ll, buff2, g_cSize); + ZSTD_decodeLiteralsBlock(ctx, dst, dstSize, &ll, &llSize, buff2, g_cSize); return (const BYTE*)dst + dstSize - ll; } diff --git a/programs/fuzzer.c b/programs/fuzzer.c index 658a0cc6..c8629255 100644 --- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -239,27 +239,27 @@ static int basicUnitTests(U32 seed, double compressibility) DISPLAYLEVEL(4, "test%3i : decompress with 1 missing byte : ", testNb++); result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize-1); if (!ZSTD_isError(result)) goto _output_error; - if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error; + if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error; DISPLAYLEVEL(4, "OK \n"); DISPLAYLEVEL(4, "test%3i : decompress with 1 too much byte : ", testNb++); result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize+1); if (!ZSTD_isError(result)) goto _output_error; - if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error; + if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error; DISPLAYLEVEL(4, "OK \n"); /* Decompression defense tests */ DISPLAYLEVEL(4, "test%3i : Check input length for magic number : ", testNb++); result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, CNBuffer, 3); if (!ZSTD_isError(result)) goto _output_error; - if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error; + if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error; DISPLAYLEVEL(4, "OK \n"); 
DISPLAYLEVEL(4, "test%3i : Check magic Number : ", testNb++); ((char*)(CNBuffer))[0] = 1; result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, CNBuffer, 4); if (!ZSTD_isError(result)) goto _output_error; - if (result != (size_t)-ZSTD_ERROR_wrongMagicNumber) goto _output_error; + if (result != (size_t)-ZSTD_ERROR_MagicNumber) goto _output_error; DISPLAYLEVEL(4, "OK \n"); /* long rle test */ @@ -334,11 +334,11 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit FUZ_generateSynthetic(srcBuffer, srcBufferSize, compressibility, &coreSeed); /* catch up testNb */ - for (testNb=0; testNb < startTest; testNb++) + for (testNb=1; testNb <= startTest; testNb++) FUZ_rand(&coreSeed); /* test loop */ - for (testNb=startTest; testNb < nbTests; testNb++) + for (testNb=startTest; testNb <= nbTests; testNb++) { size_t sampleSize, sampleStart; size_t cSize, dSize, dSupSize; @@ -350,7 +350,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit FUZ_rand(&coreSeed); lseed = coreSeed ^ prime1; sampleSizeLog = FUZ_rand(&lseed) % maxSampleLog; - sampleSize = (size_t)1< 4 (frameHeaderSizer) */ + const size_t tooSmallSize = cSize - missing; + void* dBufferTooSmall = malloc(tooSmallSize); /* valgrind will catch overflows */ + CHECK(dBufferTooSmall == NULL, "not enough memory !"); + errorCode = ZSTD_compress(dBufferTooSmall, tooSmallSize, srcBuffer + sampleStart, sampleSize); CHECK(!ZSTD_isError(errorCode), "ZSTD_compress should have failed ! 
(buffer too small)"); free(dBufferTooSmall); } From 23743530e042e2f4bb4177d571294962cd397e44 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 19 Aug 2015 23:53:56 +0100 Subject: [PATCH 13/21] Updated fse --- lib/fse.c | 148 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 110 insertions(+), 38 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index a82baa41..d7dca9ba 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -127,6 +127,29 @@ typedef signed long long S64; /**************************************************************** * Memory I/O *****************************************************************/ +/* FSE_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which generate assembly depending on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef FSE_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define FSE_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define FSE_FORCE_MEMORY_ACCESS 1 +# endif +#endif + + static unsigned FSE_32bits(void) { return sizeof(void*)==4; @@ -138,13 +161,64 @@ static unsigned FSE_isLittleEndian(void) return one.c[0]; } +#if defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==2) + +static U16 FSE_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 FSE_read32(const void* memPtr) { return *(const U32*) memPtr; } +static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; } + +static void FSE_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void FSE_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } +static void FSE_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } + +#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign; + +static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } + +static void FSE_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void FSE_write32(void* 
memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } +static void FSE_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; } + +#else + static U16 FSE_read16(const void* memPtr) { - U16 val; - memcpy(&val, memPtr, sizeof(val)); - return val; + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; } +static U32 FSE_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U64 FSE_read64(const void* memPtr) +{ + U64 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void FSE_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void FSE_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void FSE_write64(void* memPtr, U64 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif // FSE_FORCE_MEMORY_ACCESS + static U16 FSE_readLE16(const void* memPtr) { if (FSE_isLittleEndian()) @@ -160,7 +234,7 @@ static void FSE_writeLE16(void* memPtr, U16 val) { if (FSE_isLittleEndian()) { - memcpy(memPtr, &val, sizeof(val)); + FSE_write16(memPtr, val); } else { @@ -170,13 +244,6 @@ static void FSE_writeLE16(void* memPtr, U16 val) } } -static U32 FSE_read32(const void* memPtr) -{ - U32 val32; - memcpy(&val32, memPtr, 4); - return val32; -} - static U32 FSE_readLE32(const void* memPtr) { if (FSE_isLittleEndian()) @@ -192,7 +259,7 @@ static void FSE_writeLE32(void* memPtr, U32 val32) { if (FSE_isLittleEndian()) { - memcpy(memPtr, &val32, 4); + FSE_write32(memPtr, val32); } else { @@ -204,13 +271,6 @@ static void FSE_writeLE32(void* memPtr, U32 val32) } } -static U64 FSE_read64(const void* memPtr) -{ - U64 val64; - memcpy(&val64, memPtr, 8); - return val64; -} - static U64 FSE_readLE64(const void* memPtr) { if (FSE_isLittleEndian()) @@ -227,7 +287,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64) { if (FSE_isLittleEndian()) { - memcpy(memPtr, &val64, 8); + FSE_write64(memPtr, val64); } else { @@ -643,13 +703,13 @@ static 
short FSE_abs(short a) ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1; - return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; + size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1 + 1; /* last +1 : written by U16 */ + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, - unsigned safeWrite) + unsigned writeIsSafe) { BYTE* const ostart = (BYTE*) header; BYTE* out = ostart; @@ -684,7 +744,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, { start+=24; bitStream += 0xFFFFU << bitCount; - if ((!safeWrite) && (out > oend-2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend-2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE) bitStream; out[1] = (BYTE)(bitStream>>8); out+=2; @@ -700,7 +760,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, bitCount += 2; if (bitCount>16) { - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; @@ -723,7 +783,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, } if (bitCount>16) { - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out += 2; @@ -733,7 
+793,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, } /* flush remaining bitStream */ - if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ + if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall; /* Buffer overflow */ out[0] = (BYTE)bitStream; out[1] = (BYTE)(bitStream>>8); out+= (bitCount+7) /8; @@ -789,8 +849,16 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t while ((bitStream & 0xFFFF) == 0xFFFF) { n0+=24; - ip+=2; - bitStream = FSE_readLE32(ip) >> bitCount; + if (ip < iend-5) + { + ip+=2; + bitStream = FSE_readLE32(ip) >> bitCount; + } + else + { + bitStream >>= 16; + bitCount+=16; + } } while ((bitStream & 3) == 3) { @@ -802,9 +870,14 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t bitCount += 2; if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall; while (charnum < n0) normalizedCounter[charnum++] = 0; - ip += bitCount>>3; - bitCount &= 7; - bitStream = FSE_readLE32(ip) >> bitCount; + if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) + { + ip += bitCount>>3; + bitCount &= 7; + bitStream = FSE_readLE32(ip) >> bitCount; + } + else + bitStream >>= 2; } { const short max = (short)((2*threshold-1)-remaining); @@ -833,16 +906,15 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t } { - const BYTE* itarget = ip + (bitCount>>3); - if (itarget > iend - 4) + if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { - ip = iend - 4; - bitCount -= (int)(8 * (iend - 4 - ip)); + ip += bitCount>>3; + bitCount &= 7; } else { - ip = itarget; - bitCount &= 7; + ip = iend - 4; + bitCount -= (int)(8 * (iend - 4 - ip)); } bitStream = FSE_readLE32(ip) >> (bitCount & 31); } From f3cb79b58f83ad3531561806d63b4add61475205 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 20 Aug 2015 00:02:43 +0100 Subject: [PATCH 14/21] Fixed : g++ link error 
within fullbench --- programs/fullbench.c | 7 +++---- programs/fuzzer.c | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/programs/fullbench.c b/programs/fullbench.c index 668e2990..a7dbac3b 100644 --- a/programs/fullbench.c +++ b/programs/fullbench.c @@ -229,8 +229,7 @@ typedef struct static size_t g_cSize = 0; extern size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr); -extern size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize); - +extern size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize); size_t local_ZSTD_compress(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize) { @@ -259,9 +258,9 @@ size_t local_ZSTD_decodeSeqHeaders(void* dst, size_t dstSize, void* buff2, const { U32 DTableML[1<<11], DTableLL[1<<10], DTableOffb[1<<9]; const BYTE* dumps; - size_t lastllSize; + int nbSeq; (void)src; (void)srcSize; (void)dst; (void)dstSize; - return ZSTD_decodeSeqHeaders(&lastllSize, &dumps, DTableLL, DTableML, DTableOffb, buff2, g_cSize); + return ZSTD_decodeSeqHeaders(&nbSeq, &dumps, DTableLL, DTableML, DTableOffb, buff2, g_cSize); } size_t local_conditionalNull(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize) diff --git a/programs/fuzzer.c b/programs/fuzzer.c index c8629255..ba31a3d1 100644 --- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -259,7 +259,6 @@ static int basicUnitTests(U32 seed, double compressibility) ((char*)(CNBuffer))[0] = 1; result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, CNBuffer, 4); if (!ZSTD_isError(result)) goto _output_error; - if (result != (size_t)-ZSTD_ERROR_MagicNumber) goto _output_error; DISPLAYLEVEL(4, "OK \n"); /* long rle test */ @@ -334,11 +333,11 @@ int 
fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit FUZ_generateSynthetic(srcBuffer, srcBufferSize, compressibility, &coreSeed); /* catch up testNb */ - for (testNb=1; testNb <= startTest; testNb++) + for (testNb=1; testNb < startTest; testNb++) FUZ_rand(&coreSeed); /* test loop */ - for (testNb=startTest; testNb <= nbTests; testNb++) + for ( ; testNb <= nbTests; testNb++ ) { size_t sampleSize, sampleStart; size_t cSize, dSize, dSupSize; @@ -359,7 +358,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit cSize = ZSTD_compress(cBuffer, cBufferSize, srcBuffer + sampleStart, sampleSize); CHECK(ZSTD_isError(cSize), "ZSTD_compress failed"); - /* compression failure test */ + /* compression failure test : too small dest buffer */ { size_t errorCode; const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ @@ -371,12 +370,38 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit free(dBufferTooSmall); } - /* decompression tests*/ + /* successfull decompression tests*/ dSupSize = (FUZ_rand(&lseed) & 1) ? 
0 : (FUZ_rand(&lseed) & 31) + 1; dSize = ZSTD_decompress(dstBuffer, sampleSize + dSupSize, cBuffer, cSize); CHECK(dSize != sampleSize, "ZSTD_decompress failed (%s)", ZSTD_getErrorName(dSize)); crcDest = XXH64(dstBuffer, sampleSize, 0); CHECK(crcOrig != crcDest, "dstBuffer corrupted (pos %u / %u)", (U32)findDiff(srcBuffer+sampleStart, dstBuffer, sampleSize), (U32)sampleSize); + + /* truncated src decompression test */ + { + size_t errorCode; + const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ + const size_t tooSmallSize = cSize - missing; + void* cBufferTooSmall = malloc(tooSmallSize); /* valgrind will catch overflows */ + memcpy(cBufferTooSmall, cBuffer, tooSmallSize); + CHECK(cBufferTooSmall == NULL, "not enough memory !"); + errorCode = ZSTD_decompress(dstBuffer, dstBufferSize, cBufferTooSmall, tooSmallSize); + CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed ! (truncated src buffer)"); + free(cBufferTooSmall); + } + + /* too small dst decompression test */ + if (sampleSize > 3) + { + size_t errorCode; + const size_t missing = (FUZ_rand(&lseed) % (sampleSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ + const size_t tooSmallSize = sampleSize - missing; + static const BYTE token = 0xA9; + dstBuffer[tooSmallSize] = token; + errorCode = ZSTD_decompress(dstBuffer, tooSmallSize, cBuffer, cSize); + CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed : %u > %u (dst buffer too small)", (U32)errorCode, (U32)tooSmallSize); + CHECK(dstBuffer[tooSmallSize] != token, "ZSTD_decompress : dst buffer overflow"); + } } DISPLAY("\rAll fuzzer tests completed \n"); From 602834f7943d2777baa6ba563a16fefbc44ca778 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 20 Aug 2015 07:46:10 +0100 Subject: [PATCH 15/21] Fixed : bug in compression in specific conditions (too small dst size) --- lib/fse.c | 1 + lib/zstd.c | 59 +++++++++++++++++++++++++++++++---------------- 
programs/fuzzer.c | 11 +++++---- 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index d7dca9ba..b1a36574 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -2044,6 +2044,7 @@ size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size FSE_CStream_t bitC; /* init */ + if (dstSize < 8) return 0; /* need a minimum for jumpTable and first symbols */ op += 6; /* jump Table -- could be optimized by delta / deviation */ errorCode = FSE_initCStream(&bitC, op, oend-op); if (FSE_isError(errorCode)) return 0; diff --git a/lib/zstd.c b/lib/zstd.c index c0945c25..4bfa8e64 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -322,6 +322,11 @@ ZSTD_Cctx* ZSTD_createCCtx(void) ZSTD_Cctx* ctx = (ZSTD_Cctx*) malloc( sizeof(ZSTD_Cctx) ); if (ctx==NULL) return NULL; ctx->seqStore.buffer = malloc(WORKPLACESIZE); + if (ctx->seqStore.buffer==NULL) + { + free(ctx); + return NULL; + } ctx->seqStore.offsetStart = (U32*) (ctx->seqStore.buffer); ctx->seqStore.offCodeStart = (BYTE*) (ctx->seqStore.offsetStart + (BLOCKSIZE>>2)); ctx->seqStore.litStart = ctx->seqStore.offCodeStart + (BLOCKSIZE>>2); @@ -1120,6 +1125,7 @@ size_t ZSTD_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSi size_t r; ctx = ZSTD_createCCtx(); + if (ctx==NULL) return (size_t)-ZSTD_ERROR_GENERIC; r = ZSTD_compressCCtx(ctx, dst, maxDstSize, src, srcSize); ZSTD_freeCCtx(ctx); return r; @@ -1171,6 +1177,7 @@ static size_t ZSTD_decompressLiterals(void* ctx, op = oend - litSize; (void)ctx; + if (litSize > maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2); if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC; return litSize; @@ -1195,10 +1202,15 @@ size_t ZSTD_decodeLiteralsBlock(void* ctx, switch(litbp.blockType) { - case bt_raw: *litStart = ip; ip += litcSize; *litSize = litcSize; break; + case bt_raw: + *litStart = ip; + ip += litcSize; + *litSize = litcSize; + break; case bt_rle: { size_t 
rleSize = litbp.origSize; + if (rleSize>maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; memset(oend - rleSize, *ip, rleSize); *litStart = oend - rleSize; *litSize = rleSize; @@ -1326,13 +1338,12 @@ typedef struct { FSE_DState_t stateLL; FSE_DState_t stateOffb; FSE_DState_t stateML; - seq_t seq; size_t prevOffset; const BYTE* dumps; } seqState_t; -static void ZSTD_decodeSequence(seqState_t* seqState) +static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState) { size_t litLength; size_t prevOffset; @@ -1342,8 +1353,8 @@ static void ZSTD_decodeSequence(seqState_t* seqState) /* Literal length */ litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream)); - prevOffset = litLength ? seqState->seq.offset : seqState->prevOffset; - seqState->prevOffset = seqState->seq.offset; + prevOffset = litLength ? seq->offset : seqState->prevOffset; + seqState->prevOffset = seq->offset; if (litLength == MaxLL) { U32 add = *dumps++; @@ -1382,9 +1393,9 @@ static void ZSTD_decodeSequence(seqState_t* seqState) matchLength += MINMATCH; /* save result */ - seqState->seq.litLength = litLength; - seqState->seq.offset = offset; - seqState->seq.matchLength = matchLength; + seq->litLength = litLength; + seq->offset = offset; + seq->matchLength = matchLength; seqState->dumps = dumps; } @@ -1394,20 +1405,24 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B static const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ const BYTE* const ostart = op; + size_t litLength = sequence.litLength; + BYTE* const endMatch = op + litLength + sequence.matchLength; /* risk : address space overflow (32-bits) */ + const BYTE* const litEnd = *litPtr + litLength; + + /* check */ + if (endMatch > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* copy Literals */ - const BYTE* const litEnd = *litPtr + sequence.litLength; /* possible overflow at op + litLength ? 
*/ - if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8)) - memmove(op, *litPtr, sequence.litLength); /* overwrite risk */ + if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8)) + memmove(op, *litPtr, litLength); /* overwrite risk */ else - ZSTD_wildcopy(op, *litPtr, sequence.litLength); - op += sequence.litLength; + ZSTD_wildcopy(op, *litPtr, litLength); + op += litLength; *litPtr = litEnd; /* copy Match */ { const BYTE* match = op - sequence.offset; /* possible underflow at op - offset ? */ - BYTE* const endMatch = op + sequence.matchLength; /* possible overflow at op + matchLength ? */ size_t qutt=12; U64 saved[2]; const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12); @@ -1483,9 +1498,10 @@ static size_t ZSTD_decompressSequences( /* Regen sequences */ { + seq_t sequence; seqState_t seqState; - memset(&seqState, 0, sizeof(seqState)); + memset(&sequence, 0, sizeof(sequence)); seqState.dumps = dumps; FSE_initDStream(&(seqState.DStream), ip, iend-ip); FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL); @@ -1494,9 +1510,12 @@ static size_t ZSTD_decompressSequences( for ( ; (FSE_reloadDStream(&(seqState.DStream)) < FSE_DStream_completed) || (nbSeq>0) ; ) { + size_t oneSeqSize; nbSeq--; - ZSTD_decodeSequence(&seqState); - op += ZSTD_execSequence(op, seqState.seq, &litPtr, oend); + ZSTD_decodeSequence(&sequence, &seqState); + oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, oend); + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; } /* check if reached exact end */ @@ -1506,6 +1525,7 @@ static size_t ZSTD_decompressSequences( /* last literal segment */ { size_t lastLLSize = litEnd - litPtr; + if (op+lastLLSize > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; if (op != litPtr) memmove(op, litPtr, lastLLSize); op += lastLLSize; } @@ -1718,7 +1738,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const blockProperties_t blockProperties; 
/* Frame Header */ - if (srcSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize; + if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize; magicNumber = ZSTD_readBE32(src); if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_MagicNumber; ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize; @@ -1727,8 +1747,7 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const while (1) { size_t blockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties); - if (ZSTD_isError(blockSize)) - return blockSize; + if (ZSTD_isError(blockSize)) return blockSize; ip += ZSTD_blockHeaderSize; remainingSize -= ZSTD_blockHeaderSize; diff --git a/programs/fuzzer.c b/programs/fuzzer.c index ba31a3d1..6416536f 100644 --- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -359,15 +359,18 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit CHECK(ZSTD_isError(cSize), "ZSTD_compress failed"); /* compression failure test : too small dest buffer */ + if (cSize > 3) { size_t errorCode; const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ const size_t tooSmallSize = cSize - missing; - void* dBufferTooSmall = malloc(tooSmallSize); /* valgrind will catch overflows */ - CHECK(dBufferTooSmall == NULL, "not enough memory !"); - errorCode = ZSTD_compress(dBufferTooSmall, tooSmallSize, srcBuffer + sampleStart, sampleSize); + static const U32 endMark = 0x4DC2B1A9; + U32 endCheck; + memcpy(dstBuffer+tooSmallSize, &endMark, 4); + errorCode = ZSTD_compress(dstBuffer, tooSmallSize, srcBuffer + sampleStart, sampleSize); CHECK(!ZSTD_isError(errorCode), "ZSTD_compress should have failed ! 
(buffer too small)"); - free(dBufferTooSmall); + memcpy(&endCheck, dstBuffer+tooSmallSize, 4); + CHECK(endCheck != endMark, "ZSTD_compress : dst buffer overflow"); } /* successfull decompression tests*/ From 7d20acd340b85f6553376116d7ae793915c7f321 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 20 Aug 2015 15:55:50 +0100 Subject: [PATCH 16/21] Fix : decoder issue in exceptionnal circumstances (dst buffer too small) --- lib/zstd.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index 4bfa8e64..ad03e553 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -1405,7 +1405,7 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B static const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ const BYTE* const ostart = op; - size_t litLength = sequence.litLength; + const size_t litLength = sequence.litLength; BYTE* const endMatch = op + litLength + sequence.matchLength; /* risk : address space overflow (32-bits) */ const BYTE* const litEnd = *litPtr + litLength; @@ -1418,14 +1418,17 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B else ZSTD_wildcopy(op, *litPtr, litLength); op += litLength; - *litPtr = litEnd; + *litPtr = litEnd; /* update for next sequence */ - /* copy Match */ + /* check : last match must be at a minimum distance of 8 from end of dest buffer */ + if (oend-op < 8) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; + + /* copy Match */ { - const BYTE* match = op - sequence.offset; /* possible underflow at op - offset ? */ - size_t qutt=12; - U64 saved[2]; const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12); + const BYTE* match = op - sequence.offset; /* possible underflow at op - offset ? 
*/ + size_t qutt = 12; + U64 saved[2]; /* save beginning of literal sequence, in case of write overlap */ if (overlapRisk) @@ -1445,27 +1448,26 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B ZSTD_copy4(op+4, match); match -= dec64; } else { ZSTD_copy8(op, match); } + op += 8; match += 8; if (endMatch > oend-12) { - if (op < oend-16) + if (op < oend-8) { - ZSTD_wildcopy(op+8, match+8, (oend-8) - (op+8)); + ZSTD_wildcopy(op, match, (oend-8) - op); match += (oend-8) - op; op = oend-8; } while (op Date: Fri, 21 Aug 2015 02:44:20 +0100 Subject: [PATCH 17/21] Added : fuzzer tests : decompressing noisy src --- lib/fse.c | 35 ++++--- lib/zstd.c | 230 +++++++++------------------------------------- programs/fuzzer.c | 48 +++++++++- 3 files changed, 109 insertions(+), 204 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index b1a36574..a577e81e 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -703,7 +703,7 @@ static short FSE_abs(short a) ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1 + 1; /* last +1 : written by U16 */ + size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? 
use default */ } @@ -804,15 +804,15 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, } -size_t FSE_writeNCount (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported */ if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported */ - if (headerBufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) - return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); + if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); - return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 1); + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1); } @@ -913,9 +913,9 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t } else { - ip = iend - 4; bitCount -= (int)(8 * (iend - 4 - ip)); - } + ip = iend - 4; + } bitStream = FSE_readLE32(ip) >> (bitCount & 31); } } @@ -967,9 +967,12 @@ void FSE_freeCTable (FSE_CTable* ct) unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) { U32 tableLog = maxTableLog; + U32 minBitsSrc = FSE_highbit32((U32)(srcSize - 1)) + 1; + U32 minBitsSymbols = FSE_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; - if ((FSE_highbit32((U32)(srcSize - 1)) - 2) < tableLog) tableLog = FSE_highbit32((U32)(srcSize - 1)) - 2; /* Accuracy can be reduced */ - if ((FSE_highbit32(maxSymbolValue)+2) > tableLog) tableLog = FSE_highbit32(maxSymbolValue)+2; /* Need a minimum to safely represent all symbol values */ + if (minBitsSrc < tableLog + 3) tableLog = minBitsSrc-3; /* Accuracy can be reduced */ + if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG; if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG; return tableLog; @@ -1076,7 +1079,7 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported size */ if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported size */ - if ((1U<= unsigned FSE_reloadDStream(FSE_DStream_t* bitD) { + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* should never happen */ + return FSE_DStream_tooFar; + if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { bitD->ptr -= bitD->bitsConsumed >> 3; @@ -1565,20 +1571,19 @@ unsigned FSE_reloadDStream(FSE_DStream_t* bitD) if (bitD->ptr == bitD->start) { if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_endOfBuffer; - if (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8) return FSE_DStream_completed; - return FSE_DStream_tooFar; + return FSE_DStream_completed; } { U32 nbBytes = bitD->bitsConsumed >> 3; U32 result = FSE_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { - nbBytes = (U32)(bitD->ptr - bitD->start); /* note : necessarily ptr > start */ + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ result = FSE_DStream_endOfBuffer; } bitD->ptr -= nbBytes; bitD->bitsConsumed -= 
nbBytes*8; - bitD->bitContainer = FSE_readLEST(bitD->ptr); /* note : necessarily srcSize > sizeof(bitD) */ + bitD->bitContainer = FSE_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */ return result; } } @@ -2044,7 +2049,7 @@ size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size FSE_CStream_t bitC; /* init */ - if (dstSize < 8) return 0; /* need a minimum for jumpTable and first symbols */ + if (dstSize < 8) return 0; op += 6; /* jump Table -- could be optimized by delta / deviation */ errorCode = FSE_initCStream(&bitC, op, oend-op); if (FSE_isError(errorCode)) return 0; diff --git a/lib/zstd.c b/lib/zstd.c index ad03e553..5414a56c 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -1245,6 +1245,9 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, U32 LLlog, Offlog, MLlog; size_t dumpsLength; + /* check */ + if (srcSize < 5) return (size_t)-ZSTD_ERROR_SrcSize; + /* SeqHead */ *nbSeq = ZSTD_readLE16(ip); ip+=2; LLtype = *ip >> 6; @@ -1265,6 +1268,9 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, *dumpsPtr = ip; ip += dumpsLength; + /* check */ + if (ip > iend-1) return (size_t)-ZSTD_ERROR_SrcSize; + /* sequences */ { S16 norm[MaxML+1]; /* assumption : MaxML >= MaxLL and MaxOff */ @@ -1284,6 +1290,7 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, max = MaxLL; headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip); if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC; + if (LLlog > LLFSELog) return (size_t)-ZSTD_ERROR_corruption; ip += headerSize; FSE_buildDTable(DTableLL, norm, max, LLlog); } @@ -1301,6 +1308,7 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, max = MaxOff; headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip); if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC; + if (Offlog > OffFSELog) return (size_t)-ZSTD_ERROR_corruption; ip += headerSize; FSE_buildDTable(DTableOffb, norm, max, Offlog); } @@ -1318,6 +1326,7 @@ 
size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, max = MaxML; headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip); if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC; + if (MLlog > MLFSELog) return (size_t)-ZSTD_ERROR_corruption; ip += headerSize; FSE_buildDTable(DTableML, norm, max, MLlog); } @@ -1400,7 +1409,10 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState) } -static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, BYTE* const oend) +static size_t ZSTD_execSequence(BYTE* op, + seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + BYTE* const base, BYTE* const oend) { static const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ @@ -1411,6 +1423,7 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B /* check */ if (endMatch > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; + if (litEnd > litLimit) return (size_t)-ZSTD_ERROR_corruption; /* copy Literals */ if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8)) @@ -1430,6 +1443,10 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B size_t qutt = 12; U64 saved[2]; + /* check */ + if (match < base) return (size_t)-ZSTD_ERROR_corruption; + if (sequence.offset > (size_t)base) return (size_t)-ZSTD_ERROR_corruption; + /* save beginning of literal sequence, in case of write overlap */ if (overlapRisk) { @@ -1470,6 +1487,18 @@ static size_t ZSTD_execSequence(BYTE* op, seq_t sequence, const BYTE** litPtr, B return endMatch-ostart; } +typedef struct ZSTD_Dctx_s +{ + U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)]; + U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)]; + U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)]; + void* previousDstEnd; + void* base; + size_t expected; + blockType_t bType; + U32 phase; +} dctx_t; + static size_t ZSTD_decompressSequences( 
void* ctx, @@ -1477,6 +1506,7 @@ static size_t ZSTD_decompressSequences( const void* seqStart, size_t seqSize, const BYTE* litStart, size_t litSize) { + dctx_t* dctx = (dctx_t*)ctx; const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE* const)dst; @@ -1487,9 +1517,10 @@ static size_t ZSTD_decompressSequences( const BYTE* const litEnd = litStart + litSize; int nbSeq; const BYTE* dumps; - FSE_DTable* DTableML = (FSE_DTable*)ctx; - FSE_DTable* DTableLL = DTableML + FSE_DTABLE_SIZE_U32(MLFSELog); - FSE_DTable* DTableOffb = DTableLL + FSE_DTABLE_SIZE_U32(LLFSELog); + U32* DTableLL = dctx->LLTable; + U32* DTableML = dctx->MLTable; + U32* DTableOffb = dctx->OffTable; + BYTE* const base = (BYTE*) (dctx->base); /* Build Decoding Tables */ errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, @@ -1515,7 +1546,7 @@ static size_t ZSTD_decompressSequences( size_t oneSeqSize; nbSeq--; ZSTD_decodeSequence(&sequence, &seqState); - oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, oend); + oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; } @@ -1558,175 +1589,6 @@ static size_t ZSTD_decompressBlock( } -#if 0 -FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize, - const void* src, size_t srcSize) -{ - const BYTE* ip = (const BYTE*)src; - const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* op = ostart; - BYTE* const oend = ostart + maxDstSize; - size_t errorCode; - size_t lastLLSize; - const BYTE* dumps; - const BYTE* litPtr; - const BYTE* litEnd; - const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ - const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ - FSE_DTable* DTableML = (FSE_DTable*)ctx; - FSE_DTable* DTableLL = DTableML + FSE_DTABLE_SIZE_U32(MLFSELog); - FSE_DTable* DTableOffb = DTableLL + FSE_DTABLE_SIZE_U32(LLFSELog); - - /* blockType == 
blockCompressed, srcSize is trusted */ - - /* Decode literals sub-block */ - errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, src, srcSize); - if (ZSTD_isError(errorCode)) return errorCode; - ip += errorCode; - - /* Build Decoding Tables */ - errorCode = ZSTD_decodeSeqHeaders(&lastLLSize, &dumps, - DTableLL, DTableML, DTableOffb, - ip, iend-ip); - if (ZSTD_isError(errorCode)) return errorCode; - /* end pos */ - if ((litPtr>=ostart) && (litPtr<=oend)) /* decoded literals are into dst buffer */ - litEnd = oend - lastLLSize; - else - litEnd = ip - lastLLSize; - ip += errorCode; - - /* LZ Sequences */ - { - FSE_DStream_t DStream; - FSE_DState_t stateLL, stateOffb, stateML; - size_t prevOffset = 0, offset = 0; - - FSE_initDStream(&DStream, ip, iend-ip); - FSE_initDState(&stateLL, &DStream, DTableLL); - FSE_initDState(&stateOffb, &DStream, DTableOffb); - FSE_initDState(&stateML, &DStream, DTableML); - - while (FSE_reloadDStream(&DStream)<2) - { - U32 nbBits, offsetCode; - const BYTE* match; - size_t litLength; - size_t matchLength; - size_t newOffset; - -_another_round: - - /* Literals */ - litLength = FSE_decodeSymbol(&stateLL, &DStream); - if (litLength) prevOffset = offset; - if (litLength == MaxLL) - { - BYTE add = *dumps++; - if (add < 255) litLength += add; - else - { - litLength = ZSTD_readLE32(dumps) & 0xFFFFFF; - dumps += 3; - } - } - if (((size_t)(litPtr - op) < 8) || ((size_t)(oend-(litPtr+litLength)) < 8)) - memmove(op, litPtr, litLength); /* overwrite risk */ - else - ZSTD_wildcopy(op, litPtr, litLength); - op += litLength; - litPtr += litLength; - - /* Offset */ - offsetCode = FSE_decodeSymbol(&stateOffb, &DStream); - if (ZSTD_32bits()) FSE_reloadDStream(&DStream); - nbBits = offsetCode - 1; - if (offsetCode==0) nbBits = 0; /* cmove */ - newOffset = FSE_readBits(&DStream, nbBits); - if (ZSTD_32bits()) FSE_reloadDStream(&DStream); - newOffset += (size_t)1 << nbBits; - if (offsetCode==0) newOffset = prevOffset; - match = op - newOffset; - 
prevOffset = offset; - offset = newOffset; - - /* MatchLength */ - matchLength = FSE_decodeSymbol(&stateML, &DStream); - if (matchLength == MaxML) - { - BYTE add = *dumps++; - if (add < 255) matchLength += add; - else - { - matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF; /* no pb : dumps is always followed by seq tables > 1 byte */ - dumps += 3; - } - } - matchLength += MINMATCH; - - /* copy Match */ - { - BYTE* const endMatch = op + matchLength; - size_t qutt=12; - U64 saved[2]; - const U32 overlapRisk = (((size_t)(litPtr - endMatch)) < 12); - - /* save beginning of literal sequence, in case of write overlap */ - if (overlapRisk) - { - if ((endMatch + qutt) > oend) qutt = oend-endMatch; - memcpy(saved, endMatch, qutt); - } - - if (offset < 8) - { - const int dec64 = dec64table[offset]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[offset]; - ZSTD_copy4(op+4, match); - match -= dec64; - } else { ZSTD_copy8(op, match); } - - if (endMatch > oend-12) - { - if (op < oend-16) - { - ZSTD_wildcopy(op+8, match+8, (oend-8) - (op+8)); - match += (oend-8) - op; - op = oend-8; - } - while (op 2) return (size_t)-ZSTD_ERROR_GENERIC; /* requested too much : data is corrupted */ - if (!FSE_endOfDState(&stateLL) && !FSE_endOfDState(&stateML) && !FSE_endOfDState(&stateOffb)) goto _another_round; /* some ultra-compressible sequence remain ! 
*/ - if (litPtr != litEnd) goto _another_round; /* literals not entirely spent */ - - /* last literal segment */ - if (op != litPtr) memmove(op, litPtr, lastLLSize); - op += lastLLSize; - } - - return op-ostart; -} -#endif - - static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize) { const BYTE* ip = (const BYTE*)src; @@ -1784,11 +1646,11 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const return op-ostart; } - size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { - U32 ctx[FSE_DTABLE_SIZE_U32(LLFSELog) + FSE_DTABLE_SIZE_U32(OffFSELog) + FSE_DTABLE_SIZE_U32(MLFSELog)]; - return ZSTD_decompressDCtx(ctx, dst, maxDstSize, src, srcSize); + dctx_t ctx; + ctx.base = dst; + return ZSTD_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize); } @@ -1796,21 +1658,14 @@ size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t src * Streaming Decompression API *******************************/ -typedef struct ZSTD_Dctx_s -{ - U32 ctx[FSE_DTABLE_SIZE_U32(LLFSELog) + FSE_DTABLE_SIZE_U32(OffFSELog) + FSE_DTABLE_SIZE_U32(MLFSELog)]; - size_t expected; - blockType_t bType; - U32 phase; -} dctx_t; - - ZSTD_Dctx* ZSTD_createDCtx(void) { ZSTD_Dctx* dctx = (ZSTD_Dctx*)malloc(sizeof(ZSTD_Dctx)); if (dctx==NULL) return NULL; dctx->expected = ZSTD_frameHeaderSize; dctx->phase = 0; + dctx->previousDstEnd = NULL; + dctx->base = NULL; return dctx; } @@ -1832,6 +1687,8 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co /* Sanity check */ if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_SrcSize; + if (dst != ctx->previousDstEnd) /* not contiguous */ + ctx->base = dst; /* Decompress : frame header */ if (ctx->phase == 0) @@ -1887,6 +1744,7 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co } ctx->phase = 1; ctx->expected = ZSTD_blockHeaderSize; + ctx->previousDstEnd = (void*)( ((char*)dst) 
+ rSize); return rSize; } diff --git a/programs/fuzzer.c b/programs/fuzzer.c index 6416536f..35aee2df 100644 --- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -177,8 +177,7 @@ static void FUZ_generateSynthetic(void* buffer, size_t bufferSize, double proba, } -/* -static unsigned FUZ_highbit(U32 v32) +static unsigned FUZ_highbit32(U32 v32) { unsigned nbBits = 0; if (v32==0) return 0; @@ -189,7 +188,6 @@ static unsigned FUZ_highbit(U32 v32) } return nbBits; } -*/ static int basicUnitTests(U32 seed, double compressibility) @@ -405,6 +403,50 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed : %u > %u (dst buffer too small)", (U32)errorCode, (U32)tooSmallSize); CHECK(dstBuffer[tooSmallSize] != token, "ZSTD_decompress : dst buffer overflow"); } + + /* noisy src decompression test */ + if (cSize > 6) + { + const U32 maxNbBits = FUZ_highbit32((U32)(cSize-4)); + size_t pos = 4; /* preserve magic number (too easy to detect) */ + U32 nbBits = FUZ_rand(&lseed) % maxNbBits; + size_t mask = (1<0) nbBits--; + mask = (1<sampleSize), + "ZSTD_decompress on noisy src : result is too large : %u > %u (dst buffer)", (U32)errorCode, (U32)sampleSize); + memcpy(&endCheck, dstBuffer+sampleSize, 4); + CHECK(endMark!=endCheck, "ZSTD_decompress on noisy src : dst buffer overflow"); + } + } } DISPLAY("\rAll fuzzer tests completed \n"); From d02114e0e1d4bfa6ae8d038b7d36f207531bdcc4 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 21 Aug 2015 03:59:31 +0100 Subject: [PATCH 18/21] updated fse --- lib/fse.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index a577e81e..2c55a563 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -703,7 +703,7 @@ static short FSE_abs(short a) ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t maxHeaderSize = 
(((maxSymbolValue+1) * tableLog) >> 3) + 3; + size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } @@ -964,14 +964,22 @@ void FSE_freeCTable (FSE_CTable* ct) } -unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +/* provides the minimum logSize to safely represent a distribution */ +static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) { - U32 tableLog = maxTableLog; U32 minBitsSrc = FSE_highbit32((U32)(srcSize - 1)) + 1; U32 minBitsSymbols = FSE_highbit32(maxSymbolValue) + 2; U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; + return minBits; +} + +unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + U32 maxBitsSrc = FSE_highbit32((U32)(srcSize - 1)) - 2; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; - if (minBitsSrc < tableLog + 3) tableLog = minBitsSrc-3; /* Accuracy can be reduced */ + if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */ if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG; if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG; @@ -1079,7 +1087,7 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported size */ if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC; /* Unsupported size */ - //if ((1U< Date: Fri, 21 Aug 2015 12:00:52 +0100 Subject: [PATCH 19/21] Updated xxhash --- programs/xxhash.c | 89 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 21 deletions(-) diff --git 
a/programs/xxhash.c b/programs/xxhash.c index e6fb8f14..511d9941 100644 --- a/programs/xxhash.c +++ b/programs/xxhash.c @@ -35,13 +35,26 @@ You can contact the author at : /************************************** * Tuning parameters **************************************/ -/* Unaligned memory access is automatically enabled for "common" CPU, such as x86. - * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. - * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. - * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). +/* XXH_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which generate assembly depending on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) */ -#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) -# define XXH_USE_UNALIGNED_ACCESS 1 +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif #endif /* XXH_ACCEPT_NULL_INPUT_POINTER : @@ -55,12 +68,21 @@ You can contact the author at : * By default, xxHash library provides endian-independant Hash values, based on little-endian convention. * Results are therefore identical for little-endian and big-endian CPU. * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. - * Should endian-independance be of no importance for your application, you may set the #define below to 1. - * It will improve speed for Big-endian CPU. + * Should endian-independance be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. * This option has no impact on Little_Endian CPU. */ #define XXH_FORCE_NATIVE_FORMAT 0 +/* XXH_USELESS_ALIGN_BRANCH : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : don't make a test between aligned/unaligned, because performance will be the same. + * It saves one initial branch per hash. 
+ */ +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_USELESS_ALIGN_BRANCH 1 +#endif + /************************************** * Compiler Specific Options @@ -113,20 +135,43 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp typedef unsigned long long U64; #endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } +static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; + +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + static U32 XXH_read32(const void* memPtr) { - U32 val32; - memcpy(&val32, memPtr, 4); - return val32; + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; } static U64 XXH_read64(const void* memPtr) { - U64 val64; - memcpy(&val64, memPtr, 8); - return val64; + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; } +#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS /****************************************** @@ -175,8 +220,10 @@ static U64 XXH_swap64 (U64 x) * Architecture Macros ***************************************/ typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; -#ifndef XXH_CPU_LITTLE_ENDIAN /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */ -static const int one = 1; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example one the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN + static const int one = 1; # define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&one)) #endif @@ -315,7 +362,7 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH } -unsigned XXH32 (const void* input, size_t len, unsigned seed) +unsigned int XXH32 (const void* input, size_t len, unsigned int seed) { #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ @@ -326,7 +373,7 @@ unsigned XXH32 (const void* input, size_t len, unsigned seed) #else XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; -# if !defined(XXH_USE_UNALIGNED_ACCESS) +# if !defined(XXH_USELESS_ALIGN_BRANCH) if ((((size_t)input) & 3) == 0) /* Input is 4-bytes aligned, leverage the speed benefit */ { if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) @@ -466,7 +513,7 @@ unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed #else XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; -# if !defined(XXH_USE_UNALIGNED_ACCESS) +# if 
!defined(XXH_USELESS_ALIGN_BRANCH) if ((((size_t)input) & 7)==0) /* Input is aligned, let's leverage the speed advantage */ { if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) @@ -538,7 +585,7 @@ XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) /*** Hash feed ***/ -XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) +XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed) { XXH_istate32_t* state = (XXH_istate32_t*) state_in; state->seed = seed; @@ -708,7 +755,7 @@ FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endiane } -U32 XXH32_digest (const XXH32_state_t* state_in) +unsigned int XXH32_digest (const XXH32_state_t* state_in) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; From d5d9bc3f8287759752bcc8446d8213dfd77792bf Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 23 Aug 2015 23:13:49 +0100 Subject: [PATCH 20/21] Added : ZSTD decompression : ability to resist to noisy/faulty data source --- README.md | 4 +- lib/zstd.c | 9 ++-- programs/Makefile | 4 +- programs/datagen.c | 4 +- programs/datagen.h | 12 ++--- programs/datagencli.c | 2 +- programs/fuzzer.c | 101 ++++++++++++++++++++---------------------- 7 files changed, 68 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 86d34190..ccb96595 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio nor extreme speeds.s + **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio nor extreme speed. 
It is provided as a BSD-license package, hosted on Github. @@ -40,7 +40,7 @@ It's a complex area which will require time and benefit from contributions. Another property zstd is developed for is configurable memory requirement, with the objective to fit into low-memory configurations, or servers handling many connections in parallel. -Zstd entropy stage is provided by [Huff0 and FSE, of Finite State Entrop library](https://github.com/Cyan4973/FiniteStateEntropy). +Zstd entropy stage is provided by [Huff0 and FSE, from Finite State Entropy library](https://github.com/Cyan4973/FiniteStateEntropy). Zstd is still considered experimental at this stage. Specifically, it doesn't guarantee yet that its current stream/file format will remain supported in future versions of the library. Therefore, only use Zstd in environments where you can control the availability of the decompression library. "Stable" status, including official documented format format and long-term support commitment, is projected sometimes early 2016. 
diff --git a/lib/zstd.c b/lib/zstd.c index 5414a56c..bcc16c87 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -1414,7 +1414,7 @@ static size_t ZSTD_execSequence(BYTE* op, const BYTE** litPtr, const BYTE* const litLimit, BYTE* const base, BYTE* const oend) { - static const int dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; /* added */ + static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* substracted */ const BYTE* const ostart = op; const size_t litLength = sequence.litLength; @@ -1422,8 +1422,9 @@ static size_t ZSTD_execSequence(BYTE* op, const BYTE* const litEnd = *litPtr + litLength; /* check */ - if (endMatch > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; + if (endMatch > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* overwrite beyond dst buffer */ if (litEnd > litLimit) return (size_t)-ZSTD_ERROR_corruption; + if (sequence.matchLength > (size_t)(*litPtr-op)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall; /* overwrite literal segment */ /* copy Literals */ if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8)) @@ -1536,7 +1537,9 @@ static size_t ZSTD_decompressSequences( memset(&sequence, 0, sizeof(sequence)); seqState.dumps = dumps; - FSE_initDStream(&(seqState.DStream), ip, iend-ip); + seqState.prevOffset = 1; + errorCode = FSE_initDStream(&(seqState.DStream), ip, iend-ip); + if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_corruption; FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL); FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb); FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML); diff --git a/programs/Makefile b/programs/Makefile index 1f7bdd5f..b90cfdce 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -73,10 +73,10 @@ fullbench : $(ZSTDDIR)/zstd.c datagen.c fullbench.c fullbench32: $(ZSTDDIR)/zstd.c datagen.c fullbench.c $(CC) -m32 $(FLAGS) $^ -o $@$(EXT) 
-fuzzer : $(ZSTDDIR)/zstd.c xxhash.c fuzzer.c +fuzzer : $(ZSTDDIR)/zstd.c datagen.c xxhash.c fuzzer.c $(CC) $(FLAGS) $^ -o $@$(EXT) -fuzzer32: $(ZSTDDIR)/zstd.c xxhash.c fuzzer.c +fuzzer32: $(ZSTDDIR)/zstd.c datagen.c xxhash.c fuzzer.c $(CC) -m32 $(FLAGS) $^ -o $@$(EXT) datagen : datagen.c datagencli.c diff --git a/programs/datagen.c b/programs/datagen.c index fa0e62a1..23d7d15f 100644 --- a/programs/datagen.c +++ b/programs/datagen.c @@ -153,7 +153,7 @@ void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double match memset(buffPtr+pos, 0, size0); pos += size0; buffPtr[pos-1] = RDG_genChar(seed, lt); - return; + continue; } /* init */ @@ -200,7 +200,7 @@ void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba #define RDG_DICTSIZE (32 KB) #define RDG_BLOCKSIZE (128 KB) -void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed) +void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed) { BYTE* buff = (BYTE*)malloc(RDG_DICTSIZE + RDG_BLOCKSIZE); U64 total = 0; diff --git a/programs/datagen.h b/programs/datagen.h index 89482dc2..03b06cae 100644 --- a/programs/datagen.h +++ b/programs/datagen.h @@ -26,15 +26,15 @@ #include /* size_t */ -void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed); +void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed); void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed); /* RDG_genBuffer Generate 'size' bytes of compressible data into 'buffer'. - Compressibility can be controlled using 'matchProba'. - 'LitProba' is optional, and affect variability of individual bytes. If litProba==0.0, default value is used. + Compressibility can be controlled using 'matchProba', which is floating point value between 0 and 1. + 'LitProba' is optional, it affect variability of individual bytes. 
If litProba==0.0, default value will be used. Generated data pattern can be modified using different 'seed'. - If (matchProba, litProba and seed) are equal, the function always generate the same content. + For a triplet (matchProba, litProba, seed), the function always generate the same content. - RDG_genOut - Same as RDG_genBuffer, but generate data towards stdout + RDG_genStdout + Same as RDG_genBuffer, but generates data into stdout */ diff --git a/programs/datagencli.c b/programs/datagencli.c index 801e1980..2665c54b 100644 --- a/programs/datagencli.c +++ b/programs/datagencli.c @@ -183,7 +183,7 @@ int main(int argc, char** argv) DISPLAYLEVEL(3, "Seed = %u \n", seed); if (proba!=COMPRESSIBILITY_DEFAULT) DISPLAYLEVEL(3, "Compressibility : %i%%\n", (U32)(proba*100)); - RDG_genOut(size, proba, litProba, seed); + RDG_genStdout(size, proba, litProba, seed); DISPLAYLEVEL(1, "\n"); return 0; diff --git a/programs/fuzzer.c b/programs/fuzzer.c index 35aee2df..19ab9e3a 100644 --- a/programs/fuzzer.c +++ b/programs/fuzzer.c @@ -47,6 +47,7 @@ #include /* timeb */ #include /* strcmp */ #include "zstd_static.h" +#include "datagen.h" /* RDG_genBuffer */ #include "xxhash.h" /* XXH64 */ @@ -138,45 +139,6 @@ unsigned int FUZ_rand(unsigned int* src) } -#define FUZ_RAND15BITS (FUZ_rand(seed) & 0x7FFF) -#define FUZ_RANDLENGTH ( (FUZ_rand(seed) & 3) ? 
(FUZ_rand(seed) % 15) : (FUZ_rand(seed) % 510) + 15) -static void FUZ_generateSynthetic(void* buffer, size_t bufferSize, double proba, U32* seed) -{ - BYTE* BBuffer = (BYTE*)buffer; - unsigned pos = 0; - U32 P32 = (U32)(32768 * proba); - - // First Byte - BBuffer[pos++] = (BYTE)((FUZ_rand(seed) & 0x3F) + '0'); - - while (pos < bufferSize) - { - // Select : Literal (noise) or copy (within 64K) - if (FUZ_RAND15BITS < P32) - { - // Copy (within 64K) - size_t match, end; - size_t length = FUZ_RANDLENGTH + 4; - size_t offset = FUZ_RAND15BITS + 1; - if (offset > pos) offset = pos; - if (pos + length > bufferSize) length = bufferSize - pos; - match = pos - offset; - end = pos + length; - while (pos < end) BBuffer[pos++] = BBuffer[match++]; - } - else - { - // Literal (noise) - size_t end; - size_t length = FUZ_RANDLENGTH; - if (pos + length > bufferSize) length = bufferSize - pos; - end = pos + length; - while (pos < end) BBuffer[pos++] = (BYTE)((FUZ_rand(seed) & 0x3F) + '0'); - } - } -} - - static unsigned FUZ_highbit32(U32 v32) { unsigned nbBits = 0; @@ -200,7 +162,7 @@ static int basicUnitTests(U32 seed, double compressibility) size_t result, cSize; U32 testNb=0; - // Create compressible test buffer + /* Create compressible test buffer */ CNBuffer = malloc(COMPRESSIBLE_NOISE_LENGTH); compressedBuffer = malloc(ZSTD_compressBound(COMPRESSIBLE_NOISE_LENGTH)); decodedBuffer = malloc(COMPRESSIBLE_NOISE_LENGTH); @@ -210,9 +172,9 @@ static int basicUnitTests(U32 seed, double compressibility) testResult = 1; goto _end; } - FUZ_generateSynthetic(CNBuffer, COMPRESSIBLE_NOISE_LENGTH, compressibility, &randState); + RDG_genBuffer(CNBuffer, COMPRESSIBLE_NOISE_LENGTH, compressibility, 0., randState); - // Basic tests + /* Basic tests */ DISPLAYLEVEL(4, "test%3i : compress %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH); result = ZSTD_compress(compressedBuffer, ZSTD_compressBound(COMPRESSIBLE_NOISE_LENGTH), CNBuffer, COMPRESSIBLE_NOISE_LENGTH); if (ZSTD_isError(result)) goto 
_output_error; @@ -263,10 +225,10 @@ static int basicUnitTests(U32 seed, double compressibility) { size_t sampleSize = 0; DISPLAYLEVEL(4, "test%3i : Long RLE test : ", testNb++); - FUZ_generateSynthetic(CNBuffer, sampleSize, compressibility, &randState); + RDG_genBuffer(CNBuffer, sampleSize, compressibility, 0., randState); memset((char*)CNBuffer+sampleSize, 'B', 256 KB - 1); sampleSize += 256 KB - 1; - FUZ_generateSynthetic((char*)CNBuffer+sampleSize, 96 KB, compressibility, &randState); + RDG_genBuffer((char*)CNBuffer+sampleSize, 96 KB, compressibility, 0., randState); sampleSize += 96 KB; cSize = ZSTD_compress(compressedBuffer, ZSTD_compressBound(sampleSize), CNBuffer, sampleSize); if (ZSTD_isError(cSize)) goto _output_error; @@ -311,6 +273,7 @@ static const U32 maxSampleLog = 22; int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibility) { + BYTE* cNoiseBuffer[5]; BYTE* srcBuffer; BYTE* cBuffer; BYTE* dstBuffer; @@ -322,13 +285,23 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit U32 coreSeed = seed, lseed = 0; /* allocation */ - srcBuffer = (BYTE*)malloc (srcBufferSize); + cNoiseBuffer[0] = (BYTE*)malloc (srcBufferSize); + cNoiseBuffer[1] = (BYTE*)malloc (srcBufferSize); + cNoiseBuffer[2] = (BYTE*)malloc (srcBufferSize); + cNoiseBuffer[3] = (BYTE*)malloc (srcBufferSize); + cNoiseBuffer[4] = (BYTE*)malloc (srcBufferSize); dstBuffer = (BYTE*)malloc (dstBufferSize); cBuffer = (BYTE*)malloc (cBufferSize); - CHECK (!srcBuffer || !dstBuffer || !cBuffer, "Not enough memory, fuzzer tests cancelled"); + CHECK (!cNoiseBuffer[0] || !cNoiseBuffer[1] || !cNoiseBuffer[2] || !dstBuffer || !cBuffer, + "Not enough memory, fuzzer tests cancelled"); - /* Create initial sample */ - FUZ_generateSynthetic(srcBuffer, srcBufferSize, compressibility, &coreSeed); + /* Create initial samples */ + RDG_genBuffer(cNoiseBuffer[0], srcBufferSize, 0.00, 0., coreSeed); /* pure noise */ + RDG_genBuffer(cNoiseBuffer[1], srcBufferSize, 
0.05, 0., coreSeed); /* barely compressible */ + RDG_genBuffer(cNoiseBuffer[2], srcBufferSize, compressibility, 0., coreSeed); + RDG_genBuffer(cNoiseBuffer[3], srcBufferSize, 0.95, 0., coreSeed); /* highly compressible */ + RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed); /* sparse content */ + srcBuffer = cNoiseBuffer[2]; /* catch up testNb */ for (testNb=1; testNb < startTest; testNb++) @@ -339,13 +312,30 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit { size_t sampleSize, sampleStart; size_t cSize, dSize, dSupSize; - U32 sampleSizeLog; + U32 sampleSizeLog, buffNb; U64 crcOrig, crcDest; /* init */ DISPLAYUPDATE(2, "\r%6u/%6u ", testNb, nbTests); FUZ_rand(&coreSeed); lseed = coreSeed ^ prime1; + buffNb = FUZ_rand(&lseed) & 127; + if (buffNb & 7) buffNb=2; + else + { + buffNb >>= 3; + if (buffNb & 7) + { + const U32 tnb[2] = { 1, 3 }; + buffNb = tnb[buffNb >> 3]; + } + else + { + const U32 tnb[2] = { 0, 4 }; + buffNb = tnb[buffNb >> 3]; + } + } + srcBuffer = cNoiseBuffer[buffNb]; sampleSizeLog = FUZ_rand(&lseed) % maxSampleLog; sampleSize = (size_t)1 << sampleSizeLog; sampleSize += FUZ_rand(&lseed) & (sampleSize-1); @@ -357,7 +347,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit CHECK(ZSTD_isError(cSize), "ZSTD_compress failed"); /* compression failure test : too small dest buffer */ - if (cSize > 3) + if (cSize > 3) { size_t errorCode; const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ @@ -384,8 +374,8 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1; /* no problem, as cSize > 4 (frameHeaderSizer) */ const size_t tooSmallSize = cSize - missing; void* cBufferTooSmall = malloc(tooSmallSize); /* valgrind will catch overflows */ - memcpy(cBufferTooSmall, cBuffer, tooSmallSize); CHECK(cBufferTooSmall == NULL, "not enough memory 
!"); + memcpy(cBufferTooSmall, cBuffer, tooSmallSize); errorCode = ZSTD_decompress(dstBuffer, dstBufferSize, cBufferTooSmall, tooSmallSize); CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed ! (truncated src buffer)"); free(cBufferTooSmall); @@ -422,6 +412,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit if (nbBits>0) nbBits--; mask = (1< cSize ) noiseLength = cSize-pos; noiseStart = FUZ_rand(&lseed) % (srcBufferSize - noiseLength); memcpy(cBuffer + pos, srcBuffer + noiseStart, noiseLength); pos += noiseLength; @@ -435,9 +426,11 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit /* decompress noisy source */ { + U32 noiseSrc = FUZ_rand(&lseed) % 5; const U32 endMark = 0xA9B1C3D6; U32 endCheck; size_t errorCode; + srcBuffer = cNoiseBuffer[noiseSrc]; memcpy(dstBuffer+sampleSize, &endMark, 4); errorCode = ZSTD_decompress(dstBuffer, sampleSize, cBuffer, cSize); /* result *may* be an unlikely success, but even then, it must strictly respect dest buffer boundaries */ @@ -451,7 +444,11 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit DISPLAY("\rAll fuzzer tests completed \n"); _cleanup: - free(srcBuffer); + free(cNoiseBuffer[0]); + free(cNoiseBuffer[1]); + free(cNoiseBuffer[2]); + free(cNoiseBuffer[3]); + free(cNoiseBuffer[4]); free(cBuffer); free(dstBuffer); return result; From d5b7cb0bb820aa49fce51acf626cd2e780735132 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 23 Aug 2015 23:20:34 +0100 Subject: [PATCH 21/21] Updated Visual solution --- visual/2012/fuzzer/fuzzer.vcxproj | 2 ++ visual/2012/fuzzer/fuzzer.vcxproj.filters | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/visual/2012/fuzzer/fuzzer.vcxproj b/visual/2012/fuzzer/fuzzer.vcxproj index 8a114aef..f80b3b85 100644 --- a/visual/2012/fuzzer/fuzzer.vcxproj +++ b/visual/2012/fuzzer/fuzzer.vcxproj @@ -161,6 +161,7 @@ + @@ -169,6 +170,7 @@ + diff --git 
a/visual/2012/fuzzer/fuzzer.vcxproj.filters b/visual/2012/fuzzer/fuzzer.vcxproj.filters index 7782b08f..6bc481a6 100644 --- a/visual/2012/fuzzer/fuzzer.vcxproj.filters +++ b/visual/2012/fuzzer/fuzzer.vcxproj.filters @@ -27,6 +27,9 @@ Fichiers sources + + Fichiers sources + @@ -44,5 +47,8 @@ Fichiers d%27en-tĂȘte + + Fichiers d%27en-tĂȘte + \ No newline at end of file