diff --git a/lz4.c b/lz4.c
index a131a81..cce3b17 100644
--- a/lz4.c
+++ b/lz4.c
@@ -45,6 +45,13 @@
 // The default value (6) is recommended
 #define NOTCOMPRESSIBLE_CONFIRMATION 6
 
+// LZ4_COMPRESSMIN :
+// Compression function will *fail* if it is not successful at compressing input by at least LZ4_COMPRESSMIN bytes
+// Since the compression function stops working prematurely, it results in a speed gain
+// The output however is unusable. Compression function result will be zero.
+// Default : 0 = disabled
+#define LZ4_COMPRESSMIN 0
+
 // BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE :
 // This will provide a boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU.
 // You can set this option to 1 in situations where data will stay within closed environment
@@ -92,13 +99,13 @@
 #endif
 
 #ifdef _MSC_VER
-#define inline __forceinline    // Visual is not C99, but supports inline
+#define inline __forceinline    // Visual is not C99, but supports some kind of inline
 #endif
 
 #ifdef _MSC_VER  // Visual Studio
-#define bswap16(i) _byteswap_ushort(i)
+#define bswap16(x) _byteswap_ushort(x)
 #else
-#define bswap16(i) (((i)>>8) | ((i)<<8))
+#define bswap16(x)  ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
 #endif
 
 
@@ -429,6 +436,7 @@ _last_literals:
 	// Encode Last Literals
 	{
 		int lastRun = iend - anchor;
+		if ((LZ4_COMPRESSMIN>0) && (((op - (BYTE*)dest) + lastRun + 1 + ((lastRun-15)/255)) > isize - LZ4_COMPRESSMIN)) return 0;
 		if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
 		else *op++ = (lastRun<<ML_BITS);
 		memcpy(op, anchor, iend - anchor);
@@ -570,6 +578,7 @@ _last_literals:
 	// Encode Last Literals
 	{
 		int lastRun = iend - anchor;
+		if ((LZ4_COMPRESSMIN>0) && (((op - (BYTE*)dest) + lastRun + 1 + ((lastRun-15)/255)) > isize - LZ4_COMPRESSMIN)) return 0;
 		if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
 		else *op++ = (lastRun<<ML_BITS);
 		memcpy(op, anchor, iend - anchor);
diff --git a/lz4.h b/lz4.h
index 556562b..1061fdf 100644
--- a/lz4.h
+++ b/lz4.h
@@ -43,17 +43,18 @@ int LZ4_uncompress (const char* source, char* dest, int osize);
 
 /*
 LZ4_compress() :
-	return : the number of bytes in compressed buffer dest
+	return : the number of bytes written in buffer dest
+			 or 0 if the compression fails (if LZ4_COMPRESSMIN is set)
 	note : destination buffer must be already allocated. 
-		To avoid any problem, size it to handle worst cases situations (input data not compressible)
-		Worst case size is : "inputsize + 0.4%", with "0.4%" being at least 8 bytes.
+		destination buffer must be sized to handle worst cases situations (input data not compressible)
+		worst case size evaluation is provided by function LZ4_compressBound()
 
 LZ4_uncompress() :
 	osize  : is the output size, therefore the original size
 	return : the number of bytes read in the source buffer
 			 If the source stream is malformed, the function will stop decoding and return a negative result, indicating the byte position of the faulty instruction
-			 This version never writes beyond dest + osize, and is therefore protected against malicious data packets
-	note 2 : destination buffer must be already allocated
+			 This function never writes beyond dest + osize, and is therefore protected against malicious data packets
+	note : destination buffer must be already allocated
 */
 
 
@@ -65,11 +66,12 @@ int LZ4_compressBound(int isize);
 
 /*
 LZ4_compressBound() :
-	Provides the maximum size that LZ4 may output in a "worst case" scenario, for memory allocation of output buffer.
+	Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible)
+	primarily useful for memory allocation of output buffer.
 
 	isize  : is the input size
-	return : maximum size that LZ4 may output in a "worst case" scenario
-	note : this function is limited by "int" range
+	return : maximum output size in a "worst case" scenario
+	note : this function is limited by "int" range (2^31-1)
 */
 
 
@@ -81,25 +83,29 @@ LZ4_uncompress_unknownOutputSize() :
 	maxOutputSize : is the size of the destination buffer (which must be already allocated)
 	return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
 			 If the source stream is malformed, the function will stop decoding and return a negative result, indicating the byte position of the faulty instruction
-			 This version never writes beyond dest + maxOutputSize, and is therefore protected against malicious data packets
-	note   : This version is a bit slower than LZ4_uncompress
+			 This function never writes beyond dest + maxOutputSize, and is therefore protected against malicious data packets
+	note   : This version is slightly slower than LZ4_uncompress()
 */
 
 
 int LZ4_compressCtx(void** ctx, const char* source,  char* dest, int isize);
+int LZ4_compress64kCtx(void** ctx, const char* source,  char* dest, int isize);
 
 /*
 LZ4_compressCtx() :
 	This function explicitly handles the CTX memory structure.
-	It avoids allocating/deallocating memory between each call, improving performance when malloc is time-consuming.
-	Note : when memory is allocated into the stack (default mode), there is no "malloc" penalty.
-	Therefore, this function is mostly useful when memory is allocated into the heap (it requires increasing HASH_LOG value beyond STACK_LIMIT)
+	It avoids allocating/deallocating memory between each call, improving performance when malloc is heavily invoked.
+	This function is only useful when memory is allocated into the heap (HASH_LOG value beyond STACK_LIMIT)
+	Performance difference will be noticeable only when repetitively calling the compression function over many small segments.
+	Note : by default, memory is allocated into the stack, therefore "malloc" is not invoked.
+LZ4_compress64kCtx() :
+	Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+	isize *Must* be <64KB, otherwise the output will be corrupted.
 
 	On first call : provide a *ctx=NULL; It will be automatically allocated.
 	On next calls : reuse the same ctx pointer.
 	Use different pointers for different threads when doing multi-threading.
 
-	note : performance difference is small, mostly noticeable in HeapMode when repetitively calling the compression function over many small segments.
 */
 
 
diff --git a/lz4_format_description.txt b/lz4_format_description.txt
new file mode 100644
index 0000000..e4a053b
--- /dev/null
+++ b/lz4_format_description.txt
@@ -0,0 +1,119 @@
+LZ4 Format Description
+Last revised: 2012-02-12
+Author : Y. Collet
+
+
+
+This is not a formal specification, but intents to provide enough information
+to anyone willing to produce LZ4-compatible compressed streams.
+
+LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding.
+There is no entropy encoder backend nor framing layer -- the latter is
+assumed to be handled by other parts of the system.
+
+This document only describes the format, not how the LZ4 compressor nor
+decompressor actually works. The correctness of the decompressor should not
+depend on implementation details of the compressor, and vice versa.
+
+The most important design principle behind LZ4 is simplicity.
+It is meant to create an easy to read and maintain source code.
+It also helps later on for optimisations, compactness, and speed.
+
+
+-- Compressed stream format --
+
+An LZ4 compressed stream is composed of sequences.
+Schematically, a sequence is a suite of literals, followed by a match copy.
+
+Each sequence starts with a token.
+The token is a one byte value, separated into two 4-bits fields.
+Therefore each field ranges from 0 to 15.
+
+
+The first field uses the 4 high-bits of the token.
+It provides the length of literals to follow.
+A literal is a not-compressed byte.
+If it is 0, then there is no literal.
+If it is 15, then we need to add some more bytes to indicate the full length.
+Each additionnal byte then represent a value from 0 to 255,
+which is added to the previous value to produce a total length.
+When the byte value is 255, another byte is output.
+There can be any number of bytes following the token. There is no "size limit".
+(Sidenote this is why a not-compressible input stream is expanded by 0.4%).
+
+Example 1 : A length of 48 will be represented as :
+- 15 : value for the 4-bits High field
+- 33 : (=48-15) remaining length to reach 48
+
+Example 2 : A length of 280 will be represented as :
+- 15  : value for the 4-bits High field
+- 255 : following byte is maxed, since 280-15 >= 255
+- 10  : (=280 - 15 - 255) ) remaining length to reach 280
+
+Example 3 : A length of 15 will be represented as :
+- 15 : value for the 4-bits High field
+- 0  : (=15-15) yes, the zero must be output
+
+Following the token and optional length bytes, are the literals themselves.
+They are exactly as numerous as previously decoded (length of literals).
+It's possible that there are zero literal.
+
+
+Following the literals is the match copy operation.
+
+It starts by the offset.
+This is a 2 bytes value, in little endian format :
+the lower byte is the first one in the stream.
+
+The offset represents the position of the match to be copied from.
+1 means "current position - 1 byte".
+The maximum offset value is 65535, 65536 cannot be coded.
+Note that 0 is an invalid value, not used. 
+
+Then we need to extract the match length.
+For this, we use the second token field, the low 4-bits.
+Value, obviously, ranges from 0 to 15.
+However here, 0 means that the copy operation will be minimal.
+The minimum length of a match, called minmatch, is 4. 
+As a consequence, a 0 value means 4 bytes, and a value of 15 means 19+ bytes.
+Similar to literal length, on reaching the highest possible value (15), 
+we output additional bytes, one at a time, with values ranging from 0 to 255.
+They are added to total to provide the final match length.
+A 255 value means there is another byte to read and add.
+There is no limit to the number of optional bytes that can be output this way.
+(This points towards a maximum achievable compression ratio of ~250).
+
+With the offset and the matchlength,
+the decoder can now proceed to copy the data from the already decoded buffer.
+On decoding the matchlength, we reach the end of the compressed sequence,
+and therefore start another one.
+
+
+-- Parsing restrictions --
+
+There are specific parsing rules to respect in order to remain compatible
+with assumptions made by the decoder :
+1) The last 5 bytes are always literals
+2) The last match must start at least 12 bytes before end of stream
+Consequently, a file with less than 13 bytes cannot be compressed.
+These rules are in place to ensure that the decoder
+will never read beyond the input buffer, nor write beyond the output buffer.
+
+Note that the last sequence is also incomplete,
+and stops right after literals.
+
+
+-- Additional notes --
+
+There is no assumption nor limits to the way the compressor
+searches and selects matches within the source stream.
+It could be a fast scan,  a multi-probe, a full search using BST,
+standard hash chains or MMC, well whatever.
+
+Advanced parsing strategies can also be implemented, such as lazy match,
+or full optimal parsing.
+
+All these trade-off offer distinctive speed/memory/compression advantages.
+Whatever the method used by the compressor, its result will be decodable
+by any LZ4 decoder if it follows the format described above.
+