diff --git a/Makefile b/Makefile
index ea60d11..53ea0f4 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ endif
 
 default: lz4c
 
-all: lz4c lz4c32 fuzzer
+all: lz4c lz4c32 fuzzer fullbench
 
 lz4c: lz4.c lz4hc.c bench.c xxhash.c lz4c.c
 	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
@@ -21,5 +21,8 @@ lz4c32: lz4.c lz4hc.c bench.c xxhash.c lz4c.c
 fuzzer : lz4.c lz4hc.c fuzzer.c
 	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
 	
+fullbench : lz4.c lz4hc.c xxhash.c fullbench.c
+	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
+
 clean:
-	rm -f core *.o lz4c$(EXT) lz4c32$(EXT) fuzzer$(EXT)
+	rm -f core *.o lz4c$(EXT) lz4c32$(EXT) fuzzer$(EXT) fullbench$(EXT)
diff --git a/bench.c b/bench.c
index eef5cdb..f605249 100644
--- a/bench.c
+++ b/bench.c
@@ -65,7 +65,6 @@
 #endif
 
 #include "lz4.h"
-//int LZ4_compress_stack(const char* in, char* out, int size);
 #define COMPRESSOR0 LZ4_compress
 #include "lz4hc.h"
 #define COMPRESSOR1 LZ4_compressHC
@@ -209,7 +208,7 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     while (!testmem)
     {
         requiredMem -= step;
-        testmem = malloc ((size_t)requiredMem);
+        testmem = (BYTE*) malloc ((size_t)requiredMem);
     }
 
     free (testmem);
@@ -294,11 +293,11 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
 
       // Alloc
       chunkP = (struct chunkParameters*) malloc(((benchedSize / chunkSize)+1) * sizeof(struct chunkParameters));
-      orig_buff = malloc((size_t )benchedSize);
+      orig_buff = (char*)malloc((size_t )benchedSize);
       nbChunks = (int) (benchedSize / chunkSize) + 1;
       maxCChunkSize = LZ4_compressBound(chunkSize);
       compressed_buff_size = nbChunks * maxCChunkSize;
-      compressed_buff = malloc((size_t )compressed_buff_size);
+      compressed_buff = (char*)malloc((size_t )compressed_buff_size);
 
 
       if(!orig_buff || !compressed_buff)
@@ -386,10 +385,11 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
           while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
           {
             for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
-                chunkP[chunkNb].origSize = LZ4_decompress_safe(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
-                //chunkP[chunkNb].compressedSize = LZ4_decompress_fast(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
+                //chunkP[chunkNb].origSize = LZ4_decompress_safe(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
+                chunkP[chunkNb].compressedSize = LZ4_decompress_fast(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                 //chunkP[chunkNb].compressedSize = LZ4_decompress_fast_withPrefix64k(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                 //chunkP[chunkNb].origSize = LZ4_decompress_safe_withPrefix64k(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
+                //chunkP[chunkNb].origSize = LZ4_decompress_safe_partial(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize-5, chunkSize);
                 //chunkP[chunkNb].compressedSize = LZ4_uncompress(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                 //chunkP[chunkNb].origSize = LZ4_uncompress_unknownOutputSize(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
             nb_loops++;
@@ -423,9 +423,9 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
   }
 
   if (nbFiles > 1)
-        printf("%-16.16s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s , %6.1f MB/s\n", "  TOTAL", (long long unsigned int)totals, (long long unsigned int)totalz, (double)totalz/(double)totals*100., (double)totals/totalc/1000., (double)totals/totald/1000.);
+        DISPLAY("%-16.16s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s , %6.1f MB/s\n", "  TOTAL", (long long unsigned int)totals, (long long unsigned int)totalz, (double)totalz/(double)totals*100., (double)totals/totalc/1000., (double)totals/totald/1000.);
 
-  if (BMK_pause) { printf("press enter...\n"); getchar(); }
+  if (BMK_pause) { DISPLAY("press enter...\n"); getchar(); }
 
   return 0;
 }
diff --git a/fullbench.c b/fullbench.c
new file mode 100644
index 0000000..54b46f6
--- /dev/null
+++ b/fullbench.c
@@ -0,0 +1,622 @@
+/*
+    fullbench.c - Demo program to benchmark open-source compression algorithm
+    Copyright (C) Yann Collet 2012-2013
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    You can contact the author at :
+    - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+    - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+
+//**************************************
+// Compiler Options
+//**************************************
+// Disable some Visual warning messages
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE     // VS2005
+
+// Unix Large Files support (>4GB)
+#if (defined(__sun__) && (!defined(__LP64__)))   // Sun Solaris 32-bits requires specific definitions
+#  define _LARGEFILE_SOURCE 
+#  define _FILE_OFFSET_BITS 64
+#elif ! defined(__LP64__)                        // No point defining Large file for 64 bit
+#  define _LARGEFILE64_SOURCE
+#endif
+
+// S_ISREG & gettimeofday() are not supported by MSVC
+#if defined(_MSC_VER)
+#  define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#  define BMK_LEGACY_TIMER 1
+#endif
+
+// GCC does not support _rotl outside of Windows
+#if !defined(_WIN32)
+#  define _rotl(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+
+//**************************************
+// Includes
+//**************************************
+#include <stdlib.h>      // malloc
+#include <stdio.h>       // fprintf, fopen, ftello64
+#include <sys/types.h>   // stat64
+#include <sys/stat.h>    // stat64
+
+// Use ftime() if gettimeofday() is not available on your target
+#if defined(BMK_LEGACY_TIMER)
+#  include <sys/timeb.h>   // timeb, ftime
+#else
+#  include <sys/time.h>    // gettimeofday
+#endif
+
+#include "lz4.h"
+#define COMPRESSOR0 LZ4_compress
+#include "lz4hc.h"
+#define COMPRESSOR1 LZ4_compressHC
+#define DEFAULTCOMPRESSOR COMPRESSOR0
+
+#include "xxhash.h"
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+#endif
+
+
+//****************************
+// Constants
+//****************************
+#define COMPRESSOR_NAME "Full LZ4 speed analyzer"
+#define COMPRESSOR_VERSION ""
+#define COMPILED __DATE__
+#define AUTHOR "Yann Collet"
+#define WELCOME_MESSAGE "*** %s %s, by %s (%s) ***\n", COMPRESSOR_NAME, COMPRESSOR_VERSION, AUTHOR, COMPILED
+
+#define NBLOOPS    6
+#define TIMELOOP   2500
+
+#define KNUTH      2654435761U
+#define MAX_MEM    (1984<<20)
+#define DEFAULT_CHUNKSIZE   (4<<20)
+
+
+//**************************************
+// Local structures
+//**************************************
+struct chunkParameters   // bookkeeping for one chunk of the benched data
+{
+    U32   id;                 // chunk index (set from the loop counter when chunks are initialised)
+    char* origBuffer;         // start of this chunk inside the original-data buffer
+    char* compressedBuffer;   // start of this chunk's slot inside the compressed-data buffer
+    int   origSize;           // number of original bytes in this chunk
+    int   compressedSize;     // value returned by the last compression call (0 until then)
+};
+
+
+//**************************************
+// MACRO
+//**************************************
+#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
+
+
+
+//**************************************
+// Benchmark Parameters
+//**************************************
+static int chunkSize = DEFAULT_CHUNKSIZE;
+static int nbIterations = NBLOOPS;
+static int BMK_pause = 0;
+
+void BMK_SetBlocksize(int bsize)
+{
+    DISPLAY("-Using Block Size of %i KB-\n", bsize>>10);   // bsize is in bytes, reported in KB
+    chunkSize = bsize;                                     // size used to split each file into chunks
+}
+
+void BMK_SetNbIterations(int nbLoops)
+{
+    DISPLAY("- %i iterations -\n", nbLoops);
+    nbIterations = nbLoops;   // number of timed passes run for each algorithm
+}
+
+void BMK_SetPause(void)
+{
+    BMK_pause = 1;   // the benchmark then waits for a key press before returning
+}
+
+//*********************************************************
+//  Private functions
+//*********************************************************
+
+#if defined(BMK_LEGACY_TIMER)
+
+static int BMK_GetMilliStart(void)
+{
+  // Millisecond time stamp, based on legacy ftime()
+  // Rolls over every ~ 12.1 days (0x100000/24/60/60)
+  // Use BMK_GetMilliSpan to correct for rollover
+  struct timeb tb;
+  int nCount;
+  ftime( &tb );
+  nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000);   // seconds masked to 20 bits : the sum stays below 2^31
+  return nCount;
+}
+
+#else
+
+static int BMK_GetMilliStart(void)
+{
+  // Millisecond time stamp, based on newer gettimeofday()
+  // Same 20-bit seconds mask as the legacy variant; use BMK_GetMilliSpan to correct for rollover
+  struct timeval tv;
+  int nCount;
+  gettimeofday(&tv, NULL);
+  nCount = (int) (tv.tv_usec/1000 + (tv.tv_sec & 0xfffff) * 1000);
+  return nCount;
+}
+
+#endif
+
+
+static int BMK_GetMilliSpan( int nTimeStart )
+{
+  // Milliseconds elapsed since nTimeStart (a value previously returned by BMK_GetMilliStart).
+  // A negative difference means the counter rolled over : add one full period back.
+  const int elapsed = BMK_GetMilliStart() - nTimeStart;
+  return (elapsed < 0) ? (elapsed + 0x100000 * 1000) : elapsed;
+}
+
+
+static size_t BMK_findMaxMem(U64 requiredMem)
+{
+    // Probes malloc() downwards in 64 MB steps; returns the first size that succeeds minus one step, or 0 if none does.
+    const size_t step = (64U<<20);   // 64 MB
+    BYTE* testmem=NULL;
+
+    requiredMem = (((requiredMem >> 25) + 1) << 26);   // (requiredMem / 32 MB + 1) * 64 MB
+    if (requiredMem > MAX_MEM) requiredMem = MAX_MEM;
+    requiredMem += 2*step;                             // the first probe below is therefore one step above the target
+    while (!testmem)
+    {
+        if (requiredMem <= step) return 0;             // nothing allocatable : stop before the U64 counter wraps
+        requiredMem -= step;
+        testmem = (BYTE*) malloc ((size_t)requiredMem);
+    }
+    free (testmem);
+    return (size_t) (requiredMem - step);
+}
+
+
+static U64 BMK_GetFileSize(char* infilename)
+{
+    // Size in bytes of a regular file; 0 when stat fails or the path is not a regular file
+#if defined(_MSC_VER)
+    struct _stat64 fileInfo;
+    const int statFailed = _stat64(infilename, &fileInfo);
+#else
+    struct stat fileInfo;
+    const int statFailed = stat(infilename, &fileInfo);
+#endif
+    if (statFailed || !S_ISREG(fileInfo.st_mode)) return 0;   // No good...
+    return (U64)fileInfo.st_size;
+}
+
+
+//*********************************************************
+//  Public function
+//*********************************************************
+
+static inline int local_LZ4_compress_limitedOutput(const char* in, char* out, int inSize)
+{
+    return LZ4_compress_limitedOutput(in, out, inSize, LZ4_compressBound(inSize));   // 3-argument adapter : output limit taken from LZ4_compressBound(inSize)
+}
+
+static inline int local_LZ4_compressHC_limitedOutput(const char* in, char* out, int inSize)
+{
+    return LZ4_compressHC_limitedOutput(in, out, inSize, LZ4_compressBound(inSize));   // 3-argument adapter : output limit taken from LZ4_compressBound(inSize)
+}
+
+static inline int local_LZ4_decompress_fast(const char* in, char* out, int inSize, int outSize)
+{
+    const int result = LZ4_decompress_fast(in, out, outSize);   // per lz4.h : negative when the input is malformed
+    (void)inSize;                                                // 4-argument adapter : the compressed size is not used by this decoder
+    return (result < 0) ? result : outSize;                      // report decoder errors instead of always claiming success
+}
+
+static inline int local_LZ4_decompress_fast_withPrefix64k(const char* in, char* out, int inSize, int outSize)
+{
+    const int result = LZ4_decompress_fast_withPrefix64k(in, out, outSize);   // per lz4.h : negative when the input is malformed
+    (void)inSize;                                                              // 4-argument adapter : the compressed size is not used by this decoder
+    return (result < 0) ? result : outSize;                                    // report decoder errors instead of always claiming success
+}
+
+static inline int local_LZ4_decompress_safe_partial(const char* in, char* out, int inSize, int outSize)
+{
+    return LZ4_decompress_safe_partial(in, out, inSize, outSize - 5, outSize);   // 4-argument adapter; NOTE(review): outSize-5 is presumably the decode target and outSize the buffer capacity - confirm against lz4.h
+}
+
+int fullSpeedBench(char** fileNamesTable, int nbFiles)
+{
+  // Times every listed LZ4 function on each file. Returns 0, or 11 / 12 / 13 on open / allocation / read failure; codec or checksum errors call exit(1).
+  int fileIdx=0;
+  FILE* fileIn;
+  char* infilename;
+  U64 largefilesize;
+  size_t benchedSize;
+  int nbChunks;
+  int maxCChunkSize;
+  size_t readSize;
+  char* orig_buff;
+  char* compressed_buff; int compressed_buff_size;
+  struct chunkParameters* chunkP;
+  U32 crcc, crcd=0;
+# define NB_COMPRESSION_ALGORITHMS 4
+  static char* compressionNames[] = { "LZ4_compress", "LZ4_compressHC", "LZ4_compressHC_limitedOutput", "LZ4_compress_limitedOutput" };
+  double totalCTime[NB_COMPRESSION_ALGORITHMS] = {0};
+  double totalCSize[NB_COMPRESSION_ALGORITHMS] = {0};
+# define NB_DECOMPRESSION_ALGORITHMS 5
+  static char* decompressionNames[] = { "LZ4_decompress_fast", "LZ4_decompress_fast_withPrefix64k", "LZ4_decompress_safe", "LZ4_decompress_safe_withPrefix64k", "LZ4_decompress_safe_partial" };
+  double totalDTime[NB_DECOMPRESSION_ALGORITHMS] = {0};
+
+  U64 totals = 0;
+
+  // Loop for each file
+  while (fileIdx<nbFiles)
+  {
+      // Check file existence
+      infilename = fileNamesTable[fileIdx++];
+      fileIn = fopen( infilename, "rb" );
+      if (fileIn==NULL)
+      {
+        DISPLAY( "Pb opening %s\n", infilename);
+        return 11;
+      }
+
+      // Memory allocation & restrictions
+      largefilesize = BMK_GetFileSize(infilename);
+      benchedSize = (size_t) BMK_findMaxMem(largefilesize) / 2;
+      if ((U64)benchedSize > largefilesize) benchedSize = (size_t)largefilesize;
+      if (benchedSize < largefilesize)
+      {
+          DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", infilename, (int)(benchedSize>>20));
+      }
+
+      // Alloc
+      chunkP = (struct chunkParameters*) malloc(((benchedSize / chunkSize)+1) * sizeof(struct chunkParameters));
+      orig_buff = (char*) malloc((size_t)benchedSize);
+      nbChunks = (int) (benchedSize / chunkSize) + 1;
+      maxCChunkSize = LZ4_compressBound(chunkSize);
+      compressed_buff_size = nbChunks * maxCChunkSize;
+      compressed_buff = (char*)malloc((size_t)compressed_buff_size);
+
+      if(!chunkP || !orig_buff || !compressed_buff)
+      {
+        DISPLAY("\nError: not enough memory!\n");
+        free(chunkP);
+        free(orig_buff);
+        free(compressed_buff);
+        fclose(fileIn);
+        return 12;
+      }
+
+      // Init chunks data
+      {
+          int i;
+          size_t remaining = benchedSize;
+          char* in = orig_buff;
+          char* out = compressed_buff;
+          for (i=0; i<nbChunks; i++)
+          {
+              chunkP[i].id = i;
+              chunkP[i].origBuffer = in; in += chunkSize;
+              if ((int)remaining > chunkSize) { chunkP[i].origSize = chunkSize; remaining -= chunkSize; } else { chunkP[i].origSize = (int)remaining; remaining = 0; }
+              chunkP[i].compressedBuffer = out; out += maxCChunkSize;
+              chunkP[i].compressedSize = 0;
+          }
+      }
+
+      // Fill input buffer
+      DISPLAY("Loading %s...       \r", infilename);
+      readSize = fread(orig_buff, 1, benchedSize, fileIn);
+      fclose(fileIn);
+
+      if(readSize != benchedSize)
+      {
+        DISPLAY("\nError: problem reading file '%s' !!    \n", infilename);
+        free(chunkP);
+        free(orig_buff);
+        free(compressed_buff);
+        return 13;
+      }
+
+      // Calculating input Checksum
+      crcc = XXH32(orig_buff, (unsigned int)benchedSize,0);
+
+      // Bench
+      {
+        int loopNb, nb_loops, chunkNb, cAlgNb, dAlgNb;
+        size_t cSize=0;
+        double ratio=0.;
+
+        DISPLAY("\r%79s\r", "");
+        DISPLAY(" %s : \n", infilename);
+
+        // Compression Algorithms
+        for (cAlgNb=0; cAlgNb < NB_COMPRESSION_ALGORITHMS; cAlgNb++)
+        {
+            char* cName = compressionNames[cAlgNb];
+            int (*compressionFunction)(const char*, char*, int);
+            double bestTime = 100000000.;
+
+            switch(cAlgNb)
+            {
+            case 0: compressionFunction = LZ4_compress; break;
+            case 1: compressionFunction = LZ4_compressHC; break;
+            case 2: compressionFunction = local_LZ4_compressHC_limitedOutput; break;
+            case 3: compressionFunction = local_LZ4_compress_limitedOutput; break;
+            default : DISPLAY("ERROR ! Bad algorithm Id !! \n"); return 1;
+            }
+
+            for (loopNb = 1; loopNb <= nbIterations; loopNb++)
+            {
+                double averageTime;
+                int milliTime;
+
+                DISPLAY("%1i-%-19.19s : %9i ->\r", loopNb, cName, (int)benchedSize);
+                { size_t i; for (i=0; i<benchedSize; i++) compressed_buff[i]=(char)i; }     // warming up memory
+
+                nb_loops = 0;
+                milliTime = BMK_GetMilliStart();
+                while(BMK_GetMilliStart() == milliTime);
+                milliTime = BMK_GetMilliStart();
+                while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
+                {
+                    for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
+                    {
+                        chunkP[chunkNb].compressedSize = compressionFunction(chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origSize);
+                        if (chunkP[chunkNb].compressedSize==0) DISPLAY("ERROR ! %s() = 0 !! \n", cName), exit(1);
+                    }
+                    nb_loops++;
+                }
+                milliTime = BMK_GetMilliSpan(milliTime);
+
+                averageTime = (double)milliTime / nb_loops;
+                if (averageTime < bestTime) bestTime = averageTime;
+                cSize=0; for (chunkNb=0; chunkNb<nbChunks; chunkNb++) cSize += chunkP[chunkNb].compressedSize;
+                ratio = (double)cSize/(double)benchedSize*100.;
+                DISPLAY("%1i-%-19.19s : %9i -> %9i (%5.2f%%),%7.1f MB/s\r", loopNb, cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
+            }
+
+            if (ratio<100.)
+                DISPLAY("%-21.21s : %9i -> %9i (%5.2f%%),%7.1f MB/s\n", cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
+            else
+                DISPLAY("%-21.21s : %9i -> %9i (%5.1f%%),%7.1f MB/s\n", cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
+
+            totalCTime[cAlgNb] += bestTime;
+            totalCSize[cAlgNb] += cSize;
+        }
+
+        { size_t i; for (i=0; i<benchedSize; i++) orig_buff[i]=0; }     // zeroing area, for CRC checking
+
+        // Decompression Algorithms
+        for (dAlgNb=0; dAlgNb < NB_DECOMPRESSION_ALGORITHMS; dAlgNb++)
+        {
+            char* dName = decompressionNames[dAlgNb];
+            int (*decompressionFunction)(const char*, char*, int, int);
+            double bestTime = 100000000.;
+
+            switch(dAlgNb)
+            {
+            case 0: decompressionFunction = local_LZ4_decompress_fast; break;
+            case 1: decompressionFunction = local_LZ4_decompress_fast_withPrefix64k; break;
+            case 2: decompressionFunction = LZ4_decompress_safe; break;
+            case 3: decompressionFunction = LZ4_decompress_safe_withPrefix64k; break;
+            case 4: decompressionFunction = local_LZ4_decompress_safe_partial; break;
+            default : DISPLAY("ERROR ! Bad algorithm Id !! \n"); return 1;
+            }
+
+            for (loopNb = 1; loopNb <= nbIterations; loopNb++)
+            {
+                double averageTime;
+                int milliTime;
+
+                DISPLAY("%1i-%-19.19s : %9i ->\r", loopNb, dName, (int)benchedSize);
+
+                nb_loops = 0;
+                milliTime = BMK_GetMilliStart();
+                while(BMK_GetMilliStart() == milliTime);
+                milliTime = BMK_GetMilliStart();
+                while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
+                {
+                    for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
+                    {
+                        int decodedSize = decompressionFunction(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkP[chunkNb].origSize);
+                        if (chunkP[chunkNb].origSize != decodedSize) DISPLAY("ERROR ! %s() == %i != %i !! \n", dName, decodedSize, chunkP[chunkNb].origSize), exit(1);
+                    }
+                    nb_loops++;
+                }
+                milliTime = BMK_GetMilliSpan(milliTime);
+
+                averageTime = (double)milliTime / nb_loops;
+                if (averageTime < bestTime) bestTime = averageTime;
+
+                DISPLAY("%1i-%-19.19s : %9i -> %7.1f MB/s\r", loopNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000.);
+            }
+
+            // CRC Checking
+            crcd = XXH32(orig_buff, (int)benchedSize, 0);
+            if (crcc!=crcd) { DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", infilename, (unsigned)crcc, (unsigned)crcd); exit(1); }
+            DISPLAY("%-21.21s : %9i -> %7.1f MB/s\n", dName, (int)benchedSize, (double)benchedSize / bestTime / 1000.);
+
+            totalDTime[dAlgNb] += bestTime;
+        }
+
+        totals += benchedSize;
+      }
+
+      free(orig_buff);
+      free(compressed_buff);
+      free(chunkP);
+  }
+
+  if (nbFiles > 1)
+  {
+      int AlgNb;
+
+      DISPLAY(" TOTAL : \n");
+      for (AlgNb = 0; AlgNb < NB_COMPRESSION_ALGORITHMS; AlgNb ++)
+      {
+          char* cName = compressionNames[AlgNb];
+          DISPLAY("%-21.21s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s\n", cName, (long long unsigned int)totals, (long long unsigned int)totalCSize[AlgNb], (double)totalCSize[AlgNb]/(double)totals*100., (double)totals/totalCTime[AlgNb]/1000.);
+      }
+      for (AlgNb = 0; AlgNb < NB_DECOMPRESSION_ALGORITHMS; AlgNb ++)
+      {
+          char* dName = decompressionNames[AlgNb];
+          DISPLAY("%-21.21s :%10llu -> %6.1f MB/s\n", dName, (long long unsigned int)totals, (double)totals/totalDTime[AlgNb]/1000.);
+      }
+  }
+
+  if (BMK_pause) { DISPLAY("press enter...\n"); getchar(); }
+
+  return 0;
+}
+
+
+int usage(char* exename)
+{
+    DISPLAY( "Usage :\n"                                   // short help, emitted through a single DISPLAY call
+             "      %s [arg] file1 file2 ... fileX\n"
+             "Arguments :\n"
+             " -H     : Help (this text + advanced options)\n", exename);
+    return 0;                                              // always 0
+}
+
+int usage_advanced(void)
+{
+    // Advanced help, shown after usage() for -H; always returns 0. (-BD is accepted by main() but has no effect and is not listed.)
+    DISPLAY( "\nAdvanced options :\n");
+    DISPLAY( " -B#    : Block size [4-7](default : 7)\n");
+    DISPLAY( " -i#    : iteration loops [1-9](default : 6)\n");
+    return 0;
+}
+
+int badusage(char* exename)
+{
+    DISPLAY("Wrong parameters\n");   // error notice first, short help next
+    (void)usage(exename);
+    return 0;                        // callers in this file ignore the value and return 1 themselves
+}
+
+int main(int argc, char** argv)   // Parses the options, then benchmarks every argv entry from the first file name onwards
+{
+    int i,
+        filenamesStart=2;
+    char* exename=argv[0];
+    char* input_filename=0;
+
+    // Welcome message
+    DISPLAY( WELCOME_MESSAGE);
+
+    if (argc<2) { badusage(exename); return 1; }
+
+    for(i=1; i<argc; i++)
+    {
+        char* argument = argv[i];
+
+        if(!argument) continue;   // Protection if argument empty
+
+        // Decode command (note : aggregated commands are allowed)
+        if (argument[0]=='-')
+        {
+            while (argument[1]!=0)   // one option letter per pass; 'argument' is advanced onto it below
+            {
+                argument ++;
+
+                switch(argument[0])
+                {
+                    // Display help on usage
+                case 'H': usage(exename); usage_advanced(); return 0;
+
+                    // Modify Block Properties
+                case 'B':
+                    while (argument[1]!=0)   // consume every block-property character following 'B'
+                    switch(argument[1])
+                    {
+                    case '4':
+                    case '5':
+                    case '6':
+                    case '7':
+                    {   // block size code
+                        int B = argument[1] - '0';   // digit -> integer
+                        int S = 1 << (8 + 2*B);   // 4 -> 64 KB, 5 -> 256 KB, 6 -> 1 MB, 7 -> 4 MB
+                        BMK_SetBlocksize(S);   // also prints the selected size
+                        argument++;
+                        break;
+                    }
+                    case 'D': argument++; break;   // accepted, but only skipped : no setting is changed
+                    default : goto _exit_blockProperties;   // not a block property : leave it to the outer option loop
+                    }
+_exit_blockProperties:
+                    break;
+
+                    // Modify Nb Iterations (benchmark only)
+                case 'i':   // takes effect only when followed by a digit 1-9
+                    if ((argument[1] >='1') && (argument[1] <='9'))
+                    {
+                        int iters = argument[1] - '0';   // digit -> integer
+                        BMK_SetNbIterations(iters);   // also prints the selected count
+                        argument++;
+                    }
+                    break;
+
+                    // Pause at the end (benchmark only) (hidden option)
+                case 'p': BMK_SetPause(); break;
+
+                    // Unrecognised command
+                default : badusage(exename); return 1;
+                }
+            }
+            continue;
+        }
+
+        // first provided filename is input
+        if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }   // later arguments do not move filenamesStart
+
+    }
+
+    // No input filename ==> Error
+    if(!input_filename) { badusage(exename); return 1; }
+
+    return fullSpeedBench(argv+filenamesStart, argc-filenamesStart);   // every remaining argv entry is treated as a file name
+
+}
+
diff --git a/fuzzer.c b/fuzzer.c
index c8bb5d9..44ca885 100644
--- a/fuzzer.c
+++ b/fuzzer.c
@@ -1,7 +1,7 @@
 /*
     fuzzer.c - Fuzzer test tool for LZ4
-    Copyright (C) Andrew Mahone - Yann Collet 2012-2013
-    Original code by Andrew Mahone / Modified by Yann Collet
+    Copyright (C) Yann Collet - Andrew Mahone 2012-2013
+    Code started by Andrew Mahone, modified by Yann Collet
     GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
@@ -56,25 +56,25 @@
 #define PRIME3   3266489917U
 
 
-
 //*********************************************************
 //  Functions
 //*********************************************************
 static int FUZ_GetMilliStart()
 {
-  struct timeb tb;
-  int nCount;
-  ftime( &tb );
-  nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000);
-  return nCount;
+   struct timeb tb;
+   int nCount;
+   ftime( &tb );
+   nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000);
+   return nCount;
 }
 
+
 static int FUZ_GetMilliSpan( int nTimeStart )
 {
-  int nSpan = FUZ_GetMilliStart() - nTimeStart;
-  if ( nSpan < 0 )
-    nSpan += 0x100000 * 1000;
-  return nSpan;
+   int nSpan = FUZ_GetMilliStart() - nTimeStart;
+   if ( nSpan < 0 )
+      nSpan += 0x100000 * 1000;
+   return nSpan;
 }
 
 
@@ -85,14 +85,16 @@ unsigned int FUZ_rand(unsigned int* src)
 }
 
 
-int test_canary(unsigned char *buf) {
-        int i;
-        for (i = 0; i < 2048; i++)
-                if (buf[i] != buf[i + 2048])
-                        return 0;
-        return 1;
+int test_canary(unsigned char *buf)
+{
+    int i;
+    for (i = 0; i < 2048; i++)
+        if (buf[i] != buf[i + 2048])
+            return 0;
+    return 1;
 }
 
+
 int FUZ_SecurityTest()
 {
   char* output;
@@ -127,9 +129,10 @@ int main() {
 #       define FUZ_avail ROUND_PAGE(FUZ_max)
         const int off_full = FUZ_avail - FUZ_max;
         unsigned char cbuf[FUZ_avail + PAGE_SIZE];
-        unsigned int seed, cur_seq=PRIME3, seeds[NUM_SEQ], timestamp=FUZ_GetMilliStart();
-        int i, j, k, ret, len, lenHC;
+        unsigned int seed, randState, cur_seq=PRIME3, seeds[NUM_SEQ], timestamp=FUZ_GetMilliStart();
+        int i, j, k, ret, len, lenHC, attemptNb;
         char userInput[30] = {0};
+#       define FUZ_CHECKTEST(cond, message) testNb++; if (cond) { printf("Test %i : %s : seed %u, cycle %u \n", testNb, message, seed, attemptNb); goto _output_error; }
 
         printf("starting LZ4 fuzzer\n");
         printf("Select an Initialisation number (default : random) : ");
@@ -140,96 +143,108 @@ int main() {
             else seed = FUZ_GetMilliSpan(timestamp);
         }
         printf("Seed = %u\n", seed);
+        randState = seed;
 
         FUZ_SecurityTest();
 
         for (i = 0; i < 2048; i++)
-                cbuf[FUZ_avail + i] = cbuf[FUZ_avail + 2048 + i] = FUZ_rand(&seed) >> 16;
+                cbuf[FUZ_avail + i] = cbuf[FUZ_avail + 2048 + i] = FUZ_rand(&randState) >> 16;
 
-        for (i = 0; i < NB_ATTEMPTS; i++) 
+        for (attemptNb = 0; attemptNb < NB_ATTEMPTS; attemptNb++) 
         {
-            printf("\r%7i /%7i\r", i, NB_ATTEMPTS);
+            int testNb = 0;
+
+            printf("\r%7i /%7i\r", attemptNb, NB_ATTEMPTS);
             
-            FUZ_rand(&seed);
             for (j = 0; j < NUM_SEQ; j++) {
-                    seeds[j] = FUZ_rand(&seed) << 8;
-                    seeds[j] ^= (FUZ_rand(&seed) >> 8) & 65535;
+                    seeds[j] = FUZ_rand(&randState) << 8;
+                    seeds[j] ^= (FUZ_rand(&randState) >> 8) & 65535;
             }
             for (j = 0; j < LEN; j++) {
-                    k = FUZ_rand(&seed);
+                    k = FUZ_rand(&randState);
                     if (j == 0 || NEW_SEQ(k))
-                            cur_seq = seeds[(FUZ_rand(&seed) >> 16) & SEQ_MSK];
+                            cur_seq = seeds[(FUZ_rand(&randState) >> 16) & SEQ_MSK];
                     if (MOD_SEQ(k)) {
-                            k = (FUZ_rand(&seed) >> 16) & SEQ_MSK;
-                            seeds[k] = FUZ_rand(&seed) << 8;
-                            seeds[k] ^= (FUZ_rand(&seed) >> 8) & 65535;
+                            k = (FUZ_rand(&randState) >> 16) & SEQ_MSK;
+                            seeds[k] = FUZ_rand(&randState) << 8;
+                            seeds[k] ^= (FUZ_rand(&randState) >> 8) & 65535;
                     }
                     buf[j] = FUZ_rand(&cur_seq) >> 16;
             }
 
             // Test compression HC
             ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[off_full], LEN, FUZ_max);
-            if (ret == 0) { printf("HC compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "HC compression failed despite sufficient space");
             lenHC = ret;
 
             // Test compression
             ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[off_full], LEN, FUZ_max);
-            if (ret == 0) { printf("compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "compression failed despite sufficient space");
             len = ret;
 
             // Test decoding with output size being exactly what's necessary => must work
             ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN);
-            if (ret<0) { printf("decompression failed despite correct space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite correct space");
 
             // Test decoding with one byte missing => must fail
             ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN-1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to Output Size being too small");
 
             // Test decoding with one byte too much => must fail
             ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN+1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to Output Size being too large");
 
             // Test decoding with enough output size => must work
             ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN+1);
-            if (ret<0) { printf("decompression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite sufficient space");
 
             // Test decoding with output size being exactly what's necessary => must work
             ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN);
-            if (ret<0) { printf("decompression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite sufficient space");
 
             // Test decoding with output size being one byte too short => must fail
             ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN-1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe should have failed, due to Output Size being one byte too short");
 
             // Test decoding with input size being one byte too short => must fail
             ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len-1, LEN);
-            if (ret>=0) { printf("decompression should have failed, due to input size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe should have failed, due to input size being one byte too short");
 
             // Test decoding with input size being one byte too large => must fail
             ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len+1, LEN);
-            if (ret>=0) { printf("decompression should have failed, due to input size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to input size being too large");
+            //if (ret>=0) { printf("Test 10 : decompression should have failed, due to input size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
+
+            // Test partial decoding with target output size being max/2 => must work
+            ret = LZ4_decompress_safe_partial((char*)&cbuf[off_full], (char*)testOut, len, LEN/2, LEN);
+            FUZ_CHECKTEST(ret<0, "partial decompression failed despite sufficient space");
+
+            // Test partial decoding with target output size being just below max => must work
+            ret = LZ4_decompress_safe_partial((char*)&cbuf[off_full], (char*)testOut, len, LEN-3, LEN);
+            FUZ_CHECKTEST(ret<0, "partial decompression failed despite sufficient space");
 
             // Test compression with output size being exactly what's necessary (should work)
             ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-len], LEN, len);
-            if (!test_canary(&cbuf[FUZ_avail])) { printf("compression overran output buffer: seed %u, len %d, olen %d\n", seed, LEN, len); goto _output_error; }
-            if (ret == 0) { printf("compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(!test_canary(&cbuf[FUZ_avail]), "compression overran output buffer");
+            FUZ_CHECKTEST(ret==0, "compression failed despite sufficient space");
 
             // Test HC compression with output size being exactly what's necessary (should work)
             ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-len], LEN, lenHC);
-            if (ret == 0) { printf("HC compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "HC compression failed despite sufficient space");
 
             // Test compression with just one missing byte into output buffer => must fail
             ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-(len-1)], LEN, len-1);
-            if (ret) { printf("compression overran output buffer: seed %u, len %d, olen %d => ret %d", seed, LEN, len-1, ret); goto _output_error; }
-            if (!test_canary(&cbuf[FUZ_avail])) { printf("compression overran output buffer: seed %u, len %d, olen %d", seed, LEN, len-1); goto _output_error; }
+            FUZ_CHECKTEST(ret, "compression overran output buffer");
+            FUZ_CHECKTEST(!test_canary(&cbuf[FUZ_avail]), "compression overran output buffer");
 
             // Test HC compression with just one missing byte into output buffer => must fail
             ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-(len-1)], LEN, lenHC-1);
-            if (ret) { printf("HC compression overran output buffer: seed %u, len %d, olen %d => ret %d", seed, LEN, lenHC-1, ret); goto _output_error; }
+            FUZ_CHECKTEST(ret, "HC compression overran output buffer");
 
             bytes += LEN;
             cbytes += len;
             hcbytes += lenHC;
+            FUZ_rand(&randState);
         }
 
         printf("all tests completed successfully \n");
diff --git a/lz4.c b/lz4.c
index 327227e..91819ad 100644
--- a/lz4.c
+++ b/lz4.c
@@ -32,7 +32,7 @@
 */
 
 /*
-Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
+Note : this source file requires "lz4_encoder.h"
 */
 
 //**************************************
@@ -64,9 +64,10 @@ Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
 // CPU Feature Detection
 //**************************************
 // 32 or 64 bits ?
-#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) \
-  || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) \
-  || defined(__ia64__) )   // Detects 64 bits mode
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+  || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
+  || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
+  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) )   // Detects 64 bits mode
 #  define LZ4_ARCH64 1
 #else
 #  define LZ4_ARCH64 0
@@ -82,7 +83,7 @@ Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define LZ4_BIG_ENDIAN 1
 #elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+   || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
    || defined(__hpux)  || defined(__hppa) \
    || defined(_MIPSEB) || defined(__s390__)
 #  define LZ4_BIG_ENDIAN 1
@@ -218,7 +219,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 //**************************************
 // Architecture-specific macros
 //**************************************
-#if LZ4_ARCH64	// 64-bit
+#if LZ4_ARCH64   // 64-bit
 #  define STEPSIZE 8
 #  define UARCH U64
 #  define AARCH A64
@@ -227,7 +228,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #  define LZ4_SECURECOPY(s,d,e)   if (d<e) LZ4_WILDCOPY(s,d,e)
 #  define HTYPE                   U32
 #  define INITBASE(base)          const BYTE* const base = ip
-#else		// 32-bit
+#else      // 32-bit
 #  define STEPSIZE 4
 #  define UARCH U32
 #  define AARCH A32
@@ -241,7 +242,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
 #  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
 #  define LZ4_WRITE_LITTLEENDIAN_16(p,i)  { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
-#else		// Little Endian
+#else      // Little Endian
 #  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
 #  define LZ4_WRITE_LITTLEENDIAN_16(p,v)  { A16(p) = v; p+=2; }
 #endif
@@ -522,88 +523,168 @@ int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, in
 // Decompression functions
 //****************************
 
-/*
-int LZ4_decompress_safe(const char* source,
-                        char* dest,
-                        int inputSize,
-                        int maxOutputSize);
-
-LZ4_decompress_safe() guarantees it will never write nor read outside of the provided output buffers.
-This function is safe against "buffer overflow" attacks.
-A corrupted input will produce an error result, a negative int.
-*/
-#define FUNCTION_NAME LZ4_decompress_safe
-#define EXITCONDITION_INPUTSIZE
-#include "lz4_decoder.h"
+typedef enum { noPrefix = 0, withPrefix = 1 } prefix64k_directive;
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } end_directive;
+typedef enum { full = 0, partial = 1 } exit_directive;
 
 
-/*
-int LZ4_decompress_safe_withPrefix64k(
-                        const char* source,
-                        char* dest,
-                        int inputSize,
-                        int maxOutputSize);
+// This generic decompression function covers all use cases.
+// It shall be instantiated several times, using different sets of directives
+// Note that it is essential this generic function is really inlined, 
+// in order to remove useless branches during compilation optimisation.
+static inline int LZ4_decompress_generic(
+                 const char* source,
+                 char* dest,
+                 int inputSize,          //
+                 int outputSize,         // OutputSize must be != 0; if endOnInput==endOnInputSize, this value is the max size of Output Buffer.
 
-Same as LZ4_decompress_safe(), but will also use 64K of memory before the beginning of input buffer.
-Typically used to decode streams of inter-dependant blocks.
-Note : the 64K of memory before pointer 'source' must be allocated and read-allowed.
-*/
-#define FUNCTION_NAME LZ4_decompress_safe_withPrefix64k
-#define EXITCONDITION_INPUTSIZE
-#define PREFIX_64K
-#include "lz4_decoder.h"
+                 int endOnInput,         // endOnOutputSize, endOnInputSize
+                 int prefix64k,          // noPrefix, withPrefix
+                 int partialDecoding,    // full, partial
+                 int targetOutputSize    // only used if partialDecoding==partial
+                 )
+{
+    // Local Variables
+    const BYTE* restrict ip = (const BYTE*) source;
+    const BYTE* ref;
+    const BYTE* const iend = ip + inputSize;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const oend = op + outputSize;
+    BYTE* cpy;
+    BYTE* oexit = op + targetOutputSize;
+
+    size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+    size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+#endif
 
 
-/*
-int LZ4_decompress_safe_partial(
-                        const char* source,
-                        char* dest,
-                        int inputSize,
-                        int targetOutputSize,
-                        int maxOutputSize);
-
-LZ4_decompress_safe_partial() objective is to decompress only a part of the compressed input block provided.
-The decoding process stops as soon as 'targetOutputSize' bytes have been decoded, reducing decoding time.
-The result of the function is the number of bytes decoded.
-LZ4_decompress_safe_partial() may decode less than 'targetOutputSize' if input doesn't contain enough bytes to decode.
-Always verify how many bytes were decoded to ensure there are as many as wanted into the output buffer 'dest'.
-A corrupted input will produce an error result, a negative int.
-*/
-#define FUNCTION_NAME LZ4_decompress_safe_partial
-#define EXITCONDITION_INPUTSIZE
-#define PARTIAL_DECODING
-#include "lz4_decoder.h"
+    // Special case
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;   // targetOutputSize too large, better decode everything
+    if unlikely(outputSize==0) goto _output_error;                          // Empty output buffer
 
 
-/*
-int LZ4_decompress_fast(const char* source,
-                        char* dest,
-                        int outputSize);
+    // Main Loop
+    while (1)
+    {
+        unsigned token;
+        size_t length;
 
-This function is faster than LZ4_decompress_safe().
-LZ4_decompress_fast() guarantees it will never write nor read outside of output buffer.
-Since LZ4_decompress_fast() doesn't know the size of input buffer.
-it can only guarantee that it will never write into the input buffer, and will never read before its beginning.
-To be used preferably in a controlled environment (when the compressed data to be decoded is from a trusted source).
-A detected corrupted input will produce an error result, a negative int.
-*/
-#define FUNCTION_NAME LZ4_decompress_fast
-#include "lz4_decoder.h"
+        // get runlength
+        token = *ip++;
+        if ((length=(token>>ML_BITS)) == RUN_MASK)  
+        { 
+            unsigned s=255; 
+            while (((endOnInput)?ip<iend:1) && (s==255)) 
+            { 
+                s = *ip++; 
+                length += s; 
+            } 
+        }
+
+        // copy literals
+        cpy = op+length;
+        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+        {
+            if (partialDecoding)
+            {
+                if (cpy > oend) goto _output_error;                            // Error : write attempt beyond end of output buffer
+                if ((endOnInput) && (ip+length > iend)) goto _output_error;    // Error : read attempt beyond end of input buffer
+            }
+            else
+            {
+                if ((!endOnInput) && (cpy != oend)) goto _output_error;        // Error : block decoding must stop exactly there, due to parsing restrictions
+                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   // Error : not enough place for another match (min 4) + 5 literals
+            }
+            memcpy(op, ip, length);
+            ip += length;
+            op += length;
+            break;                                       // Necessarily EOF, due to parsing restrictions
+        }
+        LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy;
+
+        // get offset
+        LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
+        if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error;   // Error : offset outside destination buffer
+
+        // get matchlength
+        if ((length=(token&ML_MASK)) == ML_MASK) 
+        { 
+            while (endOnInput ? ip<iend-(LASTLITERALS+1) : 1)    // A minimum nb of input bytes must remain for LASTLITERALS + token
+            { 
+                unsigned s = *ip++; 
+                length += s; 
+                if (s==255) continue; 
+                break; 
+            } 
+        }
+
+        // copy repeated sequence
+        if unlikely((op-ref)<STEPSIZE)
+        {
+#if LZ4_ARCH64
+            size_t dec64 = dec64table[op-ref];
+#else
+            const size_t dec64 = 0;
+#endif
+            op[0] = ref[0];
+            op[1] = ref[1];
+            op[2] = ref[2];
+            op[3] = ref[3];
+            op += 4, ref += 4; ref -= dec32table[op-ref];
+            A32(op) = A32(ref); 
+            op += STEPSIZE-4; ref -= dec64;
+        } else { LZ4_COPYSTEP(ref,op); }
+        cpy = op + length - (STEPSIZE-4);
+
+        if unlikely(cpy>oend-(COPYLENGTH)-(STEPSIZE-4))
+        {
+            if (cpy > oend-LASTLITERALS) goto _output_error;    // Error : last 5 bytes must be literals
+            LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH));
+            while(op<cpy) *op++=*ref++;
+            op=cpy;
+            continue;
+        }
+        LZ4_WILDCOPY(ref, op, cpy);
+        op=cpy;   // correction
+    }
+
+    // end of decoding
+    if (endOnInput)
+       return (int) (((char*)op)-dest);     // Nb of output bytes decoded
+    else
+       return (int) (((char*)ip)-source);   // Nb of input bytes read
+
+    // Overflow error detected
+_output_error:
+    return (int) (-(((char*)ip)-source))-1;
+}
 
 
-/*
-int LZ4_decompress_fast_withPrefix64k(
-                        const char* source,
-                        char* dest,
-                        int inputSize
-                        int maxOutputSize);
+int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, full, 0);
+}
 
-Same as LZ4_decompress_fast(), but will use the 64K of memory before the beginning of input buffer.
-Typically used to decode streams of dependant inter-blocks.
-Note : the 64K of memory before pointer 'source' must be allocated and read-allowed.
-*/
-#define FUNCTION_NAME LZ4_decompress_fast_withPrefix64k
-#define PREFIX_64K
-#include "lz4_decoder.h"
+int LZ4_decompress_fast(const char* source, char* dest, int outputSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, noPrefix, full, 0);
+}
 
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, withPrefix, full, 0);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int outputSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, partial, targetOutputSize);
+}
 
diff --git a/lz4.h b/lz4.h
index c303e29..ec6a64f 100644
--- a/lz4.h
+++ b/lz4.h
@@ -42,7 +42,7 @@ extern "C" {
 // Compiler Options
 //**************************************
 #if defined(_MSC_VER) && !defined(__cplusplus)   // Visual Studio
-#  define inline __inline           // Visual is not C99, but supports some kind of inline
+#  define inline __forceinline           // Visual C is not C99, but supports some kind of inline. Note : we *do* want to force inline
 #endif
 
 
@@ -66,8 +66,8 @@ LZ4_compress() :
 LZ4_decompress_safe() :
     maxOutputSize : is the size of the destination buffer (which must be already allocated)
     return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
-             If the source stream is malformed, the function will stop decoding and return a negative result.
-             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+             If the source stream is malformed or too large, the function will stop decoding and return a negative result.
+             This function is protected against any kind of buffer overflow attempts (never writes outside of output buffer, and never reads outside of input buffer). It is therefore protected against malicious data packets
 */
 
 
@@ -150,8 +150,8 @@ int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outpu
 // Obsolete Functions
 //****************************
 
-static inline int LZ4_uncompress (const char* source, char* dest, int outputSize)   { return LZ4_decompress_fast(source, dest, outputSize); }
-static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize)   { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
 
 /* 
 These functions are deprecated and should no longer be used.
diff --git a/lz4_decoder.h b/lz4_decoder.h
deleted file mode 100644
index e7c04a0..0000000
--- a/lz4_decoder.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
-   LZ4 Decoder - Part of LZ4 compression algorithm
-   Copyright (C) 2011-2013, Yann Collet.
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
-   - LZ4 source repository : http://code.google.com/p/lz4/
-*/
-
-/* lz4_decoder.h must be included into lz4.c
-   The objective of this file is to create a single LZ4 decoder function source
-   which will be instanciated multiple times with minor variations
-   depending on a set of #define.
-*/
-
-
-
-//****************************
-// Check required defines
-//****************************
-
-#ifndef FUNCTION_NAME
-#  error "FUNTION_NAME is not defined"
-#endif
-
-
-//****************************
-// Control tests
-//****************************
-
-#ifdef EXITCONDITION_INPUTSIZE
-#  define INPUTBUFFER_CONTROL(ip,iend) likely(ip<iend)
-#else
-#  define INPUTBUFFER_CONTROL(ip,iend) (1)
-#endif
-
-#ifdef PARTIAL_DECODING
-#  define OUTPUTTARGET(cpy,oexit) (cpy >= oexit)
-#else
-#  define OUTPUTTARGET(cpy,oexit) (0)
-#endif
-
-
-
-
-//****************************
-// Function code
-//****************************
-
-int FUNCTION_NAME(const char* source,
-                 char* dest,
-#ifdef EXITCONDITION_INPUTSIZE
-                 int inputSize,
-#endif
-#ifdef PARTIAL_DECODING
-                 int targetOutputSize,
-#endif
-                 int outputSize
-                 )
-{
-    // Local Variables
-    const BYTE* restrict ip = (const BYTE*) source;
-    const BYTE* ref;
-#ifdef EXITCONDITION_INPUTSIZE
-    const BYTE* const iend = ip + inputSize;
-#endif
-
-    BYTE* op = (BYTE*) dest;
-    BYTE* const oend = op + outputSize;
-    BYTE* cpy;
-#ifdef PARTIAL_DECODING
-    BYTE* const oexit = op + targetOutputSize;
-#endif
-
-    size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
-    size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
-#endif
-
-
-#ifdef EXITCONDITION_INPUTSIZE
-    // Special case
-    if unlikely(!inputSize) goto _output_error;     // A correctly formed null-compressed LZ4 must have at least one byte (token=0)
-#endif
-
-    // Main Loop
-    while (1)
-    {
-        unsigned token;
-        size_t length;
-
-        // get runlength
-        token = *ip++;
-        if ((length=(token>>ML_BITS)) == RUN_MASK)  
-        { 
-            unsigned s=255; 
-            while (INPUTBUFFER_CONTROL(ip,iend) && (s==255)) 
-            { 
-                s=*ip++; 
-                length += s; 
-            } 
-        }
-
-        // copy literals
-        cpy = op+length;
-#ifdef EXITCONDITION_INPUTSIZE
-        if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS)) || OUTPUTTARGET(cpy,oexit))
-        {
-            if (cpy > oend) goto _output_error;          // Error : write attempt beyond end of output buffer
-            if ((!OUTPUTTARGET(cpy,oexit)) && (ip+length != iend)) goto _output_error;   // Error : Must consume all input at this stage, except if reaching TargetOutputSize
-#else
-        if (cpy>oend-COPYLENGTH)
-        {
-            if (cpy != oend) goto _output_error;         // Error : not enough place for another match (min 4) + 5 literals
-#endif
-            memcpy(op, ip, length);
-            ip += length;
-            op += length;
-            break;                                       // Necessarily EOF, due to parsing restrictions
-        }
-        LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy;
-
-        // get offset
-        LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
-#ifndef PREFIX_64K
-        if unlikely(ref < (BYTE* const)dest) goto _output_error;   // Error : offset outside destination buffer
-#endif
-
-        // get matchlength
-        if ((length=(token&ML_MASK)) == ML_MASK) 
-        { 
-            while INPUTBUFFER_CONTROL(ip,iend-(LASTLITERALS+1))    // A minimum nb of input bytes must remain for LASTLITERALS + token
-            { 
-                unsigned s = *ip++; 
-                length += s; 
-                if (s==255) continue; 
-                break; 
-            } 
-        }
-
-        // copy repeated sequence
-        if unlikely((op-ref)<STEPSIZE)
-        {
-#if LZ4_ARCH64
-            size_t dec64 = dec64table[op-ref];
-#else
-            const size_t dec64 = 0;
-#endif
-            op[0] = ref[0];
-            op[1] = ref[1];
-            op[2] = ref[2];
-            op[3] = ref[3];
-            op += 4, ref += 4; ref -= dec32table[op-ref];
-            A32(op) = A32(ref); 
-            op += STEPSIZE-4; ref -= dec64;
-        } else { LZ4_COPYSTEP(ref,op); }
-        cpy = op + length - (STEPSIZE-4);
-
-        if unlikely(cpy>oend-(COPYLENGTH)-(STEPSIZE-4))
-        {
-            if (cpy > oend-LASTLITERALS) goto _output_error;    // Error : last 5 bytes must be literals
-            LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH));
-            while(op<cpy) *op++=*ref++;
-            op=cpy;
-            continue;
-        }
-        
-        LZ4_WILDCOPY(ref, op, cpy);
-        op=cpy;		// correction
-    }
-
-    // end of decoding
-#ifdef EXITCONDITION_INPUTSIZE
-    return (int) (((char*)op)-dest);     // Nb of output bytes decoded
-#else
-    return (int) (((char*)ip)-source);   // Nb of input bytes read
-#endif
-
-    // Overflow error detected
-_output_error:
-    return (int) (-(((char*)ip)-source))-1;
-}
-
-
-
-//****************************
-// Clean defines
-//****************************
-
-// Required defines
-#undef FUNCTION_NAME
-
-// Locally Generated
-#undef INPUTBUFFER_CONTROL
-#undef OUTPUTTARGET
-
-// Optional defines
-#ifdef EXITCONDITION_INPUTSIZE
-#undef EXITCONDITION_INPUTSIZE
-#endif
-
-#ifdef PREFIX_64K
-#undef PREFIX_64K
-#endif
-
-#ifdef PARTIAL_DECODING
-#undef PARTIAL_DECODING
-#endif
-
diff --git a/lz4hc.c b/lz4hc.c
index a29eb4b..862f48d 100644
--- a/lz4hc.c
+++ b/lz4hc.c
@@ -50,9 +50,10 @@ Note : this source file requires "lz4hc_encoder.h"
 // CPU Feature Detection
 //**************************************
 // 32 or 64 bits ?
-#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) \
-  || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) \
-  || defined(__ia64__) )   // Detects 64 bits mode
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+  || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
+  || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
+  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) )   // Detects 64 bits mode
 #  define LZ4_ARCH64 1
 #else
 #  define LZ4_ARCH64 0
@@ -68,7 +69,7 @@ Note : this source file requires "lz4hc_encoder.h"
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define LZ4_BIG_ENDIAN 1
 #elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+   || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
    || defined(__hpux)  || defined(__hppa) \
    || defined(_MIPSEB) || defined(__s390__)
 #  define LZ4_BIG_ENDIAN 1
@@ -99,8 +100,7 @@ Note : this source file requires "lz4hc_encoder.h"
 #endif
 
 #ifdef _MSC_VER
-#  define inline __inline             // Visual is not C99, but supports some kind of inline
-#  define forceinline __forceinline   
+#  define forceinline __forceinline
 #  include <intrin.h>                 // For Visual 2005
 #  if LZ4_ARCH64	// 64-bit
 #    pragma intrinsic(_BitScanForward64) // For Visual 2005
@@ -205,6 +205,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #define MB *(1U<<20)
 #define GB *(1U<<30)
 
+
 //**************************************
 // Architecture-specific macros
 //**************************************
diff --git a/xxhash.c b/xxhash.c
index 3c5f560..6dacdcb 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -35,6 +35,14 @@ You can contact the author at :
 //**************************************
 // Tuning parameters
 //**************************************
+// Unaligned memory access is automatically enabled for "common" CPUs, such as x86.
+// For other CPUs, the compiler will be more cautious and insert extra code to ensure that aligned access is respected.
+// If you know your target CPU supports unaligned memory access, you may want to force this option manually to improve performance.
+// You can also enable this parameter if you know your input data will always be aligned (on 4-byte boundaries, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
 // XXH_ACCEPT_NULL_INPUT_POINTER :
 // If the input pointer is a null pointer, xxHash default behavior is to crash, since it is a bad input.
 // If this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
@@ -45,21 +53,33 @@ You can contact the author at :
 // XXH_FORCE_NATIVE_FORMAT :
 // By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
 // Results are therefore identical for little-endian and big-endian CPU.
-// This comes at a  performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance to your application, you may uncomment the #define below
+// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+// Should endian-independence be of no importance for your application, you may uncomment the #define below.
 // It will improve speed for Big-endian CPU.
 // This option has no impact on Little_Endian CPU.
 //#define XXH_FORCE_NATIVE_FORMAT 1
 
 
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(_MSC_VER) && !defined(__cplusplus)   // Visual Studio
+#  define inline __inline           // Visual C is not C99, but supports some kind of inline
+#endif
+
 
 //**************************************
-// Includes
+// Includes & Memory related functions
 //**************************************
-#include <stdlib.h>    // for malloc(), free()
-#include <string.h>    // for memcpy()
 #include "xxhash.h"
-
+// Modify the local functions below should you wish to use different memory-related routines
+// for malloc(), free()
+#include <stdlib.h>
+static inline void* XXH_malloc(size_t s) { return malloc(s); }
+static inline void  XXH_free  (void* p)  { free(p); }
+// for memcpy()
+#include <string.h>
+static inline void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
 
 
 //**************************************
@@ -77,8 +97,8 @@ You can contact the author at :
 #  endif
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define XXH_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
-    || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+#elif defined(__sparc)  || defined(__sparc__) \
+    || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
     || defined(__hpux)  || defined(__hppa) \
     || defined(_MIPSEB) || defined(__s390__)
 #  define XXH_BIG_ENDIAN 1
@@ -101,21 +121,39 @@ You can contact the author at :
   typedef  int32_t S32;
   typedef uint64_t U64;
 #else
-  typedef unsigned char       BYTE;
-  typedef unsigned short      U16;
-  typedef unsigned int        U32;
-  typedef   signed int        S32;
-  typedef unsigned long long  U64;
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
 #endif
 
+#if defined(__GNUC__)  && !defined(XXH_USE_UNALIGNED_ACCESS)
+#  define _PACKED __attribute__ ((packed))
+#else
+#  define _PACKED
+#endif
 
-//**************************************
-// Compiler-specific Options & Functions
-//**************************************
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  pragma pack(push, 1)
+#endif
+
+typedef struct _U32_S { U32 v; } _PACKED U32_S;
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  pragma pack(pop)
+#endif
+
+#define A32(x) (((U32_S *)(x))->v)
+
+
+//***************************************
+// Compiler-specific Functions and Macros
+//***************************************
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 
-// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
-#if defined(_WIN32)
+// Note : although _rotl exists for minGW (GCC under windows), performance seems poor
+#if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
 #else
 #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
@@ -147,7 +185,9 @@ static inline U32 XXH_swap32 (U32 x) {
 //**************************************
 // Macros
 //**************************************
-#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(U32*)(p)) : *(U32*)(p))
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; }    // use only *after* variable declarations
+#define XXH_LE32(p)          (XXH_BIG_ENDIAN ? XXH_swap32(A32(p))     : A32(p))
+#define XXH_alignedLE32(p)   (XXH_BIG_ENDIAN ? XXH_swap32(*(U32*)(p)) : *(U32*)(p))
 
 
 
@@ -155,6 +195,53 @@ static inline U32 XXH_swap32 (U32 x) {
 // Simple Hash Functions
 //****************************
 
+#if !defined(XXH_USE_UNALIGNED_ACCESS)
+// Variant of XXH32() for input whose address is 4-byte aligned: 32-bit words are read by direct U32 dereference (XXH_alignedLE32). Not needed on CPUs supporting unaligned access. Returns the 32-bit hash of the 'len' bytes at 'input', mixed with 'seed'.
+static U32 XXH32_alignedInput(const void* input, int len, U32 seed)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;   // one past the last input byte
+    U32 h32;
+
+    if (len>=16)   // at least one full 16-byte stripe : run the 4-accumulator loop
+    {
+        const BYTE* const limit = bEnd - 16;   // last position from which a full 16-byte stripe can still be read
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+        do   // each iteration consumes 16 bytes, one 32-bit word per accumulator
+        {
+            v1 += XXH_alignedLE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_alignedLE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_alignedLE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_alignedLE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);   // merge the four accumulators
+    }
+    else { h32  = seed + PRIME32_5; }   // short input : accumulators are not used
+    h32 += (U32) len;   // fold the input length into the hash
+    while (p<=bEnd-4)   // remaining whole 32-bit words
+    {
+        h32 += XXH_alignedLE32(p) * PRIME32_3;
+        h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+    while (p<bEnd)   // trailing bytes, one at a time
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+    h32 ^= h32 >> 15;   // final mixing of the hash bits (shift-xor / multiply rounds)
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+    return h32;
+}
+#endif
+
 U32 XXH32(const void* input, int len, U32 seed)
 {
 #if 0
@@ -172,6 +259,10 @@ U32 XXH32(const void* input, int len, U32 seed)
     if (p==NULL) { len=0; p=(const BYTE*)16; }
 #endif
 
+#if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)p) & 3) == 0) return XXH32_alignedInput(input, len, seed);   // Input is 4-byte aligned, let's leverage the speed advantage. Cast through size_t (not U32) : a pointer-to-U32 cast truncates on 64-bit targets and fails to compile as C++
+#endif
+
     if (len>=16)
     {
         const BYTE* const limit = bEnd - 16;
@@ -229,21 +320,25 @@ U32 XXH32(const void* input, int len, U32 seed)
 
 struct XXH_state32_t
 {
+    U64 total_len;
     U32 seed;
     U32 v1;
     U32 v2;
     U32 v3;
     U32 v4;
-    U64 total_len;
-    char memory[16];
     int memsize;
+    char memory[16];
 };
 
 
-int XXH32_sizeofState() { return sizeof(struct XXH_state32_t); }
+int XXH32_sizeofState() 
+{
+    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here means XXH32_SIZEOFSTATE is not large enough
+    return sizeof(struct XXH_state32_t); 
+}
 
 
-XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed)
+XXH_errorcode XXH32_resetState(void* state_in, U32 seed)
 { 
     struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
     state->seed = seed;
@@ -253,15 +348,15 @@ XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed)
     state->v4 = seed - PRIME32_1;
     state->total_len = 0;
     state->memsize = 0;
-    return OK;
+    return XXH_OK;
 }
 
 
 void* XXH32_init (U32 seed)
 {
-    struct XXH_state32_t * state = (struct XXH_state32_t *) malloc (sizeof(struct XXH_state32_t));
+    void* state = XXH_malloc (sizeof(struct XXH_state32_t));
     XXH32_resetState(state, seed);
-    return (void*)state;
+    return state;
 }
 
 
@@ -279,14 +374,14 @@ XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
 
     if (state->memsize + len < 16)   // fill in tmp buffer
     {
-        memcpy(state->memory + state->memsize, input, len);
+        XXH_memcpy(state->memory + state->memsize, input, len);
         state->memsize +=  len;
-        return OK;
+        return XXH_OK;
     }
 
     if (state->memsize)   // some data left from previous update
     {
-        memcpy(state->memory + state->memsize, input, 16-state->memsize);
+        XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize);
         {
             const U32* p32 = (const U32*)state->memory;
             state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
@@ -322,11 +417,11 @@ XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
 
     if (p < bEnd)
     {
-        memcpy(state->memory, p, bEnd-p);
+        XXH_memcpy(state->memory, p, bEnd-p);
         state->memsize = (int)(bEnd-p);
     }
 
-    return OK;
+    return XXH_OK;
 }
 
 
@@ -337,7 +432,6 @@ U32 XXH32_intermediateDigest (void* state_in)
     BYTE* bEnd = (BYTE*)state->memory + state->memsize;
     U32 h32;
 
-
     if (state->total_len >= 16)
     {
         h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
@@ -377,7 +471,7 @@ U32 XXH32_digest (void* state_in)
 {
     U32 h32 = XXH32_intermediateDigest(state_in);
 
-    free(state_in);
+    XXH_free(state_in);
 
     return h32;
 }
diff --git a/xxhash.h b/xxhash.h
index afdf243..8cb06d3 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -27,8 +27,8 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-	You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
 */
 
 /* Notice extracted from xxHash homepage :
@@ -67,7 +67,7 @@ extern "C" {
 //****************************
 // Type
 //****************************
-typedef enum { OK=0, XXH_ERROR } XXH_errorcode;
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 
 
 
@@ -79,13 +79,13 @@ unsigned int XXH32 (const void* input, int len, unsigned int seed);
 
 /*
 XXH32() :
-	Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
+    Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
     The memory between input & input+len must be valid (allocated and read-accessible).
-	"seed" can be used to alter the result predictably.
-	This function successfully passes all SMHasher tests.
-	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-	Note that "len" is type "int", which means it is limited to 2^31-1.
-	If your data is larger, use the advanced functions below.
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Note that "len" is type "int", which means it is limited to 2^31-1.
+    If your data is larger, use the advanced functions below.
 */
 
 
@@ -122,14 +122,19 @@ Memory will be freed by XXH32_digest().
 
 
 int           XXH32_sizeofState();
-XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed);
-/*
-These functions are the basic elements of XXH32_init();
-The objective is to allow user application to make its own allocation.
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
 
-XXH32_sizeofState() is used to know how much space must be allocated by the application.
-This space must be referenced by a void* pointer.
-This pointer must be provided as 'state_in' into XXH32_resetState(), which initializes the state.
+#define       XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow user application to make its own allocation for state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state.
+Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
+use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
 */
 
 
@@ -138,7 +143,7 @@ unsigned int XXH32_intermediateDigest (void* state);
 This function does the same as XXH32_digest(), generating a 32-bit hash,
 but preserve memory context.
 This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
-To free memory context, use XXH32_digest().
+To free memory context, use XXH32_digest(), or free().
 */