lz4.c no longer depends on lz4_decoder.h (removed)

Decompression speed improved under GCC. Improved speed of LZ4_decompress_safe_partial(). Added new utility: fullbench. Modified x64 detection macro, as suggested by David Karner. Improved Fuzzer tool. Updated xxHash to r30. git-svn-id: https://lz4.googlecode.com/svn/trunk@97 650e7d94-2a16-8b24-b05c-7c0b3f6821cd
2013-06-10 17:29:13 +00:00 · 2013-06-10 17:29:13 +00:00 · 16c0942822
commit 16c0942822
parent cd3bcd0043
10 changed files with 1019 additions and 431 deletions
--- a/7
+++ b/7
@ -10,7 +10,7 @@ endif
 default: lz4c
-all: lz4c lz4c32 fuzzer
+all: lz4c lz4c32 fuzzer fullbench
 lz4c: lz4.c lz4hc.c bench.c xxhash.c lz4c.c
 	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
@ -21,5 +21,8 @@ lz4c32: lz4.c lz4hc.c bench.c xxhash.c lz4c.c
 fuzzer : lz4.c lz4hc.c fuzzer.c
 	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
 fullbench : lz4.c lz4hc.c xxhash.c fullbench.c
 	$(CC)      -O3 $(CFLAGS) $^ -o $@$(EXT)
 clean:
-	rm -f core *.o lz4c$(EXT) lz4c32$(EXT) fuzzer$(EXT)
+	rm -f core *.o lz4c$(EXT) lz4c32$(EXT) fuzzer$(EXT) fullbench$(EXT)
--- a/bench.c
+++ b/bench.c
@ -65,7 +65,6 @@
 #endif
 #include "lz4.h"
 //int LZ4_compress_stack(const char* in, char* out, int size);
 #define COMPRESSOR0 LZ4_compress
 #include "lz4hc.h"
 #define COMPRESSOR1 LZ4_compressHC
@ -209,7 +208,7 @@ static size_t BMK_findMaxMem(U64 requiredMem)
    while (!testmem)
    {
        requiredMem -= step;
-        testmem = malloc ((size_t)requiredMem);
+        testmem = (BYTE*) malloc ((size_t)requiredMem);
    }
    free (testmem);
@ -294,11 +293,11 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
      // Alloc
      chunkP = (struct chunkParameters*) malloc(((benchedSize / chunkSize)+1) * sizeof(struct chunkParameters));
-      orig_buff = malloc((size_t )benchedSize);
+      orig_buff = (char*)malloc((size_t )benchedSize);
      nbChunks = (int) (benchedSize / chunkSize) + 1;
      maxCChunkSize = LZ4_compressBound(chunkSize);
      compressed_buff_size = nbChunks * maxCChunkSize;
-      compressed_buff = malloc((size_t )compressed_buff_size);
+      compressed_buff = (char*)malloc((size_t )compressed_buff_size);
      if(!orig_buff || !compressed_buff)
@ -386,10 +385,11 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
          while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
          {
            for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
-                chunkP[chunkNb].origSize = LZ4_decompress_safe(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
+                //chunkP[chunkNb].origSize = LZ4_decompress_safe(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
-                //chunkP[chunkNb].compressedSize = LZ4_decompress_fast(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
+                chunkP[chunkNb].compressedSize = LZ4_decompress_fast(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                //chunkP[chunkNb].compressedSize = LZ4_decompress_fast_withPrefix64k(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                //chunkP[chunkNb].origSize = LZ4_decompress_safe_withPrefix64k(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
                //chunkP[chunkNb].origSize = LZ4_decompress_safe_partial(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize-5, chunkSize);
                //chunkP[chunkNb].compressedSize = LZ4_uncompress(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].origSize);
                //chunkP[chunkNb].origSize = LZ4_uncompress_unknownOutputSize(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkSize);
            nb_loops++;
@ -423,9 +423,9 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles, int cLevel)
  }
  if (nbFiles > 1)
-        printf("%-16.16s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s , %6.1f MB/s\n", "  TOTAL", (long long unsigned int)totals, (long long unsigned int)totalz, (double)totalz/(double)totals*100., (double)totals/totalc/1000., (double)totals/totald/1000.);
+        DISPLAY("%-16.16s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s , %6.1f MB/s\n", "  TOTAL", (long long unsigned int)totals, (long long unsigned int)totalz, (double)totalz/(double)totals*100., (double)totals/totalc/1000., (double)totals/totald/1000.);
-  if (BMK_pause) { printf("press enter...\n"); getchar(); }
+  if (BMK_pause) { DISPLAY("press enter...\n"); getchar(); }
  return 0;
 }
--- a/fullbench.c
+++ b/fullbench.c
@ -0,0 +1,622 @@
 /*
    fullbench.c - Demo program to benchmark open-source compression algorithm
    Copyright (C) Yann Collet 2012-2013
    GPL v2 License
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    You can contact the author at :
    - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
    - LZ4 source repository : http://code.google.com/p/lz4/
 */
 //**************************************
 // Compiler Options
 //**************************************
 // Disable some Visual warning messages
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE     // VS2005
 // Unix Large Files support (>4GB)
 #if (defined(__sun__) && (!defined(__LP64__)))   // Sun Solaris 32-bits requires specific definitions
 #  define _LARGEFILE_SOURCE 
 #  define _FILE_OFFSET_BITS 64
 #elif ! defined(__LP64__)                        // No point defining Large file for 64 bit
 #  define _LARGEFILE64_SOURCE
 #endif
 // S_ISREG & gettimeofday() are not supported by MSVC
 #if defined(_MSC_VER)
 #  define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
 #  define BMK_LEGACY_TIMER 1
 #endif
 // GCC does not support _rotl outside of Windows
 #if !defined(_WIN32)
 #  define _rotl(x,r) ((x << r) | (x >> (32 - r)))
 #endif
 //**************************************
 // Includes
 //**************************************
 #include <stdlib.h>      // malloc
 #include <stdio.h>       // fprintf, fopen, ftello64
 #include <sys/types.h>   // stat64
 #include <sys/stat.h>    // stat64
 // Use ftime() if gettimeofday() is not available on your target
 #if defined(BMK_LEGACY_TIMER)
 #  include <sys/timeb.h>   // timeb, ftime
 #else
 #  include <sys/time.h>    // gettimeofday
 #endif
 #include "lz4.h"
 #define COMPRESSOR0 LZ4_compress
 #include "lz4hc.h"
 #define COMPRESSOR1 LZ4_compressHC
 #define DEFAULTCOMPRESSOR COMPRESSOR0
 #include "xxhash.h"
 //**************************************
 // Basic Types
 //**************************************
 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
 # include <stdint.h>
  typedef uint8_t  BYTE;
  typedef uint16_t U16;
  typedef uint32_t U32;
  typedef  int32_t S32;
  typedef uint64_t U64;
 #else
  typedef unsigned char       BYTE;
  typedef unsigned short      U16;
  typedef unsigned int        U32;
  typedef   signed int        S32;
  typedef unsigned long long  U64;
 #endif
 //****************************
 // Constants
 //****************************
 #define COMPRESSOR_NAME "Full LZ4 speed analyzer"
 #define COMPRESSOR_VERSION ""
 #define COMPILED __DATE__
 #define AUTHOR "Yann Collet"
 #define WELCOME_MESSAGE "*** %s %s, by %s (%s) ***\n", COMPRESSOR_NAME, COMPRESSOR_VERSION, AUTHOR, COMPILED
 #define NBLOOPS    6
 #define TIMELOOP   2500
 #define KNUTH      2654435761U
 #define MAX_MEM    (1984<<20)
 #define DEFAULT_CHUNKSIZE   (4<<20)
 //**************************************
 // Local structures
 //**************************************
 // Describes one independently processed block of the benchmarked input.
 // fullSpeedBench() fills an array of these, one entry per chunk.
 struct chunkParameters
 {
    U32   id;                 // chunk index (set to the loop counter when the table is built)
    char* origBuffer;         // start of this chunk inside the original (uncompressed) buffer
    char* compressedBuffer;   // start of this chunk's slot inside the compressed buffer
    int   origSize;           // number of uncompressed bytes in this chunk
    int   compressedSize;     // size produced by the last compression pass (0 until compressed)
 };
 //**************************************
 // MACRO
 //**************************************
 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
 //**************************************
 // Benchmark Parameters
 //**************************************
 static int chunkSize = DEFAULT_CHUNKSIZE;
 static int nbIterations = NBLOOPS;
 static int BMK_pause = 0;
 // Sets the size, in bytes, of the blocks the benchmark processes independently,
 // and reports the new value (expressed in KB) through DISPLAY.
 void BMK_SetBlocksize(int bsize)
 {
    const int sizeInKB = bsize >> 10;
    chunkSize = bsize;
    DISPLAY("-Using Block Size of %i KB-\n", sizeInKB);
 }
 // Sets how many timed passes are run for each algorithm, and reports the new value through DISPLAY.
 void BMK_SetNbIterations(int nbLoops)
 {
    const int requestedLoops = nbLoops;
    nbIterations = requestedLoops;
    DISPLAY("- %i iterations -\n", requestedLoops);
 }
 // Requests a "press enter" pause once the benchmark has finished.
 // Declared with (void) so the definition is a real prototype: calls that pass arguments are now diagnosed.
 void BMK_SetPause(void)
 {
    BMK_pause = 1;
 }
 //*********************************************************
 //  Private functions
 //*********************************************************
 #if defined(BMK_LEGACY_TIMER)
 static int BMK_GetMilliStart()
 {
   // Millisecond clock built on legacy ftime().
   // Only the low 20 bits of the seconds counter are kept, so the value wraps
   // about every 12.1 days (0x100000 seconds); BMK_GetMilliSpan() corrects for the wrap.
   struct timeb now;
   ftime( &now );
   return (int) (now.millitm + (now.time & 0xfffff) * 1000);
 }
 #else
 static int BMK_GetMilliStart()
 {
   // Millisecond clock built on gettimeofday().
   // Only the low 20 bits of the seconds counter are kept, so the value wraps;
   // BMK_GetMilliSpan() corrects for the wrap.
   struct timeval now;
   gettimeofday(&now, NULL);
   return (int) (now.tv_usec/1000 + (now.tv_sec & 0xfffff) * 1000);
 }
 #endif
 // Returns the number of milliseconds elapsed since nTimeStart,
 // where nTimeStart is a value previously returned by BMK_GetMilliStart().
 static int BMK_GetMilliSpan( int nTimeStart )
 {
   const int elapsed = BMK_GetMilliStart() - nTimeStart;
   // A negative difference means the clock wrapped in between: add one full period back.
   return (elapsed < 0) ? (elapsed + 0x100000 * 1000) : elapsed;
 }
 // Probes how much memory can actually be allocated for a benchmark needing `requiredMem` bytes of input.
 // The request is doubled and rounded up to a multiple of 64 MB, capped at MAX_MEM,
 // then decreased in 64 MB steps until a trial malloc() succeeds.
 // Returns the size of the successful probe minus one step (kept as safety margin),
 // or 0 when even the smallest probe cannot be allocated.
 static size_t BMK_findMaxMem(U64 requiredMem)
 {
    size_t step = (64U<<20);   // 64 MB
    BYTE* testmem=NULL;
    requiredMem = (((requiredMem >> 25) + 1) << 26);
    if (requiredMem > MAX_MEM) requiredMem = MAX_MEM;
    requiredMem += 2*step;
    while (!testmem)
    {
        // Stop before the probe size reaches zero: malloc(0) may succeed, which made
        // the final subtraction underflow, or fail, which made the counter wrap around.
        if (requiredMem <= step) return 0;
        requiredMem -= step;
        testmem = (BYTE*) malloc ((size_t)requiredMem);
    }
    free (testmem);
    return (size_t) (requiredMem - step);
 }
 static U64 BMK_GetFileSize(char* infilename)
 {
    int r;
 #if defined(_MSC_VER)
    struct _stat64 statbuf;
    r = _stat64(infilename, &statbuf);
 #else
    struct stat statbuf;
    r = stat(infilename, &statbuf);
 #endif
    if (r || !S_ISREG(statbuf.st_mode)) return 0;   // No good...
    return (U64)statbuf.st_size;
 }
 //*********************************************************
 //  Public function
 //*********************************************************
 // Adapter giving LZ4_compress_limitedOutput() the 3-argument compressor signature used by the benchmark;
 // the output limit passed is LZ4_compressBound(inSize).
 static inline int local_LZ4_compress_limitedOutput(const char* in, char* out, int inSize)
 {
    const int outputLimit = LZ4_compressBound(inSize);
    return LZ4_compress_limitedOutput(in, out, inSize, outputLimit);
 }
 // Adapter giving LZ4_compressHC_limitedOutput() the 3-argument compressor signature used by the benchmark;
 // the output limit passed is LZ4_compressBound(inSize).
 static inline int local_LZ4_compressHC_limitedOutput(const char* in, char* out, int inSize)
 {
    const int outputLimit = LZ4_compressBound(inSize);
    return LZ4_compressHC_limitedOutput(in, out, inSize, outputLimit);
 }
 // Adapter giving LZ4_decompress_fast() the 4-argument decompressor signature used by the benchmark.
 // inSize is unused: this decoder is driven by the expected output size only.
 // Returns outSize on success. A negative decoder result (failure) is passed through unchanged,
 // so the caller's size check reports it instead of treating it as success.
 static inline int local_LZ4_decompress_fast(const char* in, char* out, int inSize, int outSize)
 {
    int result;
    (void)inSize;
    result = LZ4_decompress_fast(in, out, outSize);
    if (result < 0) return result;
    return outSize;
 }
 // Adapter giving LZ4_decompress_fast_withPrefix64k() the 4-argument decompressor signature used by the benchmark.
 // inSize is unused: this decoder is driven by the expected output size only.
 // Returns outSize on success. A negative decoder result is passed through unchanged
 // (NOTE(review): assumes the same "negative means failure" convention as LZ4_decompress_fast — confirm in lz4.h).
 static inline int local_LZ4_decompress_fast_withPrefix64k(const char* in, char* out, int inSize, int outSize)
 {
    int result;
    (void)inSize;
    result = LZ4_decompress_fast_withPrefix64k(in, out, outSize);
    if (result < 0) return result;
    return outSize;
 }
 // Adapter giving LZ4_decompress_safe_partial() the 4-argument decompressor signature used by the benchmark.
 // The partial-decoding target is set 5 bytes short of the full output size.
 static inline int local_LZ4_decompress_safe_partial(const char* in, char* out, int inSize, int outSize)
 {
    const int targetSize = outSize - 5;
    return LZ4_decompress_safe_partial(in, out, inSize, targetSize, outSize);
 }
 int fullSpeedBench(char** fileNamesTable, int nbFiles)
 {
  int fileIdx=0;
  FILE* fileIn;
  char* infilename;
  U64 largefilesize;
  size_t benchedSize;
  int nbChunks;
  int maxCChunkSize;
  size_t readSize;
  char* orig_buff;
  char* compressed_buff; int compressed_buff_size;
  struct chunkParameters* chunkP;
  U32 crcc, crcd=0;
 # define NB_COMPRESSION_ALGORITHMS 4
  static char* compressionNames[] = { "LZ4_compress", "LZ4_compressHC", "LZ4_compressHC_limitedOutput", "LZ4_compress_limitedOutput" };
  double totalCTime[NB_COMPRESSION_ALGORITHMS] = {0};
  double totalCSize[NB_COMPRESSION_ALGORITHMS] = {0};
 # define NB_DECOMPRESSION_ALGORITHMS 5
  static char* decompressionNames[] = { "LZ4_decompress_fast", "LZ4_decompress_fast_withPrefix64k", "LZ4_decompress_safe", "LZ4_decompress_safe_withPrefix64k", "LZ4_decompress_safe_partial" };
  double totalDTime[NB_DECOMPRESSION_ALGORITHMS] = {0};
  U64 totals = 0;
  // Loop for each file
  while (fileIdx<nbFiles)
  {
      // Check file existence
      infilename = fileNamesTable[fileIdx++];
      fileIn = fopen( infilename, "rb" );
      if (fileIn==NULL)
      {
        DISPLAY( "Pb opening %s\n", infilename);
        return 11;
      }
      // Memory allocation & restrictions
      largefilesize = BMK_GetFileSize(infilename);
      benchedSize = (size_t) BMK_findMaxMem(largefilesize) / 2;
      if ((U64)benchedSize > largefilesize) benchedSize = (size_t)largefilesize;
      if (benchedSize < largefilesize)
      {
          DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", infilename, (int)(benchedSize>>20));
      }
      // Alloc
      chunkP = (struct chunkParameters*) malloc(((benchedSize / chunkSize)+1) * sizeof(struct chunkParameters));
      orig_buff = (char*) malloc((size_t)benchedSize);
      nbChunks = (int) (benchedSize / chunkSize) + 1;
      maxCChunkSize = LZ4_compressBound(chunkSize);
      compressed_buff_size = nbChunks * maxCChunkSize;
      compressed_buff = (char*)malloc((size_t)compressed_buff_size);
      if(!orig_buff || !compressed_buff)
      {
        DISPLAY("\nError: not enough memory!\n");
        free(orig_buff);
        free(compressed_buff);
        fclose(fileIn);
        return 12;
      }
      // Init chunks data
      {
          int i;
          size_t remaining = benchedSize;
          char* in = orig_buff;
          char* out = compressed_buff;
          for (i=0; i<nbChunks; i++)
          {
              chunkP[i].id = i;
              chunkP[i].origBuffer = in; in += chunkSize;
              if ((int)remaining > chunkSize) { chunkP[i].origSize = chunkSize; remaining -= chunkSize; } else { chunkP[i].origSize = (int)remaining; remaining = 0; }
              chunkP[i].compressedBuffer = out; out += maxCChunkSize;
              chunkP[i].compressedSize = 0;
          }
      }
      // Fill input buffer
      DISPLAY("Loading %s...       \r", infilename);
      readSize = fread(orig_buff, 1, benchedSize, fileIn);
      fclose(fileIn);
      if(readSize != benchedSize)
      {
        DISPLAY("\nError: problem reading file '%s' !!    \n", infilename);
        free(orig_buff);
        free(compressed_buff);
        return 13;
      }
      // Calculating input Checksum
      crcc = XXH32(orig_buff, (unsigned int)benchedSize,0);
      // Bench
      {
        int loopNb, nb_loops, chunkNb, cAlgNb, dAlgNb;
        size_t cSize=0;
        double ratio=0.;
        DISPLAY("\r%79s\r", "");
        DISPLAY(" %s : \n", infilename);
        // Compression Algorithms
        for (cAlgNb=0; cAlgNb < NB_COMPRESSION_ALGORITHMS; cAlgNb++)
        {
            char* cName = compressionNames[cAlgNb];
            int (*compressionFunction)(const char*, char*, int);
            double bestTime = 100000000.;
            switch(cAlgNb)
            {
            case 0: compressionFunction = LZ4_compress; break;
            case 1: compressionFunction = LZ4_compressHC; break;
            case 2: compressionFunction = local_LZ4_compressHC_limitedOutput; break;
            case 3: compressionFunction = local_LZ4_compress_limitedOutput; break;
            default : DISPLAY("ERROR ! Bad algorithm Id !! \n"); return 1;
            }
            for (loopNb = 1; loopNb <= nbIterations; loopNb++)
            {
                double averageTime;
                int milliTime;
                DISPLAY("%1i-%-19.19s : %9i ->\r", loopNb, cName, (int)benchedSize);
                { size_t i; for (i=0; i<benchedSize; i++) compressed_buff[i]=(char)i; }     // warmimg up memory
                nb_loops = 0;
                milliTime = BMK_GetMilliStart();
                while(BMK_GetMilliStart() == milliTime);
                milliTime = BMK_GetMilliStart();
                while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
                {
                    for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
                    {
                        chunkP[chunkNb].compressedSize = compressionFunction(chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origSize);
                        if (chunkP[chunkNb].compressedSize==0) DISPLAY("ERROR ! %s() = 0 !! \n", cName), exit(1);
                    }
                    nb_loops++;
                }
                milliTime = BMK_GetMilliSpan(milliTime);
                averageTime = (double)milliTime / nb_loops;
                if (averageTime < bestTime) bestTime = averageTime;
                cSize=0; for (chunkNb=0; chunkNb<nbChunks; chunkNb++) cSize += chunkP[chunkNb].compressedSize;
                ratio = (double)cSize/(double)benchedSize*100.;
                DISPLAY("%1i-%-19.19s : %9i -> %9i (%5.2f%%),%7.1f MB/s\r", loopNb, cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
            }
            if (ratio<100.)
                DISPLAY("%-21.21s : %9i -> %9i (%5.2f%%),%7.1f MB/s\n", cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
            else
                DISPLAY("%-21.21s : %9i -> %9i (%5.1f%%),%7.1f MB/s\n", cName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000.);
            totalCTime[cAlgNb] += bestTime;
            totalCSize[cAlgNb] += cSize;
        }
        { size_t i; for (i=0; i<benchedSize; i++) orig_buff[i]=0; }     // zeroing area, for CRC checking
        // Decompression Algorithms
        for (dAlgNb=0; dAlgNb < NB_DECOMPRESSION_ALGORITHMS; dAlgNb++)
        {
            char* dName = decompressionNames[dAlgNb];
            int (*decompressionFunction)(const char*, char*, int, int);
            double bestTime = 100000000.;
            switch(dAlgNb)
            {
            case 0: decompressionFunction = local_LZ4_decompress_fast; break;
            case 1: decompressionFunction = local_LZ4_decompress_fast_withPrefix64k; break;
            case 2: decompressionFunction = LZ4_decompress_safe; break;
            case 3: decompressionFunction = LZ4_decompress_safe_withPrefix64k; break;
            case 4: decompressionFunction = local_LZ4_decompress_safe_partial; break;
            default : DISPLAY("ERROR ! Bad algorithm Id !! \n"); return 1;
            }
            for (loopNb = 1; loopNb <= nbIterations; loopNb++)
            {
                double averageTime;
                int milliTime;
                DISPLAY("%1i-%-19.19s : %9i ->\r", loopNb, dName, (int)benchedSize);
                nb_loops = 0;
                milliTime = BMK_GetMilliStart();
                while(BMK_GetMilliStart() == milliTime);
                milliTime = BMK_GetMilliStart();
                while(BMK_GetMilliSpan(milliTime) < TIMELOOP)
                {
                    for (chunkNb=0; chunkNb<nbChunks; chunkNb++)
                    {
                        int decodedSize = decompressionFunction(chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedSize, chunkP[chunkNb].origSize);
                        if (chunkP[chunkNb].origSize != decodedSize) DISPLAY("ERROR ! %s() == %i != %i !! \n", dName, decodedSize, chunkP[chunkNb].origSize), exit(1);
                    }
                    nb_loops++;
                }
                milliTime = BMK_GetMilliSpan(milliTime);
                averageTime = (double)milliTime / nb_loops;
                if (averageTime < bestTime) bestTime = averageTime;
                DISPLAY("%1i-%-19.19s : %9i -> %7.1f MB/s\r", loopNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000.);
            }
            // CRC Checking
            crcd = XXH32(orig_buff, (int)benchedSize, 0);
            if (crcc!=crcd) { DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", infilename, (unsigned)crcc, (unsigned)crcd); exit(1); }
            DISPLAY("%-21.21s : %9i -> %7.1f MB/s\n", dName, (int)benchedSize, (double)benchedSize / bestTime / 1000.);
            totalDTime[dAlgNb] += bestTime;
        }
        totals += benchedSize;
      }
      free(orig_buff);
      free(compressed_buff);
      free(chunkP);
  }
  if (nbFiles > 1)
  {
      int AlgNb;
      DISPLAY(" TOTAL : \n");
      for (AlgNb = 0; AlgNb < NB_COMPRESSION_ALGORITHMS; AlgNb ++)
      {
          char* cName = compressionNames[AlgNb];
          DISPLAY("%-21.21s :%10llu ->%10llu (%5.2f%%), %6.1f MB/s\n", cName, (long long unsigned int)totals, (long long unsigned int)totalCSize[AlgNb], (double)totalCSize[AlgNb]/(double)totals*100., (double)totals/totalCTime[AlgNb]/1000.);
      }
      for (AlgNb = 0; AlgNb < NB_DECOMPRESSION_ALGORITHMS; AlgNb ++)
      {
          char* dName = decompressionNames[AlgNb];
          DISPLAY("%-21.21s :%10llu -> %6.1f MB/s\n", dName, (long long unsigned int)totals, (double)totals/totalDTime[AlgNb]/1000.);
      }
  }
  if (BMK_pause) { printf("press enter...\n"); getchar(); }
  return 0;
 }
 // Prints the basic command-line help through DISPLAY.
 //   exename : program name shown in the usage line
 // Always returns 0.
 int usage(char* exename)
 {
    DISPLAY( "Usage :\n");
    DISPLAY( "      %s [arg] file1 file2 ... fileX\n", exename);
    DISPLAY( "Arguments :\n");
    DISPLAY( " -H     : Help (this text + advanced options)\n");
    return 0;
 }
 // Prints the advanced command-line options (block size, iteration count) through DISPLAY.
 // Declared with (void) so the definition is a real prototype.
 // Always returns 0.
 int usage_advanced(void)
 {
    DISPLAY( "\nAdvanced options :\n");
    DISPLAY( " -B#    : Block size [4-7](default : 7)\n");
    // -BD is accepted by the option parser but deliberately not advertised
    //DISPLAY( " -BD    : Block dependency (improve compression ratio)\n");
    DISPLAY( " -i#    : iteration loops [1-9](default : 6)\n");
    return 0;
 }
 // Reports an invalid command line through DISPLAY, then prints the basic help.
 //   exename : program name forwarded to usage()
 // Always returns 0; callers choose their own exit status.
 int badusage(char* exename)
 {
    DISPLAY("Wrong parameters\n");
    usage(exename);
    return 0;
 }
 // Program entry point : parses the command line, then runs fullSpeedBench() on the listed files.
 // Options start with '-' and may be aggregated in a single argument (one letter after another).
 // Returns 1 on bad usage, 0 after displaying help, otherwise the result of fullSpeedBench().
 // Note : every argv entry from the first non-option argument onward is handed to fullSpeedBench(),
 //        so an option placed after the first filename is also treated as a filename.
 int main(int argc, char** argv)
 {
    int i,
        filenamesStart=2;
    char* exename=argv[0];
    char* input_filename=0;
    // Welcome message
    DISPLAY( WELCOME_MESSAGE);
    if (argc<2) { badusage(exename); return 1; }
    for(i=1; i<argc; i++)
    {
        char* argument = argv[i];
        if(!argument) continue;   // Protection if argument empty
        // Decode command (note : aggregated commands are allowed)
        if (argument[0]=='-')
        {
            // argument always points at the character just consumed; argument[1] is the next one to decode
            while (argument[1]!=0)
            {
                argument ++;
                switch(argument[0])
                {
                    // Display help on usage
                case 'H': usage(exename); usage_advanced(); return 0;
                    // Modify Block Properties
                case 'B':
                    // Consume every block-property character following 'B'; stop at the first unknown one
                    while (argument[1]!=0)
                    switch(argument[1])
                    {
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    { 
                        // Block size is 1 << (8 + 2*B) bytes : 64 KB for B=4, up to 4 MB for B=7
                        int B = argument[1] - '0'; 
                        int S = 1 << (8 + 2*B); 
                        BMK_SetBlocksize(S); 
                        argument++;
                        break;
                    }
                    // 'D' is accepted and skipped, it has no effect here
                    case 'D': argument++; break;
                    default : goto _exit_blockProperties;
                    }
 _exit_blockProperties:
                    break;
                    // Modify Nb Iterations (benchmark only)
                case 'i': 
                    // A single digit 1-9 is expected; anything else leaves the iteration count unchanged
                    if ((argument[1] >='1') && (argument[1] <='9'))
                    {
                        int iters = argument[1] - '0'; 
                        BMK_SetNbIterations(iters); 
                        argument++;
                    }
                    break;
                    // Pause at the end (benchmark only) (hidden option)
                case 'p': BMK_SetPause(); break;
                    // Unrecognised command
                default : badusage(exename); return 1;
                }
            }
            continue;
        }
        // first provided filename is input
        if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }
    }
    // No input filename ==> Error
    if(!input_filename) { badusage(exename); return 1; }
    return fullSpeedBench(argv+filenamesStart, argc-filenamesStart);
 }
--- a/fuzzer.c
+++ b/fuzzer.c
@ -1,7 +1,7 @@
 /*
    fuzzer.c - Fuzzer test tool for LZ4
-    Copyright (C) Andrew Mahone - Yann Collet 2012-2013
+    Copyright (C) Yann Collet - Andrew Mahone 2012-2013
-    Original code by Andrew Mahone / Modified by Yann Collet
+    Code started by Andrew Mahone, modified by Yann Collet
    GPL v2 License
    This program is free software; you can redistribute it and/or modify
@ -56,25 +56,25 @@
 #define PRIME3   3266489917U
 //*********************************************************
 //  Functions
 //*********************************************************
 static int FUZ_GetMilliStart()
 {
-  struct timeb tb;
+   struct timeb tb;
-  int nCount;
+   int nCount;
-  ftime( &tb );
+   ftime( &tb );
-  nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000);
+   nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000);
-  return nCount;
+   return nCount;
 }
 static int FUZ_GetMilliSpan( int nTimeStart )
 {
-  int nSpan = FUZ_GetMilliStart() - nTimeStart;
+   int nSpan = FUZ_GetMilliStart() - nTimeStart;
-  if ( nSpan < 0 )
+   if ( nSpan < 0 )
-    nSpan += 0x100000 * 1000;
+      nSpan += 0x100000 * 1000;
-  return nSpan;
+   return nSpan;
 }
@ -85,14 +85,16 @@ unsigned int FUZ_rand(unsigned int* src)
 }
-int test_canary(unsigned char *buf) {
+int test_canary(unsigned char *buf)
-        int i;
+{
-        for (i = 0; i < 2048; i++)
+    int i;
-                if (buf[i] != buf[i + 2048])
+    for (i = 0; i < 2048; i++)
-                        return 0;
+        if (buf[i] != buf[i + 2048])
-        return 1;
+            return 0;
    return 1;
 }
 int FUZ_SecurityTest()
 {
  char* output;
@ -127,9 +129,10 @@ int main() {
 #       define FUZ_avail ROUND_PAGE(FUZ_max)
        const int off_full = FUZ_avail - FUZ_max;
        unsigned char cbuf[FUZ_avail + PAGE_SIZE];
-        unsigned int seed, cur_seq=PRIME3, seeds[NUM_SEQ], timestamp=FUZ_GetMilliStart();
+        unsigned int seed, randState, cur_seq=PRIME3, seeds[NUM_SEQ], timestamp=FUZ_GetMilliStart();
-        int i, j, k, ret, len, lenHC;
+        int i, j, k, ret, len, lenHC, attemptNb;
        char userInput[30] = {0};
 #       define FUZ_CHECKTEST(cond, message) testNb++; if (cond) { printf("Test %i : %s : seed %u, cycle %u \n", testNb, message, seed, attemptNb); goto _output_error; }
        printf("starting LZ4 fuzzer\n");
        printf("Select an Initialisation number (default : random) : ");
@ -140,96 +143,108 @@ int main() {
            else seed = FUZ_GetMilliSpan(timestamp);
        }
        printf("Seed = %u\n", seed);
        randState = seed;
        FUZ_SecurityTest();
        for (i = 0; i < 2048; i++)
-                cbuf[FUZ_avail + i] = cbuf[FUZ_avail + 2048 + i] = FUZ_rand(&seed) >> 16;
+                cbuf[FUZ_avail + i] = cbuf[FUZ_avail + 2048 + i] = FUZ_rand(&randState) >> 16;
-        for (i = 0; i < NB_ATTEMPTS; i++) 
+        for (attemptNb = 0; attemptNb < NB_ATTEMPTS; attemptNb++) 
        {
-            printf("\r%7i /%7i\r", i, NB_ATTEMPTS);
+            int testNb = 0;
            printf("\r%7i /%7i\r", attemptNb, NB_ATTEMPTS);
            FUZ_rand(&seed);
            for (j = 0; j < NUM_SEQ; j++) {
-                    seeds[j] = FUZ_rand(&seed) << 8;
+                    seeds[j] = FUZ_rand(&randState) << 8;
-                    seeds[j] ^= (FUZ_rand(&seed) >> 8) & 65535;
+                    seeds[j] ^= (FUZ_rand(&randState) >> 8) & 65535;
            }
            for (j = 0; j < LEN; j++) {
-                    k = FUZ_rand(&seed);
+                    k = FUZ_rand(&randState);
                    if (j == 0 || NEW_SEQ(k))
-                            cur_seq = seeds[(FUZ_rand(&seed) >> 16) & SEQ_MSK];
+                            cur_seq = seeds[(FUZ_rand(&randState) >> 16) & SEQ_MSK];
                    if (MOD_SEQ(k)) {
-                            k = (FUZ_rand(&seed) >> 16) & SEQ_MSK;
+                            k = (FUZ_rand(&randState) >> 16) & SEQ_MSK;
-                            seeds[k] = FUZ_rand(&seed) << 8;
+                            seeds[k] = FUZ_rand(&randState) << 8;
-                            seeds[k] ^= (FUZ_rand(&seed) >> 8) & 65535;
+                            seeds[k] ^= (FUZ_rand(&randState) >> 8) & 65535;
                    }
                    buf[j] = FUZ_rand(&cur_seq) >> 16;
            }
            // Test compression HC
            ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[off_full], LEN, FUZ_max);
-            if (ret == 0) { printf("HC compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "HC compression failed despite sufficient space");
            lenHC = ret;
            // Test compression
            ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[off_full], LEN, FUZ_max);
-            if (ret == 0) { printf("compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "compression failed despite sufficient space");
            len = ret;
            // Test decoding with output size being exactly what's necessary => must work
            ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN);
-            if (ret<0) { printf("decompression failed despite correct space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite correct space");
            // Test decoding with one byte missing => must fail
            ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN-1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to Output Size being too small");
            // Test decoding with one byte too much => must fail
            ret = LZ4_decompress_fast((char*)&cbuf[off_full], (char*)testOut, LEN+1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to Output Size being too large");
            // Test decoding with enough output size => must work
            ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN+1);
-            if (ret<0) { printf("decompression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite sufficient space");
            // Test decoding with output size being exactly what's necessary => must work
            ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN);
-            if (ret<0) { printf("decompression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret<0, "decompression failed despite sufficient space");
            // Test decoding with output size being one byte too short => must fail
            ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len, LEN-1);
-            if (ret>=0) { printf("decompression should have failed, due to Output Size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe should have failed, due to Output Size being one byte too short");
            // Test decoding with input size being one byte too short => must fail
            ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len-1, LEN);
-            if (ret>=0) { printf("decompression should have failed, due to input size being too small : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe should have failed, due to input size being one byte too short");
            // Test decoding with input size being one byte too large => must fail
            ret = LZ4_decompress_safe((char*)&cbuf[off_full], (char*)testOut, len+1, LEN);
-            if (ret>=0) { printf("decompression should have failed, due to input size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret>=0, "decompression should have failed, due to input size being too large");
            //if (ret>=0) { printf("Test 10 : decompression should have failed, due to input size being too large : seed %u, len %d\n", seed, LEN); goto _output_error; }
            // Test partial decoding with target output size being max/2 => must work
            ret = LZ4_decompress_safe_partial((char*)&cbuf[off_full], (char*)testOut, len, LEN/2, LEN);
            FUZ_CHECKTEST(ret<0, "partial decompression failed despite sufficient space");
            // Test partial decoding with target output size being just below max => must work
            ret = LZ4_decompress_safe_partial((char*)&cbuf[off_full], (char*)testOut, len, LEN-3, LEN);
            FUZ_CHECKTEST(ret<0, "partial decompression failed despite sufficient space");
            // Test compression with output size being exactly what's necessary (should work)
            ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-len], LEN, len);
-            if (!test_canary(&cbuf[FUZ_avail])) { printf("compression overran output buffer: seed %u, len %d, olen %d\n", seed, LEN, len); goto _output_error; }
+            FUZ_CHECKTEST(!test_canary(&cbuf[FUZ_avail]), "compression overran output buffer");
-            if (ret == 0) { printf("compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "compression failed despite sufficient space");
            // Test HC compression with output size being exactly what's necessary (should work)
            ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-len], LEN, lenHC);
-            if (ret == 0) { printf("HC compression failed despite sufficient space: seed %u, len %d\n", seed, LEN); goto _output_error; }
+            FUZ_CHECKTEST(ret==0, "HC compression failed despite sufficient space");
            // Test compression with just one missing byte into output buffer => must fail
            ret = LZ4_compress_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-(len-1)], LEN, len-1);
-            if (ret) { printf("compression overran output buffer: seed %u, len %d, olen %d => ret %d", seed, LEN, len-1, ret); goto _output_error; }
+            FUZ_CHECKTEST(ret, "compression overran output buffer");
-            if (!test_canary(&cbuf[FUZ_avail])) { printf("compression overran output buffer: seed %u, len %d, olen %d", seed, LEN, len-1); goto _output_error; }
+            FUZ_CHECKTEST(!test_canary(&cbuf[FUZ_avail]), "compression overran output buffer");
            // Test HC compression with just one missing byte into output buffer => must fail
            ret = LZ4_compressHC_limitedOutput((const char*)buf, (char*)&cbuf[FUZ_avail-(len-1)], LEN, lenHC-1);
-            if (ret) { printf("HC compression overran output buffer: seed %u, len %d, olen %d => ret %d", seed, LEN, lenHC-1, ret); goto _output_error; }
+            FUZ_CHECKTEST(ret, "HC compression overran output buffer");
            bytes += LEN;
            cbytes += len;
            hcbytes += lenHC;
            FUZ_rand(&randState);
        }
        printf("all tests completed successfully \n");
--- a/lz4.c
+++ b/lz4.c
@ -32,7 +32,7 @@
 */
 /*
-Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
+Note : this source file requires "lz4_encoder.h"
 */
 //**************************************
@ -64,9 +64,10 @@ Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
 // CPU Feature Detection
 //**************************************
 // 32 or 64 bits ?
-#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) \
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
-  || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) \
+  || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
-  || defined(__ia64__) )   // Detects 64 bits mode
+  || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) )   // Detects 64 bits mode
 #  define LZ4_ARCH64 1
 #else
 #  define LZ4_ARCH64 0
@ -82,7 +83,7 @@ Note : this source file requires "lz4_encoder.h" and "lz4_decoder.h"
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define LZ4_BIG_ENDIAN 1
 #elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+   || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
   || defined(__hpux)  || defined(__hppa) \
   || defined(_MIPSEB) || defined(__s390__)
 #  define LZ4_BIG_ENDIAN 1
@ -218,7 +219,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 //**************************************
 // Architecture-specific macros
 //**************************************
-#if LZ4_ARCH64	// 64-bit
+#if LZ4_ARCH64   // 64-bit
 #  define STEPSIZE 8
 #  define UARCH U64
 #  define AARCH A64
@ -227,7 +228,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #  define LZ4_SECURECOPY(s,d,e)   if (d<e) LZ4_WILDCOPY(s,d,e)
 #  define HTYPE                   U32
 #  define INITBASE(base)          const BYTE* const base = ip
-#else		// 32-bit
+#else      // 32-bit
 #  define STEPSIZE 4
 #  define UARCH U32
 #  define AARCH A32
@ -241,7 +242,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
 #  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
 #  define LZ4_WRITE_LITTLEENDIAN_16(p,i)  { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
-#else		// Little Endian
+#else      // Little Endian
 #  define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
 #  define LZ4_WRITE_LITTLEENDIAN_16(p,v)  { A16(p) = v; p+=2; }
 #endif
@ -522,88 +523,168 @@ int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, in
 // Decompression functions
 //****************************
-/*
+typedef enum { noPrefix = 0, withPrefix = 1 } prefix64k_directive;
-int LZ4_decompress_safe(const char* source,
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } end_directive;
-                        char* dest,
+typedef enum { full = 0, partial = 1 } exit_directive;
                        int inputSize,
                        int maxOutputSize);
 LZ4_decompress_safe() guarantees it will never write nor read outside of the provided output buffers.
 This function is safe against "buffer overflow" attacks.
 A corrupted input will produce an error result, a negative int.
 */
 #define FUNCTION_NAME LZ4_decompress_safe
 #define EXITCONDITION_INPUTSIZE
 #include "lz4_decoder.h"
-/*
+// This generic decompression function covers all use cases.
-int LZ4_decompress_safe_withPrefix64k(
+// It shall be instantiated several times, using different sets of directives
-                        const char* source,
+// Note that it is essential this generic function is really inlined, 
-                        char* dest,
+// in order to remove useless branches during compilation optimisation.
-                        int inputSize,
+static inline int LZ4_decompress_generic(
-                        int maxOutputSize);
+                 const char* source,
                 char* dest,
                 int inputSize,          //
                 int outputSize,         // OutputSize must be != 0; if endOnInput==endOnInputSize, this value is the max size of Output Buffer.
-Same as LZ4_decompress_safe(), but will also use 64K of memory before the beginning of input buffer.
+                 int endOnInput,         // endOnOutputSize, endOnInputSize
-Typically used to decode streams of inter-dependant blocks.
+                 int prefix64k,          // noPrefix, withPrefix
-Note : the 64K of memory before pointer 'source' must be allocated and read-allowed.
+                 int partialDecoding,    // full, partial
-*/
+                 int targetOutputSize    // only used if partialDecoding==partial
-#define FUNCTION_NAME LZ4_decompress_safe_withPrefix64k
+                 )
-#define EXITCONDITION_INPUTSIZE
+{
-#define PREFIX_64K
+    // Local Variables
-#include "lz4_decoder.h"
+    const BYTE* restrict ip = (const BYTE*) source;
    const BYTE* ref;
    const BYTE* const iend = ip + inputSize;
    BYTE* op = (BYTE*) dest;
    BYTE* const oend = op + outputSize;
    BYTE* cpy;
    BYTE* oexit = op + targetOutputSize;
    size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
 #if LZ4_ARCH64
    size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
 #endif
-/*
+    // Special case
-int LZ4_decompress_safe_partial(
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;   // targetOutputSize too large, better decode everything
-                        const char* source,
+    if unlikely(outputSize==0) goto _output_error;                          // Empty output buffer
                        char* dest,
                        int inputSize,
                        int targetOutputSize,
                        int maxOutputSize);
 LZ4_decompress_safe_partial() objective is to decompress only a part of the compressed input block provided.
 The decoding process stops as soon as 'targetOutputSize' bytes have been decoded, reducing decoding time.
 The result of the function is the number of bytes decoded.
 LZ4_decompress_safe_partial() may decode less than 'targetOutputSize' if input doesn't contain enough bytes to decode.
 Always verify how many bytes were decoded to ensure there are as many as wanted into the output buffer 'dest'.
 A corrupted input will produce an error result, a negative int.
 */
 #define FUNCTION_NAME LZ4_decompress_safe_partial
 #define EXITCONDITION_INPUTSIZE
 #define PARTIAL_DECODING
 #include "lz4_decoder.h"
-/*
+    // Main Loop
-int LZ4_decompress_fast(const char* source,
+    while (1)
-                        char* dest,
+    {
-                        int outputSize);
+        unsigned token;
        size_t length;
-This function is faster than LZ4_decompress_safe().
+        // get runlength
-LZ4_decompress_fast() guarantees it will never write nor read outside of output buffer.
+        token = *ip++;
-Since LZ4_decompress_fast() doesn't know the size of input buffer.
+        if ((length=(token>>ML_BITS)) == RUN_MASK)  
-it can only guarantee that it will never write into the input buffer, and will never read before its beginning.
+        { 
-To be used preferably in a controlled environment (when the compressed data to be decoded is from a trusted source).
+            unsigned s=255; 
-A detected corrupted input will produce an error result, a negative int.
+            while (((endOnInput)?ip<iend:1) && (s==255)) 
-*/
+            { 
-#define FUNCTION_NAME LZ4_decompress_fast
+                s = *ip++; 
-#include "lz4_decoder.h"
+                length += s; 
            } 
        }
        // copy literals
        cpy = op+length;
        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
        {
            if (partialDecoding)
            {
                if (cpy > oend) goto _output_error;                            // Error : write attempt beyond end of output buffer
                if ((endOnInput) && (ip+length > iend)) goto _output_error;    // Error : read attempt beyond end of input buffer
            }
            else
            {
                if ((!endOnInput) && (cpy != oend)) goto _output_error;        // Error : block decoding must stop exactly there, due to parsing restrictions
                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   // Error : not enough place for another match (min 4) + 5 literals
            }
            memcpy(op, ip, length);
            ip += length;
            op += length;
            break;                                       // Necessarily EOF, due to parsing restrictions
        }
        LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy;
        // get offset
        LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
        if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error;   // Error : offset outside destination buffer
        // get matchlength
        if ((length=(token&ML_MASK)) == ML_MASK) 
        { 
            while (endOnInput ? ip<iend-(LASTLITERALS+1) : 1)    // A minimum nb of input bytes must remain for LASTLITERALS + token
            { 
                unsigned s = *ip++; 
                length += s; 
                if (s==255) continue; 
                break; 
            } 
        }
        // copy repeated sequence
        if unlikely((op-ref)<STEPSIZE)
        {
 #if LZ4_ARCH64
            size_t dec64 = dec64table[op-ref];
 #else
            const size_t dec64 = 0;
 #endif
            op[0] = ref[0];
            op[1] = ref[1];
            op[2] = ref[2];
            op[3] = ref[3];
            op += 4, ref += 4; ref -= dec32table[op-ref];
            A32(op) = A32(ref); 
            op += STEPSIZE-4; ref -= dec64;
        } else { LZ4_COPYSTEP(ref,op); }
        cpy = op + length - (STEPSIZE-4);
        if unlikely(cpy>oend-(COPYLENGTH)-(STEPSIZE-4))
        {
            if (cpy > oend-LASTLITERALS) goto _output_error;    // Error : last 5 bytes must be literals
            LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH));
            while(op<cpy) *op++=*ref++;
            op=cpy;
            continue;
        }
        LZ4_WILDCOPY(ref, op, cpy);
        op=cpy;   // correction
    }
    // end of decoding
    if (endOnInput)
       return (int) (((char*)op)-dest);     // Nb of output bytes decoded
    else
       return (int) (((char*)ip)-source);   // Nb of input bytes read
    // Overflow error detected
 _output_error:
    return (int) (-(((char*)ip)-source))-1;
 }
-/*
+int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize)
-int LZ4_decompress_fast_withPrefix64k(
+{
-                        const char* source,
+    return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, full, 0);
-                        char* dest,
+}
                        int inputSize
                        int maxOutputSize);
-Same as LZ4_decompress_fast(), but will use the 64K of memory before the beginning of input buffer.
+int LZ4_decompress_fast(const char* source, char* dest, int outputSize)
-Typically used to decode streams of dependant inter-blocks.
+{
-Note : the 64K of memory before pointer 'source' must be allocated and read-allowed.
+    return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, noPrefix, full, 0);
-*/
+}
 #define FUNCTION_NAME LZ4_decompress_fast_withPrefix64k
 #define PREFIX_64K
 #include "lz4_decoder.h"
 // Bounded-input decoder for chained blocks.
 // Instantiates LZ4_decompress_generic() with endOnInputSize / withPrefix / full.
 // In withPrefix mode the generic decoder does not reject match offsets that point
 // before 'dest', so matches may reference data located ahead of the output buffer.
 // Returns the number of bytes written into 'dest', or a negative value on error.
 int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int inputSize, int maxOutputSize)
 {
     const int decodedBytes = LZ4_decompress_generic(source, dest,
                                                     inputSize, maxOutputSize,
                                                     endOnInputSize, withPrefix, full,
                                                     0);
     return decodedBytes;
 }
 // Output-size driven decoder for chained blocks.
 // Instantiates LZ4_decompress_generic() with endOnOutputSize / withPrefix / full.
 // The input size is passed as 0 because the generic decoder only consults it
 // when running in endOnInputSize mode.
 // Returns the number of bytes read from 'source', or a negative value on error.
 int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int outputSize)
 {
     const int consumedBytes = LZ4_decompress_generic(source, dest,
                                                      0, outputSize,
                                                      endOnOutputSize, withPrefix, full,
                                                      0);
     return consumedBytes;
 }
 // Partial decoder: stops once about 'targetOutputSize' bytes have been produced.
 // Instantiates LZ4_decompress_generic() with endOnInputSize / noPrefix / partial.
 // The generic decoder clamps the exit point when targetOutputSize is too close to
 // the end of the output buffer, in which case the whole block is decoded.
 // Returns the number of bytes written into 'dest', or a negative value on error.
 int LZ4_decompress_safe_partial(const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize)
 {
     const int decodedBytes = LZ4_decompress_generic(source, dest,
                                                     inputSize, maxOutputSize,
                                                     endOnInputSize, noPrefix, partial,
                                                     targetOutputSize);
     return decodedBytes;
 }
--- a/lz4.h
+++ b/lz4.h
@ -42,7 +42,7 @@ extern "C" {
 // Compiler Options
 //**************************************
 #if defined(_MSC_VER) && !defined(__cplusplus)   // Visual Studio
-#  define inline __inline           // Visual is not C99, but supports some kind of inline
+#  define inline __forceinline           // Visual C is not C99, but supports some kind of inline. Note : we *do* want to force inline
 #endif
@ -66,8 +66,8 @@ LZ4_compress() :
 LZ4_decompress_safe() :
    maxOutputSize : is the size of the destination buffer (which must be already allocated)
    return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
-             If the source stream is malformed, the function will stop decoding and return a negative result.
+             If the source stream is malformed or too large, the function will stop decoding and return a negative result.
-             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+             This function is protected against any kind of buffer overflow attempts (never writes outside of output buffer, and never reads outside of input buffer). It is therefore protected against malicious data packets.
 */
@ -150,8 +150,8 @@ int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outpu
 // Obsolete Functions
 //****************************
-static inline int LZ4_uncompress (const char* source, char* dest, int outputSize)   { return LZ4_decompress_fast(source, dest, outputSize); }
+static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
-static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize)   { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
 /* 
 These functions are deprecated and should no longer be used.
--- a/lz4_decoder.h
+++ b/lz4_decoder.h
@ -1,233 +0,0 @@
 /*
   LZ4 Decoder - Part of LZ4 compression algorithm
   Copyright (C) 2011-2013, Yann Collet.
   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   You can contact the author at :
   - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
   - LZ4 source repository : http://code.google.com/p/lz4/
 */
 /* lz4_decoder.h must be included into lz4.c
   The objective of this file is to create a single LZ4 decoder function source
   which will be instanciated multiple times with minor variations
   depending on a set of #define.
 */
 //****************************
 // Check required defines
 //****************************
 #ifndef FUNCTION_NAME
 #  error "FUNTION_NAME is not defined"
 #endif
 //****************************
 // Control tests
 //****************************
 #ifdef EXITCONDITION_INPUTSIZE
 #  define INPUTBUFFER_CONTROL(ip,iend) likely(ip<iend)
 #else
 #  define INPUTBUFFER_CONTROL(ip,iend) (1)
 #endif
 #ifdef PARTIAL_DECODING
 #  define OUTPUTTARGET(cpy,oexit) (cpy >= oexit)
 #else
 #  define OUTPUTTARGET(cpy,oexit) (0)
 #endif
 //****************************
 // Function code
 //****************************
 int FUNCTION_NAME(const char* source,
                 char* dest,
 #ifdef EXITCONDITION_INPUTSIZE
                 int inputSize,
 #endif
 #ifdef PARTIAL_DECODING
                 int targetOutputSize,
 #endif
                 int outputSize
                 )
 {
    // Local Variables
    const BYTE* restrict ip = (const BYTE*) source;
    const BYTE* ref;
 #ifdef EXITCONDITION_INPUTSIZE
    const BYTE* const iend = ip + inputSize;
 #endif
    BYTE* op = (BYTE*) dest;
    BYTE* const oend = op + outputSize;
    BYTE* cpy;
 #ifdef PARTIAL_DECODING
    BYTE* const oexit = op + targetOutputSize;
 #endif
    size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
 #if LZ4_ARCH64
    size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
 #endif
 #ifdef EXITCONDITION_INPUTSIZE
    // Special case
    if unlikely(!inputSize) goto _output_error;     // A correctly formed null-compressed LZ4 must have at least one byte (token=0)
 #endif
    // Main Loop
    while (1)
    {
        unsigned token;
        size_t length;
        // get runlength
        token = *ip++;
        if ((length=(token>>ML_BITS)) == RUN_MASK)  
        { 
            unsigned s=255; 
            while (INPUTBUFFER_CONTROL(ip,iend) && (s==255)) 
            { 
                s=*ip++; 
                length += s; 
            } 
        }
        // copy literals
        cpy = op+length;
 #ifdef EXITCONDITION_INPUTSIZE
        if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS)) || OUTPUTTARGET(cpy,oexit))
        {
            if (cpy > oend) goto _output_error;          // Error : write attempt beyond end of output buffer
            if ((!OUTPUTTARGET(cpy,oexit)) && (ip+length != iend)) goto _output_error;   // Error : Must consume all input at this stage, except if reaching TargetOutputSize
 #else
        if (cpy>oend-COPYLENGTH)
        {
            if (cpy != oend) goto _output_error;         // Error : not enough place for another match (min 4) + 5 literals
 #endif
            memcpy(op, ip, length);
            ip += length;
            op += length;
            break;                                       // Necessarily EOF, due to parsing restrictions
        }
        LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy;
        // get offset
        LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
 #ifndef PREFIX_64K
        if unlikely(ref < (BYTE* const)dest) goto _output_error;   // Error : offset outside destination buffer
 #endif
        // get matchlength
        if ((length=(token&ML_MASK)) == ML_MASK) 
        { 
            while INPUTBUFFER_CONTROL(ip,iend-(LASTLITERALS+1))    // A minimum nb of input bytes must remain for LASTLITERALS + token
            { 
                unsigned s = *ip++; 
                length += s; 
                if (s==255) continue; 
                break; 
            } 
        }
        // copy repeated sequence
        if unlikely((op-ref)<STEPSIZE)
        {
 #if LZ4_ARCH64
            size_t dec64 = dec64table[op-ref];
 #else
            const size_t dec64 = 0;
 #endif
            op[0] = ref[0];
            op[1] = ref[1];
            op[2] = ref[2];
            op[3] = ref[3];
            op += 4, ref += 4; ref -= dec32table[op-ref];
            A32(op) = A32(ref); 
            op += STEPSIZE-4; ref -= dec64;
        } else { LZ4_COPYSTEP(ref,op); }
        cpy = op + length - (STEPSIZE-4);
        if unlikely(cpy>oend-(COPYLENGTH)-(STEPSIZE-4))
        {
            if (cpy > oend-LASTLITERALS) goto _output_error;    // Error : last 5 bytes must be literals
            LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH));
            while(op<cpy) *op++=*ref++;
            op=cpy;
            continue;
        }
        LZ4_WILDCOPY(ref, op, cpy);
        op=cpy;		// correction
    }
    // end of decoding
 #ifdef EXITCONDITION_INPUTSIZE
    return (int) (((char*)op)-dest);     // Nb of output bytes decoded
 #else
    return (int) (((char*)ip)-source);   // Nb of input bytes read
 #endif
    // Overflow error detected
 _output_error:
    return (int) (-(((char*)ip)-source))-1;
 }
 //****************************
 // Clean defines
 //****************************
 // Required defines
 #undef FUNCTION_NAME
 // Locally Generated
 #undef INPUTBUFFER_CONTROL
 #undef OUTPUTTARGET
 // Optional defines
 #ifdef EXITCONDITION_INPUTSIZE
 #undef EXITCONDITION_INPUTSIZE
 #endif
 #ifdef PREFIX_64K
 #undef PREFIX_64K
 #endif
 #ifdef PARTIAL_DECODING
 #undef PARTIAL_DECODING
 #endif
--- a/lz4hc.c
+++ b/lz4hc.c
@ -50,9 +50,10 @@ Note : this source file requires "lz4hc_encoder.h"
 // CPU Feature Detection
 //**************************************
 // 32 or 64 bits ?
-#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) \
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
-  || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) \
+  || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
-  || defined(__ia64__) )   // Detects 64 bits mode
+  || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) )   // Detects 64 bits mode
 #  define LZ4_ARCH64 1
 #else
 #  define LZ4_ARCH64 0
@ -68,7 +69,7 @@ Note : this source file requires "lz4hc_encoder.h"
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define LZ4_BIG_ENDIAN 1
 #elif defined(__sparc) || defined(__sparc__) \
-   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+   || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
   || defined(__hpux)  || defined(__hppa) \
   || defined(_MIPSEB) || defined(__s390__)
 #  define LZ4_BIG_ENDIAN 1
@ -99,8 +100,7 @@ Note : this source file requires "lz4hc_encoder.h"
 #endif
 #ifdef _MSC_VER
-#  define inline __inline             // Visual is not C99, but supports some kind of inline
+#  define forceinline __forceinline
 #  define forceinline __forceinline   
 #  include <intrin.h>                 // For Visual 2005
 #  if LZ4_ARCH64	// 64-bit
 #    pragma intrinsic(_BitScanForward64) // For Visual 2005
@ -205,6 +205,7 @@ typedef struct _U64_S { U64 v; } _PACKED U64_S;
 #define MB *(1U<<20)
 #define GB *(1U<<30)
 //**************************************
 // Architecture-specific macros
 //**************************************
--- a/xxhash.c
+++ b/xxhash.c
@ -35,6 +35,14 @@ You can contact the author at :
 //**************************************
 // Tuning parameters
 //**************************************
 // Unaligned memory access is automatically enabled for "common" CPU, such as x86.
 // For other CPUs, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
 // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
 // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
 #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
 #  define XXH_USE_UNALIGNED_ACCESS 1
 #endif
 // XXH_ACCEPT_NULL_INPUT_POINTER :
 // If the input pointer is a null pointer, xxHash default behavior is to crash, since it is a bad input.
 // If this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
@ -45,21 +53,33 @@ You can contact the author at :
 // XXH_FORCE_NATIVE_FORMAT :
 // By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
 // Results are therefore identical for little-endian and big-endian CPU.
-// This comes at a  performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance to your application, you may uncomment the #define below
+// Should endian-independence be of no importance for your application, you may uncomment the #define below.
 // It will improve speed for Big-endian CPU.
 // This option has no impact on Little_Endian CPU.
 //#define XXH_FORCE_NATIVE_FORMAT 1
 //**************************************
 // Compiler Options
 //**************************************
 #if defined(_MSC_VER) && !defined(__cplusplus)   // Visual Studio
 #  define inline __inline           // Visual C is not C99, but supports some kind of inline
 #endif
 //**************************************
-// Includes
+// Includes & Memory related functions
 //**************************************
 #include <stdlib.h>    // for malloc(), free()
 #include <string.h>    // for memcpy()
 #include "xxhash.h"
-
+// Modify the local functions below should you wish to use some other memory related routines
 // for malloc(), free()
 #include <stdlib.h>
 // Allocation hook: forwards the request to the C library malloc() and returns its result.
 static inline void* XXH_malloc(size_t s)
 {
     void* const block = malloc(s);
     return block;
 }
 // Release hook: hands the pointer to the C library free().
 static inline void XXH_free(void* p)
 {
     free(p);
 }
 // for memcpy()
 #include <string.h>
 // Copy hook: forwards to the C library memcpy() and returns its result (the destination pointer).
 static inline void* XXH_memcpy(void* dest, const void* src, size_t size)
 {
     void* const written = memcpy(dest, src, size);
     return written;
 }
 //**************************************
@ -77,8 +97,8 @@ You can contact the author at :
 #  endif
 #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
 #  define XXH_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
+#elif defined(__sparc)  || defined(__sparc__) \
-    || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+    || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
    || defined(__hpux)  || defined(__hppa) \
    || defined(_MIPSEB) || defined(__s390__)
 #  define XXH_BIG_ENDIAN 1
@ -101,21 +121,39 @@ You can contact the author at :
  typedef  int32_t S32;
  typedef uint64_t U64;
 #else
-  typedef unsigned char       BYTE;
+  typedef unsigned char      BYTE;
-  typedef unsigned short      U16;
+  typedef unsigned short     U16;
-  typedef unsigned int        U32;
+  typedef unsigned int       U32;
-  typedef   signed int        S32;
+  typedef   signed int       S32;
-  typedef unsigned long long  U64;
+  typedef unsigned long long U64;
 #endif
 #if defined(__GNUC__)  && !defined(XXH_USE_UNALIGNED_ACCESS)
 #  define _PACKED __attribute__ ((packed))
 #else
 #  define _PACKED
 #endif
-//**************************************
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
-// Compiler-specific Options & Functions
+#  pragma pack(push, 1)
-//**************************************
+#endif
 typedef struct _U32_S { U32 v; } _PACKED U32_S;
 #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
 #  pragma pack(pop)
 #endif
 #define A32(x) (((U32_S *)(x))->v)
 //***************************************
 // Compiler-specific Functions and Macros
 //***************************************
 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
+// Note : although _rotl exists for MinGW (GCC under Windows), performance seems poor
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 #  define XXH_rotl32(x,r) _rotl(x,r)
 #else
 // Portable 32-bit left rotation of x by r bits.
 // Every argument is parenthesized so that compound expressions (e.g. a|b, n+1) rotate as a whole.
 // x is expected to be a 32-bit unsigned value and r to be in the range 1..31 (a shift by 32 is undefined).
 // Note : both arguments are evaluated twice, so do not pass expressions with side effects.
 #  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
@ -147,7 +185,9 @@ static inline U32 XXH_swap32 (U32 x) {
 //**************************************
 // Macros
 //**************************************
-#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(U32*)(p)) : *(U32*)(p))
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; }    // use only *after* variable declarations
 #define XXH_LE32(p)          (XXH_BIG_ENDIAN ? XXH_swap32(A32(p))     : A32(p))
 #define XXH_alignedLE32(p)   (XXH_BIG_ENDIAN ? XXH_swap32(*(U32*)(p)) : *(U32*)(p))
@ -155,6 +195,53 @@ static inline U32 XXH_swap32 (U32 x) {
 // Simple Hash Functions
 //****************************
 #if !defined(XXH_USE_UNALIGNED_ACCESS)
 // Specific version, for input aligned on a 32-bit boundary. Useless for CPUs supporting unaligned access.
 // Hashes the 'len' bytes starting at 'input' with the given 'seed' and returns the 32-bit result.
 // Words are read with XXH_alignedLE32(), which dereferences the address directly as a U32 :
 // the caller is therefore responsible for passing a 4-byte aligned 'input'.
 // NOTE(review): presumably yields the same value as the generic path of XXH32() - confirm against that function.
 static U32 XXH32_alignedInput(const void* input, int len, U32 seed)
 {
    const BYTE* p = (const BYTE*)input;
    const BYTE* const bEnd = p + len;
    U32 h32;
    if (len>=16)
    {
        // Bulk phase : four accumulators, each consuming one 32-bit word per 16-byte stripe
        const BYTE* const limit = bEnd - 16;   // last position from which a full 16-byte stripe can still be read
        U32 v1 = seed + PRIME32_1 + PRIME32_2;
        U32 v2 = seed + PRIME32_2;
        U32 v3 = seed + 0;
        U32 v4 = seed - PRIME32_1;
        do
        {
            v1 += XXH_alignedLE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
            v2 += XXH_alignedLE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
            v3 += XXH_alignedLE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
            v4 += XXH_alignedLE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
        } while (p<=limit);
        // Merge the four accumulators into a single 32-bit value
        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
    }
    else { h32  = seed + PRIME32_5; }   // short input : no bulk phase, start directly from the seed
    h32 += (U32) len;   // the total length takes part in the hash
    // Remaining whole 32-bit words
    while (p<=bEnd-4)
    {
        h32 += XXH_alignedLE32(p) * PRIME32_3;
        h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
        p+=4;
    }
    // Remaining trailing bytes, one at a time
    while (p<bEnd)
    {
        h32 += (*p) * PRIME32_5;
        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
        p++;
    }
    // Final mixing : alternate xor-shifts and multiplications
    h32 ^= h32 >> 15;
    h32 *= PRIME32_2;
    h32 ^= h32 >> 13;
    h32 *= PRIME32_3;
    h32 ^= h32 >> 16;
    return h32;
 }
 #endif
 U32 XXH32(const void* input, int len, U32 seed)
 {
 #if 0
@ -172,6 +259,10 @@ U32 XXH32(const void* input, int len, U32 seed)
    if (p==NULL) { len=0; p=(const BYTE*)16; }
 #endif
 #if !defined(XXH_USE_UNALIGNED_ACCESS)
    if ((((U32)p) & 3) == 0) return XXH32_alignedInput(input, len, seed);   // Input is aligned, let's leverage the speed advantage
 #endif
    if (len>=16)
    {
        const BYTE* const limit = bEnd - 16;
@ -229,21 +320,25 @@ U32 XXH32(const void* input, int len, U32 seed)
 // State carried between calls of the streaming interface (XXH32_resetState / XXH32_update / XXH32_intermediateDigest).
 // NOTE(review): 'total_len' and 'memory[16]' are each listed twice below, which cannot compile as a single struct.
 // The added/removed markers of this hunk appear to have been lost, so this listing mixes the old and new member order.
 // TODO confirm the intended layout against the actual xxhash.c before relying on this listing.
 struct XXH_state32_t
 {
    U64 total_len;      // reset to 0 by XXH32_resetState(); compared against 16 in XXH32_intermediateDigest()
    U32 seed;           // seed given to XXH32_resetState()
    U32 v1;             // v1..v4 : the four running accumulators
    U32 v2;
    U32 v3;
    U32 v4;
    U64 total_len;
    char memory[16];
    int memsize;        // number of bytes currently buffered in 'memory' (always below 16 between updates)
    char memory[16];    // temporary buffer for input that does not yet fill a 16-byte stripe
 };
-int XXH32_sizeofState() { return sizeof(struct XXH_state32_t); }
+int XXH32_sizeofState() 
 {
    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here means XXH32_SIZEOFSTATE is not large enough
    return sizeof(struct XXH_state32_t); 
 }
-XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed)
+XXH_errorcode XXH32_resetState(void* state_in, U32 seed)
 { 
    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
    state->seed = seed;
@ -253,15 +348,15 @@ XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed)
    state->v4 = seed - PRIME32_1;
    state->total_len = 0;
    state->memsize = 0;
-    return OK;
+    return XXH_OK;
 }
 // Allocates a new hashing state with XXH_malloc() and initializes it for 'seed' through XXH32_resetState().
 // Returns the state as an opaque pointer, or NULL when the allocation fails.
 // Callers must check the result for NULL before passing it to the other XXH32_* functions.
 void* XXH32_init (U32 seed)
 {
-    struct XXH_state32_t * state = (struct XXH_state32_t *) malloc (sizeof(struct XXH_state32_t));
+    void* state = XXH_malloc (sizeof(struct XXH_state32_t));
    if (state == NULL) return NULL;   // allocation failed : XXH32_resetState() would otherwise write through a NULL pointer
    XXH32_resetState(state, seed);
-    return (void*)state;
+    return state;
 }
@ -279,14 +374,14 @@ XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
    if (state->memsize + len < 16)   // fill in tmp buffer
    {
-        memcpy(state->memory + state->memsize, input, len);
+        XXH_memcpy(state->memory + state->memsize, input, len);
        state->memsize +=  len;
-        return OK;
+        return XXH_OK;
    }
    if (state->memsize)   // some data left from previous update
    {
-        memcpy(state->memory + state->memsize, input, 16-state->memsize);
+        XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize);
        {
            const U32* p32 = (const U32*)state->memory;
            state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
@ -322,11 +417,11 @@ XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
    if (p < bEnd)
    {
-        memcpy(state->memory, p, bEnd-p);
+        XXH_memcpy(state->memory, p, bEnd-p);
        state->memsize = (int)(bEnd-p);
    }
-    return OK;
+    return XXH_OK;
 }
@ -337,7 +432,6 @@ U32 XXH32_intermediateDigest (void* state_in)
    BYTE* bEnd = (BYTE*)state->memory + state->memsize;
    U32 h32;
    if (state->total_len >= 16)
    {
        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
@ -377,7 +471,7 @@ U32 XXH32_digest (void* state_in)
 {
    U32 h32 = XXH32_intermediateDigest(state_in);
-    free(state_in);
+    XXH_free(state_in);
    return h32;
 }
--- a/xxhash.h
+++ b/xxhash.h
@ -27,8 +27,8 @@
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-	You can contact the author at :
+   You can contact the author at :
-	- xxHash source repository : http://code.google.com/p/xxhash/
+   - xxHash source repository : http://code.google.com/p/xxhash/
 */
 /* Notice extracted from xxHash homepage :
@ -67,7 +67,7 @@ extern "C" {
 //****************************
 // Type
 //****************************
-typedef enum { OK=0, XXH_ERROR } XXH_errorcode;
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
@ -79,13 +79,13 @@ unsigned int XXH32 (const void* input, int len, unsigned int seed);
 /*
 XXH32() :
-	Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
+    Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
    The memory between input & input+len must be valid (allocated and read-accessible).
-	"seed" can be used to alter the result predictably.
+    "seed" can be used to alter the result predictably.
-	This function successfully passes all SMHasher tests.
+    This function successfully passes all SMHasher tests.
-	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
-	Note that "len" is type "int", which means it is limited to 2^31-1.
+    Note that "len" is type "int", which means it is limited to 2^31-1.
-	If your data is larger, use the advanced functions below.
+    If your data is larger, use the advanced functions below.
 */
@ -122,14 +122,19 @@ Memory will be freed by XXH32_digest().
 int           XXH32_sizeofState();
-XXH_errorcode XXH32_resetState(void* state_in, unsigned int seed);
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
 /*
 These functions are the basic elements of XXH32_init();
 The objective is to allow user application to make its own allocation.
-XXH32_sizeofState() is used to know how much space must be allocated by the application.
+#define       XXH32_SIZEOFSTATE 48
-This space must be referenced by a void* pointer.
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
-This pointer must be provided as 'state_in' into XXH32_resetState(), which initializes the state.
+/*
 These functions allow user application to make its own allocation for state.
 XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state.
 Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
 This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
 For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
 use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
 */
@ -138,7 +143,7 @@ unsigned int XXH32_intermediateDigest (void* state);
 This function does the same as XXH32_digest(), generating a 32-bit hash,
 but preserves the memory context.
 This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
-To free memory context, use XXH32_digest().
+To free memory context, use XXH32_digest(), or free().
 */