diff --git a/programs/Makefile b/programs/Makefile index 543eb7c..076d3bf 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -20,7 +20,7 @@ # # You can contact the author at : # - LZ4 source repository : http://code.google.com/p/lz4/ -# - LZ4 forum froup : https://groups.google.com/forum/#!forum/lz4c +# - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c # ########################################################################## # lz4 : Command Line Utility, supporting gzip-like arguments # lz4c : CLU, supporting also legacy lz4demo arguments @@ -143,6 +143,7 @@ test-lz4: lz4 datagen ./datagen -g16KB | ./lz4 -9 | ./lz4 -vdq > $(VOID) ./datagen | ./lz4 | ./lz4 -vdq > $(VOID) ./datagen -g6M -p100 | ./lz4 -9BD | ./lz4 -vdq > $(VOID) + ./datagen -g17M | ./lz4 -9v | ./lz4 -vdq > $(VOID) ./datagen -g256MB | ./lz4 -vqB4D | ./lz4 -vdq > $(VOID) ./datagen -g6GB | ./lz4 -vqB5D | ./lz4 -vdq > $(VOID) # test frame concatenation with null-length frame diff --git a/programs/fullbench.c b/programs/fullbench.c old mode 100644 new mode 100755 index b785924..756357a --- a/programs/fullbench.c +++ b/programs/fullbench.c @@ -24,12 +24,12 @@ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ -//************************************** -// Compiler Options -//************************************** -// Disable some Visual warning messages +/************************************** +* Compiler Options +**************************************/ +/* Disable some Visual warning messages */ #define _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_DEPRECATE // VS2005 +#define _CRT_SECURE_NO_DEPRECATE /* VS2005 */ // Unix Large Files support (>4GB) #if (defined(__sun__) && (!defined(__LP64__))) // Sun Solaris 32-bits requires specific definitions @@ -45,9 +45,9 @@ #endif -//************************************** -// Includes -//************************************** +/************************************** +* Includes +**************************************/ #include // malloc #include // fprintf, fopen, ftello64 #include // stat64 @@ -68,10 +68,10 @@ #include "xxhash.h" -//************************************** -// Compiler Options -//************************************** -// S_ISREG & gettimeofday() are not supported by MSVC +/************************************** +* Compiler Options +**************************************/ +/* S_ISREG & gettimeofday() are not supported by MSVC */ #if !defined(S_ISREG) # define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif @@ -82,9 +82,9 @@ #endif -//************************************** -// Basic Types -//************************************** +/************************************** +* Basic Types +**************************************/ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 # include typedef uint8_t BYTE; @@ -101,9 +101,9 @@ #endif -//**************************** -// Constants -//**************************** +/************************************** +* Constants +**************************************/ #define PROGRAM_DESCRIPTION "LZ4 speed analyzer" #ifndef LZ4_VERSION # define LZ4_VERSION "" @@ -122,9 +122,9 @@ #define ALL_DECOMPRESSORS 0 -//************************************** -// Local structures -//************************************** +/************************************** +* Local structures +**************************************/ struct chunkParameters { U32 id; @@ -135,9 +135,9 @@ struct chunkParameters }; -//************************************** -// MACRO -//************************************** +/************************************** +* MACRO +**************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define PROGRESS(...) no_prompt ? 0 : DISPLAY(__VA_ARGS__) @@ -251,8 +251,127 @@ static U64 BMK_GetFileSize(char* infilename) /********************************************************* - Benchmark function +* Benchmark function *********************************************************/ +#ifdef __SSSE3__ + +#include + +/* Idea proposed by Terje Mathisen */ +static BYTE stepSize16[17] = {16,16,16,15,16,15,12,14,16,9,10,11,12,13,14,15,16}; +static __m128i replicateTable[17] = { + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1}, + {0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0}, + {0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}, + {0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0}, + {0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3}, + {0,1,2,3,4,5,6,0,1,2,3,4,5,6,0,1}, + {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}, + {0,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6}, + {0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, + {0,1,2,3,4,5,6,7,8,9,10,0,1,2,3,4}, + {0,1,2,3,4,5,6,7,8,9,10,11,0,1,2,3}, + {0,1,2,3,4,5,6,7,8,9,10,11,12,0,1,2}, + {0,1,2,3,4,5,6,7,8,9,10,11,12,13,0,1}, + {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0}, + {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}}; +static BYTE stepSize32[17] = {32,32,32,30,32,30,30,28,32,27,30,22,24,26,28,30,16}; +static __m128i replicateTable2[17] = { + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1}, + {1,2,0,1,2,0,1,2,0,1,2,0,1,2,0,1}, + {0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}, + {1,2,3,4,0,1,2,3,4,0,1,2,3,4,0,1}, + {4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1}, + {2,3,4,5,6,0,1,2,3,4,5,6,0,1,2,3}, + {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}, + {7,8,0,1,2,3,4,5,6,7,8,0,1,2,3,4}, + {6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1}, + {5,6,7,8,9,10,0,1,2,3,4,5,6,7,8,9}, + {4,5,6,7,8,9,10,11,0,1,2,3,4,5,6,7}, + {3,4,5,6,7,8,9,10,11,12,0,1,2,3,4,5}, + {2,3,4,5,6,7,8,9,10,11,12,13,0,1,2,3}, + {1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,1}, + {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}}; + +U32 lz4_decode_sse(BYTE* dest, BYTE* src, U32 srcLength) +{ + BYTE* d = dest, *e = src+srcLength; + unsigned token, lit_len, mat_len; + __m128i a; + BYTE* dstore, *msrc; + + if (!srcLength) return 0; + goto start; + + do { + U32 step; + unsigned mat_offset = src[0] + (src[1] << 8); + src += 2; + msrc = d - mat_offset; + if (mat_len == 15) { + do { + token = *src++; + mat_len += token; + } while (token == 255); + } + mat_len += 4; + + dstore = d; + d += mat_len; + + if (mat_offset <= 16) + { // Bulk store only! + __m128i a2; + a = _mm_loadu_si128((const __m128i *)msrc); + a2 = _mm_shuffle_epi8(a, replicateTable2[mat_offset]); + a = _mm_shuffle_epi8(a, replicateTable[mat_offset]); + step = stepSize32[mat_offset]; + do { + _mm_storeu_si128((__m128i *)dstore, a); + _mm_storeu_si128((__m128i *)(dstore+16), a2); + dstore += step; + } while (dstore < d); + } + else + { + do + { + a = _mm_loadu_si128((const __m128i *)msrc); + _mm_storeu_si128((__m128i *)dstore, a); + msrc += sizeof(a); + dstore += sizeof(a); + } while (dstore < d); + } +start: + token = *src++; + lit_len = token >> 4; + mat_len = token & 15; + if (token >= 0xf0) { // lit_len == 15 + do { + token = *src++; + lit_len += token; + } while (token == 255); + } + dstore = d; + msrc = src; + d += lit_len; + src += lit_len; + do { + a = _mm_loadu_si128((const __m128i *)msrc); + _mm_storeu_si128((__m128i *)dstore, a); + msrc += sizeof(a); + dstore += sizeof(a); + } while (dstore < d); + } while (src < e); + + return (U32)(d-dest); +} +#endif // __SSSE3__ + static int local_LZ4_compress_limitedOutput(const char* in, char* out, int inSize) { @@ -345,6 +464,7 @@ static int local_LZ4_saveDictHC(const char* in, char* out, int inSize) static int local_LZ4_decompress_fast(const char* in, char* out, int inSize, int outSize) { (void)inSize; + //lz4_decode_sse((BYTE*)out, (BYTE*)in, inSize); LZ4_decompress_fast(in, out, outSize); return outSize; } @@ -677,7 +797,7 @@ int fullSpeedBench(char** fileNamesTable, int nbFiles) PROGRESS("%1i- %-29.29s :%10i -> %7.1f MB/s\r", loopNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000.); - // CRC Checking + /* CRC Checking */ crcDecoded = XXH32(orig_buff, (int)benchedSize, 0); if (crcOriginal!=crcDecoded) { DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", inFileName, (unsigned)crcOriginal, (unsigned)crcDecoded); exit(1); } }