From e98ea20c373ed1de17d4c27f62c7ad52833b085d Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 28 Jan 2015 01:23:14 +0100 Subject: [PATCH 1/4] Fix for VS2005 (AVX2 intrinsics) --- lib/zstd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index 799df03b..61190578 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -66,10 +66,9 @@ #include /* calloc */ #include /* memcpy, memmove */ #include /* debug : printf */ -#include /* AVX2 intrinsics */ #include "zstd_static.h" #if defined(__clang__) || defined(__GNUC__) -# include "fse.c" /* unfortunately due GCC/Clang inlining limitations, this include runs noticeably faster */ +# include "fse.c" /* due to GCC/Clang inlining limitations, including *.c runs noticeably faster */ #else # include "fse_static.h" #endif @@ -78,6 +77,10 @@ /******************************************************** * Compiler specifics *********************************************************/ +#if (!(defined(_MSC_VER) && (_MSC_VER<=1400))) /* exclude Visual 2005 and below */ +# include /* AVX2 intrinsics */ +#endif + #ifdef _MSC_VER /* Visual Studio */ # define FORCE_INLINE static __forceinline # include /* For Visual 2005 */ From 565b81d0ba57a18b7b29581db120c7ddea859685 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 29 Jan 2015 06:51:30 +0100 Subject: [PATCH 2/4] fixed : corner case (FSE) --- lib/fse.c | 143 ++++++++++++++++++++++++------------------------------ 1 file changed, 64 insertions(+), 79 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index c26c1311..cc9097c8 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -132,7 +132,7 @@ static U32 FSE_readLE32(const void* memPtr) return FSE_read32(memPtr); else { - const BYTE* p = (const BYTE*)memPtr; + const BYTE* p = memPtr; return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24)); } } @@ -145,7 +145,7 @@ static void FSE_writeLE32(void* memPtr, U32 val32) } else { - BYTE* p = (BYTE*)memPtr; + BYTE* p = memPtr; p[0] = (BYTE)val32; p[1] = (BYTE)(val32>>8); p[2] = (BYTE)(val32>>16); @@ -166,7 +166,7 @@ static U64 FSE_readLE64(const void* memPtr) return FSE_read64(memPtr); else { - const BYTE* p = (const BYTE*)memPtr; + const BYTE* p = memPtr; return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24) + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); } @@ -180,7 +180,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64) } else { - BYTE* p = (BYTE*)memPtr; + BYTE* p = memPtr; p[0] = (BYTE)val64; p[1] = (BYTE)(val64>>8); p[2] = (BYTE)(val64>>16); @@ -533,70 +533,6 @@ void FSE_freeCTable (void* CTable) free(CTable); } -/* Emergency distribution strategy (fallback); compression will suffer a lot ; consider increasing table size */ -static void FSE_emergencyDistrib(short* normalizedCounter, int maxSymbolValue, short points) -{ - int s=0; - while (points) - { - if (normalizedCounter[s] > 1) - { - normalizedCounter[s]--; - points--; - } - s++; - if (s>maxSymbolValue) s=0; - } -} - -/* fallback distribution (corner case); compression will suffer a bit ; consider increasing table size */ -void FSE_distribNpts(short* normalizedCounter, int maxSymbolValue, short points) -{ - int s; - int rank[5] = {0}; - int fallback=0; - - /* Sort 4 largest (they'll absorb normalization rounding) */ - for (s=1; s<=maxSymbolValue; s++) - { - int i, b=3; - if (b>=s) b=s-1; - while ((b>=0) && (normalizedCounter[s]>normalizedCounter[rank[b]])) b--; - for (i=3; i>b; i--) rank[i+1] = rank[i]; - rank[b+1]=s; - } - - /* Distribute points */ - s = 0; - while (points) - { - short limit = normalizedCounter[rank[s+1]]+1; - if (normalizedCounter[rank[s]] >= limit + points ) - { - normalizedCounter[rank[s]] -= points; - break; - } - points -= normalizedCounter[rank[s]] - limit; - normalizedCounter[rank[s]] = limit; - s++; - if (s==3) - { - short reduction = points>>2; - if (fallback) - { - FSE_emergencyDistrib(normalizedCounter, maxSymbolValue, points); /* Fallback mode */ - return; - } - if (reduction < 1) reduction=1; - if (reduction >= normalizedCounter[rank[3]]) reduction=normalizedCounter[rank[3]]-1; - fallback = (reduction==0); - normalizedCounter[rank[3]]-=reduction; - points-=reduction; - s=0; - } - } -} - unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) { @@ -618,12 +554,13 @@ typedef struct int FSE_compareRankT(const void* r1, const void* r2) { - const rank_t* R1 = (const rank_t*)r1; - const rank_t* R2 = (const rank_t*)r2; + const rank_t* R1 = r1; + const rank_t* R2 = r2; return 2 * (R1->count < R2->count) - 1; } +#if 0 static void FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* count, U32 maxSymbolValue) { rank_t rank[FSE_MAX_SYMBOL_VALUE+1]; @@ -657,6 +594,48 @@ static void FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* } } +#else + +static size_t FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* count, U32 maxSymbolValue) +{ + rank_t rank[FSE_MAX_SYMBOL_VALUE+1]; + U32 s; + + /* Init */ + for (s=0; s<=maxSymbolValue; s++) + { + rank[s].id = s; + rank[s].count = count[s]; + if (norm[s] <= 1) rank[s].count = 0; + } + + /* Sort according to count */ + qsort(rank, maxSymbolValue+1, sizeof(rank_t), FSE_compareRankT); + + while(pointsToRemove) + { + int newRank = 1; + rank_t savedR; + if (norm[rank[0].id] == 1) + return (size_t)-FSE_ERROR_GENERIC; + norm[rank[0].id]--; + pointsToRemove--; + rank[0].count -= (rank[0].count + 6) >> 3; + if (norm[rank[0].id] == 1) + rank[0].count=0; + savedR = rank[0]; + while (rank[newRank].count > savedR.count) + { + rank[newRank-1] = rank[newRank]; + newRank++; + } + rank[newRank-1] = savedR; + } + + return 0; +} +#endif + size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t total, @@ -710,21 +689,27 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, stillToDistribute -= proba; } } - //if ((int)normalizedCounter[largest] <= -stillToDistribute+8) if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) { - /* largest cant accommodate that amount */ - FSE_adjustNormSlow(normalizedCounter, -stillToDistribute, count, maxSymbolValue); - //FSE_distribNpts(normalizedCounter, maxSymbolValue, (short)(-stillToDistribute)); /* Fallback */ + size_t errorCode; + /* corner case, need to converge towards normalization with caution */ + errorCode = FSE_adjustNormSlow(normalizedCounter, -stillToDistribute, count, maxSymbolValue); + if (FSE_isError(errorCode)) return errorCode; + //FSE_adjustNormSlow(normalizedCounter, -stillToDistribute, count, maxSymbolValue); } else normalizedCounter[largest] += (short)stillToDistribute; } #if 0 { /* Print Table (debug) */ - int s; + U32 s; + U32 nTotal = 0; for (s=0; s<=maxSymbolValue; s++) printf("%3i: %4i \n", s, normalizedCounter[s]); + for (s=0; s<=maxSymbolValue; s++) + nTotal += abs(normalizedCounter[s]); + if (nTotal != (1U<state = FSE_readBits(bitD, base32[0]); FSE_reloadDStream(bitD); DStatePtr->table = base32 + 1; @@ -1511,7 +1496,7 @@ void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (void* DTable) size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION) (void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { - U32* const base32 = (U32* const)DTable; + U32* const base32 = DTable; FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1); const U32 tableSize = 1 << tableLog; const U32 tableMask = tableSize-1; From 6434adcf54fe7e7ca2ec76a7a054b68865112d34 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 29 Jan 2015 07:00:43 +0100 Subject: [PATCH 3/4] Fix : no AVX2 for Visual 2008 and older --- lib/zstd.c | 4 ++-- programs/Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/zstd.c b/lib/zstd.c index 61190578..4f2fe472 100644 --- a/lib/zstd.c +++ b/lib/zstd.c @@ -77,7 +77,7 @@ /******************************************************** * Compiler specifics *********************************************************/ -#if (!(defined(_MSC_VER) && (_MSC_VER<=1400))) /* exclude Visual 2005 and below */ +#if (!(defined(_MSC_VER) && (_MSC_VER<=1500))) /* exclude Visual 2008 and below */ # include /* AVX2 intrinsics */ #endif @@ -140,7 +140,7 @@ static const U32 ZSTD_magicNumber = 0xFD2FB51C; #define MB *(1<<20) #define BLOCKSIZE (128 KB) // define, for static allocation -static const size_t g_maxBlockSize = 128 KB; //((size_t)1 << 22) - 1; +static const size_t g_maxBlockSize = 128 KB; static const U32 g_maxDistance = 512 KB; static const U32 g_searchStrength = 8; diff --git a/programs/Makefile b/programs/Makefile index 23a482de..c3335a47 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -150,7 +150,7 @@ test-mem: zstd datagen fuzzer fullbench valgrind --leak-check=yes ./zstd -vf tmp /dev/null ./datagen -g128MB > tmp valgrind --leak-check=yes ./zstd -vf tmp /dev/null - rm tmp + @rm tmp valgrind --leak-check=yes ./fuzzer -i128 -t1 valgrind --leak-check=yes ./fullbench -i1 From 1cc58def2df7253b1da82675d3c6f26af99f11d7 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 29 Jan 2015 07:13:54 +0100 Subject: [PATCH 4/4] Restored cast for C++ (fse) --- lib/fse.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/fse.c b/lib/fse.c index cc9097c8..a1aeea52 100644 --- a/lib/fse.c +++ b/lib/fse.c @@ -132,7 +132,7 @@ static U32 FSE_readLE32(const void* memPtr) return FSE_read32(memPtr); else { - const BYTE* p = memPtr; + const BYTE* p = (const BYTE*)memPtr; return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24)); } } @@ -145,7 +145,7 @@ static void FSE_writeLE32(void* memPtr, U32 val32) } else { - BYTE* p = memPtr; + BYTE* p = (BYTE*)memPtr; p[0] = (BYTE)val32; p[1] = (BYTE)(val32>>8); p[2] = (BYTE)(val32>>16); @@ -166,7 +166,7 @@ static U64 FSE_readLE64(const void* memPtr) return FSE_read64(memPtr); else { - const BYTE* p = memPtr; + const BYTE* p = (const BYTE*)memPtr; return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24) + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); } @@ -180,7 +180,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64) } else { - BYTE* p = memPtr; + BYTE* p = (BYTE*)memPtr; p[0] = (BYTE)val64; p[1] = (BYTE)(val64>>8); p[2] = (BYTE)(val64>>16); @@ -554,8 +554,8 @@ typedef struct int FSE_compareRankT(const void* r1, const void* r2) { - const rank_t* R1 = r1; - const rank_t* R2 = r2; + const rank_t* R1 = (const rank_t*)r1; + const rank_t* R2 = (const rank_t*)r2; return 2 * (R1->count < R2->count) - 1; } @@ -990,7 +990,7 @@ size_t FSE_decompressRLE(void* dst, size_t originalSize, size_t FSE_buildDTable_rle (void* DTable, BYTE symbolValue) { - U32* const base32 = DTable; + U32* const base32 = (U32*)DTable; FSE_decode_t* const cell = (FSE_decode_t*)(base32 + 1); /* Sanity check */ @@ -1008,7 +1008,7 @@ size_t FSE_buildDTable_rle (void* DTable, BYTE symbolValue) size_t FSE_buildDTable_raw (void* DTable, unsigned nbBits) { - U32* const base32 = DTable; + U32* const base32 = (U32*)DTable; FSE_decode_t* dinfo = (FSE_decode_t*)(base32 + 1); const unsigned tableSize = 1 << nbBits; const unsigned tableMask = tableSize - 1; @@ -1127,7 +1127,7 @@ unsigned FSE_reloadDStream(FSE_DStream_t* bitD) void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable) { - const U32* const base32 = DTable; + const U32* const base32 = (const U32*)DTable; DStatePtr->state = FSE_readBits(bitD, base32[0]); FSE_reloadDStream(bitD); DStatePtr->table = base32 + 1; @@ -1496,7 +1496,7 @@ void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (void* DTable) size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION) (void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { - U32* const base32 = DTable; + U32* const base32 = (U32*)DTable; FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1); const U32 tableSize = 1 << tableLog; const U32 tableMask = tableSize-1;