Improved compression speed on big endian CPU
This commit is contained in:
parent
33dca250ee
commit
6658c49a97
124
lz4.c
124
lz4.c
@ -55,15 +55,15 @@
|
|||||||
* You will witness large performance improvements (+50% and up).
|
* You will witness large performance improvements (+50% and up).
|
||||||
* Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c)
|
* Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c)
|
||||||
* The goal is to automatically detect such situations by adding your target CPU within an exception list.
|
* The goal is to automatically detect such situations by adding your target CPU within an exception list.
|
||||||
* 2 - Your target CPU correctly handle unaligned access, and was already correctly optimized by compiler
|
* 2 - Your target CPU correctly handle unaligned access, and was already optimized by compiler
|
||||||
* No change will be experienced.
|
* No change will be experienced.
|
||||||
* 3 - Your target CPU inefficiently handle unaligned access.
|
* 3 - Your target CPU inefficiently handle unaligned access.
|
||||||
* You will experience a performance loss. Comment back the line.
|
* You will experience a performance loss. Comment back the line.
|
||||||
* 4 - Your target CPU does not handle unaligned access.
|
* 4 - Your target CPU does not handle unaligned access.
|
||||||
* Program will crash.
|
* Program will crash.
|
||||||
* If it effectively results in better speed (case 1)
|
* If uncommenting results in better performance (case 1)
|
||||||
* please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c)
|
* please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c)
|
||||||
* so that an automatic detection macro can be added for future versions of the library.
|
* An automatic detection macro will be added to match your case within future versions of the library.
|
||||||
*/
|
*/
|
||||||
/* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */
|
/* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */
|
||||||
|
|
||||||
@ -177,6 +177,7 @@ static unsigned LZ4_isLittleEndian(void)
|
|||||||
return one.c[0];
|
return one.c[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static U16 LZ4_readLE16(const void* memPtr)
|
static U16 LZ4_readLE16(const void* memPtr)
|
||||||
{
|
{
|
||||||
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
|
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
|
||||||
@ -204,12 +205,10 @@ static void LZ4_writeLE16(void* memPtr, U16 value)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static U32 LZ4_read16(const void* memPtr)
|
static U16 LZ4_read16(const void* memPtr)
|
||||||
{
|
{
|
||||||
if (LZ4_UNALIGNED_ACCESS)
|
if (LZ4_UNALIGNED_ACCESS)
|
||||||
{
|
|
||||||
return *(U16*)memPtr;
|
return *(U16*)memPtr;
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
U16 val16;
|
U16 val16;
|
||||||
@ -221,9 +220,7 @@ static U32 LZ4_read16(const void* memPtr)
|
|||||||
static U32 LZ4_read32(const void* memPtr)
|
static U32 LZ4_read32(const void* memPtr)
|
||||||
{
|
{
|
||||||
if (LZ4_UNALIGNED_ACCESS)
|
if (LZ4_UNALIGNED_ACCESS)
|
||||||
{
|
|
||||||
return *(U32*)memPtr;
|
return *(U32*)memPtr;
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
U32 val32;
|
U32 val32;
|
||||||
@ -232,36 +229,24 @@ static U32 LZ4_read32(const void* memPtr)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static U64 LZ4_read64(const void* memPtr)
|
||||||
static U32 LZ4_readLE32(const void* memPtr)
|
|
||||||
{
|
{
|
||||||
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
|
if (LZ4_UNALIGNED_ACCESS)
|
||||||
return *(U32*)memPtr;
|
|
||||||
{
|
|
||||||
const BYTE* p = memPtr;
|
|
||||||
U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24));
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static U64 LZ4_readLE64(const void* memPtr)
|
|
||||||
{
|
|
||||||
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
|
|
||||||
return *(U64*)memPtr;
|
return *(U64*)memPtr;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
const BYTE* p = memPtr;
|
U64 val64;
|
||||||
return (U64)((U64)p[0] + (p[1]<<8) + (p[2]<<16) + ((U64)p[3]<<24) +
|
memcpy(&val64, memPtr, 8);
|
||||||
(((U64)p[4])<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
|
return val64;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t LZ4_readLE_ARCH(const void* p)
|
static size_t LZ4_read_ARCH(const void* p)
|
||||||
{
|
{
|
||||||
if (LZ4_64bits())
|
if (LZ4_64bits())
|
||||||
return (size_t)LZ4_readLE64(p);
|
return (size_t)LZ4_read64(p);
|
||||||
else
|
else
|
||||||
return (size_t)LZ4_readLE32(p);
|
return (size_t)LZ4_read32(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -365,31 +350,68 @@ int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); }
|
|||||||
********************************/
|
********************************/
|
||||||
static unsigned LZ4_NbCommonBytes (register size_t val)
|
static unsigned LZ4_NbCommonBytes (register size_t val)
|
||||||
{
|
{
|
||||||
if (LZ4_64bits())
|
if (LZ4_isLittleEndian())
|
||||||
{
|
{
|
||||||
# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
if (LZ4_64bits())
|
||||||
unsigned long r = 0;
|
{
|
||||||
_BitScanForward64( &r, (U64)val );
|
# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
return (int)(r>>3);
|
unsigned long r = 0;
|
||||||
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
_BitScanForward64( &r, (U64)val );
|
||||||
return (__builtin_ctzll((U64)val) >> 3);
|
return (int)(r>>3);
|
||||||
# else
|
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
|
return (__builtin_ctzll((U64)val) >> 3);
|
||||||
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
|
# else
|
||||||
# endif
|
static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
|
||||||
|
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
|
||||||
|
# endif
|
||||||
|
}
|
||||||
|
else /* 32 bits */
|
||||||
|
{
|
||||||
|
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
|
unsigned long r;
|
||||||
|
_BitScanForward( &r, (U32)val );
|
||||||
|
return (int)(r>>3);
|
||||||
|
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
|
return (__builtin_ctz((U32)val) >> 3);
|
||||||
|
# else
|
||||||
|
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
|
||||||
|
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
|
||||||
|
# endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/* 32 bits */
|
else /* Big Endian CPU */
|
||||||
{
|
{
|
||||||
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
if (LZ4_64bits())
|
||||||
unsigned long r;
|
{
|
||||||
_BitScanForward( &r, (U32)val );
|
# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
return (int)(r>>3);
|
unsigned long r = 0;
|
||||||
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
_BitScanReverse64( &r, val );
|
||||||
return (__builtin_ctz((U32)val) >> 3);
|
return (unsigned)(r>>3);
|
||||||
# else
|
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
|
return (__builtin_clzll(val) >> 3);
|
||||||
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
|
# else
|
||||||
# endif
|
unsigned r;
|
||||||
|
if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
|
||||||
|
if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
|
||||||
|
r += (!val);
|
||||||
|
return r;
|
||||||
|
# endif
|
||||||
|
}
|
||||||
|
else /* 32 bits */
|
||||||
|
{
|
||||||
|
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
|
unsigned long r = 0;
|
||||||
|
_BitScanReverse( &r, val );
|
||||||
|
return (unsigned)(r>>3);
|
||||||
|
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
|
||||||
|
return (__builtin_clz(val) >> 3);
|
||||||
|
# else
|
||||||
|
unsigned r;
|
||||||
|
if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
|
||||||
|
r += (!val);
|
||||||
|
return r;
|
||||||
|
# endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -439,7 +461,7 @@ static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLi
|
|||||||
|
|
||||||
while (likely(pIn<pInLimit-(STEPSIZE-1)))
|
while (likely(pIn<pInLimit-(STEPSIZE-1)))
|
||||||
{
|
{
|
||||||
size_t diff = LZ4_readLE_ARCH(pMatch) ^ LZ4_readLE_ARCH(pIn);
|
size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
|
||||||
if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
|
if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
|
||||||
pIn += LZ4_NbCommonBytes(diff);
|
pIn += LZ4_NbCommonBytes(diff);
|
||||||
return (unsigned)(pIn - pStart);
|
return (unsigned)(pIn - pStart);
|
||||||
|
Loading…
Reference in New Issue
Block a user