Improved compression speed on big endian CPU

This commit is contained in:
Yann Collet 2014-11-29 17:44:33 +01:00
parent 33dca250ee
commit 6658c49a97

124
lz4.c
View File

@ -55,15 +55,15 @@
* You will witness large performance improvements (+50% and up). * You will witness large performance improvements (+50% and up).
* Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c) * Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c)
* The goal is to automatically detect such situations by adding your target CPU within an exception list. * The goal is to automatically detect such situations by adding your target CPU within an exception list.
* 2 - Your target CPU correctly handle unaligned access, and was already correctly optimized by compiler * 2 - Your target CPU correctly handle unaligned access, and was already optimized by compiler
* No change will be experienced. * No change will be experienced.
* 3 - Your target CPU inefficiently handle unaligned access. * 3 - Your target CPU inefficiently handle unaligned access.
* You will experience a performance loss. Comment back the line. * You will experience a performance loss. Comment back the line.
* 4 - Your target CPU does not handle unaligned access. * 4 - Your target CPU does not handle unaligned access.
* Program will crash. * Program will crash.
* If it effectively results in better speed (case 1) * If uncommenting results in better performance (case 1)
* please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c) * please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c)
* so that an automatic detection macro can be added for future versions of the library. * An automatic detection macro will be added to match your case within future versions of the library.
*/ */
/* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */
@ -177,6 +177,7 @@ static unsigned LZ4_isLittleEndian(void)
return one.c[0]; return one.c[0];
} }
static U16 LZ4_readLE16(const void* memPtr) static U16 LZ4_readLE16(const void* memPtr)
{ {
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
@ -204,12 +205,10 @@ static void LZ4_writeLE16(void* memPtr, U16 value)
} }
static U32 LZ4_read16(const void* memPtr) static U16 LZ4_read16(const void* memPtr)
{ {
if (LZ4_UNALIGNED_ACCESS) if (LZ4_UNALIGNED_ACCESS)
{
return *(U16*)memPtr; return *(U16*)memPtr;
}
else else
{ {
U16 val16; U16 val16;
@ -221,9 +220,7 @@ static U32 LZ4_read16(const void* memPtr)
static U32 LZ4_read32(const void* memPtr) static U32 LZ4_read32(const void* memPtr)
{ {
if (LZ4_UNALIGNED_ACCESS) if (LZ4_UNALIGNED_ACCESS)
{
return *(U32*)memPtr; return *(U32*)memPtr;
}
else else
{ {
U32 val32; U32 val32;
@ -232,36 +229,24 @@ static U32 LZ4_read32(const void* memPtr)
} }
} }
static U64 LZ4_read64(const void* memPtr)
static U32 LZ4_readLE32(const void* memPtr)
{ {
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) if (LZ4_UNALIGNED_ACCESS)
return *(U32*)memPtr;
{
const BYTE* p = memPtr;
U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24));
return result;
}
}
static U64 LZ4_readLE64(const void* memPtr)
{
if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian()))
return *(U64*)memPtr; return *(U64*)memPtr;
else else
{ {
const BYTE* p = memPtr; U64 val64;
return (U64)((U64)p[0] + (p[1]<<8) + (p[2]<<16) + ((U64)p[3]<<24) + memcpy(&val64, memPtr, 8);
(((U64)p[4])<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); return val64;
} }
} }
static size_t LZ4_readLE_ARCH(const void* p) static size_t LZ4_read_ARCH(const void* p)
{ {
if (LZ4_64bits()) if (LZ4_64bits())
return (size_t)LZ4_readLE64(p); return (size_t)LZ4_read64(p);
else else
return (size_t)LZ4_readLE32(p); return (size_t)LZ4_read32(p);
} }
@ -365,31 +350,68 @@ int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); }
********************************/ ********************************/
static unsigned LZ4_NbCommonBytes (register size_t val) static unsigned LZ4_NbCommonBytes (register size_t val)
{ {
if (LZ4_64bits()) if (LZ4_isLittleEndian())
{ {
# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) if (LZ4_64bits())
unsigned long r = 0; {
_BitScanForward64( &r, (U64)val ); # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (int)(r>>3); unsigned long r = 0;
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) _BitScanForward64( &r, (U64)val );
return (__builtin_ctzll((U64)val) >> 3); return (int)(r>>3);
# else # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return (__builtin_ctzll((U64)val) >> 3);
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # else
# endif static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
# endif
}
else /* 32 bits */
{
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
unsigned long r;
_BitScanForward( &r, (U32)val );
return (int)(r>>3);
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_ctz((U32)val) >> 3);
# else
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
# endif
}
} }
/* 32 bits */ else /* Big Endian CPU */
{ {
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) if (LZ4_64bits())
unsigned long r; {
_BitScanForward( &r, (U32)val ); # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (int)(r>>3); unsigned long r = 0;
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) _BitScanReverse64( &r, val );
return (__builtin_ctz((U32)val) >> 3); return (unsigned)(r>>3);
# else # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return (__builtin_clzll(val) >> 3);
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # else
# endif unsigned r;
if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
r += (!val);
return r;
# endif
}
else /* 32 bits */
{
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
unsigned long r = 0;
_BitScanReverse( &r, val );
return (unsigned)(r>>3);
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_clz(val) >> 3);
# else
unsigned r;
if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
r += (!val);
return r;
# endif
}
} }
} }
@ -439,7 +461,7 @@ static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLi
while (likely(pIn<pInLimit-(STEPSIZE-1))) while (likely(pIn<pInLimit-(STEPSIZE-1)))
{ {
size_t diff = LZ4_readLE_ARCH(pMatch) ^ LZ4_readLE_ARCH(pIn); size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
pIn += LZ4_NbCommonBytes(diff); pIn += LZ4_NbCommonBytes(diff);
return (unsigned)(pIn - pStart); return (unsigned)(pIn - pStart);