added documentation about LZ4_FORCE_SW_BITCOUNT

Also : added memory-frugal software byte count for big endian 64-bit cpus.
Disabled by default.
This commit is contained in:
Yann Collet 2020-08-25 22:17:29 -07:00
parent ee0a3cfa0c
commit 5243173b23
2 changed files with 33 additions and 8 deletions

View File

@ -46,11 +46,11 @@ and `LZ4F_PUBLISH_STATIC_FUNCTIONS`.
#### Build macros #### Build macros
The following build macro can be selected at compilation time : The following build macro can be selected to adjust source code behavior at compilation time :
- `LZ4_FAST_DEC_LOOP` : this triggers the optimized decompression loop. - `LZ4_FAST_DEC_LOOP` : this triggers a speed optimized decompression loop, more powerful on modern cpus.
This loops works great on x86/x64 cpus, and is automatically enabled on this platform. This loop works great on x86, x64 and aarch64 cpus, and is automatically enabled for them.
It's possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor. It's also possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`, For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`,
and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`. and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`.
@ -66,9 +66,17 @@ The following build macro can be selected at compilation time :
Should this be a problem, it's generally possible to make the compiler ignore these warnings, Should this be a problem, it's generally possible to make the compiler ignore these warnings,
for example with `-Wno-deprecated-declarations` on `gcc`, for example with `-Wno-deprecated-declarations` on `gcc`,
or `_CRT_SECURE_NO_WARNINGS` for Visual Studio. or `_CRT_SECURE_NO_WARNINGS` for Visual Studio.
Another method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS` Another project-specific method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS`
before including the LZ4 header files. before including the LZ4 header files.
- `LZ4_FORCE_SW_BITCOUNT` : by default, the compression algorithm tries to determine lengths
by using bitcount instructions, generally implemented as fast single instructions in many cpus.
In case the target cpu doesn't support it, or the compiler intrinsic doesn't work or delivers bad performance,
it's possible to use an optimized software path instead.
This is achieved by setting this build macro.
In most cases, it's not expected to be necessary,
but it can be legitimately considered for less common platforms.
#### Amalgamation #### Amalgamation
@ -103,7 +111,7 @@ The compiled executable will require LZ4 DLL which is available at `dll\liblz4.d
#### Miscellaneous #### Miscellaneous
Other files present in the directory are not source code. There are : Other files present in the directory are not source code. They are :
- `LICENSE` : contains the BSD license text - `LICENSE` : contains the BSD license text
- `Makefile` : `make` script to compile and install lz4 library (static and dynamic) - `Makefile` : `make` script to compile and install lz4 library (static and dynamic)

View File

@ -88,6 +88,7 @@
* Define this parameter if your target system or compiler does not support hardware bit count * Define this parameter if your target system or compiler does not support hardware bit count
*/ */
#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ #if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */
# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */
# define LZ4_FORCE_SW_BITCOUNT # define LZ4_FORCE_SW_BITCOUNT
#endif #endif
@ -527,6 +528,9 @@ static unsigned LZ4_NbCommonBytes (reg_t val)
!defined(LZ4_FORCE_SW_BITCOUNT) !defined(LZ4_FORCE_SW_BITCOUNT)
return (unsigned)__builtin_clzll((U64)val) >> 3; return (unsigned)__builtin_clzll((U64)val) >> 3;
# else # else
#if 1
/* this method is probably faster,
* but adds a 128 bytes lookup table */
static const unsigned char ctz7_tab[128] = { static const unsigned char ctz7_tab[128] = {
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
@ -537,9 +541,22 @@ static unsigned LZ4_NbCommonBytes (reg_t val)
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
}; };
const U64 mask = 0x0101010101010101ULL; U64 const mask = 0x0101010101010101ULL;
U64 t = (((val >> 8) - mask) | val) & mask; U64 const t = (((val >> 8) - mask) | val) & mask;
return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; return ctz7_tab[(t * 0x0080402010080402ULL) >> 57];
#else
/* this method doesn't consume memory space like the previous one,
* but it contains several branches,
* that may end up slowing execution */
static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits.
Just to avoid some static analyzer complaining about shift by 32 on 32-bits target.
Note that this code path is never triggered in 32-bits mode. */
unsigned r;
if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; }
if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
r += (!val);
return r;
#endif
# endif # endif
} else /* 32 bits */ { } else /* 32 bits */ {
# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ # if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \