From 419f55f9dfc2df8792902b8953d50690121afeea Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Fri, 20 Oct 2023 23:35:10 +0300 Subject: [PATCH] liblzma: Avoid extern lzma_crc32_clmul() and lzma_crc64_clmul(). A CLMUL-only build will have the crcxx_clmul() inlined into lzma_crcxx(). Previously a jump to the extern lzma_crcxx_clmul() was needed. Notes about shared liblzma on ELF platforms: - On platforms that support ifunc and -fvisibility=hidden, this was silly because CLMUL-only build would have that single extra jump instruction of extra overhead. - On platforms that support neither -fvisibility=hidden nor linker version script (liblzma*.map), jumping to lzma_crcxx_clmul() would go via PLT so a few more instructions of overhead (still not a big issue but silly nevertheless). There was a downside with static liblzma too: if an application only needs lzma_crc64(), static linking would make the linker include the CLMUL code for both CRC32 and CRC64 from crc_x86_clmul.o even though the CRC32 code wouldn't be needed, thus increasing code size of the executable (assuming that -ffunction-sections isn't used). Also, now compilers are likely to inline crc_simd_body() even if they don't support the always_inline attribute (or MSVC's __forceinline). Quite possibly all compilers that build the code do support such an attribute. But now it likely isn't a problem even if the attribute wasn't supported. Now all x86-specific stuff is in crc_x86_clmul.h. If other archs The other archs can then have their own headers with their own is_clmul_supported() and crcxx_clmul(). Another bonus is that the build system doesn't need to care if crc_clmul.c is needed. is_clmul_supported() stays as inline function as it's not needed when doing a CLMUL-only build (avoids a warning about unused function). --- CMakeLists.txt | 7 +- configure.ac | 1 - src/liblzma/check/Makefile.inc | 6 +- src/liblzma/check/crc32_fast.c | 9 +- src/liblzma/check/crc64_fast.c | 9 +- src/liblzma/check/crc_common.h | 64 -------------- .../check/{crc_clmul.c => crc_x86_clmul.h} | 86 ++++++++++++++++--- 7 files changed, 91 insertions(+), 91 deletions(-) rename src/liblzma/check/{crc_clmul.c => crc_x86_clmul.h} (80%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85844d6..478b879 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,7 @@ add_library(liblzma src/liblzma/check/check.c src/liblzma/check/check.h src/liblzma/check/crc_common.h + src/liblzma/check/crc_x86_clmul.h src/liblzma/common/block_util.c src/liblzma/common/common.c src/liblzma/common/common.h @@ -1000,11 +1001,7 @@ calculation if supported by the system" ON) int main(void) { return 0; } " HAVE_USABLE_CLMUL) - - if(HAVE_USABLE_CLMUL) - target_sources(liblzma PRIVATE src/liblzma/check/crc_clmul.c) - target_compile_definitions(liblzma PRIVATE HAVE_USABLE_CLMUL) - endif() + tuklib_add_definition_if(liblzma HAVE_USABLE_CLMUL) endif() endif() diff --git a/configure.ac b/configure.ac index a4ef57a..9584c4a 100644 --- a/configure.ac +++ b/configure.ac @@ -1086,7 +1086,6 @@ __m128i my_clmul(__m128i a) ]) AC_MSG_RESULT([$enable_clmul_crc]) ]) -AM_CONDITIONAL([COND_CRC_CLMUL], [test "x$enable_clmul_crc" = xyes]) # Check for sandbox support. If one is found, set enable_sandbox=found. # diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc index 6186e10..acff40c 100644 --- a/src/liblzma/check/Makefile.inc +++ b/src/liblzma/check/Makefile.inc @@ -14,7 +14,8 @@ EXTRA_DIST += \ liblzma_la_SOURCES += \ check/check.c \ check/check.h \ - check/crc_common.h + check/crc_common.h \ + check/crc_x86_clmul.h if COND_SMALL liblzma_la_SOURCES += check/crc32_small.c @@ -27,9 +28,6 @@ if COND_ASM_X86 liblzma_la_SOURCES += check/crc32_x86.S else liblzma_la_SOURCES += check/crc32_fast.c -if COND_CRC_CLMUL -liblzma_la_SOURCES += check/crc_clmul.c -endif endif endif diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index 9fce94d..6982836 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -15,6 +15,11 @@ #include "check.h" #include "crc_common.h" +#ifdef CRC_CLMUL +# define BUILDING_CRC32_CLMUL +# include "crc_x86_clmul.h" +#endif + #ifdef CRC_GENERIC @@ -132,7 +137,7 @@ typedef uint32_t (*crc32_func_type)( static crc32_func_type crc32_resolve(void) { - return is_clmul_supported() ? &lzma_crc32_clmul : &crc32_generic; + return is_clmul_supported() ? &crc32_clmul : &crc32_generic; } #if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__) @@ -221,7 +226,7 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) return crc32_func(buf, size, crc); #elif defined(CRC_CLMUL) - return lzma_crc32_clmul(buf, size, crc); + return crc32_clmul(buf, size, crc); #else return crc32_generic(buf, size, crc); diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index ce74901..46b5c64 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -14,6 +14,11 @@ #include "check.h" #include "crc_common.h" +#ifdef CRC_CLMUL +# define BUILDING_CRC64_CLMUL +# include "crc_x86_clmul.h" +#endif + #ifdef CRC_GENERIC @@ -97,7 +102,7 @@ typedef uint64_t (*crc64_func_type)( static crc64_func_type crc64_resolve(void) { - return is_clmul_supported() ? &lzma_crc64_clmul : &crc64_generic; + return is_clmul_supported() ? &crc64_clmul : &crc64_generic; } #if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__) @@ -160,7 +165,7 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) // // FIXME: Lookup table isn't currently omitted on 32-bit x86, // see crc64_table.c. - return lzma_crc64_clmul(buf, size, crc); + return crc64_clmul(buf, size, crc); #else return crc64_generic(buf, size, crc); diff --git a/src/liblzma/check/crc_common.h b/src/liblzma/check/crc_common.h index c949f79..552219f 100644 --- a/src/liblzma/check/crc_common.h +++ b/src/liblzma/check/crc_common.h @@ -108,70 +108,6 @@ # define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 # endif */ - -# if defined(_MSC_VER) -# include -# elif defined(HAVE_CPUID_H) -# include -# endif - -// is_clmul_supported() must be inlined in this header file because the -// ifunc resolver function may not support calling a function in another -// translation unit. Depending on compiler-toolchain and flags, a call to -// a function defined in another translation unit could result in a -// reference to the PLT, which is unsafe to do in an ifunc resolver. The -// ifunc resolver runs very early when loading a shared library, so the PLT -// entries may not be setup at that time. Inlining this function duplicates -// the function body in crc32_resolve() and crc64_resolve(), but this is -// acceptable because the function results in very few instructions. -static inline bool -is_clmul_supported(void) -{ - int success = 1; - uint32_t r[4]; // eax, ebx, ecx, edx - -#if defined(_MSC_VER) - // This needs with MSVC. ICC has it as a built-in - // on all platforms. - __cpuid(r, 1); -#elif defined(HAVE_CPUID_H) - // Compared to just using __asm__ to run CPUID, this also checks - // that CPUID is supported and saves and restores ebx as that is - // needed with GCC < 5 with position-independent code (PIC). - success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); -#else - // Just a fallback that shouldn't be needed. - __asm__("cpuid\n\t" - : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) - : "a"(1), "c"(0)); #endif - // Returns true if these are supported: - // CLMUL (bit 1 in ecx) - // SSSE3 (bit 9 in ecx) - // SSE4.1 (bit 19 in ecx) - const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); - return success && (r[2] & ecx_mask) == ecx_mask; - - // Alternative methods that weren't used: - // - ICC's _may_i_use_cpu_feature: the other methods should work too. - // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") - // - // CPUID decding is needed with MSVC anyway and older GCC. This keeps - // the feature checks in the build system simpler too. The nice thing - // about __builtin_cpu_supports would be that it generates very short - // code as is it only reads a variable set at startup but a few bytes - // doesn't matter here. -} - -#endif - -/// CRC32 implemented with the x86 CLMUL instruction. -extern uint32_t lzma_crc32_clmul(const uint8_t *buf, size_t size, - uint32_t crc); - -/// CRC64 implemented with the x86 CLMUL instruction. -extern uint64_t lzma_crc64_clmul(const uint8_t *buf, size_t size, - uint64_t crc); - #endif diff --git a/src/liblzma/check/crc_clmul.c b/src/liblzma/check/crc_x86_clmul.h similarity index 80% rename from src/liblzma/check/crc_clmul.c rename to src/liblzma/check/crc_x86_clmul.h index 381948a..7a47204 100644 --- a/src/liblzma/check/crc_clmul.c +++ b/src/liblzma/check/crc_x86_clmul.h @@ -1,11 +1,10 @@ /////////////////////////////////////////////////////////////////////////////// // -/// \file crc_clmul.c +/// \file crc_x86_clmul.h /// \brief CRC32 and CRC64 implementations using CLMUL instructions. /// -/// lzma_crc32_clmul() and lzma_crc64_clmul() use 32/64-bit x86 -/// SSSE3, SSE4.1, and CLMUL instructions. This is compatible with -/// Elbrus 2000 (E2K) too. +/// crc32_clmul() and crc64_clmul() use 32/64-bit x86 SSSE3, SSE4.1, and +/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too. /// /// They were derived from /// https://www.researchgate.net/publication/263424619_Fast_CRC_computation @@ -27,9 +26,20 @@ // /////////////////////////////////////////////////////////////////////////////// -#include "crc_common.h" +// This file must not be included more than once. +#ifdef LZMA_CRC_X86_CLMUL_H +# error crc_x86_clmul.h was included twice. +#endif +#define LZMA_CRC_X86_CLMUL_H + #include +#if defined(_MSC_VER) +# include +#elif defined(HAVE_CPUID_H) +# include +#endif + // EDG-based compilers (Intel's classic compiler and compiler for E2K) can // define __GNUC__ but the attribute must not be used with them. @@ -225,12 +235,12 @@ calc_hi(uint64_t p, uint64_t a, int n) } */ -#ifdef HAVE_CHECK_CRC32 +#ifdef BUILDING_CRC32_CLMUL crc_attr_target crc_attr_no_sanitize_address -extern uint32_t -lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) +static uint32_t +crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) { #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS // The code assumes that there is at least one byte of input. @@ -265,7 +275,7 @@ lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) v0 = _mm_xor_si128(v0, v2); // [2] return ~(uint32_t)_mm_extract_epi32(v0, 2); } -#endif // HAVE_CHECK_CRC32 +#endif // BUILDING_CRC32_CLMUL ///////////////////// @@ -299,7 +309,7 @@ calc_hi(uint64_t poly, uint64_t a) } */ -#ifdef HAVE_CHECK_CRC64 +#ifdef BUILDING_CRC64_CLMUL // MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC // code when optimizations are enabled (release build). According to the bug @@ -318,8 +328,8 @@ calc_hi(uint64_t poly, uint64_t a) crc_attr_target crc_attr_no_sanitize_address -extern uint64_t -lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) +static uint64_t +crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) { #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS // The code assumes that there is at least one byte of input. @@ -366,4 +376,54 @@ lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) # pragma optimize("", on) #endif -#endif // HAVE_CHECK_CRC64 +#endif // BUILDING_CRC64_CLMUL + + +// is_clmul_supported() must be inlined in this header file because the +// ifunc resolver function may not support calling a function in another +// translation unit. Depending on compiler-toolchain and flags, a call to +// a function defined in another translation unit could result in a +// reference to the PLT, which is unsafe to do in an ifunc resolver. The +// ifunc resolver runs very early when loading a shared library, so the PLT +// entries may not be setup at that time. Inlining this function duplicates +// the function body in crc32_resolve() and crc64_resolve(), but this is +// acceptable because the function results in very few instructions. +static inline bool +is_clmul_supported(void) +{ + int success = 1; + uint32_t r[4]; // eax, ebx, ecx, edx + +#if defined(_MSC_VER) + // This needs with MSVC. ICC has it as a built-in + // on all platforms. + __cpuid(r, 1); +#elif defined(HAVE_CPUID_H) + // Compared to just using __asm__ to run CPUID, this also checks + // that CPUID is supported and saves and restores ebx as that is + // needed with GCC < 5 with position-independent code (PIC). + success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); +#else + // Just a fallback that shouldn't be needed. + __asm__("cpuid\n\t" + : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) + : "a"(1), "c"(0)); +#endif + + // Returns true if these are supported: + // CLMUL (bit 1 in ecx) + // SSSE3 (bit 9 in ecx) + // SSE4.1 (bit 19 in ecx) + const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); + return success && (r[2] & ecx_mask) == ecx_mask; + + // Alternative methods that weren't used: + // - ICC's _may_i_use_cpu_feature: the other methods should work too. + // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") + // + // CPUID decding is needed with MSVC anyway and older GCC. This keeps + // the feature checks in the build system simpler too. The nice thing + // about __builtin_cpu_supports would be that it generates very short + // code as is it only reads a variable set at startup but a few bytes + // doesn't matter here. +}