812e8f2a16
* perf improvements for zstd decode tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge) Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important. The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization. See how GCC autovectorizes the loop here: https://godbolt.org/z/apr0x0 Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on) After: https://godbolt.org/z/OwO4F8 Note that autovectorization still does not do a good job on the optimized version, so it's turned off via attribute and flag. I found that neither attribute nor command-line flag was entirely successful in turning off vectorization, which is why both are used. 
silesia benchmark data - second triad of each file is with the original code: file orig compressedratio encode decode change 1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s 2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s 3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s 1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60% 2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01% 3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99% 1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s 2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s 3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s 1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30% 2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20% 3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50% 1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s 2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s 3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s 1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76% 2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54% 3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89% 1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s 2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s 3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s 1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47% 2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72% 3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29% 1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s 2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s 3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s 1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15% 2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98% 3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45% 1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s 2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s 3#osdb 
10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s 1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41% 2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42% 3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68% 1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s 2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s 3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s 1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26% 2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36% 3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38% 1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s 2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s 3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s 1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45% 2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93% 3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04% 1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s 2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s 3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s 1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70% 2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85% 3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31% 1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s 2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s 3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s 1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89% 2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43% 3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33% 1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s 2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s 3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s 1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24% 2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99% 3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05% 1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 
762.6MB/s 2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s 3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s 1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52% 2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00% 3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37% 7.51% * makefile changed to only pass -fno-tree-vectorize to gcc * Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__) * fix for warning/error with subtraction of void* pointers * fix c90 conformance issue - ISO C90 forbids mixed declarations and code * Fix assert for negative diff, only when there is no overlap * fix overflow revealed in fuzzing tests * tweak for small speed increase
148 lines
4.9 KiB
C
148 lines
4.9 KiB
C
/*
|
|
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
* in the COPYING file in the root directory of this source tree).
|
|
* You may select, at your option, one of the above-listed licenses.
|
|
*/
|
|
|
|
#ifndef ZSTD_COMPILER_H
|
|
#define ZSTD_COMPILER_H
|
|
|
|
/*-*******************************************************
|
|
* Compiler specifics
|
|
*********************************************************/
|
|
/* force inlining
 *
 * INLINE_KEYWORD    : spelling of the `inline` keyword usable with the current
 *                     compiler and language mode, or empty when there is none.
 * FORCE_INLINE_ATTR : compiler-specific "always inline" decoration, or empty.
 *
 * Defining ZSTD_NO_INLINE makes both macros expand to nothing.
 */
#if !defined(ZSTD_NO_INLINE)

#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)   /* C++ or C99+ */
#  define INLINE_KEYWORD inline
#elif defined(__GNUC__)
/* Pre-C99 mode on a GNU-compatible compiler: plain `inline` is not a keyword
 * under -ansi / -std=c90, whereas the alternate keyword `__inline__` is
 * accepted in every mode. */
#  define INLINE_KEYWORD __inline__
#else
#  define INLINE_KEYWORD
#endif

#if defined(__GNUC__)
#  define FORCE_INLINE_ATTR __attribute__((always_inline))
#elif defined(_MSC_VER)
#  define FORCE_INLINE_ATTR __forceinline
#else
#  define FORCE_INLINE_ATTR
#endif

#else   /* ZSTD_NO_INLINE */

#define INLINE_KEYWORD
#define FORCE_INLINE_ATTR

#endif
|
|
|
|
/**
 * FORCE_INLINE_TEMPLATE is used to define C "templates": functions that take
 * constant parameters. Such functions must be inlined so that the compiler
 * can eliminate the branches on those constants.
 */
#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
|
|
/**
 * HINT_INLINE is used to help the compiler generate better code. It is *not*
 * used for "templates", so it can be tweaked based on the compiler's
 * performance.
 *
 * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
 * always_inline attribute.
 *
 * clang up to 5.0.0 (trunk) benefits tremendously from the always_inline
 * attribute.
 */
#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)
/* gcc 4.8 / 4.9 only : plain inline hint */
#  define HINT_INLINE static INLINE_KEYWORD
#else
#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
#endif
|
|
|
|
/* force no inlining
 * FORCE_NOINLINE always includes `static`; the no-inline decoration is added
 * for MSVC and GNU-compatible compilers. */
#if defined(_MSC_VER)
#  define FORCE_NOINLINE static __declspec(noinline)
#elif defined(__GNUC__)
#  define FORCE_NOINLINE static __attribute__((__noinline__))
#else
#  define FORCE_NOINLINE static
#endif
|
|
|
|
/* target attribute */
#ifndef __has_attribute
  #define __has_attribute(x) 0  /* Fallback for compilers that do not provide __has_attribute. */
#endif
/* TARGET_ATTRIBUTE(target) : decorates a function with the GNU `__target__`
 * attribute (`target` is passed through unchanged); expands to nothing on
 * compilers that do not define __GNUC__. */
#if defined(__GNUC__)
#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
#else
#  define TARGET_ATTRIBUTE(target)
#endif
|
|
|
|
/* Enable runtime BMI2 dispatch based on the CPU.
 * Enabled for clang & gcc >=4.8 on x86-64 when BMI2 isn't enabled by default.
 * A DYNAMIC_BMI2 value defined before this point is left untouched.
 */
#ifndef DYNAMIC_BMI2
  #if ((defined(__clang__) && __has_attribute(__target__)) \
      || (defined(__GNUC__) \
          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
      && (defined(__x86_64__) || defined(_M_X64)) \
      && !defined(__BMI2__)
  #  define DYNAMIC_BMI2 1
  #else
  #  define DYNAMIC_BMI2 0
  #endif
#endif
|
|
|
|
/* prefetch
 * PREFETCH_L1(ptr) / PREFETCH_L2(ptr) issue a read prefetch hint for `ptr`
 * when the compiler offers one; otherwise they only evaluate `ptr`.
 * can be disabled, by declaring NO_PREFETCH build macro */
#if defined(NO_PREFETCH)
#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
#  define PREFETCH_L2(ptr)  (void)(ptr)  /* disabled */
#else
#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() is not defined outside of x86/x64 */
#    include <xmmintrin.h>   /* _mm_prefetch(), _MM_HINT_T0, _MM_HINT_T1 ; https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
#  else
#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
#    define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
#  endif
#endif  /* NO_PREFETCH */
|
|
|
|
#define CACHELINE_SIZE 64

/* PREFETCH_AREA(p, s) :
 * calls PREFETCH_L2() on p, p+CACHELINE_SIZE, p+2*CACHELINE_SIZE, ... for
 * every byte offset strictly below s (nothing is issued when s == 0).
 * Wrapped in do { } while (0) so that `PREFETCH_AREA(p, s);` behaves as one
 * statement, including inside an unbraced if / else. */
#define PREFETCH_AREA(p, s)  do {                        \
    const char* const _ptr = (const char*)(p);           \
    size_t const _size = (size_t)(s);                    \
    size_t _pos;                                         \
    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {     \
        PREFETCH_L2(_ptr + _pos);                        \
    }                                                    \
} while (0)
|
|
|
|
/* vectorization
 * DONT_VECTORIZE asks gcc not to auto-vectorize the decorated function.
 * The attribute is only emitted for genuine gcc >= 4.4 (the release that
 * introduced `optimize`); clang and the Intel compiler also define __GNUC__
 * and are excluded explicitly. Everywhere else the macro is empty. */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) \
    && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
#  define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
#else
#  define DONT_VECTORIZE
#endif
|
|
|
|
/* disable warnings (MSVC only; this section is empty for other compilers) */
#ifdef _MSC_VER    /* Visual Studio */
#  include <intrin.h>                    /* For Visual 2005 */
#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
#endif
|
|
|
|
#endif /* ZSTD_COMPILER_H */
|