Add support for single-file multi-target intrinsics in Qt
GCC 4.9 now allows us to #include any and all intrinsics headers, not just the one for which we're compiling code, a behavior that ICC and MSVC have had for some time. With that, we're able to have the functions for different targets in the same source file. See the GCC manual: http://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html This functionality is notified by the QT_COMPILER_SUPPORTS_HERE(XXX) macro, which indicates that all the intrinsics from QT_COMPILER_SUPPORTS_xxx are available and enabled. To complement, a QT_COMPILER_SUPPORTS(XXX) macro is also added. Unlike ICC and MSVC, GCC requires a special function attribute, which will also cause code optimization. That's the QT_FUNCTION_TARGET macro. Note: because of the absence of the target attribute, ICC and MSVC will not generate instructions with the VEX prefix unless they only exist with the VEX prefix or if -mavx / -arch:AVX are enabled. Change-Id: I0c1880c20324bd8e0fc68a863e36d1fa7755dff0 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
This commit is contained in:
parent
cb09e1e889
commit
689e8055f5
2
configure
vendored
2
configure
vendored
@ -6035,7 +6035,7 @@ for SUBARCH in SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX AVX2 \
|
||||
eval "VAL=\$CFG_$SUBARCH"
|
||||
case "$VAL" in
|
||||
yes)
|
||||
echo "#define QT_COMPILER_SUPPORTS_$SUBARCH" \
|
||||
echo "#define QT_COMPILER_SUPPORTS_$SUBARCH 1" \
|
||||
>>"$outpath/src/corelib/global/qconfig.h.new"
|
||||
;;
|
||||
esac
|
||||
|
@ -94,13 +94,14 @@ QT_BEGIN_NAMESPACE
|
||||
(for instance, gcc 4.4 does that even at -O0).
|
||||
*/
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_2)
|
||||
static inline bool hasFastCrc32()
|
||||
{
|
||||
return true;
|
||||
return qCpuHasFeature(SSE4_2);
|
||||
}
|
||||
|
||||
template <typename Char>
|
||||
QT_FUNCTION_TARGET(SSE4_2)
|
||||
static uint crc32(const Char *ptr, size_t len, uint h)
|
||||
{
|
||||
// The CRC32 instructions from Nehalem calculate a 32-bit CRC32 checksum
|
||||
|
@ -50,6 +50,7 @@
|
||||
* They mean the compiler supports the necessary flags and the headers
|
||||
* for the x86 and ARM intrinsics:
|
||||
* - GCC: the -mXXX or march=YYY flag is necessary before #include
|
||||
* up to 4.8; GCC >= 4.9 can include unconditionally
|
||||
* - Intel CC: #include can happen unconditionally
|
||||
* - MSVC: #include can happen unconditionally
|
||||
* - RVCT: ???
|
||||
@ -60,25 +61,99 @@
|
||||
* up do define __AVX__ if the -arch:AVX option is passed on the command-line.
|
||||
*
|
||||
* Supported XXX are:
|
||||
* Flag | Arch | GCC | Intel CC | MSVC |
|
||||
* NEON | ARM | I & C | None | ? |
|
||||
* IWMMXT | ARM | I & C | None | I & C |
|
||||
* SSE2 | x86 | I & C | I & C | I & C |
|
||||
* SSE3 | x86 | I & C | I & C | I only |
|
||||
* SSSE3 | x86 | I & C | I & C | I only |
|
||||
* SSE4_1 | x86 | I & C | I & C | I only |
|
||||
* SSE4_2 | x86 | I & C | I & C | I only |
|
||||
* AVX | x86 | I & C | I & C | I & C |
|
||||
* AVX2 | x86 | I & C | I & C | I only |
|
||||
* Flag | Arch | GCC | Intel CC | MSVC |
|
||||
* ARM_NEON | ARM | I & C | None | ? |
|
||||
* IWMMXT | ARM | I & C | None | I & C |
|
||||
* SSE2 | x86 | I & C | I & C | I & C |
|
||||
* SSE3 | x86 | I & C | I & C | I only |
|
||||
* SSSE3 | x86 | I & C | I & C | I only |
|
||||
* SSE4_1 | x86 | I & C | I & C | I only |
|
||||
* SSE4_2 | x86 | I & C | I & C | I only |
|
||||
* AVX | x86 | I & C | I & C | I & C |
|
||||
* AVX2 | x86 | I & C | I & C | I only |
|
||||
* I = intrinsics; C = code generation
|
||||
*
|
||||
* Code can use the following constructs to determine compiler support & status:
|
||||
* - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
|
||||
* If this test passes, then the compiler is already generating code for that
|
||||
* given sub-architecture. The intrinsics for that sub-architecture are
|
||||
* #included and can be used without restriction or runtime check.
|
||||
*
|
||||
* - #if QT_COMPILER_SUPPORTS(XXX)
|
||||
* If this test passes, then the compiler is able to generate code for that
|
||||
* given sub-architecture in another translation unit, given the right set of
|
||||
* flags. Use of the intrinsics is not guaranteed. This is useful with
|
||||
* runtime detection (see below).
|
||||
*
|
||||
* - #if QT_COMPILER_SUPPORTS_HERE(XXX)
|
||||
* If this test passes, then the compiler is able to generate code for that
|
||||
* given sub-architecture in this translation unit, even if it is not doing
|
||||
* that now (it might be). Individual functions may be tagged with
|
||||
* QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
|
||||
* sub-arch. Only inside such functions is the use of the intrisics
|
||||
* guaranteed to work. This is useful with runtime detection (see below).
|
||||
*
|
||||
* Runtime detection of a CPU sub-architecture can be done with the
|
||||
* qCpuHasFeature(XXX) function. There are two strategies for generating
|
||||
* optimized code like that:
|
||||
*
|
||||
* 1) place the optimized code in a different translation unit (C or assembly
|
||||
* sources) and pass the correct flags to the compiler to enable support. Those
|
||||
* sources must not include qglobal.h, which means they cannot include this
|
||||
* file either. The dispatcher function would look like this:
|
||||
*
|
||||
* void foo()
|
||||
* {
|
||||
* #if QT_COMPILER_SUPPORTS(XXX)
|
||||
* if (qCpuHasFeature(XXX)) {
|
||||
* foo_optimized_xxx();
|
||||
* return;
|
||||
* }
|
||||
* #endif
|
||||
* foo_plain();
|
||||
* }
|
||||
*
|
||||
* 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
|
||||
* surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
|
||||
* other Qt code. The dispatcher function would look like this:
|
||||
*
|
||||
* void foo()
|
||||
* {
|
||||
* #if QT_COMPILER_SUPPORTS_HERE(XXX)
|
||||
* if (qCpuHasFeature(XXX)) {
|
||||
* foo_optimized_xxx();
|
||||
* return;
|
||||
* }
|
||||
* #endif
|
||||
* foo_plain();
|
||||
* }
|
||||
*/
|
||||
|
||||
#if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE))
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
|
||||
|
||||
#if (defined(Q_CC_INTEL) || defined(Q_CC_MSVC) \
|
||||
|| (defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && (__GNUC__-0) * 100 + (__GNUC_MINOR__-0) >= 409)) \
|
||||
&& !defined(QT_BOOTSTRAPPED)
|
||||
# define QT_COMPILER_SUPPORTS_SIMD_ALWAYS
|
||||
# define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x)
|
||||
# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
|
||||
/* GCC requires attributes for a function */
|
||||
# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
|
||||
# else
|
||||
# define QT_FUNCTION_TARGET(x)
|
||||
# endif
|
||||
#else
|
||||
# define QT_COMPILER_SUPPORTS_HERE(x) defined(__ ## x ## __)
|
||||
# define QT_FUNCTION_TARGET(x)
|
||||
#endif
|
||||
|
||||
// SSE intrinsics
|
||||
#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2"
|
||||
#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
#if defined(QT_LINUXBASE) || defined(Q_OS_ANDROID_NO_SDK)
|
||||
/// this is an evil hack - the posix_memalign declaration in LSB
|
||||
/// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
|
||||
@ -95,27 +170,33 @@
|
||||
#endif
|
||||
|
||||
// SSE3 intrinsics
|
||||
#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3"
|
||||
#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
|
||||
// SSSE3 intrinsics
|
||||
#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3"
|
||||
#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
// SSE4.1 intrinsics
|
||||
#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1"
|
||||
#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
// SSE4.2 intrinsics
|
||||
#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2"
|
||||
#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
// AVX intrinsics
|
||||
#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_MSVC))
|
||||
#define QT_FUNCTION_TARGET_STRING_AVX "avx"
|
||||
#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2"
|
||||
#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
|
||||
// immintrin.h is the ultimate header, we don't need anything else after this
|
||||
#include <immintrin.h>
|
||||
|
||||
@ -147,8 +228,10 @@
|
||||
#endif
|
||||
|
||||
// NEON intrinsics
|
||||
// note: as of GCC 4.9, does not support function targets for ARM
|
||||
#if defined __ARM_NEON__
|
||||
#include <arm_neon.h>
|
||||
#define QT_FUNCTION_TARGET_STRING_ARM_NEON "neon"
|
||||
#endif
|
||||
|
||||
|
||||
@ -169,12 +252,14 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#undef QT_COMPILER_SUPPORTS_SIMD_ALWAYS
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
|
||||
enum CPUFeatures {
|
||||
IWMMXT = 0x1,
|
||||
NEON = 0x2,
|
||||
NEON = 0x2, ARM_NEON = NEON,
|
||||
SSE2 = 0x4,
|
||||
SSE3 = 0x8,
|
||||
SSSE3 = 0x10,
|
||||
|
@ -3476,21 +3476,21 @@ void Configure::generateConfigfiles()
|
||||
|
||||
tmpStream << endl << "// Compiler sub-arch support" << endl;
|
||||
if (dictionary[ "SSE2" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2 1" << endl;
|
||||
if (dictionary[ "SSE3" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3 1" << endl;
|
||||
if (dictionary[ "SSSE3" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3 1" << endl;
|
||||
if (dictionary[ "SSE4_1" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1 1" << endl;
|
||||
if (dictionary[ "SSE4_2" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2 1" << endl;
|
||||
if (dictionary[ "AVX" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX 1" << endl;
|
||||
if (dictionary[ "AVX2" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2 1" << endl;
|
||||
if (dictionary[ "IWMMXT" ] == "yes")
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT" << endl;
|
||||
tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT 1" << endl;
|
||||
|
||||
if (dictionary["QREAL"] != "double")
|
||||
tmpStream << "#define QT_COORD_TYPE " << dictionary["QREAL"] << endl;
|
||||
|
Loading…
Reference in New Issue
Block a user