Add support for single-file multi-target intrinsics in Qt

GCC 4.9 now allows us to #include any and all intrinsics headers, not
just the one for which we're compiling code, a behavior that ICC and
MSVC have had for some time. With that, we're able to have the functions
for different targets in the same source file. See the GCC manual:
  http://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html

This functionality is notified by the QT_COMPILER_SUPPORTS_HERE(XXX)
macro, which indicates that all the intrinsics from
QT_COMPILER_SUPPORTS_xxx are available and enabled. To complement, a
QT_COMPILER_SUPPORTS(XXX) macro is also added.

Unlike ICC and MSVC, GCC requires a special function attribute, which
will also cause code optimization. That's the QT_FUNCTION_TARGET macro.

Note: because of the absence of the target attribute, ICC and MSVC will
not generate instructions with the VEX prefix unless they only exist
with the VEX prefix or if -mavx / -arch:AVX are enabled.

Change-Id: I0c1880c20324bd8e0fc68a863e36d1fa7755dff0
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
This commit is contained in:
Thiago Macieira 2013-08-06 19:32:37 -07:00 committed by The Qt Project
parent cb09e1e889
commit 689e8055f5
4 changed files with 114 additions and 28 deletions

2
configure vendored
View File

@ -6035,7 +6035,7 @@ for SUBARCH in SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX AVX2 \
eval "VAL=\$CFG_$SUBARCH"
case "$VAL" in
yes)
echo "#define QT_COMPILER_SUPPORTS_$SUBARCH" \
echo "#define QT_COMPILER_SUPPORTS_$SUBARCH 1" \
>>"$outpath/src/corelib/global/qconfig.h.new"
;;
esac

View File

@ -94,13 +94,14 @@ QT_BEGIN_NAMESPACE
(for instance, gcc 4.4 does that even at -O0).
*/
#ifdef __SSE4_2__
#if QT_COMPILER_SUPPORTS_HERE(SSE4_2)
static inline bool hasFastCrc32()
{
return true;
return qCpuHasFeature(SSE4_2);
}
template <typename Char>
QT_FUNCTION_TARGET(SSE4_2)
static uint crc32(const Char *ptr, size_t len, uint h)
{
// The CRC32 instructions from Nehalem calculate a 32-bit CRC32 checksum

View File

@ -50,6 +50,7 @@
* They mean the compiler supports the necessary flags and the headers
* for the x86 and ARM intrinsics:
* - GCC: the -mXXX or march=YYY flag is necessary before #include
* up to 4.8; GCC >= 4.9 can include unconditionally
* - Intel CC: #include can happen unconditionally
* - MSVC: #include can happen unconditionally
* - RVCT: ???
@ -60,25 +61,99 @@
* up do define __AVX__ if the -arch:AVX option is passed on the command-line.
*
* Supported XXX are:
* Flag | Arch | GCC | Intel CC | MSVC |
* NEON | ARM | I & C | None | ? |
* IWMMXT | ARM | I & C | None | I & C |
* SSE2 | x86 | I & C | I & C | I & C |
* SSE3 | x86 | I & C | I & C | I only |
* SSSE3 | x86 | I & C | I & C | I only |
* SSE4_1 | x86 | I & C | I & C | I only |
* SSE4_2 | x86 | I & C | I & C | I only |
* AVX | x86 | I & C | I & C | I & C |
* AVX2 | x86 | I & C | I & C | I only |
* Flag | Arch | GCC | Intel CC | MSVC |
* ARM_NEON | ARM | I & C | None | ? |
* IWMMXT | ARM | I & C | None | I & C |
* SSE2 | x86 | I & C | I & C | I & C |
* SSE3 | x86 | I & C | I & C | I only |
* SSSE3 | x86 | I & C | I & C | I only |
* SSE4_1 | x86 | I & C | I & C | I only |
* SSE4_2 | x86 | I & C | I & C | I only |
* AVX | x86 | I & C | I & C | I & C |
* AVX2 | x86 | I & C | I & C | I only |
* I = intrinsics; C = code generation
*
* Code can use the following constructs to determine compiler support & status:
* - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
* If this test passes, then the compiler is already generating code for that
* given sub-architecture. The intrinsics for that sub-architecture are
* #included and can be used without restriction or runtime check.
*
* - #if QT_COMPILER_SUPPORTS(XXX)
* If this test passes, then the compiler is able to generate code for that
* given sub-architecture in another translation unit, given the right set of
* flags. Use of the intrinsics is not guaranteed. This is useful with
* runtime detection (see below).
*
* - #if QT_COMPILER_SUPPORTS_HERE(XXX)
* If this test passes, then the compiler is able to generate code for that
* given sub-architecture in this translation unit, even if it is not doing
* that now (it might be). Individual functions may be tagged with
* QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
* sub-arch. Only inside such functions is the use of the intrisics
* guaranteed to work. This is useful with runtime detection (see below).
*
* Runtime detection of a CPU sub-architecture can be done with the
* qCpuHasFeature(XXX) function. There are two strategies for generating
* optimized code like that:
*
* 1) place the optimized code in a different translation unit (C or assembly
* sources) and pass the correct flags to the compiler to enable support. Those
* sources must not include qglobal.h, which means they cannot include this
* file either. The dispatcher function would look like this:
*
* void foo()
* {
* #if QT_COMPILER_SUPPORTS(XXX)
* if (qCpuHasFeature(XXX)) {
* foo_optimized_xxx();
* return;
* }
* #endif
* foo_plain();
* }
*
* 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
* surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
* other Qt code. The dispatcher function would look like this:
*
* void foo()
* {
* #if QT_COMPILER_SUPPORTS_HERE(XXX)
* if (qCpuHasFeature(XXX)) {
* foo_optimized_xxx();
* return;
* }
* #endif
* foo_plain();
* }
*/
#if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE))
#include <intrin.h>
#endif
#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
#if (defined(Q_CC_INTEL) || defined(Q_CC_MSVC) \
|| (defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && (__GNUC__-0) * 100 + (__GNUC_MINOR__-0) >= 409)) \
&& !defined(QT_BOOTSTRAPPED)
# define QT_COMPILER_SUPPORTS_SIMD_ALWAYS
# define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x)
# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
/* GCC requires attributes for a function */
# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
# define QT_FUNCTION_TARGET(x)
# endif
#else
# define QT_COMPILER_SUPPORTS_HERE(x) defined(__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
#endif
// SSE intrinsics
#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2"
#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
#if defined(QT_LINUXBASE) || defined(Q_OS_ANDROID_NO_SDK)
/// this is an evil hack - the posix_memalign declaration in LSB
/// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
@ -95,27 +170,33 @@
#endif
// SSE3 intrinsics
#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3"
#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
#include <pmmintrin.h>
#endif
// SSSE3 intrinsics
#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3"
#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
#include <tmmintrin.h>
#endif
// SSE4.1 intrinsics
#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1"
#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
#include <smmintrin.h>
#endif
// SSE4.2 intrinsics
#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2"
#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
#include <nmmintrin.h>
#endif
// AVX intrinsics
#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_MSVC))
#define QT_FUNCTION_TARGET_STRING_AVX "avx"
#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2"
#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS))
// immintrin.h is the ultimate header, we don't need anything else after this
#include <immintrin.h>
@ -147,8 +228,10 @@
#endif
// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
#if defined __ARM_NEON__
#include <arm_neon.h>
#define QT_FUNCTION_TARGET_STRING_ARM_NEON "neon"
#endif
@ -169,12 +252,14 @@
#endif
#endif
#undef QT_COMPILER_SUPPORTS_SIMD_ALWAYS
QT_BEGIN_NAMESPACE
enum CPUFeatures {
IWMMXT = 0x1,
NEON = 0x2,
NEON = 0x2, ARM_NEON = NEON,
SSE2 = 0x4,
SSE3 = 0x8,
SSSE3 = 0x10,

View File

@ -3476,21 +3476,21 @@ void Configure::generateConfigfiles()
tmpStream << endl << "// Compiler sub-arch support" << endl;
if (dictionary[ "SSE2" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2 1" << endl;
if (dictionary[ "SSE3" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3 1" << endl;
if (dictionary[ "SSSE3" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3 1" << endl;
if (dictionary[ "SSE4_1" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1 1" << endl;
if (dictionary[ "SSE4_2" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2 1" << endl;
if (dictionary[ "AVX" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX 1" << endl;
if (dictionary[ "AVX2" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2 1" << endl;
if (dictionary[ "IWMMXT" ] == "yes")
tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT" << endl;
tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT 1" << endl;
if (dictionary["QREAL"] != "double")
tmpStream << "#define QT_COORD_TYPE " << dictionary["QREAL"] << endl;