Work around GCC bug in generating 64-bit population of SSE register

We know what code we want GCC to generate, so I replaced the _mm256_set1_epi64x() call with intrinsics that map directly to that code. However, GCC sees through this and tries to "optimize" my code, so an empty asm() statement is used to make it keep the two operations separate. This generates optimal code for both 32- and 64-bit.

64-bit:
    vmovq %rdi, %xmm0
    vpbroadcastq %xmm0, %ymm0

32-bit:
    vmovq 8(%esp), %xmm0
    vpbroadcastq %xmm0, %ymm0

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87976

Change-Id: I42a48bd64ccc41aebf84fffd15664109b97fe42b
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
2018-11-11 18:57:12 -08:00 · 2018-11-11 18:57:12 -08:00 · 58f2aa907f
commit 58f2aa907f
parent 3df79b2953
1 changed files with 12 additions and 1 deletions
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@ -359,7 +359,18 @@ void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetyp

 void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
 {
-    qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), _mm256_set1_epi64x(value), count * sizeof(quint64));
+#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && !defined(Q_CC_INTEL)
+    // Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820 (see also id=87976): instead of _mm256_set1_epi64x(), move the 64-bit value into an XMM register and then broadcast it explicitly.
+    __m128i value64 = _mm_set_epi64x(0, value); // same result as _mm_cvtsi64_si128(value): 'value' in the low 64 bits, high 64 bits zero
+#  ifdef Q_PROCESSOR_X86_64
+    asm ("" : "+x" (value64)); // empty asm used as an optimization barrier: keeps GCC from merging the move above with the broadcast below
+#  endif
+    __m256i value256 =  _mm256_broadcastq_epi64(value64);
+#else
+    __m256i value256 = _mm256_set1_epi64x(value); // other compilers: use the plain 64-bit broadcast intrinsic
+#endif
+
+    qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64)); // length argument is in bytes: count * sizeof(quint64)
 }

 void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)