Improve skvx::any() and all() intrinsics

Removes the specializations of all() for AVX2 and SSE 4.1, which gave the
wrong result when a true lane held a non-zero value that didn't have all
bits set (inconsistent with other platforms and the non-SIMD path). Adds
a unit test that checks this case.
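
To see the failure mode outside of Skia, here is a minimal standalone
sketch (an illustration for this note, not code from this CL) comparing
_mm_testc_si128 against the lane-respecting movemask approach; it assumes
an SSE 4.1 target (e.g. -msse4.1):

#include <smmintrin.h>  // SSE 4.1
#include <cstdio>

int main() {
    // Four lanes that are all logically true for skvx (non-zero), but
    // whose bits are not all set.
    __m128i x = _mm_setr_epi32(1, 2, 3, 4);

    // Old specialization: testc against all-ones asks "is every BIT set?"
    // -> 0, wrong under skvx's any-non-zero-is-true semantics.
    int bitwise = _mm_testc_si128(x, _mm_set1_epi32(-1));

    // Lane-respecting version: compare each lane to 0, then movemask -> 1.
    __m128 ne0 = _mm_cmpneq_ps(_mm_castsi128_ps(x), _mm_set1_ps(0.0f));
    int lanewise = (_mm_movemask_ps(ne0) == 0b1111);

    std::printf("testc all() = %d, movemask all() = %d\n", bitwise, lanewise);
    return 0;
}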

The mirror specializations for any() on AVX2 and SSE 4.1 are actually
valid, so those were added, along with a 2-instruction specialization for
SSE covering any() and all(). This is the code clang-trunk produces for
the equivalent expression at -O3, but even ToT clang struggles to
auto-vectorize skvx's generic reduction into it.
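
The asymmetry is easy to convince yourself of with a standalone sketch
(again illustrative, assuming SSE 4.1): testz only has to detect a single
set bit anywhere, so the lane layout cannot matter.

#include <smmintrin.h>
#include <cassert>

int main() {
    // _mm_testz_si128(a, b) returns 1 iff (a & b) == 0, so against an
    // all-ones mask !testz is exactly "some bit of x is set" -- matching
    // skvx's any() semantics no matter how the bits fall across lanes.
    __m128i one_bit = _mm_setr_epi32(0, 0, 16, 0);
    assert(_mm_testz_si128(one_bit, _mm_set1_epi32(-1)) == 0);  // any() == true

    __m128i zero = _mm_setzero_si128();
    assert(_mm_testz_si128(zero, _mm_set1_epi32(-1)) == 1);     // any() == false
    return 0;
}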

Also adds NEON specializations for any() and all(), since even
clang-trunk struggles to vectorize them automatically. In particular,
this helps skgpu::graphite::Rect's implementations of intersect and
contains, which use any()/all() to produce their final boolean values. In
the Instruments app I had seen Rect's intersection show up as a hotspot
on the Mac M1, and this vectorization helps a bit.
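
On AArch64 those horizontal reductions are single instructions
(umaxv/uminv), which is what the new specializations lean on. A hedged
sketch of the same idea (assumes an AArch64 target; simplified relative
to the CL's generic code):

#include <arm_neon.h>
#include <cstdio>

// any(): the max across lanes is non-zero iff some lane is non-zero.
static bool any_u32x4(uint32x4_t v) { return vmaxvq_u32(v) != 0; }

// all(): the min across lanes is non-zero iff every lane is non-zero.
static bool all_u32x4(uint32x4_t v) { return vminvq_u32(v) != 0; }

int main() {
    const uint32_t vals[4] = {1, 2, 3, 4};
    uint32x4_t v = vld1q_u32(vals);
    std::printf("any=%d all=%d\n", any_u32x4(v), all_u32x4(v));  // any=1 all=1
    return 0;
}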

Also takes the opportunity to replace the faked C++14 constexpr-if with a
real if constexpr.
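
For reference, the difference (a generic illustration, not skvx code): a
plain C++14 'if' still compiles both branches for every instantiation,
while C++17 if constexpr discards the untaken branch entirely, so code
that is only valid for some types or sizes can live inline.

#include <cstdio>
#include <string>
#include <type_traits>

template <typename T>
double describe(const T& v) {
    if constexpr (std::is_arithmetic<T>::value) {
        return static_cast<double>(v);         // the else branch is never instantiated here
    } else {
        return static_cast<double>(v.size());  // would not compile for int with a plain 'if'
    }
}

int main() {
    std::printf("%g %g\n", describe(42), describe(std::string("hi")));
    return 0;
}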

Change-Id: Ib142e305ae5615056a777424e379b6da82d44f0c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542296
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Michael Ludwig 2022-05-19 15:55:26 -04:00 committed by SkCQ
parent ab8cfbed8f
commit 4621ef2a8a
2 changed files with 52 additions and 10 deletions

include/private/SkVx.h

@@ -461,7 +461,6 @@ SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<
 }
 SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
     // Specializations inline here so they can generalize what types they apply to.
-    // (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
 #if SKVX_USE_SIMD && defined(__AVX2__)
     if constexpr (N*sizeof(T) == 32) {
         return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
@@ -494,6 +493,34 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
 SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool any(const Vec<N,T>& x) {
+    // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's
+    // lower latency compared to _mm_movemask + _mm_cmpneq on plain SSE.
+#if SKVX_USE_SIMD && defined(__AVX2__)
+    if constexpr (N*sizeof(T) == 32) {
+        return !_mm256_testz_si256(unchecked_bit_pun<__m256i>(x), _mm256_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE4_1__)
+    if constexpr (N*sizeof(T) == 16) {
+        return !_mm_testz_si128(unchecked_bit_pun<__m128i>(x), _mm_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE__)
+    if constexpr (N*sizeof(T) == 16) {
+        // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set
+        // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>
+        // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) != 0b0000;
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.
+    // The specific lane-size doesn't really matter in this case since it's really any set bit
+    // that we're looking for.
+    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (unchecked_bit_pun<uint8x8_t> (x)) > 0; }
+    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(unchecked_bit_pun<uint8x16_t>(x)) > 0; }
+#endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {
         return wasm_i32x4_any_true(unchecked_bit_pun<VExt<4,int>>(x));
@@ -505,17 +532,24 @@ SINT bool any(const Vec<N,T>& x) {
 SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool all(const Vec<N,T>& x) {
-#if SKVX_USE_SIMD && defined(__AVX2__)
-    if constexpr (N*sizeof(T) == 32) {
-        return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
-                                  _mm256_set1_epi32(-1));
+    // Unlike any(), we have to respect the lane layout, or we'll miss cases where a
+    // true lane has a mix of 0 and 1 bits.
+#if SKVX_USE_SIMD && defined(__SSE__)
+    // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s
+    // correctness, so always just use the plain SSE version.
+    if constexpr (N == 4 && sizeof(T) == 4) {
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) == 0b1111;
     }
 #endif
-#if SKVX_USE_SIMD && defined(__SSE4_1__)
-    if constexpr (N*sizeof(T) == 16) {
-        return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
-                               _mm_set1_epi32(-1));
-    }
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.
+    if constexpr (sizeof(T)==1 && N==8)  {return vminv_u8  (unchecked_bit_pun<uint8x8_t> (x)) > 0;}
+    if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (unchecked_bit_pun<uint8x16_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==4)  {return vminv_u16 (unchecked_bit_pun<uint16x4_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==8)  {return vminvq_u16(unchecked_bit_pun<uint16x8_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==2)  {return vminv_u32 (unchecked_bit_pun<uint32x2_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==4)  {return vminvq_u32(unchecked_bit_pun<uint32x4_t>(x)) > 0;}
 #endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {

tests/SkVxTest.cpp

@@ -43,6 +43,14 @@ DEF_TEST(SkVx, r) {
         REPORTER_ASSERT(r, !all(mask));
     }
+    {
+        // Tests that any/all work with non-zero values, not just full bit lanes.
+        REPORTER_ASSERT(r,  all(int4{1,2,3,4}));
+        REPORTER_ASSERT(r, !all(int4{1,2,3}));
+        REPORTER_ASSERT(r,  any(int4{1,2}));
+        REPORTER_ASSERT(r, !any(int4{}));
+    }
+
     REPORTER_ASSERT(r, min(float4{1,2,3,4}) == 1);
     REPORTER_ASSERT(r, max(float4{1,2,3,4}) == 4);
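
For context on the graphite call-sites mentioned above, a hedged sketch
of how any()/all() typically terminate such rect logic (simplified
stand-ins for illustration; skgpu::graphite::Rect's actual layout and
methods may differ):

#include "include/private/SkVx.h"  // SkVx.h's assumed location at the time of this CL

using float4 = skvx::Vec<4, float>;

// A rect stored as (left, top, -right, -bottom): one lane-wise compare
// answers each geometric question, and all() collapses the 4-lane mask
// into the final bool -- the reduction these specializations speed up.
struct SimpleRect {
    float4 fVals;  // (l, t, -r, -b)

    bool contains(const SimpleRect& r) const {
        // l <= r.l and t <= r.t, plus r.r <= this->r and r.b <= this->b
        // via the negated lanes, all in one compare.
        return skvx::all(fVals <= r.fVals);
    }
    bool intersects(const SimpleRect& r) const {
        // l0 <= r1, t0 <= b1, l1 <= r0, t1 <= b0: overlap on both axes.
        return skvx::all(fVals <= skvx::shuffle<2,3,0,1>(-r.fVals));
    }
};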