From 4621ef2a8af08e63fcb14b1c4fed7e5125e308a9 Mon Sep 17 00:00:00 2001
From: Michael Ludwig
Date: Thu, 19 May 2022 15:55:26 -0400
Subject: [PATCH] Improve skvx::any() and all() intrinsics

Removes specializations for all() on AVX2 and SSE 4.1, which gave the
wrong results if the ints didn't have all bits set (inconsistent with
other platforms and non-SIMD). Added a unit test that checks this case.

The mirror specializations for any() on AVX2 and SSE 4.1 are actually
valid, so added those, and added a 2-instruction specialization for SSE
for any() and all(). This is what clang-trunk produces on -O3, but ToT
clang struggles to vectorize it.

Also adds specializations for NEON for any() and all(), since even
clang-trunk was struggling to vectorize it automatically. In
particular, this will help skgpu::graphite::Rect's implementations of
intersect and contains, which use any/all to get a final boolean value.
In the Instruments app, I had seen Rect's intersection as a hotspot on
the Mac M1, and this vectorization helps a bit.

Also takes the opportunity to replace the fake C++14 constexpr-if with
a real constexpr if.

Change-Id: Ib142e305ae5615056a777424e379b6da82d44f0c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542296
Commit-Queue: Michael Ludwig
Reviewed-by: Herb Derby
---
 include/private/SkVx.h | 54 ++++++++++++++++++++++++++++++++++--------
 tests/SkVxTest.cpp     |  8 ++++++++
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index efe3ceec5d..2a5c2445dc 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -461,7 +461,6 @@ SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<
 }
 SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
     // Specializations inline here so they can generalize what types they apply to.
-    // (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
 #if SKVX_USE_SIMD && defined(__AVX2__)
     if constexpr (N*sizeof(T) == 32) {
         return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
@@ -494,6 +493,34 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Ve
 
 SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool any(const Vec<N,T>& x) {
+    // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's
+    // lower latency compared to _mm_movemask + _mm_cmpneq on plain SSE.
+#if SKVX_USE_SIMD && defined(__AVX2__)
+    if constexpr (N*sizeof(T) == 32) {
+        return !_mm256_testz_si256(unchecked_bit_pun<__m256i>(x), _mm256_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE4_1__)
+    if constexpr (N*sizeof(T) == 16) {
+        return !_mm_testz_si128(unchecked_bit_pun<__m128i>(x), _mm_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE__)
+    if constexpr (N*sizeof(T) == 16) {
+        // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set
+        // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>
+        // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask.
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) != 0b0000;
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.
+    // The specific lane-size doesn't really matter in this case since it's really any set bit
+    // that we're looking for.
+    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (unchecked_bit_pun<uint8x8_t> (x)) > 0; }
+    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(unchecked_bit_pun<uint8x16_t>(x)) > 0; }
+#endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {
         return wasm_i32x4_any_true(unchecked_bit_pun<VExt<4,int>>(x));
@@ -505,17 +532,24 @@ SINT bool any(const Vec<N,T>& x) {
 
 SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool all(const Vec<N,T>& x) {
-#if SKVX_USE_SIMD && defined(__AVX2__)
-    if constexpr (N*sizeof(T) == 32) {
-        return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
-                                  _mm256_set1_epi32(-1));
+// Unlike any(), we have to respect the lane layout, or we'll miss cases where a
+// true lane has a mix of 0 and 1 bits.
+#if SKVX_USE_SIMD && defined(__SSE__)
+    // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s
+    // correctness, so always just use the plain SSE version.
+    if constexpr (N == 4 && sizeof(T) == 4) {
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) == 0b1111;
     }
 #endif
-#if SKVX_USE_SIMD && defined(__SSE4_1__)
-    if constexpr (N*sizeof(T) == 16) {
-        return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
-                               _mm_set1_epi32(-1));
-    }
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.
+    if constexpr (sizeof(T)==1 && N==8)  {return vminv_u8  (unchecked_bit_pun<uint8x8_t> (x)) > 0;}
+    if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (unchecked_bit_pun<uint8x16_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==4)  {return vminv_u16 (unchecked_bit_pun<uint16x4_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==8)  {return vminvq_u16(unchecked_bit_pun<uint16x8_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==2)  {return vminv_u32 (unchecked_bit_pun<uint32x2_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==4)  {return vminvq_u32(unchecked_bit_pun<uint32x4_t>(x)) > 0;}
 #endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {
diff --git a/tests/SkVxTest.cpp b/tests/SkVxTest.cpp
index 1dcd28a4e6..9999a81935 100644
--- a/tests/SkVxTest.cpp
+++ b/tests/SkVxTest.cpp
@@ -43,6 +43,14 @@ DEF_TEST(SkVx, r) {
         REPORTER_ASSERT(r, !all(mask));
     }
 
+    {
+        // Tests that any/all work with non-zero values, not just full bit lanes.
+        REPORTER_ASSERT(r, all(int4{1,2,3,4}));
+        REPORTER_ASSERT(r, !all(int4{1,2,3}));
+        REPORTER_ASSERT(r, any(int4{1,2}));
+        REPORTER_ASSERT(r, !any(int4{}));
+    }
+
     REPORTER_ASSERT(r, min(float4{1,2,3,4}) == 1);
     REPORTER_ASSERT(r, max(float4{1,2,3,4}) == 4);
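For illustration (not part of the patch): the failure mode of the removed
testc-based all() is easy to reproduce in isolation. _mm_testc_si128(x, ones)
only reports true when every bit of x is set, while skvx treats any non-zero
lane value as true. Below is a minimal standalone sketch, assuming an
SSE4.1-capable host; the file name and harness are hypothetical (build with
e.g. clang++ -msse4.1 testc_demo.cpp):

#include <smmintrin.h>  // SSE4.1: _mm_testc_si128
#include <cstdio>

int main() {
    // Four int lanes each holding 1: "all true" under skvx's non-zero rule.
    __m128i x    = _mm_set1_epi32(1);
    __m128i ones = _mm_set1_epi32(-1);

    // Removed implementation: true only if every bit of x is set, so lanes
    // holding 1 (a single set bit each) make it report false.
    bool old_all = _mm_testc_si128(x, ones);

    // Patched implementation: compare each lane to 0 first, then take the
    // per-lane sign-bit mask; all four lanes non-zero => mask == 0b1111.
    bool new_all = _mm_movemask_ps(_mm_cmpneq_ps(_mm_castsi128_ps(x),
                                                 _mm_set1_ps(0))) == 0b1111;

    std::printf("old all() = %d, new all() = %d\n", old_all, new_all); // 0, 1
    return 0;
}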
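Similarly (also not part of the patch), the NEON branches map any() and all()
onto across-lane reductions: all() respects the lane layout and takes the min,
while any() can reinterpret the vector as bytes and take the max, since any
set bit anywhere suffices. A short sketch, assuming an aarch64 host and a
hypothetical file name (clang++ neon_demo.cpp):

#include <arm_neon.h>
#include <cstdio>

int main() {
    // int4-style data: {1,2,3,0} is "any true" but not "all true".
    uint32_t lanes[4] = {1, 2, 3, 0};
    uint32x4_t v = vld1q_u32(lanes);

    // all(): the min across u32 lanes is non-zero only if every lane is.
    bool all4 = vminvq_u32(v) > 0;

    // any(): lane width is irrelevant (one set bit anywhere suffices), so
    // reinterpret as 16 bytes and take the byte max, as the patch does.
    bool any4 = vmaxvq_u8(vreinterpretq_u8_u32(v)) > 0;

    std::printf("all = %d, any = %d\n", all4, any4); // 0, 1
    return 0;
}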