Improve skvx::any() and all() intrinsics

Removes the specializations of all() for AVX2 and SSE 4.1, which gave the
wrong result when a true lane held a non-zero value that didn't have all
bits set (inconsistent with other platforms and the non-SIMD path). Adds
a unit test that checks this case.
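
To see the failure mode outside of Skia, here is a minimal standalone
sketch (an illustration for this note, not code from this CL) comparing
_mm_testc_si128 against the lane-respecting movemask approach; it assumes
an SSE 4.1 target (e.g. -msse4.1):

#include <smmintrin.h>  // SSE 4.1
#include <cstdio>

int main() {
    // Four lanes that are all logically true for skvx (non-zero), but
    // whose bits are not all set.
    __m128i x = _mm_setr_epi32(1, 2, 3, 4);

    // Old specialization: testc against all-ones asks "is every BIT set?"
    // -> 0, wrong under skvx's any-non-zero-is-true semantics.
    int bitwise = _mm_testc_si128(x, _mm_set1_epi32(-1));

    // Lane-respecting version: compare each lane to 0, then movemask -> 1.
    __m128 ne0 = _mm_cmpneq_ps(_mm_castsi128_ps(x), _mm_set1_ps(0.0f));
    int lanewise = (_mm_movemask_ps(ne0) == 0b1111);

    std::printf("testc all() = %d, movemask all() = %d\n", bitwise, lanewise);
    return 0;
}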

The mirror specializations for any() on AVX2 and SSE 4.1 are actually
valid, so those were added, along with a 2-instruction specialization for
SSE covering any() and all(). This is the code clang-trunk produces for
the equivalent expression at -O3, but even ToT clang struggles to
auto-vectorize skvx's generic reduction into it.
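
The asymmetry is easy to convince yourself of with a standalone sketch
(again illustrative, assuming SSE 4.1): testz only has to detect a single
set bit anywhere, so the lane layout cannot matter.

#include <smmintrin.h>
#include <cassert>

int main() {
    // _mm_testz_si128(a, b) returns 1 iff (a & b) == 0, so against an
    // all-ones mask !testz is exactly "some bit of x is set" -- matching
    // skvx's any() semantics no matter how the bits fall across lanes.
    __m128i one_bit = _mm_setr_epi32(0, 0, 16, 0);
    assert(_mm_testz_si128(one_bit, _mm_set1_epi32(-1)) == 0);  // any() == true

    __m128i zero = _mm_setzero_si128();
    assert(_mm_testz_si128(zero, _mm_set1_epi32(-1)) == 1);     // any() == false
    return 0;
}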

Also adds NEON specializations for any() and all(), since even
clang-trunk struggles to vectorize them automatically. In particular,
this helps skgpu::graphite::Rect's implementations of intersect and
contains, which use any()/all() to produce their final boolean values. In
the Instruments app I had seen Rect's intersection show up as a hotspot
on the Mac M1, and this vectorization helps a bit.
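
On AArch64 those horizontal reductions are single instructions
(umaxv/uminv), which is what the new specializations lean on. A hedged
sketch of the same idea (assumes an AArch64 target; simplified relative
to the CL's generic code):

#include <arm_neon.h>
#include <cstdio>

// any(): the max across lanes is non-zero iff some lane is non-zero.
static bool any_u32x4(uint32x4_t v) { return vmaxvq_u32(v) != 0; }

// all(): the min across lanes is non-zero iff every lane is non-zero.
static bool all_u32x4(uint32x4_t v) { return vminvq_u32(v) != 0; }

int main() {
    const uint32_t vals[4] = {1, 2, 3, 4};
    uint32x4_t v = vld1q_u32(vals);
    std::printf("any=%d all=%d\n", any_u32x4(v), all_u32x4(v));  // any=1 all=1
    return 0;
}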

Also takes the opportunity to replace the faked C++14 constexpr-if with a
real if constexpr.
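
For reference, the difference (a generic illustration, not skvx code): a
plain C++14 'if' still compiles both branches for every instantiation,
while C++17 if constexpr discards the untaken branch entirely, so code
that is only valid for some types or sizes can live inline.

#include <cstdio>
#include <string>
#include <type_traits>

template <typename T>
double describe(const T& v) {
    if constexpr (std::is_arithmetic<T>::value) {
        return static_cast<double>(v);         // the else branch is never instantiated here
    } else {
        return static_cast<double>(v.size());  // would not compile for int with a plain 'if'
    }
}

int main() {
    std::printf("%g %g\n", describe(42), describe(std::string("hi")));
    return 0;
}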

Change-Id: Ib142e305ae5615056a777424e379b6da82d44f0c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542296
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Michael Ludwig 2022-05-19 15:55:26 -04:00 committed by SkCQ
parent ab8cfbed8f
commit 4621ef2a8a
2 changed files with 52 additions and 10 deletions

include/private/SkVx.h

@@ -461,7 +461,6 @@ SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<
 }
 SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
     // Specializations inline here so they can generalize what types they apply to.
-    // (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
 #if SKVX_USE_SIMD && defined(__AVX2__)
     if constexpr (N*sizeof(T) == 32) {
         return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
@@ -494,6 +493,34 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
 SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool any(const Vec<N,T>& x) {
+    // For any(), the _mm_testz intrinsics are correct and don't require comparing 'x' to 0, so it's
+    // lower latency compared to _mm_movemask + _mm_cmpneq on plain SSE.
+#if SKVX_USE_SIMD && defined(__AVX2__)
+    if constexpr (N*sizeof(T) == 32) {
+        return !_mm256_testz_si256(unchecked_bit_pun<__m256i>(x), _mm256_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE4_1__)
+    if constexpr (N*sizeof(T) == 16) {
+        return !_mm_testz_si128(unchecked_bit_pun<__m128i>(x), _mm_set1_epi32(-1));
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__SSE__)
+    if constexpr (N*sizeof(T) == 16) {
+        // On SSE, movemask checks only the MSB in each lane, which is fine if the lanes were set
+        // directly from a comparison op (which sets all bits to 1 when true), but skvx::Vec<>
+        // treats any non-zero value as true, so we have to compare 'x' to 0 before calling movemask
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) != 0b0000;
+    }
+#endif
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the max across lanes, which will be non-zero if any lane was true.
+    // The specific lane-size doesn't really matter in this case since it's really any set bit
+    // that we're looking for.
+    if constexpr (N*sizeof(T) == 8 ) { return vmaxv_u8 (unchecked_bit_pun<uint8x8_t> (x)) > 0; }
+    if constexpr (N*sizeof(T) == 16) { return vmaxvq_u8(unchecked_bit_pun<uint8x16_t>(x)) > 0; }
+#endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {
         return wasm_i32x4_any_true(unchecked_bit_pun<VExt<4,int>>(x));
@@ -505,17 +532,24 @@ SINT bool any(const Vec<N,T>& x) {
 SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 SINT bool all(const Vec<N,T>& x) {
-#if SKVX_USE_SIMD && defined(__AVX2__)
-    if constexpr (N*sizeof(T) == 32) {
-        return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
-                                  _mm256_set1_epi32(-1));
+    // Unlike any(), we have to respect the lane layout, or we'll miss cases where a
+    // true lane has a mix of 0 and 1 bits.
+#if SKVX_USE_SIMD && defined(__SSE__)
+    // Unfortunately, the _mm_testc intrinsics don't let us avoid the comparison to 0 for all()'s
+    // correctness, so always just use the plain SSE version.
+    if constexpr (N == 4 && sizeof(T) == 4) {
+        return _mm_movemask_ps(_mm_cmpneq_ps(unchecked_bit_pun<__m128>(x),
+                                             _mm_set1_ps(0))) == 0b1111;
     }
 #endif
-#if SKVX_USE_SIMD && defined(__SSE4_1__)
-    if constexpr (N*sizeof(T) == 16) {
-        return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
-                               _mm_set1_epi32(-1));
-    }
+#if SKVX_USE_SIMD && defined(__aarch64__)
+    // On 64-bit NEON, take the min across the lanes, which will be non-zero if all lanes are != 0.
+    if constexpr (sizeof(T)==1 && N==8)  {return vminv_u8  (unchecked_bit_pun<uint8x8_t> (x)) > 0;}
+    if constexpr (sizeof(T)==1 && N==16) {return vminvq_u8 (unchecked_bit_pun<uint8x16_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==4)  {return vminv_u16 (unchecked_bit_pun<uint16x4_t>(x)) > 0;}
+    if constexpr (sizeof(T)==2 && N==8)  {return vminvq_u16(unchecked_bit_pun<uint16x8_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==2)  {return vminv_u32 (unchecked_bit_pun<uint32x2_t>(x)) > 0;}
+    if constexpr (sizeof(T)==4 && N==4)  {return vminvq_u32(unchecked_bit_pun<uint32x4_t>(x)) > 0;}
 #endif
 #if SKVX_USE_SIMD && defined(__wasm_simd128__)
     if constexpr (N == 4 && sizeof(T) == 4) {

tests/SkVxTest.cpp

@@ -43,6 +43,14 @@ DEF_TEST(SkVx, r) {
         REPORTER_ASSERT(r, !all(mask));
     }
+    {
+        // Tests that any/all work with non-zero values, not just full bit lanes.
+        REPORTER_ASSERT(r,  all(int4{1,2,3,4}));
+        REPORTER_ASSERT(r, !all(int4{1,2,3}));
+        REPORTER_ASSERT(r,  any(int4{1,2}));
+        REPORTER_ASSERT(r, !any(int4{}));
+    }
+
     REPORTER_ASSERT(r, min(float4{1,2,3,4}) == 1);
     REPORTER_ASSERT(r, max(float4{1,2,3,4}) == 4);
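
For context on the graphite call-sites mentioned above, a hedged sketch
of how any()/all() typically terminate such rect logic (simplified
stand-ins for illustration; skgpu::graphite::Rect's actual layout and
methods may differ):

#include "include/private/SkVx.h"  // SkVx.h's assumed location at the time of this CL

using float4 = skvx::Vec<4, float>;

// A rect stored as (left, top, -right, -bottom): one lane-wise compare
// answers each geometric question, and all() collapses the 4-lane mask
// into the final bool -- the reduction these specializations speed up.
struct SimpleRect {
    float4 fVals;  // (l, t, -r, -b)

    bool contains(const SimpleRect& r) const {
        // l <= r.l and t <= r.t, plus r.r <= this->r and r.b <= this->b
        // via the negated lanes, all in one compare.
        return skvx::all(fVals <= r.fVals);
    }
    bool intersects(const SimpleRect& r) const {
        // l0 <= r1, t0 <= b1, l1 <= r0, t1 <= b0: overlap on both axes.
        return skvx::all(fVals <= skvx::shuffle<2,3,0,1>(-r.fVals));
    }
};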