From 952f8f17e41b21c0b301e63412ba941b1ffdad9d Mon Sep 17 00:00:00 2001
From: Mike Klein
Date: Wed, 16 Sep 2020 14:33:37 -0500
Subject: [PATCH] Reland "update skvx scalar-fallback strategy"

This is a reland of 4985db413d3ee27b944936c71c0cd04740cd28da

...with a better implementation of map().

I don't understand why we had to revert, but it had something to do with
calling the function pointer in map_(), so maybe this will help.  I've
flattened the map_() / map() merge CL into this one, and marked the
resulting map() as no_sanitize("cfi").  I don't see anything wrong, so I
think it's a false positive.

Original change's description:
> update skvx scalar-fallback strategy
>
> Turns out Clang's a lot better at auto-vectorizing "obvious" scalar code
> into obvious vector code when it's written out the long way, e.g.
>
>     F32x4 x = ...;
>     x = { sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]) };
>
> vectorizes into sqrtps a lot more reliably than our recurse-onto-scalars
> strategy, and also better than the other naive approach,
>
>     F32x4 x = ...;
>     for (int i = 0; i < 4; i++) { x[i] = sqrtf(x[i]); }
>
> So here I've added a map(V, fn) -> V' using C++14 tricks to let the
> compiler handle the expansion of x = { fn(x[0]), fn(x[1]), ...,
> fn(x[N-1]) } for any N, and implemented most skvx scalar-fallback code
> using that.
>
> With these now vectorizing well at any N, we can remove the
> specializations we'd written for particular N, really tidying things up.
>
> Over in the SkVM interpreter, this is a big improvement for ceil and
> floor, which were being done 2 floats at a time instead of 8.  They're
> now slimmed way down to
>
>     shlq $6, %r13
>     vroundps $K, (%r12,%r13), %ymm0
>     vroundps $K, 32(%r12,%r13), %ymm1
>     jmp ...
>
> where K is 9 or 10 depending on the op.
>
> I haven't found a scalar function that Clang will vectorize to vcvtps2dq
> (the rounding one, not the truncating vcvttps2dq), so I've kept lrint()
> written the long way, updated to the style I've been using lately, with
> specializations inline.
>
> Change-Id: Ia97abe3c876008228bf62b1daacd6f6140408fc4
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317375
> Reviewed-by: Herb Derby
> Commit-Queue: Mike Klein
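For reference (not part of the diff below), the expansion trick behind
map() can be sketched standalone.  This is a simplified sketch: MiniVec and
map_sketch are illustrative names only, and the CL itself folds the two
overloads into one function with a defaulted index_sequence argument,
marked no_sanitize("cfi"):

    #include <cmath>
    #include <cstddef>
    #include <utility>

    // Just enough of a vector type to show the expansion.
    template <int N, typename T>
    struct MiniVec { T vals[N]; T operator[](int i) const { return vals[i]; } };

    // Expands to { fn(x[0]), fn(x[1]), ..., fn(x[N-1]) }, the form Clang
    // auto-vectorizes most reliably.
    template <int N, typename T, typename Fn, std::size_t... I>
    MiniVec<N,T> map_sketch(const MiniVec<N,T>& x, Fn&& fn,
                            std::index_sequence<I...>) {
        return { fn(x[I])... };
    }

    // Bootstrap the index_sequence 0,1,...,N-1, then expand.
    template <int N, typename T, typename Fn>
    MiniVec<N,T> map_sketch(const MiniVec<N,T>& x, Fn&& fn) {
        return map_sketch(x, std::forward<Fn>(fn),
                          std::make_index_sequence<N>{});
    }

    // e.g. map_sketch(v, sqrtf) on a MiniVec<4,float> should compile to a
    // single sqrtps with Clang at -O2.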
Cq-Include-Trybots: luci.chromium.try:linux_chromium_cfi_rel_ng
Bug: chromium:1129408
Change-Id: Ia9c14074b9a14a67dd221f4925894d35a551f9d7
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317551
Commit-Queue: Mike Klein
Reviewed-by: Herb Derby
---
 include/private/SkVx.h | 127 ++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 78 deletions(-)

diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index 19ff470392..abe577dba6 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -329,22 +329,6 @@ SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 
 SIT Vec<1,T> pow(const Vec<1,T>& x, const Vec<1,T>& y) { return std::pow(x.val, y.val); }
-SIT Vec<1,T>  atan(const Vec<1,T>& x) { return std:: atan(x.val); }
-SIT Vec<1,T>  ceil(const Vec<1,T>& x) { return std:: ceil(x.val); }
-SIT Vec<1,T> floor(const Vec<1,T>& x) { return std::floor(x.val); }
-SIT Vec<1,T> trunc(const Vec<1,T>& x) { return std::trunc(x.val); }
-SIT Vec<1,T> round(const Vec<1,T>& x) { return std::round(x.val); }
-SIT Vec<1,T>  sqrt(const Vec<1,T>& x) { return std:: sqrt(x.val); }
-SIT Vec<1,T>   abs(const Vec<1,T>& x) { return std::  abs(x.val); }
-SIT Vec<1,T>   sin(const Vec<1,T>& x) { return std::  sin(x.val); }
-SIT Vec<1,T>   cos(const Vec<1,T>& x) { return std::  cos(x.val); }
-SIT Vec<1,T>   tan(const Vec<1,T>& x) { return std::  tan(x.val); }
-
-SIT Vec<1,int> lrint(const Vec<1,T>& x) { return (int)std::lrint(x.val); }
-
-SIT Vec<1,T>   rcp(const Vec<1,T>& x) { return 1 / x.val; }
-SIT Vec<1,T> rsqrt(const Vec<1,T>& x) { return rcp(sqrt(x)); }
-
 
 // All default N != 1 implementations just recurse on lo and hi halves.
 // Clang can reason about naive_if_then_else() and optimize through it better
@@ -395,23 +379,6 @@ SINT Vec<N,T> pow(const Vec<N,T>& x, const Vec<N,T>& y) {
     return join(pow(x.lo, y.lo), pow(x.hi, y.hi));
 }
-SINT Vec<N,T>  atan(const Vec<N,T>& x) { return join( atan(x.lo),  atan(x.hi)); }
-SINT Vec<N,T>  ceil(const Vec<N,T>& x) { return join( ceil(x.lo),  ceil(x.hi)); }
-SINT Vec<N,T> floor(const Vec<N,T>& x) { return join(floor(x.lo), floor(x.hi)); }
-SINT Vec<N,T> trunc(const Vec<N,T>& x) { return join(trunc(x.lo), trunc(x.hi)); }
-SINT Vec<N,T> round(const Vec<N,T>& x) { return join(round(x.lo), round(x.hi)); }
-SINT Vec<N,T>  sqrt(const Vec<N,T>& x) { return join( sqrt(x.lo),  sqrt(x.hi)); }
-SINT Vec<N,T>   abs(const Vec<N,T>& x) { return join(  abs(x.lo),   abs(x.hi)); }
-SINT Vec<N,T>   sin(const Vec<N,T>& x) { return join(  sin(x.lo),   sin(x.hi)); }
-SINT Vec<N,T>   cos(const Vec<N,T>& x) { return join(  cos(x.lo),   cos(x.hi)); }
-SINT Vec<N,T>   tan(const Vec<N,T>& x) { return join(  tan(x.lo),   tan(x.hi)); }
-
-SINT Vec<N,int> lrint(const Vec<N,T>& x) { return join(lrint(x.lo), lrint(x.hi)); }
-
-SINT Vec<N,T>   rcp(const Vec<N,T>& x) { return join(  rcp(x.lo),   rcp(x.hi)); }
-SINT Vec<N,T> rsqrt(const Vec<N,T>& x) { return join(rsqrt(x.lo), rsqrt(x.hi)); }
-
-
 // Scalar/vector operations just splat the scalar to a vector...
 SINTU Vec<N,T> operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
 SINTU Vec<N,T> operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) - y; }
 SINTU Vec<N,T> operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) * y; }
@@ -519,10 +486,57 @@ SIN Vec<N,float> fma(const Vec<N,float>& x, const Vec<N,float>& y, const Vec<N,
 
-SIN Vec<N,float> fract(const Vec<N,float>& x) {
-    return x - floor(x);
+template <int N, typename T, typename Fn, size_t... I>
+#if defined(__clang__)
+// CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
+// with errors like "control flow integrity check for type 'float (float)
+// noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
+// here".  But we can be quite sure fn is the right type: it's all inferred!
+// So, stifle CFI in this function.
+__attribute__((no_sanitize("cfi")))
+#endif
+SI auto map(const skvx::Vec<N,T>& x, Fn&& fn,
+            std::index_sequence<I...> ix = {}) -> skvx::Vec<N, decltype(fn(x[0]))> {
+    if /*constexpr*/ (sizeof...(I) == 0) {
+        // When called as map(x, fn), bootstrap the index_sequence we want: 0,1,...,N-1.
+        return map(x, fn, std::make_index_sequence<N>{});
+    }
+    return { fn(x[I])... };
 }
+SIN Vec<N,float>  atan(const Vec<N,float>& x) { return map(x,  atanf); }
+SIN Vec<N,float>  ceil(const Vec<N,float>& x) { return map(x,  ceilf); }
+SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(x, floorf); }
+SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(x, truncf); }
+SIN Vec<N,float> round(const Vec<N,float>& x) { return map(x, roundf); }
+SIN Vec<N,float>  sqrt(const Vec<N,float>& x) { return map(x,  sqrtf); }
+SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map(x,  fabsf); }
+SIN Vec<N,float>   sin(const Vec<N,float>& x) { return map(x,   sinf); }
+SIN Vec<N,float>   cos(const Vec<N,float>& x) { return map(x,   cosf); }
+SIN Vec<N,float>   tan(const Vec<N,float>& x) { return map(x,   tanf); }
+
+SI Vec<1,int> lrint(const Vec<1,float>& x) {
+    return (int)lrintf(x.val);
+}
+SIN Vec<N,int> lrint(const Vec<N,float>& x) {
+#if defined(__AVX__)
+    if /*constexpr*/ (N == 8) {
+        return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
+    }
+#endif
+#if defined(__SSE__)
+    if /*constexpr*/ (N == 4) {
+        return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
+    }
+#endif
+    return join(lrint(x.lo),
+                lrint(x.hi));
+}
+
+SIN Vec<N,float>   rcp(const Vec<N,float>& x) { return 1/x; }
+SIN Vec<N,float> rsqrt(const Vec<N,float>& x) { return rcp(sqrt(x)); }
+SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
+
 // The default cases for to_half/from_half are borrowed from skcms,
 // and assume inputs are finite and treat/flush denorm half floats as/to zero.
 // Key constants to watch for:
@@ -638,46 +652,28 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>&
 // Platform-specific specializations and overloads can now drop in here.
 
 #if defined(__AVX__)
-    SI Vec<8,float> sqrt(const Vec<8,float>& x) {
-        return bit_pun<Vec<8,float>>(_mm256_sqrt_ps(bit_pun<__m256>(x)));
-    }
     SI Vec<8,float> rsqrt(const Vec<8,float>& x) {
         return bit_pun<Vec<8,float>>(_mm256_rsqrt_ps(bit_pun<__m256>(x)));
     }
     SI Vec<8,float> rcp(const Vec<8,float>& x) {
         return bit_pun<Vec<8,float>>(_mm256_rcp_ps(bit_pun<__m256>(x)));
     }
-    SI Vec<8,int> lrint(const Vec<8,float>& x) {
-        return bit_pun<Vec<8,int>>(_mm256_cvtps_epi32(bit_pun<__m256>(x)));
-    }
 #endif
 
 #if defined(__SSE__)
-    SI Vec<4,float> sqrt(const Vec<4,float>& x) {
-        return bit_pun<Vec<4,float>>(_mm_sqrt_ps(bit_pun<__m128>(x)));
-    }
     SI Vec<4,float> rsqrt(const Vec<4,float>& x) {
         return bit_pun<Vec<4,float>>(_mm_rsqrt_ps(bit_pun<__m128>(x)));
    }
     SI Vec<4,float> rcp(const Vec<4,float>& x) {
         return bit_pun<Vec<4,float>>(_mm_rcp_ps(bit_pun<__m128>(x)));
     }
-    SI Vec<4,int> lrint(const Vec<4,float>& x) {
-        return bit_pun<Vec<4,int>>(_mm_cvtps_epi32(bit_pun<__m128>(x)));
-    }
 
-    SI Vec<2,float> sqrt(const Vec<2,float>& x) {
-        return shuffle<0,1>( sqrt(shuffle<0,1,0,1>(x)));
-    }
     SI Vec<2,float> rsqrt(const Vec<2,float>& x) {
         return shuffle<0,1>(rsqrt(shuffle<0,1,0,1>(x)));
     }
     SI Vec<2,float> rcp(const Vec<2,float>& x) {
         return shuffle<0,1>( rcp(shuffle<0,1,0,1>(x)));
     }
-    SI Vec<2,int> lrint(const Vec<2,float>& x) {
-        return shuffle<0,1>(lrint(shuffle<0,1,0,1>(x)));
-    }
 #endif
 
 #if defined(__AVX2__)
@@ -701,36 +697,11 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>&
     }
 #endif
 
-    // WASM SIMD compatible operations which are not automatically compiled to SIMD commands
-    // by emscripten:
 #if defined __wasm_simd128__
-    SI Vec<4, float> rcp  (const Vec<4, float>& x) { return 1.0f / x; }
-    SI Vec<2,double> rcp  (const Vec<2,double>& x) { return 1.0f / x; }
-    SI Vec<4, float> rsqrt(const Vec<4, float>& x) { return 1.0f / sqrt(x); }
-    SI Vec<2,double> rsqrt(const Vec<2,double>& x) { return 1.0f / sqrt(x); }
-
-    SI Vec<4,float> sqrt(const Vec<4,float>& x) {
-        return to_vec<4,float>(wasm_f32x4_sqrt(to_vext(x)));
-    }
-    SI Vec<4,float> abs(const Vec<4,float>& x) {
-        return to_vec<4,float>(wasm_f32x4_abs(to_vext(x)));
-    }
-
-    SI Vec<2,double> sqrt(const Vec<2,double>& x) {
-        return to_vec<2,double>(wasm_f64x2_sqrt(to_vext(x)));
-    }
-    SI Vec<2,double> abs(const Vec<2,double>& x) {
-        return to_vec<2,double>(wasm_f64x2_abs(to_vext(x)));
-    }
-
     SI bool any(const Vec<4, int32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
     SI bool any(const Vec<4,uint32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
     SI bool all(const Vec<4, int32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
     SI bool all(const Vec<4,uint32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
-
-    SI Vec<4,int32_t> abs(const Vec<4,int32_t>& x) {
-        return to_vec<4,int32_t>(wasm_i32x4_abs(to_vext(x)));
-    }
 #endif
 
 #endif // !defined(SKNX_NO_SIMD)
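A note on the lrint() dispatch above, appended for readers rather than for
git-apply: where a single rounding conversion instruction exists (vcvtps2dq,
via _mm256_cvtps_epi32 / _mm_cvtps_epi32), the new code bit-puns to the
intrinsic type and back.  A minimal standalone sketch of the same pattern,
with illustrative names (MiniVec, bit_pun_sketch, lrint4) and assuming SSE2:

    #include <cmath>
    #include <cstring>
    #if defined(__SSE2__)
        #include <immintrin.h>
    #endif

    template <int N, typename T> struct MiniVec { T vals[N]; };

    // Reinterpret the bits of one type as another, in the role skvx's
    // unchecked_bit_pun plays in the patch.
    template <typename Dst, typename Src>
    Dst bit_pun_sketch(const Src& src) {
        static_assert(sizeof(Dst) == sizeof(Src), "sizes must match");
        Dst dst;
        std::memcpy(&dst, &src, sizeof(Dst));
        return dst;
    }

    // Round-to-nearest float -> int across 4 lanes: one cvtps2dq when
    // SSE2 is available, scalar lrintf otherwise.
    MiniVec<4,int> lrint4(const MiniVec<4,float>& x) {
    #if defined(__SSE2__)
        return bit_pun_sketch<MiniVec<4,int>>(
                _mm_cvtps_epi32(bit_pun_sketch<__m128>(x)));
    #else
        return { (int)lrintf(x.vals[0]), (int)lrintf(x.vals[1]),
                 (int)lrintf(x.vals[2]), (int)lrintf(x.vals[3]) };
    #endif
    }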