From 952f8f17e41b21c0b301e63412ba941b1ffdad9d Mon Sep 17 00:00:00 2001
From: Mike Klein
Date: Wed, 16 Sep 2020 14:33:37 -0500
Subject: [PATCH] Reland "update skvx scalar-fallback strategy"

This is a reland of 4985db413d3ee27b944936c71c0cd04740cd28da

...with a better implementation of map().

I don't understand why we had to revert, but it had something to do with
calling the function pointer in map_(), so maybe this will help.  I've
flattened the map_() / map() merge CL into this one, and marked the
resulting map() as no_sanitize("cfi").  I don't see anything wrong, so I
think it's a false positive.

Original change's description:
> update skvx scalar-fallback strategy
>
> Turns out Clang's a lot better at auto-vectorizing "obvious" scalar code
> into obvious vector code when it's written out the long way, e.g.
>
>     F32x4 x = ...;
>     x = { sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]) };
>
> vectorizes into sqrtps a lot more reliably than our recurse-onto-scalars
> strategy, and also better than the other naive approach,
>
>     F32x4 x = ...;
>     for (int i = 0; i < 4; i++) { x[i] = sqrtf(x[i]); }
>
> So here I've added a map(V, fn) -> V' using C++14 tricks to let the
> compiler handle the expansion of x = { fn(x[0]), fn(x[1]), ...,
> fn(x[N-1]) } for any N, and implemented most skvx scalar-fallback code
> using that.
>
> With these now vectorizing well at any N, we can remove the
> specializations we'd written for particular N, really tidying things up.
>
> Over in the SkVM interpreter, this is a big improvement for ceil and
> floor, which were being done 2 floats at a time instead of 8.  They're
> now slimmed way down to
>
>     shlq $6, %r13
>     vroundps $K, (%r12,%r13), %ymm0
>     vroundps $K, 32(%r12,%r13), %ymm1
>     jmp ...
>
> where K is 9 or 10 depending on the op.
>
> I haven't found a scalar function that Clang will vectorize to vcvtps2dq
> (the rounding one, not the truncating vcvttps2dq), so I've kept lrint()
> written the long way, updated to the style I've been using lately, with
> specializations inline.
>
> Change-Id: Ia97abe3c876008228bf62b1daacd6f6140408fc4
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317375
> Reviewed-by: Herb Derby
> Commit-Queue: Mike Klein
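For reference (not part of the diff below), the expansion trick behind
map() can be sketched standalone.  This is a simplified sketch: MiniVec and
map_sketch are illustrative names only, and the CL itself folds the two
overloads into one function with a defaulted index_sequence argument,
marked no_sanitize("cfi"):

    #include <cmath>
    #include <cstddef>
    #include <utility>

    // Just enough of a vector type to show the expansion.
    template <int N, typename T>
    struct MiniVec { T vals[N]; T operator[](int i) const { return vals[i]; } };

    // Expands to { fn(x[0]), fn(x[1]), ..., fn(x[N-1]) }, the form Clang
    // auto-vectorizes most reliably.
    template <int N, typename T, typename Fn, std::size_t... I>
    MiniVec<N,T> map_sketch(const MiniVec<N,T>& x, Fn&& fn,
                            std::index_sequence<I...>) {
        return { fn(x[I])... };
    }

    // Bootstrap the index_sequence 0,1,...,N-1, then expand.
    template <int N, typename T, typename Fn>
    MiniVec<N,T> map_sketch(const MiniVec<N,T>& x, Fn&& fn) {
        return map_sketch(x, std::forward<Fn>(fn),
                          std::make_index_sequence<N>{});
    }

    // e.g. map_sketch(v, sqrtf) on a MiniVec<4,float> should compile to a
    // single sqrtps with Clang at -O2.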
Cq-Include-Trybots: luci.chromium.try:linux_chromium_cfi_rel_ng
Bug: chromium:1129408
Change-Id: Ia9c14074b9a14a67dd221f4925894d35a551f9d7
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317551
Commit-Queue: Mike Klein
Reviewed-by: Herb Derby
---
 include/private/SkVx.h | 127 ++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 78 deletions(-)

diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index 19ff470392..abe577dba6 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -329,22 +329,6 @@ SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 
 SIT Vec<1,T> pow(const Vec<1,T>& x, const Vec<1,T>& y) { return std::pow(x.val, y.val); }
-SIT Vec<1,T>  atan(const Vec<1,T>& x) { return std:: atan(x.val); }
-SIT Vec<1,T>  ceil(const Vec<1,T>& x) { return std:: ceil(x.val); }
-SIT Vec<1,T> floor(const Vec<1,T>& x) { return std::floor(x.val); }
-SIT Vec<1,T> trunc(const Vec<1,T>& x) { return std::trunc(x.val); }
-SIT Vec<1,T> round(const Vec<1,T>& x) { return std::round(x.val); }
-SIT Vec<1,T>  sqrt(const Vec<1,T>& x) { return std:: sqrt(x.val); }
-SIT Vec<1,T>   abs(const Vec<1,T>& x) { return std::  abs(x.val); }
-SIT Vec<1,T>   sin(const Vec<1,T>& x) { return std::  sin(x.val); }
-SIT Vec<1,T>   cos(const Vec<1,T>& x) { return std::  cos(x.val); }
-SIT Vec<1,T>   tan(const Vec<1,T>& x) { return std::  tan(x.val); }
-
-SIT Vec<1,int> lrint(const Vec<1,T>& x) { return (int)std::lrint(x.val); }
-
-SIT Vec<1,T>   rcp(const Vec<1,T>& x) { return 1 / x.val; }
-SIT Vec<1,T> rsqrt(const Vec<1,T>& x) { return rcp(sqrt(x)); }
-
 
 // All default N != 1 implementations just recurse on lo and hi halves.
 // Clang can reason about naive_if_then_else() and optimize through it better
@@ -395,23 +379,6 @@ SINT Vec<N,T> pow(const Vec<N,T>& x, const Vec<N,T>& y) {
     return join(pow(x.lo, y.lo), pow(x.hi, y.hi));
 }
-SINT Vec<N,T>  atan(const Vec<N,T>& x) { return join( atan(x.lo),  atan(x.hi)); }
-SINT Vec<N,T>  ceil(const Vec<N,T>& x) { return join( ceil(x.lo),  ceil(x.hi)); }
-SINT Vec<N,T> floor(const Vec<N,T>& x) { return join(floor(x.lo), floor(x.hi)); }
-SINT Vec<N,T> trunc(const Vec<N,T>& x) { return join(trunc(x.lo), trunc(x.hi)); }
-SINT Vec<N,T> round(const Vec<N,T>& x) { return join(round(x.lo), round(x.hi)); }
-SINT Vec<N,T>  sqrt(const Vec<N,T>& x) { return join( sqrt(x.lo),  sqrt(x.hi)); }
-SINT Vec<N,T>   abs(const Vec<N,T>& x) { return join(  abs(x.lo),   abs(x.hi)); }
-SINT Vec<N,T>   sin(const Vec<N,T>& x) { return join(  sin(x.lo),   sin(x.hi)); }
-SINT Vec<N,T>   cos(const Vec<N,T>& x) { return join(  cos(x.lo),   cos(x.hi)); }
-SINT Vec<N,T>   tan(const Vec<N,T>& x) { return join(  tan(x.lo),   tan(x.hi)); }
-
-SINT Vec<N,int> lrint(const Vec<N,T>& x) { return join(lrint(x.lo), lrint(x.hi)); }
-
-SINT Vec<N,T>   rcp(const Vec<N,T>& x) { return join(  rcp(x.lo),   rcp(x.hi)); }
-SINT Vec<N,T> rsqrt(const Vec<N,T>& x) { return join(rsqrt(x.lo), rsqrt(x.hi)); }
-
-
 // Scalar/vector operations just splat the scalar to a vector...
 SINTU Vec<N,T> operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
 SINTU Vec<N,T> operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) - y; }
 SINTU Vec<N,T> operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) * y; }
@@ -519,10 +486,57 @@ SIN Vec<N,float> fma(const Vec<N,float>& x, const Vec<N,float>& y, const Vec<N,
 
-SIN Vec<N,float> fract(const Vec<N,float>& x) {
-    return x - floor(x);
+template <int N, typename T, typename Fn, size_t... I>
+#if defined(__clang__)
+// CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
+// with errors like "control flow integrity check for type 'float (float)
+// noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
+// here".  But we can be quite sure fn is the right type: it's all inferred!
+// So, stifle CFI in this function.
+__attribute__((no_sanitize("cfi")))
+#endif
+SI auto map(const skvx::Vec<N,T>& x, Fn&& fn,
+            std::index_sequence<I...> ix = {}) -> skvx::Vec<N, decltype(fn(x[0]))> {
+    if /*constexpr*/ (sizeof...(I) == 0) {
+        // When called as map(x, fn), bootstrap the index_sequence we want: 0,1,...,N-1.
+        return map(x, fn, std::make_index_sequence<N>{});
+    }
+    return { fn(x[I])... };
 }
+SIN Vec<N,float>  atan(const Vec<N,float>& x) { return map(x,  atanf); }
+SIN Vec<N,float>  ceil(const Vec<N,float>& x) { return map(x,  ceilf); }
+SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(x, floorf); }
+SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(x, truncf); }
+SIN Vec<N,float> round(const Vec<N,float>& x) { return map(x, roundf); }
+SIN Vec<N,float>  sqrt(const Vec<N,float>& x) { return map(x,  sqrtf); }
+SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map(x,  fabsf); }
+SIN Vec<N,float>   sin(const Vec<N,float>& x) { return map(x,   sinf); }
+SIN Vec<N,float>   cos(const Vec<N,float>& x) { return map(x,   cosf); }
+SIN Vec<N,float>   tan(const Vec<N,float>& x) { return map(x,   tanf); }
+
+SI Vec<1,int> lrint(const Vec<1,float>& x) {
+    return (int)lrintf(x.val);
+}
+SIN Vec<N,int> lrint(const Vec<N,float>& x) {
+#if defined(__AVX__)
+    if /*constexpr*/ (N == 8) {
+        return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
+    }
+#endif
+#if defined(__SSE__)
+    if /*constexpr*/ (N == 4) {
+        return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
+    }
+#endif
+    return join(lrint(x.lo),
+                lrint(x.hi));
+}
+
+SIN Vec<N,float>   rcp(const Vec<N,float>& x) { return 1/x; }
+SIN Vec<N,float> rsqrt(const Vec<N,float>& x) { return rcp(sqrt(x)); }
+SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
+
 // The default cases for to_half/from_half are borrowed from skcms,
 // and assume inputs are finite and treat/flush denorm half floats as/to zero.
 // Key constants to watch for:
@@ -638,46 +652,28 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>&
 // Platform-specific specializations and overloads can now drop in here.
 
 #if defined(__AVX__)
-    SI Vec<8,float> sqrt(const Vec<8,float>& x) {
-        return bit_pun<Vec<8,float>>(_mm256_sqrt_ps(bit_pun<__m256>(x)));
-    }
     SI Vec<8,float> rsqrt(const Vec<8,float>& x) {
         return bit_pun<Vec<8,float>>(_mm256_rsqrt_ps(bit_pun<__m256>(x)));
     }
     SI Vec<8,float> rcp(const Vec<8,float>& x) {
         return bit_pun<Vec<8,float>>(_mm256_rcp_ps(bit_pun<__m256>(x)));
     }
-    SI Vec<8,int> lrint(const Vec<8,float>& x) {
-        return bit_pun<Vec<8,int>>(_mm256_cvtps_epi32(bit_pun<__m256>(x)));
-    }
 #endif
 
 #if defined(__SSE__)
-    SI Vec<4,float> sqrt(const Vec<4,float>& x) {
-        return bit_pun<Vec<4,float>>(_mm_sqrt_ps(bit_pun<__m128>(x)));
-    }
     SI Vec<4,float> rsqrt(const Vec<4,float>& x) {
         return bit_pun<Vec<4,float>>(_mm_rsqrt_ps(bit_pun<__m128>(x)));
    }
     SI Vec<4,float> rcp(const Vec<4,float>& x) {
         return bit_pun<Vec<4,float>>(_mm_rcp_ps(bit_pun<__m128>(x)));
     }
-    SI Vec<4,int> lrint(const Vec<4,float>& x) {
-        return bit_pun<Vec<4,int>>(_mm_cvtps_epi32(bit_pun<__m128>(x)));
-    }
 
-    SI Vec<2,float> sqrt(const Vec<2,float>& x) {
-        return shuffle<0,1>( sqrt(shuffle<0,1,0,1>(x)));
-    }
     SI Vec<2,float> rsqrt(const Vec<2,float>& x) {
         return shuffle<0,1>(rsqrt(shuffle<0,1,0,1>(x)));
     }
     SI Vec<2,float> rcp(const Vec<2,float>& x) {
         return shuffle<0,1>( rcp(shuffle<0,1,0,1>(x)));
     }
-    SI Vec<2,int> lrint(const Vec<2,float>& x) {
-        return shuffle<0,1>(lrint(shuffle<0,1,0,1>(x)));
-    }
 #endif
 
 #if defined(__AVX2__)
@@ -701,36 +697,11 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>&
     }
 #endif
 
-    // WASM SIMD compatible operations which are not automatically compiled to SIMD commands
-    // by emscripten:
 #if defined __wasm_simd128__
-    SI Vec<4, float> rcp  (const Vec<4, float>& x) { return 1.0f / x; }
-    SI Vec<2,double> rcp  (const Vec<2,double>& x) { return 1.0f / x; }
-    SI Vec<4, float> rsqrt(const Vec<4, float>& x) { return 1.0f / sqrt(x); }
-    SI Vec<2,double> rsqrt(const Vec<2,double>& x) { return 1.0f / sqrt(x); }
-
-    SI Vec<4,float> sqrt(const Vec<4,float>& x) {
-        return to_vec<4,float>(wasm_f32x4_sqrt(to_vext(x)));
-    }
-    SI Vec<4,float> abs(const Vec<4,float>& x) {
-        return to_vec<4,float>(wasm_f32x4_abs(to_vext(x)));
-    }
-
-    SI Vec<2,double> sqrt(const Vec<2,double>& x) {
-        return to_vec<2,double>(wasm_f64x2_sqrt(to_vext(x)));
-    }
-    SI Vec<2,double> abs(const Vec<2,double>& x) {
-        return to_vec<2,double>(wasm_f64x2_abs(to_vext(x)));
-    }
-
     SI bool any(const Vec<4, int32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
     SI bool any(const Vec<4,uint32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
     SI bool all(const Vec<4, int32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
     SI bool all(const Vec<4,uint32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
-
-    SI Vec<4,int32_t> abs(const Vec<4,int32_t>& x) {
-        return to_vec<4,int32_t>(wasm_i32x4_abs(to_vext(x)));
-    }
 #endif
 
 #endif // !defined(SKNX_NO_SIMD)
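A note on the lrint() dispatch above, appended for readers rather than for
git-apply: where a single rounding conversion instruction exists (vcvtps2dq,
via _mm256_cvtps_epi32 / _mm_cvtps_epi32), the new code bit-puns to the
intrinsic type and back.  A minimal standalone sketch of the same pattern,
with illustrative names (MiniVec, bit_pun_sketch, lrint4) and assuming SSE2:

    #include <cmath>
    #include <cstring>
    #if defined(__SSE2__)
        #include <immintrin.h>
    #endif

    template <int N, typename T> struct MiniVec { T vals[N]; };

    // Reinterpret the bits of one type as another, in the role skvx's
    // unchecked_bit_pun plays in the patch.
    template <typename Dst, typename Src>
    Dst bit_pun_sketch(const Src& src) {
        static_assert(sizeof(Dst) == sizeof(Src), "sizes must match");
        Dst dst;
        std::memcpy(&dst, &src, sizeof(Dst));
        return dst;
    }

    // Round-to-nearest float -> int across 4 lanes: one cvtps2dq when
    // SSE2 is available, scalar lrintf otherwise.
    MiniVec<4,int> lrint4(const MiniVec<4,float>& x) {
    #if defined(__SSE2__)
        return bit_pun_sketch<MiniVec<4,int>>(
                _mm_cvtps_epi32(bit_pun_sketch<__m128>(x)));
    #else
        return { (int)lrintf(x.vals[0]), (int)lrintf(x.vals[1]),
                 (int)lrintf(x.vals[2]), (int)lrintf(x.vals[3]) };
    #endif
    }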