From 511ea17b967914ae8342d7861c6695ec8d492bcc Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Sat, 15 Oct 2016 00:13:29 -0400 Subject: [PATCH] SkNx_abi for passing Sk4f as function arguments, etc. CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3422 Change-Id: Idc0a192faa7ff843aef023229186580c69baf1f7 Reviewed-on: https://skia-review.googlesource.com/3422 Reviewed-by: Mike Klein --- src/core/SkNx.h | 11 ++++++ src/core/SkRasterPipeline.h | 4 +- src/opts/SkNx_neon.h | 5 +++ src/opts/SkNx_sse.h | 5 +++ src/opts/SkRasterPipeline_opts.h | 63 +++++++++++++++++--------------- 5 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/core/SkNx.h b/src/core/SkNx.h index afba75a487..6d9af9fe47 100644 --- a/src/core/SkNx.h +++ b/src/core/SkNx.h @@ -16,6 +16,11 @@ #include #include +// These _abi types are data-only, and so can be used to store SkNx in structs or +// pass them as function parameters or return values, even across compilation units. +template struct SkNx_abi { SkNx_abi lo, hi; }; +template < typename T> struct SkNx_abi<1,T> { T val; }; + namespace { #define SI static inline @@ -42,6 +47,9 @@ struct SkNx { static_assert(N==16, ""); } + SkNx(const SkNx_abi& a) : fLo(a.lo), fHi(a.hi) {} + operator SkNx_abi() const { return { (SkNx_abi)fLo, (SkNx_abi)fHi }; } + T operator[](int k) const { SkASSERT(0 <= k && k < N); return k < N/2 ? fLo[k] : fHi[k-N/2]; @@ -129,6 +137,9 @@ struct SkNx<1,T> { SkNx() = default; SkNx(T v) : fVal(v) {} + SkNx(const SkNx_abi<1,T>& a) : fVal(a.val) {} + operator SkNx_abi<1,T>() const { return { fVal }; } + // Android complains against unused parameters, so we guard it T operator[](int SkDEBUGCODE(k)) const { SkASSERT(k == 0); diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h index 05947fcb59..c6b85ad510 100644 --- a/src/core/SkRasterPipeline.h +++ b/src/core/SkRasterPipeline.h @@ -57,9 +57,9 @@ class SkRasterPipeline { public: struct Stage; #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - using V = Sk8f; + using V = SkNx_abi<8,float>; #else - using V = Sk4f; + using V = SkNx_abi<4,float>; #endif using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, V,V,V,V, V,V,V,V); diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 534bb0ea04..abdebe2c2f 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -12,6 +12,8 @@ #define SKNX_IS_FAST +template <> struct SkNx_abi<4,float> { float32x4_t vec; }; + namespace { // ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: @@ -111,6 +113,9 @@ public: SkNx(float val) : fVec(vdupq_n_f32(val)) {} SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } + SkNx(const SkNx_abi<4,float>& a) : fVec(a.vec) {} + operator SkNx_abi<4,float>() const { return { fVec }; } + static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 28d2cde5c0..25fb0b7a8d 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -15,6 +15,8 @@ #define SKNX_IS_FAST +template <> struct SkNx_abi<4,float> { __m128 vec; }; + namespace { template <> @@ -71,6 +73,9 @@ public: SkNx(float val) : fVec( _mm_set1_ps(val) ) {} SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} + SkNx(const SkNx_abi<4,float>& a) : fVec(a.vec) {} + operator SkNx_abi<4,float>() const { return { fVec }; } + static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); } void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 52b02d045e..85623887de 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -13,35 +13,38 @@ #include "SkRasterPipeline.h" #include "SkSRGB.h" -using SkNf = SkRasterPipeline::V; -static constexpr auto N = sizeof(SkNf) / sizeof(float); +using SkNf_abi = SkRasterPipeline::V; +static constexpr auto N = sizeof(SkNf_abi) / sizeof(float); +using SkNf = SkNx; using SkNi = SkNx; using SkNh = SkNx; #define SI static inline -#define STAGE(name, kCallNext) \ - static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \ - SkNf& r, SkNf& g, SkNf& b, SkNf& a, \ - SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \ - SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ - SkNf r, SkNf g, SkNf b, SkNf a, \ - SkNf dr, SkNf dg, SkNf db, SkNf da) { \ - name##_kernel(st->ctx(), x,0, r,g,b,a, dr,dg,db,da); \ - if (kCallNext) { \ - st->next(x,tail, r,g,b,a, dr,dg,db,da); \ - } \ - } \ - SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ - SkNf r, SkNf g, SkNf b, SkNf a, \ - SkNf dr, SkNf dg, SkNf db, SkNf da) { \ - name##_kernel(st->ctx(), x,tail, r,g,b,a, dr,dg,db,da); \ - if (kCallNext) { \ - st->next(x,tail, r,g,b,a, dr,dg,db,da); \ - } \ - } \ - static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \ - SkNf& r, SkNf& g, SkNf& b, SkNf& a, \ +#define STAGE(name, kCallNext) \ + static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \ + SkNf& r, SkNf& g, SkNf& b, SkNf& a, \ + SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \ + SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ + SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \ + SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \ + SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \ + name##_kernel(st->ctx(), x,0, r,g,b,a, dr,dg,db,da); \ + if (kCallNext) { \ + st->next(x,tail, r,g,b,a, dr,dg,db,da); \ + } \ + } \ + SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ + SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \ + SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \ + SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \ + name##_kernel(st->ctx(), x,tail, r,g,b,a, dr,dg,db,da); \ + if (kCallNext) { \ + st->next(x,tail, r,g,b,a, dr,dg,db,da); \ + } \ + } \ + static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \ + SkNf& r, SkNf& g, SkNf& b, SkNf& a, \ SkNf& dr, SkNf& dg, SkNf& db, SkNf& da) @@ -50,8 +53,9 @@ using SkNh = SkNx; static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \ const SkNf& d, const SkNf& da); \ SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ - SkNf r, SkNf g, SkNf b, SkNf a, \ - SkNf dr, SkNf dg, SkNf db, SkNf da) { \ + SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \ + SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \ + SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \ r = name##_kernel(r,a,dr,da); \ g = name##_kernel(g,a,dg,da); \ b = name##_kernel(b,a,db,da); \ @@ -66,8 +70,9 @@ using SkNh = SkNx; static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \ const SkNf& d, const SkNf& da); \ SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \ - SkNf r, SkNf g, SkNf b, SkNf a, \ - SkNf dr, SkNf dg, SkNf db, SkNf da) { \ + SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \ + SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \ + SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \ r = name##_kernel(r,a,dr,da); \ g = name##_kernel(g,a,dg,da); \ b = name##_kernel(b,a,db,da); \ @@ -85,7 +90,7 @@ namespace SK_OPTS_NS { void (*vTailStart)(), SkRasterPipeline::Stage* tail) { auto bodyStart = (SkRasterPipeline::Fn)vBodyStart, tailStart = (SkRasterPipeline::Fn)vTailStart; - SkNf v; // Fastest to start uninitialized. + SkNf v{0}; // TODO: uninitialized would be a bit faster, but some compilers are whiny. while (n >= N) { bodyStart(body, x,0, v,v,v,v, v,v,v,v); x += N;