diff --git a/src/gpu/GrVx.h b/src/gpu/GrVx.h
index ca23b20276..6138269a39 100644
--- a/src/gpu/GrVx.h
+++ b/src/gpu/GrVx.h
@@ -8,7 +8,7 @@
 #ifndef GrVx_DEFINED
 #define GrVx_DEFINED
 
-// If more headers are required, then the desired functionality might not belong in this file.
+#include "include/core/SkTypes.h"
 #include "include/private/SkVx.h"
 
 // grvx is Ganesh's addendum to skvx, Skia's SIMD library. Here we introduce functions that are
@@ -35,12 +35,12 @@ template<int N> using uvec = skvx::Vec<N, uint32_t>;
 using uint2 = uvec<2>;
 using uint4 = uvec<4>;
 
-static inline float dot(float2 a, float2 b) {
+static SK_ALWAYS_INLINE float dot(float2 a, float2 b) {
     float2 ab = a*b;
     return ab[0] + ab[1];
 }
 
-static inline float cross(float2 a, float2 b) {
+static SK_ALWAYS_INLINE float cross(float2 a, float2 b) {
     float2 x = a*skvx::shuffle<1,0>(b);
     return x[0] - x[1];
 }
@@ -48,7 +48,7 @@ static inline float cross(float2 a, float2 b) {
 // Returns f*m + a. The actual implementation may or may not be fused, depending on hardware
 // support. We call this method "fast_madd" to draw attention to the fact that the operation may
 // give different results on different platforms.
-template<int N> vec<N> inline fast_madd(vec<N> f, vec<N> m, vec<N> a) {
+template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) {
 #if FP_FAST_FMAF
     return skvx::fma(f,m,a);
 #else
@@ -67,7 +67,7 @@ template<int N> vec<N> inline fast_madd(vec<N> f, vec<N> m, vec<N> a) {
 // NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
 // infinite at -1 and 1). So the input must still be clamped between -1 and 1.
 #define GRVX_FAST_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
-template<int N> inline vec<N> approx_acos(vec<N> x) {
+template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
     static const vec<N> a = -0.939115566365855f;
     static const vec<N> b = 0.9217841528914573f;
     static const vec<N> c = -1.2845906244690837f;
@@ -87,8 +87,8 @@ template<int N> inline vec<N> approx_acos(vec<N> x) {
 //
 // NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b separately.
 // i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))".
-template<int N> inline vec<N> approx_angle_between_vectors(vec<N> ax, vec<N> ay, vec<N> bx,
-                                                           vec<N> by) {
+template<int N>
+SK_ALWAYS_INLINE vec<N> approx_angle_between_vectors(vec<N> ax, vec<N> ay, vec<N> bx, vec<N> by) {
     vec<N> ab_cosTheta = fast_madd(ax, bx, ay*by);
     vec<N> ab_pow2 = fast_madd(ay, ay, ax*ax) * fast_madd(by, by, bx*bx);
     vec<N> cosTheta = ab_cosTheta / skvx::sqrt(ab_pow2);
@@ -97,6 +97,111 @@ template<int N> inline vec<N> approx_angle_between_vectors(vec<N> ax, vec<N> ay,
     return approx_acos(cosTheta);
 }
 
+// De-interleaving load of 4 vectors.
+//
+// WARNING: These are really only supported well on NEON. Consider restructuring your data before
+// resorting to these methods.
+template<typename T>
+SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b,
+                                    skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) {
+    a.val = v[0];
+    b.val = v[1];
+    c.val = v[2];
+    d.val = v[3];
+}
+template<int N, typename T>
+SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
+strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c,
+              skvx::Vec<N,T>& d) {
+    strided_load4(v, a.lo, b.lo, c.lo, d.lo);
+    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
+}
+#if !defined(SKNX_NO_SIMD)
+#if defined(__ARM_NEON)
+#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
+template<> \
+SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \
+                                    skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \
+    auto mat = VLD(v); \
+    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
+    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
+    c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
+    d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
+}
+IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
+IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
+IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
+IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
+IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
+IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
+IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
+IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
+IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
+IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
+IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
+IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
+IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
+IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
+#undef IMPL_LOAD4_TRANSPOSED
+#elif defined(__SSE__)
+template<>
+SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) {
+    using skvx::bit_pun;
+    __m128 a_ = _mm_loadu_ps(v);
+    __m128 b_ = _mm_loadu_ps(v+4);
+    __m128 c_ = _mm_loadu_ps(v+8);
+    __m128 d_ = _mm_loadu_ps(v+12);
+    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
+    a = bit_pun<float4>(a_);
+    b = bit_pun<float4>(b_);
+    c = bit_pun<float4>(c_);
+    d = bit_pun<float4>(d_);
+}
+#endif
+#endif
+
+// De-interleaving load of 2 vectors.
+//
+// WARNING: These are really only supported well on NEON. Consider restructuring your data before
+// resorting to these methods.
+template<typename T>
+SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
+    a.val = v[0];
+    b.val = v[1];
+}
+template<int N, typename T>
+SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
+strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
+    strided_load2(v, a.lo, b.lo);
+    strided_load2(v + 2*(N/2), a.hi, b.hi);
+}
+#if !defined(SKNX_NO_SIMD)
+#if defined(__ARM_NEON)
+#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
+template<> \
+SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
+    auto mat = VLD(v); \
+    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
+    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
+}
+IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
+IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
+IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
+IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
+IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
+IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
+IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
+IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
+IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
+IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
+IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
+IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
+IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
+IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
+#undef IMPL_LOAD2_TRANSPOSED
+#endif
+#endif
+
 #if defined(__clang__)
 #pragma STDC FP_CONTRACT DEFAULT
 #endif
diff --git a/tests/GrVxTest.cpp b/tests/GrVxTest.cpp
index 1ca1ba31d1..712a903e4c 100644
--- a/tests/GrVxTest.cpp
+++ b/tests/GrVxTest.cpp
@@ -10,6 +10,7 @@
 #include "src/gpu/GrVx.h"
 #include "tests/Test.h"
 #include <limits>
+#include <numeric>
 
 using namespace grvx;
 using skvx::bit_pun;
@@ -229,3 +230,41 @@ DEF_TEST(grvx_approx_angle_between_vectors, r) {
         exp = (exp + uint4{79, 83, 199, 7}) & 0xff;
     }
 }
+
+template<int N, typename T> void check_strided_loads(skiatest::Reporter* r) {
+    using Vec = skvx::Vec<N,T>;
+    T values[N*4];
+    std::iota(values, values + N*4, 0);
+    Vec a, b, c, d;
+    grvx::strided_load2(values, a, b);
+    for (int i = 0; i < N; ++i) {
+        REPORTER_ASSERT(r, a[i] == values[i*2]);
+        REPORTER_ASSERT(r, b[i] == values[i*2 + 1]);
+    }
+    grvx::strided_load4(values, a, b, c, d);
+    for (int i = 0; i < N; ++i) {
+        REPORTER_ASSERT(r, a[i] == values[i*4]);
+        REPORTER_ASSERT(r, b[i] == values[i*4 + 1]);
+        REPORTER_ASSERT(r, c[i] == values[i*4 + 2]);
+        REPORTER_ASSERT(r, d[i] == values[i*4 + 3]);
+    }
+}
+
+template<typename T> void check_strided_loads(skiatest::Reporter* r) {
+    check_strided_loads<1,T>(r);
+    check_strided_loads<2,T>(r);
+    check_strided_loads<4,T>(r);
+    check_strided_loads<8,T>(r);
+    check_strided_loads<16,T>(r);
+    check_strided_loads<32,T>(r);
+}
+
+DEF_TEST(GrVx_strided_loads, r) {
+    check_strided_loads<uint32_t>(r);
+    check_strided_loads<uint16_t>(r);
+    check_strided_loads<uint8_t>(r);
+    check_strided_loads<int32_t>(r);
+    check_strided_loads<int16_t>(r);
+    check_strided_loads<int8_t>(r);
+    check_strided_loads<float>(r);
+}
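
Reviewer note (not part of the patch): a minimal usage sketch of the new de-interleaving loads, assuming only what the header above defines. The wrapper names deinterleave_points and deinterleave_rgba are made up for illustration; strided_load2, strided_load4, and the grvx/skvx vector types all come from GrVx.h.

#include "src/gpu/GrVx.h"
#include <cstdint>

// De-interleave 4 packed (x,y) points, xy = {x0,y0, x1,y1, x2,y2, x3,y3}.
// Afterwards xs[i] == x_i and ys[i] == y_i (vld2q_f32 on NEON; the generic
// recursive fallback elsewhere).
static void deinterleave_points(const float xy[8], grvx::float4& xs, grvx::float4& ys) {
    grvx::strided_load2(xy, xs, ys);
}

// De-interleave 8 RGBA pixels into per-channel byte vectors (vld4_u8 on NEON).
static void deinterleave_rgba(const uint8_t px[32],
                              skvx::Vec<8,uint8_t>& r, skvx::Vec<8,uint8_t>& g,
                              skvx::Vec<8,uint8_t>& b, skvx::Vec<8,uint8_t>& a) {
    grvx::strided_load4(px, r, g, b, a);
}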