diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index 1c0dd880e7..6a67875ec1 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -905,10 +905,10 @@ template
 static inline void store_f16(void* dst, const uint32_t* src,
                              Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
                              const uint8_t* const[3]) {
-    Sk4h_store4(dst, SkFloatToHalf_finite_ftz(dr),
-                     SkFloatToHalf_finite_ftz(dg),
-                     SkFloatToHalf_finite_ftz(db),
-                     SkFloatToHalf_finite_ftz(da));
+    Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
+                      SkFloatToHalf_finite_ftz(dg),
+                      SkFloatToHalf_finite_ftz(db),
+                      SkFloatToHalf_finite_ftz(da));
 }
 
 template
@@ -923,7 +923,7 @@ template
 static inline void store_f32(void* dst, const uint32_t* src,
                              Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
                              const uint8_t* const[3]) {
-    Sk4f_store4(dst, dr, dg, db, da);
+    Sk4f::Store4(dst, dr, dg, db, da);
 }
 
 template
@@ -938,10 +938,10 @@ template
 static inline void store_f16_opaque(void* dst, const uint32_t* src,
                                     Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
                                     const uint8_t* const[3]) {
-    Sk4h_store4(dst, SkFloatToHalf_finite_ftz(dr),
-                     SkFloatToHalf_finite_ftz(dg),
-                     SkFloatToHalf_finite_ftz(db),
-                     SK_Half1);
+    Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
+                      SkFloatToHalf_finite_ftz(dg),
+                      SkFloatToHalf_finite_ftz(db),
+                      SK_Half1);
 }
 
 template
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 6bca856d8b..383f2aaae0 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -55,6 +55,23 @@ struct SkNx {
         fHi.store(ptr + N/2*sizeof(T));
     }
 
+    static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
+        auto ptr = (const char*)vptr;
+        Half al, bl, cl, dl,
+             ah, bh, ch, dh;
+        Half::Load4(ptr                  , &al, &bl, &cl, &dl);
+        Half::Load4(ptr + 4*N/2*sizeof(T), &ah, &bh, &ch, &dh);
+        *a = SkNx{al, ah};
+        *b = SkNx{bl, bh};
+        *c = SkNx{cl, ch};
+        *d = SkNx{dl, dh};
+    }
+    static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        auto ptr = (char*)vptr;
+        Half::Store4(ptr                  , a.fLo, b.fLo, c.fLo, d.fLo);
+        Half::Store4(ptr + 4*N/2*sizeof(T), a.fHi, b.fHi, c.fHi, d.fHi);
+    }
+
     bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
     bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
@@ -123,6 +140,21 @@ struct SkNx<1,T> {
     }
     void store(void* ptr) const { memcpy(ptr, &fVal, sizeof(T)); }
 
+    static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
+        auto ptr = (const char*)vptr;
+        *a = Load(ptr + 0*sizeof(T));
+        *b = Load(ptr + 1*sizeof(T));
+        *c = Load(ptr + 2*sizeof(T));
+        *d = Load(ptr + 3*sizeof(T));
+    }
+    static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        auto ptr = (char*)vptr;
+        a.store(ptr + 0*sizeof(T));
+        b.store(ptr + 1*sizeof(T));
+        c.store(ptr + 2*sizeof(T));
+        d.store(ptr + 3*sizeof(T));
+    }
+
     bool anyTrue() const { return fVal != 0; }
     bool allTrue() const { return fVal != 0; }
@@ -310,50 +342,6 @@ SI Sk4i Sk4f_round(const Sk4f& x) {
              (int) lrintf (x[3]), };
 }
 
-// Load 4 Sk4h and transpose them (256 bits total).
-SI void Sk4h_load4(const void* vptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
-    const uint64_t* ptr = (const uint64_t*)vptr;
-    auto p0 = Sk4h::Load(ptr+0),
-         p1 = Sk4h::Load(ptr+1),
-         p2 = Sk4h::Load(ptr+2),
-         p3 = Sk4h::Load(ptr+3);
-    *r = { p0[0], p1[0], p2[0], p3[0] };
-    *g = { p0[1], p1[1], p2[1], p3[1] };
-    *b = { p0[2], p1[2], p2[2], p3[2] };
-    *a = { p0[3], p1[3], p2[3], p3[3] };
-}
-
-// Transpose 4 Sk4h and store (256 bits total).
-SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) {
-    uint64_t* dst64 = (uint64_t*) dst;
-    Sk4h(r[0], g[0], b[0], a[0]).store(dst64 + 0);
-    Sk4h(r[1], g[1], b[1], a[1]).store(dst64 + 1);
-    Sk4h(r[2], g[2], b[2], a[2]).store(dst64 + 2);
-    Sk4h(r[3], g[3], b[3], a[3]).store(dst64 + 3);
-}
-
-// Load 4 Sk4f and transpose them (512 bits total).
-SI void Sk4f_load4(const void* vptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
-    const float* ptr = (const float*) vptr;
-    auto p0 = Sk4f::Load(ptr +  0),
-         p1 = Sk4f::Load(ptr +  4),
-         p2 = Sk4f::Load(ptr +  8),
-         p3 = Sk4f::Load(ptr + 12);
-    *r = { p0[0], p1[0], p2[0], p3[0] };
-    *g = { p0[1], p1[1], p2[1], p3[1] };
-    *b = { p0[2], p1[2], p2[2], p3[2] };
-    *a = { p0[3], p1[3], p2[3], p3[3] };
-}
-
-// Transpose 4 Sk4f and store (512 bits total).
-SI void Sk4f_store4(void* vdst, const Sk4f& r, const Sk4f& g, const Sk4f& b, const Sk4f& a) {
-    float* dst = (float*) vdst;
-    Sk4f(r[0], g[0], b[0], a[0]).store(dst +  0);
-    Sk4f(r[1], g[1], b[1], a[1]).store(dst +  4);
-    Sk4f(r[2], g[2], b[2], a[2]).store(dst +  8);
-    Sk4f(r[3], g[3], b[3], a[3]).store(dst + 12);
-}
-
 #endif
 
 SI void Sk4f_ToBytes(uint8_t p[16],
                      const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index f5a0b09785..103f2e2be4 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -106,11 +106,29 @@ public:
     SkNx(float32x4_t vec) : fVec(vec) {}
 
     SkNx() {}
-    SkNx(float val)           : fVec(vdupq_n_f32(val)) {}
-    static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
+    SkNx(float val) : fVec(vdupq_n_f32(val)) {}
     SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
 
+    static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
     void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
+
+    static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
+        float32x4x4_t rgba = vld4q_f32((const float*) ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+        *a = rgba.val[3];
+    }
+    static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
+        float32x4x4_t rgba = {{
+            r.fVec,
+            g.fVec,
+            b.fVec,
+            a.fVec,
+        }};
+        vst4q_f32((float*) dst, rgba);
+    }
+
     SkNx invert() const {
         float32x4_t est0 = vrecpeq_f32(fVec),
                     est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
@@ -203,14 +221,31 @@ public:
     SkNx() {}
     SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
-    static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
-
     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { fVec = (uint16x4_t) { a,b,c,d }; }
 
+    static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
     void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
 
+    static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
+        uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+        *a = rgba.val[3];
+    }
+
+    static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
+        uint16x4x4_t rgba = {{
+            r.fVec,
+            g.fVec,
+            b.fVec,
+            a.fVec,
+        }};
+        vst4_u16((uint16_t*) dst, rgba);
+    }
+
     SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
@@ -515,42 +550,4 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
     return vcvtq_s32_f32((x + 0.5f).fVec);
 }
 
-static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
-    uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
-    *r = rgba.val[0];
-    *g = rgba.val[1];
-    *b = rgba.val[2];
-    *a = rgba.val[3];
-}
-
-static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
-                               const Sk4h& a) {
-    uint16x4x4_t rgba = {{
-        r.fVec,
-        g.fVec,
-        b.fVec,
-        a.fVec,
-    }};
-    vst4_u16((uint16_t*) dst, rgba);
-}
-
-static inline void Sk4f_load4(const void* ptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
-    float32x4x4_t rgba = vld4q_f32((const float*) ptr);
-    *r = rgba.val[0];
-    *g = rgba.val[1];
-    *b = rgba.val[2];
-    *a = rgba.val[3];
-}
-
-static inline void Sk4f_store4(void* dst, const Sk4f& r, const Sk4f& g, const Sk4f& b,
-                               const Sk4f& a) {
-    float32x4x4_t rgba = {{
-        r.fVec,
-        g.fVec,
-        b.fVec,
-        a.fVec,
-    }};
-    vst4q_f32((float*) dst, rgba);
-}
-
 #endif//SkNx_neon_DEFINED
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 25a5cd8f84..66b5f0e9b3 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -67,12 +67,34 @@ public:
     SkNx() {}
     SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
-    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
-
     SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
 
+    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
     void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }
 
+    static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
+        __m128 v0 = _mm_loadu_ps(((float*)ptr) +  0),
+               v1 = _mm_loadu_ps(((float*)ptr) +  4),
+               v2 = _mm_loadu_ps(((float*)ptr) +  8),
+               v3 = _mm_loadu_ps(((float*)ptr) + 12);
+        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
+        *r = v0;
+        *g = v1;
+        *b = v2;
+        *a = v3;
+    }
+    static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
+        __m128 v0 = r.fVec,
+               v1 = g.fVec,
+               v2 = b.fVec,
+               v3 = a.fVec;
+        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
+        _mm_storeu_ps(((float*) dst) +  0, v0);
+        _mm_storeu_ps(((float*) dst) +  4, v1);
+        _mm_storeu_ps(((float*) dst) +  8, v2);
+        _mm_storeu_ps(((float*) dst) + 12, v3);
+    }
+
     SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
@@ -231,11 +253,32 @@ public:
     SkNx() {}
     SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
-    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
     SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}
 
+    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
     void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }
 
+    static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
+        __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
+                hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
+        __m128i even = _mm_unpacklo_epi16(lo, hi),   // r0 r2 g0 g2 b0 b2 a0 a2
+                 odd = _mm_unpackhi_epi16(lo, hi);   // r1 r3 ...
+        __m128i rg = _mm_unpacklo_epi16(even, odd),  // r0 r1 r2 r3 g0 g1 g2 g3
+                ba = _mm_unpackhi_epi16(even, odd);  // b0 b1 ...  a0 a1 ...
+        *r = rg;
+        *g = _mm_srli_si128(rg, 8);
+        *b = ba;
+        *a = _mm_srli_si128(ba, 8);
+    }
+    static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
+        __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
+        __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
+        __m128i lo = _mm_unpacklo_epi32(rg, ba);
+        __m128i hi = _mm_unpackhi_epi32(rg, ba);
+        _mm_storeu_si128(((__m128i*) dst) + 0, lo);
+        _mm_storeu_si128(((__m128i*) dst) + 1, hi);
+    }
+
     SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
     SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
@@ -455,52 +498,4 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
     return _mm_cvtps_epi32(x.fVec);
 }
 
-static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
-    __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
-            hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
-    __m128i even = _mm_unpacklo_epi16(lo, hi),   // r0 r2 g0 g2 b0 b2 a0 a2
-             odd = _mm_unpackhi_epi16(lo, hi);   // r1 r3 ...
-    __m128i rg = _mm_unpacklo_epi16(even, odd),  // r0 r1 r2 r3 g0 g1 g2 g3
-            ba = _mm_unpackhi_epi16(even, odd);  // b0 b1 ...  a0 a1 ...
-    *r = rg;
-    *g = _mm_srli_si128(rg, 8);
-    *b = ba;
-    *a = _mm_srli_si128(ba, 8);
-}
-
-static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
-                               const Sk4h& a) {
-    __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
-    __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
-    __m128i lo = _mm_unpacklo_epi32(rg, ba);
-    __m128i hi = _mm_unpackhi_epi32(rg, ba);
-    _mm_storeu_si128(((__m128i*) dst) + 0, lo);
-    _mm_storeu_si128(((__m128i*) dst) + 1, hi);
-}
-
-static inline void Sk4f_load4(const void* ptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
-    __m128 v0 = _mm_loadu_ps(((float*)ptr) +  0),
-           v1 = _mm_loadu_ps(((float*)ptr) +  4),
-           v2 = _mm_loadu_ps(((float*)ptr) +  8),
-           v3 = _mm_loadu_ps(((float*)ptr) + 12);
-    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
-    *r = v0;
-    *g = v1;
-    *b = v2;
-    *a = v3;
-}
-
-static inline void Sk4f_store4(void* dst, const Sk4f& r, const Sk4f& g, const Sk4f& b,
-                               const Sk4f& a) {
-    __m128 v0 = r.fVec,
-           v1 = g.fVec,
-           v2 = b.fVec,
-           v3 = a.fVec;
-    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
-    _mm_storeu_ps(((float*) dst) +  0, v0);
-    _mm_storeu_ps(((float*) dst) +  4, v1);
-    _mm_storeu_ps(((float*) dst) +  8, v2);
-    _mm_storeu_ps(((float*) dst) + 12, v3);
-}
-
 #endif//SkNx_sse_DEFINED
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index fdb15b421e..b0e6e1d2f9 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -220,7 +220,7 @@ namespace SK_OPTS_NS {
         }
 
         Sk4h rh, gh, bh, ah;
-        Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+        Sk4h::Load4(ptr, &rh, &gh, &bh, &ah);
         dr = SkHalfToFloat_finite_ftz(rh);
         dg = SkHalfToFloat_finite_ftz(gh);
         db = SkHalfToFloat_finite_ftz(bh);
@@ -242,7 +242,7 @@ namespace SK_OPTS_NS {
        }
 
         Sk4h rh, gh, bh, ah;
-        Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+        Sk4h::Load4(ptr, &rh, &gh, &bh, &ah);
         r = SkHalfToFloat_finite_ftz(rh);
         g = SkHalfToFloat_finite_ftz(gh);
         b = SkHalfToFloat_finite_ftz(bh);
@@ -254,10 +254,10 @@ namespace SK_OPTS_NS {
         auto ptr = (uint64_t*)ctx + x;
 
         switch (tail) {
-            case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r),
-                                            SkFloatToHalf_finite_ftz(g),
-                                            SkFloatToHalf_finite_ftz(b),
-                                            SkFloatToHalf_finite_ftz(a));
+            case 0: return Sk4h::Store4(ptr, SkFloatToHalf_finite_ftz(r),
+                                             SkFloatToHalf_finite_ftz(g),
+                                             SkFloatToHalf_finite_ftz(b),
+                                             SkFloatToHalf_finite_ftz(a));
 
             case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
             case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index a3aef6bb8e..ce7b5bc48f 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -317,7 +317,7 @@ DEF_TEST(SkNx_4fLoad4Store4, r) {
     };
 
     Sk4f a, b, c, d;
-    Sk4f_load4(src, &a, &b, &c, &d);
+    Sk4f::Load4(src, &a, &b, &c, &d);
     REPORTER_ASSERT(r, 0.0f == a[0]);
     REPORTER_ASSERT(r, 4.0f == a[1]);
     REPORTER_ASSERT(r, 8.0f == a[2]);
@@ -336,6 +336,6 @@ DEF_TEST(SkNx_4fLoad4Store4, r) {
     REPORTER_ASSERT(r, 15.0f == d[3]);
 
     float dst[16];
-    Sk4f_store4(dst, a, b, c, d);
+    Sk4f::Store4(dst, a, b, c, d);
     REPORTER_ASSERT(r, 0 == memcmp(dst, src, 16 * sizeof(float)));
 }
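
A minimal usage sketch of the new API, for reviewers (illustrative only, not part of this CL; the helper name premultiply_4px is hypothetical, and the include path assumes a caller inside src/):

    // Hypothetical caller, for illustration only -- not part of this CL.
    // Sk4f::Load4 de-interleaves four RGBA float pixels (16 floats) into one
    // vector per channel; Sk4f::Store4 transposes back and stores interleaved.
    #include "SkNx.h"

    static void premultiply_4px(float rgba[16]) {
        Sk4f r, g, b, a;
        Sk4f::Load4(rgba, &r, &g, &b, &a);     // interleaved -> planar r, g, b, a
        Sk4f::Store4(rgba, r*a, g*a, b*a, a);  // planar -> interleaved, premultiplied
    }

On SSE and NEON these calls lower to the transpose sequences added above (_MM_TRANSPOSE4_PS / unpacks, or vld4/vst4); elsewhere they fall back to the generic recursive SkNx implementation.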