Make load4 and store4 part of SkNx properly.

Every type now nominally has Load4() and Store4() methods.
The ones that we use are implemented.
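
For reference, the new methods de-interleave / re-interleave four vectors at once
(a 4x4 RGBA transpose). A minimal usage sketch, using a hypothetical buffer that is
not part of this CL:

    float rgba[16] = {0};                // 4 interleaved RGBA pixels (r,g,b,a, r,g,b,a, ...)

    Sk4f r, g, b, a;
    Sk4f::Load4(rgba, &r, &g, &b, &a);   // de-interleave into planar channels
    a = a * 0.5f;                        // e.g. scale alpha on the planar data
    Sk4f::Store4(rgba, r, g, b, a);      // re-interleave and write back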

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3046
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Change-Id: I7984f0c2063ef8acbc322bd2e968f8f7eaa0d8fd
Reviewed-on: https://skia-review.googlesource.com/3046
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Mike Klein 2016-10-06 11:09:27 -04:00 committed by Skia Commit-Bot
parent 40f23780e7
commit 33cbfd75af
6 changed files with 134 additions and 154 deletions


@@ -905,10 +905,10 @@ template <Order kOrder>
static inline void store_f16(void* dst, const uint32_t* src,
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
const uint8_t* const[3]) {
Sk4h_store4(dst, SkFloatToHalf_finite_ftz(dr),
SkFloatToHalf_finite_ftz(dg),
SkFloatToHalf_finite_ftz(db),
SkFloatToHalf_finite_ftz(da));
Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
SkFloatToHalf_finite_ftz(dg),
SkFloatToHalf_finite_ftz(db),
SkFloatToHalf_finite_ftz(da));
}
template <Order kOrder>
@@ -923,7 +923,7 @@ template <Order kOrder>
static inline void store_f32(void* dst, const uint32_t* src,
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
const uint8_t* const[3]) {
Sk4f_store4(dst, dr, dg, db, da);
Sk4f::Store4(dst, dr, dg, db, da);
}
template <Order kOrder>
@@ -938,10 +938,10 @@ template <Order kOrder>
static inline void store_f16_opaque(void* dst, const uint32_t* src,
Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
const uint8_t* const[3]) {
Sk4h_store4(dst, SkFloatToHalf_finite_ftz(dr),
SkFloatToHalf_finite_ftz(dg),
SkFloatToHalf_finite_ftz(db),
SK_Half1);
Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
SkFloatToHalf_finite_ftz(dg),
SkFloatToHalf_finite_ftz(db),
SK_Half1);
}
template <Order kOrder>


@@ -55,6 +55,23 @@ struct SkNx {
fHi.store(ptr + N/2*sizeof(T));
}
static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
auto ptr = (const char*)vptr;
Half al, bl, cl, dl,
ah, bh, ch, dh;
Half::Load4(ptr , &al, &bl, &cl, &dl);
Half::Load4(ptr + 4*N/2*sizeof(T), &ah, &bh, &ch, &dh);
*a = SkNx{al, ah};
*b = SkNx{bl, bh};
*c = SkNx{cl, ch};
*d = SkNx{dl, dh};
}
static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
auto ptr = (char*)vptr;
Half::Store4(ptr, a.fLo, b.fLo, c.fLo, d.fLo);
Half::Store4(ptr + 4*N/2*sizeof(T), a.fHi, b.fHi, c.fHi, d.fHi);
}
bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
@@ -123,6 +140,21 @@ struct SkNx<1,T> {
}
void store(void* ptr) const { memcpy(ptr, &fVal, sizeof(T)); }
static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
auto ptr = (const char*)vptr;
*a = Load(ptr + 0*sizeof(T));
*b = Load(ptr + 1*sizeof(T));
*c = Load(ptr + 2*sizeof(T));
*d = Load(ptr + 3*sizeof(T));
}
static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
auto ptr = (char*)vptr;
a.store(ptr + 0*sizeof(T));
b.store(ptr + 1*sizeof(T));
c.store(ptr + 2*sizeof(T));
d.store(ptr + 3*sizeof(T));
}
bool anyTrue() const { return fVal != 0; }
bool allTrue() const { return fVal != 0; }
@@ -310,50 +342,6 @@ SI Sk4i Sk4f_round(const Sk4f& x) {
(int) lrintf (x[3]), };
}
// Load 4 Sk4h and transpose them (256 bits total).
SI void Sk4h_load4(const void* vptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
const uint64_t* ptr = (const uint64_t*)vptr;
auto p0 = Sk4h::Load(ptr+0),
p1 = Sk4h::Load(ptr+1),
p2 = Sk4h::Load(ptr+2),
p3 = Sk4h::Load(ptr+3);
*r = { p0[0], p1[0], p2[0], p3[0] };
*g = { p0[1], p1[1], p2[1], p3[1] };
*b = { p0[2], p1[2], p2[2], p3[2] };
*a = { p0[3], p1[3], p2[3], p3[3] };
}
// Transpose 4 Sk4h and store (256 bits total).
SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) {
uint64_t* dst64 = (uint64_t*) dst;
Sk4h(r[0], g[0], b[0], a[0]).store(dst64 + 0);
Sk4h(r[1], g[1], b[1], a[1]).store(dst64 + 1);
Sk4h(r[2], g[2], b[2], a[2]).store(dst64 + 2);
Sk4h(r[3], g[3], b[3], a[3]).store(dst64 + 3);
}
// Load 4 Sk4f and transpose them (512 bits total).
SI void Sk4f_load4(const void* vptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
const float* ptr = (const float*) vptr;
auto p0 = Sk4f::Load(ptr + 0),
p1 = Sk4f::Load(ptr + 4),
p2 = Sk4f::Load(ptr + 8),
p3 = Sk4f::Load(ptr + 12);
*r = { p0[0], p1[0], p2[0], p3[0] };
*g = { p0[1], p1[1], p2[1], p3[1] };
*b = { p0[2], p1[2], p2[2], p3[2] };
*a = { p0[3], p1[3], p2[3], p3[3] };
}
// Transpose 4 Sk4f and store (512 bits total).
SI void Sk4f_store4(void* vdst, const Sk4f& r, const Sk4f& g, const Sk4f& b, const Sk4f& a) {
float* dst = (float*) vdst;
Sk4f(r[0], g[0], b[0], a[0]).store(dst + 0);
Sk4f(r[1], g[1], b[1], a[1]).store(dst + 4);
Sk4f(r[2], g[2], b[2], a[2]).store(dst + 8);
Sk4f(r[3], g[3], b[3], a[3]).store(dst + 12);
}
#endif
SI void Sk4f_ToBytes(uint8_t p[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {


@@ -106,11 +106,29 @@ public:
SkNx(float32x4_t vec) : fVec(vec) {}
SkNx() {}
SkNx(float val) : fVec(vdupq_n_f32(val)) {}
static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
SkNx(float val) : fVec(vdupq_n_f32(val)) {}
SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
float32x4x4_t rgba = vld4q_f32((const float*) ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
float32x4x4_t rgba = {{
r.fVec,
g.fVec,
b.fVec,
a.fVec,
}};
vst4q_f32((float*) dst, rgba);
}
SkNx invert() const {
float32x4_t est0 = vrecpeq_f32(fVec),
est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
@@ -203,14 +221,31 @@ public:
SkNx() {}
SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
fVec = (uint16x4_t) { a,b,c,d };
}
static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
uint16x4x4_t rgba = {{
r.fVec,
g.fVec,
b.fVec,
a.fVec,
}};
vst4_u16((uint16_t*) dst, rgba);
}
SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
@@ -515,42 +550,4 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
return vcvtq_s32_f32((x + 0.5f).fVec);
}
static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
const Sk4h& a) {
uint16x4x4_t rgba = {{
r.fVec,
g.fVec,
b.fVec,
a.fVec,
}};
vst4_u16((uint16_t*) dst, rgba);
}
static inline void Sk4f_load4(const void* ptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
float32x4x4_t rgba = vld4q_f32((const float*) ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
static inline void Sk4f_store4(void* dst, const Sk4f& r, const Sk4f& g, const Sk4f& b,
const Sk4f& a) {
float32x4x4_t rgba = {{
r.fVec,
g.fVec,
b.fVec,
a.fVec,
}};
vst4q_f32((float*) dst, rgba);
}
#endif//SkNx_neon_DEFINED


@@ -67,12 +67,34 @@ public:
SkNx() {}
SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }
static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
__m128 v0 = _mm_loadu_ps(((float*)ptr) + 0),
v1 = _mm_loadu_ps(((float*)ptr) + 4),
v2 = _mm_loadu_ps(((float*)ptr) + 8),
v3 = _mm_loadu_ps(((float*)ptr) + 12);
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
*r = v0;
*g = v1;
*b = v2;
*a = v3;
}
static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
__m128 v0 = r.fVec,
v1 = g.fVec,
v2 = b.fVec,
v3 = a.fVec;
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
_mm_storeu_ps(((float*) dst) + 0, v0);
_mm_storeu_ps(((float*) dst) + 4, v1);
_mm_storeu_ps(((float*) dst) + 8, v2);
_mm_storeu_ps(((float*) dst) + 12, v3);
}
SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
@@ -231,11 +253,32 @@ public:
SkNx() {}
SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}
static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }
static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
__m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
__m128i even = _mm_unpacklo_epi16(lo, hi), // r0 r2 g0 g2 b0 b2 a0 a2
odd = _mm_unpackhi_epi16(lo, hi); // r1 r3 ...
__m128i rg = _mm_unpacklo_epi16(even, odd), // r0 r1 r2 r3 g0 g1 g2 g3
ba = _mm_unpackhi_epi16(even, odd); // b0 b1 ... a0 a1 ...
*r = rg;
*g = _mm_srli_si128(rg, 8);
*b = ba;
*a = _mm_srli_si128(ba, 8);
}
static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
__m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
__m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
__m128i lo = _mm_unpacklo_epi32(rg, ba);
__m128i hi = _mm_unpackhi_epi32(rg, ba);
_mm_storeu_si128(((__m128i*) dst) + 0, lo);
_mm_storeu_si128(((__m128i*) dst) + 1, hi);
}
SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
@@ -455,52 +498,4 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
return _mm_cvtps_epi32(x.fVec);
}
static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
__m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
__m128i even = _mm_unpacklo_epi16(lo, hi), // r0 r2 g0 g2 b0 b2 a0 a2
odd = _mm_unpackhi_epi16(lo, hi); // r1 r3 ...
__m128i rg = _mm_unpacklo_epi16(even, odd), // r0 r1 r2 r3 g0 g1 g2 g3
ba = _mm_unpackhi_epi16(even, odd); // b0 b1 ... a0 a1 ...
*r = rg;
*g = _mm_srli_si128(rg, 8);
*b = ba;
*a = _mm_srli_si128(ba, 8);
}
static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
const Sk4h& a) {
__m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
__m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
__m128i lo = _mm_unpacklo_epi32(rg, ba);
__m128i hi = _mm_unpackhi_epi32(rg, ba);
_mm_storeu_si128(((__m128i*) dst) + 0, lo);
_mm_storeu_si128(((__m128i*) dst) + 1, hi);
}
static inline void Sk4f_load4(const void* ptr, Sk4f* r, Sk4f* g, Sk4f* b, Sk4f* a) {
__m128 v0 = _mm_loadu_ps(((float*)ptr) + 0),
v1 = _mm_loadu_ps(((float*)ptr) + 4),
v2 = _mm_loadu_ps(((float*)ptr) + 8),
v3 = _mm_loadu_ps(((float*)ptr) + 12);
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
*r = v0;
*g = v1;
*b = v2;
*a = v3;
}
static inline void Sk4f_store4(void* dst, const Sk4f& r, const Sk4f& g, const Sk4f& b,
const Sk4f& a) {
__m128 v0 = r.fVec,
v1 = g.fVec,
v2 = b.fVec,
v3 = a.fVec;
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
_mm_storeu_ps(((float*) dst) + 0, v0);
_mm_storeu_ps(((float*) dst) + 4, v1);
_mm_storeu_ps(((float*) dst) + 8, v2);
_mm_storeu_ps(((float*) dst) + 12, v3);
}
#endif//SkNx_sse_DEFINED


@@ -220,7 +220,7 @@ namespace SK_OPTS_NS {
}
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
Sk4h::Load4(ptr, &rh, &gh, &bh, &ah);
dr = SkHalfToFloat_finite_ftz(rh);
dg = SkHalfToFloat_finite_ftz(gh);
db = SkHalfToFloat_finite_ftz(bh);
@@ -242,7 +242,7 @@ namespace SK_OPTS_NS {
}
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
Sk4h::Load4(ptr, &rh, &gh, &bh, &ah);
r = SkHalfToFloat_finite_ftz(rh);
g = SkHalfToFloat_finite_ftz(gh);
b = SkHalfToFloat_finite_ftz(bh);
@@ -254,10 +254,10 @@ namespace SK_OPTS_NS {
auto ptr = (uint64_t*)ctx + x;
switch (tail) {
case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r),
SkFloatToHalf_finite_ftz(g),
SkFloatToHalf_finite_ftz(b),
SkFloatToHalf_finite_ftz(a));
case 0: return Sk4h::Store4(ptr, SkFloatToHalf_finite_ftz(r),
SkFloatToHalf_finite_ftz(g),
SkFloatToHalf_finite_ftz(b),
SkFloatToHalf_finite_ftz(a));
case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);


@@ -317,7 +317,7 @@ DEF_TEST(SkNx_4fLoad4Store4, r) {
};
Sk4f a, b, c, d;
Sk4f_load4(src, &a, &b, &c, &d);
Sk4f::Load4(src, &a, &b, &c, &d);
REPORTER_ASSERT(r, 0.0f == a[0]);
REPORTER_ASSERT(r, 4.0f == a[1]);
REPORTER_ASSERT(r, 8.0f == a[2]);
@@ -336,6 +336,6 @@ DEF_TEST(SkNx_4fLoad4Store4, r) {
REPORTER_ASSERT(r, 15.0f == d[3]);
float dst[16];
Sk4f_store4(dst, a, b, c, d);
Sk4f::Store4(dst, a, b, c, d);
REPORTER_ASSERT(r, 0 == memcmp(dst, src, 16 * sizeof(float)));
}