remove SkNx AVX code

We can't realistically use AVX and SkNx together because of ODR
problems, so remove the code that may tempt us to try.

Remaining code paths using AVX:
  - one intrinsics-only routine in SkOpts_hsw.cpp
  - SkJumper

Change-Id: I0d2d03b47ea4a0eec27f2de2b28a4c3d1ff8376f
Reviewed-on: https://skia-review.googlesource.com/13121
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2017-04-10 14:44:47 -04:00 committed by Skia Commit-Bot
parent 8d4cbad13c
commit d177ae18d7
3 changed files with 0 additions and 338 deletions

View File

@ -11,10 +11,6 @@
#include "SkNx.h"
#include "SkTypes.h"
#if !defined(_MSC_VER) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
#include <x86intrin.h>
#endif
// 16-bit floating point value
// format is 1 bit sign, 5 bits exponent, 10 bits mantissa
// only used for storage
@ -89,29 +85,4 @@ static inline Sk4h SkFloatToHalf_finite_ftz(const Sk4f& fs) {
#endif
}
static inline Sk8f SkHalfToFloat_finite_ftz(const Sk8h& hs) {
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
return _mm256_cvtph_ps(hs.fVec);
#else
uint64_t parts[2];
hs.store(parts);
return SkNx_join(SkHalfToFloat_finite_ftz(parts[0]),
SkHalfToFloat_finite_ftz(parts[1]));
#endif
}
static inline Sk8h SkFloatToHalf_finite_ftz(const Sk8f& fs) {
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
return _mm256_cvtps_ph(fs.fVec, _MM_FROUND_CUR_DIRECTION);
#else
uint64_t parts[2];
SkFloatToHalf_finite_ftz(fs.fLo).store(parts+0);
SkFloatToHalf_finite_ftz(fs.fHi).store(parts+1);
return Sk8h::Load(parts);
#endif
}
#endif

View File

@ -329,7 +329,6 @@ public:
AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
// TODO: AVX2 version
__m128i _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@ -351,7 +350,6 @@ public:
*a = _mm_unpackhi_epi64(ba0123, ba4567);
}
AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
// TODO: AVX2 version
const uint8_t* ptr8 = (const uint8_t*) ptr;
__m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0*2));
__m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
@ -377,7 +375,6 @@ public:
*b = _mm_unpacklo_epi64(bx03, bx47);
}
AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
// TODO: AVX2 version
__m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec), // r0 g0 r1 g1 r2 g2 r3 g3
rg4567 = _mm_unpackhi_epi16(r.fVec, g.fVec), // r4 g4 r5 g5 r6 g6 r7 g7
ba0123 = _mm_unpacklo_epi16(b.fVec, a.fVec),
@ -486,243 +483,6 @@ public:
__m128i fVec;
};
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
template <>
class SkNx<8, uint8_t> {
public:
AI SkNx(const __m128i& vec) : fVec(vec) {}
AI SkNx() {}
AI SkNx(uint8_t v) : fVec(_mm_set1_epi8(v)) {}
AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
uint8_t e, uint8_t f, uint8_t g, uint8_t h)
: fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, 0,0,0,0, 0,0,0,0)) {}
AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }
AI uint8_t operator[](int k) const {
SkASSERT(0 <= k && k < 8);
union { __m128i v; uint8_t us[16]; } pun = {fVec};
return pun.us[k&7];
}
__m128i fVec;
};
template <>
class SkNx<8, int32_t> {
public:
AI SkNx(const __m256i& vec) : fVec(vec) {}
AI SkNx() {}
AI SkNx(int32_t v) : fVec(_mm256_set1_epi32(v)) {}
AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d,
int32_t e, int32_t f, int32_t g, int32_t h)
: fVec(_mm256_setr_epi32(a,b,c,d, e,f,g,h)) {}
AI static SkNx Load(const void* ptr) { return _mm256_loadu_si256((const __m256i*)ptr); }
AI void store(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, fVec); }
AI SkNx operator + (const SkNx& o) const { return _mm256_add_epi32(fVec, o.fVec); }
AI SkNx operator - (const SkNx& o) const { return _mm256_sub_epi32(fVec, o.fVec); }
AI SkNx operator * (const SkNx& o) const { return _mm256_mullo_epi32(fVec, o.fVec); }
AI SkNx operator & (const SkNx& o) const { return _mm256_and_si256(fVec, o.fVec); }
AI SkNx operator | (const SkNx& o) const { return _mm256_or_si256(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm256_xor_si256(fVec, o.fVec); }
AI SkNx operator << (int bits) const { return _mm256_slli_epi32(fVec, bits); }
AI SkNx operator >> (int bits) const { return _mm256_srai_epi32(fVec, bits); }
AI int32_t operator[](int k) const {
SkASSERT(0 <= k && k < 8);
union { __m256i v; int32_t is[8]; } pun = {fVec};
return pun.is[k&7];
}
__m256i fVec;
};
template <>
class SkNx<8, uint32_t> {
public:
AI SkNx(const __m256i& vec) : fVec(vec) {}
AI SkNx() {}
AI SkNx(uint32_t v) : fVec(_mm256_set1_epi32(v)) {}
AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
uint32_t e, uint32_t f, uint32_t g, uint32_t h)
: fVec(_mm256_setr_epi32(a,b,c,d, e,f,g,h)) {}
AI static SkNx Load(const void* ptr) { return _mm256_loadu_si256((const __m256i*)ptr); }
AI void store(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, fVec); }
AI SkNx operator + (const SkNx& o) const { return _mm256_add_epi32(fVec, o.fVec); }
AI SkNx operator - (const SkNx& o) const { return _mm256_sub_epi32(fVec, o.fVec); }
AI SkNx operator * (const SkNx& o) const { return _mm256_mullo_epi32(fVec, o.fVec); }
AI SkNx operator & (const SkNx& o) const { return _mm256_and_si256(fVec, o.fVec); }
AI SkNx operator | (const SkNx& o) const { return _mm256_or_si256(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm256_xor_si256(fVec, o.fVec); }
AI SkNx operator << (int bits) const { return _mm256_slli_epi32(fVec, bits); }
AI SkNx operator >> (int bits) const { return _mm256_srli_epi32(fVec, bits); }
AI uint32_t operator[](int k) const {
SkASSERT(0 <= k && k < 8);
union { __m256i v; uint32_t us[8]; } pun = {fVec};
return pun.us[k&7];
}
__m256i fVec;
};
// _mm256_unpack{lo,hi}_pd() auto-casting to and from __m256d.
AI static __m256 unpacklo_pd(__m256 x, __m256 y) {
return _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(x), _mm256_castps_pd(y)));
}
AI static __m256 unpackhi_pd(__m256 x, __m256 y) {
return _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(x), _mm256_castps_pd(y)));
}
template <>
class SkNx<8, float> {
public:
AI SkNx(const __m256& vec) : fVec(vec) {}
AI SkNx() {}
AI SkNx(float val) : fVec(_mm256_set1_ps(val)) {}
AI SkNx(float a, float b, float c, float d,
float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,d,e,f,g,h)) {}
AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const float*)ptr); }
AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); }
AI static void Store4(void* ptr,
const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
__m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec), // r0 g0 r1 g1 | r4 g4 r5 g5
rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec), // r2 ... | r6 ...
ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec), // b0 a0 b1 a1 | b4 a4 b5 a5
ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec); // b2 ... | b6 ...
__m256 _04 = unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
_15 = unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
_26 = unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
_37 = unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
__m256 _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
_23 = _mm256_permute2f128_ps(_26, _37, 32),
_45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
_67 = _mm256_permute2f128_ps(_26, _37, 49);
_mm256_storeu_ps((float*)ptr + 0*8, _01);
_mm256_storeu_ps((float*)ptr + 1*8, _23);
_mm256_storeu_ps((float*)ptr + 2*8, _45);
_mm256_storeu_ps((float*)ptr + 3*8, _67);
}
AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
Sk4f rl, gl, bl, al,
rh, gh, bh, ah;
Sk4f::Load4((const float*)ptr + 0, &rl, &gl, &bl, &al);
Sk4f::Load4((const float*)ptr + 16, &rh, &gh, &bh, &ah);
*r = _mm256_setr_m128(rl.fVec, rh.fVec);
*g = _mm256_setr_m128(gl.fVec, gh.fVec);
*b = _mm256_setr_m128(bl.fVec, bh.fVec);
*a = _mm256_setr_m128(al.fVec, ah.fVec);
}
AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); }
AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); }
AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); }
AI SkNx operator/(const SkNx& o) const { return _mm256_div_ps(fVec, o.fVec); }
AI SkNx operator==(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_EQ_OQ); }
AI SkNx operator!=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_NEQ_OQ); }
AI SkNx operator <(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LT_OQ); }
AI SkNx operator >(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GT_OQ); }
AI SkNx operator<=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LE_OQ); }
AI SkNx operator>=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GE_OQ); }
AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm256_min_ps(l.fVec, r.fVec); }
AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm256_max_ps(l.fVec, r.fVec); }
AI SkNx sqrt() const { return _mm256_sqrt_ps (fVec); }
AI SkNx rsqrt() const { return _mm256_rsqrt_ps(fVec); }
AI SkNx invert() const { return _mm256_rcp_ps (fVec); }
AI SkNx abs() const { return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), fVec); }
AI SkNx floor() const { return _mm256_floor_ps(fVec); }
AI float operator[](int k) const {
SkASSERT(0 <= k && k < 8);
union { __m256 v; float fs[8]; } pun = {fVec};
return pun.fs[k&7];
}
AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
return _mm256_blendv_ps(e.fVec, t.fVec, fVec);
}
__m256 fVec;
};
AI static void SkNx_split(const Sk8f& v, Sk4f* lo, Sk4f* hi) {
*lo = _mm256_extractf128_ps(v.fVec, 0);
*hi = _mm256_extractf128_ps(v.fVec, 1);
}
AI static Sk8f SkNx_join(const Sk4f& lo, const Sk4f& hi) {
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo.fVec), hi.fVec, 1);
}
AI static Sk8f SkNx_fma(const Sk8f& a, const Sk8f& b, const Sk8f& c) {
return _mm256_fmadd_ps(a.fVec, b.fVec, c.fVec);
}
template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8b& src) {
return _mm256_cvtepu8_epi32(src.fVec);
}
template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8b& src) {
return _mm256_cvtepi32_ps(SkNx_cast<int>(src).fVec);
}
template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8h& src) {
return _mm256_cvtepu16_epi32(src.fVec);
}
template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8h& src) {
return _mm256_cvtepi32_ps(SkNx_cast<int>(src).fVec);
}
template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8i& src) {
return _mm256_cvtepi32_ps(src.fVec);
}
template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8f& src) {
return _mm256_cvttps_epi32(src.fVec);
}
template<> AI /*static*/ Sk8h SkNx_cast<uint16_t>(const Sk8i& src) {
__m128i lo = _mm256_extractf128_si256(src.fVec, 0),
hi = _mm256_extractf128_si256(src.fVec, 1);
return _mm_packus_epi32(lo, hi);
}
template<> AI /*static*/ Sk8h SkNx_cast<uint16_t>(const Sk8f& src) {
return SkNx_cast<uint16_t>(SkNx_cast<int>(src));
}
template<> AI /*static*/ Sk8b SkNx_cast<uint8_t>(const Sk8i& src) {
auto _16 = SkNx_cast<uint16_t>(src);
return _mm_packus_epi16(_16.fVec, _16.fVec);
}
#endif
template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
return _mm_cvtepi32_ps(src.fVec);
}

View File

@ -23,11 +23,7 @@
namespace {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
static constexpr int N = 8;
#else
static constexpr int N = 4;
#endif
using SkNf = SkNx<N, float>;
using SkNi = SkNx<N, int32_t>;
@ -202,71 +198,6 @@ SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
v.store(dst);
}
#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
SI __m256i mask(size_t tail) {
static const int masks[][8] = {
{~0,~0,~0,~0, ~0,~0,~0,~0 }, // remember, tail == 0 ~~> load all N
{~0, 0, 0, 0, 0, 0, 0, 0 },
{~0,~0, 0, 0, 0, 0, 0, 0 },
{~0,~0,~0, 0, 0, 0, 0, 0 },
{~0,~0,~0,~0, 0, 0, 0, 0 },
{~0,~0,~0,~0, ~0, 0, 0, 0 },
{~0,~0,~0,~0, ~0,~0, 0, 0 },
{~0,~0,~0,~0, ~0,~0,~0, 0 },
};
return SkNi::Load(masks + tail).fVec;
}
SI SkNi load(size_t tail, const int32_t* src) {
return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
: SkNi::Load(src);
}
SI SkNu load(size_t tail, const uint32_t* src) {
return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
: SkNu::Load(src);
}
SI SkNf load(size_t tail, const float* src) {
return tail ? _mm256_maskload_ps((const float*)src, mask(tail))
: SkNf::Load(src);
}
SI SkNi gather(size_t tail, const int32_t* src, const SkNi& offset) {
auto m = mask(tail);
return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4);
}
SI SkNu gather(size_t tail, const uint32_t* src, const SkNi& offset) {
auto m = mask(tail);
return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4);
}
SI SkNf gather(size_t tail, const float* src, const SkNi& offset) {
auto m = _mm256_castsi256_ps(mask(tail));
return _mm256_mask_i32gather_ps(SkNf(0).fVec, (const float*)src, offset.fVec, m, 4);
}
static const char* bug = "I don't think MSAN understands maskstore.";
SI void store(size_t tail, const SkNi& v, int32_t* dst) {
if (tail) {
_mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec);
return sk_msan_mark_initialized(dst, dst+tail, bug);
}
v.store(dst);
}
SI void store(size_t tail, const SkNu& v, uint32_t* dst) {
if (tail) {
_mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec);
return sk_msan_mark_initialized(dst, dst+tail, bug);
}
v.store(dst);
}
SI void store(size_t tail, const SkNf& v, float* dst) {
if (tail) {
_mm256_maskstore_ps((float*)dst, mask(tail), v.fVec);
return sk_msan_mark_initialized(dst, dst+tail, bug);
}
v.store(dst);
}
#endif
SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }
SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) {