Specialize Sk2d for SSE2
Given the autovectorization we've seen, I wouldn't expect big speedups from this, but it does give us a point of control over what's going on.

BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1526923003
This commit is contained in:
parent
6822c20705
commit
fce612ac32
@ -203,11 +203,16 @@ SkNx<N,D> SkNx_cast(const SkNx<N,S>& src) {
|
||||
} // namespace
|
||||
|
||||
// Convenience aliases for the SkNx instantiations, named Sk<lanes><element-suffix>.

// f: float lanes.
typedef SkNx<2, float> Sk2f;
typedef SkNx<4, float> Sk4f;
typedef SkNx<8, float> Sk8f;

// d: double lanes.
typedef SkNx<2, double> Sk2d;
typedef SkNx<4, double> Sk4d;
typedef SkNx<8, double> Sk8d;

// s: SkScalar lanes.  These are declared exactly once, in terms of SkScalar; the
// older float-based Sk2s/Sk4s/Sk8s aliases are gone, since a second typedef of the
// same name would be a conflicting redeclaration whenever SkScalar is not float.
typedef SkNx<2, SkScalar> Sk2s;
typedef SkNx<4, SkScalar> Sk4s;
typedef SkNx<8, SkScalar> Sk8s;

// h: uint16_t lanes.
typedef SkNx< 4, uint16_t> Sk4h;
typedef SkNx< 8, uint16_t> Sk8h;
|
||||
|
@ -64,6 +64,52 @@ public:
|
||||
__m128 fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNx<2, double> {
|
||||
public:
|
||||
SkNx(const __m128d& vec) : fVec(vec) {}
|
||||
|
||||
SkNx() {}
|
||||
SkNx(double val) : fVec(_mm_set1_pd(val)) {}
|
||||
static SkNx Load(const double vals[2]) { return _mm_loadu_pd(vals); }
|
||||
SkNx(double a, double b) : fVec(_mm_setr_pd(a,b)) {}
|
||||
|
||||
void store(double vals[2]) const { _mm_storeu_pd(vals, fVec); }
|
||||
|
||||
SkNx operator + (const SkNx& o) const { return _mm_add_pd(fVec, o.fVec); }
|
||||
SkNx operator - (const SkNx& o) const { return _mm_sub_pd(fVec, o.fVec); }
|
||||
SkNx operator * (const SkNx& o) const { return _mm_mul_pd(fVec, o.fVec); }
|
||||
SkNx operator / (const SkNx& o) const { return _mm_div_pd(fVec, o.fVec); }
|
||||
|
||||
SkNx operator == (const SkNx& o) const { return _mm_cmpeq_pd (fVec, o.fVec); }
|
||||
SkNx operator != (const SkNx& o) const { return _mm_cmpneq_pd(fVec, o.fVec); }
|
||||
SkNx operator < (const SkNx& o) const { return _mm_cmplt_pd (fVec, o.fVec); }
|
||||
SkNx operator > (const SkNx& o) const { return _mm_cmpgt_pd (fVec, o.fVec); }
|
||||
SkNx operator <= (const SkNx& o) const { return _mm_cmple_pd (fVec, o.fVec); }
|
||||
SkNx operator >= (const SkNx& o) const { return _mm_cmpge_pd (fVec, o.fVec); }
|
||||
|
||||
static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_pd(l.fVec, r.fVec); }
|
||||
static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_pd(l.fVec, r.fVec); }
|
||||
|
||||
SkNx sqrt() const { return _mm_sqrt_pd(fVec); }
|
||||
|
||||
template <int k> double kth() const {
|
||||
SkASSERT(0 <= k && k < 2);
|
||||
union { __m128d v; double fs[2]; } pun = {fVec};
|
||||
return pun.fs[k&1];
|
||||
}
|
||||
|
||||
bool allTrue() const { return 0x3 == _mm_movemask_pd(fVec); }
|
||||
bool anyTrue() const { return 0x0 != _mm_movemask_pd(fVec); }
|
||||
|
||||
SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
||||
return _mm_or_pd(_mm_and_pd (fVec, t.fVec),
|
||||
_mm_andnot_pd(fVec, e.fVec));
|
||||
}
|
||||
|
||||
__m128d fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNx<4, int> {
|
||||
public:
|
||||
|
Loading…
Reference in New Issue
Block a user