Remove SkNx entirely
Also cleans up the scattered references remaining in the code base
(including in files I thought I got already...).

Change-Id: I7004354b1e9cea9f9d9f45b791d8ab9ce557ba01
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/542647
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
commit 0221e79b8b
parent a25aeff052
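A note on the mapping (a sketch for orientation, not part of the commit):
the diff below applies one mechanical substitution throughout. Sk4f becomes
skvx::float4, SkNx_cast<T>() becomes skvx::cast<T>(), the .floor() method
becomes the free function floor(), and Sk4f_ToBytes() becomes a per-vector
cast plus store. A minimal illustration, assuming a Skia checkout so that
include/private/SkVx.h resolves:

    #include "include/private/SkVx.h"

    static void migrate_example(float dst[4]) {
        skvx::float4 v(1, 2, 3, 4);         // was: Sk4f v(1,2,3,4);
        v = floor(v);                       // was: v = v.floor();
        auto i = skvx::cast<int>(v);        // was: SkNx_cast<int>(v);
        skvx::cast<float>(i).store(dst);    // store() keeps the same shape
    }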
@@ -7,7 +7,7 @@
 
 #include "bench/Benchmark.h"
 #include "include/core/SkColor.h"
-#include "include/private/SkNx.h"
+#include "include/private/SkVx.h"
 
 // Writing into this array prevents the loops from being compiled away.
 static volatile float blackhole[4];
@@ -29,9 +29,9 @@ struct Sk4fRoundtripBench : public Benchmark {
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
 
     void onDraw(int loops, SkCanvas* canvas) override {
-        Sk4f fs(1,2,3,4);
+        skvx::float4 fs(1,2,3,4);
         while (loops --> 0) {
-            fs = SkNx_cast<float>(SkNx_cast<T>(fs));
+            fs = skvx::cast<float>(skvx::cast<T>(fs));
         }
         fs.store((float*)blackhole);
     }
@@ -47,9 +47,9 @@ struct Sk4fFloorBench : public Benchmark {
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
 
     void onDraw(int loops, SkCanvas* canvas) override {
-        Sk4f fs(1,2,3,4);
+        skvx::float4 fs(1,2,3,4);
         while (loops --> 0) {
-            fs = fs.floor();
+            fs = floor(fs);
         }
         fs.store((float*)blackhole);
     }
@@ -62,21 +62,24 @@ struct Sk4fGradientBench : public Benchmark {
 
     SkPMColor fDevice[100];
     void onDraw(int loops, SkCanvas*) override {
-        Sk4f c0(0,0,255,255),
-             c1(255,0,0,255),
-             dc = c1 - c0,
-             fx(0.1f),
-             dx(0.002f),
-             dcdx(dc*dx),
-             dcdx4(dcdx+dcdx+dcdx+dcdx);
+        skvx::float4 c0(0,0,255,255),
+                     c1(255,0,0,255),
+                     dc = c1 - c0,
+                     fx(0.1f),
+                     dx(0.002f),
+                     dcdx(dc*dx),
+                     dcdx4(dcdx+dcdx+dcdx+dcdx);
 
         for (int n = 0; n < loops; n++) {
-            Sk4f a = c0 + dc*fx + Sk4f(0.5f), // add an extra 0.5f to get rounding for free.
+            auto a = c0 + dc*fx + 0.5f, // add an extra 0.5f to get rounding for free.
                  b = a + dcdx,
                  c = b + dcdx,
                  d = c + dcdx;
             for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-                Sk4f_ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
+                skvx::cast<uint8_t>(a).store(fDevice + i + 0);
+                skvx::cast<uint8_t>(b).store(fDevice + i + 1);
+                skvx::cast<uint8_t>(c).store(fDevice + i + 2);
+                skvx::cast<uint8_t>(d).store(fDevice + i + 3);
                 a = a + dcdx4;
                 b = b + dcdx4;
                 c = c + dcdx4;
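The four cast-and-store lines above lean on skvx::cast<uint8_t>() narrowing
each float lane to a byte and store() writing those four bytes contiguously,
so each vector fills exactly one packed SkPMColor. A minimal sketch of that
assumption (illustrative only, not part of the commit):

    #include <cstdint>
    #include "include/private/SkVx.h"

    static uint32_t pack_pixel(const skvx::float4& px) {
        uint32_t out;
        skvx::cast<uint8_t>(px).store(&out);  // writes 4 bytes, one pixel
        return out;
    }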
@@ -7952,7 +7952,6 @@ generated_cc_atom(
         "//include/core:SkScalar_hdr",
         "//include/core:SkSize_hdr",
         "//include/core:SkString_hdr",
-        "//include/private:SkNx_hdr",
         "//src/core:SkMipmapBuilder_hdr",
         "//src/core:SkMipmap_hdr",
         "//tools:Resources_hdr",
@@ -18,7 +18,6 @@
 #include "include/core/SkScalar.h"
 #include "include/core/SkSize.h"
 #include "include/core/SkString.h"
-#include "include/private/SkNx.h"
 #include "src/core/SkMipmap.h"
 #include "src/core/SkMipmapBuilder.h"
 #include "tools/Resources.h"
@@ -496,9 +496,6 @@ skia_core_sources = [
   "$_include/private/SkMalloc.h",
   "$_include/private/SkMutex.h",
   "$_include/private/SkNoncopyable.h",
-  "$_include/private/SkNx.h",
-  "$_include/private/SkNx_neon.h",
-  "$_include/private/SkNx_sse.h",
   "$_include/private/SkOnce.h",
   "$_include/private/SkPathRef.h",
   "$_include/private/SkSemaphore.h",
@@ -237,7 +237,6 @@ tests_sources = [
   "$_tests/SkGlyphBufferTest.cpp",
   "$_tests/SkGlyphTest.cpp",
   "$_tests/SkImageTest.cpp",
-  "$_tests/SkNxTest.cpp",
   "$_tests/SkPathRangeIterTest.cpp",
   "$_tests/SkRasterPipelineTest.cpp",
   "$_tests/SkRemoteGlyphCacheTest.cpp",
@@ -152,32 +152,6 @@ generated_cc_atom(
     deps = ["//include/core:SkTypes_hdr"],
 )
 
-generated_cc_atom(
-    name = "SkNx_hdr",
-    hdrs = ["SkNx.h"],
-    visibility = ["//:__subpackages__"],
-    deps = [
-        ":SkNx_neon_hdr",
-        ":SkNx_sse_hdr",
-        ":SkSafe_math_hdr",
-        "//include/core:SkScalar_hdr",
-        "//include/core:SkTypes_hdr",
-    ],
-)
-
-generated_cc_atom(
-    name = "SkNx_neon_hdr",
-    hdrs = ["SkNx_neon.h"],
-    visibility = ["//:__subpackages__"],
-)
-
-generated_cc_atom(
-    name = "SkNx_sse_hdr",
-    hdrs = ["SkNx_sse.h"],
-    visibility = ["//:__subpackages__"],
-    deps = ["//include/core:SkTypes_hdr"],
-)
-
 generated_cc_atom(
     name = "SkOnce_hdr",
     hdrs = ["SkOnce.h"],
@@ -1,430 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkNx_DEFINED
-#define SkNx_DEFINED
-
-#include "include/core/SkScalar.h"
-#include "include/core/SkTypes.h"
-#include "include/private/SkSafe_math.h"
-
-#include <algorithm>
-#include <limits>
-#include <type_traits>
-
-// Every single SkNx method wants to be fully inlined. (We know better than MSVC).
-#define AI SK_ALWAYS_INLINE
-
-namespace { // NOLINT(google-build-namespaces)
-
-// The default SkNx<N,T> just proxies down to a pair of SkNx<N/2, T>.
-template <int N, typename T>
-struct SkNx {
-    typedef SkNx<N/2, T> Half;
-
-    Half fLo, fHi;
-
-    AI SkNx() = default;
-    AI SkNx(const Half& lo, const Half& hi) : fLo(lo), fHi(hi) {}
-
-    AI SkNx(T v) : fLo(v), fHi(v) {}
-
-    AI SkNx(T a, T b) : fLo(a) , fHi(b) { static_assert(N==2, ""); }
-    AI SkNx(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { static_assert(N==4, ""); }
-    AI SkNx(T a, T b, T c, T d, T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) {
-        static_assert(N==8, "");
-    }
-    AI SkNx(T a, T b, T c, T d, T e, T f, T g, T h,
-            T i, T j, T k, T l, T m, T n, T o, T p)
-        : fLo(a,b,c,d, e,f,g,h), fHi(i,j,k,l, m,n,o,p) {
-        static_assert(N==16, "");
-    }
-
-    AI T operator[](int k) const {
-        SkASSERT(0 <= k && k < N);
-        return k < N/2 ? fLo[k] : fHi[k-N/2];
-    }
-
-    AI static SkNx Load(const void* vptr) {
-        auto ptr = (const char*)vptr;
-        return { Half::Load(ptr), Half::Load(ptr + N/2*sizeof(T)) };
-    }
-    AI void store(void* vptr) const {
-        auto ptr = (char*)vptr;
-        fLo.store(ptr);
-        fHi.store(ptr + N/2*sizeof(T));
-    }
-
-    AI static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
-        auto ptr = (const char*)vptr;
-        Half al, bl, cl, dl,
-             ah, bh, ch, dh;
-        Half::Load4(ptr , &al, &bl, &cl, &dl);
-        Half::Load4(ptr + 4*N/2*sizeof(T), &ah, &bh, &ch, &dh);
-        *a = SkNx{al, ah};
-        *b = SkNx{bl, bh};
-        *c = SkNx{cl, ch};
-        *d = SkNx{dl, dh};
-    }
-    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
-        auto ptr = (const char*)vptr;
-        Half al, bl, cl,
-             ah, bh, ch;
-        Half::Load3(ptr , &al, &bl, &cl);
-        Half::Load3(ptr + 3*N/2*sizeof(T), &ah, &bh, &ch);
-        *a = SkNx{al, ah};
-        *b = SkNx{bl, bh};
-        *c = SkNx{cl, ch};
-    }
-    AI static void Load2(const void* vptr, SkNx* a, SkNx* b) {
-        auto ptr = (const char*)vptr;
-        Half al, bl,
-             ah, bh;
-        Half::Load2(ptr , &al, &bl);
-        Half::Load2(ptr + 2*N/2*sizeof(T), &ah, &bh);
-        *a = SkNx{al, ah};
-        *b = SkNx{bl, bh};
-    }
-    AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
-        auto ptr = (char*)vptr;
-        Half::Store4(ptr, a.fLo, b.fLo, c.fLo, d.fLo);
-        Half::Store4(ptr + 4*N/2*sizeof(T), a.fHi, b.fHi, c.fHi, d.fHi);
-    }
-    AI static void Store3(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c) {
-        auto ptr = (char*)vptr;
-        Half::Store3(ptr, a.fLo, b.fLo, c.fLo);
-        Half::Store3(ptr + 3*N/2*sizeof(T), a.fHi, b.fHi, c.fHi);
-    }
-    AI static void Store2(void* vptr, const SkNx& a, const SkNx& b) {
-        auto ptr = (char*)vptr;
-        Half::Store2(ptr, a.fLo, b.fLo);
-        Half::Store2(ptr + 2*N/2*sizeof(T), a.fHi, b.fHi);
-    }
-
-    AI T min() const { return std::min(fLo.min(), fHi.min()); }
-    AI T max() const { return std::max(fLo.max(), fHi.max()); }
-    AI bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
-    AI bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
-
-    AI SkNx abs() const { return { fLo. abs(), fHi. abs() }; }
-    AI SkNx sqrt() const { return { fLo. sqrt(), fHi. sqrt() }; }
-    AI SkNx floor() const { return { fLo. floor(), fHi. floor() }; }
-
-    AI SkNx operator!() const { return { !fLo, !fHi }; }
-    AI SkNx operator-() const { return { -fLo, -fHi }; }
-    AI SkNx operator~() const { return { ~fLo, ~fHi }; }
-
-    AI SkNx operator<<(int bits) const { return { fLo << bits, fHi << bits }; }
-    AI SkNx operator>>(int bits) const { return { fLo >> bits, fHi >> bits }; }
-
-    AI SkNx operator+(const SkNx& y) const { return { fLo + y.fLo, fHi + y.fHi }; }
-    AI SkNx operator-(const SkNx& y) const { return { fLo - y.fLo, fHi - y.fHi }; }
-    AI SkNx operator*(const SkNx& y) const { return { fLo * y.fLo, fHi * y.fHi }; }
-    AI SkNx operator/(const SkNx& y) const { return { fLo / y.fLo, fHi / y.fHi }; }
-
-    AI SkNx operator&(const SkNx& y) const { return { fLo & y.fLo, fHi & y.fHi }; }
-    AI SkNx operator|(const SkNx& y) const { return { fLo | y.fLo, fHi | y.fHi }; }
-    AI SkNx operator^(const SkNx& y) const { return { fLo ^ y.fLo, fHi ^ y.fHi }; }
-
-    AI SkNx operator==(const SkNx& y) const { return { fLo == y.fLo, fHi == y.fHi }; }
-    AI SkNx operator!=(const SkNx& y) const { return { fLo != y.fLo, fHi != y.fHi }; }
-    AI SkNx operator<=(const SkNx& y) const { return { fLo <= y.fLo, fHi <= y.fHi }; }
-    AI SkNx operator>=(const SkNx& y) const { return { fLo >= y.fLo, fHi >= y.fHi }; }
-    AI SkNx operator< (const SkNx& y) const { return { fLo < y.fLo, fHi < y.fHi }; }
-    AI SkNx operator> (const SkNx& y) const { return { fLo > y.fLo, fHi > y.fHi }; }
-
-    AI SkNx saturatedAdd(const SkNx& y) const {
-        return { fLo.saturatedAdd(y.fLo), fHi.saturatedAdd(y.fHi) };
-    }
-
-    AI SkNx mulHi(const SkNx& m) const {
-        return { fLo.mulHi(m.fLo), fHi.mulHi(m.fHi) };
-    }
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return { fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi) };
-    }
-    AI static SkNx Min(const SkNx& x, const SkNx& y) {
-        return { Half::Min(x.fLo, y.fLo), Half::Min(x.fHi, y.fHi) };
-    }
-    AI static SkNx Max(const SkNx& x, const SkNx& y) {
-        return { Half::Max(x.fLo, y.fLo), Half::Max(x.fHi, y.fHi) };
-    }
-};
-
-// The N -> N/2 recursion bottoms out at N == 1, a scalar value.
-template <typename T>
-struct SkNx<1,T> {
-    T fVal;
-
-    AI SkNx() = default;
-    AI SkNx(T v) : fVal(v) {}
-
-    // Android complains against unused parameters, so we guard it
-    AI T operator[](int SkDEBUGCODE(k)) const {
-        SkASSERT(k == 0);
-        return fVal;
-    }
-
-    AI static SkNx Load(const void* ptr) {
-        SkNx v;
-        memcpy(&v, ptr, sizeof(T));
-        return v;
-    }
-    AI void store(void* ptr) const { memcpy(ptr, &fVal, sizeof(T)); }
-
-    AI static void Load4(const void* vptr, SkNx* a, SkNx* b, SkNx* c, SkNx* d) {
-        auto ptr = (const char*)vptr;
-        *a = Load(ptr + 0*sizeof(T));
-        *b = Load(ptr + 1*sizeof(T));
-        *c = Load(ptr + 2*sizeof(T));
-        *d = Load(ptr + 3*sizeof(T));
-    }
-    AI static void Load3(const void* vptr, SkNx* a, SkNx* b, SkNx* c) {
-        auto ptr = (const char*)vptr;
-        *a = Load(ptr + 0*sizeof(T));
-        *b = Load(ptr + 1*sizeof(T));
-        *c = Load(ptr + 2*sizeof(T));
-    }
-    AI static void Load2(const void* vptr, SkNx* a, SkNx* b) {
-        auto ptr = (const char*)vptr;
-        *a = Load(ptr + 0*sizeof(T));
-        *b = Load(ptr + 1*sizeof(T));
-    }
-    AI static void Store4(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
-        auto ptr = (char*)vptr;
-        a.store(ptr + 0*sizeof(T));
-        b.store(ptr + 1*sizeof(T));
-        c.store(ptr + 2*sizeof(T));
-        d.store(ptr + 3*sizeof(T));
-    }
-    AI static void Store3(void* vptr, const SkNx& a, const SkNx& b, const SkNx& c) {
-        auto ptr = (char*)vptr;
-        a.store(ptr + 0*sizeof(T));
-        b.store(ptr + 1*sizeof(T));
-        c.store(ptr + 2*sizeof(T));
-    }
-    AI static void Store2(void* vptr, const SkNx& a, const SkNx& b) {
-        auto ptr = (char*)vptr;
-        a.store(ptr + 0*sizeof(T));
-        b.store(ptr + 1*sizeof(T));
-    }
-
-    AI T min() const { return fVal; }
-    AI T max() const { return fVal; }
-    AI bool anyTrue() const { return fVal != 0; }
-    AI bool allTrue() const { return fVal != 0; }
-
-    AI SkNx abs() const { return Abs(fVal); }
-    AI SkNx sqrt() const { return Sqrt(fVal); }
-    AI SkNx floor() const { return Floor(fVal); }
-
-    AI SkNx operator!() const { return !fVal; }
-    AI SkNx operator-() const { return -fVal; }
-    AI SkNx operator~() const { return FromBits(~ToBits(fVal)); }
-
-    AI SkNx operator<<(int bits) const { return fVal << bits; }
-    AI SkNx operator>>(int bits) const { return fVal >> bits; }
-
-    AI SkNx operator+(const SkNx& y) const { return fVal + y.fVal; }
-    AI SkNx operator-(const SkNx& y) const { return fVal - y.fVal; }
-    AI SkNx operator*(const SkNx& y) const { return fVal * y.fVal; }
-    AI SkNx operator/(const SkNx& y) const { return fVal / y.fVal; }
-
-    AI SkNx operator&(const SkNx& y) const { return FromBits(ToBits(fVal) & ToBits(y.fVal)); }
-    AI SkNx operator|(const SkNx& y) const { return FromBits(ToBits(fVal) | ToBits(y.fVal)); }
-    AI SkNx operator^(const SkNx& y) const { return FromBits(ToBits(fVal) ^ ToBits(y.fVal)); }
-
-    AI SkNx operator==(const SkNx& y) const { return FromBits(fVal == y.fVal ? ~0 : 0); }
-    AI SkNx operator!=(const SkNx& y) const { return FromBits(fVal != y.fVal ? ~0 : 0); }
-    AI SkNx operator<=(const SkNx& y) const { return FromBits(fVal <= y.fVal ? ~0 : 0); }
-    AI SkNx operator>=(const SkNx& y) const { return FromBits(fVal >= y.fVal ? ~0 : 0); }
-    AI SkNx operator< (const SkNx& y) const { return FromBits(fVal < y.fVal ? ~0 : 0); }
-    AI SkNx operator> (const SkNx& y) const { return FromBits(fVal > y.fVal ? ~0 : 0); }
-
-    AI static SkNx Min(const SkNx& x, const SkNx& y) { return x.fVal < y.fVal ? x : y; }
-    AI static SkNx Max(const SkNx& x, const SkNx& y) { return x.fVal > y.fVal ? x : y; }
-
-    AI SkNx saturatedAdd(const SkNx& y) const {
-        static_assert(std::is_unsigned<T>::value, "");
-        T sum = fVal + y.fVal;
-        return sum < fVal ? std::numeric_limits<T>::max() : sum;
-    }
-
-    AI SkNx mulHi(const SkNx& m) const {
-        static_assert(std::is_unsigned<T>::value, "");
-        static_assert(sizeof(T) <= 4, "");
-        return static_cast<T>((static_cast<uint64_t>(fVal) * m.fVal) >> (sizeof(T)*8));
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const { return fVal != 0 ? t : e; }
-
-private:
-    // Helper functions to choose the right float/double methods. (In <cmath> madness lies...)
-    AI static int Abs(int val) { return val < 0 ? -val : val; }
-
-    AI static float Abs(float val) { return ::fabsf(val); }
-    AI static float Sqrt(float val) { return ::sqrtf(val); }
-    AI static float Floor(float val) { return ::floorf(val); }
-
-    AI static double Abs(double val) { return ::fabs(val); }
-    AI static double Sqrt(double val) { return ::sqrt(val); }
-    AI static double Floor(double val) { return ::floor(val); }
-
-    // Helper functions for working with floats/doubles as bit patterns.
-    template <typename U>
-    AI static U ToBits(U v) { return v; }
-    AI static int32_t ToBits(float v) { int32_t bits; memcpy(&bits, &v, sizeof(v)); return bits; }
-    AI static int64_t ToBits(double v) { int64_t bits; memcpy(&bits, &v, sizeof(v)); return bits; }
-
-    template <typename Bits>
-    AI static T FromBits(Bits bits) {
-        static_assert(std::is_pod<T >::value &&
-                      std::is_pod<Bits>::value &&
-                      sizeof(T) <= sizeof(Bits), "");
-        T val;
-        memcpy(&val, &bits, sizeof(T));
-        return val;
-    }
-};
-
-// Allow scalars on the left or right of binary operators, and things like +=, &=, etc.
-#define V template <int N, typename T> AI static SkNx<N,T>
-    V operator+ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) + y; }
-    V operator- (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) - y; }
-    V operator* (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) * y; }
-    V operator/ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) / y; }
-    V operator& (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) & y; }
-    V operator| (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) | y; }
-    V operator^ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) ^ y; }
-    V operator==(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) == y; }
-    V operator!=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) != y; }
-    V operator<=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) <= y; }
-    V operator>=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) >= y; }
-    V operator< (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) < y; }
-    V operator> (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) > y; }
-
-    V operator+ (const SkNx<N,T>& x, T y) { return x + SkNx<N,T>(y); }
-    V operator- (const SkNx<N,T>& x, T y) { return x - SkNx<N,T>(y); }
-    V operator* (const SkNx<N,T>& x, T y) { return x * SkNx<N,T>(y); }
-    V operator/ (const SkNx<N,T>& x, T y) { return x / SkNx<N,T>(y); }
-    V operator& (const SkNx<N,T>& x, T y) { return x & SkNx<N,T>(y); }
-    V operator| (const SkNx<N,T>& x, T y) { return x | SkNx<N,T>(y); }
-    V operator^ (const SkNx<N,T>& x, T y) { return x ^ SkNx<N,T>(y); }
-    V operator==(const SkNx<N,T>& x, T y) { return x == SkNx<N,T>(y); }
-    V operator!=(const SkNx<N,T>& x, T y) { return x != SkNx<N,T>(y); }
-    V operator<=(const SkNx<N,T>& x, T y) { return x <= SkNx<N,T>(y); }
-    V operator>=(const SkNx<N,T>& x, T y) { return x >= SkNx<N,T>(y); }
-    V operator< (const SkNx<N,T>& x, T y) { return x < SkNx<N,T>(y); }
-    V operator> (const SkNx<N,T>& x, T y) { return x > SkNx<N,T>(y); }
-
-    V& operator<<=(SkNx<N,T>& x, int bits) { return (x = x << bits); }
-    V& operator>>=(SkNx<N,T>& x, int bits) { return (x = x >> bits); }
-
-    V& operator +=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x + y); }
-    V& operator -=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x - y); }
-    V& operator *=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x * y); }
-    V& operator /=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x / y); }
-    V& operator &=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x & y); }
-    V& operator |=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x | y); }
-    V& operator ^=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x ^ y); }
-
-    V& operator +=(SkNx<N,T>& x, T y) { return (x = x + SkNx<N,T>(y)); }
-    V& operator -=(SkNx<N,T>& x, T y) { return (x = x - SkNx<N,T>(y)); }
-    V& operator *=(SkNx<N,T>& x, T y) { return (x = x * SkNx<N,T>(y)); }
-    V& operator /=(SkNx<N,T>& x, T y) { return (x = x / SkNx<N,T>(y)); }
-    V& operator &=(SkNx<N,T>& x, T y) { return (x = x & SkNx<N,T>(y)); }
-    V& operator |=(SkNx<N,T>& x, T y) { return (x = x | SkNx<N,T>(y)); }
-    V& operator ^=(SkNx<N,T>& x, T y) { return (x = x ^ SkNx<N,T>(y)); }
-#undef V
-
-// SkNx<N,T> ~~> SkNx<N/2,T> + SkNx<N/2,T>
-template <int N, typename T>
-AI static void SkNx_split(const SkNx<N,T>& v, SkNx<N/2,T>* lo, SkNx<N/2,T>* hi) {
-    *lo = v.fLo;
-    *hi = v.fHi;
-}
-
-// SkNx<N/2,T> + SkNx<N/2,T> ~~> SkNx<N,T>
-template <int N, typename T>
-AI static SkNx<N*2,T> SkNx_join(const SkNx<N,T>& lo, const SkNx<N,T>& hi) {
-    return { lo, hi };
-}
-
-// A very generic shuffle. Can reorder, duplicate, contract, expand...
-//    Sk4f v = { R,G,B,A };
-//    SkNx_shuffle<2,1,0,3>(v)         ~~> {B,G,R,A}
-//    SkNx_shuffle<2,1>(v)             ~~> {B,G}
-//    SkNx_shuffle<2,1,2,1,2,1,2,1>(v) ~~> {B,G,B,G,B,G,B,G}
-//    SkNx_shuffle<3,3,3,3>(v)         ~~> {A,A,A,A}
-template <int... Ix, int N, typename T>
-AI static SkNx<sizeof...(Ix),T> SkNx_shuffle(const SkNx<N,T>& v) {
-    return { v[Ix]... };
-}
-
-// Cast from SkNx<N, Src> to SkNx<N, Dst>, as if you called static_cast<Dst>(Src).
-template <typename Dst, typename Src, int N>
-AI static SkNx<N,Dst> SkNx_cast(const SkNx<N,Src>& v) {
-    return { SkNx_cast<Dst>(v.fLo), SkNx_cast<Dst>(v.fHi) };
-}
-template <typename Dst, typename Src>
-AI static SkNx<1,Dst> SkNx_cast(const SkNx<1,Src>& v) {
-    return static_cast<Dst>(v.fVal);
-}
-
-template <int N, typename T>
-AI static SkNx<N,T> SkNx_fma(const SkNx<N,T>& f, const SkNx<N,T>& m, const SkNx<N,T>& a) {
-    return f*m+a;
-}
-
-} // namespace
-
-typedef SkNx<2, float> Sk2f;
-typedef SkNx<4, float> Sk4f;
-typedef SkNx<8, float> Sk8f;
-typedef SkNx<16, float> Sk16f;
-
-typedef SkNx<2, SkScalar> Sk2s;
-typedef SkNx<4, SkScalar> Sk4s;
-typedef SkNx<8, SkScalar> Sk8s;
-typedef SkNx<16, SkScalar> Sk16s;
-
-typedef SkNx<4, uint8_t> Sk4b;
-typedef SkNx<8, uint8_t> Sk8b;
-typedef SkNx<16, uint8_t> Sk16b;
-
-typedef SkNx<4, uint16_t> Sk4h;
-typedef SkNx<8, uint16_t> Sk8h;
-typedef SkNx<16, uint16_t> Sk16h;
-
-typedef SkNx<4, int32_t> Sk4i;
-typedef SkNx<8, int32_t> Sk8i;
-typedef SkNx<4, uint32_t> Sk4u;
-
-// Include platform specific specializations if available.
-#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    #include "include/private/SkNx_sse.h"
-#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)
-    #include "include/private/SkNx_neon.h"
-#else
-
-    AI static Sk4i Sk4f_round(const Sk4f& x) {
-        return { (int) lrintf (x[0]),
-                 (int) lrintf (x[1]),
-                 (int) lrintf (x[2]),
-                 (int) lrintf (x[3]), };
-    }
-
-#endif
-
-AI static void Sk4f_ToBytes(uint8_t p[16],
-                            const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
-    SkNx_cast<uint8_t>(SkNx_join(SkNx_join(a,b), SkNx_join(c,d))).store(p);
-}
-
-#undef AI
-
-#endif//SkNx_DEFINED
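The header deleted above rests on one core trick: SkNx<N,T> holds two
SkNx<N/2,T> halves and forwards every operation to them, so only the N == 1
scalar case (or a platform specialization, below) does real work. A
standalone sketch of that pattern (simplified names, not Skia code):

    #include <algorithm>

    template <int N, typename T>
    struct Vec {
        Vec<N/2, T> lo, hi;  // recursive halves
        T min() const { return std::min(lo.min(), hi.min()); }
        Vec operator+(const Vec& y) const { return {lo + y.lo, hi + y.hi}; }
    };

    template <typename T>
    struct Vec<1, T> {       // the recursion bottoms out at a scalar
        T val;
        T min() const { return val; }
        Vec operator+(const Vec& y) const { return {val + y.val}; }
    };

    int main() {
        Vec<4, int> a{{{1}, {2}}, {{3}, {4}}};  // lanes 1,2,3,4
        return (a + a).min();                   // lanes 2,4,6,8 -> returns 2
    }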
@@ -1,713 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkNx_neon_DEFINED
-#define SkNx_neon_DEFINED
-
-#include <arm_neon.h>
-
-namespace { // NOLINT(google-build-namespaces)
-
-// ARMv8 has vrndm(q)_f32 to floor floats. Here we emulate it:
-//   - roundtrip through integers via truncation
-//   - subtract 1 if that's too big (possible for negative values).
-// This restricts the domain of our inputs to a maximum somehwere around 2^31. Seems plenty big.
-AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {
-    auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
-    auto too_big = vcgtq_f32(roundtrip, v);
-    return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)));
-}
-AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {
-    auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
-    auto too_big = vcgt_f32(roundtrip, v);
-    return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1)));
-}
-
-template <>
-class SkNx<2, float> {
-public:
-    AI SkNx(float32x2_t vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(float val) : fVec(vdup_n_f32(val)) {}
-    AI SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }
-
-    AI static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
-    AI void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }
-
-    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
-        float32x2x2_t xy = vld2_f32((const float*) ptr);
-        *x = xy.val[0];
-        *y = xy.val[1];
-    }
-
-    AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
-        float32x2x2_t ab = {{
-            a.fVec,
-            b.fVec,
-        }};
-        vst2_f32((float*) dst, ab);
-    }
-
-    AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
-        float32x2x3_t abc = {{
-            a.fVec,
-            b.fVec,
-            c.fVec,
-        }};
-        vst3_f32((float*) dst, abc);
-    }
-
-    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
-        float32x2x4_t abcd = {{
-            a.fVec,
-            b.fVec,
-            c.fVec,
-            d.fVec,
-        }};
-        vst4_f32((float*) dst, abcd);
-    }
-
-    AI SkNx operator - () const { return vneg_f32(fVec); }
-
-    AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
-    AI SkNx operator / (const SkNx& o) const {
-    #if defined(SK_CPU_ARM64)
-        return vdiv_f32(fVec, o.fVec);
-    #else
-        float32x2_t est0 = vrecpe_f32(o.fVec),
-                    est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
-                    est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
-        return vmul_f32(fVec, est2);
-    #endif
-    }
-
-    AI SkNx operator==(const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
-    AI SkNx operator <(const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
-    AI SkNx operator >(const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
-    AI SkNx operator<=(const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
-    AI SkNx operator>=(const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
-    AI SkNx operator!=(const SkNx& o) const {
-        return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
-    }
-
-    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
-    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }
-
-    AI SkNx abs() const { return vabs_f32(fVec); }
-    AI SkNx floor() const {
-    #if defined(SK_CPU_ARM64)
-        return vrndm_f32(fVec);
-    #else
-        return emulate_vrndm_f32(fVec);
-    #endif
-    }
-
-    AI SkNx sqrt() const {
-    #if defined(SK_CPU_ARM64)
-        return vsqrt_f32(fVec);
-    #else
-        float32x2_t est0 = vrsqrte_f32(fVec),
-                    est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0),
-                    est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
-        return vmul_f32(fVec, est2);
-    #endif
-    }
-
-    AI float operator[](int k) const {
-        SkASSERT(0 <= k && k < 2);
-        union { float32x2_t v; float fs[2]; } pun = {fVec};
-        return pun.fs[k&1];
-    }
-
-    AI bool allTrue() const {
-    #if defined(SK_CPU_ARM64)
-        return 0 != vminv_u32(vreinterpret_u32_f32(fVec));
-    #else
-        auto v = vreinterpret_u32_f32(fVec);
-        return vget_lane_u32(v,0) && vget_lane_u32(v,1);
-    #endif
-    }
-    AI bool anyTrue() const {
-    #if defined(SK_CPU_ARM64)
-        return 0 != vmaxv_u32(vreinterpret_u32_f32(fVec));
-    #else
-        auto v = vreinterpret_u32_f32(fVec);
-        return vget_lane_u32(v,0) || vget_lane_u32(v,1);
-    #endif
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbsl_f32(vreinterpret_u32_f32(fVec), t.fVec, e.fVec);
-    }
-
-    float32x2_t fVec;
-};
-
-template <>
-class SkNx<4, float> {
-public:
-    AI SkNx(float32x4_t vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(float val) : fVec(vdupq_n_f32(val)) {}
-    AI SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
-
-    AI static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
-    AI void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
-
-    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
-        float32x4x2_t xy = vld2q_f32((const float*) ptr);
-        *x = xy.val[0];
-        *y = xy.val[1];
-    }
-
-    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
-        float32x4x4_t rgba = vld4q_f32((const float*) ptr);
-        *r = rgba.val[0];
-        *g = rgba.val[1];
-        *b = rgba.val[2];
-        *a = rgba.val[3];
-    }
-    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
-        float32x4x4_t rgba = {{
-            r.fVec,
-            g.fVec,
-            b.fVec,
-            a.fVec,
-        }};
-        vst4q_f32((float*) dst, rgba);
-    }
-
-    AI SkNx operator - () const { return vnegq_f32(fVec); }
-
-    AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
-    AI SkNx operator / (const SkNx& o) const {
-    #if defined(SK_CPU_ARM64)
-        return vdivq_f32(fVec, o.fVec);
-    #else
-        float32x4_t est0 = vrecpeq_f32(o.fVec),
-                    est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
-                    est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
-        return vmulq_f32(fVec, est2);
-    #endif
-    }
-
-    AI SkNx operator==(const SkNx& o) const {return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec));}
-    AI SkNx operator <(const SkNx& o) const {return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec));}
-    AI SkNx operator >(const SkNx& o) const {return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec));}
-    AI SkNx operator<=(const SkNx& o) const {return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec));}
-    AI SkNx operator>=(const SkNx& o) const {return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec));}
-    AI SkNx operator!=(const SkNx& o) const {
-        return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
-    }
-
-    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
-    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }
-
-    AI SkNx abs() const { return vabsq_f32(fVec); }
-    AI SkNx floor() const {
-    #if defined(SK_CPU_ARM64)
-        return vrndmq_f32(fVec);
-    #else
-        return emulate_vrndmq_f32(fVec);
-    #endif
-    }
-
-
-    AI SkNx sqrt() const {
-    #if defined(SK_CPU_ARM64)
-        return vsqrtq_f32(fVec);
-    #else
-        float32x4_t est0 = vrsqrteq_f32(fVec),
-                    est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0),
-                    est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
-        return vmulq_f32(fVec, est2);
-    #endif
-    }
-
-    AI float operator[](int k) const {
-        SkASSERT(0 <= k && k < 4);
-        union { float32x4_t v; float fs[4]; } pun = {fVec};
-        return pun.fs[k&3];
-    }
-
-    AI float min() const {
-    #if defined(SK_CPU_ARM64)
-        return vminvq_f32(fVec);
-    #else
-        SkNx min = Min(*this, vrev64q_f32(fVec));
-        return std::min(min[0], min[2]);
-    #endif
-    }
-
-    AI float max() const {
-    #if defined(SK_CPU_ARM64)
-        return vmaxvq_f32(fVec);
-    #else
-        SkNx max = Max(*this, vrev64q_f32(fVec));
-        return std::max(max[0], max[2]);
-    #endif
-    }
-
-    AI bool allTrue() const {
-    #if defined(SK_CPU_ARM64)
-        return 0 != vminvq_u32(vreinterpretq_u32_f32(fVec));
-    #else
-        auto v = vreinterpretq_u32_f32(fVec);
-        return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
-            && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
-    #endif
-    }
-    AI bool anyTrue() const {
-    #if defined(SK_CPU_ARM64)
-        return 0 != vmaxvq_u32(vreinterpretq_u32_f32(fVec));
-    #else
-        auto v = vreinterpretq_u32_f32(fVec);
-        return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
-            || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
-    #endif
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
-    }
-
-    float32x4_t fVec;
-};
-
-#if defined(SK_CPU_ARM64)
-    AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
-        return vfmaq_f32(a.fVec, f.fVec, m.fVec);
-    }
-#endif
-
-// It's possible that for our current use cases, representing this as
-// half a uint16x8_t might be better than representing it as a uint16x4_t.
-// It'd make conversion to Sk4b one step simpler.
-template <>
-class SkNx<4, uint16_t> {
-public:
-    AI SkNx(const uint16x4_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
-    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
-        fVec = (uint16x4_t) { a,b,c,d };
-    }
-
-    AI static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
-    AI void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
-
-    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
-        uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
-        *r = rgba.val[0];
-        *g = rgba.val[1];
-        *b = rgba.val[2];
-        *a = rgba.val[3];
-    }
-    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
-        uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr);
-        *r = rgba.val[0];
-        *g = rgba.val[1];
-        *b = rgba.val[2];
-    }
-    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
-        uint16x4x4_t rgba = {{
-            r.fVec,
-            g.fVec,
-            b.fVec,
-            a.fVec,
-        }};
-        vst4_u16((uint16_t*) dst, rgba);
-    }
-
-    AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
-    AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); }
-    AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); }
-
-    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
-    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
-
-    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }
-
-    AI uint16_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 4);
-        union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
-        return pun.us[k&3];
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbsl_u16(fVec, t.fVec, e.fVec);
-    }
-
-    uint16x4_t fVec;
-};
-
-template <>
-class SkNx<8, uint16_t> {
-public:
-    AI SkNx(const uint16x8_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
-    AI static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); }
-
-    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-            uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
-        fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
-    }
-
-    AI void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }
-
-    AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
-    AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); }
-    AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); }
-
-    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
-    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
-
-    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }
-
-    AI uint16_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 8);
-        union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
-        return pun.us[k&7];
-    }
-
-    AI SkNx mulHi(const SkNx& m) const {
-        uint32x4_t hi = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec));
-        uint32x4_t lo = vmull_u16( vget_low_u16(fVec), vget_low_u16(m.fVec));
-
-        return { vcombine_u16(vshrn_n_u32(lo,16), vshrn_n_u32(hi,16)) };
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbslq_u16(fVec, t.fVec, e.fVec);
-    }
-
-    uint16x8_t fVec;
-};
-
-template <>
-class SkNx<4, uint8_t> {
-public:
-    typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;
-
-    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
-        fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
-    }
-    AI static SkNx Load(const void* ptr) {
-        return (uint8x8_t)vld1_dup_u32((const unaligned_uint32_t*)ptr);
-    }
-    AI void store(void* ptr) const {
-        return vst1_lane_u32((unaligned_uint32_t*)ptr, (uint32x2_t)fVec, 0);
-    }
-    AI uint8_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 4);
-        union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
-        return pun.us[k&3];
-    }
-
-    // TODO as needed
-
-    uint8x8_t fVec;
-};
-
-template <>
-class SkNx<8, uint8_t> {
-public:
-    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {}
-    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-            uint8_t e, uint8_t f, uint8_t g, uint8_t h) {
-        fVec = (uint8x8_t) { a,b,c,d, e,f,g,h };
-    }
-
-    AI static SkNx Load(const void* ptr) { return vld1_u8((const uint8_t*)ptr); }
-    AI void store(void* ptr) const { vst1_u8((uint8_t*)ptr, fVec); }
-
-    AI uint8_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 8);
-        union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
-        return pun.us[k&7];
-    }
-
-    uint8x8_t fVec;
-};
-
-template <>
-class SkNx<16, uint8_t> {
-public:
-    AI SkNx(const uint8x16_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
-    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
-            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
-            uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
-        fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
-    }
-
-    AI static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }
-    AI void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }
-
-    AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }
-
-    AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
-    AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); }
-
-    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
-    AI SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
-
-    AI uint8_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 16);
-        union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
-        return pun.us[k&15];
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbslq_u8(fVec, t.fVec, e.fVec);
-    }
-
-    uint8x16_t fVec;
-};
-
-template <>
-class SkNx<4, int32_t> {
-public:
-    AI SkNx(const int32x4_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(int32_t v) {
-        fVec = vdupq_n_s32(v);
-    }
-    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
-        fVec = (int32x4_t){a,b,c,d};
-    }
-    AI static SkNx Load(const void* ptr) {
-        return vld1q_s32((const int32_t*)ptr);
-    }
-    AI void store(void* ptr) const {
-        return vst1q_s32((int32_t*)ptr, fVec);
-    }
-    AI int32_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 4);
-        union { int32x4_t v; int32_t is[4]; } pun = {fVec};
-        return pun.is[k&3];
-    }
-
-    AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
-
-    AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
-    AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
-    AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }
-
-    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
-    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
-
-    AI SkNx operator == (const SkNx& o) const {
-        return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec));
-    }
-    AI SkNx operator < (const SkNx& o) const {
-        return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec));
-    }
-    AI SkNx operator > (const SkNx& o) const {
-        return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec));
-    }
-
-    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
-    AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); }
-    // TODO as needed
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec);
-    }
-
-    AI SkNx abs() const { return vabsq_s32(fVec); }
-
-    int32x4_t fVec;
-};
-
-template <>
-class SkNx<4, uint32_t> {
-public:
-    AI SkNx(const uint32x4_t& vec) : fVec(vec) {}
-
-    AI SkNx() {}
-    AI SkNx(uint32_t v) {
-        fVec = vdupq_n_u32(v);
-    }
-    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-        fVec = (uint32x4_t){a,b,c,d};
-    }
-    AI static SkNx Load(const void* ptr) {
-        return vld1q_u32((const uint32_t*)ptr);
-    }
-    AI void store(void* ptr) const {
-        return vst1q_u32((uint32_t*)ptr, fVec);
-    }
-    AI uint32_t operator[](int k) const {
-        SkASSERT(0 <= k && k < 4);
-        union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
-        return pun.us[k&3];
-    }
-
-    AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
-    AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
-    AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }
-
-    AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
-    AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
-    AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }
-
-    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
-    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
-
-    AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
-    AI SkNx operator < (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
-    AI SkNx operator > (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }
-
-    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
-    // TODO as needed
-
-    AI SkNx mulHi(const SkNx& m) const {
-        uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
-        uint64x2_t lo = vmull_u32( vget_low_u32(fVec), vget_low_u32(m.fVec));
-
-        return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) };
-    }
-
-    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
-        return vbslq_u32(fVec, t.fVec, e.fVec);
-    }
-
-    uint32x4_t fVec;
-};
-
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
-    return vcvtq_s32_f32(src.fVec);
-
-}
-template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
-    return vcvtq_f32_s32(src.fVec);
-}
-template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
-    return SkNx_cast<float>(Sk4i::Load(&src));
-}
-
-template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
-    return vqmovn_u32(vcvtq_u32_f32(src.fVec));
-}
-
-template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
-    return vcvtq_f32_u32(vmovl_u16(src.fVec));
-}
-
-template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
-    uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
-    uint16x4_t _16 = vqmovn_u32(_32);
-    return vqmovn_u16(vcombine_u16(_16, _16));
-}
-
-template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
-    uint16x8_t _16 = vmovl_u8(src.fVec);
-    return vmovl_u16(vget_low_u16(_16));
-}
-
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
-    return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec);
-}
-
-template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
-    return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec);
-}
-
-template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
-    Sk8f ab, cd;
-    SkNx_split(src, &ab, &cd);
-
-    Sk4f a,b,c,d;
-    SkNx_split(ab, &a, &b);
-    SkNx_split(cd, &c, &d);
-    return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
-                             (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
-                    vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
-                             (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0];
-}
-
-template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
-    Sk4i a, b;
-    SkNx_split(src, &a, &b);
-    uint16x4_t a16 = vqmovun_s32(a.fVec);
-    uint16x4_t b16 = vqmovun_s32(b.fVec);
-
-    return vqmovn_u16(vcombine_u16(a16, b16));
-}
-
-template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
-    return vget_low_u16(vmovl_u8(src.fVec));
-}
-
-template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
-    return vmovl_u8(src.fVec);
-}
-
-template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
-    return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
-}
-
-template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
-    return vqmovn_u16(src.fVec);
-}
-
-template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
-    uint16x4_t _16 = vqmovun_s32(src.fVec);
-    return vqmovn_u16(vcombine_u16(_16, _16));
-}
-
-template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
-    uint16x4_t _16 = vqmovn_u32(src.fVec);
-    return vqmovn_u16(vcombine_u16(_16, _16));
-}
-
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
-    return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
-}
-
-template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
-    return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
-}
-
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
-    return vreinterpretq_s32_u32(src.fVec);
-}
-
-AI static Sk4i Sk4f_round(const Sk4f& x) {
-    return vcvtq_s32_f32((x + 0.5f).fVec);
-}
-
-} // namespace
-
-#endif//SkNx_neon_DEFINED
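Both deleted backends share the same floor() fallback for pre-ARMv8 NEON and
pre-SSE4.1 x86: roundtrip through integer truncation, then subtract 1 where
truncation rounded a negative value up. A scalar sketch of that logic
(illustrative only; valid for |x| below roughly 2^31, as the deleted
comments note):

    static float emulate_floor(float x) {
        float roundtrip = (float)(int)x;         // truncates toward zero
        return roundtrip > x ? roundtrip - 1.0f  // e.g. -1.5f -> -2.0f
                             : roundtrip;        // e.g.  1.5f ->  1.0f
    }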
@ -1,823 +0,0 @@
|
||||
/*
|
||||
* Copyright 2015 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkNx_sse_DEFINED
|
||||
#define SkNx_sse_DEFINED
|
||||
|
||||
#include "include/core/SkTypes.h"
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
#include <smmintrin.h>
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
|
||||
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.
|
||||
|
||||
namespace { // NOLINT(google-build-namespaces)
|
||||
|
||||
// Emulate _mm_floor_ps() with SSE2:
|
||||
// - roundtrip through integers via truncation
|
||||
// - subtract 1 if that's too big (possible for negative values).
|
||||
// This restricts the domain of our inputs to a maximum somehwere around 2^31.
|
||||
// Seems plenty big.
|
||||
AI static __m128 emulate_mm_floor_ps(__m128 v) {
|
||||
__m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
|
||||
__m128 too_big = _mm_cmpgt_ps(roundtrip, v);
|
||||
return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
|
||||
}
|
||||
|
||||
template <>
|
||||
class SkNx<2, float> {
|
||||
public:
|
||||
AI SkNx(const __m128& vec) : fVec(vec) {}
|
||||
|
||||
AI SkNx() {}
|
||||
AI SkNx(float val) : fVec(_mm_set1_ps(val)) {}
|
||||
AI static SkNx Load(const void* ptr) {
|
||||
return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
|
||||
}
|
||||
AI SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}
|
||||
|
||||
AI void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }
|
||||
|
||||
AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
|
||||
const float* m = (const float*)ptr;
|
||||
*x = SkNx{m[0], m[2]};
|
||||
*y = SkNx{m[1], m[3]};
|
||||
}
|
||||
|
||||
AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
|
||||
auto vals = _mm_unpacklo_ps(a.fVec, b.fVec);
|
||||
_mm_storeu_ps((float*)dst, vals);
|
||||
}
|
||||
|
||||
AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
|
||||
auto lo = _mm_setr_ps(a[0], b[0], c[0], a[1]),
|
||||
hi = _mm_setr_ps(b[1], c[1], 0, 0);
|
||||
_mm_storeu_ps((float*)dst, lo);
|
||||
_mm_storel_pi(((__m64*)dst) + 2, hi);
|
||||
}
|
||||
|
||||
AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
|
||||
auto lo = _mm_setr_ps(a[0], b[0], c[0], d[0]),
|
||||
hi = _mm_setr_ps(a[1], b[1], c[1], d[1]);
|
||||
_mm_storeu_ps((float*)dst, lo);
|
||||
_mm_storeu_ps(((float*)dst) + 4, hi);
|
||||
}
|
||||
|
||||
AI SkNx operator - () const { return _mm_xor_ps(_mm_set1_ps(-0.0f), fVec); }
|
||||
|
||||
AI SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
|
||||
AI SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
|
||||
AI SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
|
||||
AI SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }
|
||||
|
||||
AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
|
||||
AI SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
|
||||
AI SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
|
||||
AI SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
|
||||
AI SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
|
||||
AI SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
|
||||
|
||||
AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
|
||||
AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }
|
||||
|
||||
AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
|
||||
AI SkNx floor() const {
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
return _mm_floor_ps(fVec);
|
||||
#else
|
||||
return emulate_mm_floor_ps(fVec);
|
||||
#endif
|
||||
}
|
||||
|
||||
AI SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
|
||||
|
||||
AI float operator[](int k) const {
|
||||
SkASSERT(0 <= k && k < 2);
|
||||
union { __m128 v; float fs[4]; } pun = {fVec};
|
||||
return pun.fs[k&1];
|
||||
}
|
||||
|
||||
AI bool allTrue() const { return 0b11 == (_mm_movemask_ps(fVec) & 0b11); }
|
||||
AI bool anyTrue() const { return 0b00 != (_mm_movemask_ps(fVec) & 0b11); }
|
||||
|
||||
AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
return _mm_blendv_ps(e.fVec, t.fVec, fVec);
|
||||
#else
|
||||
return _mm_or_ps(_mm_and_ps (fVec, t.fVec),
|
||||
_mm_andnot_ps(fVec, e.fVec));
|
||||
#endif
|
||||
}
|
||||
|
||||
__m128 fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNx<4, float> {
|
||||
public:
|
||||
AI SkNx(const __m128& vec) : fVec(vec) {}
|
||||
|
||||
AI SkNx() {}
|
||||
AI SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
|
||||
AI SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
|
||||
|
||||
AI static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
|
||||
AI void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }
|
||||
|
||||
AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
|
||||
SkNx lo = SkNx::Load((const float*)ptr+0),
|
||||
hi = SkNx::Load((const float*)ptr+4);
|
||||
*x = SkNx{lo[0], lo[2], hi[0], hi[2]};
|
||||
*y = SkNx{lo[1], lo[3], hi[1], hi[3]};
|
||||
}
|
||||
|
||||
AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
|
||||
__m128 v0 = _mm_loadu_ps(((float*)ptr) + 0),
|
||||
v1 = _mm_loadu_ps(((float*)ptr) + 4),
|
||||
v2 = _mm_loadu_ps(((float*)ptr) + 8),
|
||||
v3 = _mm_loadu_ps(((float*)ptr) + 12);
|
||||
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
|
||||
*r = v0;
|
||||
*g = v1;
|
||||
*b = v2;
|
||||
*a = v3;
|
||||
}
|
||||
AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
|
||||
__m128 v0 = r.fVec,
|
||||
v1 = g.fVec,
|
||||
v2 = b.fVec,
|
||||
v3 = a.fVec;
|
||||
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
|
||||
_mm_storeu_ps(((float*) dst) + 0, v0);
|
||||
_mm_storeu_ps(((float*) dst) + 4, v1);
|
||||
_mm_storeu_ps(((float*) dst) + 8, v2);
|
||||
_mm_storeu_ps(((float*) dst) + 12, v3);
|
||||
}

    AI SkNx operator - () const { return _mm_xor_ps(_mm_set1_ps(-0.0f), fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    AI SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    AI SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    AI SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    AI SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    AI SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
    AI SkNx floor() const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_floor_ps(fVec);
    #else
        return emulate_mm_floor_ps(fVec);
    #endif
    }

    AI SkNx sqrt() const { return _mm_sqrt_ps (fVec); }

    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    AI float min() const {
        SkNx min = Min(*this, _mm_shuffle_ps(fVec, fVec, _MM_SHUFFLE(2,3,0,1)));
        min = Min(min, _mm_shuffle_ps(min.fVec, min.fVec, _MM_SHUFFLE(0,1,2,3)));
        return min[0];
    }

    AI float max() const {
        SkNx max = Max(*this, _mm_shuffle_ps(fVec, fVec, _MM_SHUFFLE(2,3,0,1)));
        max = Max(max, _mm_shuffle_ps(max.fVec, max.fVec, _MM_SHUFFLE(0,1,2,3)));
        return max[0];
    }
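    // (min()/max() above are two-step horizontal reductions: fold lanes {2,3,0,1} onto
    // {0,1,2,3}, then fold the reversed result, leaving the answer in every lane; lane 0
    // is then extracted.)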

    AI bool allTrue() const { return 0b1111 == _mm_movemask_ps(fVec); }
    AI bool anyTrue() const { return 0b0000 != _mm_movemask_ps(fVec); }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_ps(e.fVec, t.fVec, fVec);
    #else
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    #endif
    }

    __m128 fVec;
};

AI static __m128i mullo32(__m128i a, __m128i b) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    return _mm_mullo_epi32(a, b);
#else
    __m128i mul20 = _mm_mul_epu32(a, b),
            mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                              _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
#endif
}
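// (The SSE2 path of mullo32 works around the missing _mm_mullo_epi32: _mm_mul_epu32
// multiplies the even lanes (0 and 2), a 4-byte shift exposes the odd lanes (1 and 3),
// and the shuffles plus the final unpack gather the low 32 bits of each 64-bit product
// back into lane order.)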
template <>
class SkNx<4, int32_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
    AI SkNx operator  < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
    AI SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }

    AI int32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; int32_t is[4]; } pun = {fVec};
        return pun.is[k&3];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
    #else
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    #endif
    }

    AI SkNx abs() const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
        return _mm_abs_epi32(fVec);
    #else
        SkNx mask = (*this) >> 31;
        return (mask ^ (*this)) - mask;
    #endif
    }
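    // (The SSE2 abs() above uses the two's-complement identity |x| = (x ^ m) - m with
    // m = x >> 31: the arithmetic shift yields 0 or -1 per lane, so negative lanes are
    // bit-flipped and incremented while non-negative lanes pass through unchanged.)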

    AI static SkNx Min(const SkNx& x, const SkNx& y) {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_min_epi32(x.fVec, y.fVec);
    #else
        return (x < y).thenElse(x, y);
    #endif
    }

    AI static SkNx Max(const SkNx& x, const SkNx& y) {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_max_epi32(x.fVec, y.fVec);
    #else
        return (x > y).thenElse(x, y);
    #endif
    }

    __m128i fVec;
};

template <>
class SkNx<2, uint32_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint32_t val) : fVec(_mm_set1_epi32((int)val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    AI SkNx(uint32_t a, uint32_t b) : fVec(_mm_setr_epi32((int)a,(int)b,0,0)) {}

    AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
    AI SkNx operator != (const SkNx& o) const { return (*this == o) ^ 0xffffffff; }
    // operator < and > take a little extra fiddling to make work for unsigned ints.
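    // (A sketch of that fiddling, were it needed here: flip the sign bit of both sides
    // with _mm_xor_si128(v, _mm_set1_epi32(0x80000000)) to bias the values into signed
    // range, then use the signed _mm_cmplt_epi32/_mm_cmpgt_epi32 -- exactly the trick the
    // uint8_t specializations below apply with 0x80 before _mm_cmplt_epi8.)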

    AI uint32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128i v; uint32_t us[4]; } pun = {fVec};
        return pun.us[k&1];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
    #else
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    #endif
    }

    AI bool allTrue() const { return 0xff == (_mm_movemask_epi8(fVec) & 0xff); }

    __m128i fVec;
};

template <>
class SkNx<4, uint32_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint32_t val) : fVec(_mm_set1_epi32((int)val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
        : fVec(_mm_setr_epi32((int)a,(int)b,(int)c,(int)d)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
    AI SkNx operator != (const SkNx& o) const { return (*this == o) ^ 0xffffffff; }

    // operator < and > take a little extra fiddling to make work for unsigned ints.

    AI uint32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint32_t us[4]; } pun = {fVec};
        return pun.us[k&3];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
    #else
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    #endif
    }

    AI SkNx mulHi(SkNx m) const {
        SkNx v20{_mm_mul_epu32(m.fVec, fVec)};
        SkNx v31{_mm_mul_epu32(_mm_srli_si128(m.fVec, 4), _mm_srli_si128(fVec, 4))};

        return SkNx{v20[1], v31[1], v20[3], v31[3]};
    }
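    // (mulHi above forms the four full 64-bit products -- even lanes directly, odd lanes
    // after a 4-byte shift -- and gathers the high 32-bit half of each product back into
    // lane order via the odd indices of v20 and v31.)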

    __m128i fVec;
};

template <>
class SkNx<4, uint16_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(_mm_set1_epi16((short)val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
        : fVec(_mm_setr_epi16((short)a,(short)b,(short)c,(short)d,0,0,0,0)) {}

    AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
                hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
        __m128i even = _mm_unpacklo_epi16(lo, hi),   // r0 r2 g0 g2 b0 b2 a0 a2
                 odd = _mm_unpackhi_epi16(lo, hi);   // r1 r3 ...
        __m128i rg = _mm_unpacklo_epi16(even, odd),  // r0 r1 r2 r3 g0 g1 g2 g3
                ba = _mm_unpackhi_epi16(even, odd);  // b0 b1 ...  a0 a1 ...
        *r = rg;
        *g = _mm_srli_si128(rg, 8);
        *b = ba;
        *a = _mm_srli_si128(ba, 8);
    }
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        // The idea here is to get 4 vectors that are R G B _ _ _ _ _.
        // The second load is at a funny location to make sure we don't read past
        // the bounds of memory.  This is fine, we just need to shift it a little bit.
        const uint8_t* ptr8 = (const uint8_t*) ptr;
        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0));
        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
        __m128i rgb2 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 4*2)), 2*2);
        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);

        __m128i rrggbb01 = _mm_unpacklo_epi16(rgb0, rgb1);
        __m128i rrggbb23 = _mm_unpacklo_epi16(rgb2, rgb3);
        *r = _mm_unpacklo_epi32(rrggbb01, rrggbb23);
        *g = _mm_srli_si128(r->fVec, 4*2);
        *b = _mm_unpackhi_epi32(rrggbb01, rrggbb23);
    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
        __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
        __m128i lo = _mm_unpacklo_epi32(rg, ba);
        __m128i hi = _mm_unpackhi_epi32(rg, ba);
        _mm_storeu_si128(((__m128i*) dst) + 0, lo);
        _mm_storeu_si128(((__m128i*) dst) + 1, hi);
    }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(_mm_set1_epi16((short)val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
            uint16_t e, uint16_t f, uint16_t g, uint16_t h)
        : fVec(_mm_setr_epi16((short)a,(short)b,(short)c,(short)d,
                              (short)e,(short)f,(short)g,(short)h)) {}

    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        __m128i _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
                _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
                _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
                _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);

        __m128i _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
                _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
                _46 = _mm_unpacklo_epi16(_45, _67),
                _57 = _mm_unpackhi_epi16(_45, _67);

        __m128i rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
                ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
                rg4567 = _mm_unpacklo_epi16(_46, _57),
                ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        const uint8_t* ptr8 = (const uint8_t*) ptr;
        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0*2));
        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
        __m128i rgb2 = _mm_loadu_si128((const __m128i*) (ptr8 + 6*2));
        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
        __m128i rgb4 = _mm_loadu_si128((const __m128i*) (ptr8 + 12*2));
        __m128i rgb5 = _mm_srli_si128(rgb4, 3*2);
        __m128i rgb6 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 16*2)), 2*2);
        __m128i rgb7 = _mm_srli_si128(rgb6, 3*2);

        __m128i rgb01 = _mm_unpacklo_epi16(rgb0, rgb1);
        __m128i rgb23 = _mm_unpacklo_epi16(rgb2, rgb3);
        __m128i rgb45 = _mm_unpacklo_epi16(rgb4, rgb5);
        __m128i rgb67 = _mm_unpacklo_epi16(rgb6, rgb7);

        __m128i rg03 = _mm_unpacklo_epi32(rgb01, rgb23);
        __m128i bx03 = _mm_unpackhi_epi32(rgb01, rgb23);
        __m128i rg47 = _mm_unpacklo_epi32(rgb45, rgb67);
        __m128i bx47 = _mm_unpackhi_epi32(rgb45, rgb67);

        *r = _mm_unpacklo_epi64(rg03, rg47);
        *g = _mm_unpackhi_epi64(rg03, rg47);
        *b = _mm_unpacklo_epi64(bx03, bx47);
    }
    AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec),  // r0 g0 r1 g1 r2 g2 r3 g3
                rg4567 = _mm_unpackhi_epi16(r.fVec, g.fVec),  // r4 g4 r5 g5 r6 g6 r7 g7
                ba0123 = _mm_unpacklo_epi16(b.fVec, a.fVec),
                ba4567 = _mm_unpackhi_epi16(b.fVec, a.fVec);

        _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
        _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
        _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
        _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
    }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
        const uint16_t top = 0x8000;  // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16((short)top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }
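    // (The byte-wide _mm_add_epi8/_mm_sub_epi8 are safe here even though the lanes are
    // 16 bits wide: the bias pattern is 0x80 0x00 per lane, and because the low byte of
    // the bias is zero no carry or borrow ever crosses a byte boundary, so the byte-wise
    // add/sub match the 16-bit versions exactly.)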

    AI SkNx mulHi(const SkNx& m) const {
        return _mm_mulhi_epu16(fVec, m.fVec);
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    AI SkNx() {}
    AI SkNx(const __m128i& vec) : fVec(vec) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
        : fVec(_mm_setr_epi8((char)a,(char)b,(char)c,(char)d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {}

    AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&3];
    }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<8, uint8_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(_mm_set1_epi8((char)val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h)
        : fVec(_mm_setr_epi8((char)a,(char)b,(char)c,(char)d,
                             (char)e,(char)f,(char)g,(char)h,
                             0,0,0,0, 0,0,0,0)) {}

    AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    AI SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    AI SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(_mm_set1_epi8((char)val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
            uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8((char)a,(char)b,(char)c,(char)d,
                             (char)e,(char)f,(char)g,(char)h,
                             (char)i,(char)j,(char)k,(char)l,
                             (char)m,(char)n,(char)o,(char)p)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    AI SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
    return _mm_cvtepi32_ps(src.fVec);
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
    return SkNx_cast<float>(Sk4i::Load(&src));
}

template <> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
    return _mm_cvttps_epi32(src.fVec);
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
#if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    // TODO: This seems to be causing code generation problems.  Investigate?
    return _mm_packus_epi32(src.fVec);
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
    const int _ = ~0;
    return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.
    __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
    return _mm_packs_epi32(x,x);
#endif
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src));
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return _mm_packus_epi16(_16, _16);
#endif
}

template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
    return _mm_unpacklo_epi16(_16, _mm_setzero_si128());
#endif
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
    return SkNx_cast<uint32_t>(src).fVec;
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    return _mm_cvtepi32_ps(SkNx_cast<int32_t>(src).fVec);
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
    Sk4i lo, hi;
    SkNx_split(src, &lo, &hi);

    auto t = _mm_packs_epi32(lo.fVec, hi.fVec);
    return _mm_packus_epi16(t, t);
}

template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
    Sk8f ab, cd;
    SkNx_split(src, &ab, &cd);

    Sk4f a,b,c,d;
    SkNx_split(ab, &a, &b);
    SkNx_split(cd, &c, &d);

    return _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                             _mm_cvttps_epi32(b.fVec)),
                            _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                             _mm_cvttps_epi32(d.fVec)));
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
    return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
}


template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
    return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
    return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
    return src.fVec;
}

AI static Sk4i Sk4f_round(const Sk4f& x) {
    return _mm_cvtps_epi32(x.fVec);
}
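// (Note the contrast with SkNx_cast<int32_t, float> above: Sk4f_round uses
// _mm_cvtps_epi32, which rounds to nearest even, while the cast uses
// _mm_cvttps_epi32, which truncates toward zero.)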

} // namespace

#endif//SkNx_sse_DEFINED
@ -112,8 +112,8 @@ generated_cc_atom(
        ":Animator_hdr",
        ":VectorKeyframeAnimator_hdr",
        "//include/core:SkTypes_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTPin_hdr",
        "//include/private:SkVx_hdr",
        "//modules/skottie/src:SkottieJson_hdr",
        "//modules/skottie/src:SkottieValue_hdr",
        "//src/core:SkSafeMath_hdr",
@ -8,8 +8,8 @@
#include "modules/skottie/src/animator/VectorKeyframeAnimator.h"

#include "include/core/SkTypes.h"
#include "include/private/SkNx.h"
#include "include/private/SkTPin.h"
#include "include/private/SkVx.h"
#include "modules/skottie/src/SkottieJson.h"
#include "modules/skottie/src/SkottieValue.h"
#include "modules/skottie/src/animator/Animator.h"
@ -112,10 +112,12 @@ private:
        bool changed = false;

        while (count >= 4) {
            const auto old_val = Sk4f::Load(dst),
                       new_val = Lerp(Sk4f::Load(v0), Sk4f::Load(v1), lerp_info.weight);
            const auto old_val = skvx::float4::Load(dst),
                       new_val = Lerp(skvx::float4::Load(v0),
                                      skvx::float4::Load(v1),
                                      lerp_info.weight);

            changed |= (new_val != old_val).anyTrue();
            changed |= any(new_val != old_val);
            new_val.store(dst);

            v0 += 4;
@ -44,7 +44,6 @@ generated_cc_atom(
        "//include/core:SkCanvas_hdr",
        "//include/core:SkPictureRecorder_hdr",
        "//include/effects:SkRuntimeEffect_hdr",
        "//include/private:SkNx_hdr",
        "//modules/skottie/src:Adapter_hdr",
        "//modules/skottie/src:SkottieValue_hdr",
        "//modules/sksg/include:SkSGPaint_hdr",
@ -59,7 +58,8 @@ generated_cc_atom(
    visibility = ["//:__subpackages__"],
    deps = [
        ":Effects_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkColorData_hdr",
        "//include/private:SkVx_hdr",
        "//modules/skottie/src:Adapter_hdr",
        "//modules/skottie/src:SkottieValue_hdr",
        "//modules/sksg/include:SkSGColorFilter_hdr",
@ -10,7 +10,6 @@
#include "include/core/SkCanvas.h"
#include "include/core/SkPictureRecorder.h"
#include "include/effects/SkRuntimeEffect.h"
#include "include/private/SkNx.h"
#include "modules/skottie/src/Adapter.h"
#include "modules/skottie/src/SkottieValue.h"
#include "modules/sksg/include/SkSGPaint.h"
@ -7,7 +7,8 @@

#include "modules/skottie/src/effects/Effects.h"

#include "include/private/SkNx.h"
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "modules/skottie/src/Adapter.h"
#include "modules/skottie/src/SkottieValue.h"
#include "modules/sksg/include/SkSGColorFilter.h"
@ -50,13 +51,11 @@ class CCTonerAdapter final : public DiscardableAdapterBase<CCTonerAdapter,
    }
private:
    static SkColor lerpColor(SkColor c0, SkColor c1, float t) {
        const auto c0_4f = SkNx_cast<float>(Sk4b::Load(&c0)),
                   c1_4f = SkNx_cast<float>(Sk4b::Load(&c1)),
        const auto c0_4f = Sk4f_fromL32(c0),
                   c1_4f = Sk4f_fromL32(c1),
                   c_4f  = c0_4f + (c1_4f - c0_4f) * t;

        SkColor c;
        SkNx_cast<uint8_t>(Sk4f_round(c_4f)).store(&c);
        return c;
        return Sk4f_toL32(c_4f);
    }

    void onSync() override {
@ -117,7 +117,8 @@ generated_cc_atom(
        ":TextAnimator_hdr",
        "//include/core:SkColor_hdr",
        "//include/core:SkPoint_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkColorData_hdr",
        "//include/private:SkVx_hdr",
        "//modules/skottie/src:SkottieValue_hdr",
        "//modules/skottie/src/animator:Animator_hdr",
        "//src/utils:SkJSON_hdr",
@ -9,7 +9,8 @@

#include "include/core/SkColor.h"
#include "include/core/SkPoint.h"
#include "include/private/SkNx.h"
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "modules/skottie/src/SkottieValue.h"
#include "modules/skottie/src/animator/Animator.h"
#include "modules/skottie/src/text/RangeSelector.h"
@ -133,13 +134,11 @@ TextAnimator::ResolvedProps TextAnimator::modulateProps(const ResolvedProps& pro
        return v0 + (v1 - v0)*t;
    };
    const auto lerp_color = [](SkColor c0, SkColor c1, float t) {
        const auto c0_4f = SkNx_cast<float>(Sk4b::Load(&c0)),
                   c1_4f = SkNx_cast<float>(Sk4b::Load(&c1)),
        const auto c0_4f = Sk4f_fromL32(c0),
                   c1_4f = Sk4f_fromL32(c1),
                   c_4f  = c0_4f + (c1_4f - c0_4f) * t;

        SkColor c;
        SkNx_cast<uint8_t>(Sk4f_round(c_4f)).store(&c);
        return c;
        return Sk4f_toL32(c_4f);
    };

    // Colors and opacity are interpolated, and use a clamped amount value.
@ -291,9 +291,6 @@ BASE_SRCS_ALL = [
    "include/private/SkMalloc.h",
    "include/private/SkMutex.h",
    "include/private/SkNoncopyable.h",
    "include/private/SkNx.h",
    "include/private/SkNx_neon.h",
    "include/private/SkNx_sse.h",
    "include/private/SkOnce.h",
    "include/private/SkOpts_spi.h",
    "include/private/SkPathRef.h",
@ -8,7 +8,7 @@
#include "include/core/SkCanvas.h"
#include "include/core/SkPaint.h"
#include "include/core/SkPath.h"
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"
#include "include/utils/SkRandom.h"
#include "samplecode/Sample.h"
#include "src/core/SkPathPriv.h"
@ -285,10 +285,10 @@ public:
            const Glyph& glyph = fGlyphs[i];
            const SkMatrix& backMatrix = fBackMatrices[i];

            const Sk2f matrix[3] = {
                Sk2f(backMatrix.getScaleX(), backMatrix.getSkewY()),
                Sk2f(backMatrix.getSkewX(), backMatrix.getScaleY()),
                Sk2f(backMatrix.getTranslateX(), backMatrix.getTranslateY())
            const skvx::float2 matrix[3] = {
                skvx::float2(backMatrix.getScaleX(), backMatrix.getSkewY()),
                skvx::float2(backMatrix.getSkewX(), backMatrix.getScaleY()),
                skvx::float2(backMatrix.getTranslateX(), backMatrix.getTranslateY())
            };

            SkPath* backpath = &fBackPaths[i];
@ -344,7 +344,7 @@ private:
    class Waves {
    public:
        void reset(SkRandom& rand, int w, int h);
        SkPoint apply(float tsec, const Sk2f matrix[3], const SkPoint& pt) const;
        SkPoint apply(float tsec, const skvx::float2 matrix[3], const SkPoint& pt) const;

    private:
        constexpr static double kAverageAngle = SK_ScalarPI / 8.0;
@ -383,7 +383,7 @@ void PathText::WavyGlyphAnimator::Waves::reset(SkRandom& rand, int w, int h) {
    }
}

SkPoint PathText::WavyGlyphAnimator::Waves::apply(float tsec, const Sk2f matrix[3],
SkPoint PathText::WavyGlyphAnimator::Waves::apply(float tsec, const skvx::float2 matrix[3],
                                                  const SkPoint& pt) const {
    constexpr static int kTablePeriod = 1 << 12;
    static float sin2table[kTablePeriod + 1];
@ -395,38 +395,37 @@ SkPoint PathText::WavyGlyphAnimator::Waves::apply(float tsec, const Sk2f matrix[
        }
    });

    const Sk4f amplitudes = Sk4f::Load(fAmplitudes);
    const Sk4f frequencies = Sk4f::Load(fFrequencies);
    const Sk4f dirsX = Sk4f::Load(fDirsX);
    const Sk4f dirsY = Sk4f::Load(fDirsY);
    const Sk4f speeds = Sk4f::Load(fSpeeds);
    const Sk4f offsets = Sk4f::Load(fOffsets);
    const auto amplitudes = skvx::float4::Load(fAmplitudes);
    const auto frequencies = skvx::float4::Load(fFrequencies);
    const auto dirsX = skvx::float4::Load(fDirsX);
    const auto dirsY = skvx::float4::Load(fDirsY);
    const auto speeds = skvx::float4::Load(fSpeeds);
    const auto offsets = skvx::float4::Load(fOffsets);

    float devicePt[2];
    (matrix[0] * pt.x() + matrix[1] * pt.y() + matrix[2]).store(devicePt);

    const Sk4f t = (frequencies * (dirsX * devicePt[0] + dirsY * devicePt[1]) +
                    speeds * tsec +
                    offsets).abs() * (float(kTablePeriod) / float(SK_ScalarPI));
    const skvx::float4 t = abs(frequencies * (dirsX * devicePt[0] + dirsY * devicePt[1]) +
                               speeds * tsec + offsets) * (float(kTablePeriod) / SK_ScalarPI);

    const Sk4i ipart = SkNx_cast<int>(t);
    const Sk4f fpart = t - SkNx_cast<float>(ipart);
    const skvx::int4 ipart = skvx::cast<int32_t>(t);
    const skvx::float4 fpart = t - skvx::cast<float>(ipart);

    int32_t indices[4];
    (ipart & (kTablePeriod-1)).store(indices);

    const Sk4f left(sin2table[indices[0]], sin2table[indices[1]],
                    sin2table[indices[2]], sin2table[indices[3]]);
    const Sk4f right(sin2table[indices[0] + 1], sin2table[indices[1] + 1],
                     sin2table[indices[2] + 1], sin2table[indices[3] + 1]);
    const Sk4f height = amplitudes * (left * (1.f - fpart) + right * fpart);
    const skvx::float4 left(sin2table[indices[0]], sin2table[indices[1]],
                            sin2table[indices[2]], sin2table[indices[3]]);
    const skvx::float4 right(sin2table[indices[0] + 1], sin2table[indices[1] + 1],
                             sin2table[indices[2] + 1], sin2table[indices[3] + 1]);
    const auto height = amplitudes * (left * (1.f - fpart) + right * fpart);

    Sk4f dy = height * dirsY;
    Sk4f dx = height * dirsX;
    auto dy = height * dirsY;
    auto dx = height * dirsX;

    float offsetY[4], offsetX[4];
    (dy + SkNx_shuffle<2,3,0,1>(dy)).store(offsetY);  // accumulate.
    (dx + SkNx_shuffle<2,3,0,1>(dx)).store(offsetX);
    (dy + skvx::shuffle<2,3,0,1>(dy)).store(offsetY);  // accumulate.
    (dx + skvx::shuffle<2,3,0,1>(dx)).store(offsetX);

    return {devicePt[0] + offsetY[0] + offsetY[1], devicePt[1] - offsetX[0] - offsetX[1]};
}
@ -760,7 +760,6 @@ generated_cc_atom(
        ":SkXfermodePriv_hdr",
        "//include/core:SkShader_hdr",
        "//include/private:SkColorData_hdr",
        "//include/private:SkNx_hdr",
    ],
)

@ -1140,7 +1139,6 @@ generated_cc_atom(
        "//include/effects:SkColorMatrix_hdr",
        "//include/effects:SkRuntimeEffect_hdr",
        "//include/private:SkColorData_hdr",
        "//include/private:SkNx_hdr",
        "//src/gpu/ganesh/effects:GrSkSLFP_hdr",
    ],
)
@ -1166,7 +1164,6 @@ generated_cc_atom(
        "//include/core:SkString_hdr",
        "//include/core:SkUnPreMultiply_hdr",
        "//include/effects:SkRuntimeEffect_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTDArray_hdr",
        "//include/third_party/skcms:skcms_hdr",
        "//src/gpu/ganesh:GrColorInfo_hdr",
@ -1369,8 +1366,8 @@ generated_cc_atom(
    deps = [
        ":SkOpts_hdr",
        "//include/core:SkCubicMap_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTPin_hdr",
        "//include/private:SkVx_hdr",
        "//src/pathops:SkPathOpsCubic_hdr",
    ],
)
@ -1768,7 +1765,7 @@ generated_cc_atom(
        ":SkVerticesPriv_hdr",
        "//include/core:SkColorSpace_hdr",
        "//include/core:SkString_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkVx_hdr",
        "//src/shaders:SkColorShader_hdr",
        "//src/shaders:SkComposeShader_hdr",
        "//src/shaders:SkShaderBase_hdr",
@ -2076,7 +2073,6 @@ generated_cc_atom(
        ":SkPointPriv_hdr",
        "//include/core:SkMatrix_hdr",
        "//include/core:SkPoint3_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTPin_hdr",
        "//include/private:SkVx_hdr",
        "//src/pathops:SkPathOpsCubic_hdr",
@ -3286,10 +3282,10 @@ generated_cc_atom(
        ":SkSafeMath_hdr",
        "//include/core:SkPath_hdr",
        "//include/core:SkRRect_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkOnce_hdr",
        "//include/private:SkPathRef_hdr",
        "//include/private:SkTo_hdr",
        "//include/private:SkVx_hdr",
    ],
)

@ -3334,7 +3330,6 @@ generated_cc_atom(
        "//include/core:SkStream_hdr",
        "//include/core:SkString_hdr",
        "//include/private:SkMacros_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkPathRef_hdr",
        "//include/private:SkTo_hdr",
        "//src/pathops:SkPathOpsPoint_hdr",
@ -3817,8 +3812,8 @@ generated_cc_atom(
        ":SkOpts_hdr",
        ":SkRasterPipeline_hdr",
        "//include/private:SkImageInfoPriv_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTemplates_hdr",
        "//include/private:SkVx_hdr",
        "//include/third_party/skcms:skcms_hdr",
    ],
)
@ -11,8 +11,6 @@
#include "src/core/SkCoreBlitters.h"
#include "src/core/SkXfermodePriv.h"

#include "include/private/SkNx.h"

static void D16_S32X_src(uint16_t dst[], const SkPMColor src[], int count, uint8_t coverage) {
    SkASSERT(coverage == 0xFF);
    for (int i = 0; i < count; ++i) {
@ -9,7 +9,6 @@
#include "include/core/SkString.h"
#include "include/core/SkUnPreMultiply.h"
#include "include/effects/SkRuntimeEffect.h"
#include "include/private/SkNx.h"
#include "include/private/SkTDArray.h"
#include "include/third_party/skcms/skcms.h"
#include "src/core/SkArenaAlloc.h"
@ -11,7 +11,6 @@
#include "include/effects/SkColorMatrix.h"
#include "include/effects/SkRuntimeEffect.h"
#include "include/private/SkColorData.h"
#include "include/private/SkNx.h"
#include "src/core/SkColorFilter_Matrix.h"
#include "src/core/SkColorSpacePriv.h"
#include "src/core/SkRasterPipeline.h"
@ -6,8 +6,8 @@
 */

#include "include/core/SkCubicMap.h"
#include "include/private/SkNx.h"
#include "include/private/SkTPin.h"
#include "include/private/SkVx.h"
#include "src/core/SkOpts.h"

//#define CUBICMAP_TRACK_MAX_ERROR
@ -84,10 +84,10 @@ SkCubicMap::SkCubicMap(SkPoint p1, SkPoint p2) {
    p1.fX = std::min(std::max(p1.fX, 0.0f), 1.0f);
    p2.fX = std::min(std::max(p2.fX, 0.0f), 1.0f);

    Sk2s s1 = Sk2s::Load(&p1) * 3;
    Sk2s s2 = Sk2s::Load(&p2) * 3;
    auto s1 = skvx::float2::Load(&p1) * 3;
    auto s2 = skvx::float2::Load(&p2) * 3;

    (Sk2s(1) + s1 - s2).store(&fCoeff[0]);
    (1 + s1 - s2).store(&fCoeff[0]);
    (s2 - s1 - s1).store(&fCoeff[1]);
    s1.store(&fCoeff[2]);

@ -100,9 +100,9 @@ SkCubicMap::SkCubicMap(SkPoint p1, SkPoint p2) {
}

SkPoint SkCubicMap::computeFromT(float t) const {
    Sk2s a = Sk2s::Load(&fCoeff[0]);
    Sk2s b = Sk2s::Load(&fCoeff[1]);
    Sk2s c = Sk2s::Load(&fCoeff[2]);
    auto a = skvx::float2::Load(&fCoeff[0]);
    auto b = skvx::float2::Load(&fCoeff[1]);
    auto c = skvx::float2::Load(&fCoeff[2]);

    SkPoint result;
    (((a * t + b) * t + c) * t).store(&result);
@ -7,7 +7,7 @@

#include "include/core/SkColorSpace.h"
#include "include/core/SkString.h"
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"
#include "src/core/SkArenaAlloc.h"
#include "src/core/SkAutoBlitterChoose.h"
#include "src/core/SkConvertPixels.h"
@ -28,8 +28,10 @@
struct Matrix43 {
    float fMat[12];    // column major

    Sk4f map(float x, float y) const {
        return Sk4f::Load(&fMat[0]) * x + Sk4f::Load(&fMat[4]) * y + Sk4f::Load(&fMat[8]);
    skvx::float4 map(float x, float y) const {
        return skvx::float4::Load(&fMat[0]) * x +
               skvx::float4::Load(&fMat[4]) * y +
               skvx::float4::Load(&fMat[8]);
    }

    // Pass a by value, so we don't have to worry about aliasing with this
@ -174,9 +176,9 @@ bool SkTriColorShader::update(const SkMatrix& ctmInv, const SkPoint pts[],

    fM33.setConcat(im, ctmInv);

    Sk4f c0 = Sk4f::Load(colors[index0].vec()),
         c1 = Sk4f::Load(colors[index1].vec()),
         c2 = Sk4f::Load(colors[index2].vec());
    auto c0 = skvx::float4::Load(colors[index0].vec()),
         c1 = skvx::float4::Load(colors[index1].vec()),
         c2 = skvx::float4::Load(colors[index2].vec());

    (c1 - c0).store(&fM43.fMat[0]);
    (c2 - c0).store(&fM43.fMat[4]);
@ -7,7 +7,6 @@

#include "include/core/SkMatrix.h"
#include "include/core/SkPoint3.h"
#include "include/private/SkNx.h"
#include "include/private/SkTPin.h"
#include "include/private/SkVx.h"
#include "src/core/SkGeometry.h"
@ -584,8 +583,8 @@ float SkMeasureNonInflectCubicRotation(const SkPoint pts[4]) {
    return 2*SK_ScalarPI - SkMeasureAngleBetweenVectors(a,-b) - SkMeasureAngleBetweenVectors(b,-c);
}

static Sk4f fma(const Sk4f& f, float m, const Sk4f& a) {
    return SkNx_fma(f, Sk4f(m), a);
static skvx::float4 fma(const skvx::float4& f, float m, const skvx::float4& a) {
    return skvx::fma(f, skvx::float4(m), a);
}

// Finds the root nearest 0.5. Returns 0.5 if the roots are undefined or outside 0..1.
@ -626,16 +625,16 @@ float SkFindCubicMidTangent(const SkPoint src[4]) {
    // |. . |   |bisector.y|
    //
    // The coeffs for the quadratic equation we need to solve are therefore: C' * bisector
    static const Sk4f kM[4] = {Sk4f(-1,  2, -1,  0),
                               Sk4f( 3, -4,  1,  0),
                               Sk4f(-3,  2,  0,  0)};
    Sk4f C_x = fma(kM[0], src[0].fX,
    static const skvx::float4 kM[4] = {skvx::float4(-1,  2, -1,  0),
                                       skvx::float4( 3, -4,  1,  0),
                                       skvx::float4(-3,  2,  0,  0)};
    auto C_x = fma(kM[0], src[0].fX,
               fma(kM[1], src[1].fX,
                   fma(kM[2], src[2].fX, Sk4f(src[3].fX, 0,0,0))));
    Sk4f C_y = fma(kM[0], src[0].fY,
                   fma(kM[2], src[2].fX, skvx::float4(src[3].fX, 0,0,0))));
    auto C_y = fma(kM[0], src[0].fY,
               fma(kM[1], src[1].fY,
                   fma(kM[2], src[2].fY, Sk4f(src[3].fY, 0,0,0))));
    Sk4f coeffs = C_x * bisector.x() + C_y * bisector.y();
                   fma(kM[2], src[2].fY, skvx::float4(src[3].fY, 0,0,0))));
    auto coeffs = C_x * bisector.x() + C_y * bisector.y();

    // Now solve the quadratic for T.
    float T = 0;
@ -379,7 +379,7 @@ struct SkConic {
                                   const SkMatrix*, SkConic conics[kMaxConicsForArc]);
};

// inline helpers are contained in a namespace to avoid external leakage to fragile SkNx members
// inline helpers are contained in a namespace to avoid external leakage to fragile SkVx members
namespace { // NOLINT(google-build-namespaces)

/**
@ -3290,7 +3290,6 @@ void SkPathPriv::CreateDrawArcPath(SkPath* path, const SkRect& oval, SkScalar st
}

///////////////////////////////////////////////////////////////////////////////////////////////////
#include "include/private/SkNx.h"

static int compute_quad_extremas(const SkPoint src[3], SkPoint extremas[3]) {
    SkScalar ts[2];
@ -9,9 +9,9 @@

#include "include/core/SkPath.h"
#include "include/core/SkRRect.h"
#include "include/private/SkNx.h"
#include "include/private/SkOnce.h"
#include "include/private/SkTo.h"
#include "include/private/SkVx.h"
#include "src/core/SkBuffer.h"
#include "src/core/SkPathPriv.h"
#include "src/core/SkSafeMath.h"
@ -663,13 +663,12 @@ bool SkPathRef::isValid() const {

    if (!fBoundsIsDirty && !fBounds.isEmpty()) {
        bool isFinite = true;
        Sk2s leftTop = Sk2s(fBounds.fLeft, fBounds.fTop);
        Sk2s rightBot = Sk2s(fBounds.fRight, fBounds.fBottom);
        auto leftTop = skvx::float2(fBounds.fLeft, fBounds.fTop);
        auto rightBot = skvx::float2(fBounds.fRight, fBounds.fBottom);
        for (int i = 0; i < fPoints.count(); ++i) {
            Sk2s point = Sk2s(fPoints[i].fX, fPoints[i].fY);
            auto point = skvx::float2(fPoints[i].fX, fPoints[i].fY);
#ifdef SK_DEBUG
            if (fPoints[i].isFinite() &&
                ((point < leftTop).anyTrue() || (point > rightBot).anyTrue())) {
            if (fPoints[i].isFinite() && (any(point < leftTop) || any(point > rightBot))) {
                SkDebugf("bad SkPathRef bounds: %g %g %g %g\n",
                         fBounds.fLeft, fBounds.fTop, fBounds.fRight, fBounds.fBottom);
                for (int j = 0; j < fPoints.count(); ++j) {
@ -682,7 +681,7 @@ bool SkPathRef::isValid() const {
            }
#endif

            if (fPoints[i].isFinite() && (point < leftTop).anyTrue() && !(point > rightBot).anyTrue())
            if (fPoints[i].isFinite() && any(point < leftTop) && !any(point > rightBot))
                return false;
            if (!fPoints[i].isFinite()) {
                isFinite = false;
@ -6,8 +6,8 @@
 */

#include "include/private/SkImageInfoPriv.h"
#include "include/private/SkNx.h"
#include "include/private/SkTemplates.h"
#include "include/private/SkVx.h"
#include "include/third_party/skcms/skcms.h"
#include "src/core/SkColorSpacePriv.h"
#include "src/core/SkOpts.h"
@ -117,7 +117,7 @@ void SkRasterPipeline::append_constant_color(SkArenaAlloc* alloc, const float rg
        this->append(white_color);
    } else {
        auto ctx = alloc->make<SkRasterPipeline_UniformColorCtx>();
        Sk4f color = Sk4f::Load(rgba);
        skvx::float4 color = skvx::float4::Load(rgba);
        color.store(&ctx->r);

        // uniform_color requires colors in range and can go lowp,
@ -51,8 +51,8 @@ generated_cc_atom(
    deps = [
        "//include/core:SkRect_hdr",
        "//include/private:SkColorData_hdr",
        "//include/private:SkNx_hdr",
        "//include/private:SkTemplates_hdr",
        "//include/private:SkVx_hdr",
        "//src/core:SkConvertPixels_hdr",
    ],
)
@ -11,8 +11,8 @@
#include <type_traits>
#include "include/core/SkRect.h"
#include "include/private/SkColorData.h"
#include "include/private/SkNx.h"
#include "include/private/SkTemplates.h"
#include "include/private/SkVx.h"
#include "src/core/SkConvertPixels.h"

namespace skgpu {
@ -304,7 +304,7 @@ inline VertexWriter& operator<<(VertexWriter& w, const VertexWriter::RepeatDesc<
}

template <>
SK_MAYBE_UNUSED inline VertexWriter& operator<<(VertexWriter& w, const Sk4f& vector) {
SK_MAYBE_UNUSED inline VertexWriter& operator<<(VertexWriter& w, const skvx::float4& vector) {
    w.validate(sizeof(vector));
    vector.store(w.fPtr);
    w = w.makeOffset(sizeof(vector));
@ -839,10 +839,11 @@ void FillRRectOpImpl::onExecute(GrOpFlushState* flushState, const SkRect& chainB
}

// Will the given corner look good if we use HW derivatives?
bool can_use_hw_derivatives_with_coverage(const Sk2f& devScale, const Sk2f& cornerRadii) {
    Sk2f devRadii = devScale * cornerRadii;
bool can_use_hw_derivatives_with_coverage(const skvx::float2& devScale,
                                          const skvx::float2& cornerRadii) {
    skvx::float2 devRadii = devScale * cornerRadii;
    if (devRadii[1] < devRadii[0]) {
        devRadii = SkNx_shuffle<1,0>(devRadii);
        devRadii = skvx::shuffle<1,0>(devRadii);
    }
    float minDevRadius = std::max(devRadii[0], 1.f);  // Shader clamps radius at a minimum of 1.
    // Is the gradient smooth enough for this corner look ok if we use hardware derivatives?
@ -850,8 +851,9 @@ bool can_use_hw_derivatives_with_coverage(const Sk2f& devScale, const Sk2f& corn
    return minDevRadius * minDevRadius * 5 > devRadii[1];
}

bool can_use_hw_derivatives_with_coverage(const Sk2f& devScale, const SkVector& cornerRadii) {
    return can_use_hw_derivatives_with_coverage(devScale, Sk2f::Load(&cornerRadii));
bool can_use_hw_derivatives_with_coverage(const skvx::float2& devScale,
                                          const SkVector& cornerRadii) {
    return can_use_hw_derivatives_with_coverage(devScale, skvx::float2::Load(&cornerRadii));
}

// Will the given round rect look good if we use HW derivatives?
@ -862,9 +864,9 @@ bool can_use_hw_derivatives_with_coverage(const GrShaderCaps& shaderCaps,
        return false;
    }

    Sk2f x = Sk2f(viewMatrix.getScaleX(), viewMatrix.getSkewX());
    Sk2f y = Sk2f(viewMatrix.getSkewY(), viewMatrix.getScaleY());
    Sk2f devScale = (x*x + y*y).sqrt();
    auto x = skvx::float2(viewMatrix.getScaleX(), viewMatrix.getSkewX());
    auto y = skvx::float2(viewMatrix.getSkewY(), viewMatrix.getScaleY());
    skvx::float2 devScale = sqrt(x*x + y*y);
    switch (rrect.getType()) {
        case SkRRect::kEmpty_Type:
        case SkRRect::kRect_Type:
@ -875,12 +877,14 @@ bool can_use_hw_derivatives_with_coverage(const GrShaderCaps& shaderCaps,
            return can_use_hw_derivatives_with_coverage(devScale, rrect.getSimpleRadii());

        case SkRRect::kNinePatch_Type: {
            Sk2f r0 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect));
            Sk2f r1 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect) + 2);
            Sk2f minRadii = Sk2f::Min(r0, r1);
            Sk2f maxRadii = Sk2f::Max(r0, r1);
            return can_use_hw_derivatives_with_coverage(devScale, Sk2f(minRadii[0], maxRadii[1])) &&
                   can_use_hw_derivatives_with_coverage(devScale, Sk2f(maxRadii[0], minRadii[1]));
            skvx::float2 r0 = skvx::float2::Load(SkRRectPriv::GetRadiiArray(rrect));
            skvx::float2 r1 = skvx::float2::Load(SkRRectPriv::GetRadiiArray(rrect) + 2);
            skvx::float2 minRadii = min(r0, r1);
            skvx::float2 maxRadii = max(r0, r1);
            return can_use_hw_derivatives_with_coverage(devScale,
                                                        skvx::float2(minRadii[0], maxRadii[1])) &&
                   can_use_hw_derivatives_with_coverage(devScale,
                                                        skvx::float2(maxRadii[0], minRadii[1]));
        }

        case SkRRect::kComplex_Type: {
@ -270,20 +270,20 @@ private:

        SkIRect srcR;
        SkRect dstR;
        Sk4f scales(1.f / fView.proxy()->width(), 1.f / fView.proxy()->height(),
                    1.f / fView.proxy()->width(), 1.f / fView.proxy()->height());
        static const Sk4f kDomainOffsets(0.5f, 0.5f, -0.5f, -0.5f);
        static const Sk4f kFlipOffsets(0.f, 1.f, 0.f, 1.f);
        static const Sk4f kFlipMuls(1.f, -1.f, 1.f, -1.f);
        skvx::float4 scales(1.f / fView.proxy()->width(), 1.f / fView.proxy()->height(),
                            1.f / fView.proxy()->width(), 1.f / fView.proxy()->height());
        static const skvx::float4 kDomainOffsets(0.5f, 0.5f, -0.5f, -0.5f);
        static const skvx::float4 kFlipOffsets(0.f, 1.f, 0.f, 1.f);
        static const skvx::float4 kFlipMuls(1.f, -1.f, 1.f, -1.f);
        while (patch.fIter->next(&srcR, &dstR)) {
            Sk4f coords(SkIntToScalar(srcR.fLeft), SkIntToScalar(srcR.fTop),
                        SkIntToScalar(srcR.fRight), SkIntToScalar(srcR.fBottom));
            Sk4f domain = coords + kDomainOffsets;
            skvx::float4 coords(SkIntToScalar(srcR.fLeft), SkIntToScalar(srcR.fTop),
                                SkIntToScalar(srcR.fRight), SkIntToScalar(srcR.fBottom));
            skvx::float4 domain = coords + kDomainOffsets;
            coords *= scales;
            domain *= scales;
            if (fView.origin() == kBottomLeft_GrSurfaceOrigin) {
                coords = kFlipMuls * coords + kFlipOffsets;
                domain = SkNx_shuffle<0, 3, 2, 1>(kFlipMuls * domain + kFlipOffsets);
                domain = skvx::shuffle<0, 3, 2, 1>(kFlipMuls * domain + kFlipOffsets);
            }
            SkRect texDomain;
            SkRect texCoords;
@ -161,7 +161,7 @@ generated_cc_atom(
    name = "SkUtils_opts_hdr",
    hdrs = ["SkUtils_opts.h"],
    visibility = ["//:__subpackages__"],
    deps = ["//include/private:SkNx_hdr"],
    deps = ["//include/private:SkVx_hdr"],
)

generated_cc_atom(
@ -9,7 +9,7 @@
#define SkUtils_opts_DEFINED

#include <stdint.h>
#include "include/private/SkNx.h"
#include "include/private/SkVx.h"

namespace SK_OPTS_NS {

@ -21,7 +21,7 @@ namespace SK_OPTS_NS {
    static const int N = 16 / sizeof(T);
#endif
    while (count >= N) {
        SkNx<N,T>(value).store(buffer);
        skvx::Vec<N,T>(value).store(buffer);
        buffer += N;
        count -= N;
    }
@ -281,7 +281,6 @@ CORE_TESTS = [
|
||||
"SkGlyphBufferTest.cpp",
|
||||
"SkGlyphTest.cpp",
|
||||
"SkImageTest.cpp",
|
||||
"SkNxTest.cpp",
|
||||
"SkPathRangeIterTest.cpp",
|
||||
"SkRasterPipelineTest.cpp",
|
||||
"SkRemoteGlyphCacheTest.cpp",
|
||||
@ -1437,7 +1436,6 @@ generated_cc_atom(
|
||||
"//include/core:SkPoint_hdr",
|
||||
"//include/core:SkScalar_hdr",
|
||||
"//include/core:SkTypes_hdr",
|
||||
"//include/private:SkNx_hdr",
|
||||
"//src/core:SkGeometry_hdr",
|
||||
"//src/pathops:SkPathOpsCubic_hdr",
|
||||
],
|
||||
@ -5554,17 +5552,6 @@ generated_cc_atom(
    ],
)

generated_cc_atom(
    name = "SkNxTest_src",
    srcs = ["SkNxTest.cpp"],
    visibility = ["//:__subpackages__"],
    deps = [
        ":Test_hdr",
        "//include/private:SkNx_hdr",
        "//include/utils:SkRandom_hdr",
    ],
)

generated_cc_atom(
    name = "SkPathRangeIterTest_src",
    srcs = ["SkPathRangeIterTest.cpp"],
@ -9,7 +9,6 @@
#include "include/core/SkPoint.h"
#include "include/core/SkScalar.h"
#include "include/core/SkTypes.h"
#include "include/private/SkNx.h"
#include "src/core/SkGeometry.h"
#include "src/pathops/SkPathOpsCubic.h"
#include "tests/Test.h"
@ -1,494 +0,0 @@
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/private/SkNx.h"
#include "include/utils/SkRandom.h"
#include "tests/Test.h"

template <int N>
static void test_Nf(skiatest::Reporter* r) {

    auto assert_nearly_eq = [&](float eps, const SkNx<N, float>& v,
                                float a, float b, float c, float d) {
        auto close = [=](float a, float b) { return fabsf(a-b) <= eps; };
        float vals[4];
        v.store(vals);
        bool ok = close(vals[0], a) && close(vals[1], b)
               && close( v[0], a) && close( v[1], b);
        REPORTER_ASSERT(r, ok);
        if (N == 4) {
            ok = close(vals[2], c) && close(vals[3], d)
              && close( v[2], c) && close( v[3], d);
            REPORTER_ASSERT(r, ok);
        }
    };
    auto assert_eq = [&](const SkNx<N, float>& v, float a, float b, float c, float d) {
        return assert_nearly_eq(0, v, a,b,c,d);
    };

    float vals[] = {3, 4, 5, 6};
    SkNx<N,float> a = SkNx<N,float>::Load(vals),
                  b(a),
                  c = a;
    SkNx<N,float> d;
    d = a;

    assert_eq(a, 3, 4, 5, 6);
    assert_eq(b, 3, 4, 5, 6);
    assert_eq(c, 3, 4, 5, 6);
    assert_eq(d, 3, 4, 5, 6);

    assert_eq(a+b, 6, 8, 10, 12);
    assert_eq(a*b, 9, 16, 25, 36);
    assert_eq(a*b-b, 6, 12, 20, 30);
    assert_eq((a*b).sqrt(), 3, 4, 5, 6);
    assert_eq(a/b, 1, 1, 1, 1);
    assert_eq(SkNx<N,float>(0)-a, -3, -4, -5, -6);

    SkNx<N,float> fours(4);

    assert_eq(fours.sqrt(), 2,2,2,2);

    assert_eq(SkNx<N,float>::Min(a, fours), 3, 4, 4, 4);
    assert_eq(SkNx<N,float>::Max(a, fours), 4, 4, 5, 6);

    // Test some comparisons. This is not exhaustive.
    REPORTER_ASSERT(r, (a == b).allTrue());
    REPORTER_ASSERT(r, (a+b == a*b-b).anyTrue());
    REPORTER_ASSERT(r, !(a+b == a*b-b).allTrue());
    REPORTER_ASSERT(r, !(a+b == a*b).anyTrue());
    REPORTER_ASSERT(r, !(a != b).anyTrue());
    REPORTER_ASSERT(r, (a < fours).anyTrue());
    REPORTER_ASSERT(r, (a <= fours).anyTrue());
    REPORTER_ASSERT(r, !(a > fours).allTrue());
    REPORTER_ASSERT(r, !(a >= fours).allTrue());
}

DEF_TEST(SkNf, r) {
    test_Nf<2>(r);
    test_Nf<4>(r);
}

template <int N, typename T>
void test_Ni(skiatest::Reporter* r) {
    auto assert_eq = [&](const SkNx<N,T>& v, T a, T b, T c, T d, T e, T f, T g, T h) {
        T vals[8];
        v.store(vals);

        switch (N) {
            case 8:
                REPORTER_ASSERT(r, vals[4] == e && vals[5] == f && vals[6] == g && vals[7] == h);
                [[fallthrough]];
            case 4:
                REPORTER_ASSERT(r, vals[2] == c && vals[3] == d);
                [[fallthrough]];
            case 2:
                REPORTER_ASSERT(r, vals[0] == a && vals[1] == b);
        }
        switch (N) {
            case 8:
                REPORTER_ASSERT(r, v[4] == e && v[5] == f && v[6] == g && v[7] == h);
                [[fallthrough]];
            case 4:
                REPORTER_ASSERT(r, v[2] == c && v[3] == d);
                [[fallthrough]];
            case 2:
                REPORTER_ASSERT(r, v[0] == a && v[1] == b);
        }
    };

    T vals[] = { 1,2,3,4,5,6,7,8 };
    SkNx<N,T> a = SkNx<N,T>::Load(vals),
              b(a),
              c = a;
    SkNx<N,T> d;
    d = a;

    assert_eq(a, 1,2,3,4,5,6,7,8);
    assert_eq(b, 1,2,3,4,5,6,7,8);
    assert_eq(c, 1,2,3,4,5,6,7,8);
    assert_eq(d, 1,2,3,4,5,6,7,8);

    assert_eq(a+a, 2,4,6,8,10,12,14,16);
    assert_eq(a*a, 1,4,9,16,25,36,49,64);
    assert_eq(a*a-a, 0,2,6,12,20,30,42,56);

    assert_eq(a >> 2, 0,0,0,1,1,1,1,2);
    assert_eq(a << 1, 2,4,6,8,10,12,14,16);

    REPORTER_ASSERT(r, a[1] == 2);
}

DEF_TEST(SkNx, r) {
    test_Ni<2, uint16_t>(r);
    test_Ni<4, uint16_t>(r);
    test_Ni<8, uint16_t>(r);

    test_Ni<2, int>(r);
    test_Ni<4, int>(r);
    test_Ni<8, int>(r);
}

DEF_TEST(SkNi_min_lt, r) {
    // Exhaustively check the 8x8 bit space.
    for (int a = 0; a < (1<<8); a++) {
    for (int b = 0; b < (1<<8); b++) {
        Sk16b aw(a), bw(b);
        REPORTER_ASSERT(r, Sk16b::Min(aw, bw)[0] == std::min(a, b));
        REPORTER_ASSERT(r, !(aw < bw)[0] == !(a < b));
    }}

    // Exhausting the 16x16 bit space is kind of slow, so only do that in release builds.
#ifdef SK_DEBUG
    SkRandom rand;
    for (int i = 0; i < (1<<16); i++) {
        uint16_t a = rand.nextU() >> 16,
                 b = rand.nextU() >> 16;
        REPORTER_ASSERT(r, Sk16h::Min(Sk16h(a), Sk16h(b))[0] == std::min(a, b));
    }
#else
    for (int a = 0; a < (1<<16); a++) {
    for (int b = 0; b < (1<<16); b++) {
        REPORTER_ASSERT(r, Sk16h::Min(Sk16h(a), Sk16h(b))[0] == std::min(a, b));
    }}
#endif
}

DEF_TEST(SkNi_saturatedAdd, r) {
    for (int a = 0; a < (1<<8); a++) {
    for (int b = 0; b < (1<<8); b++) {
        int exact = a+b;
        if (exact > 255) { exact = 255; }
        if (exact < 0) { exact = 0; }

        REPORTER_ASSERT(r, Sk16b(a).saturatedAdd(Sk16b(b))[0] == exact);
    }
    }
}

DEF_TEST(SkNi_mulHi, r) {
    // First 8 primes.
    Sk4u a{ 0x00020000, 0x00030000, 0x00050000, 0x00070000 };
    Sk4u b{ 0x000b0000, 0x000d0000, 0x00110000, 0x00130000 };

    Sk4u q{22, 39, 85, 133};

    Sk4u c = a.mulHi(b);
    REPORTER_ASSERT(r, c[0] == q[0]);
    REPORTER_ASSERT(r, c[1] == q[1]);
    REPORTER_ASSERT(r, c[2] == q[2]);
    REPORTER_ASSERT(r, c[3] == q[3]);
}

DEF_TEST(SkNx_abs, r) {
    auto fs = Sk4f(0.0f, -0.0f, 2.0f, -4.0f).abs();
    REPORTER_ASSERT(r, fs[0] == 0.0f);
    REPORTER_ASSERT(r, fs[1] == 0.0f);
    REPORTER_ASSERT(r, fs[2] == 2.0f);
    REPORTER_ASSERT(r, fs[3] == 4.0f);
    auto fshi = Sk2f(0.0f, -0.0f).abs();
    auto fslo = Sk2f(2.0f, -4.0f).abs();
    REPORTER_ASSERT(r, fshi[0] == 0.0f);
    REPORTER_ASSERT(r, fshi[1] == 0.0f);
    REPORTER_ASSERT(r, fslo[0] == 2.0f);
    REPORTER_ASSERT(r, fslo[1] == 4.0f);
}

DEF_TEST(Sk4i_abs, r) {
    auto is = Sk4i(0, -1, 2, -2147483647).abs();
    REPORTER_ASSERT(r, is[0] == 0);
    REPORTER_ASSERT(r, is[1] == 1);
    REPORTER_ASSERT(r, is[2] == 2);
    REPORTER_ASSERT(r, is[3] == 2147483647);
}

DEF_TEST(Sk4i_minmax, r) {
    auto a = Sk4i(0, 2, 4, 6);
    auto b = Sk4i(1, 1, 3, 7);
    auto min = Sk4i::Min(a, b);
    auto max = Sk4i::Max(a, b);
    for(int i = 0; i < 4; ++i) {
        REPORTER_ASSERT(r, min[i] == std::min(a[i], b[i]));
        REPORTER_ASSERT(r, max[i] == std::max(a[i], b[i]));
    }
}

DEF_TEST(SkNx_floor, r) {
    auto fs = Sk4f(0.4f, -0.4f, 0.6f, -0.6f).floor();
    REPORTER_ASSERT(r, fs[0] == 0.0f);
    REPORTER_ASSERT(r, fs[1] == -1.0f);
    REPORTER_ASSERT(r, fs[2] == 0.0f);
    REPORTER_ASSERT(r, fs[3] == -1.0f);

    auto fs2 = Sk2f(0.4f, -0.4f).floor();
    REPORTER_ASSERT(r, fs2[0] == 0.0f);
    REPORTER_ASSERT(r, fs2[1] == -1.0f);

    auto fs3 = Sk2f(0.6f, -0.6f).floor();
    REPORTER_ASSERT(r, fs3[0] == 0.0f);
    REPORTER_ASSERT(r, fs3[1] == -1.0f);
}

DEF_TEST(SkNx_shuffle, r) {
    Sk4f f4(0,10,20,30);

    Sk2f f2 = SkNx_shuffle<2,1>(f4);
    REPORTER_ASSERT(r, f2[0] == 20);
    REPORTER_ASSERT(r, f2[1] == 10);

    f4 = SkNx_shuffle<0,1,1,0>(f2);
    REPORTER_ASSERT(r, f4[0] == 20);
    REPORTER_ASSERT(r, f4[1] == 10);
    REPORTER_ASSERT(r, f4[2] == 10);
    REPORTER_ASSERT(r, f4[3] == 20);
}

DEF_TEST(SkNx_int_float, r) {
    Sk4f f(-2.3f, 1.0f, 0.45f, 0.6f);

    Sk4i i = SkNx_cast<int>(f);
    REPORTER_ASSERT(r, i[0] == -2);
    REPORTER_ASSERT(r, i[1] == 1);
    REPORTER_ASSERT(r, i[2] == 0);
    REPORTER_ASSERT(r, i[3] == 0);

    f = SkNx_cast<float>(i);
    REPORTER_ASSERT(r, f[0] == -2.0f);
    REPORTER_ASSERT(r, f[1] == 1.0f);
    REPORTER_ASSERT(r, f[2] == 0.0f);
    REPORTER_ASSERT(r, f[3] == 0.0f);
}

#include "include/utils/SkRandom.h"

DEF_TEST(SkNx_u16_float, r) {
    {
        // u16 --> float
        auto h4 = Sk4h(15, 17, 257, 65535);
        auto f4 = SkNx_cast<float>(h4);
        REPORTER_ASSERT(r, f4[0] == 15.0f);
        REPORTER_ASSERT(r, f4[1] == 17.0f);
        REPORTER_ASSERT(r, f4[2] == 257.0f);
        REPORTER_ASSERT(r, f4[3] == 65535.0f);
    }
    {
        // float -> u16
        auto f4 = Sk4f(15, 17, 257, 65535);
        auto h4 = SkNx_cast<uint16_t>(f4);
        REPORTER_ASSERT(r, h4[0] == 15);
        REPORTER_ASSERT(r, h4[1] == 17);
        REPORTER_ASSERT(r, h4[2] == 257);
        REPORTER_ASSERT(r, h4[3] == 65535);
    }

    // starting with any u16 value, we should be able to have a perfect round-trip in/out of floats
    //
    SkRandom rand;
    for (int i = 0; i < 10000; ++i) {
        const uint16_t s16[4] {
            (uint16_t)(rand.nextU() >> 16), (uint16_t)(rand.nextU() >> 16),
            (uint16_t)(rand.nextU() >> 16), (uint16_t)(rand.nextU() >> 16),
        };
        auto u4_0 = Sk4h::Load(s16);
        auto f4 = SkNx_cast<float>(u4_0);
        auto u4_1 = SkNx_cast<uint16_t>(f4);
        uint16_t d16[4];
        u4_1.store(d16);
        REPORTER_ASSERT(r, !memcmp(s16, d16, sizeof(s16)));
    }
}

// The SSE2 implementation of SkNx_cast<uint16_t>(Sk4i) is non-trivial, so worth a test.
DEF_TEST(SkNx_int_u16, r) {
    // These are pretty hard to get wrong.
    for (int i = 0; i <= 0x7fff; i++) {
        uint16_t expected = (uint16_t)i;
        uint16_t actual = SkNx_cast<uint16_t>(Sk4i(i))[0];

        REPORTER_ASSERT(r, expected == actual);
    }

    // A naive implementation with _mm_packs_epi32 would succeed up to 0x7fff but fail here:
    for (int i = 0x8000; (1) && i <= 0xffff; i++) {
        uint16_t expected = (uint16_t)i;
        uint16_t actual = SkNx_cast<uint16_t>(Sk4i(i))[0];

        REPORTER_ASSERT(r, expected == actual);
    }
}

DEF_TEST(SkNx_4fLoad4Store4, r) {
    float src[] = {
        0.0f, 1.0f, 2.0f, 3.0f,
        4.0f, 5.0f, 6.0f, 7.0f,
        8.0f, 9.0f, 10.0f, 11.0f,
        12.0f, 13.0f, 14.0f, 15.0f
    };

    Sk4f a, b, c, d;
    Sk4f::Load4(src, &a, &b, &c, &d);
    REPORTER_ASSERT(r, 0.0f == a[0]);
    REPORTER_ASSERT(r, 4.0f == a[1]);
    REPORTER_ASSERT(r, 8.0f == a[2]);
    REPORTER_ASSERT(r, 12.0f == a[3]);
    REPORTER_ASSERT(r, 1.0f == b[0]);
    REPORTER_ASSERT(r, 5.0f == b[1]);
    REPORTER_ASSERT(r, 9.0f == b[2]);
    REPORTER_ASSERT(r, 13.0f == b[3]);
    REPORTER_ASSERT(r, 2.0f == c[0]);
    REPORTER_ASSERT(r, 6.0f == c[1]);
    REPORTER_ASSERT(r, 10.0f == c[2]);
    REPORTER_ASSERT(r, 14.0f == c[3]);
    REPORTER_ASSERT(r, 3.0f == d[0]);
    REPORTER_ASSERT(r, 7.0f == d[1]);
    REPORTER_ASSERT(r, 11.0f == d[2]);
    REPORTER_ASSERT(r, 15.0f == d[3]);

    float dst[16];
    Sk4f::Store4(dst, a, b, c, d);
    REPORTER_ASSERT(r, 0 == memcmp(dst, src, 16 * sizeof(float)));
}

DEF_TEST(SkNx_neg, r) {
    auto fs = -Sk4f(0.0f, -0.0f, 2.0f, -4.0f);
    REPORTER_ASSERT(r, fs[0] == 0.0f);
    REPORTER_ASSERT(r, fs[1] == 0.0f);
    REPORTER_ASSERT(r, fs[2] == -2.0f);
    REPORTER_ASSERT(r, fs[3] == 4.0f);
    auto fshi = -Sk2f(0.0f, -0.0f);
    auto fslo = -Sk2f(2.0f, -4.0f);
    REPORTER_ASSERT(r, fshi[0] == 0.0f);
    REPORTER_ASSERT(r, fshi[1] == 0.0f);
    REPORTER_ASSERT(r, fslo[0] == -2.0f);
    REPORTER_ASSERT(r, fslo[1] == 4.0f);
}

DEF_TEST(SkNx_thenElse, r) {
    auto fs = (Sk4f(0.0f, -0.0f, 2.0f, -4.0f) < 0).thenElse(-1, 1);
    REPORTER_ASSERT(r, fs[0] == 1);
    REPORTER_ASSERT(r, fs[1] == 1);
    REPORTER_ASSERT(r, fs[2] == 1);
    REPORTER_ASSERT(r, fs[3] == -1);
    auto fshi = (Sk2f(0.0f, -0.0f) < 0).thenElse(-1, 1);
    auto fslo = (Sk2f(2.0f, -4.0f) < 0).thenElse(-1, 1);
    REPORTER_ASSERT(r, fshi[0] == 1);
    REPORTER_ASSERT(r, fshi[1] == 1);
    REPORTER_ASSERT(r, fslo[0] == 1);
    REPORTER_ASSERT(r, fslo[1] == -1);
}

DEF_TEST(Sk4f_Load2, r) {
    float xy[8] = { 0,1,2,3,4,5,6,7 };

    Sk4f x,y;
    Sk4f::Load2(xy, &x,&y);

    REPORTER_ASSERT(r, x[0] == 0);
    REPORTER_ASSERT(r, x[1] == 2);
    REPORTER_ASSERT(r, x[2] == 4);
    REPORTER_ASSERT(r, x[3] == 6);

    REPORTER_ASSERT(r, y[0] == 1);
    REPORTER_ASSERT(r, y[1] == 3);
    REPORTER_ASSERT(r, y[2] == 5);
    REPORTER_ASSERT(r, y[3] == 7);
}

DEF_TEST(Sk2f_Load2, r) {
    float xy[4] = { 0,1,2,3 };

    Sk2f x,y;
    Sk2f::Load2(xy, &x,&y);

    REPORTER_ASSERT(r, x[0] == 0);
    REPORTER_ASSERT(r, x[1] == 2);

    REPORTER_ASSERT(r, y[0] == 1);
    REPORTER_ASSERT(r, y[1] == 3);
}

DEF_TEST(Sk2f_Store2, r) {
    Sk2f p0{0, 2};
    Sk2f p1{1, 3};
    float dst[4];
    Sk2f::Store2(dst, p0, p1);
    REPORTER_ASSERT(r, dst[0] == 0);
    REPORTER_ASSERT(r, dst[1] == 1);
    REPORTER_ASSERT(r, dst[2] == 2);
    REPORTER_ASSERT(r, dst[3] == 3);
}

DEF_TEST(Sk2f_Store3, r) {
    Sk2f p0{0, 3};
    Sk2f p1{1, 4};
    Sk2f p2{2, 5};
    float dst[6];
    Sk2f::Store3(dst, p0, p1, p2);
    REPORTER_ASSERT(r, dst[0] == 0);
    REPORTER_ASSERT(r, dst[1] == 1);
    REPORTER_ASSERT(r, dst[2] == 2);
    REPORTER_ASSERT(r, dst[3] == 3);
    REPORTER_ASSERT(r, dst[4] == 4);
    REPORTER_ASSERT(r, dst[5] == 5);
}

DEF_TEST(Sk2f_Store4, r) {
    Sk2f p0{0, 4};
    Sk2f p1{1, 5};
    Sk2f p2{2, 6};
    Sk2f p3{3, 7};

    float dst[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    Sk2f::Store4(dst, p0, p1, p2, p3);
    REPORTER_ASSERT(r, dst[0] == 0);
    REPORTER_ASSERT(r, dst[1] == 1);
    REPORTER_ASSERT(r, dst[2] == 2);
    REPORTER_ASSERT(r, dst[3] == 3);
    REPORTER_ASSERT(r, dst[4] == 4);
    REPORTER_ASSERT(r, dst[5] == 5);
    REPORTER_ASSERT(r, dst[6] == 6);
    REPORTER_ASSERT(r, dst[7] == 7);

    // Ensure transposing to Sk4f works.
    Sk4f dst4f[2] = {{-1, -1, -1, -1}, {-1, -1, -1, -1}};
    Sk2f::Store4(dst4f, p0, p1, p2, p3);
    REPORTER_ASSERT(r, dst4f[0][0] == 0);
    REPORTER_ASSERT(r, dst4f[0][1] == 1);
    REPORTER_ASSERT(r, dst4f[0][2] == 2);
    REPORTER_ASSERT(r, dst4f[0][3] == 3);
    REPORTER_ASSERT(r, dst4f[1][0] == 4);
    REPORTER_ASSERT(r, dst4f[1][1] == 5);
    REPORTER_ASSERT(r, dst4f[1][2] == 6);
    REPORTER_ASSERT(r, dst4f[1][3] == 7);

}

DEF_TEST(Sk4f_minmax, r) {
    REPORTER_ASSERT(r, 3 == Sk4f(0,1,2,3).max());
    REPORTER_ASSERT(r, 2 == Sk4f(1,-5,2,-1).max());
    REPORTER_ASSERT(r, -1 == Sk4f(-2,-1,-6,-3).max());
    REPORTER_ASSERT(r, 3 == Sk4f(3,2,1,0).max());

    REPORTER_ASSERT(r, 0 == Sk4f(0,1,2,3).min());
    REPORTER_ASSERT(r, -5 == Sk4f(1,-5,2,-1).min());
    REPORTER_ASSERT(r, -6 == Sk4f(-2,-1,-6,-3).min());
    REPORTER_ASSERT(r, 0 == Sk4f(3,2,1,0).min());
}

DEF_TEST(SkNf_anyTrue_allTrue, r) {
    REPORTER_ASSERT(r, (Sk2f{1,2} < Sk2f{3,4}).anyTrue());
    REPORTER_ASSERT(r, (Sk2f{1,2} < Sk2f{3,4}).allTrue());
    REPORTER_ASSERT(r, (Sk2f{3,2} < Sk2f{1,4}).anyTrue());
    REPORTER_ASSERT(r, !(Sk2f{3,2} < Sk2f{1,4}).allTrue());
    REPORTER_ASSERT(r, !(Sk2f{3,4} < Sk2f{1,2}).anyTrue());

    REPORTER_ASSERT(r, (Sk4f{1,2,3,4} < Sk4f{3,4,5,6}).anyTrue());
    REPORTER_ASSERT(r, (Sk4f{1,2,3,4} < Sk4f{3,4,5,6}).allTrue());
    REPORTER_ASSERT(r, (Sk4f{1,2,3,4} < Sk4f{1,4,1,1}).anyTrue());
    REPORTER_ASSERT(r, !(Sk4f{1,2,3,4} < Sk4f{1,4,1,1}).allTrue());
    REPORTER_ASSERT(r, !(Sk4f{3,4,5,6} < Sk4f{1,2,3,4}).anyTrue());
}
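For reference, the SkNx idioms exercised by the deleted test map onto skvx roughly as follows (a sketch, not code from this CL; skvx_roundtrip_demo is a hypothetical name):

// Approximate SkNx -> skvx spellings:
//   Sk4f / Sk2f / Sk4i         -> skvx::float4 / skvx::float2 / skvx::int4
//   SkNx_cast<T>(v)            -> skvx::cast<T>(v)
//   SkNx_shuffle<I...>(v)      -> skvx::shuffle<I...>(v)
//   v.abs(), v.floor(), ...    -> abs(v), floor(v), ... (free functions)
//   Sk4f::Min / Sk4f::Max      -> min(a, b) / max(a, b)
//   mask.allTrue()/anyTrue()   -> all(mask) / any(mask)
//   mask.thenElse(t, e)        -> if_then_else(mask, t, e)
#include "include/private/SkVx.h"

static bool skvx_roundtrip_demo() {
    skvx::float4 f(-2.3f, 1.0f, 0.45f, 0.6f);
    skvx::int4   i = skvx::cast<int>(f);      // truncates toward zero: {-2, 1, 0, 0}
    skvx::float4 g = skvx::cast<float>(i);    // back to floats
    return all(g == skvx::float4(-2.f, 1.f, 0.f, 0.f));
}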