Mike's radial gradient CL with better float -> int.
Patch from issue 1072303005 at patchset 40001 (http://crrev.com/1072303005#ps40001).
This looks quite launchable.
radial_gradient3, min of 100 samples:
N5: 985µs -> 946µs
MBP: 395µs -> 279µs
On my MBP, most of the meat looks like it's now in reading the cache and writing to dst one color at a time. Is that something we could do in float math rather than with a lookup table?
BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia.android:Test-Android-GCC-Nexus5-CPU-NEON-Arm7-Debug-Trybot,Test-Android-GCC-Nexus9-CPU-Denver-Arm64-Debug-Trybot
Review URL: https://codereview.chromium.org/1109643002
This commit is contained in:
parent
a3a8eb6f63
commit
abf6c5cf95
@ -77,6 +77,10 @@ private:
|
||||
template <int N, typename T>
|
||||
class SkNf {
|
||||
typedef SkNb<N, sizeof(T)> Nb;
|
||||
|
||||
static int32_t MyNi(float);
|
||||
static int64_t MyNi(double);
|
||||
typedef SkNi<N, decltype(MyNi(T()))> Ni;
|
||||
public:
|
||||
SkNf() {}
|
||||
explicit SkNf(T val) : fLo(val), fHi(val) {}
|
||||
@ -93,6 +97,8 @@ public:
|
||||
fHi.store(vals+N/2);
|
||||
}
|
||||
|
||||
Ni castTrunc() const { return Ni(fLo.castTrunc(), fHi.castTrunc()); }
|
||||
|
||||
SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); }
|
||||
SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); }
|
||||
SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); }
|
||||
@ -172,6 +178,10 @@ private:
|
||||
template <typename T>
|
||||
class SkNf<1,T> {
|
||||
typedef SkNb<1, sizeof(T)> Nb;
|
||||
|
||||
static int32_t MyNi(float);
|
||||
static int64_t MyNi(double);
|
||||
typedef SkNi<1, decltype(MyNi(T()))> Ni;
|
||||
public:
|
||||
SkNf() {}
|
||||
explicit SkNf(T val) : fVal(val) {}
|
||||
@ -179,6 +189,8 @@ public:
|
||||
|
||||
void store(T vals[1]) const { vals[0] = fVal; }
|
||||
|
||||
Ni castTrunc() const { return Ni(fVal); }
|
||||
|
||||
SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); }
|
||||
SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); }
|
||||
SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); }
|
||||
@ -248,4 +260,6 @@ typedef SkNf<4, SkScalar> Sk4s;
|
||||
typedef SkNi<4, uint16_t> Sk4h;
|
||||
typedef SkNi<8, uint16_t> Sk8h;
|
||||
|
||||
typedef SkNi<4, int> Sk4i;
|
||||
|
||||
#endif//SkNx_DEFINED
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "SkRadialGradient.h"
|
||||
#include "SkRadialGradient_Table.h"
|
||||
#include "SkNx.h"
|
||||
|
||||
#define kSQRT_TABLE_BITS 11
|
||||
#define kSQRT_TABLE_SIZE (1 << kSQRT_TABLE_BITS)
|
||||
@ -270,13 +271,16 @@ void SkRadialGradient::flatten(SkWriteBuffer& buffer) const {
|
||||
namespace {
|
||||
|
||||
inline bool radial_completely_pinned(int fx, int dx, int fy, int dy) {
|
||||
// fast, overly-conservative test: checks unit square instead
|
||||
// of unit circle
|
||||
bool xClamped = (fx >= SK_FixedHalf && dx >= 0) ||
|
||||
(fx <= -SK_FixedHalf && dx <= 0);
|
||||
bool yClamped = (fy >= SK_FixedHalf && dy >= 0) ||
|
||||
(fy <= -SK_FixedHalf && dy <= 0);
|
||||
// fast, overly-conservative test: checks unit square instead of unit circle
|
||||
bool xClamped = (fx >= SK_FixedHalf && dx >= 0) || (fx <= -SK_FixedHalf && dx <= 0);
|
||||
bool yClamped = (fy >= SK_FixedHalf && dy >= 0) || (fy <= -SK_FixedHalf && dy <= 0);
|
||||
return xClamped || yClamped;
|
||||
}
|
||||
|
||||
// Scalar overload of the "is this whole span pinned?" test.
//
// (fx, fy) is the starting position and (dx, dy) the per-pixel delta. Returns true when, on at
// least one axis, the start is already at or beyond +/-1 and the delta does not move it back
// toward the centre. This is a fast, overly-conservative check: it tests the unit square
// rather than the unit circle.
inline bool radial_completely_pinned(SkScalar fx, SkScalar dx, SkScalar fy, SkScalar dy) {
    // x axis: at/after the right edge and not moving left, or the mirror case.
    if (fx >= 1 && dx >= 0) {
        return true;
    }
    if (fx <= -1 && dx <= 0) {
        return true;
    }
    // y axis: same test.
    if (fy >= 1 && dy >= 0) {
        return true;
    }
    return fy <= -1 && dy <= 0;
}
|
||||
|
||||
@ -373,6 +377,70 @@ void shadeSpan_radial_clamp(SkScalar sfx, SkScalar sdx,
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: can we get away with a 0th-order approximation of inverse-sqrt (i.e. faster than rsqrt)?
|
||||
// seems like ~10bits is more than enough for our use, since we want a byte-index
|
||||
// Per-lane square root computed as R * rsqrt(R) instead of a true sqrt.
//
// NOTE(review): for a lane where R == 0, rsqrt() presumably yields +inf and 0 * inf is NaN;
// confirm that callers tolerate (or never produce) an exactly-zero lane.
static inline Sk4f fast_sqrt(const Sk4f& R) {
    const Sk4f invRoot = R.rsqrt();
    return R * invRoot;
}
|
||||
|
||||
// Per-lane a^2 + b^2.
static inline Sk4f sum_squares(const Sk4f& a, const Sk4f& b) {
    const Sk4f aSquared = a * a;
    const Sk4f bSquared = b * b;
    return aSquared + bSquared;
}
|
||||
|
||||
// Clamp-mode radial gradient span shader that evaluates four pixels per iteration with Sk4f.
//
//   sfx, sfy  position of the first pixel; a distance of 1 from the origin maps to cache
//             index 255 (the coordinates are scaled by 255 below)
//   sdx, sdy  per-pixel step added to (sfx, sfy)
//   dstC      destination; exactly 'count' colors are written
//   cache     color table, indexed as cache[toggle + index] with index in [0, 255]
//   count     number of pixels to shade
//   toggle    dither offset into 'cache'; advanced with next_dither_toggle() after each pixel
//
// The squared distance R is stepped with forward differences (R += dR, dR += ddR), so each
// group of four pixels needs only one fast_sqrt().
//
// NOTE(review): if a lane lands exactly on the centre (R == 0), fast_sqrt() presumably returns
// NaN there; what Sk4f::Min and castTrunc() do with NaN is not visible here. Confirm the centre
// pixel resolves to index 0.
void shadeSpan_radial_clamp2(SkScalar sfx, SkScalar sdx, SkScalar sfy, SkScalar sdy,
                             SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
                             int count, int toggle) {
    if (radial_completely_pinned(sfx, sdx, sfy, sdy)) {
        // The whole span is pinned: fill it with the last cache entry, alternating between the
        // two dither variants.
        unsigned fi = SkGradientShaderBase::kCache32Count - 1;
        sk_memset32_dither(dstC,
                           cache[toggle + fi],
                           cache[next_dither_toggle(toggle) + fi],
                           count);
        return;
    }

    const Sk4f max(255);
    const float scale = 255;
    sfx *= scale;
    sfy *= scale;
    sdx *= scale;
    sdy *= scale;

    // Lane k holds the position of pixel k; each loop iteration advances by four pixels.
    const Sk4f fx4(sfx, sfx + sdx, sfx + 2*sdx, sfx + 3*sdx);
    const Sk4f fy4(sfy, sfy + sdy, sfy + 2*sdy, sfy + 3*sdy);
    const Sk4f dx4(sdx * 4);
    const Sk4f dy4(sdy * 4);

    // Forward differences of R = x^2 + y^2 for a step of (dx4, dy4):
    //   dR  = 2*(x*dx4 + y*dy4) + (dx4^2 + dy4^2)
    //   ddR = 2*(dx4^2 + dy4^2)
    Sk4f tmpxy = fx4 * dx4 + fy4 * dy4;
    Sk4f tmpdxdy = sum_squares(dx4, dy4);
    Sk4f R = sum_squares(fx4, fy4);
    Sk4f dR = tmpxy + tmpxy + tmpdxdy;
    const Sk4f ddR = tmpdxdy + tmpdxdy;

    const int quads = count >> 2;
    for (int q = 0; q < quads; ++q) {
        // Clamp the distance to the last cache index before truncating to int.
        Sk4f dist = Sk4f::Min(fast_sqrt(R), max);
        R += dR;
        dR += ddR;

        int fi[4];
        dist.castTrunc().store(fi);

        for (int lane = 0; lane < 4; lane++) {
            *dstC++ = cache[toggle + fi[lane]];
            toggle = next_dither_toggle(toggle);
        }
    }

    // 0..3 leftover pixels: compute a full group of four but only write 'tail' of them.
    const int tail = count & 3;
    if (tail > 0) {
        Sk4f dist = Sk4f::Min(fast_sqrt(R), max);

        int fi[4];
        dist.castTrunc().store(fi);
        for (int lane = 0; lane < tail; lane++) {
            *dstC++ = cache[toggle + fi[lane]];
            toggle = next_dither_toggle(toggle);
        }
    }
}
|
||||
|
||||
// Unrolling this loop doesn't seem to help (when float); we're stalling to
|
||||
// get the results of the sqrt (?), and don't have enough extra registers to
|
||||
// have many in flight.
|
||||
@ -407,6 +475,11 @@ void shadeSpan_radial_repeat(SkScalar fx, SkScalar dx, SkScalar fy, SkScalar dy,
|
||||
|
||||
void SkRadialGradient::RadialGradientContext::shadeSpan(int x, int y,
|
||||
SkPMColor* SK_RESTRICT dstC, int count) {
|
||||
#ifdef SK_SUPPORT_LEGACY_RADIAL_GRADIENT_SQRT
|
||||
const bool use_new_proc = false;
|
||||
#else
|
||||
const bool use_new_proc = true;
|
||||
#endif
|
||||
SkASSERT(count > 0);
|
||||
|
||||
const SkRadialGradient& radialGradient = static_cast<const SkRadialGradient&>(fShader);
|
||||
@ -435,7 +508,7 @@ void SkRadialGradient::RadialGradientContext::shadeSpan(int x, int y,
|
||||
|
||||
RadialShadeProc shadeProc = shadeSpan_radial_repeat;
|
||||
if (SkShader::kClamp_TileMode == radialGradient.fTileMode) {
|
||||
shadeProc = shadeSpan_radial_clamp;
|
||||
shadeProc = use_new_proc ? shadeSpan_radial_clamp2 : shadeSpan_radial_clamp;
|
||||
} else if (SkShader::kMirror_TileMode == radialGradient.fTileMode) {
|
||||
shadeProc = shadeSpan_radial_mirror;
|
||||
} else {
|
||||
|
@ -180,6 +180,48 @@ private:
|
||||
};
|
||||
#endif//defined(SK_CPU_ARM64)
|
||||
|
||||
template <>
|
||||
class SkNi<4, int> {
|
||||
public:
|
||||
SkNi(const int32x4_t& vec) : fVec(vec) {}
|
||||
|
||||
SkNi() {}
|
||||
explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {}
|
||||
static SkNi Load(const int vals[4]) { return vld1q_s32(vals); }
|
||||
SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }
|
||||
|
||||
void store(int vals[4]) const { vst1q_s32(vals, fVec); }
|
||||
|
||||
SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); }
|
||||
SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); }
|
||||
SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); }
|
||||
|
||||
// Well, this is absurd. The shifts require compile-time constant arguments.
|
||||
#define SHIFT(op, v, bits) switch(bits) { \
|
||||
case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \
|
||||
case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \
|
||||
case 7: return op(v, 7); case 8: return op(v, 8); case 9: return op(v, 9); \
|
||||
case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v, 12); \
|
||||
case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v, 15); \
|
||||
case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \
|
||||
case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \
|
||||
case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \
|
||||
case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \
|
||||
case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \
|
||||
case 31: return op(v, 31); } return fVec
|
||||
|
||||
SkNi operator << (int bits) const { SHIFT(vshlq_n_s32, fVec, bits); }
|
||||
SkNi operator >> (int bits) const { SHIFT(vshrq_n_s32, fVec, bits); }
|
||||
#undef SHIFT
|
||||
|
||||
template <int k> int kth() const {
|
||||
SkASSERT(0 <= k && k < 4);
|
||||
return vgetq_lane_s32(fVec, k);
|
||||
}
|
||||
protected:
|
||||
int32x4_t fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNf<4, float> {
|
||||
typedef SkNb<4, 4> Nb;
|
||||
@ -193,6 +235,8 @@ public:
|
||||
|
||||
void store(float vals[4]) const { vst1q_f32(vals, fVec); }
|
||||
|
||||
SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); }
|
||||
|
||||
SkNf approxInvert() const {
|
||||
float32x4_t est0 = vrecpeq_f32(fVec),
|
||||
est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
|
||||
|
@ -141,6 +141,44 @@ private:
|
||||
__m128d fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNi<4, int> {
|
||||
public:
|
||||
SkNi(const __m128i& vec) : fVec(vec) {}
|
||||
|
||||
SkNi() {}
|
||||
explicit SkNi(int val) : fVec(_mm_set1_epi32(val)) {}
|
||||
static SkNi Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); }
|
||||
SkNi(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
|
||||
|
||||
void store(int vals[4]) const { _mm_storeu_si128((__m128i*)vals, fVec); }
|
||||
|
||||
SkNi operator + (const SkNi& o) const { return _mm_add_epi32(fVec, o.fVec); }
|
||||
SkNi operator - (const SkNi& o) const { return _mm_sub_epi32(fVec, o.fVec); }
|
||||
SkNi operator * (const SkNi& o) const {
|
||||
__m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
|
||||
mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
|
||||
return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
|
||||
_mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
|
||||
}
|
||||
|
||||
SkNi operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
|
||||
SkNi operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
|
||||
|
||||
template <int k> int kth() const {
|
||||
SkASSERT(0 <= k && k < 4);
|
||||
switch (k) {
|
||||
case 0: return _mm_cvtsi128_si32(fVec);
|
||||
case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 4));
|
||||
case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 8));
|
||||
case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12));
|
||||
default: SkASSERT(false); return 0;
|
||||
}
|
||||
}
|
||||
protected:
|
||||
__m128i fVec;
|
||||
};
|
||||
|
||||
template <>
|
||||
class SkNf<4, float> {
|
||||
typedef SkNb<4, 4> Nb;
|
||||
@ -154,6 +192,8 @@ public:
|
||||
|
||||
void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }
|
||||
|
||||
SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); }
|
||||
|
||||
SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
|
||||
SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
|
||||
SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
|
||||
|
@ -89,6 +89,12 @@ void test_Ni(skiatest::Reporter* r) {
|
||||
case 4: REPORTER_ASSERT(r, vals[2] == c && vals[3] == d);
|
||||
case 2: REPORTER_ASSERT(r, vals[0] == a && vals[1] == b);
|
||||
}
|
||||
switch (N) {
|
||||
case 8: REPORTER_ASSERT(r, v.template kth<4>() == e && v.template kth<5>() == f &&
|
||||
v.template kth<6>() == g && v.template kth<7>() == h);
|
||||
case 4: REPORTER_ASSERT(r, v.template kth<2>() == c && v.template kth<3>() == d);
|
||||
case 2: REPORTER_ASSERT(r, v.template kth<0>() == a && v.template kth<1>() == b);
|
||||
}
|
||||
};
|
||||
|
||||
T vals[] = { 1,2,3,4,5,6,7,8 };
|
||||
@ -117,4 +123,8 @@ DEF_TEST(SkNi, r) {
|
||||
test_Ni<2, uint16_t>(r);
|
||||
test_Ni<4, uint16_t>(r);
|
||||
test_Ni<8, uint16_t>(r);
|
||||
|
||||
test_Ni<2, int>(r);
|
||||
test_Ni<4, int>(r);
|
||||
test_Ni<8, int>(r);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user