Faster 4f gradient premul path

Similar to https://codereview.chromium.org/2409583003/, perform the
premultiplication in 4f (float) space.  It turns out it's even faster to
skip the multiply-by-255 at load time in this case.

Also includes some template plumbing because DstTraits<>::load now needs
to be premul-aware (previously it wasn't).

R=reed@google.com
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2416233002

Review-Url: https://codereview.chromium.org/2416233002
This commit is contained in:
fmalita 2016-10-14 08:18:24 -07:00 committed by Commit bot
parent 11abd8d6cb
commit 3a2e45a6ed
5 changed files with 30 additions and 31 deletions

View File

@ -326,7 +326,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
int count) const { int count) const {
static const int kBufSize = 128; static const int kBufSize = 128;
SkScalar ts[kBufSize]; SkScalar ts[kBufSize];
TSampler<dstType, tileMode> sampler(*this); TSampler<dstType, premul, tileMode> sampler(*this);
SkASSERT(count > 0); SkASSERT(count > 0);
do { do {
@ -341,7 +341,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
} while (count > 0); } while (count > 0);
} }
template<DstType dstType, SkShader::TileMode tileMode> template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
class SkGradientShaderBase::GradientShaderBase4fContext::TSampler { class SkGradientShaderBase::GradientShaderBase4fContext::TSampler {
public: public:
TSampler(const GradientShaderBase4fContext& ctx) TSampler(const GradientShaderBase4fContext& ctx)
@ -424,8 +424,8 @@ private:
} }
void loadIntervalData(const Interval* i) { void loadIntervalData(const Interval* i) {
fCc = DstTraits<dstType>::load(i->fC0); fCc = DstTraits<dstType, premul>::load(i->fC0);
fDc = DstTraits<dstType>::load(i->fDc); fDc = DstTraits<dstType, premul>::load(i->fDc);
} }
const Interval* fFirstInterval; const Interval* fFirstInterval;

View File

@ -60,7 +60,7 @@ private:
void addMirrorIntervals(const SkGradientShaderBase&, void addMirrorIntervals(const SkGradientShaderBase&,
const Sk4f& componentScale, bool reverse); const Sk4f& componentScale, bool reverse);
template<DstType, SkShader::TileMode tileMode> template<DstType, ApplyPremul, SkShader::TileMode tileMode>
class TSampler; class TSampler;
template <DstType dstType, ApplyPremul premul> template <DstType dstType, ApplyPremul premul>

View File

@ -29,17 +29,6 @@ enum class DstType {
F32, // Linear float. Used for shaders only. F32, // Linear float. Used for shaders only.
}; };
// Narrows the lanes of |c| to uint8_t and stores them into a single SkPMColor.
// When |premul| is ApplyPremul::True, the stored value is then unpacked into its
// A/R/G/B channels and re-packed through SkPreMultiplyARGB; otherwise the stored
// value is returned unchanged.
// NOTE(review): the name suggests |c| is expected to be pre-scaled to [0, 255] -- confirm
// against callers.
template <ApplyPremul premul>
inline SkPMColor trunc_from_4f_255(const Sk4f& c) {
    SkPMColor packed;
    SkNx_cast<uint8_t>(c).store(&packed);

    // Nothing more to do when premultiplication was not requested.
    if (premul != ApplyPremul::True) {
        return packed;
    }

    return SkPreMultiplyARGB(SkGetPackedA32(packed), SkGetPackedR32(packed),
                             SkGetPackedG32(packed), SkGetPackedB32(packed));
}
template <ApplyPremul> template <ApplyPremul>
struct PremulTraits; struct PremulTraits;
@ -69,24 +58,34 @@ struct PremulTraits<ApplyPremul::True> {
// //
// - store4x() Store 4 Sk4f values to dest (opportunistic optimization). // - store4x() Store 4 Sk4f values to dest (opportunistic optimization).
// //
template <DstType, ApplyPremul premul = ApplyPremul::False> template <DstType, ApplyPremul premul>
struct DstTraits; struct DstTraits;
template <ApplyPremul premul> template <ApplyPremul premul>
struct DstTraits<DstType::L32, premul> { struct DstTraits<DstType::L32, premul> {
using PM = PremulTraits<premul>;
using Type = SkPMColor; using Type = SkPMColor;
// For L32, we prescale the values by 255 to save a per-pixel multiplication. // For L32, prescaling by 255 saves a per-pixel multiplication when premul is not needed.
static Sk4f load(const SkPM4f& c) { static Sk4f load(const SkPM4f& c) {
return c.to4f_pmorder() * Sk4f(255); return premul == ApplyPremul::False
? c.to4f_pmorder() * Sk4f(255)
: c.to4f_pmorder();
} }
static void store(const Sk4f& c, Type* dst) { static void store(const Sk4f& c, Type* dst) {
*dst = trunc_from_4f_255<premul>(c); if (premul == ApplyPremul::False) {
// c is prescaled by 255, just store.
SkNx_cast<uint8_t>(c).store(dst);
} else {
*dst = Sk4f_toL32(PM::apply(c));
}
} }
static void store(const Sk4f& c, Type* dst, int n) { static void store(const Sk4f& c, Type* dst, int n) {
sk_memset32(dst, trunc_from_4f_255<premul>(c), n); Type pmc;
store(c, &pmc);
sk_memset32(dst, pmc, n);
} }
static void store4x(const Sk4f& c0, const Sk4f& c1, static void store4x(const Sk4f& c0, const Sk4f& c1,

View File

@ -240,12 +240,12 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
&pt); &pt);
const SkScalar fx = pinFx<tileMode>(pt.x()); const SkScalar fx = pinFx<tileMode>(pt.x());
const SkScalar dx = fDstToPos.getScaleX(); const SkScalar dx = fDstToPos.getScaleX();
LinearIntervalProcessor<dstType, tileMode> proc(fIntervals.begin(), LinearIntervalProcessor<dstType, premul, tileMode> proc(fIntervals.begin(),
fIntervals.end() - 1, fIntervals.end() - 1,
this->findInterval(fx), this->findInterval(fx),
fx, fx,
dx, dx,
SkScalarNearlyZero(dx * count)); SkScalarNearlyZero(dx * count));
while (count > 0) { while (count > 0) {
// What we really want here is SkTPin(advance, 1, count) // What we really want here is SkTPin(advance, 1, count)
// but that's a significant perf hit for >> stops; investigate. // but that's a significant perf hit for >> stops; investigate.
@ -274,7 +274,7 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
} }
} }
template<DstType dstType, SkShader::TileMode tileMode> template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
class SkLinearGradient:: class SkLinearGradient::
LinearGradient4fContext::LinearIntervalProcessor { LinearGradient4fContext::LinearIntervalProcessor {
public: public:
@ -322,8 +322,8 @@ public:
private: private:
void compute_interval_props(SkScalar t) { void compute_interval_props(SkScalar t) {
const Sk4f dC = DstTraits<dstType>::load(fInterval->fDc); const Sk4f dC = DstTraits<dstType, premul>::load(fInterval->fDc);
fCc = DstTraits<dstType>::load(fInterval->fC0); fCc = DstTraits<dstType, premul>::load(fInterval->fC0);
fCc = fCc + dC * Sk4f(t); fCc = fCc + dC * Sk4f(t);
fDcDx = dC * fDx; fDcDx = dC * fDx;
fZeroRamp = fIsVertical || fInterval->isZeroRamp(); fZeroRamp = fIsVertical || fInterval->isZeroRamp();

View File

@ -27,7 +27,7 @@ protected:
private: private:
using INHERITED = GradientShaderBase4fContext; using INHERITED = GradientShaderBase4fContext;
template<DstType, TileMode> template<DstType, ApplyPremul, TileMode>
class LinearIntervalProcessor; class LinearIntervalProcessor;
template <DstType dstType, ApplyPremul premul> template <DstType dstType, ApplyPremul premul>