Faster 4f gradient premul path

Similar to https://codereview.chromium.org/2409583003/, perform the
premul in 4f.  It turns out it's even faster to skip the multiplication
by 255 at load time in this case.

Also includes some template plumbing because DstTraits<>::load now needs
to be premul-aware (previously it wasn't).

R=reed@google.com
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2416233002

Review-Url: https://codereview.chromium.org/2416233002
This commit is contained in:
fmalita 2016-10-14 08:18:24 -07:00 committed by Commit bot
parent 11abd8d6cb
commit 3a2e45a6ed
5 changed files with 30 additions and 31 deletions

View File

@ -326,7 +326,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
int count) const {
static const int kBufSize = 128;
SkScalar ts[kBufSize];
TSampler<dstType, tileMode> sampler(*this);
TSampler<dstType, premul, tileMode> sampler(*this);
SkASSERT(count > 0);
do {
@ -341,7 +341,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
} while (count > 0);
}
template<DstType dstType, SkShader::TileMode tileMode>
template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
class SkGradientShaderBase::GradientShaderBase4fContext::TSampler {
public:
TSampler(const GradientShaderBase4fContext& ctx)
@ -424,8 +424,8 @@ private:
}
void loadIntervalData(const Interval* i) {
fCc = DstTraits<dstType>::load(i->fC0);
fDc = DstTraits<dstType>::load(i->fDc);
fCc = DstTraits<dstType, premul>::load(i->fC0);
fDc = DstTraits<dstType, premul>::load(i->fDc);
}
const Interval* fFirstInterval;

View File

@ -60,7 +60,7 @@ private:
void addMirrorIntervals(const SkGradientShaderBase&,
const Sk4f& componentScale, bool reverse);
template<DstType, SkShader::TileMode tileMode>
template<DstType, ApplyPremul, SkShader::TileMode tileMode>
class TSampler;
template <DstType dstType, ApplyPremul premul>

View File

@ -29,17 +29,6 @@ enum class DstType {
F32, // Linear float. Used for shaders only.
};
template <ApplyPremul premul>
inline SkPMColor trunc_from_4f_255(const Sk4f& c) {
SkPMColor pmc;
SkNx_cast<uint8_t>(c).store(&pmc);
if (premul == ApplyPremul::True) {
pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
SkGetPackedG32(pmc), SkGetPackedB32(pmc));
}
return pmc;
}
template <ApplyPremul>
struct PremulTraits;
@ -69,24 +58,34 @@ struct PremulTraits<ApplyPremul::True> {
//
// - store4x() Store 4 Sk4f values to dest (opportunistic optimization).
//
template <DstType, ApplyPremul premul = ApplyPremul::False>
template <DstType, ApplyPremul premul>
struct DstTraits;
template <ApplyPremul premul>
struct DstTraits<DstType::L32, premul> {
using PM = PremulTraits<premul>;
using Type = SkPMColor;
// For L32, we prescale the values by 255 to save a per-pixel multiplication.
// For L32, prescaling by 255 saves a per-pixel multiplication when premul is not needed.
static Sk4f load(const SkPM4f& c) {
return c.to4f_pmorder() * Sk4f(255);
return premul == ApplyPremul::False
? c.to4f_pmorder() * Sk4f(255)
: c.to4f_pmorder();
}
static void store(const Sk4f& c, Type* dst) {
*dst = trunc_from_4f_255<premul>(c);
if (premul == ApplyPremul::False) {
// c is prescaled by 255, just store.
SkNx_cast<uint8_t>(c).store(dst);
} else {
*dst = Sk4f_toL32(PM::apply(c));
}
}
static void store(const Sk4f& c, Type* dst, int n) {
sk_memset32(dst, trunc_from_4f_255<premul>(c), n);
Type pmc;
store(c, &pmc);
sk_memset32(dst, pmc, n);
}
static void store4x(const Sk4f& c0, const Sk4f& c1,

View File

@ -240,12 +240,12 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
&pt);
const SkScalar fx = pinFx<tileMode>(pt.x());
const SkScalar dx = fDstToPos.getScaleX();
LinearIntervalProcessor<dstType, tileMode> proc(fIntervals.begin(),
fIntervals.end() - 1,
this->findInterval(fx),
fx,
dx,
SkScalarNearlyZero(dx * count));
LinearIntervalProcessor<dstType, premul, tileMode> proc(fIntervals.begin(),
fIntervals.end() - 1,
this->findInterval(fx),
fx,
dx,
SkScalarNearlyZero(dx * count));
while (count > 0) {
// What we really want here is SkTPin(advance, 1, count)
// but that's a significant perf hit for >> stops; investigate.
@ -274,7 +274,7 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
}
}
template<DstType dstType, SkShader::TileMode tileMode>
template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
class SkLinearGradient::
LinearGradient4fContext::LinearIntervalProcessor {
public:
@ -322,8 +322,8 @@ public:
private:
void compute_interval_props(SkScalar t) {
const Sk4f dC = DstTraits<dstType>::load(fInterval->fDc);
fCc = DstTraits<dstType>::load(fInterval->fC0);
const Sk4f dC = DstTraits<dstType, premul>::load(fInterval->fDc);
fCc = DstTraits<dstType, premul>::load(fInterval->fC0);
fCc = fCc + dC * Sk4f(t);
fDcDx = dC * fDx;
fZeroRamp = fIsVertical || fInterval->isZeroRamp();

View File

@ -27,7 +27,7 @@ protected:
private:
using INHERITED = GradientShaderBase4fContext;
template<DstType, TileMode>
template<DstType, ApplyPremul, TileMode>
class LinearIntervalProcessor;
template <DstType dstType, ApplyPremul premul>