Faster 4f gradient premul path
Similar to https://codereview.chromium.org/2409583003/, perform the premultiplication in 4f (Sk4f float) space instead of on the packed 8-bit result. It turns out to be even faster to skip the multiplication by 255 at load time when premul is applied. Also includes some template plumbing, because DstTraits<>::load now needs to be premul-aware (previously it was not). R=reed@google.com GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2416233002 Review-Url: https://codereview.chromium.org/2416233002
This commit is contained in:
parent
11abd8d6cb
commit
3a2e45a6ed
@ -326,7 +326,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
|
||||
int count) const {
|
||||
static const int kBufSize = 128;
|
||||
SkScalar ts[kBufSize];
|
||||
TSampler<dstType, tileMode> sampler(*this);
|
||||
TSampler<dstType, premul, tileMode> sampler(*this);
|
||||
|
||||
SkASSERT(count > 0);
|
||||
do {
|
||||
@ -341,7 +341,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
|
||||
} while (count > 0);
|
||||
}
|
||||
|
||||
template<DstType dstType, SkShader::TileMode tileMode>
|
||||
template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
|
||||
class SkGradientShaderBase::GradientShaderBase4fContext::TSampler {
|
||||
public:
|
||||
TSampler(const GradientShaderBase4fContext& ctx)
|
||||
@ -424,8 +424,8 @@ private:
|
||||
}
|
||||
|
||||
void loadIntervalData(const Interval* i) {
|
||||
fCc = DstTraits<dstType>::load(i->fC0);
|
||||
fDc = DstTraits<dstType>::load(i->fDc);
|
||||
fCc = DstTraits<dstType, premul>::load(i->fC0);
|
||||
fDc = DstTraits<dstType, premul>::load(i->fDc);
|
||||
}
|
||||
|
||||
const Interval* fFirstInterval;
|
||||
|
@ -60,7 +60,7 @@ private:
|
||||
void addMirrorIntervals(const SkGradientShaderBase&,
|
||||
const Sk4f& componentScale, bool reverse);
|
||||
|
||||
template<DstType, SkShader::TileMode tileMode>
|
||||
template<DstType, ApplyPremul, SkShader::TileMode tileMode>
|
||||
class TSampler;
|
||||
|
||||
template <DstType dstType, ApplyPremul premul>
|
||||
|
@ -29,17 +29,6 @@ enum class DstType {
|
||||
F32, // Linear float. Used for shaders only.
|
||||
};
|
||||
|
||||
template <ApplyPremul premul>
|
||||
inline SkPMColor trunc_from_4f_255(const Sk4f& c) {
|
||||
SkPMColor pmc;
|
||||
SkNx_cast<uint8_t>(c).store(&pmc);
|
||||
if (premul == ApplyPremul::True) {
|
||||
pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
|
||||
SkGetPackedG32(pmc), SkGetPackedB32(pmc));
|
||||
}
|
||||
return pmc;
|
||||
}
|
||||
|
||||
template <ApplyPremul>
|
||||
struct PremulTraits;
|
||||
|
||||
@ -69,24 +58,34 @@ struct PremulTraits<ApplyPremul::True> {
|
||||
//
|
||||
// - store4x() Store 4 Sk4f values to dest (opportunistic optimization).
|
||||
//
|
||||
template <DstType, ApplyPremul premul = ApplyPremul::False>
|
||||
template <DstType, ApplyPremul premul>
|
||||
struct DstTraits;
|
||||
|
||||
template <ApplyPremul premul>
|
||||
struct DstTraits<DstType::L32, premul> {
|
||||
using PM = PremulTraits<premul>;
|
||||
using Type = SkPMColor;
|
||||
|
||||
// For L32, we prescale the values by 255 to save a per-pixel multiplication.
|
||||
// For L32, prescaling by 255 saves a per-pixel multiplication when premul is not needed.
|
||||
static Sk4f load(const SkPM4f& c) {
|
||||
return c.to4f_pmorder() * Sk4f(255);
|
||||
return premul == ApplyPremul::False
|
||||
? c.to4f_pmorder() * Sk4f(255)
|
||||
: c.to4f_pmorder();
|
||||
}
|
||||
|
||||
static void store(const Sk4f& c, Type* dst) {
|
||||
*dst = trunc_from_4f_255<premul>(c);
|
||||
if (premul == ApplyPremul::False) {
|
||||
// c is prescaled by 255, just store.
|
||||
SkNx_cast<uint8_t>(c).store(dst);
|
||||
} else {
|
||||
*dst = Sk4f_toL32(PM::apply(c));
|
||||
}
|
||||
}
|
||||
|
||||
static void store(const Sk4f& c, Type* dst, int n) {
|
||||
sk_memset32(dst, trunc_from_4f_255<premul>(c), n);
|
||||
Type pmc;
|
||||
store(c, &pmc);
|
||||
sk_memset32(dst, pmc, n);
|
||||
}
|
||||
|
||||
static void store4x(const Sk4f& c0, const Sk4f& c1,
|
||||
|
@ -240,12 +240,12 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
|
||||
&pt);
|
||||
const SkScalar fx = pinFx<tileMode>(pt.x());
|
||||
const SkScalar dx = fDstToPos.getScaleX();
|
||||
LinearIntervalProcessor<dstType, tileMode> proc(fIntervals.begin(),
|
||||
fIntervals.end() - 1,
|
||||
this->findInterval(fx),
|
||||
fx,
|
||||
dx,
|
||||
SkScalarNearlyZero(dx * count));
|
||||
LinearIntervalProcessor<dstType, premul, tileMode> proc(fIntervals.begin(),
|
||||
fIntervals.end() - 1,
|
||||
this->findInterval(fx),
|
||||
fx,
|
||||
dx,
|
||||
SkScalarNearlyZero(dx * count));
|
||||
while (count > 0) {
|
||||
// What we really want here is SkTPin(advance, 1, count)
|
||||
// but that's a significant perf hit for >> stops; investigate.
|
||||
@ -274,7 +274,7 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
|
||||
}
|
||||
}
|
||||
|
||||
template<DstType dstType, SkShader::TileMode tileMode>
|
||||
template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
|
||||
class SkLinearGradient::
|
||||
LinearGradient4fContext::LinearIntervalProcessor {
|
||||
public:
|
||||
@ -322,8 +322,8 @@ public:
|
||||
|
||||
private:
|
||||
void compute_interval_props(SkScalar t) {
|
||||
const Sk4f dC = DstTraits<dstType>::load(fInterval->fDc);
|
||||
fCc = DstTraits<dstType>::load(fInterval->fC0);
|
||||
const Sk4f dC = DstTraits<dstType, premul>::load(fInterval->fDc);
|
||||
fCc = DstTraits<dstType, premul>::load(fInterval->fC0);
|
||||
fCc = fCc + dC * Sk4f(t);
|
||||
fDcDx = dC * fDx;
|
||||
fZeroRamp = fIsVertical || fInterval->isZeroRamp();
|
||||
|
@ -27,7 +27,7 @@ protected:
|
||||
private:
|
||||
using INHERITED = GradientShaderBase4fContext;
|
||||
|
||||
template<DstType, TileMode>
|
||||
template<DstType, ApplyPremul, TileMode>
|
||||
class LinearIntervalProcessor;
|
||||
|
||||
template <DstType dstType, ApplyPremul premul>
|
||||
|
Loading…
Reference in New Issue
Block a user