Faster 4f gradient premul path

Similar to https://codereview.chromium.org/2409583003/, perform the premul in 4f. In the premul case it turns out to be even faster to skip the multiplication by 255 at load time.

Also includes some template plumbing, because DstTraits<>::load now needs to be premul-aware (previously it wasn't).

R=reed@google.com
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2416233002
Review-Url: https://codereview.chromium.org/2416233002
2016-10-14 08:18:24 -07:00 · 2016-10-14 08:18:24 -07:00 · 3a2e45a6ed
commit 3a2e45a6ed
parent 11abd8d6cb
5 changed files with 30 additions and 31 deletions
--- a/src/effects/gradients/Sk4fGradientBase.cpp
+++ b/src/effects/gradients/Sk4fGradientBase.cpp
@ -326,7 +326,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
                                               int count) const {
    static const int kBufSize = 128;
    SkScalar ts[kBufSize];
-    TSampler<dstType, tileMode> sampler(*this);
+    TSampler<dstType, premul, tileMode> sampler(*this);

    SkASSERT(count > 0);
    do {
@ -341,7 +341,7 @@ GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
    } while (count > 0);
 }

-template<DstType dstType, SkShader::TileMode tileMode>
+template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
 class SkGradientShaderBase::GradientShaderBase4fContext::TSampler {
 public:
    TSampler(const GradientShaderBase4fContext& ctx)
@ -424,8 +424,8 @@ private:
    }

    void loadIntervalData(const Interval* i) {
-        fCc = DstTraits<dstType>::load(i->fC0);
-        fDc = DstTraits<dstType>::load(i->fDc);
+        fCc = DstTraits<dstType, premul>::load(i->fC0);
+        fDc = DstTraits<dstType, premul>::load(i->fDc);
    }

    const Interval* fFirstInterval;
--- a/src/effects/gradients/Sk4fGradientBase.h
+++ b/src/effects/gradients/Sk4fGradientBase.h
@ -60,7 +60,7 @@ private:
    void addMirrorIntervals(const SkGradientShaderBase&,
                            const Sk4f& componentScale, bool reverse);

-    template<DstType, SkShader::TileMode tileMode>
+    template<DstType, ApplyPremul, SkShader::TileMode tileMode>
    class TSampler;

    template <DstType dstType, ApplyPremul premul>
--- a/src/effects/gradients/Sk4fGradientPriv.h
+++ b/src/effects/gradients/Sk4fGradientPriv.h
@ -29,17 +29,6 @@ enum class DstType {
    F32,  // Linear float.  Used for shaders only.
 };

-template <ApplyPremul premul>
-inline SkPMColor trunc_from_4f_255(const Sk4f& c) {
-    SkPMColor pmc;
-    SkNx_cast<uint8_t>(c).store(&pmc);
-    if (premul == ApplyPremul::True) {
-        pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
-                                SkGetPackedG32(pmc), SkGetPackedB32(pmc));
-    }
-    return pmc;
-}
-
 template <ApplyPremul>
 struct PremulTraits;

@ -69,24 +58,34 @@ struct PremulTraits<ApplyPremul::True> {
 //
 //   - store4x()    Store 4 Sk4f values to dest (opportunistic optimization).
 //
-template <DstType, ApplyPremul premul = ApplyPremul::False>
+template <DstType, ApplyPremul premul>
 struct DstTraits;

 template <ApplyPremul premul>
 struct DstTraits<DstType::L32, premul> {
+    using PM   = PremulTraits<premul>;
    using Type = SkPMColor;

-    // For L32, we prescale the values by 255 to save a per-pixel multiplication.
+    // For L32, prescaling by 255 saves a per-pixel multiplication when premul is not needed.
    static Sk4f load(const SkPM4f& c) {
-        return c.to4f_pmorder() * Sk4f(255);
+        return premul == ApplyPremul::False
+            ? c.to4f_pmorder() * Sk4f(255)
+            : c.to4f_pmorder();
    }

    static void store(const Sk4f& c, Type* dst) {
-        *dst = trunc_from_4f_255<premul>(c);
+        if (premul == ApplyPremul::False) {
+            // c is prescaled by 255, just store.
+            SkNx_cast<uint8_t>(c).store(dst);
+        } else {
+            *dst = Sk4f_toL32(PM::apply(c));
+        }
    }

    static void store(const Sk4f& c, Type* dst, int n) {
-        sk_memset32(dst, trunc_from_4f_255<premul>(c), n);
+        Type pmc;
+        store(c, &pmc);
+        sk_memset32(dst, pmc, n);
    }

    static void store4x(const Sk4f& c0, const Sk4f& c1,
--- a/src/effects/gradients/Sk4fLinearGradient.cpp
+++ b/src/effects/gradients/Sk4fLinearGradient.cpp
@ -240,12 +240,12 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
                  &pt);
    const SkScalar fx = pinFx<tileMode>(pt.x());
    const SkScalar dx = fDstToPos.getScaleX();
-    LinearIntervalProcessor<dstType, tileMode> proc(fIntervals.begin(),
-                                                    fIntervals.end() - 1,
-                                                    this->findInterval(fx),
-                                                    fx,
-                                                    dx,
-                                                    SkScalarNearlyZero(dx * count));
+    LinearIntervalProcessor<dstType, premul, tileMode> proc(fIntervals.begin(),
+                                                            fIntervals.end() - 1,
+                                                            this->findInterval(fx),
+                                                            fx,
+                                                            dx,
+                                                            SkScalarNearlyZero(dx * count));
    while (count > 0) {
        // What we really want here is SkTPin(advance, 1, count)
        // but that's a significant perf hit for >> stops; investigate.
@ -274,7 +274,7 @@ LinearGradient4fContext::shadeSpanInternal(int x, int y,
    }
 }

-template<DstType dstType, SkShader::TileMode tileMode>
+template<DstType dstType, ApplyPremul premul, SkShader::TileMode tileMode>
 class SkLinearGradient::
 LinearGradient4fContext::LinearIntervalProcessor {
 public:
@ -322,8 +322,8 @@ public:

 private:
    void compute_interval_props(SkScalar t) {
-        const Sk4f dC = DstTraits<dstType>::load(fInterval->fDc);
-        fCc           = DstTraits<dstType>::load(fInterval->fC0);
+        const Sk4f dC = DstTraits<dstType, premul>::load(fInterval->fDc);
+        fCc           = DstTraits<dstType, premul>::load(fInterval->fC0);
        fCc           = fCc + dC * Sk4f(t);
        fDcDx         = dC * fDx;
        fZeroRamp     = fIsVertical || fInterval->isZeroRamp();
--- a/src/effects/gradients/Sk4fLinearGradient.h
+++ b/src/effects/gradients/Sk4fLinearGradient.h
@ -27,7 +27,7 @@ protected:
 private:
    using INHERITED = GradientShaderBase4fContext;

-    template<DstType, TileMode>
+    template<DstType, ApplyPremul, TileMode>
    class LinearIntervalProcessor;

    template <DstType dstType, ApplyPremul premul>