De-templatize Sk4pxXfermode code a bit.

This deduplicates a few pieces of code:
  - we end up with one copy of each xfer32() driver loop instead of one per xfermode;
  - we end up with two* copies of each xfermode implementation instead of ten**.

* For a given Mode: Mode() itself and xfer_aa<Mode>().
** From unrolling: twice at a stride of 8, once each at strides of 4, 2, and 1, and then all of that again for the AA case.

This decreases the size of SkXfermode.o from 1.5M to 620K on x86-64 and from 1.3M to 680K on ARMv7+NEON.

If we wanted to, we could eliminate the xfer_aa<Mode>() copy by tagging each Mode() function as __attribute__((noinline)) or its equivalent.  This would save roughly another 100K.

The performance hit scales with how fast each xfermode originally was:
fast modes like Plus take the largest proportional hit, and slow modes
like HardLight or SoftLight see essentially no hit at all.

This adds SK_VECTORCALL to help keep this code fast on ARMv7 and Windows.  I've looked at the generated ARMv7 code, and it looks good, even pretty.

For compatibility with SK_VECTORCALL, we now pass the vector-sized arguments by value instead of by reference.  Some refactoring now allows us to declare each mode as just a static function instead of a struct, which simplifies things.

TBR=reed@google.com
No public API changes.

BUG=skia:

Committed: https://skia.googlesource.com/skia/+/e617e1525916d7ee684142728c0905828caf49da

CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm7-Debug-Android_NoNeon-Trybot

Review URL: https://codereview.chromium.org/1242743004
This commit is contained in:
mtklein 2015-07-21 12:39:57 -07:00 committed by Commit bot
parent c3dcb67f07
commit cd1930d4f1
2 changed files with 79 additions and 65 deletions

View File

@ -297,6 +297,14 @@
# endif # endif
#endif #endif
#if defined(SK_BUILD_FOR_WIN)
#define SK_VECTORCALL __vectorcall
#elif defined(SK_CPU_ARM32)
#define SK_VECTORCALL __attribute__((pcs("aapcs-vfp")))
#else
#define SK_VECTORCALL
#endif
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
#if defined(__clang__) || defined(__GNUC__) #if defined(__clang__) || defined(__GNUC__)

View File

@ -16,13 +16,15 @@
// Each gets its own independent instantiation by wrapping in an anonymous namespace. // Each gets its own independent instantiation by wrapping in an anonymous namespace.
namespace { namespace {
#if defined(SK_CPU_ARM32) && !defined(SK_ARM_HAS_NEON)
// Signals SkXfermode.cpp to look for runtime-detected NEON.
static SkProcCoeffXfermode* SkCreate4pxXfermode(const ProcCoeff& rec, SkXfermode::Mode mode) {
return nullptr;
}
#else
// Most xfermodes can be done most efficiently 4 pixels at a time in 8 or 16-bit fixed point. // Most xfermodes can be done most efficiently 4 pixels at a time in 8 or 16-bit fixed point.
#define XFERMODE(Name) \ #define XFERMODE(Name) static Sk4px SK_VECTORCALL Name(Sk4px s, Sk4px d)
struct Name { \
static Sk4px Xfer(const Sk4px&, const Sk4px&); \
static const SkXfermode::Mode kMode = SkXfermode::k##Name##_Mode; \
}; \
inline Sk4px Name::Xfer(const Sk4px& s, const Sk4px& d)
XFERMODE(Clear) { return Sk4px::DupPMColor(0); } XFERMODE(Clear) { return Sk4px::DupPMColor(0); }
XFERMODE(Src) { return s; } XFERMODE(Src) { return s; }
@ -30,13 +32,13 @@ XFERMODE(Dst) { return d; }
XFERMODE(SrcIn) { return s.approxMulDiv255(d.alphas() ); } XFERMODE(SrcIn) { return s.approxMulDiv255(d.alphas() ); }
XFERMODE(SrcOut) { return s.approxMulDiv255(d.alphas().inv()); } XFERMODE(SrcOut) { return s.approxMulDiv255(d.alphas().inv()); }
XFERMODE(SrcOver) { return s + d.approxMulDiv255(s.alphas().inv()); } XFERMODE(SrcOver) { return s + d.approxMulDiv255(s.alphas().inv()); }
XFERMODE(DstIn) { return SrcIn ::Xfer(d,s); } XFERMODE(DstIn) { return SrcIn (d,s); }
XFERMODE(DstOut) { return SrcOut ::Xfer(d,s); } XFERMODE(DstOut) { return SrcOut (d,s); }
XFERMODE(DstOver) { return SrcOver::Xfer(d,s); } XFERMODE(DstOver) { return SrcOver(d,s); }
// [ S * Da + (1 - Sa) * D] // [ S * Da + (1 - Sa) * D]
XFERMODE(SrcATop) { return (s * d.alphas() + d * s.alphas().inv()).div255(); } XFERMODE(SrcATop) { return (s * d.alphas() + d * s.alphas().inv()).div255(); }
XFERMODE(DstATop) { return SrcATop::Xfer(d,s); } XFERMODE(DstATop) { return SrcATop(d,s); }
//[ S * (1 - Da) + (1 - Sa) * D ] //[ S * (1 - Da) + (1 - Sa) * D ]
XFERMODE(Xor) { return (s * d.alphas().inv() + d * s.alphas().inv()).div255(); } XFERMODE(Xor) { return (s * d.alphas().inv() + d * s.alphas().inv()).div255(); }
// [S + D ] // [S + D ]
@ -86,7 +88,7 @@ XFERMODE(HardLight) {
auto colors = (both + isLite.thenElse(lite, dark)).div255(); auto colors = (both + isLite.thenElse(lite, dark)).div255();
return alphas.zeroColors() + colors.zeroAlphas(); return alphas.zeroColors() + colors.zeroAlphas();
} }
XFERMODE(Overlay) { return HardLight::Xfer(d,s); } XFERMODE(Overlay) { return HardLight(d,s); }
XFERMODE(Darken) { XFERMODE(Darken) {
auto sa = s.alphas(), auto sa = s.alphas(),
@ -117,12 +119,7 @@ XFERMODE(Lighten) {
#undef XFERMODE #undef XFERMODE
// Some xfermodes use math like divide or sqrt that's best done in floats 1 pixel at a time. // Some xfermodes use math like divide or sqrt that's best done in floats 1 pixel at a time.
#define XFERMODE(Name) \ #define XFERMODE(Name) static SkPMFloat SK_VECTORCALL Name(SkPMFloat s, SkPMFloat d)
struct Name { \
static SkPMFloat Xfer(const SkPMFloat&, const SkPMFloat&); \
static const SkXfermode::Mode kMode = SkXfermode::k##Name##_Mode; \
}; \
inline SkPMFloat Name::Xfer(const SkPMFloat& s, const SkPMFloat& d)
XFERMODE(ColorDodge) { XFERMODE(ColorDodge) {
auto sa = s.alphas(), auto sa = s.alphas(),
@ -185,15 +182,15 @@ XFERMODE(SoftLight) {
// A reasonable fallback mode for doing AA is to simply apply the transfermode first, // A reasonable fallback mode for doing AA is to simply apply the transfermode first,
// then linearly interpolate the AA. // then linearly interpolate the AA.
template <typename Mode> template <Sk4px (SK_VECTORCALL *Mode)(Sk4px, Sk4px)>
static Sk4px xfer_aa(const Sk4px& s, const Sk4px& d, const Sk4px& aa) { static Sk4px SK_VECTORCALL xfer_aa(Sk4px s, Sk4px d, Sk4px aa) {
Sk4px bw = Mode::Xfer(s, d); Sk4px bw = Mode(s, d);
return (bw * aa + d * aa.inv()).div255(); return (bw * aa + d * aa.inv()).div255();
} }
// For some transfermodes we specialize AA, either for correctness or performance. // For some transfermodes we specialize AA, either for correctness or performance.
#define XFERMODE_AA(Name) \ #define XFERMODE_AA(Name) \
template <> Sk4px xfer_aa<Name>(const Sk4px& s, const Sk4px& d, const Sk4px& aa) template <> Sk4px SK_VECTORCALL xfer_aa<Name>(Sk4px s, Sk4px d, Sk4px aa)
// Plus' clamp needs to happen after AA. skia:3852 // Plus' clamp needs to happen after AA. skia:3852
XFERMODE_AA(Plus) { // [ clamp( (1-AA)D + (AA)(S+D) ) == clamp(D + AA*S) ] XFERMODE_AA(Plus) { // [ clamp( (1-AA)D + (AA)(S+D) ) == clamp(D + AA*S) ]
@ -202,44 +199,47 @@ XFERMODE_AA(Plus) { // [ clamp( (1-AA)D + (AA)(S+D) ) == clamp(D + AA*S) ]
#undef XFERMODE_AA #undef XFERMODE_AA
template <typename ProcType> class Sk4pxXfermode : public SkProcCoeffXfermode {
class SkT4pxXfermode : public SkProcCoeffXfermode {
public: public:
static SkProcCoeffXfermode* Create(const ProcCoeff& rec) { typedef Sk4px (SK_VECTORCALL *Proc4)(Sk4px, Sk4px);
return SkNEW_ARGS(SkT4pxXfermode, (rec)); typedef Sk4px (SK_VECTORCALL *AAProc4)(Sk4px, Sk4px, Sk4px);
}
Sk4pxXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, Proc4 proc4, AAProc4 aaproc4)
: INHERITED(rec, mode)
, fProc4(proc4)
, fAAProc4(aaproc4) {}
void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override { void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {
if (NULL == aa) { if (NULL == aa) {
Sk4px::MapDstSrc(n, dst, src, [&](const Sk4px& dst4, const Sk4px& src4) { Sk4px::MapDstSrc(n, dst, src, [&](const Sk4px& dst4, const Sk4px& src4) {
return ProcType::Xfer(src4, dst4); return fProc4(src4, dst4);
}); });
} else { } else {
Sk4px::MapDstSrcAlpha(n, dst, src, aa, Sk4px::MapDstSrcAlpha(n, dst, src, aa,
[&](const Sk4px& dst4, const Sk4px& src4, const Sk4px& alpha) { [&](const Sk4px& dst4, const Sk4px& src4, const Sk4px& alpha) {
return xfer_aa<ProcType>(src4, dst4, alpha); return fAAProc4(src4, dst4, alpha);
}); });
} }
} }
private: private:
SkT4pxXfermode(const ProcCoeff& rec) : INHERITED(rec, ProcType::kMode) {} Proc4 fProc4;
AAProc4 fAAProc4;
typedef SkProcCoeffXfermode INHERITED; typedef SkProcCoeffXfermode INHERITED;
}; };
template <typename ProcType> class SkPMFloatXfermode : public SkProcCoeffXfermode {
class SkTPMFloatXfermode : public SkProcCoeffXfermode {
public: public:
static SkProcCoeffXfermode* Create(const ProcCoeff& rec) { typedef SkPMFloat (SK_VECTORCALL *ProcF)(SkPMFloat, SkPMFloat);
return SkNEW_ARGS(SkTPMFloatXfermode, (rec)); SkPMFloatXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, ProcF procf)
} : INHERITED(rec, mode)
, fProcF(procf) {}
void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override { void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
SkPMFloat s(src[i]), SkPMFloat s(src[i]),
d(dst[i]), d(dst[i]),
b(ProcType::Xfer(s,d)); b(fProcF(s,d));
if (aa) { if (aa) {
// We do aa in full float precision before going back down to bytes, because we can! // We do aa in full float precision before going back down to bytes, because we can!
SkPMFloat a = Sk4f(aa[i]) * Sk4f(1.0f/255); SkPMFloat a = Sk4f(aa[i]) * Sk4f(1.0f/255);
@ -250,46 +250,52 @@ public:
} }
private: private:
SkTPMFloatXfermode(const ProcCoeff& rec) : INHERITED(rec, ProcType::kMode) {} ProcF fProcF;
typedef SkProcCoeffXfermode INHERITED; typedef SkProcCoeffXfermode INHERITED;
}; };
static SkProcCoeffXfermode* SkCreate4pxXfermode(const ProcCoeff& rec, SkXfermode::Mode mode) { static SkProcCoeffXfermode* SkCreate4pxXfermode(const ProcCoeff& rec, SkXfermode::Mode mode) {
#if !defined(SK_CPU_ARM32) || defined(SK_ARM_HAS_NEON)
switch (mode) { switch (mode) {
case SkXfermode::kClear_Mode: return SkT4pxXfermode<Clear>::Create(rec); #define CASE(Mode) case SkXfermode::k##Mode##_Mode: \
case SkXfermode::kSrc_Mode: return SkT4pxXfermode<Src>::Create(rec); return SkNEW_ARGS(Sk4pxXfermode, (rec, mode, &Mode, &xfer_aa<Mode>))
case SkXfermode::kDst_Mode: return SkT4pxXfermode<Dst>::Create(rec); CASE(Clear);
case SkXfermode::kSrcOver_Mode: return SkT4pxXfermode<SrcOver>::Create(rec); CASE(Src);
case SkXfermode::kDstOver_Mode: return SkT4pxXfermode<DstOver>::Create(rec); CASE(Dst);
case SkXfermode::kSrcIn_Mode: return SkT4pxXfermode<SrcIn>::Create(rec); CASE(SrcOver);
case SkXfermode::kDstIn_Mode: return SkT4pxXfermode<DstIn>::Create(rec); CASE(DstOver);
case SkXfermode::kSrcOut_Mode: return SkT4pxXfermode<SrcOut>::Create(rec); CASE(SrcIn);
case SkXfermode::kDstOut_Mode: return SkT4pxXfermode<DstOut>::Create(rec); CASE(DstIn);
case SkXfermode::kSrcATop_Mode: return SkT4pxXfermode<SrcATop>::Create(rec); CASE(SrcOut);
case SkXfermode::kDstATop_Mode: return SkT4pxXfermode<DstATop>::Create(rec); CASE(DstOut);
case SkXfermode::kXor_Mode: return SkT4pxXfermode<Xor>::Create(rec); CASE(SrcATop);
case SkXfermode::kPlus_Mode: return SkT4pxXfermode<Plus>::Create(rec); CASE(DstATop);
case SkXfermode::kModulate_Mode: return SkT4pxXfermode<Modulate>::Create(rec); CASE(Xor);
case SkXfermode::kScreen_Mode: return SkT4pxXfermode<Screen>::Create(rec); CASE(Plus);
case SkXfermode::kMultiply_Mode: return SkT4pxXfermode<Multiply>::Create(rec); CASE(Modulate);
case SkXfermode::kDifference_Mode: return SkT4pxXfermode<Difference>::Create(rec); CASE(Screen);
case SkXfermode::kExclusion_Mode: return SkT4pxXfermode<Exclusion>::Create(rec); CASE(Multiply);
case SkXfermode::kHardLight_Mode: return SkT4pxXfermode<HardLight>::Create(rec); CASE(Difference);
case SkXfermode::kOverlay_Mode: return SkT4pxXfermode<Overlay>::Create(rec); CASE(Exclusion);
case SkXfermode::kDarken_Mode: return SkT4pxXfermode<Darken>::Create(rec); CASE(HardLight);
case SkXfermode::kLighten_Mode: return SkT4pxXfermode<Lighten>::Create(rec); CASE(Overlay);
CASE(Darken);
CASE(Lighten);
#undef CASE
#define CASE(Mode) case SkXfermode::k##Mode##_Mode: \
return SkNEW_ARGS(SkPMFloatXfermode, (rec, mode, &Mode))
CASE(ColorDodge);
CASE(ColorBurn);
CASE(SoftLight);
#undef CASE
case SkXfermode::kColorDodge_Mode: return SkTPMFloatXfermode<ColorDodge>::Create(rec);
case SkXfermode::kColorBurn_Mode: return SkTPMFloatXfermode<ColorBurn>::Create(rec);
case SkXfermode::kSoftLight_Mode: return SkTPMFloatXfermode<SoftLight>::Create(rec);
default: break; default: break;
} }
#endif
return nullptr; return nullptr;
} }
#endif
} // namespace } // namespace
#endif//Sk4pxXfermode_DEFINED #endif//Sk4pxXfermode_DEFINED