3-15% speedup to HardLight / Overlay xfermodes.
While investigating my bug (skia:4052) I saw this TODO and figured it'd make me feel better about an otherwise unsuccessful investigation. This speeds up HardLight and Overlay (same code) by about 15% with SSE, mostly by rewriting the logic from 1 cheap comparison and 2 expensive div255() calls to 2 cheap comparisons and 1 expensive div255(). NEON speeds up by a more modest ~3%. BUG=skia: Review URL: https://codereview.chromium.org/1230663005
This commit is contained in:
parent
a5517e2b19
commit
4be181e304
@ -70,6 +70,7 @@ public:
|
||||
// Forwards the right shift to INHERITED (Sk16h) and returns the result as a Wide.
Wide operator >> (int bits) const { return INHERITED::operator>>(bits); }
|
||||
// Forwards the left shift to INHERITED (Sk16h) and returns the result as a Wide.
Wide operator << (int bits) const { return INHERITED::operator<<(bits); }
|
||||
// Forwards to INHERITED::Min (Sk16h) so that Min of two Wides yields a Wide.
static Wide Min(const Wide& a, const Wide& b) { return INHERITED::Min(a,b); }
|
||||
// Forwards to INHERITED::thenElse (Sk16h) and returns the result as a Wide.
// NOTE(review): presumably *this acts as a mask that picks t where set and e
// where clear — confirm against the Sk16h implementation.
Wide thenElse(const Wide& t, const Wide& e) const { return INHERITED::thenElse(t,e); }
|
||||
|
||||
private:
|
||||
typedef Sk16h INHERITED;
|
||||
@ -77,6 +78,7 @@ public:
|
||||
|
||||
Wide widenLo() const; // ARGB -> 0A 0R 0G 0B
|
||||
Wide widenHi() const; // ARGB -> A0 R0 G0 B0
|
||||
Wide widenLoHi() const; // ARGB -> AA RR GG BB
|
||||
Wide mulWiden(const Sk16b&) const; // 8-bit x 8-bit -> 16-bit components.
|
||||
|
||||
// The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit. Might as well make it pithy.
|
||||
|
@ -68,15 +68,13 @@ XFERMODE(HardLight) {
|
||||
auto sa = s.alphas(),
|
||||
da = d.alphas();
|
||||
|
||||
auto isLite = (sa-s) < s;
|
||||
auto isLite = ((sa-s) < s).widenLoHi();
|
||||
|
||||
auto dark = s*d << 1,
|
||||
lite = sa*da - ((da-d)*(sa-s) << 1),
|
||||
both = s*da.inv() + d*sa.inv();
|
||||
|
||||
// TODO: do isLite in 16-bit so we only have to div255() once.
|
||||
auto colors = isLite.thenElse((lite + both).div255(),
|
||||
(dark + both).div255());
|
||||
auto colors = (both + isLite.thenElse(lite, dark)).div255();
|
||||
return alphas.zeroColors() + colors.zeroAlphas();
|
||||
}
|
||||
// Overlay reuses HardLight's implementation, calling HardLight::Xfer with (d, s).
// NOTE(review): presumably this swaps source and destination relative to
// HardLight — confirm against the parameter order declared by XFERMODE.
XFERMODE(Overlay) { return HardLight::Xfer(d,s); }
|
||||
|
@ -40,6 +40,12 @@ inline Sk4px::Wide Sk4px::widenHi() const {
|
||||
vshll_n_u8(vget_high_u8(this->fVec), 8));
|
||||
}
|
||||
|
||||
// NEON widenLoHi: ARGB -> AA RR GG BB.
// Each 8-bit component is duplicated into both bytes of a 16-bit lane.
inline Sk4px::Wide Sk4px::widenLoHi() const {
|
||||
// Zipping the vector with itself interleaves every byte with a copy of itself.
auto zipped = vzipq_u8(this->fVec, this->fVec);
|
||||
// val[0] / val[1] are the two halves of the zip result; the casts reinterpret
// each pair of identical bytes as one 16-bit lane.
return Sk16h((uint16x8_t)zipped.val[0],
|
||||
(uint16x8_t)zipped.val[1]);
|
||||
}
|
||||
|
||||
inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
|
||||
return Sk16h(vmull_u8(vget_low_u8 (this->fVec), vget_low_u8 (other.fVec)),
|
||||
vmull_u8(vget_high_u8(this->fVec), vget_high_u8(other.fVec)));
|
||||
|
@ -31,6 +31,11 @@ inline Sk4px::Wide Sk4px::widenHi() const {
|
||||
_mm_unpackhi_epi8(_mm_setzero_si128(), this->fVec));
|
||||
}
|
||||
|
||||
// SSE widenLoHi: ARGB -> AA RR GG BB.
// Unpacking the vector with itself interleaves every byte with a copy of itself,
// so each 16-bit lane carries the source byte in both its low and high byte.
inline Sk4px::Wide Sk4px::widenLoHi() const {
|
||||
// unpacklo handles the low 8 bytes of fVec, unpackhi the high 8 bytes.
return Sk16h(_mm_unpacklo_epi8(this->fVec, this->fVec),
|
||||
_mm_unpackhi_epi8(this->fVec, this->fVec));
|
||||
}
|
||||
|
||||
// 8-bit x 8-bit -> 16-bit multiply: both operands are widened with widenLo()
// (ARGB -> 0A 0R 0G 0B) and then multiplied as Wide values.
// The largest possible product, 255 * 255 = 65025, fits in 16 bits.
inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
|
||||
return this->widenLo() * Sk4px(other).widenLo();
|
||||
}
|
||||
|
@ -48,6 +48,8 @@ inline Sk4px::Wide Sk4px::widenLo() const {
|
||||
|
||||
// Portable widenHi: ARGB -> A0 R0 G0 B0, built by widening with widenLo() and shifting left by 8.
inline Sk4px::Wide Sk4px::widenHi() const { return this->widenLo() << 8; }
|
||||
|
||||
// Portable widenLoHi: ARGB -> AA RR GG BB, computed as widenLo() + widenHi().
// widenLo() (0A 0R 0G 0B) fills only the low byte of each lane and widenHi()
// (A0 R0 G0 B0) only the high byte, so the sum places the component in both.
inline Sk4px::Wide Sk4px::widenLoHi() const { return this->widenLo() + this->widenHi(); }
|
||||
|
||||
// Portable 8-bit x 8-bit -> 16-bit multiply: both operands are widened with
// widenLo() (ARGB -> 0A 0R 0G 0B) and then multiplied as Wide values.
// The largest possible product, 255 * 255 = 65025, fits in 16 bits.
inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
|
||||
return this->widenLo() * Sk4px(other).widenLo();
|
||||
}
|
||||
|
@ -337,6 +337,11 @@ public:
|
||||
return vgetq_lane_u16(fVec, k&7);
|
||||
}
|
||||
|
||||
// Bitwise select using fVec as the mask: the result takes t's bits where fVec
// has a 1 and e's bits where fVec has a 0.
// NOTE(review): presumably fVec holds all-ones / all-zeros lanes produced by a
// comparison — confirm at the call sites.
SkNi thenElse(const SkNi& t, const SkNi& e) const {
|
||||
// t & mask
return vorrq_u16(vandq_u16(t.fVec, fVec),
|
||||
// e & ~mask (vbicq clears in its first operand the bits set in its second)
vbicq_u16(e.fVec, fVec));
|
||||
}
|
||||
|
||||
uint16x8_t fVec;
|
||||
};
|
||||
|
||||
|
@ -257,6 +257,11 @@ public:
|
||||
_mm_sub_epi8(b.fVec, top_8x)));
|
||||
}
|
||||
|
||||
// Bitwise select using fVec as the mask: the result takes t's bits where fVec
// has a 1 and e's bits where fVec has a 0.
// NOTE(review): presumably fVec holds all-ones / all-zeros lanes produced by a
// comparison — confirm at the call sites.
SkNi thenElse(const SkNi& t, const SkNi& e) const {
|
||||
// mask & t
return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
|
||||
// ~mask & e (andnot inverts its first operand before the AND)
_mm_andnot_si128(fVec, e.fVec));
|
||||
}
|
||||
|
||||
template <int k> uint16_t kth() const {
|
||||
SkASSERT(0 <= k && k < 8);
|
||||
return _mm_extract_epi16(fVec, k);
|
||||
|
@ -192,3 +192,19 @@ DEF_TEST(Sk4px_muldiv255round, r) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that widenLoHi() produces exactly the same bytes as widenLo() + widenHi()
// for four premultiplied colors loaded into one Sk4px.
DEF_TEST(Sk4px_widening, r) {
|
||||
SkPMColor colors[] = {
|
||||
SkPreMultiplyColor(0xff00ff00),
|
||||
SkPreMultiplyColor(0x40008000),
|
||||
SkPreMultiplyColor(0x7f020406),
|
||||
SkPreMultiplyColor(0x00000000),
|
||||
};
|
||||
auto packed = Sk4px::Load4(colors);
|
||||
|
||||
// wideLoHiAlt is the reference value, composed from the two existing widenings.
auto wideLo = packed.widenLo(),
|
||||
wideHi = packed.widenHi(),
|
||||
wideLoHi = packed.widenLoHi(),
|
||||
wideLoHiAlt = wideLo + wideHi;
|
||||
// Compare the raw object bytes of the two results.
REPORTER_ASSERT(r, 0 == memcmp(&wideLoHi, &wideLoHiAlt, sizeof(wideLoHi)));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user