Sk4px: alphas() and Load[24]Alphas()

alphas() extracts the 4 alphas from an existing Sk4px as another Sk4px.
LoadNAlphas() constructs an Sk4px from N packed alphas.

In both cases, we end up with 4x repeated alphas aligned with their pixels.

alphas()
  A0 R0 G0 B0  A1 R1 G1 B1  A2 R2 G2 B2  A3 R3 G3 B3
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load4Alphas()
  A0 A1 A2 A3
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load2Alphas()
  A0 A1
  ->
  A0 A0 A0 A0  A1 A1 A1 A1  0 0 0 0  0 0 0 0

This is a 5-10% speedup for AA on Intel, and a wash on ARM.
AA is still mostly dominated by the final lerp.
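
For reference, the "final lerp" is, per channel, roughly the sketch below; the
exact divide-by-255 rounding in the vectorized code differs, so treat this as
illustrative math rather than Skia's implementation:

  #include <cstdint>

  // dst' = src*aa + dst*(255 - aa), scaled back down by 255.
  static uint8_t lerp255(uint8_t src, uint8_t dst, uint8_t aa) {
      unsigned sum = src * aa + dst * (255 - aa);
      return (uint8_t)((sum + 127) / 255);   // round-to-nearest divide by 255
  }

  int main() {
      // aa == 255 selects src; aa == 0 keeps dst.
      return (lerp255(200, 50, 255) == 200 && lerp255(200, 50, 0) == 50) ? 0 : 1;
  }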

alphas() isn't used yet, but it's similar enough to Load[24]Alphas()
that it was easier to write all at once.

BUG=skia:

Review URL: https://codereview.chromium.org/1138333003
Author: mtklein, 2015-05-13 12:19:42 -07:00 (committed by Commit bot)
Parent: 5ae1312c9f
Commit: 8a90edc2a5
4 changed files with 106 additions and 11 deletions


@@ -14,14 +14,22 @@
// 1, 2 or 4 SkPMColors, generally vectorized.
class Sk4px : public Sk16b {
public:
- Sk4px(SkPMColor); // Duplicate 4x.
- Sk4px(const Sk16b& v) : Sk16b(v) {}
+ Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x.
+ Sk4px(SkPMColor); // Duplicate 4x.
+ Sk4px(const Sk16b& v) : INHERITED(v) {}
+ // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
+ Sk4px alphas() const;
// When loading or storing fewer than 4 SkPMColors, we use the low lanes.
static Sk4px Load4(const SkPMColor[4]);
static Sk4px Load2(const SkPMColor[2]);
static Sk4px Load1(const SkPMColor[1]);
+ // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.
+ static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx
+ static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa 0000 0000
void store4(SkPMColor[4]) const;
void store2(SkPMColor[2]) const;
void store1(SkPMColor[1]) const;
@@ -111,13 +119,10 @@ public:
template <typename Fn>
static void MapDstSrcAlpha(
int count, SkPMColor* dst, const SkPMColor* src, const SkAlpha* a, Fn fn) {
- // TODO: find a terser / faster way to construct Sk16b alphas.
while (count > 0) {
if (count >= 8) {
- Sk16b alpha0(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
-              a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]),
-       alpha4(a[4],a[4],a[4],a[4], a[5],a[5],a[5],a[5],
-              a[6],a[6],a[6],a[6], a[7],a[7],a[7],a[7]);
+ Sk4px alpha0 = Load4Alphas(a+0),
+       alpha4 = Load4Alphas(a+4);
Sk4px dst0 = fn(Load4(dst+0), Load4(src+0), alpha0),
dst4 = fn(Load4(dst+4), Load4(src+4), alpha4);
dst0.store4(dst+0);
@@ -127,18 +132,17 @@ public:
}
SkASSERT(count <= 7);
if (count >= 4) {
- Sk16b alpha(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
-             a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]);
+ Sk4px alpha = Load4Alphas(a);
fn(Load4(dst), Load4(src), alpha).store4(dst);
dst += 4; src += 4; a += 4; count -= 4;
}
if (count >= 2) {
- Sk16b alpha(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1], 0,0,0,0, 0,0,0,0);
+ Sk4px alpha = Load2Alphas(a);
fn(Load2(dst), Load2(src), alpha).store2(dst);
dst += 2; src += 2; a += 2; count -= 2;
}
if (count >= 1) {
- Sk16b alpha(a[0],a[0],a[0],a[0], 0,0,0,0, 0,0,0,0, 0,0,0,0);
+ Sk4px alpha(*a);
fn(Load1(dst), Load1(src), alpha).store1(dst);
}
break;
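
(Aside: the 8/4/2/1 structure of MapDstSrcAlpha above can be sketched without
any SIMD. The code below only mirrors the control flow, with a plain per-pixel
callback standing in for fn; the names are made up for illustration.)

  #include <cstdint>

  // Handle 8 pixels at a time while at least 8 remain, then mop up a tail of
  // at most 7 pixels with groups of 4, 2, and 1.
  template <typename Fn>
  static void map_dst_src_alpha_model(int count, uint32_t* dst, const uint32_t* src,
                                      const uint8_t* a, Fn fn) {
      auto group = [&](int n) {
          for (int i = 0; i < n; i++) { dst[i] = fn(dst[i], src[i], a[i]); }
          dst += n; src += n; a += n; count -= n;
      };
      while (count >= 8) { group(8); }
      if (count >= 4) { group(4); }
      if (count >= 2) { group(2); }
      if (count >= 1) { group(1); }
  }

  int main() {
      uint32_t dst[11] = {}, src[11];
      uint8_t  aa[11];
      for (int i = 0; i < 11; i++) { src[i] = 0x01010101u * (uint32_t)i; aa[i] = 255; }
      // Trivial fn: with full coverage, just take src.
      map_dst_src_alpha_model(11, dst, src, aa,
                              [](uint32_t, uint32_t s, uint8_t) { return s; });
      return dst[10] == src[10] ? 0 : 1;
  }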


@@ -48,3 +48,33 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
}
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
auto as = vshrq_n_u32((uint32x4_t)this->fVec, 24); // ___3 ___2 ___1 ___0
as = vorrq_u32(as, vshlq_n_u32(as, 8)); // __33 __22 __11 __00
as = vorrq_u32(as, vshlq_n_u32(as, 16)); // 3333 2222 1111 0000
return Sk16b((uint8x16_t)as);
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
a8 = vld1q_lane_u8(a+2, a8, 8); // ____ ___2 ___1 ___0
a8 = vld1q_lane_u8(a+3, a8, 12); // ___3 ___2 ___1 ___0
auto a32 = (uint32x4_t)a8; //
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // __33 __22 __11 __00
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // 3333 2222 1111 0000
return Sk16b((uint8x16_t)a32);
}
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
auto a32 = (uint32x4_t)a8; //
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // ____ ____ __11 __00
a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // ____ ____ 1111 0000
return Sk16b((uint8x16_t)a32);
}
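
(Aside: the two vshlq/vorrq steps above are just a byte broadcast within each
32-bit lane. A scalar sketch of the same trick, assuming the upper three bytes
of the lane start out zero as they do above:)

  #include <cassert>
  #include <cstdint>

  // ___A -> __AA -> AAAA, the same shift-and-or sequence the NEON code uses.
  static uint32_t broadcast_low_byte(uint32_t lane) {
      lane |= lane << 8;
      lane |= lane << 16;
      return lane;
  }

  int main() {
      assert(broadcast_low_byte(0x000000C3u) == 0xC3C3C3C3u);
      return 0;
  }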


@@ -37,3 +37,42 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
Sk4px::Wide r = (*this + other) >> 8;
return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
}
// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
// These are safe on x86, often with no speed penalty.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
__m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
uint32_t as = *(const uint32_t*)a;
__m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
}
#else
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
__m128i as = _mm_srli_epi32(this->fVec, 24); // ___3 ___2 ___1 ___0
as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
return Sk16b(as);
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
__m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a); // ____ ____ ____ 3210
as = _mm_unpacklo_epi8 (as, _mm_setzero_si128()); // ____ ____ _3_2 _1_0
as = _mm_unpacklo_epi16(as, _mm_setzero_si128()); // ___3 ___2 ___1 ___0
as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
return Sk16b(as);
}
#endif
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
uint32_t as = *(const uint16_t*)a; // Aa -> Aa00
return Load4Alphas((const SkAlpha*)&as);
}
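
(Aside: the comment above relies on x86 tolerating the unaligned uint16_t/uint32_t
loads. A strictly portable way to express the same loads, if that were ever
needed, is std::memcpy, which compilers typically lower to the same single load.
This is a sketch, not part of the CL.)

  #include <cstdint>
  #include <cstring>

  // Portable equivalent of *(const uint32_t*)a: no alignment or strict-aliasing
  // assumptions, still one 32-bit load after optimization on x86.
  static uint32_t load_u32(const uint8_t* a) {
      uint32_t v;
      std::memcpy(&v, a, sizeof(v));
      return v;
  }

  // Same idea for the two-alpha load used by Load2Alphas.
  static uint16_t load_u16(const uint8_t* a) {
      uint16_t v;
      std::memcpy(&v, a, sizeof(v));
      return v;
  }

  int main() {
      const uint8_t bytes[] = {1, 2, 3, 4, 5};
      return (load_u32(bytes + 1) != 0 && load_u16(bytes + 1) != 0) ? 0 : 1;
  }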


@@ -55,3 +55,25 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
r.kth< 8>(), r.kth< 9>(), r.kth<10>(), r.kth<11>(),
r.kth<12>(), r.kth<13>(), r.kth<14>(), r.kth<15>());
}
inline Sk4px Sk4px::alphas() const {
static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
return Sk16b(this->kth< 3>(), this->kth< 3>(), this->kth< 3>(), this->kth< 3>(),
this->kth< 7>(), this->kth< 7>(), this->kth< 7>(), this->kth< 7>(),
this->kth<11>(), this->kth<11>(), this->kth<11>(), this->kth<11>(),
this->kth<15>(), this->kth<15>(), this->kth<15>(), this->kth<15>());
}
inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
return Sk16b(a[0], a[0], a[0], a[0],
a[1], a[1], a[1], a[1],
a[2], a[2], a[2], a[2],
a[3], a[3], a[3], a[3]);
}
inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
return Sk16b(a[0], a[0], a[0], a[0],
a[1], a[1], a[1], a[1],
0,0,0,0,
0,0,0,0);
}