Add AVX2 implementation for blit_row_s32a_opaque
The function is added in SkOpts_hsw but doesn't have a AVX2 implementation. The implementation boosts Vellamo Pixelblender test case for 20% performance Change-Id: I3bf77eb7629213df1f1bdfa1087ebaf40894d7c4 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/215400 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
2961ddb9ea
commit
d2f2c04e7e
@ -11,8 +11,33 @@
|
||||
#include "include/private/SkColorData.h"
|
||||
#include "include/private/SkVx.h"
|
||||
#include "src/core/SkMSAN.h"
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
|
||||
auto SkAlphaMulQ_AVX2 = [](const __m256i& c, const __m256i& scale) {
|
||||
const __m256i mask = _mm256_set1_epi32(0xFF00FF);
|
||||
__m256i s = _mm256_or_si256(_mm256_slli_epi32(scale, 16), scale);
|
||||
|
||||
// uint32_t rb = ((c & mask) * scale) >> 8
|
||||
__m256i rb = _mm256_and_si256(mask, c);
|
||||
rb = _mm256_mullo_epi16(rb, s);
|
||||
rb = _mm256_srli_epi16(rb, 8);
|
||||
|
||||
// uint32_t ag = ((c >> 8) & mask) * scale
|
||||
__m256i ag = _mm256_srli_epi16(c, 8);
|
||||
ag = _mm256_mullo_epi16(ag, s);
|
||||
|
||||
// (rb & mask) | (ag & ~mask)
|
||||
ag = _mm256_andnot_si256(mask, ag);
|
||||
return _mm256_or_si256(rb, ag);
|
||||
};
|
||||
return _mm256_add_epi32(src,
|
||||
SkAlphaMulQ_AVX2(dst, _mm256_sub_epi32(_mm256_set1_epi32(256),
|
||||
_mm256_srli_epi32(src, 24))));
|
||||
}
|
||||
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
|
||||
@ -116,8 +141,56 @@ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
|
||||
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
|
||||
SkASSERT(alpha == 0xFF);
|
||||
sk_msan_assert_initialized(src, src+len);
|
||||
// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
while (len >= 32) {
|
||||
// Load 32 source pixels.
|
||||
auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
|
||||
s1 = _mm256_loadu_si256((const __m256i*)(src) + 1),
|
||||
s2 = _mm256_loadu_si256((const __m256i*)(src) + 2),
|
||||
s3 = _mm256_loadu_si256((const __m256i*)(src) + 3);
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
const auto alphaMask = _mm256_set1_epi32(0xFF000000);
|
||||
|
||||
auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
|
||||
if (_mm256_testz_si256(ORed, alphaMask)) {
|
||||
// All 32 source pixels are transparent. Nothing to do.
|
||||
src += 32;
|
||||
dst += 32;
|
||||
len -= 32;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto d0 = (__m256i*)(dst) + 0,
|
||||
d1 = (__m256i*)(dst) + 1,
|
||||
d2 = (__m256i*)(dst) + 2,
|
||||
d3 = (__m256i*)(dst) + 3;
|
||||
|
||||
auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
|
||||
if (_mm256_testc_si256(ANDed, alphaMask)) {
|
||||
// All 32 source pixels are opaque. SrcOver becomes Src.
|
||||
_mm256_storeu_si256(d0, s0);
|
||||
_mm256_storeu_si256(d1, s1);
|
||||
_mm256_storeu_si256(d2, s2);
|
||||
_mm256_storeu_si256(d3, s3);
|
||||
src += 32;
|
||||
dst += 32;
|
||||
len -= 32;
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: This math is wrong.
|
||||
// Do SrcOver.
|
||||
_mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
|
||||
_mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
|
||||
_mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
|
||||
_mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
|
||||
src += 32;
|
||||
dst += 32;
|
||||
len -= 32;
|
||||
}
|
||||
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
while (len >= 16) {
|
||||
// Load 16 source pixels.
|
||||
auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
|
||||
|
Loading…
Reference in New Issue
Block a user