Add AVX2 implementation for blit_row_s32a_opaque

The function is hooked up in SkOpts_hsw but had no AVX2 implementation.
The new implementation improves performance on the Vellamo Pixelblender
test case by 20%.
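
Roughly how this plugs into Skia's runtime CPU dispatch (a sketch following
the usual SkOpts pattern; the exact declarations in SkOpts.cpp and
SkOpts_hsw.cpp may differ):

    // SkOpts keeps one function pointer per hot routine; Init_hsw()
    // repoints it at the AVX2 ("hsw") build of this header at startup,
    // once the CPU reports AVX2 support.
    namespace SkOpts {
        extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
        void Init_hsw() { blit_row_s32a_opaque = hsw::blit_row_s32a_opaque; }
    }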

Change-Id: I3bf77eb7629213df1f1bdfa1087ebaf40894d7c4
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/215400
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Author:    Zhenyu Shan, 2019-05-22 21:15:43 +08:00
Committer: Skia Commit-Bot
Parent:    2961ddb9ea
Commit:    d2f2c04e7e

@@ -11,8 +11,33 @@
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "src/core/SkMSAN.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>

    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
        auto SkAlphaMulQ_AVX2 = [](const __m256i& c, const __m256i& scale) {
            const __m256i mask = _mm256_set1_epi32(0xFF00FF);
            __m256i s = _mm256_or_si256(_mm256_slli_epi32(scale, 16), scale);
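            // Each 32-bit lane of s now holds the scale factor in both 16-bit
            // halves, so one _mm256_mullo_epi16 scales two channels per lane.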

            // uint32_t rb = ((c & mask) * scale) >> 8
            __m256i rb = _mm256_and_si256(mask, c);
            rb = _mm256_mullo_epi16(rb, s);
            rb = _mm256_srli_epi16(rb, 8);

            // uint32_t ag = ((c >> 8) & mask) * scale
            __m256i ag = _mm256_srli_epi16(c, 8);
            ag = _mm256_mullo_epi16(ag, s);
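            // No >>8 for ag: the high byte of each 16-bit product already sits
            // in the a/g position, and the andnot below drops the low byte.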

            // (rb & mask) | (ag & ~mask)
            ag = _mm256_andnot_si256(mask, ag);
            return _mm256_or_si256(rb, ag);
        };
        return _mm256_add_epi32(src,
                                SkAlphaMulQ_AVX2(dst, _mm256_sub_epi32(_mm256_set1_epi32(256),
                                                                       _mm256_srli_epi32(src, 24))));
    }
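
    // For reference, a scalar sketch of the same per-pixel math (illustration
    // only, not part of this CL): dst is scaled by (256 - srcAlpha) in 8.8
    // fixed point, then added to src.
    //
    //     uint32_t SrcOver(uint32_t src, uint32_t dst) {
    //         uint32_t scale = 256 - (src >> 24);
    //         uint32_t rb = (((dst & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
    //         uint32_t ag = (((dst >>  8) & 0x00FF00FF) * scale) & 0xFF00FF00;
    //         return src + (rb | ag);
    //     }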
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
    static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
@@ -116,8 +141,56 @@ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);
// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    while (len >= 32) {
        // Load 32 source pixels.
        auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
             s1 = _mm256_loadu_si256((const __m256i*)(src) + 1),
             s2 = _mm256_loadu_si256((const __m256i*)(src) + 2),
             s3 = _mm256_loadu_si256((const __m256i*)(src) + 3);

        const auto alphaMask = _mm256_set1_epi32(0xFF000000);

        auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
        if (_mm256_testz_si256(ORed, alphaMask)) {
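            // _mm256_testz_si256(a, b) returns 1 iff (a & b) is all zeroes,
            // so this branch is taken only when every source alpha is 0.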
            // All 32 source pixels are transparent. Nothing to do.
            src += 32;
            dst += 32;
            len -= 32;
            continue;
        }

        auto d0 = (__m256i*)(dst) + 0,
             d1 = (__m256i*)(dst) + 1,
             d2 = (__m256i*)(dst) + 2,
             d3 = (__m256i*)(dst) + 3;

        auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
        if (_mm256_testc_si256(ANDed, alphaMask)) {
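            // _mm256_testc_si256(a, b) returns 1 iff (~a & b) is all zeroes,
            // i.e. a has every bit of b set; here, all 32 alphas are 0xFF.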
            // All 32 source pixels are opaque. SrcOver becomes Src.
            _mm256_storeu_si256(d0, s0);
            _mm256_storeu_si256(d1, s1);
            _mm256_storeu_si256(d2, s2);
            _mm256_storeu_si256(d3, s3);
            src += 32;
            dst += 32;
            len -= 32;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
        _mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
        _mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
        _mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
        src += 32;
        dst += 32;
        len -= 32;
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),