Add AVX512 implementation for blit_row_s32a_opaque

blit_row_s32a_opaque is ~20-30% faster on an Ice Lake CPU.
nanobench results:
                  before     after
SkVM_4096_Opts    0.141ns    0.108ns
SkVM_1024_Opts    0.161ns    0.110ns
SkVM_256_Opts     0.155ns    0.109ns

Change-Id: If46b3fbeb4a7b68b152aca2c0bc3e1417578d4b2
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284528
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Author: Lingyun Cai, 2020-04-26 14:44:32 +08:00 (committed by Skia Commit-Bot)
parent 81ef385c1f
commit 56f23a1d17
3 changed files with 88 additions and 2 deletions


@@ -84,6 +84,9 @@ inline bool SkCpu::Supports(uint32_t mask) {
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
        features |= AVX2;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
        features |= (AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL);
    #endif
        // FMA doesn't fit neatly into this total ordering.
        // It's available on Haswell+ just like AVX2, but it's technically a different bit.
        // TODO: circle back on this if we find ourselves limited by lack of compile-time FMA
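
These compile-time bits have a runtime counterpart: Skia only routes SkOpts through Init_skx() when SkCpu's CPUID probing reports the same five features on the running machine. A minimal standalone sketch of an equivalent runtime check, using the GCC/Clang builtin rather than Skia's own CPUID code in SkCpu.cpp:

    #include <cstdio>

    int main() {
        // The same five CPUID leaf-7 features that SkCpu groups together as "SKX".
        bool skx = __builtin_cpu_supports("avx512f")  &&
                   __builtin_cpu_supports("avx512dq") &&
                   __builtin_cpu_supports("avx512cd") &&
                   __builtin_cpu_supports("avx512bw") &&
                   __builtin_cpu_supports("avx512vl");
        std::printf("SKX feature set: %s\n", skx ? "available" : "missing");
        return 0;
    }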


@@ -44,11 +44,14 @@
    #endif

    #define SK_OPTS_NS skx
    #include "src/opts/SkBlitRow_opts.h"
    #include "src/opts/SkRasterPipeline_opts.h"
    #include "src/opts/SkVM_opts.h"

namespace SkOpts {
    void Init_skx() {
        blit_row_s32a_opaque = SK_OPTS_NS::blit_row_s32a_opaque;

    #define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
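
For context, the SkOpts pattern that Init_skx() participates in: each hot routine is a global function pointer that starts out aimed at portable code, and the per-ISA Init functions repoint it once CPU support is confirmed. A simplified sketch (names are illustrative, not Skia's actual declarations):

    #include <cstdint>

    namespace portable { inline void blit(uint32_t*, const uint32_t*, int) { /* baseline C++ */ } }
    namespace skx      { inline void blit(uint32_t*, const uint32_t*, int) { /* AVX-512 body */ } }

    // Defaults to the portable implementation at static-init time...
    void (*blit_ptr)(uint32_t*, const uint32_t*, int) = portable::blit;

    // ...and is repointed at startup if the running CPU qualifies.
    void Init_skx_sketch() { blit_ptr = skx::blit; }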


@@ -11,7 +11,37 @@
#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include "src/core/SkMSAN.h"
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
    #include <immintrin.h>

    static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) {
        // Detailed explanations in SkPMSrcOver_AVX2
        // b = s + (d*(256-srcA)) >> 8

        // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel.
        const uint8_t _ = -1;   // fills a literal 0 byte.
        const uint8_t mask[64] = {  3,_, 3,_,  7,_, 7,_, 11,_,11,_, 15,_,15,_,
                                   19,_,19,_, 23,_,23,_, 27,_,27,_, 31,_,31,_,
                                   35,_,35,_, 39,_,39,_, 43,_,43,_, 47,_,47,_,
                                   51,_,51,_, 55,_,55,_, 59,_,59,_, 63,_,63,_ };
        __m512i srcA_x2 = _mm512_shuffle_epi8(src, _mm512_loadu_si512(mask));
        __m512i scale_x2 = _mm512_sub_epi16(_mm512_set1_epi16(256),
                                            srcA_x2);

        // Scale red and blue, leaving results in the low byte of each 16-bit lane.
        __m512i rb = _mm512_and_si512(_mm512_set1_epi32(0x00ff00ff), dst);
        rb = _mm512_mullo_epi16(rb, scale_x2);
        rb = _mm512_srli_epi16(rb, 8);

        // Scale green and alpha, leaving results in the high byte, masking off the low bits.
        __m512i ga = _mm512_srli_epi16(dst, 8);
        ga = _mm512_mullo_epi16(ga, scale_x2);
        ga = _mm512_andnot_si512(_mm512_set1_epi32(0x00ff00ff), ga);

        return _mm512_add_epi32(src, _mm512_or_si512(rb, ga));
    }
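
The vector math is easier to audit against a scalar version of the same per-pixel blend. A reference sketch mirroring the lanes above (srcover_scalar is an illustrative helper, not part of this change; it inherits the same 256-vs-255 approximation flagged by the TODO in the loop below):

    #include <cstdint>

    // dst' = src + (dst * (256 - srcA)) >> 8, per channel, on premultiplied 8888 pixels.
    static uint32_t srcover_scalar(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);                  // 256 - srcA
        uint32_t rb = ((dst & 0x00ff00ff) * scale) >> 8;     // red/blue scaled into low bytes
        uint32_t ga = ((dst >> 8) & 0x00ff00ff) * scale;     // green/alpha scaled into high bytes
        return src + ((rb & 0x00ff00ff) | (ga & 0xff00ff00));
    }

Sanity checks on this scalar form: an opaque src (srcA = 255 gives scale = 1, so every scaled dst channel rounds to 0) returns src unchanged, and a fully transparent src (scale = 256) returns dst exactly, which is what the two fast paths in the loop below exploit.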
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>

    static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
@@ -163,8 +193,58 @@ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
    while (len >= 64) {
        // Load 64 source pixels.
        auto s0 = _mm512_loadu_si512((const __m512i*)(src) + 0),
             s1 = _mm512_loadu_si512((const __m512i*)(src) + 1),
             s2 = _mm512_loadu_si512((const __m512i*)(src) + 2),
             s3 = _mm512_loadu_si512((const __m512i*)(src) + 3);

        const auto alphaMask = _mm512_set1_epi32(0xFF000000);

        auto ORed = _mm512_or_si512(s3, _mm512_or_si512(s2, _mm512_or_si512(s1, s0)));
        if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ORed, alphaMask),
                                                        _mm512_setzero_si512()))) {
            // All 64 source pixels are transparent. Nothing to do.
            src += 64;
            dst += 64;
            len -= 64;
            continue;
        }

        auto d0 = (__m512i*)(dst) + 0,
             d1 = (__m512i*)(dst) + 1,
             d2 = (__m512i*)(dst) + 2,
             d3 = (__m512i*)(dst) + 3;

        auto ANDed = _mm512_and_si512(s3, _mm512_and_si512(s2, _mm512_and_si512(s1, s0)));
        if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ANDed, alphaMask),
                                                        alphaMask))) {
            // All 64 source pixels are opaque. SrcOver becomes Src.
            _mm512_storeu_si512(d0, s0);
            _mm512_storeu_si512(d1, s1);
            _mm512_storeu_si512(d2, s2);
            _mm512_storeu_si512(d3, s3);
            src += 64;
            dst += 64;
            len -= 64;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm512_storeu_si512(d0, SkPMSrcOver_SKX(s0, _mm512_loadu_si512(d0)));
        _mm512_storeu_si512(d1, SkPMSrcOver_SKX(s1, _mm512_loadu_si512(d1)));
        _mm512_storeu_si512(d2, SkPMSrcOver_SKX(s2, _mm512_loadu_si512(d2)));
        _mm512_storeu_si512(d3, SkPMSrcOver_SKX(s3, _mm512_loadu_si512(d3)));
        src += 64;
        dst += 64;
        len -= 64;
    }
    // Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    while (len >= 32) {
        // Load 32 source pixels.
        auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),