Add AVX512 implementation for blit_row_s32a_opaque
blit_row_s32a_opaque time is improved by ~20-30% on an Ice Lake CPU. nanobench results (before → after): SkVM_4096_Opts 0.141ns → 0.108ns; SkVM_1024_Opts 0.161ns → 0.110ns; SkVM_256_Opts 0.155ns → 0.109ns. Change-Id: If46b3fbeb4a7b68b152aca2c0bc3e1417578d4b2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284528 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
81ef385c1f
commit
56f23a1d17
@ -84,6 +84,9 @@ inline bool SkCpu::Supports(uint32_t mask) {
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
features |= AVX2;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
|
||||
features |= (AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL);
|
||||
#endif
|
||||
// FMA doesn't fit neatly into this total ordering.
|
||||
// It's available on Haswell+ just like AVX2, but it's technically a different bit.
|
||||
// TODO: circle back on this if we find ourselves limited by lack of compile-time FMA
|
||||
|
@ -44,11 +44,14 @@
|
||||
#endif
|
||||
|
||||
#define SK_OPTS_NS skx
|
||||
#include "src/opts/SkBlitRow_opts.h"
|
||||
#include "src/opts/SkRasterPipeline_opts.h"
|
||||
#include "src/opts/SkVM_opts.h"
|
||||
|
||||
namespace SkOpts {
|
||||
void Init_skx() {
|
||||
blit_row_s32a_opaque = SK_OPTS_NS::blit_row_s32a_opaque;
|
||||
|
||||
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
|
||||
SK_RASTER_PIPELINE_STAGES(M)
|
||||
just_return_highp = (StageFn)SK_OPTS_NS::just_return;
|
||||
|
@ -11,7 +11,37 @@
|
||||
#include "include/private/SkColorData.h"
|
||||
#include "include/private/SkVx.h"
|
||||
#include "src/core/SkMSAN.h"
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
|
||||
#include <immintrin.h>
|
||||
|
||||
// Premultiplied src-over for 16 pixels at a time:
//   result = src + ((dst * (256 - srcA)) >> 8), applied per channel.
// See SkPMSrcOver_AVX2 for the detailed derivation of this trick.
static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) {
    // Broadcast each pixel's alpha byte into the low byte of both 16-bit
    // halves of that pixel.  vpshufb shuffles within 128-bit lanes and reads
    // only the low 4 bits of each control byte, so indices 19/35/51… act like
    // 3 inside their own lane; a control byte with bit 7 set (0xFF) writes 0.
    constexpr uint8_t Z = 0xFF;  // produces a literal 0 byte
    const uint8_t kAlphaShuffle[64] = {
         3,Z, 3,Z,  7,Z, 7,Z, 11,Z,11,Z, 15,Z,15,Z,
        19,Z,19,Z, 23,Z,23,Z, 27,Z,27,Z, 31,Z,31,Z,
        35,Z,35,Z, 39,Z,39,Z, 43,Z,43,Z, 47,Z,47,Z,
        51,Z,51,Z, 55,Z,55,Z, 59,Z,59,Z, 63,Z,63,Z,
    };
    const __m512i alpha2x = _mm512_shuffle_epi8(src, _mm512_loadu_si512(kAlphaShuffle));
    const __m512i scale2x = _mm512_sub_epi16(_mm512_set1_epi16(256), alpha2x);

    // Red and blue already sit in the low byte of each 16-bit lane: scale,
    // then shift the 16-bit product back down into the low byte.
    __m512i rb = _mm512_and_si512(_mm512_set1_epi32(0x00ff00ff), dst);
    rb = _mm512_mullo_epi16(rb, scale2x);
    rb = _mm512_srli_epi16(rb, 8);

    // Green and alpha: shift them into the low byte, scale, and keep the
    // product's high byte by masking off the low bits.
    __m512i ga = _mm512_srli_epi16(dst, 8);
    ga = _mm512_mullo_epi16(ga, scale2x);
    ga = _mm512_andnot_si512(_mm512_set1_epi32(0x00ff00ff), ga);

    return _mm512_add_epi32(src, _mm512_or_si512(rb, ga));
}
|
||||
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
|
||||
@ -163,8 +193,58 @@ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
|
||||
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
|
||||
SkASSERT(alpha == 0xFF);
|
||||
sk_msan_assert_initialized(src, src+len);
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
|
||||
while (len >= 64) {
|
||||
// Load 64 source pixels.
|
||||
auto s0 = _mm512_loadu_si512((const __m512i*)(src) + 0),
|
||||
s1 = _mm512_loadu_si512((const __m512i*)(src) + 1),
|
||||
s2 = _mm512_loadu_si512((const __m512i*)(src) + 2),
|
||||
s3 = _mm512_loadu_si512((const __m512i*)(src) + 3);
|
||||
|
||||
const auto alphaMask = _mm512_set1_epi32(0xFF000000);
|
||||
|
||||
auto ORed = _mm512_or_si512(s3, _mm512_or_si512(s2, _mm512_or_si512(s1, s0)));
|
||||
if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ORed, alphaMask),
|
||||
_mm512_setzero_si512()))) {
|
||||
// All 64 source pixels are transparent. Nothing to do.
|
||||
src += 64;
|
||||
dst += 64;
|
||||
len -= 64;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto d0 = (__m512i*)(dst) + 0,
|
||||
d1 = (__m512i*)(dst) + 1,
|
||||
d2 = (__m512i*)(dst) + 2,
|
||||
d3 = (__m512i*)(dst) + 3;
|
||||
|
||||
auto ANDed = _mm512_and_si512(s3, _mm512_and_si512(s2, _mm512_and_si512(s1, s0)));
|
||||
if (0 == _cvtmask64_u64(_mm512_cmpneq_epi8_mask(_mm512_and_si512(ANDed, alphaMask),
|
||||
alphaMask))) {
|
||||
// All 64 source pixels are opaque. SrcOver becomes Src.
|
||||
_mm512_storeu_si512(d0, s0);
|
||||
_mm512_storeu_si512(d1, s1);
|
||||
_mm512_storeu_si512(d2, s2);
|
||||
_mm512_storeu_si512(d3, s3);
|
||||
src += 64;
|
||||
dst += 64;
|
||||
len -= 64;
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: This math is wrong.
|
||||
// Do SrcOver.
|
||||
_mm512_storeu_si512(d0, SkPMSrcOver_SKX(s0, _mm512_loadu_si512(d0)));
|
||||
_mm512_storeu_si512(d1, SkPMSrcOver_SKX(s1, _mm512_loadu_si512(d1)));
|
||||
_mm512_storeu_si512(d2, SkPMSrcOver_SKX(s2, _mm512_loadu_si512(d2)));
|
||||
_mm512_storeu_si512(d3, SkPMSrcOver_SKX(s3, _mm512_loadu_si512(d3)));
|
||||
src += 64;
|
||||
dst += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
while (len >= 32) {
|
||||
// Load 32 source pixels.
|
||||
auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
|
||||
|
Loading…
Reference in New Issue
Block a user