Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdec
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
parent
8f6d4d369c
commit
7ac2be2020
@ -9,13 +9,13 @@
|
||||
#include "include/private/SkTemplates.h"
|
||||
#include "src/core/SkUtils.h"
|
||||
|
||||
template <typename T, bool kInline>
|
||||
template <typename T>
|
||||
class MemsetBench : public Benchmark {
|
||||
public:
|
||||
explicit MemsetBench(int n)
|
||||
: fN(n)
|
||||
, fBuffer(n)
|
||||
, fName(SkStringPrintf("memset%zu_%d%s", sizeof(T)*8, n, kInline ? "_inline" : "")) {}
|
||||
explicit MemsetBench(size_t bytes)
|
||||
: fN(bytes / sizeof(T))
|
||||
, fBuffer(fN)
|
||||
, fName(SkStringPrintf("memset%zu_%zu", sizeof(T)*8, bytes)) {}
|
||||
|
||||
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
||||
const char* onGetName() override { return fName.c_str(); }
|
||||
@ -28,57 +28,50 @@ private:
|
||||
SkString fName;
|
||||
};
|
||||
|
||||
template <> void MemsetBench<uint32_t, false>::onDraw(int loops, SkCanvas*) {
|
||||
template <> void MemsetBench<uint64_t>::onDraw(int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
sk_memset64(fBuffer.get(), 0xFACEFACEFACEFACE, fN);
|
||||
}
|
||||
}
|
||||
|
||||
template <> void MemsetBench<uint32_t>::onDraw(int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
sk_memset32(fBuffer.get(), 0xFACEB004, fN);
|
||||
}
|
||||
}
|
||||
|
||||
template <> void MemsetBench<uint16_t, false>::onDraw(int loops, SkCanvas*) {
|
||||
template <> void MemsetBench<uint16_t>::onDraw(int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
sk_memset16(fBuffer.get(), 0x4973, fN);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void memsetT(T* dst, T val, int n) {
|
||||
for (int i = 0; i < n; i++) { dst[i] = val; }
|
||||
}
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(16)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(64)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(256)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(512)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(768)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(1024)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(2048)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(4096)));
|
||||
DEF_BENCH(return (new MemsetBench<uint64_t>(65536)));
|
||||
|
||||
template <> void MemsetBench<uint32_t, true>::onDraw(int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
memsetT<uint32_t>(fBuffer.get(), 0xFACEB004, fN);
|
||||
}
|
||||
}
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(16)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(64)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(256)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(512)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(768)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(1024)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(2048)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(4096)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t>(65536)));
|
||||
|
||||
template <> void MemsetBench<uint16_t, true>::onDraw(int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
memsetT<uint16_t>(fBuffer.get(), 0x4973, fN);
|
||||
}
|
||||
}
|
||||
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(100000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(100000)));
|
||||
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(100000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(100000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(16)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(64)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(256)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(512)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(768)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(1024)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(2048)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(4096)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t>(65536)));
|
||||
|
@ -279,6 +279,7 @@ skia_core_sources = [
|
||||
"$_src/core/SkOSFile.h",
|
||||
"$_src/core/SkOpts.cpp",
|
||||
"$_src/core/SkOpts.h",
|
||||
"$_src/core/SkOpts_erms.cpp",
|
||||
"$_src/core/SkOrderedReadBuffer.h",
|
||||
"$_src/core/SkOverdrawCanvas.cpp",
|
||||
"$_src/core/SkPaint.cpp",
|
||||
|
@ -57,6 +57,7 @@
|
||||
if (abcd[1] & (1<<5)) { features |= SkCpu::AVX2; }
|
||||
if (abcd[1] & (1<<3)) { features |= SkCpu::BMI1; }
|
||||
if (abcd[1] & (1<<8)) { features |= SkCpu::BMI2; }
|
||||
if (abcd[1] & (1<<9)) { features |= SkCpu::ERMS; }
|
||||
|
||||
if ((xgetbv(0) & (7<<5)) == (7<<5)) { // All ZMM state bits enabled too.
|
||||
if (abcd[1] & (1<<16)) { features |= SkCpu::AVX512F; }
|
||||
|
@ -38,6 +38,8 @@ struct SkCpu {
|
||||
|
||||
// Handy alias for all the cool Skylake Xeon+ instructions.
|
||||
SKX = AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL,
|
||||
|
||||
ERMS = 1 << 20,
|
||||
};
|
||||
enum {
|
||||
NEON = 1 << 0,
|
||||
|
@ -111,6 +111,7 @@ namespace SkOpts {
|
||||
void Init_avx();
|
||||
void Init_hsw();
|
||||
void Init_skx();
|
||||
void Init_erms();
|
||||
void Init_crc32();
|
||||
|
||||
static void init() {
|
||||
@ -133,6 +134,8 @@ namespace SkOpts {
|
||||
if (SkCpu::Supports(SkCpu::SKX)) { Init_skx(); }
|
||||
#endif
|
||||
|
||||
if (SkCpu::Supports(SkCpu::ERMS)) { Init_erms(); }
|
||||
|
||||
#elif defined(SK_CPU_ARM64)
|
||||
if (SkCpu::Supports(SkCpu::CRC32)) { Init_crc32(); }
|
||||
|
||||
|
122
src/core/SkOpts_erms.cpp
Normal file
122
src/core/SkOpts_erms.cpp
Normal file
@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Copyright 2020 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "src/core/SkMSAN.h"
|
||||
#include "src/core/SkOpts.h"
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) // memset16 and memset32 could work on 32-bit x86 too.
|
||||
|
||||
// Reason string passed to sk_msan_mark_initialized() by the repsto() overloads in this file.
static const char* note = "MSAN can't see that rep sto initializes memory.";
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
// MSVC flavor: stores v into the n uint16_t elements starting at dst via the __stosw intrinsic.
static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
__stosw(dst, v, n);
|
||||
}
|
||||
// MSVC flavor: stores v into the n uint32_t elements starting at dst via the __stosd intrinsic.
static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
// __stosd is called with an unsigned long*; make sure the cast below does not change element size.
static_assert(sizeof(uint32_t) == sizeof(unsigned long));
|
||||
__stosd(reinterpret_cast<unsigned long*>(dst), v, n);
|
||||
}
|
||||
// MSVC flavor: stores v into the n uint64_t elements starting at dst via the __stosq intrinsic.
static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
__stosq(dst, v, n);
|
||||
}
|
||||
#else
|
||||
// Inline-asm flavor: stores v into the n uint16_t elements starting at dst with `rep stosw`.
static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
// Constraints: "+D" = dst in the DI register (read/write), "+c" = n in the CX register
// (read/write), "a" = v in the AX register (input); "memory" tells the compiler memory is written.
asm volatile("rep stosw" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
|
||||
}
|
||||
// Inline-asm flavor: stores v into the n uint32_t elements starting at dst with `rep stosl`.
static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
// Constraints: "+D" = dst in the DI register (read/write), "+c" = n in the CX register
// (read/write), "a" = v in the AX register (input); "memory" tells the compiler memory is written.
asm volatile("rep stosl" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
|
||||
}
|
||||
// Inline-asm flavor: stores v into the n uint64_t elements starting at dst with `rep stosq`.
static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
|
||||
// Tell MSAN that [dst, dst+n) is initialized; see `note` for why this is needed.
sk_msan_mark_initialized(dst,dst+n,note);
|
||||
// Constraints: "+D" = dst in the DI register (read/write), "+c" = n in the CX register
// (read/write), "a" = v in the AX register (input); "memory" tells the compiler memory is written.
asm volatile("rep stosq" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
|
||||
}
|
||||
#endif
|
||||
|
||||
// ERMS is ideal for large copies but has a relatively high setup cost,
|
||||
// so we use the previous best routine for small inputs. FSRM would make this moot.
|
||||
// Linear fills: Init_erms() stores the SkOpts::memsetNN pointer it is about to replace here,
// and the erms memsetNN forwards small inputs to it.
static void (*g_memset16_prev)(uint16_t*, uint16_t, int);
|
||||
static void (*g_memset32_prev)(uint32_t*, uint32_t, int);
|
||||
static void (*g_memset64_prev)(uint64_t*, uint64_t, int);
|
||||
// Rectangular fills: same arrangement for SkOpts::rect_memsetNN.
static void (*g_rect_memset16_prev)(uint16_t*, uint16_t, int, size_t, int);
|
||||
static void (*g_rect_memset32_prev)(uint32_t*, uint32_t, int, size_t, int);
|
||||
static void (*g_rect_memset64_prev)(uint64_t*, uint64_t, int, size_t, int);
|
||||
|
||||
// Empirically determined with `nanobench -m memset`.
|
||||
// True when a fill of `bytes` bytes is below the cutoff at which rep sto is used,
// i.e. it should be handed to the previously installed routine instead.
static bool small(size_t bytes) {
    const size_t kCutoffBytes = 1024;
    return !(bytes >= kCutoffBytes);
}
|
||||
|
||||
#define SK_OPTS_NS erms
|
||||
namespace SK_OPTS_NS {
|
||||
static inline void memset16(uint16_t* dst, uint16_t v, int n) {
|
||||
return small(sizeof(v)*n) ? g_memset16_prev(dst, v, n)
|
||||
: repsto(dst, v, n);
|
||||
}
|
||||
static inline void memset32(uint32_t* dst, uint32_t v, int n) {
|
||||
return small(sizeof(v)*n) ? g_memset32_prev(dst, v, n)
|
||||
: repsto(dst, v, n);
|
||||
}
|
||||
static inline void memset64(uint64_t* dst, uint64_t v, int n) {
|
||||
return small(sizeof(v)*n) ? g_memset64_prev(dst, v, n)
|
||||
: repsto(dst, v, n);
|
||||
}
|
||||
|
||||
static inline void rect_memset16(uint16_t* dst, uint16_t v, int n,
|
||||
size_t rowBytes, int height) {
|
||||
if (small(sizeof(v)*n)) {
|
||||
return g_rect_memset16_prev(dst,v,n, rowBytes,height);
|
||||
}
|
||||
for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
|
||||
repsto(dst, v, n);
|
||||
}
|
||||
}
|
||||
static inline void rect_memset32(uint32_t* dst, uint32_t v, int n,
|
||||
size_t rowBytes, int height) {
|
||||
if (small(sizeof(v)*n)) {
|
||||
return g_rect_memset32_prev(dst,v,n, rowBytes,height);
|
||||
}
|
||||
for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
|
||||
repsto(dst, v, n);
|
||||
}
|
||||
}
|
||||
static inline void rect_memset64(uint64_t* dst, uint64_t v, int n,
|
||||
size_t rowBytes, int height) {
|
||||
if (small(sizeof(v)*n)) {
|
||||
return g_rect_memset64_prev(dst,v,n, rowBytes,height);
|
||||
}
|
||||
for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
|
||||
repsto(dst, v, n);
|
||||
}
|
||||
}
|
||||
} // namespace SK_OPTS_NS
|
||||
|
||||
namespace SkOpts {
|
||||
void Init_erms() {
|
||||
g_memset16_prev = memset16;
|
||||
g_memset32_prev = memset32;
|
||||
g_memset64_prev = memset64;
|
||||
g_rect_memset16_prev = rect_memset16;
|
||||
g_rect_memset32_prev = rect_memset32;
|
||||
g_rect_memset64_prev = rect_memset64;
|
||||
|
||||
memset16 = SK_OPTS_NS::memset16;
|
||||
memset32 = SK_OPTS_NS::memset32;
|
||||
memset64 = SK_OPTS_NS::memset64;
|
||||
rect_memset16 = SK_OPTS_NS::rect_memset16;
|
||||
rect_memset32 = SK_OPTS_NS::rect_memset32;
|
||||
rect_memset64 = SK_OPTS_NS::rect_memset64;
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Outside x86-64 there are no rep-sto routines to install; Init_erms() is still defined, as a no-op.
namespace SkOpts { void Init_erms() { /* nothing to install */ } }
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user