diff --git a/bench/MemsetBench.cpp b/bench/MemsetBench.cpp
index 821252b12e..c56b50b28c 100644
--- a/bench/MemsetBench.cpp
+++ b/bench/MemsetBench.cpp
@@ -9,13 +9,13 @@
 #include "include/private/SkTemplates.h"
 #include "src/core/SkUtils.h"
 
-template <typename T, bool kInline>
+template <typename T>
 class MemsetBench : public Benchmark {
 public:
-    explicit MemsetBench(int n)
-        : fN(n)
-        , fBuffer(n)
-        , fName(SkStringPrintf("memset%zu_%d%s", sizeof(T)*8, n, kInline ? "_inline" : "")) {}
+    explicit MemsetBench(size_t bytes)
+        : fN(bytes / sizeof(T))
+        , fBuffer(fN)
+        , fName(SkStringPrintf("memset%zu_%zu", sizeof(T)*8, bytes)) {}
 
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
     const char* onGetName() override { return fName.c_str(); }
@@ -28,57 +28,50 @@ private:
     SkString fName;
 };
 
-template <> void MemsetBench<uint32_t, false>::onDraw(int loops, SkCanvas*) {
+template <> void MemsetBench<uint64_t>::onDraw(int loops, SkCanvas*) {
+    for (int i = 0; i < 1000*loops; i++) {
+        sk_memset64(fBuffer.get(), 0xFACEFACEFACEFACE, fN);
+    }
+}
+
+template <> void MemsetBench<uint32_t>::onDraw(int loops, SkCanvas*) {
     for (int i = 0; i < 1000*loops; i++) {
         sk_memset32(fBuffer.get(), 0xFACEB004, fN);
     }
 }
 
-template <> void MemsetBench<uint16_t, false>::onDraw(int loops, SkCanvas*) {
+template <> void MemsetBench<uint16_t>::onDraw(int loops, SkCanvas*) {
     for (int i = 0; i < 1000*loops; i++) {
         sk_memset16(fBuffer.get(), 0x4973, fN);
     }
 }
 
-template <typename T>
-static void memsetT(T* dst, T val, int n) {
-    for (int i = 0; i < n; i++) { dst[i] = val; }
-}
+DEF_BENCH(return (new MemsetBench<uint64_t>(16)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(64)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(256)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(512)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(768)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(1024)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(2048)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(4096)));
+DEF_BENCH(return (new MemsetBench<uint64_t>(65536)));
 
-template <> void MemsetBench<uint32_t, true>::onDraw(int loops, SkCanvas*) {
-    for (int i = 0; i < 1000*loops; i++) {
-        memsetT(fBuffer.get(), 0xFACEB004, fN);
-    }
-}
+DEF_BENCH(return (new MemsetBench<uint32_t>(16)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(64)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(256)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(512)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(768)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(1024)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(2048)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(4096)));
+DEF_BENCH(return (new MemsetBench<uint32_t>(65536)));
 
-template <> void MemsetBench<uint16_t, true>::onDraw(int loops, SkCanvas*) {
-    for (int i = 0; i < 1000*loops; i++) {
-        memsetT(fBuffer.get(), 0x4973, fN);
-    }
-}
-
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(1)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(1)));
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(10)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(10)));
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(100)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(100)));
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(1000)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(1000)));
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(10000)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(10000)));
-DEF_BENCH(return (new MemsetBench<uint32_t, false>(100000)));
-DEF_BENCH(return (new MemsetBench<uint32_t, true>(100000)));
-
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(1)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(1)));
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(10)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(10)));
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(100)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(100)));
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(1000)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(1000)));
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(10000)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(10000)));
-DEF_BENCH(return (new MemsetBench<uint16_t, false>(100000)));
-DEF_BENCH(return (new MemsetBench<uint16_t, true>(100000)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(16)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(64)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(256)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(512)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(768)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(1024)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(2048)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(4096)));
+DEF_BENCH(return (new MemsetBench<uint16_t>(65536)));
diff --git a/gn/core.gni b/gn/core.gni
index 32615cb282..086e2727c8 100644
--- a/gn/core.gni
+++ b/gn/core.gni
@@ -279,6 +279,7 @@ skia_core_sources = [
   "$_src/core/SkOSFile.h",
   "$_src/core/SkOpts.cpp",
   "$_src/core/SkOpts.h",
+  "$_src/core/SkOpts_erms.cpp",
   "$_src/core/SkOrderedReadBuffer.h",
   "$_src/core/SkOverdrawCanvas.cpp",
   "$_src/core/SkPaint.cpp",
diff --git a/src/core/SkCpu.cpp b/src/core/SkCpu.cpp
index 1326a0fba3..a88f49802f 100644
--- a/src/core/SkCpu.cpp
+++ b/src/core/SkCpu.cpp
@@ -57,6 +57,7 @@
         if (abcd[1] & (1<<5)) { features |= SkCpu::AVX2; }
         if (abcd[1] & (1<<3)) { features |= SkCpu::BMI1; }
         if (abcd[1] & (1<<8)) { features |= SkCpu::BMI2; }
+        if (abcd[1] & (1<<9)) { features |= SkCpu::ERMS; }
 
         if ((xgetbv(0) & (7<<5)) == (7<<5)) {  // All ZMM state bits enabled too.
             if (abcd[1] & (1<<16)) { features |= SkCpu::AVX512F; }
diff --git a/src/core/SkCpu.h b/src/core/SkCpu.h
index 2ed1effd95..2450d93dcb 100644
--- a/src/core/SkCpu.h
+++ b/src/core/SkCpu.h
@@ -38,6 +38,8 @@ struct SkCpu {
 
         // Handy alias for all the cool Skylake Xeon+ instructions.
         SKX = AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL,
+
+        ERMS = 1 << 20,
     };
     enum {
         NEON = 1 << 0,
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index b847c3022a..b88a6064a7 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -111,6 +111,7 @@ namespace SkOpts {
     void Init_avx();
    void Init_hsw();
    void Init_skx();
+    void Init_erms();
    void Init_crc32();
 
    static void init() {
@@ -133,6 +134,8 @@
        if (SkCpu::Supports(SkCpu::SKX)) { Init_skx(); }
    #endif
 
+        if (SkCpu::Supports(SkCpu::ERMS)) { Init_erms(); }
+
 #elif defined(SK_CPU_ARM64)
        if (SkCpu::Supports(SkCpu::CRC32)) { Init_crc32(); }
 
diff --git a/src/core/SkOpts_erms.cpp b/src/core/SkOpts_erms.cpp
new file mode 100644
index 0000000000..b0761f1d33
--- /dev/null
+++ b/src/core/SkOpts_erms.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/core/SkMSAN.h"
+#include "src/core/SkOpts.h"
+
+#if defined(__x86_64__) || defined(_M_X64) // memset16 and memset32 could work on 32-bit x86 too.
+
+    static const char* note = "MSAN can't see that rep sto initializes memory.";
+
+    #if defined(_MSC_VER)
+        #include <intrin.h>
+        static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            __stosw(dst, v, n);
+        }
+        static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            static_assert(sizeof(uint32_t) == sizeof(unsigned long));
+            __stosd(reinterpret_cast<unsigned long*>(dst), v, n);
+        }
+        static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            __stosq(dst, v, n);
+        }
+    #else
+        static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            asm volatile("rep stosw" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
+        }
+        static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            asm volatile("rep stosl" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
+        }
+        static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
+            sk_msan_mark_initialized(dst,dst+n,note);
+            asm volatile("rep stosq" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
+        }
+    #endif
+
+    // ERMS is ideal for large copies but has a relatively high setup cost,
+    // so we use the previous best routine for small inputs. FSRM would make this moot.
+    static void (*g_memset16_prev)(uint16_t*, uint16_t, int);
+    static void (*g_memset32_prev)(uint32_t*, uint32_t, int);
+    static void (*g_memset64_prev)(uint64_t*, uint64_t, int);
+    static void (*g_rect_memset16_prev)(uint16_t*, uint16_t, int, size_t, int);
+    static void (*g_rect_memset32_prev)(uint32_t*, uint32_t, int, size_t, int);
+    static void (*g_rect_memset64_prev)(uint64_t*, uint64_t, int, size_t, int);
+
+    // Empirically determined with `nanobench -m memset`.
+    static bool small(size_t bytes) { return bytes < 1024; }
+
+    #define SK_OPTS_NS erms
+    namespace SK_OPTS_NS {
+        static inline void memset16(uint16_t* dst, uint16_t v, int n) {
+            return small(sizeof(v)*n) ? g_memset16_prev(dst, v, n)
+                                      : repsto(dst, v, n);
+        }
+        static inline void memset32(uint32_t* dst, uint32_t v, int n) {
+            return small(sizeof(v)*n) ? g_memset32_prev(dst, v, n)
+                                      : repsto(dst, v, n);
+        }
+        static inline void memset64(uint64_t* dst, uint64_t v, int n) {
+            return small(sizeof(v)*n) ? g_memset64_prev(dst, v, n)
+                                      : repsto(dst, v, n);
+        }
+
+        static inline void rect_memset16(uint16_t* dst, uint16_t v, int n,
+                                         size_t rowBytes, int height) {
+            if (small(sizeof(v)*n)) {
+                return g_rect_memset16_prev(dst,v,n, rowBytes,height);
+            }
+            for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
+                repsto(dst, v, n);
+            }
+        }
+        static inline void rect_memset32(uint32_t* dst, uint32_t v, int n,
+                                         size_t rowBytes, int height) {
+            if (small(sizeof(v)*n)) {
+                return g_rect_memset32_prev(dst,v,n, rowBytes,height);
+            }
+            for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
+                repsto(dst, v, n);
+            }
+        }
+        static inline void rect_memset64(uint64_t* dst, uint64_t v, int n,
+                                         size_t rowBytes, int height) {
+            if (small(sizeof(v)*n)) {
+                return g_rect_memset64_prev(dst,v,n, rowBytes,height);
+            }
+            for (int stride = rowBytes/sizeof(v); height --> 0; dst += stride) {
+                repsto(dst, v, n);
+            }
+        }
+    }  // namespace SK_OPTS_NS
+
+    namespace SkOpts {
+        void Init_erms() {
+            g_memset16_prev      = memset16;
+            g_memset32_prev      = memset32;
+            g_memset64_prev      = memset64;
+            g_rect_memset16_prev = rect_memset16;
+            g_rect_memset32_prev = rect_memset32;
+            g_rect_memset64_prev = rect_memset64;
+
+            memset16      = SK_OPTS_NS::memset16;
+            memset32      = SK_OPTS_NS::memset32;
+            memset64      = SK_OPTS_NS::memset64;
+            rect_memset16 = SK_OPTS_NS::rect_memset16;
+            rect_memset32 = SK_OPTS_NS::rect_memset32;
+            rect_memset64 = SK_OPTS_NS::rect_memset64;
+        }
+    }
+#else
+    namespace SkOpts {
+        void Init_erms() {}
+    }
+#endif