2013-06-03 16:54:10 +00:00
|
|
|
/*
|
|
|
|
* Copyright 2013 Google Inc.
|
|
|
|
*
|
|
|
|
* Use of this source code is governed by a BSD-style license that can be
|
|
|
|
* found in the LICENSE file.
|
|
|
|
*/
|
|
|
|
|
2019-04-23 17:05:21 +00:00
|
|
|
#include "bench/Benchmark.h"
|
|
|
|
#include "include/private/SkTemplates.h"
|
2021-10-28 16:38:48 +00:00
|
|
|
#include "src/core/SkOpts.h"
|
2013-06-03 16:54:10 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
template <typename T>
|
2014-06-19 19:32:29 +00:00
|
|
|
class MemsetBench : public Benchmark {
|
2013-06-03 16:54:10 +00:00
|
|
|
public:
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
explicit MemsetBench(size_t bytes)
|
|
|
|
: fN(bytes / sizeof(T))
|
|
|
|
, fBuffer(fN)
|
|
|
|
, fName(SkStringPrintf("memset%zu_%zu", sizeof(T)*8, bytes)) {}
|
2013-11-21 06:21:58 +00:00
|
|
|
|
2015-04-09 21:05:17 +00:00
|
|
|
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
|
|
|
const char* onGetName() override { return fName.c_str(); }
|
2013-06-03 16:54:10 +00:00
|
|
|
|
2015-10-01 16:43:39 +00:00
|
|
|
void onDraw(int loops, SkCanvas*) override;
|
2013-06-03 16:54:10 +00:00
|
|
|
|
|
|
|
private:
|
2015-04-09 21:05:17 +00:00
|
|
|
int fN;
|
|
|
|
SkAutoTMalloc<T> fBuffer;
|
|
|
|
SkString fName;
|
2013-06-03 16:54:10 +00:00
|
|
|
};
|
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// 64-bit specialization: calls sk_memset64 over the whole buffer (fN elements)
// 1000 times per requested loop. The canvas argument is unused.
template <> void MemsetBench<uint64_t>::onDraw(int loops, SkCanvas*) {
    const int iterations = 1000*loops;
    for (int remaining = iterations; remaining > 0; --remaining) {
        sk_memset64(fBuffer.get(), 0xFACEFACEFACEFACE, fN);
    }
}
|
2013-06-03 16:54:10 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// 32-bit specialization: calls sk_memset32 over the whole buffer (fN elements)
// 1000 times per requested loop. The canvas argument is unused.
template <> void MemsetBench<uint32_t>::onDraw(int loops, SkCanvas*) {
    const int iterations = 1000*loops;
    for (int remaining = iterations; remaining > 0; --remaining) {
        sk_memset32(fBuffer.get(), 0xFACEB004, fN);
    }
}
|
2013-06-03 16:54:10 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// 16-bit specialization: calls sk_memset16 over the whole buffer (fN elements)
// 1000 times per requested loop. The canvas argument is unused.
template <> void MemsetBench<uint16_t>::onDraw(int loops, SkCanvas*) {
    const int iterations = 1000*loops;
    for (int remaining = iterations; remaining > 0; --remaining) {
        sk_memset16(fBuffer.get(), 0x4973, fN);
    }
}
|
2013-06-03 16:54:10 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// Register the 64-bit memset benchmark for buffer sizes from 16 to 65536 bytes.
// NOTE(review): keep one DEF_BENCH per source line — the macro presumably derives
// unique identifiers from the line number; confirm against bench/Benchmark.h.
DEF_BENCH(return new MemsetBench<uint64_t>(16));
DEF_BENCH(return new MemsetBench<uint64_t>(64));
DEF_BENCH(return new MemsetBench<uint64_t>(256));
DEF_BENCH(return new MemsetBench<uint64_t>(512));
DEF_BENCH(return new MemsetBench<uint64_t>(768));
DEF_BENCH(return new MemsetBench<uint64_t>(1024));
DEF_BENCH(return new MemsetBench<uint64_t>(2048));
DEF_BENCH(return new MemsetBench<uint64_t>(4096));
DEF_BENCH(return new MemsetBench<uint64_t>(65536));
|
2015-04-09 21:05:17 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// Register the 32-bit memset benchmark for buffer sizes from 16 to 65536 bytes.
// NOTE(review): keep one DEF_BENCH per source line — the macro presumably derives
// unique identifiers from the line number; confirm against bench/Benchmark.h.
DEF_BENCH(return new MemsetBench<uint32_t>(16));
DEF_BENCH(return new MemsetBench<uint32_t>(64));
DEF_BENCH(return new MemsetBench<uint32_t>(256));
DEF_BENCH(return new MemsetBench<uint32_t>(512));
DEF_BENCH(return new MemsetBench<uint32_t>(768));
DEF_BENCH(return new MemsetBench<uint32_t>(1024));
DEF_BENCH(return new MemsetBench<uint32_t>(2048));
DEF_BENCH(return new MemsetBench<uint32_t>(4096));
DEF_BENCH(return new MemsetBench<uint32_t>(65536));
|
2015-04-09 21:05:17 +00:00
|
|
|
|
Reland "add ERMS (enhanced rep mov/sto) SkOpts slice"
This is a reland of 26ad8ccdeca1e8a065c7d45d3620d270da45acd6
... now with MSAN support.
Original change's description:
> add ERMS (enhanced rep mov/sto) SkOpts slice
>
> Intel's got two CPUID bits indicating the speed of rep mov/sto
> (memcpy/memset),
>
> - ERMS, Enhanced Rep Mov/Sto, older, large copies are fast?
> - FSRM, Fast Short Rep Mov, newer, small copies are fast?
>
> ERMS has been around a long time on Intel, but is relatively recent on
> Ryzen, and FSRM is new across the board. The startup cost for
> ERMS-but-not-FSRM copies really is noticeable, so we cut over to the
> previous SSE/AVX routines when N is small.
>
> I've left the memset benchmarks as I found them most useful when
> tuning the small/large cutoff in this CL.
>
> Change-Id: I3ac4e3f34796aba0ea86aabbe9dda7526919456a
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/332580
> Reviewed-by: Herb Derby <herb@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>
Cq-Include-Trybots: luci.skia.skia.primary:Test-Debian10-Clang-GCE-CPU-AVX2-x86_64-Release-All-MSAN
Change-Id: Ia293bba90022c48c884599331ef35aa67644729b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/334343
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
2020-11-05 15:38:53 +00:00
|
|
|
// Register the 16-bit memset benchmark for buffer sizes from 16 to 65536 bytes.
// NOTE(review): keep one DEF_BENCH per source line — the macro presumably derives
// unique identifiers from the line number; confirm against bench/Benchmark.h.
DEF_BENCH(return new MemsetBench<uint16_t>(16));
DEF_BENCH(return new MemsetBench<uint16_t>(64));
DEF_BENCH(return new MemsetBench<uint16_t>(256));
DEF_BENCH(return new MemsetBench<uint16_t>(512));
DEF_BENCH(return new MemsetBench<uint16_t>(768));
DEF_BENCH(return new MemsetBench<uint16_t>(1024));
DEF_BENCH(return new MemsetBench<uint16_t>(2048));
DEF_BENCH(return new MemsetBench<uint16_t>(4096));
DEF_BENCH(return new MemsetBench<uint16_t>(65536));
|