explicitly vectorize sk_memset{16,32,64}
This ought to help clients who don't enable autovectorization.

With autovectorization enabled, this new version is like, hyper-vectorized compared to the old autovectorization. Instead of handling 128 bytes max per loop, it now handles up to 512 bytes per loop. Pretty exciting.

Locally perf effects are a mix, but we'd expect this to help Chrome unambiguously if they've turned off autovectorization.

$ out/ok bench:samples=100 sw filter:match=memset32_\\d\* serial

Before:
[memset32_100000] 16ms @0 20.1ms @99 20.2ms @100
[memset32_10000] 1.07ms @0 1.26ms @99 1.31ms @100
[memset32_1000] 73.9µs @0 89.4µs @99 90.1µs @100
[memset32_100] 8.59µs @0 9.74µs @99 9.96µs @100
[memset32_10] 7.45µs @0 8.96µs @99 8.99µs @100
[memset32_1] 2.29µs @0 2.81µs @99 2.92µs @100

After:
[memset32_100000] 16.2ms @0 17.3ms @99 17.3ms @100
[memset32_10000] 1.06ms @0 1.18ms @99 1.23ms @100
[memset32_1000] 72µs @0 75.6µs @99 84.7µs @100
[memset32_100] 9.14µs @0 10.6µs @99 10.7µs @100
[memset32_10] 5.43µs @0 5.88µs @99 5.99µs @100
[memset32_1] 3.43µs @0 3.65µs @99 3.83µs @100

BUG=chromium:755391
Change-Id: If9059a30ca7a345f1f7c37bd51473c29e8bb8922
Reviewed-on: https://skia-review.googlesource.com/34746
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
135a1b10fe
commit
25954b64c0
@ -8,24 +8,36 @@
|
||||
#ifndef SkUtils_opts_DEFINED
|
||||
#define SkUtils_opts_DEFINED
|
||||
|
||||
#include "stdint.h"
|
||||
#include <stdint.h>
|
||||
#include "SkNx.h"
|
||||
|
||||
namespace SK_OPTS_NS {
|
||||
|
||||
static void memset16(uint16_t buffer[], uint16_t value, int count) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
buffer[i] = value;
|
||||
// Fills the first `count` elements of `buffer` with `value`.
//
// Writes whole vectors of N elements while at least N remain, then finishes
// the remainder one element at a time. A `count` of zero or less writes
// nothing, since neither loop condition holds.
template <typename T>
|
||||
static void memsetT(T buffer[], T value, int count) {
|
||||
// N is the number of T elements per vector store: 32 bytes' worth when
// __AVX__ is defined, 16 bytes' worth otherwise.
#if defined(__AVX__)
|
||||
static const int N = 32 / sizeof(T);
|
||||
#else
|
||||
static const int N = 16 / sizeof(T);
|
||||
#endif
|
||||
// Bulk phase: one N-element store per iteration.
while (count >= N) {
|
||||
// Presumably broadcasts `value` to all N lanes and writes them to `buffer`;
// SkNx comes from SkNx.h, which is not shown here -- confirm there
// (including whether store() tolerates unaligned pointers).
SkNx<N,T>(value).store(buffer);
|
||||
buffer += N;
|
||||
count -= N;
|
||||
}
|
||||
// Tail phase: fewer than N elements remain. `count --> 0` parses as
// `count-- > 0`, i.e. test, then post-decrement.
while (count --> 0) {
|
||||
*buffer++ = value;
|
||||
}
|
||||
}
|
||||
|
||||
// Sets `count` 16-bit elements of `buffer` to `value` by forwarding to
// memsetT with T deduced as uint16_t.
static void memset16(uint16_t buffer[], uint16_t value, int count) {
|
||||
memsetT(buffer, value, count);
|
||||
}
|
||||
// Sets `count` 32-bit elements of `buffer` to `value`.
static void memset32(uint32_t buffer[], uint32_t value, int count) {
|
||||
// NOTE(review): this listing shows both a scalar fill loop and a memsetT()
// call. This looks like a diff rendering in which the loop is the removed
// code and the memsetT() call is its replacement -- confirm against the
// actual header. Read literally, the buffer would be filled twice with the
// same value.
for (int i = 0; i < count; i++) {
|
||||
buffer[i] = value;
|
||||
}
|
||||
// Forwards to memsetT with T deduced as uint32_t.
memsetT(buffer, value, count);
|
||||
}
|
||||
// Sets `count` 64-bit elements of `buffer` to `value`.
static void memset64(uint64_t buffer[], uint64_t value, int count) {
|
||||
// NOTE(review): this listing shows both a scalar fill loop and a memsetT()
// call. This looks like a diff rendering in which the loop is the removed
// code and the memsetT() call is its replacement -- confirm against the
// actual header. Read literally, the buffer would be filled twice with the
// same value.
for (int i = 0; i < count; i++) {
|
||||
buffer[i] = value;
|
||||
}
|
||||
// Forwards to memsetT with T deduced as uint64_t.
memsetT(buffer, value, count);
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user