Clean up some benches that answer questions we're no longer asking.
NOTREECHECKS=true BUG=skia: R=reed@google.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/512503002
This commit is contained in:
parent
5e8dbd31de
commit
4473be874f
@ -67,87 +67,6 @@ BENCH(memcpy32_memcpy, 1000)
|
||||
BENCH(memcpy32_memcpy, 10000)
|
||||
BENCH(memcpy32_memcpy, 100000)
|
||||
|
||||
// Let the compiler's autovectorizer do what it thinks is best.
|
||||
// Copies `count` 32-bit words from src to dst one element at a time.
// A zero or negative count copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
|
||||
BENCH(memcpy32_autovectorize, 10)
|
||||
BENCH(memcpy32_autovectorize, 100)
|
||||
BENCH(memcpy32_autovectorize, 1000)
|
||||
BENCH(memcpy32_autovectorize, 10000)
|
||||
BENCH(memcpy32_autovectorize, 100000)
|
||||
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
|
||||
// Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.
|
||||
// Copies `count` 32-bit words from src to dst.
// For copies of at least 16 words, dst is first advanced word-by-word to a 16-byte boundary,
// then 16-word (64-byte) blocks are moved with unaligned loads and aligned stores.
// Whatever is left over is copied one word at a time.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Scalar prologue: step until the low four address bits of dst are clear.
        for (; (uintptr_t(dst) & 0xF) != 0; --count) {
            *dst++ = *src++;
        }

        // Number of full 16-word blocks remaining after the prologue (may be zero).
        const int blocks = count / 16;
        const __m128i* in  = reinterpret_cast<const __m128i*>(src);
        __m128i*       out = reinterpret_cast<__m128i*>(dst);

        for (int b = 0; b < blocks; ++b) {
            // Issue all four loads before the four stores.
            const __m128i v0 = _mm_loadu_si128(in + 0);
            const __m128i v1 = _mm_loadu_si128(in + 1);
            const __m128i v2 = _mm_loadu_si128(in + 2);
            const __m128i v3 = _mm_loadu_si128(in + 3);

            _mm_store_si128(out + 0, v0);
            _mm_store_si128(out + 1, v1);
            _mm_store_si128(out + 2, v2);
            _mm_store_si128(out + 3, v3);

            in  += 4;
            out += 4;
        }

        // Move the scalar cursors past everything the vector loop handled.
        dst   += 16 * blocks;
        src   += 16 * blocks;
        count -= 16 * blocks;
    }

    // Scalar tail (also the whole copy when count < 16).
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
|
||||
BENCH(memcpy32_sse2_align, 10)
|
||||
BENCH(memcpy32_sse2_align, 100)
|
||||
BENCH(memcpy32_sse2_align, 1000)
|
||||
BENCH(memcpy32_sse2_align, 10000)
|
||||
BENCH(memcpy32_sse2_align, 100000)
|
||||
|
||||
// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src.
|
||||
// Copies `count` 32-bit words from src to dst without aligning either pointer:
// full 16-word (64-byte) blocks go through unaligned SSE2 loads and unaligned stores,
// and the remaining words are copied one at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    // Number of full 16-word blocks; zero when count < 16.
    const int blocks = count / 16;
    const __m128i* in  = reinterpret_cast<const __m128i*>(src);
    __m128i*       out = reinterpret_cast<__m128i*>(dst);

    for (int b = 0; b < blocks; ++b) {
        // Issue all four loads before the four stores.
        const __m128i v0 = _mm_loadu_si128(in + 0);
        const __m128i v1 = _mm_loadu_si128(in + 1);
        const __m128i v2 = _mm_loadu_si128(in + 2);
        const __m128i v3 = _mm_loadu_si128(in + 3);

        _mm_storeu_si128(out + 0, v0);
        _mm_storeu_si128(out + 1, v1);
        _mm_storeu_si128(out + 2, v2);
        _mm_storeu_si128(out + 3, v3);

        in  += 4;
        out += 4;
    }

    // Move the scalar cursors past everything the vector loop handled.
    dst   += 16 * blocks;
    src   += 16 * blocks;
    count -= 16 * blocks;

    // Scalar tail.
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
|
||||
BENCH(memcpy32_sse2_unalign, 10)
|
||||
BENCH(memcpy32_sse2_unalign, 100)
|
||||
BENCH(memcpy32_sse2_unalign, 1000)
|
||||
BENCH(memcpy32_sse2_unalign, 10000)
|
||||
BENCH(memcpy32_sse2_unalign, 100000)
|
||||
|
||||
// Test our chosen best, from SkUtils.h
|
||||
BENCH(sk_memcpy32, 10)
|
||||
BENCH(sk_memcpy32, 100)
|
||||
@ -155,6 +74,4 @@ BENCH(sk_memcpy32, 1000)
|
||||
BENCH(sk_memcpy32, 10000)
|
||||
BENCH(sk_memcpy32, 100000)
|
||||
|
||||
#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
|
||||
#undef BENCH
|
||||
|
@ -56,110 +56,3 @@ private:
|
||||
DEF_BENCH( return new ChunkAllocBench(64); )
|
||||
DEF_BENCH( return new ChunkAllocBench(8*1024); )
|
||||
|
||||
// Requests room for `num` ints from sk_calloc_throw and returns it typed as int*.
static int* calloc(size_t num) {
    const size_t bytes = num * sizeof(int);
    void* block = sk_calloc_throw(bytes);
    return (int*)block;
}
|
||||
|
||||
// Requests room for `num` ints from sk_malloc_throw, clears those bytes with sk_bzero,
// and returns the buffer typed as int*.
static int* malloc_bzero(size_t num) {
    const size_t size = num * sizeof(int);
    int* buffer = (int*)sk_malloc_throw(size);
    sk_bzero(buffer, size);
    return buffer;
}
|
||||
|
||||
class ZerosBench : public Benchmark {
|
||||
size_t fNum;
|
||||
bool fRead;
|
||||
bool fWrite;
|
||||
bool fUseCalloc;
|
||||
SkString fName;
|
||||
public:
|
||||
ZerosBench(size_t num, bool read, bool write, bool useCalloc)
|
||||
: fNum(num)
|
||||
, fRead(read)
|
||||
, fWrite(write)
|
||||
, fUseCalloc(useCalloc) {
|
||||
fName.printf("memory_%s", useCalloc ? "calloc" : "malloc_bzero");
|
||||
if (read && write) {
|
||||
fName.appendf("_rw");
|
||||
} else if (read) {
|
||||
fName.appendf("_r");
|
||||
} else if (write) {
|
||||
fName.appendf("_w");
|
||||
}
|
||||
fName.appendf("_" SK_SIZE_T_SPECIFIER, num);
|
||||
}
|
||||
|
||||
virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
|
||||
return backend == kNonRendering_Backend;
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual const char* onGetName() SK_OVERRIDE {
|
||||
return fName.c_str();
|
||||
}
|
||||
|
||||
virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
|
||||
for (int i = 0; i < loops; i++) {
|
||||
int* zeros = fUseCalloc ? calloc(fNum) : malloc_bzero(fNum);
|
||||
if (fRead) {
|
||||
volatile int x = 15;
|
||||
for (size_t j = 0; j < fNum; j++) {
|
||||
x ^= zeros[j];
|
||||
}
|
||||
}
|
||||
if (fWrite) {
|
||||
for (size_t j = 0; j < fNum; j++) {
|
||||
zeros[j] = 15;
|
||||
}
|
||||
}
|
||||
sk_free(zeros);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// zero count r w useCalloc?
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 0, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 0, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 0, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 0, 1, 1))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 1, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 1, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 1, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(1024*1024, 1, 1, 1))
|
||||
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 0, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 0, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 0, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 0, 1, 1))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 1, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 1, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 1, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(256*1024, 1, 1, 1))
|
||||
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 0, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 0, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 0, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 0, 1, 1))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 1, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 1, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 1, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(4*1024, 1, 1, 1))
|
||||
|
||||
DEF_BENCH(return new ZerosBench(300, 0, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(300, 0, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(300, 0, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(300, 0, 1, 1))
|
||||
DEF_BENCH(return new ZerosBench(300, 1, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(300, 1, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(300, 1, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(300, 1, 1, 1))
|
||||
|
||||
DEF_BENCH(return new ZerosBench(4, 0, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(4, 0, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(4, 0, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(4, 0, 1, 1))
|
||||
DEF_BENCH(return new ZerosBench(4, 1, 0, 0))
|
||||
DEF_BENCH(return new ZerosBench(4, 1, 0, 1))
|
||||
DEF_BENCH(return new ZerosBench(4, 1, 1, 0))
|
||||
DEF_BENCH(return new ZerosBench(4, 1, 1, 1))
|
||||
|
@ -1,179 +0,0 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "Benchmark.h"
|
||||
#include "SkRandom.h"
|
||||
|
||||
#include "SkChunkAlloc.h"
|
||||
#include "SkDeque.h"
|
||||
#include "SkTArray.h"
|
||||
#include "SkTDArray.h"
|
||||
|
||||
// This file has several benchmarks using various data structures to do stack-like things:
|
||||
// - push
|
||||
// - push, immediately pop
|
||||
// - push many, pop all of them
|
||||
// - serial access
|
||||
// - random access
|
||||
// When a data structure doesn't support an operation efficiently, we leave that combination out.
|
||||
// Where possible we hint to the data structure to allocate in 4K pages.
|
||||
//
|
||||
// These benchmarks may help you decide which data structure to use for a dynamically allocated
|
||||
// ordered list of allocations that grows on one end.
|
||||
//
|
||||
// Current overall winner (01/2014): SkTDArray.
|
||||
// It wins every benchmark on every machine I tried (Desktop, Nexus S, Laptop).
|
||||
|
||||
template <typename Impl>
|
||||
struct StackBench : public Benchmark {
|
||||
virtual bool isSuitableFor(Backend b) SK_OVERRIDE { return b == kNonRendering_Backend; }
|
||||
virtual const char* onGetName() SK_OVERRIDE { return Impl::kName; }
|
||||
virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { Impl::bench(loops); }
|
||||
};
|
||||
|
||||
// BENCH(name) { ...body... }
// Declares a struct `name` holding a kName string (the stringized name) and a static
// bench(int) function, registers a StackBench<name> through DEF_BENCH, and then opens the
// definition of name::bench(int loops) so that the braces following the macro become its body.
#define BENCH(name) \
|
||||
struct name { static const char* const kName; static void bench(int); }; \
|
||||
const char* const name::kName = #name; \
|
||||
DEF_BENCH(return new StackBench<name>();) \
|
||||
void name::bench(int loops)
|
||||
|
||||
static const int K = 2049;
|
||||
|
||||
// Add K items, then iterate through them serially many times.
|
||||
|
||||
// Fill an SkDeque with K ints once, then walk it from the front `loops` times,
// summing into a volatile.
BENCH(Deque_Serial) {
    SkDeque deque(sizeof(int), 1024);
    for (int n = 0; n < K; ++n) {
        *(int*)deque.push_back() = n;
    }

    volatile int junk = 0;
    for (int pass = 0; pass < loops; ++pass) {
        SkDeque::Iter iter(deque, SkDeque::Iter::kFront_IterStart);
        // The walk stops when next() returns a null pointer.
        for (void* item = iter.next(); item; item = iter.next()) {
            junk += *(int*)item;
        }
    }
}
|
||||
|
||||
// Fill an SkTArray with K ints once, then index through it in order `loops` times,
// summing into a volatile.
BENCH(TArray_Serial) {
    SkTArray<int, true> array;
    for (int n = 0; n < K; ++n) {
        array.push_back(n);
    }

    volatile int junk = 0;
    for (int pass = 0; pass < loops; ++pass) {
        for (int idx = 0; idx < array.count(); ++idx) {
            junk += array[idx];
        }
    }
}
|
||||
|
||||
// Fill an SkTDArray with K ints once, then index through it in order `loops` times,
// summing into a volatile.
BENCH(TDArray_Serial) {
    SkTDArray<int> array;
    for (int n = 0; n < K; ++n) {
        array.push(n);
    }

    volatile int junk = 0;
    for (int pass = 0; pass < loops; ++pass) {
        for (int idx = 0; idx < array.count(); ++idx) {
            junk += array[idx];
        }
    }
}
|
||||
|
||||
// Add K items, then randomly access them many times.
|
||||
|
||||
// Fill an SkTArray with K ints once, then read K*loops elements at indices drawn from
// SkRandom::nextULessThan(K), summing into a volatile.
BENCH(TArray_RandomAccess) {
    SkTArray<int, true> array;
    for (int n = 0; n < K; ++n) {
        array.push_back(n);
    }

    SkRandom rand;
    volatile int junk = 0;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        junk += array[rand.nextULessThan(K)];
    }
}
|
||||
|
||||
// Fill an SkTDArray with K ints once, then read K*loops elements at indices drawn from
// SkRandom::nextULessThan(K), summing into a volatile.
BENCH(TDArray_RandomAccess) {
    SkTDArray<int> array;
    for (int n = 0; n < K; ++n) {
        array.push(n);
    }

    SkRandom rand;
    volatile int junk = 0;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        junk += array[rand.nextULessThan(K)];
    }
}
|
||||
|
||||
// Push many times.
|
||||
|
||||
// Request K*loops int-sized allocations from a single SkChunkAlloc.
BENCH(ChunkAlloc_Push) {
    SkChunkAlloc alloc(4096);
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        alloc.allocThrow(sizeof(int));
    }
}
|
||||
|
||||
// Append K*loops ints to the back of a single SkDeque.
BENCH(Deque_Push) {
    SkDeque deque(sizeof(int), 1024);
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        *(int*)deque.push_back() = n;
    }
}
|
||||
|
||||
// Append K*loops ints to a single SkTArray.
BENCH(TArray_Push) {
    SkTArray<int, true> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push_back(n);
    }
}
|
||||
|
||||
// Append K*loops ints to a single SkTDArray.
BENCH(TDArray_Push) {
    SkTDArray<int> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push(n);
    }
}
|
||||
|
||||
// Push then immediately pop many times.
|
||||
|
||||
// K*loops times: take an int-sized allocation from an SkChunkAlloc and hand it straight back.
BENCH(ChunkAlloc_PushPop) {
    SkChunkAlloc alloc(4096);
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        void* block = alloc.allocThrow(sizeof(int));
        alloc.unalloc(block);
    }
}
|
||||
|
||||
// K*loops times: push an int on the back of an SkDeque and pop it right away.
BENCH(Deque_PushPop) {
    SkDeque deque(sizeof(int), 1024);
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        *(int*)deque.push_back() = n;
        deque.pop_back();
    }
}
|
||||
|
||||
// K*loops times: push an int on the back of an SkTArray and pop it right away.
BENCH(TArray_PushPop) {
    SkTArray<int, true> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push_back(n);
        array.pop_back();
    }
}
|
||||
|
||||
// K*loops times: push an int onto an SkTDArray and pop it right away.
BENCH(TDArray_PushPop) {
    SkTDArray<int> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push(n);
        array.pop();
    }
}
|
||||
|
||||
// Push many items, then pop them all.
|
||||
|
||||
// Push K*loops ints onto the back of an SkDeque, then pop the same number back off.
BENCH(Deque_PushAllPopAll) {
    SkDeque deque(sizeof(int), 1024);
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        *(int*)deque.push_back() = n;
    }
    for (int n = 0; n < total; ++n) {
        deque.pop_back();
    }
}
|
||||
|
||||
// Push K*loops ints onto an SkTArray, then pop the same number back off.
BENCH(TArray_PushAllPopAll) {
    SkTArray<int, true> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push_back(n);
    }
    for (int n = 0; n < total; ++n) {
        array.pop_back();
    }
}
|
||||
|
||||
// Push K*loops ints onto an SkTDArray, then pop the same number back off.
BENCH(TDArray_PushAllPopAll) {
    SkTDArray<int> array;
    const int total = K * loops;
    for (int n = 0; n < total; ++n) {
        array.push(n);
    }
    for (int n = 0; n < total; ++n) {
        array.pop();
    }
}
|
@ -92,7 +92,6 @@
|
||||
'../bench/ShaderMaskBench.cpp',
|
||||
'../bench/SkipZeroesBench.cpp',
|
||||
'../bench/SortBench.cpp',
|
||||
'../bench/StackBench.cpp',
|
||||
'../bench/StrokeBench.cpp',
|
||||
'../bench/TableBench.cpp',
|
||||
'../bench/TextBench.cpp',
|
||||
|
Loading…
Reference in New Issue
Block a user