0be2d8354b
The whole point of memcpy32_sse2_unalign is that we didn't align dst128 and
src128, so it's not safe at all to cast them back to dst and src.  That tells
the compiler that dst/src are 16-byte aligned, and it then autovectorizes the
cleanup while-loop using that (false) knowledge with aligned SSE instructions.
This leads to crashes in memcpy32_sse2_unalign_10, which is small enough that
we actually get memory that isn't 16-byte aligned.  The larger-size benches
could be crashing too, but their allocations are big enough that they're
probably always 16-byte aligned anyway.

BUG=skia:2589
R=fmalita@chromium.org, mtklein@google.com
Author: mtklein@chromium.org

Review URL: https://codereview.chromium.org/291893008

git-svn-id: http://skia.googlecode.com/svn/trunk@14851 2bbb7eff-a529-9590-31e7-b0007b416f81
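For reference, the broken pattern looks roughly like this (a sketch reconstructed
from the description above, not the exact pre-fix code; the _buggy suffix is ours):

    static void memcpy32_sse2_unalign_buggy(uint32_t* dst, const uint32_t* src, int count) {
        __m128i*       dst128 = reinterpret_cast<__m128i*>(dst);
        const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
        while (count >= 16) {
            // Copy 16 uint32_ts (64 bytes) per iteration with unaligned loads/stores.
            _mm_storeu_si128(dst128++, _mm_loadu_si128(src128++));
            _mm_storeu_si128(dst128++, _mm_loadu_si128(src128++));
            _mm_storeu_si128(dst128++, _mm_loadu_si128(src128++));
            _mm_storeu_si128(dst128++, _mm_loadu_si128(src128++));
            count -= 16;
        }
        // Bug: __m128i is a 16-byte-aligned type, so casting back promises the
        // compiler that dst and src are 16-byte aligned.  The autovectorizer may
        // then emit aligned SSE stores for the cleanup loop below, crashing when
        // the buffers aren't actually 16-byte aligned.
        dst = reinterpret_cast<uint32_t*>(dst128);
        src = reinterpret_cast<const uint32_t*>(src128);
        while (count --> 0) {
            *dst++ = *src++;
        }
    }

The fixed memcpy32_sse2_unalign below advances dst and src independently instead,
so the compiler learns nothing (false) about their alignment.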
/*
 * Copyright 2014 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBenchmark.h"
#include "SkRandom.h"
#include "SkTemplates.h"

#include <string.h>  // memcpy

template <typename Memcpy32>
class Memcpy32Bench : public SkBenchmark {
public:
    explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
        : fCount(count)
        , fMemcpy32(memcpy32)
        , fName(SkStringPrintf("%s_%d", name, count)) {}

    virtual const char* onGetName() SK_OVERRIDE {
        return fName.c_str();
    }

    virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
        return backend == kNonRendering_Backend;
    }

    virtual void onPreDraw() SK_OVERRIDE {
        fDst.reset(fCount);
        fSrc.reset(fCount);

        SkRandom rand;
        for (int i = 0; i < fCount; i++) {
            fSrc[i] = rand.nextU();
        }
    }

    virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
        for (int i = 0; i < loops; i++) {
            fMemcpy32(fDst, fSrc, fCount);
        }
    }

private:
    SkAutoTMalloc<uint32_t> fDst, fSrc;

    int fCount;
    Memcpy32 fMemcpy32;
    const SkString fName;
};

template <typename Memcpy32>
static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
    return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
}
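// BENCH(f, count) registers one benchmark per (function, count) pair; each shows up
// under the name "<f>_<count>", e.g. BENCH(memcpy32_memcpy, 10) as memcpy32_memcpy_10.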
#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )

// Let the libc developers do what they think is best.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    memcpy(dst, src, sizeof(uint32_t) * count);
}
BENCH(memcpy32_memcpy, 10)
BENCH(memcpy32_memcpy, 100)
BENCH(memcpy32_memcpy, 1000)
BENCH(memcpy32_memcpy, 10000)
BENCH(memcpy32_memcpy, 100000)

// Let the compiler's autovectorizer do what it thinks is best.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    while (count --> 0) {
        *dst++ = *src++;
    }
}
BENCH(memcpy32_autovectorize, 10)
BENCH(memcpy32_autovectorize, 100)
BENCH(memcpy32_autovectorize, 1000)
BENCH(memcpy32_autovectorize, 10000)
BENCH(memcpy32_autovectorize, 100000)

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

#include <emmintrin.h>  // SSE2 intrinsics: __m128i, _mm_loadu_si128(), _mm_store(u)_si128().

// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        while (uintptr_t(dst) & 0xF) {
            *dst++ = *src++;
            count--;
        }

        __m128i*       dst128 = reinterpret_cast<__m128i*>(dst);
        const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
        dst += 16 * (count / 16);
        src += 16 * (count / 16);
        while (count >= 16) {
            __m128i a = _mm_loadu_si128(src128++);
            __m128i b = _mm_loadu_si128(src128++);
            __m128i c = _mm_loadu_si128(src128++);
            __m128i d = _mm_loadu_si128(src128++);

            _mm_store_si128(dst128++, a);
            _mm_store_si128(dst128++, b);
            _mm_store_si128(dst128++, c);
            _mm_store_si128(dst128++, d);

            count -= 16;
        }
    }

    while (count --> 0) {
        *dst++ = *src++;
    }
}
BENCH(memcpy32_sse2_align, 10)
BENCH(memcpy32_sse2_align, 100)
BENCH(memcpy32_sse2_align, 1000)
BENCH(memcpy32_sse2_align, 10000)
BENCH(memcpy32_sse2_align, 100000)

// Leave both dst and src unaligned, using unaligned loads from src and unaligned stores to dst.
// Note that dst and src advance independently of dst128 and src128: casting dst128/src128 back
// to uint32_t* would promise the compiler 16-byte alignment, and it could then autovectorize
// the cleanup loop below with aligned instructions (the skia:2589 crash described above).
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    __m128i*       dst128 = reinterpret_cast<__m128i*>(dst);
    const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
    dst += 16 * (count / 16);
    src += 16 * (count / 16);
    while (count >= 16) {
        __m128i a = _mm_loadu_si128(src128++);
        __m128i b = _mm_loadu_si128(src128++);
        __m128i c = _mm_loadu_si128(src128++);
        __m128i d = _mm_loadu_si128(src128++);

        _mm_storeu_si128(dst128++, a);
        _mm_storeu_si128(dst128++, b);
        _mm_storeu_si128(dst128++, c);
        _mm_storeu_si128(dst128++, d);

        count -= 16;
    }

    while (count --> 0) {
        *dst++ = *src++;
    }
}
BENCH(memcpy32_sse2_unalign, 10)
BENCH(memcpy32_sse2_unalign, 100)
BENCH(memcpy32_sse2_unalign, 1000)
BENCH(memcpy32_sse2_unalign, 10000)
BENCH(memcpy32_sse2_unalign, 100000)

#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

#undef BENCH