skia2/bench/MemcpyBench.cpp

/*
 * Copyright 2014 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "Benchmark.h"
#include "SkRandom.h"
#include "SkTemplates.h"
#include "SkUtils.h"

// Benchmark that copies fCount uint32_t values from fSrc to fDst with a caller-supplied copy
// routine.  Memcpy32 is any type callable as memcpy32(dst, src, count).
template <typename Memcpy32>
class Memcpy32Bench : public Benchmark {
public:
    // count is the number of uint32_t values copied per call.  The bench is named
    // "<name>_<count>".
    explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
        : fCount(count)
        , fMemcpy32(memcpy32)
        , fName(SkStringPrintf("%s_%d", name, count)) {}

    virtual const char* onGetName() SK_OVERRIDE { return fName.c_str(); }

    // This bench draws nothing, so it is only suitable for the non-rendering backend.
    virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
        return kNonRendering_Backend == backend;
    }

    // Sizes both buffers to fCount values and fills the source from an SkRandom.
    virtual void onPreDraw() SK_OVERRIDE {
        fDst.reset(fCount);
        fSrc.reset(fCount);

        SkRandom rng;
        int filled = 0;
        while (filled < fCount) {
            fSrc[filled] = rng.nextU();
            filled++;
        }
    }

    // Performs the copy under test once per loop.  The canvas is unused.
    virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
        for (int remaining = loops; remaining > 0; remaining--) {
            fMemcpy32(fDst, fSrc, fCount);
        }
    }

private:
    SkAutoTMalloc<uint32_t> fDst;  // Destination of every copy.
    SkAutoTMalloc<uint32_t> fSrc;  // Source of every copy, filled in onPreDraw().

    int fCount;                    // Number of uint32_t values per copy.
    Memcpy32 fMemcpy32;            // The copy routine being measured.
    const SkString fName;          // "<name>_<count>", returned by onGetName().
};

// Creates a new Memcpy32Bench, deducing the Memcpy32 type from the memcpy32 argument so
// callers don't have to spell out the template parameter.
template <typename Memcpy32>
static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
    typedef Memcpy32Bench<Memcpy32> BenchType;
    BenchType* bench = new BenchType(count, memcpy32, name);
    return bench;
}
// BENCH(fn, n) defines a bench that copies n values with fn, using the stringified fn as the
// name prefix.
#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )


// Let the libc developers do what they think is best: hand the whole copy to memcpy().
// count is a number of uint32_t values, so it is scaled to a byte length first.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    const size_t byteLength = sizeof(uint32_t) * count;
    memcpy(dst, src, byteLength);
}
// One bench per copy size, named memcpy32_memcpy_<count>.  count is in uint32_t values.
BENCH(memcpy32_memcpy, 10)
BENCH(memcpy32_memcpy, 100)
BENCH(memcpy32_memcpy, 1000)
BENCH(memcpy32_memcpy, 10000)
BENCH(memcpy32_memcpy, 100000)

// Let the compiler's autovectorizer do what it thinks is best: a plain element-by-element
// copy with no intrinsics.  A count of zero or less copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// One bench per copy size, named memcpy32_autovectorize_<count>.  count is in uint32_t values.
BENCH(memcpy32_autovectorize, 10)
BENCH(memcpy32_autovectorize, 100)
BENCH(memcpy32_autovectorize, 1000)
BENCH(memcpy32_autovectorize, 10000)
BENCH(memcpy32_autovectorize, 100000)

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
// Copies of fewer than 16 values skip the SSE2 path and go straight to the scalar loop.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Copy one value at a time until dst sits on a 16-byte boundary.
        for (; (reinterpret_cast<uintptr_t>(dst) & 0xF) != 0; count--) {
            *dst++ = *src++;
        }

        __m128i* vecDst = reinterpret_cast<__m128i*>(dst);
        const __m128i* vecSrc = reinterpret_cast<const __m128i*>(src);

        // Step the scalar pointers past everything the vector loop below will copy, so the
        // cleanup loop can resume from them.
        const int vectorized = 16 * (count / 16);
        dst += vectorized;
        src += vectorized;

        // Four 128-bit vectors (16 values) per iteration: all four loads, then all four stores.
        for (; count >= 16; count -= 16, vecSrc += 4, vecDst += 4) {
            const __m128i v0 = _mm_loadu_si128(vecSrc + 0);
            const __m128i v1 = _mm_loadu_si128(vecSrc + 1);
            const __m128i v2 = _mm_loadu_si128(vecSrc + 2);
            const __m128i v3 = _mm_loadu_si128(vecSrc + 3);

            _mm_store_si128(vecDst + 0, v0);
            _mm_store_si128(vecDst + 1, v1);
            _mm_store_si128(vecDst + 2, v2);
            _mm_store_si128(vecDst + 3, v3);
        }
    }

    // Scalar cleanup for whatever is left (fewer than 16 values).
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// One bench per copy size, named memcpy32_sse2_align_<count>.  The count of 10 is below the
// function's 16-value threshold, so that bench only exercises its scalar loop.
BENCH(memcpy32_sse2_align, 10)
BENCH(memcpy32_sse2_align, 100)
BENCH(memcpy32_sse2_align, 1000)
BENCH(memcpy32_sse2_align, 10000)
BENCH(memcpy32_sse2_align, 100000)

// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads
// for src.  Copies 16 values per iteration, then finishes the remainder one value at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    // Vector views of the buffers.  They are only ever passed to the unaligned intrinsics.
    __m128i* vecDst = reinterpret_cast<__m128i*>(dst);
    const __m128i* vecSrc = reinterpret_cast<const __m128i*>(src);

    // Step the scalar pointers past everything the vector loop below will copy, so the
    // cleanup loop can resume from them.
    const int vectorized = 16 * (count / 16);
    dst += vectorized;
    src += vectorized;

    // Four 128-bit vectors (16 values) per iteration: all four loads, then all four stores.
    for (; count >= 16; count -= 16, vecSrc += 4, vecDst += 4) {
        const __m128i v0 = _mm_loadu_si128(vecSrc + 0);
        const __m128i v1 = _mm_loadu_si128(vecSrc + 1);
        const __m128i v2 = _mm_loadu_si128(vecSrc + 2);
        const __m128i v3 = _mm_loadu_si128(vecSrc + 3);

        _mm_storeu_si128(vecDst + 0, v0);
        _mm_storeu_si128(vecDst + 1, v1);
        _mm_storeu_si128(vecDst + 2, v2);
        _mm_storeu_si128(vecDst + 3, v3);
    }

    // Scalar cleanup for whatever is left (fewer than 16 values).
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// One bench per copy size, named memcpy32_sse2_unalign_<count>.  The count of 10 never enters
// the function's 16-value vector loop, so that bench only exercises its scalar cleanup loop.
BENCH(memcpy32_sse2_unalign, 10)
BENCH(memcpy32_sse2_unalign, 100)
BENCH(memcpy32_sse2_unalign, 1000)
BENCH(memcpy32_sse2_unalign, 10000)
BENCH(memcpy32_sse2_unalign, 100000)

// Test our chosen best, from SkUtils.h
// NOTE(review): these benches sit inside the SSE2 #if block, so they are only compiled when
// SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 -- confirm that restriction is intended.
BENCH(sk_memcpy32, 10)
BENCH(sk_memcpy32, 100)
BENCH(sk_memcpy32, 1000)
BENCH(sk_memcpy32, 10000)
BENCH(sk_memcpy32, 100000)

#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

#undef BENCH
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`/*`
			`* Copyright 2014 Google Inc.`
			`*`
			`* Use of this source code is governed by a BSD-style license that can be`
			`* found in the LICENSE file.`
			`*/`

Remove Sk prefix from some bench classes. This idea came while commenting on https://codereview.chromium.org/343583005/ Since SkBenchmark, SkBenchLogger and SkGMBench are not part of the Skia library, they should not have the Sk prefix. BUG=None TEST=make all R=mtklein@google.com Author: tfarina@chromium.org Review URL: https://codereview.chromium.org/347823004 2014-06-19 19:32:29 +00:00			`#include "Benchmark.h"`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`#include "SkRandom.h"`
			`#include "SkTemplates.h"`
Add sk_memcpy32 to Memcpy bench. The bench predates the implementation in SkUtils, but now that we've got it of course we want to measure our actual implementation. BUG=skia: R=reed@google.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/302763006 git-svn-id: http://skia.googlecode.com/svn/trunk@14942 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-28 22:47:26 +00:00			`#include "SkUtils.h"`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00
			`template <typename Memcpy32>`
Remove Sk prefix from some bench classes. This idea came while commenting on https://codereview.chromium.org/343583005/ Since SkBenchmark, SkBenchLogger and SkGMBench are not part of the Skia library, they should not have the Sk prefix. BUG=None TEST=make all R=mtklein@google.com Author: tfarina@chromium.org Review URL: https://codereview.chromium.org/347823004 2014-06-19 19:32:29 +00:00			`class Memcpy32Bench : public Benchmark {`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`public:`
			`explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)`
			`: fCount(count)`
			`, fMemcpy32(memcpy32)`
			`, fName(SkStringPrintf("%s_%d", name, count)) {}`

			`virtual const char* onGetName() SK_OVERRIDE {`
			`return fName.c_str();`
			`}`

			`virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {`
			`return backend == kNonRendering_Backend;`
			`}`

			`virtual void onPreDraw() SK_OVERRIDE {`
			`fDst.reset(fCount);`
			`fSrc.reset(fCount);`

			`SkRandom rand;`
			`for (int i = 0; i < fCount; i++) {`
			`fSrc[i] = rand.nextU();`
			`}`
			`}`

			`virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {`
			`for (int i = 0; i < loops; i++) {`
			`fMemcpy32(fDst, fSrc, fCount);`
			`}`
			`}`

			`private:`
			`SkAutoTMalloc<uint32_t> fDst, fSrc;`

			`int fCount;`
			`Memcpy32 fMemcpy32;`
			`const SkString fName;`
			`};`

			`template <typename Memcpy32>`
			`static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {`
			`return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);`
			`}`
			`#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )`


			`// Let the libc developers do what they think is best.`
			`static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {`
			`memcpy(dst, src, sizeof(uint32_t) * count);`
			`}`
			`BENCH(memcpy32_memcpy, 10)`
			`BENCH(memcpy32_memcpy, 100)`
			`BENCH(memcpy32_memcpy, 1000)`
			`BENCH(memcpy32_memcpy, 10000)`
			`BENCH(memcpy32_memcpy, 100000)`

			`// Let the compiler's autovectorizer do what it thinks is best.`
			`static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {`
			`while (count --> 0) {`
			`*dst++ = *src++;`
			`}`
			`}`
			`BENCH(memcpy32_autovectorize, 10)`
			`BENCH(memcpy32_autovectorize, 100)`
			`BENCH(memcpy32_autovectorize, 1000)`
			`BENCH(memcpy32_autovectorize, 10000)`
			`BENCH(memcpy32_autovectorize, 100000)`

			`#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2`

			`// Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.`
			`static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {`
			`if (count >= 16) {`
			`while (uintptr_t(dst) & 0xF) {`
			`*dst++ = *src++;`
			`count--;`
			`}`

			`__m128i* dst128 = reinterpret_cast<__m128i*>(dst);`
			`const __m128i* src128 = reinterpret_cast<const __m128i*>(src);`
Fix memcpy32_sse2_unalign. The whole point of mempcy32_sse2_unalign is that we didn't align dst128 and src128. So it's not safe at all to cast them back to dst and src. That tells the compiler that dst/src are 128-bit aligned, and then it autovectorizes the cleanup while-loop using that (false) knowledge with aligned SSE instructions. This leads to crashes on memcpy32_sse2_unalign_10, which is small enough that we actually get non-16-byte aligned memory. The larger size benches could be crashing too, but they're big enough allocations that they're probably always 16-byte aligned anyway. BUG=skia:2589 R=fmalita@chromium.org, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/291893008 git-svn-id: http://skia.googlecode.com/svn/trunk@14851 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-22 18:24:42 +00:00			`dst += 16 * (count / 16);`
			`src += 16 * (count / 16);`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`while (count >= 16) {`
			`__m128i a = _mm_loadu_si128(src128++);`
			`__m128i b = _mm_loadu_si128(src128++);`
			`__m128i c = _mm_loadu_si128(src128++);`
			`__m128i d = _mm_loadu_si128(src128++);`

			`_mm_store_si128(dst128++, a);`
			`_mm_store_si128(dst128++, b);`
			`_mm_store_si128(dst128++, c);`
			`_mm_store_si128(dst128++, d);`

			`count -= 16;`
			`}`
			`}`

			`while (count --> 0) {`
			`*dst++ = *src++;`
			`}`
			`}`
			`BENCH(memcpy32_sse2_align, 10)`
			`BENCH(memcpy32_sse2_align, 100)`
			`BENCH(memcpy32_sse2_align, 1000)`
			`BENCH(memcpy32_sse2_align, 10000)`
			`BENCH(memcpy32_sse2_align, 100000)`

			`// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads for src.`
			`static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {`
			`__m128i* dst128 = reinterpret_cast<__m128i*>(dst);`
			`const __m128i* src128 = reinterpret_cast<const __m128i*>(src);`
Fix memcpy32_sse2_unalign. The whole point of mempcy32_sse2_unalign is that we didn't align dst128 and src128. So it's not safe at all to cast them back to dst and src. That tells the compiler that dst/src are 128-bit aligned, and then it autovectorizes the cleanup while-loop using that (false) knowledge with aligned SSE instructions. This leads to crashes on memcpy32_sse2_unalign_10, which is small enough that we actually get non-16-byte aligned memory. The larger size benches could be crashing too, but they're big enough allocations that they're probably always 16-byte aligned anyway. BUG=skia:2589 R=fmalita@chromium.org, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/291893008 git-svn-id: http://skia.googlecode.com/svn/trunk@14851 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-22 18:24:42 +00:00			`dst += 16 * (count / 16);`
			`src += 16 * (count / 16);`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`while (count >= 16) {`
			`__m128i a = _mm_loadu_si128(src128++);`
			`__m128i b = _mm_loadu_si128(src128++);`
			`__m128i c = _mm_loadu_si128(src128++);`
			`__m128i d = _mm_loadu_si128(src128++);`

			`_mm_storeu_si128(dst128++, a);`
			`_mm_storeu_si128(dst128++, b);`
			`_mm_storeu_si128(dst128++, c);`
			`_mm_storeu_si128(dst128++, d);`

			`count -= 16;`
			`}`

			`while (count --> 0) {`
			`*dst++ = *src++;`
			`}`
			`}`
Fix memcpy32_sse2_unalign. The whole point of mempcy32_sse2_unalign is that we didn't align dst128 and src128. So it's not safe at all to cast them back to dst and src. That tells the compiler that dst/src are 128-bit aligned, and then it autovectorizes the cleanup while-loop using that (false) knowledge with aligned SSE instructions. This leads to crashes on memcpy32_sse2_unalign_10, which is small enough that we actually get non-16-byte aligned memory. The larger size benches could be crashing too, but they're big enough allocations that they're probably always 16-byte aligned anyway. BUG=skia:2589 R=fmalita@chromium.org, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/291893008 git-svn-id: http://skia.googlecode.com/svn/trunk@14851 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-22 18:24:42 +00:00			`BENCH(memcpy32_sse2_unalign, 10)`
Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`BENCH(memcpy32_sse2_unalign, 100)`
			`BENCH(memcpy32_sse2_unalign, 1000)`
			`BENCH(memcpy32_sse2_unalign, 10000)`
			`BENCH(memcpy32_sse2_unalign, 100000)`

Add sk_memcpy32 to Memcpy bench. The bench predates the implementation in SkUtils, but now that we've got it of course we want to measure our actual implementation. BUG=skia: R=reed@google.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/302763006 git-svn-id: http://skia.googlecode.com/svn/trunk@14942 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-28 22:47:26 +00:00			`// Test our chosen best, from SkUtils.h`
			`BENCH(sk_memcpy32, 10)`
			`BENCH(sk_memcpy32, 100)`
			`BENCH(sk_memcpy32, 1000)`
			`BENCH(sk_memcpy32, 10000)`
			`BENCH(sk_memcpy32, 100000)`

Add Memcpy32 bench. This compares 32-bit copies using memcpy, autovectorization, and when SSE2 is available, aligned and unaligned SSE2. Running this on my desktop (Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz), I see all four perform essentially the same, except Clang's autovectorization looks a little better than GCC's. memcpy is calling libc 2.19's __memcpy_sse2_unaligned. BUG=skia: R=reed@google.com, qiankun.miao@intel.com, mtklein@google.com Author: mtklein@chromium.org Review URL: https://codereview.chromium.org/290533002 git-svn-id: http://skia.googlecode.com/svn/trunk@14799 2bbb7eff-a529-9590-31e7-b0007b416f81 2014-05-20 14:54:04 +00:00			`#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2`

			`#undef BENCH`