skia2/bench/Sk4fBench.cpp

/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "Benchmark.h"
#include "SkColor.h"
#include "SkNx.h"

// Global sink used to prevent the compiler from optimizing away the whole
// benchmark loop: Sk4fBytesRoundtripBench::onDraw XORs its accumulated result
// into this volatile once per call, so the loop's work has an observable effect.
volatile uint32_t blackhole = 0;

// Minimal linear congruential generator: advances *seed in place as
//     *seed = *seed * 1664525 + 1013904223   (arithmetic wraps modulo 2^32)
// and returns the new state.  Statistical quality is poor, but it is very
// cheap, which matters because the code being benchmarked is itself so fast
// that generator overhead would otherwise dominate the measurement.
static uint32_t lcg_rand(uint32_t* seed) {
    const uint32_t next = (*seed) * 1664525u + 1013904223u;
    *seed = next;
    return next;
}

// Non-rendering benchmark named "Sk4f_roundtrip".
//
// Each iteration takes a pseudo-random 32-bit value from lcg_rand(), loads its
// bytes with Sk4b::Load, casts them to float lanes with SkNx_cast<float>,
// casts back with SkNx_cast<uint8_t>, and stores the bytes into a second
// 32-bit value.  The results are XOR-accumulated so the work cannot be
// discarded by the optimizer.
struct Sk4fBytesRoundtripBench : public Benchmark {
    Sk4fBytesRoundtripBench() {}

    const char* onGetName() override {
        return "Sk4f_roundtrip";
    }

    // Only runs on the non-rendering backend; the canvas is never used.
    bool isSuitableFor(Backend backend) override {
        return backend == kNonRendering_Backend;
    }

    void onDraw(int loops, SkCanvas* canvas) override {
        uint32_t rngState = 0;
        // Local accumulator: unlike the volatile blackhole, this can (and
        // probably will) live in a register for the duration of the loop.
        uint32_t accum = 0;

        while (loops-- > 0) {
            uint32_t srcColor = lcg_rand(&rngState);
            uint32_t dstColor;  // fully written by store() below

            auto lanes = SkNx_cast<float>(
                    Sk4b::Load(reinterpret_cast<const uint8_t*>(&srcColor)));
            SkNx_cast<uint8_t>(lanes).store(reinterpret_cast<uint8_t*>(&dstColor));

            accum ^= dstColor;
        }

        // One volatile write per call keeps the whole loop observable.
        blackhole ^= accum;
    }
};
DEF_BENCH(return new Sk4fBytesRoundtripBench;)

struct Sk4fGradientBench : public Benchmark {
    const char* onGetName() override { return "Sk4f_gradient"; }
    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }

    SkPMColor fDevice[100];
    void onDraw(int loops, SkCanvas*) override {
        Sk4f c0(0,0,255,255),
             c1(255,0,0,255),
             dc = c1 - c0,
             fx(0.1f),
             dx(0.002f),
             dcdx(dc*dx),
             dcdx4(dcdx+dcdx+dcdx+dcdx);

        for (int n = 0; n < loops; n++) {
            Sk4f a = c0 + dc*fx + Sk4f(0.5f),  // add an extra 0.5f to get rounding for free.
                 b = a + dcdx,
                 c = b + dcdx,
                 d = c + dcdx;
            for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
                Sk4f_ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
                a = a + dcdx4;
                b = b + dcdx4;
                c = c + dcdx4;
                d = d + dcdx4;
            }
        }
    }
};
DEF_BENCH(return new Sk4fGradientBench;)
Update 4-at-a-time APIs. There is no reason to require the 4 SkPMFloats (registers) to be adjacent. The only potential win in loads and stores comes from the SkPMColors being adjacent. Makes no difference to existing bench. BUG=skia: Review URL: https://codereview.chromium.org/1035583002 2015-03-25 20:43:34 +00:00			`/*`
			`* Copyright 2015 Google Inc.`
			`*`
			`* Use of this source code is governed by a BSD-style license that can be`
			`* found in the LICENSE file.`
			`*/`

Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00			`#include "Benchmark.h"`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`#include "SkColor.h"`
			`#include "SkNx.h"`
Trim the fat off SkPMFloat bench. This bench was ~75% overhead, ~25% good bench. It is now just about the opposite: about 30% of the runtime is loop and random number overhead, and about 70% of the time is spent doing SkPMColor <-> SkPMFloat work. BUG=skia: NOPRESUBMIT=true Review URL: https://codereview.chromium.org/968133005 2015-03-03 16:03:27 +00:00
			`// Used to prevent the compiler from optimizing away the whole loop.`
			`volatile uint32_t blackhole = 0;`

			`// Not a great random number generator, but it's very fast.`
			`// The code we're measuring is quite fast, so low overhead is essential.`
			`static uint32_t lcg_rand(uint32_t* seed) {`
			`*seed *= 1664525;`
			`*seed += 1013904223;`
			`return *seed;`
			`}`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`struct Sk4fBytesRoundtripBench : public Benchmark {`
			`Sk4fBytesRoundtripBench() {}`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`const char* onGetName() override { return "Sk4f_roundtrip"; }`
C++11 override should now be supported by all of {bots,Chrome,Android,Mozilla} NOPRESUBMIT=true BUG=skia: DOCS_PREVIEW= https://skia.org/?cl=1037793002 Review URL: https://codereview.chromium.org/1037793002 2015-03-26 01:17:31 +00:00			`bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00
Remove const from `const int loops`. This drives me nuts, and prevents `while (loops --> 0)`. BUG=skia: Review URL: https://codereview.chromium.org/1379923005 2015-10-01 16:43:39 +00:00			`void onDraw(int loops, SkCanvas* canvas) override {`
Trim the fat off SkPMFloat bench. This bench was ~75% overhead, ~25% good bench. It is now just about the opposite: about 30% of the runtime is loop and random number overhead, and about 70% of the time is spent doing SkPMColor <-> SkPMFloat work. BUG=skia: NOPRESUBMIT=true Review URL: https://codereview.chromium.org/968133005 2015-03-03 16:03:27 +00:00			`// Unlike blackhole, junk can and probably will be a register.`
			`uint32_t junk = 0;`
			`uint32_t seed = 0;`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00			`for (int i = 0; i < loops; i++) {`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`uint32_t color = lcg_rand(&seed),`
			`back;`
Unify some SkNx code - one base case and one N=1 case instead of two each (or three with doubles) - use SkNx_cast instead of FromBytes/toBytes - 4-at-a-time Sk4f::ToBytes becomes a special standalone Sk4f_ToBytes If I did everything right, this'll be perf- and pixel- neutral. https://gold.skia.org/search2?issue=1526523003&unt=true&query=source_type%3Dgm&master=false BUG=skia: CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review URL: https://codereview.chromium.org/1526523003 2015-12-14 19:25:18 +00:00			`auto f = SkNx_cast<float>(Sk4b::Load((const uint8_t*)&color));`
			`SkNx_cast<uint8_t>(f).store((uint8_t*)&back);`
Convert SkPMFloat to [0,1] range and prune its API. Now that Sk4px exists, there's a lot less sense in eeking out every cycle of speed from SkPMFloat: if we need to go _really_ fast, we should use Sk4px. SkPMFloat's going to be used for things that are already slow: large-range intermediates, divides, sqrts, etc. A [0,1] range is easier to work with, and can even be faster if we eliminate enough 255 and 1/255 steps. This is particularly true on ARM, where NEON can do the *255 and /255 steps for us while converting float<->int. We have lots of experimental SkPMFloat <-> SkPMColor APIs that I'm now removing. Of the existing APIs, roundClamp() is the sanest, so I've kept only that, now called round(). The 4-at-a-time APIs never panned out, so they're gone. There will be small diffs on: colormatrix coloremoji colorfilterimagefilter fadefilter imagefilters_xfermodes imagefilterscropexpand imagefiltersgraph tileimagefilter BUG=skia: Review URL: https://codereview.chromium.org/1201343004 2015-06-25 15:56:28 +00:00			`junk ^= back;`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00			`}`
Trim the fat off SkPMFloat bench. This bench was ~75% overhead, ~25% good bench. It is now just about the opposite: about 30% of the runtime is loop and random number overhead, and about 70% of the time is spent doing SkPMColor <-> SkPMFloat work. BUG=skia: NOPRESUBMIT=true Review URL: https://codereview.chromium.org/968133005 2015-03-03 16:03:27 +00:00			`blackhole ^= junk;`
Sketch SkPMFloat BUG=skia: Committed: https://skia.googlesource.com/skia/+/50d2b3114b3e59dc84811881591bf25b2c1ecb9f CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon-Trybot http://build.chromium.org/p/client.skia.compile/builders/Build-Ubuntu13.10-GCC4.8-Arm7-Release-Android_Neon/builds/2120/steps/build%20most/logs/stdio Review URL: https://codereview.chromium.org/936633002 2015-02-23 18:04:34 +00:00			`}`
			`};`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`DEF_BENCH(return new Sk4fBytesRoundtripBench;)`
hack on linear gradient Am I going nuts or can we get this down to just adds and converts in the loop? #floats #n9 BUG=skia:3592 CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot Review URL: https://codereview.chromium.org/1008973004 2015-03-26 01:13:02 +00:00
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`struct Sk4fGradientBench : public Benchmark {`
			`const char* onGetName() override { return "Sk4f_gradient"; }`
hack on linear gradient Am I going nuts or can we get this down to just adds and converts in the loop? #floats #n9 BUG=skia:3592 CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot Review URL: https://codereview.chromium.org/1008973004 2015-03-26 01:13:02 +00:00			`bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }`

			`SkPMColor fDevice[100];`
Remove const from `const int loops`. This drives me nuts, and prevents `while (loops --> 0)`. BUG=skia: Review URL: https://codereview.chromium.org/1379923005 2015-10-01 16:43:39 +00:00			`void onDraw(int loops, SkCanvas*) override {`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`Sk4f c0(0,0,255,255),`
			`c1(255,0,0,255),`
hack on linear gradient Am I going nuts or can we get this down to just adds and converts in the loop? #floats #n9 BUG=skia:3592 CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot Review URL: https://codereview.chromium.org/1008973004 2015-03-26 01:13:02 +00:00			`dc = c1 - c0,`
			`fx(0.1f),`
			`dx(0.002f),`
			`dcdx(dc*dx),`
			`dcdx4(dcdx+dcdx+dcdx+dcdx);`

			`for (int n = 0; n < loops; n++) {`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`Sk4f a = c0 + dc*fx + Sk4f(0.5f), // add an extra 0.5f to get rounding for free.`
hack on linear gradient Am I going nuts or can we get this down to just adds and converts in the loop? #floats #n9 BUG=skia:3592 CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot Review URL: https://codereview.chromium.org/1008973004 2015-03-26 01:13:02 +00:00			`b = a + dcdx,`
			`c = b + dcdx,`
			`d = c + dcdx;`
			`for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {`
Unify some SkNx code - one base case and one N=1 case instead of two each (or three with doubles) - use SkNx_cast instead of FromBytes/toBytes - 4-at-a-time Sk4f::ToBytes becomes a special standalone Sk4f_ToBytes If I did everything right, this'll be perf- and pixel- neutral. https://gold.skia.org/search2?issue=1526523003&unt=true&query=source_type%3Dgm&master=false BUG=skia: CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review URL: https://codereview.chromium.org/1526523003 2015-12-14 19:25:18 +00:00			`Sk4f_ToBytes((uint8_t*)(fDevice+i), a, b, c, d);`
Remove overly-promiscuous SkNx syntax sugar. I haven't figured out a pithy way to have these apply to only classes originating from SkNx, so let's just remove them. There aren't too many use cases, and it's not really any less readable without them. Semantically, this is a no-op. BUG=skia: Review URL: https://codereview.chromium.org/1167153002 2015-06-10 15:57:28 +00:00			`a = a + dcdx4;`
			`b = b + dcdx4;`
			`c = c + dcdx4;`
			`d = d + dcdx4;`
hack on linear gradient Am I going nuts or can we get this down to just adds and converts in the loop? #floats #n9 BUG=skia:3592 CQ_INCLUDE_TRYBOTS=client.skia.android:Test-Android-Nexus9-TegraK1-Arm64-Release-Trybot Review URL: https://codereview.chromium.org/1008973004 2015-03-26 01:13:02 +00:00			`}`
			`}`
			`}`
			`};`
Clean up remaining users of SkPMFloat This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f, and converts the SkPMFloat benches to Sk4f benches. No pixels should change here, and no code beyond the Sk4f_ benches should change speed. The benches are faster than the old versions. BUG=skia:4117 Review URL: https://codereview.chromium.org/1324743002 2015-08-31 22:26:08 +00:00			`DEF_BENCH(return new Sk4fGradientBench;)`