skia2/bench/SkVMBench.cpp
Mike Klein 03ce675b5f fix SKVM_ benches
Things were running suspiciously well...

_I32 had a typo that cut out 3/4 of its multiplies...

_I32_SWAR was missing a mask operation needed to drop
the junk low byte of the high half after the multiply.

The bench times now make a bit more sense and are in line
with how much work we're actually doing: F32's the slowest,
I32 a little faster, and I32_SWAR fastest:

    curr/maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
      35/36  MB	58	2.03ns	2.04ns	2.04ns	2.04ns	0%	▂▂▂▂▁▁█▁▂▁	nonrendering	SkVM_4096_I32_SWAR
      35/36  MB	42	3.44ns	3.48ns	3.49ns	3.59ns	1%	▂▆▅█▃▃▁▂▂▄	nonrendering	SkVM_4096_I32
      35/36  MB	30	4.9ns	5.21ns	5.11ns	5.33ns	3%	▆▇█▆▆▁▂▁▁▅	nonrendering	SkVM_4096_F32
      35/36  MB	203	0.696ns	0.697ns	0.705ns	0.758ns	3%	█▂▂▁▁▁▁▁▁▂	nonrendering	SkVM_4096_RP
      35/36  MB	942	0.188ns	0.188ns	0.188ns	0.189ns	0%	▂▁▂▁▃█▂▁▁▁	nonrendering	SkVM_4096_Opts

Change-Id: I2850dc3f9df1828f03499eb278b8231f48eaae63
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217982
Commit-Queue: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
2019-06-03 20:35:24 +00:00

231 lines
8.0 KiB
C++

/*
* Copyright 2019 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "bench/Benchmark.h"
#include "src/core/SkOpts.h"
#include "src/core/SkVM.h"
namespace {
// Blend implementations this benchmark can time.
// The enumerators double as indices into kMode_name, so both lists must stay in the same order.
enum Mode {Opts, RP, F32, I32, I32_SWAR};

// Printable name of each Mode, indexed by the enumerator's value.
static const char* kMode_name[] = { "Opts", "RP","F32", "I32", "I32_SWAR" };

// kMode_name is indexed with a Mode without any bounds check, so refuse to compile
// if a Mode is added or removed without updating the name table to match.
static_assert(static_cast<int>(sizeof(kMode_name) / sizeof(kMode_name[0])) == I32_SWAR + 1,
              "kMode_name needs exactly one entry per Mode");
// Builds an skvm program that blends src over dst on packed 32-bit pixels using float math.
// Argument 0 is the source pointer, argument 1 the destination pointer (read and written).
struct SrcoverBuilder_F32 : public skvm::Builder {
    SrcoverBuilder_F32() {
        skvm::Arg src = arg(0),
                  dst = arg(1);

        // The four channels of one pixel, each as a float.
        struct Channels { skvm::F32 r, g, b, a; };

        // Convert an integer byte value to float and scale it by 1/255.
        auto unorm_from_byte = [&](skvm::I32 byte) {
            skvm::F32 asFloat = to_f32(byte);
            return mul(splat(1/255.0f), asFloat);
        };

        // Inverse of the above: x*255 + 0.5, then convert to an integer.
        auto byte_from_unorm = [&](skvm::F32 unorm) {
            skvm::F32 scaled = mad(unorm, splat(255.0f), splat(0.5f));
            return to_i32(scaled);
        };

        // Load one packed pixel and split it into float channels.
        // r is the low byte and a the high byte; a needs no mask after the shift by 24.
        auto unpack = [&](skvm::Arg ptr) -> Channels {
            skvm::I32 px = load32(ptr);
            return {
                unorm_from_byte(bit_and(    px     , splat(0xff))),
                unorm_from_byte(bit_and(shr(px,  8), splat(0xff))),
                unorm_from_byte(bit_and(shr(px, 16), splat(0xff))),
                unorm_from_byte(        shr(px, 24)              )
            };
        };

        Channels s = unpack(src);
        Channels d = unpack(dst);

        // Src-over: out = s + d * (1 - s.a), applied to every channel.
        skvm::F32 invSrcAlpha = sub(splat(1.0f), s.a);
        s.r = mad(d.r, invSrcAlpha, s.r);
        s.g = mad(d.g, invSrcAlpha, s.g);
        s.b = mad(d.b, invSrcAlpha, s.b);
        s.a = mad(d.a, invSrcAlpha, s.a);

        // Convert back to bytes, move each into its position, and write the pixel to dst.
        skvm::I32 packedR =     byte_from_unorm(s.r);
        skvm::I32 packedG = shl(byte_from_unorm(s.g),  8);
        skvm::I32 packedB = shl(byte_from_unorm(s.b), 16);
        skvm::I32 packedA = shl(byte_from_unorm(s.a), 24);
        store32(dst, bit_or(packedR,
                     bit_or(packedG,
                     bit_or(packedB,
                            packedA))));
    }
};
// Builds an skvm program that blends src over dst on packed 32-bit pixels using integer math,
// one channel per 32-bit value.
// Argument 0 is the source pointer, argument 1 the destination pointer (read and written).
struct SrcoverBuilder_I32 : public skvm::Builder {
    SrcoverBuilder_I32() {
        skvm::Arg src = arg(0),
                  dst = arg(1);

        // The four channels of one pixel, each as an integer byte value.
        struct Channels { skvm::I32 r, g, b, a; };

        // Load one packed pixel and split it into channels.
        // r is the low byte and a the high byte; a needs no mask after the shift by 24.
        auto unpack = [&](skvm::Arg ptr) -> Channels {
            skvm::I32 px = load32(ptr);
            return {
                bit_and(    px     , splat(0xff)),
                bit_and(shr(px,  8), splat(0xff)),
                bit_and(shr(px, 16), splat(0xff)),
                        shr(px, 24)
            };
        };

        // Product of two byte values scaled back to a byte,
        // using the approximation (x*y + 127)/255 ~= (x*y + 255)/256.
        auto scale_bytes = [&](skvm::I32 x, skvm::I32 y) {
            skvm::I32 product = mul(x, y);
            skvm::I32 biased  = add(product, splat(0xff));
            return shr(biased, 8);
        };

        Channels s = unpack(src);
        Channels d = unpack(dst);

        // Src-over: out = s + d * (255 - s.a) / 255, applied to every channel.
        skvm::I32 invSrcAlpha = sub(splat(0xff), s.a);
        s.r = add(s.r, scale_bytes(d.r, invSrcAlpha));
        s.g = add(s.g, scale_bytes(d.g, invSrcAlpha));
        s.b = add(s.b, scale_bytes(d.b, invSrcAlpha));
        s.a = add(s.a, scale_bytes(d.a, invSrcAlpha));

        // Move each channel back into its position and write the pixel to dst.
        skvm::I32 packedG = shl(s.g,  8);
        skvm::I32 packedB = shl(s.b, 16);
        skvm::I32 packedA = shl(s.a, 24);
        store32(dst, bit_or(s.r,
                     bit_or(packedG,
                     bit_or(packedB,
                            packedA))));
    }
};
// Builds an skvm program that blends src over dst on packed 32-bit pixels using integer math,
// processing two channels at a time inside a single 32-bit value (SWAR).
// Argument 0 is the source pointer, argument 1 the destination pointer (read and written).
struct SrcoverBuilder_I32_SWAR : public skvm::Builder {
    SrcoverBuilder_I32_SWAR() {
        skvm::Arg src = arg(0),
                  dst = arg(1);

        // One pixel as two channel pairs, each pair held in bytes 0 and 2 of its value:
        // rb carries r and b, ga carries g and a.
        struct Pairs { skvm::I32 rb, ga; };

        // Load one packed pixel and separate the even bytes (r,b) from the odd bytes (g,a).
        auto unpack = [&](skvm::Arg ptr) -> Pairs {
            skvm::I32 px = load32(ptr);
            return {
                bit_and(    px    , splat(0x00ff00ff)),
                bit_and(shr(px, 8), splat(0x00ff00ff))
            };
        };

        // Same (x*y + 255)/256 approximation of (x*y + 127)/255 as the non-SWAR integer
        // version, where x holds two bytes (in byte lanes 0 and 2) and y is a single byte.
        // The final mask discards whatever the shift leaves in byte lanes 1 and 3.
        auto scale_pair = [&](skvm::I32 x, skvm::I32 y) {
            skvm::I32 product = mul(x, y);
            skvm::I32 biased  = add(product, splat(0x00ff00ff));
            skvm::I32 shifted = shr(biased, 8);
            return bit_and(shifted, splat(0x00ff00ff));
        };

        Pairs s = unpack(src);
        Pairs d = unpack(dst);

        // Source alpha sits in byte 2 of the ga pair.
        skvm::I32 srcAlpha    = shr(s.ga, 16);
        skvm::I32 invSrcAlpha = sub(splat(0xff), srcAlpha);

        // Src-over on both pairs: out = s + d * (255 - alpha) / 255.
        s.rb = add(s.rb, scale_pair(d.rb, invSrcAlpha));
        s.ga = add(s.ga, scale_pair(d.ga, invSrcAlpha));

        // Interleave the pairs back into one pixel and write it to dst.
        skvm::I32 gaInPlace = shl(s.ga, 8);
        store32(dst, bit_or(s.rb, gaInPlace));
    }
};
}
class SkVMBench : public Benchmark {
public:
SkVMBench(int pixels, Mode mode)
: fPixels(pixels)
, fMode(mode)
, fName(SkStringPrintf("SkVM_%d_%s", pixels, kMode_name[mode]))
{}
private:
const char* onGetName() override { return fName.c_str(); }
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
void onDelayedSetup() override {
this->setUnits(fPixels);
fSrc.resize(fPixels, 0x7f123456); // Arbitrary non-opaque non-transparent value.
fDst.resize(fPixels, 0xff987654); // Arbitrary value.
if (fMode == F32 ) { fProgram = SrcoverBuilder_F32 {}.done(); }
if (fMode == I32 ) { fProgram = SrcoverBuilder_I32 {}.done(); }
if (fMode == I32_SWAR) { fProgram = SrcoverBuilder_I32_SWAR{}.done(); }
if (fMode == RP) {
fSrcCtx = { fSrc.data(), 0 };
fDstCtx = { fDst.data(), 0 };
fPipeline.append(SkRasterPipeline::load_8888 , &fSrcCtx);
fPipeline.append(SkRasterPipeline::load_8888_dst, &fDstCtx);
fPipeline.append(SkRasterPipeline::srcover);
fPipeline.append(SkRasterPipeline::store_8888, &fDstCtx);
}
// Trigger one run now so we can do a quick correctness check.
this->draw(1,nullptr);
for (int i = 0; i < fPixels; i++) {
SkASSERT(fDst[i] == 0xff5e6f80);
}
}
void onDraw(int loops, SkCanvas*) override {
while (loops --> 0) {
if (fMode == Opts) {
SkOpts::blit_row_s32a_opaque(fDst.data(), fSrc.data(), fPixels, 0xff);
} else if (fMode == RP) {
fPipeline.run(0,0,fPixels,1);
} else {
fProgram.eval(fPixels, fSrc.data(), fDst.data());
}
}
}
int fPixels;
Mode fMode;
SkString fName;
std::vector<uint32_t> fSrc,
fDst;
skvm::Program fProgram;
SkRasterPipeline_MemoryCtx fSrcCtx,
fDstCtx;
SkRasterPipeline_<256> fPipeline;
};
// Each DEF_BENCH line below supplies a statement that constructs an SkVMBench for one
// (pixel count, mode) pair. Every mode is covered at the same seven pixel counts,
// the powers of four from 1 to 4096.
// NOTE(review): DEF_BENCH presumably comes from bench/Benchmark.h and registers the bench
// with the harness; its expansion is not visible in this file.

// Opts: SkOpts::blit_row_s32a_opaque.
DEF_BENCH(return (new SkVMBench{ 1, Opts});)
DEF_BENCH(return (new SkVMBench{ 4, Opts});)
DEF_BENCH(return (new SkVMBench{ 16, Opts});)
DEF_BENCH(return (new SkVMBench{ 64, Opts});)
DEF_BENCH(return (new SkVMBench{ 256, Opts});)
DEF_BENCH(return (new SkVMBench{1024, Opts});)
DEF_BENCH(return (new SkVMBench{4096, Opts});)
// RP: SkRasterPipeline load / srcover / store stages.
DEF_BENCH(return (new SkVMBench{ 1, RP});)
DEF_BENCH(return (new SkVMBench{ 4, RP});)
DEF_BENCH(return (new SkVMBench{ 16, RP});)
DEF_BENCH(return (new SkVMBench{ 64, RP});)
DEF_BENCH(return (new SkVMBench{ 256, RP});)
DEF_BENCH(return (new SkVMBench{1024, RP});)
DEF_BENCH(return (new SkVMBench{4096, RP});)
// F32: skvm program from SrcoverBuilder_F32.
DEF_BENCH(return (new SkVMBench{ 1, F32});)
DEF_BENCH(return (new SkVMBench{ 4, F32});)
DEF_BENCH(return (new SkVMBench{ 16, F32});)
DEF_BENCH(return (new SkVMBench{ 64, F32});)
DEF_BENCH(return (new SkVMBench{ 256, F32});)
DEF_BENCH(return (new SkVMBench{1024, F32});)
DEF_BENCH(return (new SkVMBench{4096, F32});)
// I32: skvm program from SrcoverBuilder_I32.
DEF_BENCH(return (new SkVMBench{ 1, I32});)
DEF_BENCH(return (new SkVMBench{ 4, I32});)
DEF_BENCH(return (new SkVMBench{ 16, I32});)
DEF_BENCH(return (new SkVMBench{ 64, I32});)
DEF_BENCH(return (new SkVMBench{ 256, I32});)
DEF_BENCH(return (new SkVMBench{1024, I32});)
DEF_BENCH(return (new SkVMBench{4096, I32});)
// I32_SWAR: skvm program from SrcoverBuilder_I32_SWAR.
DEF_BENCH(return (new SkVMBench{ 1, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{ 4, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{ 16, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{ 64, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{ 256, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{1024, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{4096, I32_SWAR});)