03ce675b5f
Things were running suspiciously well... _I32 had a typo that cut out 3/4 of its multiplies... _I32_SWAR was missing a mask operation needed to drop the junk low byte of the high half after the multiply. The bench times now make a bit more sense and are in line with how much work we're actually doing: F32's the slowest, I32 a little faster, and I32_SWAR fastest: curr/maxrss loops min median mean max stddev samples config bench 35/36 MB 58 2.03ns 2.04ns 2.04ns 2.04ns 0% ▂▂▂▂▁▁█▁▂▁ nonrendering SkVM_4096_I32_SWAR 35/36 MB 42 3.44ns 3.48ns 3.49ns 3.59ns 1% ▂▆▅█▃▃▁▂▂▄ nonrendering SkVM_4096_I32 35/36 MB 30 4.9ns 5.21ns 5.11ns 5.33ns 3% ▆▇█▆▆▁▂▁▁▅ nonrendering SkVM_4096_F32 35/36 MB 203 0.696ns 0.697ns 0.705ns 0.758ns 3% █▂▂▁▁▁▁▁▁▂ nonrendering SkVM_4096_RP 35/36 MB 942 0.188ns 0.188ns 0.188ns 0.189ns 0% ▂▁▂▁▃█▂▁▁▁ nonrendering SkVM_4096_Opts Change-Id: I2850dc3f9df1828f03499eb278b8231f48eaae63 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/217982 Commit-Queue: Mike Klein <mtklein@google.com> Commit-Queue: Brian Osman <brianosman@google.com> Auto-Submit: Mike Klein <mtklein@google.com> Reviewed-by: Brian Osman <brianosman@google.com>
231 lines
8.0 KiB
C++
231 lines
8.0 KiB
C++
/*
|
|
* Copyright 2019 Google Inc.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license that can be
|
|
* found in the LICENSE file.
|
|
*/
|
|
|
|
#include "bench/Benchmark.h"
|
|
#include "src/core/SkOpts.h"
|
|
#include "src/core/SkVM.h"
|
|
|
|
namespace {
|
|
|
|
enum Mode {Opts, RP, F32, I32, I32_SWAR};
|
|
static const char* kMode_name[] = { "Opts", "RP","F32", "I32", "I32_SWAR" };
|
|
|
|
struct SrcoverBuilder_F32 : public skvm::Builder {
|
|
SrcoverBuilder_F32() {
|
|
|
|
skvm::Arg src = arg(0),
|
|
dst = arg(1);
|
|
|
|
auto byte_to_f32 = [&](skvm::I32 byte) {
|
|
return mul(splat(1/255.0f), to_f32(byte));
|
|
};
|
|
auto f32_to_byte = [&](skvm::F32 f32) {
|
|
return to_i32(mad(f32, splat(255.0f), splat(0.5f)));
|
|
};
|
|
|
|
auto load = [&](skvm::Arg ptr,
|
|
skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32* a) {
|
|
skvm::I32 rgba = load32(ptr);
|
|
*r = byte_to_f32(bit_and( rgba , splat(0xff)));
|
|
*g = byte_to_f32(bit_and(shr(rgba, 8), splat(0xff)));
|
|
*b = byte_to_f32(bit_and(shr(rgba, 16), splat(0xff)));
|
|
*a = byte_to_f32( shr(rgba, 24) );
|
|
};
|
|
|
|
skvm::F32 r,g,b,a;
|
|
load(src, &r,&g,&b,&a);
|
|
|
|
skvm::F32 dr,dg,db,da;
|
|
load(dst, &dr,&dg,&db,&da);
|
|
|
|
skvm::F32 invA = sub(splat(1.0f), a);
|
|
r = mad(dr, invA, r);
|
|
g = mad(dg, invA, g);
|
|
b = mad(db, invA, b);
|
|
a = mad(da, invA, a);
|
|
|
|
store32(dst, bit_or( f32_to_byte(r) ,
|
|
bit_or(shl(f32_to_byte(g), 8),
|
|
bit_or(shl(f32_to_byte(b), 16),
|
|
shl(f32_to_byte(a), 24)))));
|
|
}
|
|
};
|
|
|
|
struct SrcoverBuilder_I32 : public skvm::Builder {
|
|
SrcoverBuilder_I32() {
|
|
skvm::Arg src = arg(0),
|
|
dst = arg(1);
|
|
|
|
auto load = [&](skvm::Arg ptr,
|
|
skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
|
|
skvm::I32 rgba = load32(ptr);
|
|
*r = bit_and( rgba , splat(0xff));
|
|
*g = bit_and(shr(rgba, 8), splat(0xff));
|
|
*b = bit_and(shr(rgba, 16), splat(0xff));
|
|
*a = shr(rgba, 24) ;
|
|
};
|
|
|
|
auto mul_unorm8 = [&](skvm::I32 x, skvm::I32 y) {
|
|
// (x*y + 127)/255 ~= (x*y+255)/256
|
|
return shr(add(mul(x, y), splat(0xff)), 8);
|
|
};
|
|
|
|
skvm::I32 r,g,b,a;
|
|
load(src, &r,&g,&b,&a);
|
|
|
|
skvm::I32 dr,dg,db,da;
|
|
load(dst, &dr,&dg,&db,&da);
|
|
|
|
skvm::I32 invA = sub(splat(0xff), a);
|
|
r = add(r, mul_unorm8(dr, invA));
|
|
g = add(g, mul_unorm8(dg, invA));
|
|
b = add(b, mul_unorm8(db, invA));
|
|
a = add(a, mul_unorm8(da, invA));
|
|
|
|
store32(dst, bit_or( r ,
|
|
bit_or(shl(g, 8),
|
|
bit_or(shl(b, 16),
|
|
shl(a, 24)))));
|
|
}
|
|
};
|
|
|
|
struct SrcoverBuilder_I32_SWAR : public skvm::Builder {
|
|
SrcoverBuilder_I32_SWAR() {
|
|
skvm::Arg src = arg(0),
|
|
dst = arg(1);
|
|
|
|
auto load = [&](skvm::Arg ptr,
|
|
skvm::I32* rb, skvm::I32* ga) {
|
|
skvm::I32 rgba = load32(ptr);
|
|
*rb = bit_and( rgba, splat(0x00ff00ff));
|
|
*ga = bit_and(shr(rgba, 8), splat(0x00ff00ff));
|
|
};
|
|
|
|
auto mul_unorm8 = [&](skvm::I32 x, skvm::I32 y) {
|
|
// As above, assuming x is two SWAR bytes in lanes 0 and 2, and y is a byte.
|
|
return bit_and(shr(add(mul(x, y),
|
|
splat(0x00ff00ff)),
|
|
8),
|
|
splat(0x00ff00ff));
|
|
};
|
|
|
|
skvm::I32 rb, ga;
|
|
load(src, &rb, &ga);
|
|
|
|
skvm::I32 drb, dga;
|
|
load(dst, &drb, &dga);
|
|
|
|
skvm::I32 invA = sub(splat(0xff), shr(ga, 16));
|
|
rb = add(rb, mul_unorm8(drb, invA));
|
|
ga = add(ga, mul_unorm8(dga, invA));
|
|
|
|
store32(dst, bit_or(rb, shl(ga, 8)));
|
|
}
|
|
};
|
|
}
|
|
|
|
class SkVMBench : public Benchmark {
|
|
public:
|
|
SkVMBench(int pixels, Mode mode)
|
|
: fPixels(pixels)
|
|
, fMode(mode)
|
|
, fName(SkStringPrintf("SkVM_%d_%s", pixels, kMode_name[mode]))
|
|
{}
|
|
|
|
private:
|
|
const char* onGetName() override { return fName.c_str(); }
|
|
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
|
|
|
void onDelayedSetup() override {
|
|
this->setUnits(fPixels);
|
|
fSrc.resize(fPixels, 0x7f123456); // Arbitrary non-opaque non-transparent value.
|
|
fDst.resize(fPixels, 0xff987654); // Arbitrary value.
|
|
|
|
if (fMode == F32 ) { fProgram = SrcoverBuilder_F32 {}.done(); }
|
|
if (fMode == I32 ) { fProgram = SrcoverBuilder_I32 {}.done(); }
|
|
if (fMode == I32_SWAR) { fProgram = SrcoverBuilder_I32_SWAR{}.done(); }
|
|
|
|
if (fMode == RP) {
|
|
fSrcCtx = { fSrc.data(), 0 };
|
|
fDstCtx = { fDst.data(), 0 };
|
|
fPipeline.append(SkRasterPipeline::load_8888 , &fSrcCtx);
|
|
fPipeline.append(SkRasterPipeline::load_8888_dst, &fDstCtx);
|
|
fPipeline.append(SkRasterPipeline::srcover);
|
|
fPipeline.append(SkRasterPipeline::store_8888, &fDstCtx);
|
|
}
|
|
|
|
// Trigger one run now so we can do a quick correctness check.
|
|
this->draw(1,nullptr);
|
|
for (int i = 0; i < fPixels; i++) {
|
|
SkASSERT(fDst[i] == 0xff5e6f80);
|
|
}
|
|
}
|
|
|
|
void onDraw(int loops, SkCanvas*) override {
|
|
while (loops --> 0) {
|
|
if (fMode == Opts) {
|
|
SkOpts::blit_row_s32a_opaque(fDst.data(), fSrc.data(), fPixels, 0xff);
|
|
} else if (fMode == RP) {
|
|
fPipeline.run(0,0,fPixels,1);
|
|
} else {
|
|
fProgram.eval(fPixels, fSrc.data(), fDst.data());
|
|
}
|
|
}
|
|
}
|
|
|
|
int fPixels;
|
|
Mode fMode;
|
|
SkString fName;
|
|
std::vector<uint32_t> fSrc,
|
|
fDst;
|
|
skvm::Program fProgram;
|
|
|
|
SkRasterPipeline_MemoryCtx fSrcCtx,
|
|
fDstCtx;
|
|
SkRasterPipeline_<256> fPipeline;
|
|
};
|
|
|
|
DEF_BENCH(return (new SkVMBench{ 1, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{ 4, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{ 16, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{ 64, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{ 256, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{1024, Opts});)
|
|
DEF_BENCH(return (new SkVMBench{4096, Opts});)
|
|
|
|
DEF_BENCH(return (new SkVMBench{ 1, RP});)
|
|
DEF_BENCH(return (new SkVMBench{ 4, RP});)
|
|
DEF_BENCH(return (new SkVMBench{ 16, RP});)
|
|
DEF_BENCH(return (new SkVMBench{ 64, RP});)
|
|
DEF_BENCH(return (new SkVMBench{ 256, RP});)
|
|
DEF_BENCH(return (new SkVMBench{1024, RP});)
|
|
DEF_BENCH(return (new SkVMBench{4096, RP});)
|
|
|
|
DEF_BENCH(return (new SkVMBench{ 1, F32});)
|
|
DEF_BENCH(return (new SkVMBench{ 4, F32});)
|
|
DEF_BENCH(return (new SkVMBench{ 16, F32});)
|
|
DEF_BENCH(return (new SkVMBench{ 64, F32});)
|
|
DEF_BENCH(return (new SkVMBench{ 256, F32});)
|
|
DEF_BENCH(return (new SkVMBench{1024, F32});)
|
|
DEF_BENCH(return (new SkVMBench{4096, F32});)
|
|
|
|
DEF_BENCH(return (new SkVMBench{ 1, I32});)
|
|
DEF_BENCH(return (new SkVMBench{ 4, I32});)
|
|
DEF_BENCH(return (new SkVMBench{ 16, I32});)
|
|
DEF_BENCH(return (new SkVMBench{ 64, I32});)
|
|
DEF_BENCH(return (new SkVMBench{ 256, I32});)
|
|
DEF_BENCH(return (new SkVMBench{1024, I32});)
|
|
DEF_BENCH(return (new SkVMBench{4096, I32});)
|
|
|
|
DEF_BENCH(return (new SkVMBench{ 1, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{ 4, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{ 16, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{ 64, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{ 256, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{1024, I32_SWAR});)
|
|
DEF_BENCH(return (new SkVMBench{4096, I32_SWAR});)
|