Add a real SkXbyak bench, implement enough to run it.

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

SkXbyak_…
9320  …JITCompiled 1x  …Interpreted 1.24x  …HandWritten 2.5x

Change-Id: I37d2d255ff32dcce73d29081d506e2d67477af97
Reviewed-on: https://skia-review.googlesource.com/6697
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2017-01-06 14:54:09 -05:00 committed by Skia Commit-Bot
parent fa71067032
commit 83f532e9b5
4 changed files with 146 additions and 3 deletions

78
bench/SkXbyakBench.cpp Normal file
View File

@ -0,0 +1,78 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "Benchmark.h"
#include "SkHalf.h"
#include "SkNx.h"
#include "SkRasterPipeline.h"
// Reference implementation of the benchmarked work: unpremultiplies n f16 RGBA
// pixels in place, one pixel (one uint64_t) at a time.
static void hand_written(uint64_t* buf, int n) {
    for (int i = 0; i < n; i++) {
        uint64_t* px = buf + i;

        Sk4f rgba = SkHalfToFloat_finite_ftz(*px);

        // Colour channels are divided by alpha; a zero alpha leaves them unscaled.
        const float alpha = rgba[3];
        const float scale = (alpha != 0) ? 1.0f/alpha : 1;

        // Alpha (lane 3) is multiplied by 1, i.e. passed through unchanged.
        rgba *= Sk4f{scale,scale,scale,1};

        SkFloatToHalf_finite_ftz(rgba).store(px);
    }
}
// Benchmarks one "load f16, unpremul, store f16" pass over N pixels, done three ways:
//   kHandWritten - the hand_written() loop,
//   kInterpreted - SkRasterPipeline::run(),
//   kJITCompiled - the function returned by SkRasterPipeline::compile().
class SkXbyakBench : public Benchmark {
public:
    enum Mode { kHandWritten, kInterpreted, kJITCompiled };

    // Builds the pipeline for the two pipeline-backed modes, and compiles it up front for
    // kJITCompiled so that compilation is not part of what onDraw() times.
    explicit SkXbyakBench(Mode mode) : fMode(mode) {
        if (mode == kInterpreted || mode == kJITCompiled) {
            // The load/store stages are handed the address of fPtr, a pointer to the pixels,
            // rather than the pixels themselves.
            // NOTE(review): fP therefore refers back into this object; this relies on the
            // bench never being copied or moved - confirm Benchmark forbids that.
            fPtr = fBuf;
            fP.append(SkRasterPipeline::load_f16, &fPtr);
            fP.append(SkRasterPipeline::unpremul);
            fP.append(SkRasterPipeline::store_f16, &fPtr);
        }
        if (mode == kJITCompiled) {
            fFn = fP.compile();
        }
    }

    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }

    const char* onGetName() override {
        switch (fMode) {
            case kHandWritten: return "SkXbyak_HandWritten";
            case kInterpreted: return "SkXbyak_Interpreted";
            case kJITCompiled: return "SkXbyak_JITCompiled";
        }
        return "";
    }

    // Runs the selected implementation over the whole buffer `loops` times.
    void onDraw(int loops, SkCanvas*) override {
        switch (fMode) {
            case kHandWritten:
                while (loops --> 0) { hand_written(fBuf, N); }
                break;
            case kInterpreted:
                while (loops --> 0) { fP.run(0,0,N); }
                break;
            case kJITCompiled:
                while (loops --> 0) { fFn(0,0,N); }
                break;
        }
    }

private:
    static const int N = 1024;  // TODO: 1023, making the tail jagged

    SkRasterPipeline fP;
    Mode             fMode;

    // Zero-initialized so that every mode reads well-defined, identical pixel data instead of
    // whatever happened to be in the allocation.
    uint64_t fBuf[N] = {};

    // Only pointed at fBuf in the pipeline modes; null (rather than indeterminate) otherwise.
    void* fPtr = nullptr;

    // Only set in kJITCompiled mode.
    std::function<void(size_t, size_t, size_t)> fFn;
};
// One bench instance per Mode, so the three implementations show up as separately named
// benchmarks (SkXbyak_HandWritten / SkXbyak_Interpreted / SkXbyak_JITCompiled).
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kHandWritten); )
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kInterpreted); )
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kJITCompiled); )

View File

@ -111,6 +111,7 @@ bench_sources = [
"$_bench/SKPAnimationBench.cpp",
"$_bench/SKPBench.cpp",
"$_bench/SkRasterPipelineBench.cpp",
"$_bench/SkXbyakBench.cpp",
"$_bench/StreamBench.cpp",
"$_bench/SortBench.cpp",
"$_bench/StrokeBench.cpp",

View File

@ -29,7 +29,6 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
#ifdef SK_XBYAK
if (auto fn = this->jit()) {
SkDebugf("Jitted with xbyak!\n");
return fn;
}
#endif

View File

@ -35,12 +35,14 @@ namespace {
Pipeline(const SkRasterPipeline::Stage* stages, int n, bool* supported) {
// Set up some register name aliases.
//auto x = rdi, y = rsi, tail = rdx;
// y = rsi, tail = rdx;
auto x = rdi;
auto r = ymm0, g = ymm1, b = ymm2, a = ymm3,
dr = ymm4, dg = ymm5, db = ymm6, da = ymm7;
Xbyak::Label floatOneStorage;
vbroadcastss(ymm8, ptr[rip + floatOneStorage]);
//trap();
// TODO: set up (x+0.5,y+0.5) in (r,g)
vxorps(r,r);
@ -54,6 +56,56 @@ namespace {
for (int i = 0; i < n; i++) {
switch(stages[i].stage) {
case SkRasterPipeline::load_f16:
mov(rax, (size_t)stages[i].ctx);
mov(rax, ptr[rax]);
vmovdqu(xmm0, ptr[rax+x*8+ 0]);
vmovdqu(xmm1, ptr[rax+x*8+16]);
vmovdqu(xmm2, ptr[rax+x*8+32]);
vmovdqu(xmm3, ptr[rax+x*8+48]);
vpunpcklwd(xmm8, xmm1, xmm0); vpunpckhwd(xmm0 , xmm1, xmm0);
vpunpcklwd(xmm1, xmm3, xmm2); vpunpckhwd(xmm2 , xmm3, xmm2);
vpunpcklwd(xmm9, xmm0, xmm8); vpunpckhwd(xmm8 , xmm0, xmm8);
vpunpcklwd(xmm3, xmm2, xmm1); vpunpckhwd(xmm10, xmm2, xmm1);
vpunpcklqdq(xmm0, xmm3, xmm9); vcvtph2ps(ymm0, xmm0);
vpunpckhqdq(xmm1, xmm3, xmm9); vcvtph2ps(ymm1, xmm1);
vpunpcklqdq(xmm2, xmm10, xmm8); vcvtph2ps(ymm2, xmm2);
vpunpckhqdq(xmm3, xmm10, xmm8); vcvtph2ps(ymm3, xmm3);
break;
case SkRasterPipeline::unpremul:
    // Scale r,g,b by 1/a, or by 0 in lanes where a == 0.  a itself is not modified.
    vxorps(ymm8, ymm8);                              // ymm8:  0
    vcmpeqps(ymm10, ymm8, a);                        // ymm10: a == 0 (all-ones lanes where true)
    vbroadcastss(ymm9, ptr[rip + floatOneStorage]);  // ymm9:  1.0f
    vdivps(ymm11, ymm9, a);                          // ymm11: 1/a
    // vblendvps(dst, src1, src2, mask) takes src2 in lanes whose mask sign bit is set and
    // src1 otherwise, so the compare result must be the mask and 1/a the fall-through
    // source.  (Using 1/a as the mask would select on the sign of 1/a instead.)
    vblendvps(ymm10, ymm11, ymm8, ymm10);            // ymm10: (a==0) ? 0 : 1/a
    vmulps(r, r, ymm10);
    vmulps(g, g, ymm10);
    vmulps(b, b, ymm10);
    break;
case SkRasterPipeline::store_f16:
mov(rax, (size_t)stages[i].ctx);
mov(rax, ptr[rax]);
vcvtps2ph(xmm8 , ymm0, 4);
vcvtps2ph(xmm9 , ymm1, 4);
vcvtps2ph(xmm10, ymm2, 4);
vcvtps2ph(xmm11, ymm3, 4);
vpunpcklwd(xmm12, xmm9 , xmm8 );
vpunpckhwd(xmm8 , xmm9 , xmm8 );
vpunpcklwd(xmm9 , xmm11, xmm10);
vpunpckhwd(xmm10, xmm11, xmm10);
vpunpckldq(xmm11, xmm9 , xmm12); vmovdqu(ptr[rax+x*8+ 0], xmm11);
vpunpckhdq(xmm9 , xmm9 , xmm12); vmovdqu(ptr[rax+x*8+16], xmm9 );
vpunpckldq(xmm9 , xmm10, xmm8 ); vmovdqu(ptr[rax+x*8+32], xmm9 );
vpunpckhdq(xmm8 , xmm10, xmm8 ); vmovdqu(ptr[rax+x*8+48], xmm8 );
break;
default:
*supported = false;
@ -61,6 +113,7 @@ namespace {
}
}
vzeroupper();
ret();
L(floatOneStorage); df(1.0f);
}
@ -69,6 +122,14 @@ namespace {
union { float f; uint32_t x; } pun = {f};
dd(pun.x);
}
// Passes the numeric value of pointer p to dq() as a 64-bit quantity.
void dp(void* p) {
    // Go through uintptr_t instead of reading the inactive member of a union: the union
    // read is undefined behaviour in C++, and this form is also well defined (zero-extended)
    // should a pointer ever be narrower than 64 bits.
    dq(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p)));
}
// Debugging aid: passes the 16-bit word 0x0b0f to dw().
// NOTE(review): presumably this lands in the code stream as bytes 0f 0b, the x86 ud2
// (undefined instruction) opcode - confirm dw() writes little-endian.
void trap() {
    const int kTrapWord = 0x0b0f;
    dw(kTrapWord);
}
};
} // namespace
@ -78,6 +139,7 @@ std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
if (auto pipeline = Pipeline::Create(fStages.data(), SkToInt(fStages.size()))) {
return [pipeline] (size_t x, size_t y, size_t n) {
auto call = pipeline->getCode<void(*)(size_t, size_t, size_t)>();
//printf("fn addr: %p\n", (void*)call);
while (n >= 8) {
call(x,y,0);
x += 8;
@ -88,10 +150,13 @@ std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
}
};
}
#if 0
SkDebugf("Cannot yet JIT with xbyak:\n");
this->dump();
#endif
return nullptr;
} catch(...) {
SkDebugf("caught exception\n");
return nullptr;
}
}