Add a real SkXbyak bench, implement enough to run it.
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD SkXbyak_… 9320 …JITCompiled 1x …Interpreted 1.24x …HandWritten 2.5x Change-Id: I37d2d255ff32dcce73d29081d506e2d67477af97 Reviewed-on: https://skia-review.googlesource.com/6697 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
parent
fa71067032
commit
83f532e9b5
78
bench/SkXbyakBench.cpp
Normal file
78
bench/SkXbyakBench.cpp
Normal file
@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "Benchmark.h"
|
||||
#include "SkHalf.h"
|
||||
#include "SkNx.h"
|
||||
#include "SkRasterPipeline.h"
|
||||
|
||||
static void hand_written(uint64_t* buf, int n) {
|
||||
while (n --> 0) {
|
||||
Sk4f rgba = SkHalfToFloat_finite_ftz(*buf);
|
||||
|
||||
float a = rgba[3],
|
||||
scale = a == 0 ? 1 : 1.0f/a;
|
||||
rgba *= Sk4f{scale,scale,scale,1};
|
||||
|
||||
SkFloatToHalf_finite_ftz(rgba).store(buf++);
|
||||
}
|
||||
}
|
||||
|
||||
class SkXbyakBench : public Benchmark {
|
||||
public:
|
||||
enum Mode { kHandWritten, kInterpreted, kJITCompiled };
|
||||
|
||||
SkXbyakBench(Mode mode) : fMode(mode) {
|
||||
if (mode == kInterpreted || mode == kJITCompiled) {
|
||||
fPtr = &fBuf;
|
||||
fP.append(SkRasterPipeline::load_f16, &fPtr);
|
||||
fP.append(SkRasterPipeline::unpremul);
|
||||
fP.append(SkRasterPipeline::store_f16, &fPtr);
|
||||
}
|
||||
|
||||
if (mode == kJITCompiled) {
|
||||
fFn = fP.compile();
|
||||
}
|
||||
}
|
||||
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
||||
|
||||
const char* onGetName() override {
|
||||
switch(fMode) {
|
||||
case kHandWritten: return "SkXbyak_HandWritten";
|
||||
case kInterpreted: return "SkXbyak_Interpreted";
|
||||
case kJITCompiled: return "SkXbyak_JITCompiled";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
void onDraw(int loops, SkCanvas*) override {
|
||||
switch (fMode) {
|
||||
case kHandWritten:
|
||||
while (loops --> 0) { hand_written(fBuf, N); }
|
||||
break;
|
||||
case kInterpreted:
|
||||
while (loops --> 0) { fP.run(0,0,N); }
|
||||
break;
|
||||
case kJITCompiled:
|
||||
while (loops --> 0) { fFn(0,0,N); }
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static const int N = 1024; // TODO: 1023, making the tail jagged
|
||||
|
||||
SkRasterPipeline fP;
|
||||
Mode fMode;
|
||||
uint64_t fBuf[N];
|
||||
void* fPtr;
|
||||
std::function<void(size_t, size_t, size_t)> fFn;
|
||||
};
|
||||
|
||||
// Register one SkXbyakBench instance per execution mode.
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kHandWritten); )
|
||||
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kInterpreted); )
|
||||
DEF_BENCH( return new SkXbyakBench(SkXbyakBench::kJITCompiled); )
|
@ -111,6 +111,7 @@ bench_sources = [
|
||||
"$_bench/SKPAnimationBench.cpp",
|
||||
"$_bench/SKPBench.cpp",
|
||||
"$_bench/SkRasterPipelineBench.cpp",
|
||||
"$_bench/SkXbyakBench.cpp",
|
||||
"$_bench/StreamBench.cpp",
|
||||
"$_bench/SortBench.cpp",
|
||||
"$_bench/StrokeBench.cpp",
|
||||
|
@ -29,7 +29,6 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
|
||||
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
|
||||
#ifdef SK_XBYAK
|
||||
if (auto fn = this->jit()) {
|
||||
SkDebugf("Jitted with xbyak!\n");
|
||||
return fn;
|
||||
}
|
||||
#endif
|
||||
|
@ -35,12 +35,14 @@ namespace {
|
||||
|
||||
Pipeline(const SkRasterPipeline::Stage* stages, int n, bool* supported) {
|
||||
// Set up some register name aliases.
|
||||
//auto x = rdi, y = rsi, tail = rdx;
|
||||
// y = rsi, tail = rdx;
|
||||
auto x = rdi;
|
||||
auto r = ymm0, g = ymm1, b = ymm2, a = ymm3,
|
||||
dr = ymm4, dg = ymm5, db = ymm6, da = ymm7;
|
||||
|
||||
Xbyak::Label floatOneStorage;
|
||||
vbroadcastss(ymm8, ptr[rip + floatOneStorage]);
|
||||
|
||||
//trap();
|
||||
|
||||
// TODO: set up (x+0.5,y+0.5) in (r,g)
|
||||
vxorps(r,r);
|
||||
@ -54,6 +56,56 @@ namespace {
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
switch(stages[i].stage) {
|
||||
case SkRasterPipeline::load_f16:
|
||||
mov(rax, (size_t)stages[i].ctx);
|
||||
mov(rax, ptr[rax]);
|
||||
|
||||
vmovdqu(xmm0, ptr[rax+x*8+ 0]);
|
||||
vmovdqu(xmm1, ptr[rax+x*8+16]);
|
||||
vmovdqu(xmm2, ptr[rax+x*8+32]);
|
||||
vmovdqu(xmm3, ptr[rax+x*8+48]);
|
||||
|
||||
vpunpcklwd(xmm8, xmm1, xmm0); vpunpckhwd(xmm0 , xmm1, xmm0);
|
||||
vpunpcklwd(xmm1, xmm3, xmm2); vpunpckhwd(xmm2 , xmm3, xmm2);
|
||||
vpunpcklwd(xmm9, xmm0, xmm8); vpunpckhwd(xmm8 , xmm0, xmm8);
|
||||
vpunpcklwd(xmm3, xmm2, xmm1); vpunpckhwd(xmm10, xmm2, xmm1);
|
||||
|
||||
vpunpcklqdq(xmm0, xmm3, xmm9); vcvtph2ps(ymm0, xmm0);
|
||||
vpunpckhqdq(xmm1, xmm3, xmm9); vcvtph2ps(ymm1, xmm1);
|
||||
vpunpcklqdq(xmm2, xmm10, xmm8); vcvtph2ps(ymm2, xmm2);
|
||||
vpunpckhqdq(xmm3, xmm10, xmm8); vcvtph2ps(ymm3, xmm3);
|
||||
break;
|
||||
|
||||
case SkRasterPipeline::unpremul:
|
||||
vxorps(ymm8, ymm8); // ymm8: 0
|
||||
vcmpeqps(ymm10, ymm8, a); // ymm10: a == 0
|
||||
vbroadcastss(ymm9, ptr[rip + floatOneStorage]); // ymm9: 1.0f
|
||||
vdivps(ymm11, ymm9, a); // ymm11: 1/a
|
||||
vblendvps(ymm10, ymm10, ymm8, ymm11); // ymm10: (a==0) ? 0 : 1/a
|
||||
vmulps(r, r, ymm10);
|
||||
vmulps(g, g, ymm10);
|
||||
vmulps(b, b, ymm10);
|
||||
break;
|
||||
|
||||
case SkRasterPipeline::store_f16:
|
||||
mov(rax, (size_t)stages[i].ctx);
|
||||
mov(rax, ptr[rax]);
|
||||
|
||||
vcvtps2ph(xmm8 , ymm0, 4);
|
||||
vcvtps2ph(xmm9 , ymm1, 4);
|
||||
vcvtps2ph(xmm10, ymm2, 4);
|
||||
vcvtps2ph(xmm11, ymm3, 4);
|
||||
|
||||
vpunpcklwd(xmm12, xmm9 , xmm8 );
|
||||
vpunpckhwd(xmm8 , xmm9 , xmm8 );
|
||||
vpunpcklwd(xmm9 , xmm11, xmm10);
|
||||
vpunpckhwd(xmm10, xmm11, xmm10);
|
||||
|
||||
vpunpckldq(xmm11, xmm9 , xmm12); vmovdqu(ptr[rax+x*8+ 0], xmm11);
|
||||
vpunpckhdq(xmm9 , xmm9 , xmm12); vmovdqu(ptr[rax+x*8+16], xmm9 );
|
||||
vpunpckldq(xmm9 , xmm10, xmm8 ); vmovdqu(ptr[rax+x*8+32], xmm9 );
|
||||
vpunpckhdq(xmm8 , xmm10, xmm8 ); vmovdqu(ptr[rax+x*8+48], xmm8 );
|
||||
break;
|
||||
|
||||
default:
|
||||
*supported = false;
|
||||
@ -61,6 +113,7 @@ namespace {
|
||||
}
|
||||
}
|
||||
|
||||
vzeroupper();
|
||||
ret();
|
||||
L(floatOneStorage); df(1.0f);
|
||||
}
|
||||
@ -69,6 +122,14 @@ namespace {
|
||||
union { float f; uint32_t x; } pun = {f};
|
||||
dd(pun.x);
|
||||
}
|
||||
// Emits the pointer `p` into the code stream as one 64-bit data value via dq().
void dp(void* p) {
    // Convert through uintptr_t rather than writing one union member and
    // reading another, which C++ does not define.
    static_assert(sizeof(void*) <= sizeof(uint64_t),
                  "a pointer must fit in the 64-bit value handed to dq()");
    dq(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p)));
}
|
||||
|
||||
// Debugging aid: emits a fixed 16-bit value into the code stream via dw().
void trap() {
    // NOTE(review): if dw() writes the word little-endian this is the byte
    // sequence 0F 0B, i.e. the x86 UD2 (undefined instruction) opcode - confirm.
    const int kTrapWord = 0x0b0f;
    dw(kTrapWord);
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
@ -78,6 +139,7 @@ std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
|
||||
if (auto pipeline = Pipeline::Create(fStages.data(), SkToInt(fStages.size()))) {
|
||||
return [pipeline] (size_t x, size_t y, size_t n) {
|
||||
auto call = pipeline->getCode<void(*)(size_t, size_t, size_t)>();
|
||||
//printf("fn addr: %p\n", (void*)call);
|
||||
while (n >= 8) {
|
||||
call(x,y,0);
|
||||
x += 8;
|
||||
@ -88,10 +150,13 @@ std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
|
||||
}
|
||||
};
|
||||
}
|
||||
#if 0
|
||||
SkDebugf("Cannot yet JIT with xbyak:\n");
|
||||
this->dump();
|
||||
#endif
|
||||
return nullptr;
|
||||
} catch(...) {
|
||||
SkDebugf("caught exception\n");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user