SkSplicer: start on arm64

Seems to be working.  The jump to loop_start might be a little off, but not by much.  Correctness is really still a big TODO.

$ adb shell 'cd /data/local/tmp; ./monobench SkRasterPipeline 200'
SkRasterPipeline_…
200  …f16_compile 1x  …f16_run 1.42x  …srgb_compile 2.21x  …srgb_run 2.59x⏎

Change-Id: I0e1acc6404cf3ce8084d9ef8011cbe0b5f1fd6e3
Reviewed-on: https://skia-review.googlesource.com/6811
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2017-01-09 17:21:32 -05:00 committed by Skia Commit-Bot
parent 1e74cad9b4
commit 8e619a2b4e
4 changed files with 482 additions and 87 deletions

View File

@ -29,38 +29,86 @@ namespace {
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
};
// Short x86-64 instruction sequences that we'll use as glue to splice together Stages.
static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; // clear top half of all ymm
static const uint8_t ret[] = { 0xc3 }; // return
static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 }; // move next 8 bytes into rcx
static const uint8_t movabsq_rdx[] = { 0x48, 0xba }; // move next 8 bytes into rdx
static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 }; // rdi += 8
static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 }; // rdi cmp? rsi
static const uint8_t jb_near[] = { 0x0f, 0x8c }; // jump relative next 4 bytes
// if cmp set unsigned < bit
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
template <typename T>
void splice(SkWStream* stream, const T& val) {
stream->write(&val, sizeof(val));
static void splice(SkWStream* buf, const T& val) {
buf->write(&val, sizeof(val));
}
#if defined(__aarch64__)
static constexpr int kStride = 4;
static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
uint16_t parts[4];
memcpy(parts, &k, 8);
splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x3); // move 16-bit intermediate << 48 into x3
splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x3); // merge 16-bit intermediate << 32 into x3
splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x3); // merge 16-bit intermediate << 16 into x3
splice(buf, 0xf2800000 | (parts[0] << 5) | 0x3); // merge 16-bit intermediate << 0 into x3
}
static void set_ctx(SkWStream* buf, void* ctx) {
uint16_t parts[4];
memcpy(parts, &ctx, 8);
splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x2); // move 16-bit intermediate << 48 into x2
splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2); // merge 16-bit intermediate << 32 into x2
splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2); // merge 16-bit intermediate << 16 into x2
splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2
}
static void loop(SkWStream* buf, int loop_start) {
splice(buf, 0x91001000); // add x0, x0, #4
splice(buf, 0xeb01001f); // cmp x0, x1
int off = loop_start - (int)(buf->bytesWritten() + 4); // TODO: check that this is right
off /= 4; // bytes -> instructions, still signed
off = (off & 0x7ffff) << 5; // 19 bit maximum range (+- 256K instructions)
splice(buf, 0x54000003 | off); // b.cc loop_start (cc == "carry clear", unsigned less than)
}
static void ret(SkWStream* buf) {
splice(buf, 0xd65f03c0); // ret
}
#else
static constexpr int kStride = 8;
static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 };
splice(buf, movabsq_rcx); // movabsq <next 8 bytes>, %rcx
splice(buf, k);
}
static void set_ctx(SkWStream* buf, void* ctx) {
static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
splice(buf, movabsq_rdx); // movabsq <next 8 bytes>, %rdx
splice(buf, ctx);
}
static void loop(SkWStream* buf, int loop_start) {
static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
static const uint8_t jb_near[] = { 0x0f, 0x8c };
splice(buf, addq_8_rdi); // addq $8, %rdi
splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi
splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
}
static void ret(SkWStream* buf) {
static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
static const uint8_t ret[] = { 0xc3 };
splice(buf, vzeroupper);
splice(buf, ret);
}
#endif
#ifdef IACA_DUMP
static const uint8_t ud2[] = { 0x0f, 0x0b }; // undefined... crashes when run
static const uint8_t nop3[] = { 0x64, 0x67, 0x90 }; // 3 byte no-op
static const uint8_t movl_ebx[] = { 0xbb }; // move next 4 bytes into ebx
static void iaca_start(SkWStream* stream) {
splice(stream, ud2);
splice(stream, movl_ebx);
splice(stream, 111);
splice(stream, nop3);
static void iaca_start(SkWStream* buf) {
splice(buf, ud2);
splice(buf, movl_ebx);
splice(buf, 111);
splice(buf, nop3);
}
static void iaca_end(SkWStream* stream) {
splice(stream, movl_ebx);
splice(stream, 222);
splice(stream, nop3);
splice(stream, ud2);
static void iaca_end(SkWStream* buf) {
splice(buf, movl_ebx);
splice(buf, 222);
splice(buf, nop3);
splice(buf, ud2);
}
#else
static void iaca_start(SkWStream*) {}
@ -87,32 +135,32 @@ namespace {
Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
// We always create a backup interpreter pipeline,
// - to handle any program we can't, and
// - to handle the n < 8 tails.
// - to handle the n < kStride tails.
fBackup = SkOpts::compile_pipeline(stages, nstages);
fSplicedLen = 0;
fSpliced = nullptr;
// If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
#if !defined(__aarch64__)
// To keep things simple, only one target supported: Haswell+ x86-64.
if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
return;
}
#endif
SkDynamicMemoryWStream buf;
// Put the address of kConstants in rcx, Stage argument 4 "k".
splice(&buf, movabsq_rcx);
splice(&buf, &kConstants);
// Put the address of kConstants in rcx/x3, Stage argument 4 "k".
set_k(&buf, &kConstants);
// We'll loop back to here as long as x<n after x+=8.
// We'll loop back to here as long as x<n after x += kStride.
iaca_start(&buf);
auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start:
for (int i = 0; i < nstages; i++) {
// If a stage has a context pointer, load it into rdx, Stage argument 3 "ctx".
// If a stage has a context pointer, load it into rdx/x2, Stage argument 3 "ctx".
if (stages[i].ctx) {
splice(&buf, movabsq_rdx);
splice(&buf, stages[i].ctx);
set_ctx(&buf, stages[i].ctx);
}
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
@ -144,16 +192,9 @@ namespace {
}
}
// See if we should loop back to handle more pixels.
splice(&buf, addq_8_rdi); // x += 8
splice(&buf, cmpq_rsi_rdi); // if (x < n)
splice(&buf, jb_near); // goto loop_start;
splice(&buf, (int)loop_start - (int)(buf.bytesWritten() + 4));
loop(&buf, loop_start); // Loop back to handle more pixels if not done.
iaca_end(&buf);
// Nope! We're done.
splice(&buf, vzeroupper);
splice(&buf, ret);
ret(&buf); // We're done.
auto data = buf.detachAsData();
fSplicedLen = data->size();
@ -175,14 +216,15 @@ namespace {
// Here's where we call fSpliced if we created it, fBackup if not.
void operator()(size_t x, size_t y, size_t n) const {
// TODO: The looping logic is probably not correct for handling n<8 tails.
if (fSpliced) {
// TODO: The looping logic is probably not correct for n < kStride tails or x != 0.
size_t body = n/kStride*kStride; // Largest multiple of kStride (4 or 8) <= n.
if (fSpliced && body) { // Can we run fSpliced for at least one kStride?
// TODO: At some point we will want to pass in y...
using Fn = void(size_t x, size_t n);
((Fn*)fSpliced)(x,n);
((Fn*)fSpliced)(x,body);
// Fall through to fBackup for any n<8 last pixels.
size_t body = n/8*8;
// Fall through to fBackup for any n<kStride last pixels.
x += body;
n -= body;
}

View File

@ -11,6 +11,279 @@
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
#if defined(__aarch64__)
static const unsigned int kSplice_clear[] = {
0x6f00e400, // movi v0.2d, #0x0
0x6f00e401, // movi v1.2d, #0x0
0x6f00e402, // movi v2.2d, #0x0
0x6f00e403, // movi v3.2d, #0x0
};
static const unsigned int kSplice_plus[] = {
0x4e24d400, // fadd v0.4s, v0.4s, v4.4s
0x4e25d421, // fadd v1.4s, v1.4s, v5.4s
0x4e26d442, // fadd v2.4s, v2.4s, v6.4s
0x4e27d463, // fadd v3.4s, v3.4s, v7.4s
};
static const unsigned int kSplice_srcover[] = {
0x91001068, // add x8, x3, #0x4
0x4d40c910, // ld1r {v16.4s}, [x8]
0x4ea3d610, // fsub v16.4s, v16.4s, v3.4s
0x4e24ce00, // fmla v0.4s, v16.4s, v4.4s
0x4e25ce01, // fmla v1.4s, v16.4s, v5.4s
0x4e26ce02, // fmla v2.4s, v16.4s, v6.4s
0x4e26ce03, // fmla v3.4s, v16.4s, v6.4s
};
static const unsigned int kSplice_dstover[] = {
0x91001068, // add x8, x3, #0x4
0x4d40c910, // ld1r {v16.4s}, [x8]
0x4ea7d610, // fsub v16.4s, v16.4s, v7.4s
0x4e20ce04, // fmla v4.4s, v16.4s, v0.4s
0x4e21ce05, // fmla v5.4s, v16.4s, v1.4s
0x4e22ce06, // fmla v6.4s, v16.4s, v2.4s
0x4e22ce07, // fmla v7.4s, v16.4s, v2.4s
};
static const unsigned int kSplice_clamp_0[] = {
0x6f00e410, // movi v16.2d, #0x0
0x4e30f400, // fmax v0.4s, v0.4s, v16.4s
0x4e30f421, // fmax v1.4s, v1.4s, v16.4s
0x4e30f442, // fmax v2.4s, v2.4s, v16.4s
0x4e30f463, // fmax v3.4s, v3.4s, v16.4s
};
static const unsigned int kSplice_clamp_1[] = {
0x91001068, // add x8, x3, #0x4
0x4d40c910, // ld1r {v16.4s}, [x8]
0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s
0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s
0x4eb0f442, // fmin v2.4s, v2.4s, v16.4s
0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
};
static const unsigned int kSplice_clamp_a[] = {
0x91001068, // add x8, x3, #0x4
0x4d40c910, // ld1r {v16.4s}, [x8]
0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
0x4ea3f400, // fmin v0.4s, v0.4s, v3.4s
0x4ea3f421, // fmin v1.4s, v1.4s, v3.4s
0x4ea3f442, // fmin v2.4s, v2.4s, v3.4s
};
static const unsigned int kSplice_swap[] = {
0x4ea31c70, // mov v16.16b, v3.16b
0x4ea21c51, // mov v17.16b, v2.16b
0x4ea11c32, // mov v18.16b, v1.16b
0x4ea01c13, // mov v19.16b, v0.16b
0x4ea41c80, // mov v0.16b, v4.16b
0x4ea51ca1, // mov v1.16b, v5.16b
0x4ea61cc2, // mov v2.16b, v6.16b
0x4ea71ce3, // mov v3.16b, v7.16b
0x4eb31e64, // mov v4.16b, v19.16b
0x4eb21e45, // mov v5.16b, v18.16b
0x4eb11e26, // mov v6.16b, v17.16b
0x4eb01e07, // mov v7.16b, v16.16b
};
static const unsigned int kSplice_move_src_dst[] = {
0x4ea01c04, // mov v4.16b, v0.16b
0x4ea11c25, // mov v5.16b, v1.16b
0x4ea21c46, // mov v6.16b, v2.16b
0x4ea31c67, // mov v7.16b, v3.16b
};
static const unsigned int kSplice_move_dst_src[] = {
0x4ea41c80, // mov v0.16b, v4.16b
0x4ea51ca1, // mov v1.16b, v5.16b
0x4ea61cc2, // mov v2.16b, v6.16b
0x4ea71ce3, // mov v3.16b, v7.16b
};
static const unsigned int kSplice_premul[] = {
0x6e23dc00, // fmul v0.4s, v0.4s, v3.4s
0x6e23dc21, // fmul v1.4s, v1.4s, v3.4s
0x6e23dc42, // fmul v2.4s, v2.4s, v3.4s
};
static const unsigned int kSplice_unpremul[] = {
0x91001068, // add x8, x3, #0x4
0x4d40c910, // ld1r {v16.4s}, [x8]
0x4ea0d871, // fcmeq v17.4s, v3.4s, #0.0
0x6e23fe10, // fdiv v16.4s, v16.4s, v3.4s
0x4e711e10, // bic v16.16b, v16.16b, v17.16b
0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
};
static const unsigned int kSplice_from_srgb[] = {
0x91005068, // add x8, x3, #0x14
0x4d40c910, // ld1r {v16.4s}, [x8]
0x91004068, // add x8, x3, #0x10
0x4d40c911, // ld1r {v17.4s}, [x8]
0x2d434c72, // ldp s18, s19, [x3,#24]
0x6e22dc54, // fmul v20.4s, v2.4s, v2.4s
0x4eb01e15, // mov v21.16b, v16.16b
0x4eb01e17, // mov v23.16b, v16.16b
0x4f921050, // fmla v16.4s, v2.4s, v18.s[0]
0x4eb11e36, // mov v22.16b, v17.16b
0x4eb11e38, // mov v24.16b, v17.16b
0x4e34ce11, // fmla v17.4s, v16.4s, v20.4s
0x6e20dc10, // fmul v16.4s, v0.4s, v0.4s
0x91008068, // add x8, x3, #0x20
0x4f921015, // fmla v21.4s, v0.4s, v18.s[0]
0x4e30ceb6, // fmla v22.4s, v21.4s, v16.4s
0x4d40c910, // ld1r {v16.4s}, [x8]
0x6e21dc34, // fmul v20.4s, v1.4s, v1.4s
0x4f921037, // fmla v23.4s, v1.4s, v18.s[0]
0x4f939015, // fmul v21.4s, v0.4s, v19.s[0]
0x4f939032, // fmul v18.4s, v1.4s, v19.s[0]
0x4f939053, // fmul v19.4s, v2.4s, v19.s[0]
0x6ea0e600, // fcmgt v0.4s, v16.4s, v0.4s
0x6ea1e601, // fcmgt v1.4s, v16.4s, v1.4s
0x6ea2e602, // fcmgt v2.4s, v16.4s, v2.4s
0x4e34cef8, // fmla v24.4s, v23.4s, v20.4s
0x6e761ea0, // bsl v0.16b, v21.16b, v22.16b
0x6e781e41, // bsl v1.16b, v18.16b, v24.16b
0x6e711e62, // bsl v2.16b, v19.16b, v17.16b
};
static const unsigned int kSplice_to_srgb[] = {
0x6ea1d810, // frsqrte v16.4s, v0.4s
0x6ea1d835, // frsqrte v21.4s, v1.4s
0x6e30de17, // fmul v23.4s, v16.4s, v16.4s
0x6ea1d856, // frsqrte v22.4s, v2.4s
0x6e35deb9, // fmul v25.4s, v21.4s, v21.4s
0x4eb7fc17, // frsqrts v23.4s, v0.4s, v23.4s
0x9100c068, // add x8, x3, #0x30
0x6e36deda, // fmul v26.4s, v22.4s, v22.4s
0x4eb9fc39, // frsqrts v25.4s, v1.4s, v25.4s
0x6e37de10, // fmul v16.4s, v16.4s, v23.4s
0x2d44c871, // ldp s17, s18, [x3,#36]
0x4d40c914, // ld1r {v20.4s}, [x8]
0x4ebafc5a, // frsqrts v26.4s, v2.4s, v26.4s
0x6e39deb5, // fmul v21.4s, v21.4s, v25.4s
0x4ea1da17, // frecpe v23.4s, v16.4s
0xbd402c73, // ldr s19, [x3,#44]
0x9100d068, // add x8, x3, #0x34
0x6e3aded6, // fmul v22.4s, v22.4s, v26.4s
0x4ea1dabb, // frecpe v27.4s, v21.4s
0x4e37fe1d, // frecps v29.4s, v16.4s, v23.4s
0x4d40c918, // ld1r {v24.4s}, [x8]
0x4ea1dadc, // frecpe v28.4s, v22.4s
0x6e3ddef7, // fmul v23.4s, v23.4s, v29.4s
0x4e3bfebd, // frecps v29.4s, v21.4s, v27.4s
0x6e3ddf7b, // fmul v27.4s, v27.4s, v29.4s
0x4e3cfedd, // frecps v29.4s, v22.4s, v28.4s
0x6e3ddf9c, // fmul v28.4s, v28.4s, v29.4s
0x4eb41e9d, // mov v29.16b, v20.16b
0x6ea1da19, // frsqrte v25.4s, v16.4s
0x4f9312fd, // fmla v29.4s, v23.4s, v19.s[0]
0x4eb41e97, // mov v23.16b, v20.16b
0x4f91901a, // fmul v26.4s, v0.4s, v17.s[0]
0x4f931377, // fmla v23.4s, v27.4s, v19.s[0]
0x6ea1dabb, // frsqrte v27.4s, v21.4s
0x4f931394, // fmla v20.4s, v28.4s, v19.s[0]
0x4f919033, // fmul v19.4s, v1.4s, v17.s[0]
0x4f919051, // fmul v17.4s, v2.4s, v17.s[0]
0x6ea0e700, // fcmgt v0.4s, v24.4s, v0.4s
0x6ea1e701, // fcmgt v1.4s, v24.4s, v1.4s
0x6ea2e702, // fcmgt v2.4s, v24.4s, v2.4s
0x6e39df38, // fmul v24.4s, v25.4s, v25.4s
0x6ea1dadc, // frsqrte v28.4s, v22.4s
0x4eb8fe10, // frsqrts v16.4s, v16.4s, v24.4s
0x6e3bdf78, // fmul v24.4s, v27.4s, v27.4s
0x4eb8feb5, // frsqrts v21.4s, v21.4s, v24.4s
0x6e3cdf98, // fmul v24.4s, v28.4s, v28.4s
0x91001068, // add x8, x3, #0x4
0x4eb8fed6, // frsqrts v22.4s, v22.4s, v24.4s
0x4d40c918, // ld1r {v24.4s}, [x8]
0x6e30df30, // fmul v16.4s, v25.4s, v16.4s
0x6e35df75, // fmul v21.4s, v27.4s, v21.4s
0x6e36df96, // fmul v22.4s, v28.4s, v22.4s
0x4f92121d, // fmla v29.4s, v16.4s, v18.s[0]
0x4f9212b7, // fmla v23.4s, v21.4s, v18.s[0]
0x4f9212d4, // fmla v20.4s, v22.4s, v18.s[0]
0x4ebdf710, // fmin v16.4s, v24.4s, v29.4s
0x4eb7f712, // fmin v18.4s, v24.4s, v23.4s
0x4eb4f714, // fmin v20.4s, v24.4s, v20.4s
0x6e701f40, // bsl v0.16b, v26.16b, v16.16b
0x6e721e61, // bsl v1.16b, v19.16b, v18.16b
0x6e741e22, // bsl v2.16b, v17.16b, v20.16b
};
static const unsigned int kSplice_scale_u8[] = {
0xf9400048, // ldr x8, [x2]
0xbd400c71, // ldr s17, [x3,#12]
0x8b000108, // add x8, x8, x0
0x39400109, // ldrb w9, [x8]
0x3940050a, // ldrb w10, [x8,#1]
0x4e021d30, // mov v16.h[0], w9
0x39400909, // ldrb w9, [x8,#2]
0x39400d08, // ldrb w8, [x8,#3]
0x4e061d50, // mov v16.h[1], w10
0x4e0a1d30, // mov v16.h[2], w9
0x4e0e1d10, // mov v16.h[3], w8
0x2f07b7f0, // bic v16.4h, #0xff, lsl #8
0x2f10a610, // uxtl v16.4s, v16.4h
0x6e21da10, // ucvtf v16.4s, v16.4s
0x4f919210, // fmul v16.4s, v16.4s, v17.s[0]
0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
};
static const unsigned int kSplice_load_8888[] = {
0xf9400048, // ldr x8, [x2]
0xd37ef409, // lsl x9, x0, #2
0x4d40c860, // ld1r {v0.4s}, [x3]
0xbd400c63, // ldr s3, [x3,#12]
0x3ce96901, // ldr q1, [x8,x9]
0x4e211c02, // and v2.16b, v0.16b, v1.16b
0x6f380430, // ushr v16.4s, v1.4s, #8
0x6f300431, // ushr v17.4s, v1.4s, #16
0x6f280421, // ushr v1.4s, v1.4s, #24
0x4e21d842, // scvtf v2.4s, v2.4s
0x4e301c10, // and v16.16b, v0.16b, v16.16b
0x4e311c11, // and v17.16b, v0.16b, v17.16b
0x4e21d832, // scvtf v18.4s, v1.4s
0x4f839040, // fmul v0.4s, v2.4s, v3.s[0]
0x4e21da01, // scvtf v1.4s, v16.4s
0x4e21da22, // scvtf v2.4s, v17.4s
0x4f839021, // fmul v1.4s, v1.4s, v3.s[0]
0x4f839042, // fmul v2.4s, v2.4s, v3.s[0]
0x4f839243, // fmul v3.4s, v18.4s, v3.s[0]
};
static const unsigned int kSplice_store_8888[] = {
0xbd400870, // ldr s16, [x3,#8]
0xf9400048, // ldr x8, [x2]
0xd37ef409, // lsl x9, x0, #2
0x4f909032, // fmul v18.4s, v1.4s, v16.s[0]
0x4f909011, // fmul v17.4s, v0.4s, v16.s[0]
0x6e21aa52, // fcvtnu v18.4s, v18.4s
0x6e21aa31, // fcvtnu v17.4s, v17.4s
0x4f285652, // shl v18.4s, v18.4s, #8
0x4eb11e51, // orr v17.16b, v18.16b, v17.16b
0x4f909052, // fmul v18.4s, v2.4s, v16.s[0]
0x4f909070, // fmul v16.4s, v3.4s, v16.s[0]
0x6e21aa52, // fcvtnu v18.4s, v18.4s
0x6e21aa10, // fcvtnu v16.4s, v16.4s
0x4f305652, // shl v18.4s, v18.4s, #16
0x4eb21e31, // orr v17.16b, v17.16b, v18.16b
0x4f385610, // shl v16.4s, v16.4s, #24
0x4eb01e30, // orr v16.16b, v17.16b, v16.16b
0x3ca96910, // str q16, [x8,x9]
};
static const unsigned int kSplice_load_f16[] = {
0xf9400048, // ldr x8, [x2]
0x8b000d08, // add x8, x8, x0, lsl #3
0x0c400510, // ld4 {v16.4h-v19.4h}, [x8]
0x0e217a00, // fcvtl v0.4s, v16.4h
0x0e217a21, // fcvtl v1.4s, v17.4h
0x0e217a42, // fcvtl v2.4s, v18.4h
0x0e217a63, // fcvtl v3.4s, v19.4h
};
static const unsigned int kSplice_store_f16[] = {
0xf9400048, // ldr x8, [x2]
0x0e216810, // fcvtn v16.4h, v0.4s
0x0e216831, // fcvtn v17.4h, v1.4s
0x0e216852, // fcvtn v18.4h, v2.4s
0x8b000d08, // add x8, x8, x0, lsl #3
0x0e216873, // fcvtn v19.4h, v3.4s
0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
};
#else
static const unsigned char kSplice_clear[] = {
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0, %ymm0, %ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1, %ymm1, %ymm1
@ -255,4 +528,7 @@ static const unsigned char kSplice_store_f16[] = {
0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10, %xmm8, %xmm8
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8, 0x30(%rax,%rdi,8)
};
#endif
#endif//SkSplicer_generated_DEFINED

View File

@ -6,52 +6,71 @@
*/
#include "SkSplicer_shared.h"
#include <immintrin.h>
#include <string.h>
#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error This file is not like the rest of Skia.
#error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
#if !defined(__clang__)
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
// We have very specific inlining requirements. It helps to just take total control.
#define AI __attribute__((always_inline)) inline
#if defined(__aarch64__)
#include <arm_neon.h>
// Since we know we're using Clang, we can use its vector extensions.
using F = float __attribute__((ext_vector_type(4)));
using I32 = int32_t __attribute__((ext_vector_type(4)));
using U32 = uint32_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
AI static U32 round(F v) { return vcvtnq_u32_f32(v); }
AI static F min(F a, F b) { return vminq_f32(a,b); }
AI static F max(F a, F b) { return vmaxq_f32(a,b); }
AI static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
AI static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
AI static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
#endif
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
using F = float __attribute__((ext_vector_type(8)));
using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
AI static U32 round(F v) { return _mm256_cvtps_epi32(v); }
AI static F min(F a, F b) { return _mm256_min_ps (a,b); }
AI static F max(F a, F b) { return _mm256_max_ps (a,b); }
AI static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
AI static F rcp (F v) { return _mm256_rcp_ps (v); }
AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
#endif
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
#define C extern "C"
// Since we know we're using Clang, we can use its vector extensions.
// These are __m256 and __m256i, but friendlier and strongly-typed.
using F = float __attribute__((ext_vector_type(8)));
using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
AI static U32 round (F v) { return _mm256_cvtps_epi32(v); }
AI static U32 expand(U8 v) { return __builtin_convertvector(v, U32); }
AI static F rcp (F v) { return _mm256_rcp_ps (v); }
AI static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
AI static F min (F a, F b) { return _mm256_min_ps (a,b); }
AI static F max (F a, F b) { return _mm256_max_ps (a,b); }
AI static F fma (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
// Stages all fit a common interface that allows SkSplicer to splice them together.
using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
// Here's a little primer on the ABI:
// x: rdi x and n work to drive the loop, like for (; x < n; x += 8)
// n: rsi
// ctx: rdx Look for movabsq_rdx in SkSplicer.cpp to see how this works.
// constants: rcx Look for movabsq_rcx in SkSplicer.cpp to see how this works.
// vectors: ymm0-ymm7
// Here's a little primer on the x86-64/aarch64 ABIs:
// x: rdi/x0 x and n work to drive the loop, like for (; x < n; x += 4 or 8)
// n: rsi/x1
// ctx: rdx/x2 Look for movabsq_rdx in SkSplicer.cpp to see how this works.
// constants: rcx/x3 Look for movabsq_rcx in SkSplicer.cpp to see how this works.
// vectors: ymm0-ymm7/v0-v7
// done() is the key to this entire splicing strategy.
@ -231,6 +250,13 @@ STAGE(store_8888) {
STAGE(load_f16) {
auto ptr = *(const uint64_t**)ctx + x;
#if defined(__aarch64__)
auto halfs = vld4_f16((const float16_t*)ptr);
r = vcvt_f32_f16(halfs.val[0]);
g = vcvt_f32_f16(halfs.val[1]);
b = vcvt_f32_f16(halfs.val[2]);
a = vcvt_f32_f16(halfs.val[3]);
#else
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@ -250,11 +276,21 @@ STAGE(load_f16) {
g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#endif
}
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
#if defined(__aarch64__)
float16x4x4_t halfs = {{
vcvt_f16_f32(r),
vcvt_f16_f32(g),
vcvt_f16_f32(b),
vcvt_f16_f32(a),
}};
vst4_f16((float16_t*)ptr, halfs);
#else
auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
@ -269,4 +305,5 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#endif
}

View File

@ -9,11 +9,21 @@ import re
import subprocess
import sys
cflags = '-std=c++11 -Os -fomit-frame-pointer -mavx2 -mfma -mf16c'
cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
subprocess.check_call(['clang++'] + cflags.split() +
hsw = '-mavx2 -mfma -mf16c'.split()
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'stages.o'])
['-o', 'hsw.o'])
aarch64 = [
'--target=aarch64-linux-android',
'--sysroot=' +
'/Users/mtklein/brew/opt/android-ndk/platforms/android-21/arch-arm64',
]
subprocess.check_call(['clang++'] + cflags + aarch64 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'aarch64.o'])
print '''/*
* Copyright 2017 Google Inc.
@ -27,11 +37,37 @@ print '''/*
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
#if defined(__aarch64__)
'''
for line in subprocess.check_output(['gobjdump', '-d',
'aarch64.o']).split('\n'):
line = line.strip()
if not line or line.startswith('aarch64.o') or line.startswith('Disassembly'):
continue
m = re.match('''................ <(.*)>:''', line)
if m:
print 'static const unsigned int kSplice_' + m.group(1) + '[] = {'
continue
_, code, inst, args = line.split('\t')
code = code.strip()
# b done, where done has not yet been filled in by the linker.
if code == '14000000':
print '};'
continue
print ' ', '0x'+code+',' + ' // ' + inst + ' ' + args
print '''
#else
'''
for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
# TODO: port this to gobjdump too
for line in subprocess.check_output(['otool', '-tvj', 'hsw.o']).split('\n'):
line = line.strip()
if line == '' or line == 'stages.o:' or line == '(__TEXT,__text) section':
if line == '' or line == 'hsw.o:' or line == '(__TEXT,__text) section':
continue
m = re.match('_(.*):', line)
@ -41,33 +77,37 @@ for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
continue
# Skip the leading 16 byte address and a tab,
# leaving the hex and mnemonics of each instruction.
# leaving the code, instruction mnemonic, and its arguments.
line = line[17:]
columns = line.split('\t')
_hex = columns[0].strip()
instr = columns[1]
args = columns[2:]
code = columns[0].strip()
inst = columns[1]
args = columns[2:]
# We can't splice code that uses rip relative addressing.
for arg in args:
assert 'rip' not in arg
# jmp done, the end of each stage (the address of done is not yet filled in)
if _hex == 'e9 00 00 00 00':
if code == 'e9 00 00 00 00':
print '};'
continue
sys.stdout.write(' ')
_bytes = _hex.split(' ')
_bytes = code.split(' ')
# This is the meat of things: copy the code to a C unsigned char array.
for byte in _bytes:
sys.stdout.write('0x' + byte + ',')
# From here on we're just making the generated file readable and pretty.
sys.stdout.write(' ' * (44 - 5*len(_bytes)))
sys.stdout.write('// ' + instr)
sys.stdout.write('// ' + inst)
if args:
sys.stdout.write(' ' * (13 - len(instr)))
sys.stdout.write(' ' * (13 - len(inst)))
sys.stdout.write(' '.join(args))
sys.stdout.write('\n')
print '''
#endif
'''
print '''#endif//SkSplicer_generated_DEFINED'''