SkJumper: everyone gets a start_pipeline().

Windows needs this as a shim to switch to the System V ABI and back.

Other platforms need it too, if only to make UBSAN happy about calling
functions through the correct function pointers.

One day maybe we can move the looping logic inside start_pipeline?

Change-Id: I47d9ef48752becc6c43fc052b12a540c157bcaaa
Reviewed-on: https://skia-review.googlesource.com/8542
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Author: Mike Klein <mtklein@chromium.org>
Date:   2017-02-16 10:17:00 -05:00 (committed by Skia Commit-Bot)
Commit: 78594b26dd (parent: c9f4b8c562)
3 changed files with 246 additions and 35 deletions
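
A note on the UBSan point in the message: calling a function through a pointer whose type does not match the function's actual definition is undefined behavior, and UBSan's function sanitizer flags it at the call site. A minimal illustration of the problem and of the fix this commit adopts, which is to store stages under a stand-in type and only ever call through a pointer of the true type (hypothetical names, not Skia code):

    #include <cstddef>

    extern "C" void stage(size_t x, void** program) { /* the real signature */ }

    int main() {
        // Storing a function pointer under a stand-in type is fine...
        auto opaque = reinterpret_cast<void (*)()>(stage);

        // ...but calling through the mismatched type is undefined behavior,
        // and UBSan reports it:  opaque();

        // Casting back to the true type before the call is well-defined.
        auto typed = reinterpret_cast<void (*)(size_t, void**)>(opaque);
        typed(0, nullptr);
    }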


@@ -21,11 +21,6 @@ static const SkJumper_constants kConstants = {
     0x77800000, 0x07800000, 0x04000400,    // fp16 <-> fp32
 };
 
-using JumperStage = void(size_t, void**, const SkJumper_constants*);
-
-// Jumper stages actually pass around 8 floating point vectors too.
-// They're designed to work when those vectors start uninitialized,
-// so we don't need to mention them here.
 #define STAGES(M) \
     M(seed_shader) \
     M(constant_color) \
@@ -57,8 +52,12 @@ using JumperStage = void(size_t, void**, const SkJumper_constants*);
 
 // Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
 extern "C" {
-    JumperStage sk_just_return;
-    #define M(st) JumperStage sk_##st;
+    void sk_start_pipeline(size_t, void**, const SkJumper_constants*);
+
+    // We use void() as a convenient stand-in for the real stage function type.
+    // We never call these directly, so we don't really need to know their real types.
+    void sk_just_return(void);
+    #define M(st) void sk_##st(void);
     STAGES(M)
 #undef M
 }
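
The STAGES(M) list above is an X-macro: each use re-#defines M to stamp out one line per stage, so the list of stage names is written exactly once. Condensed to the two stages named above, the expansion works like this (illustrative, trimmed from the real list):

    #define STAGES(M) \
        M(seed_shader)  \
        M(constant_color)

    extern "C" {
        #define M(st) void sk_##st(void);
        STAGES(M)  // expands to: void sk_seed_shader(void); void sk_constant_color(void);
        #undef M
    }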
@@ -123,33 +122,51 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
     // We'll look for the best vector instruction set and stride we can use.
     size_t stride = 0;
     void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
+    void* start_pipeline = nullptr;
     void* just_return = nullptr;
 #if defined(__aarch64__)
-    stride = 4;
-    lookup = aarch64_lookup;
-    just_return = (void*)aarch64_sk_just_return;
+    stride         = 4;
+    lookup         = aarch64_lookup;
+    start_pipeline = (void*)aarch64_sk_start_pipeline;
+    just_return    = (void*)aarch64_sk_just_return;
 #elif defined(__ARM_NEON__)
     if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
-        stride = 2;
-        lookup = armv7_lookup;
-        just_return = (void*)armv7_sk_just_return;
+        stride         = 2;
+        lookup         = armv7_lookup;
+        start_pipeline = (void*)armv7_sk_start_pipeline;
+        just_return    = (void*)armv7_sk_just_return;
     }
 #elif defined(__x86_64__) || defined(_M_X64)
-    stride = 4;
-    lookup = sse2_lookup;
-    just_return = (void*)sse2_sk_just_return;
+    stride         = 4;
+    lookup         = sse2_lookup;
+    start_pipeline = (void*)sse2_sk_start_pipeline;
+    just_return    = (void*)sse2_sk_just_return;
     if (SkCpu::Supports(SkCpu::SSE41)) {
-        stride = 4;
-        lookup = sse41_lookup;
-        just_return = (void*)sse41_sk_just_return;
+        stride         = 4;
+        lookup         = sse41_lookup;
+        start_pipeline = (void*)sse41_sk_start_pipeline;
+        just_return    = (void*)sse41_sk_just_return;
     }
     if (SkCpu::Supports(SkCpu::HSW)) {
-        stride = 8;
-        lookup = hsw_lookup;
-        just_return = (void*)hsw_sk_just_return;
+        stride         = 8;
+        lookup         = hsw_lookup;
+        start_pipeline = (void*)hsw_sk_start_pipeline;
+        just_return    = (void*)hsw_sk_just_return;
     }
 #endif
 
+#if defined(_MSC_VER)
+    if (start_pipeline == (void*)sse2_sk_start_pipeline) {
+        start_pipeline = (void*)sse2_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)sse41_sk_start_pipeline) {
+        start_pipeline = (void*)sse41_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)hsw_sk_start_pipeline) {
+        start_pipeline = (void*)hsw_sk_start_pipeline_ms;
+    }
+#endif
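
The cascade above establishes SSE2 as the x86-64 baseline and upgrades to SSE4.1 or Haswell when SkCpu reports support, bumping the stride to 8 pixels to match the 256-bit ymm registers. Outside Skia's SkCpu wrapper, the same runtime-dispatch pattern can be sketched with GCC/Clang's __builtin_cpu_supports (illustrative names, not the real lookup tables):

    #include <cstddef>

    // Illustrative stand-ins for the per-instruction-set entry points.
    extern "C" void sse2_start (size_t, void**, const void*) {}
    extern "C" void sse41_start(size_t, void**, const void*) {}
    extern "C" void hsw_start  (size_t, void**, const void*) {}

    using Start = void(size_t, void**, const void*);

    static Start* choose_start(size_t* stride) {
        Start* start = sse2_start;   // always available on x86-64
        *stride = 4;                 // 4 floats per 128-bit xmm register
        if (__builtin_cpu_supports("sse4.1")) { start = sse41_start; }
        if (__builtin_cpu_supports("avx2")) {
            start   = hsw_start;
            *stride = 8;             // 8 floats per 256-bit ymm register
        }
        return start;
    }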
@@ -170,10 +187,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = (decltype(&sk_start_pipeline))start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
@@ -193,10 +209,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)sk_just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = sk_start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
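
Both loops above share one shape: program is a flat array of void* holding each stage's function pointer (followed by its context pointer, when the stage needs one) and terminated by just_return, and start_pipeline runs the whole chain once per stride of pixels. A self-contained miniature of that interpreter, simplified to context-free stages (hypothetical stage names, not Skia code):

    #include <cstddef>
    #include <cstdio>

    struct K {};  // stand-in for SkJumper_constants
    using Stage = void(size_t, void**, const K*);

    static void* load_and_inc(void**& program) { return *program++; }

    extern "C" void just_return(size_t, void**, const K*) {}

    extern "C" void print_stage(size_t x, void** program, const K* k) {
        std::printf("running at x=%zu\n", x);
        auto next = (Stage*)load_and_inc(program);
        next(x, program, k);  // hand off to the next stage
    }

    static void start_pipeline(size_t x, void** program, const K* k) {
        auto next = (Stage*)load_and_inc(program);
        next(x, program, k);
    }

    int main() {
        K k;
        void* program[] = { (void*)print_stage, (void*)just_return };
        for (size_t x = 0, stride = 4, limit = 12; x + stride <= limit; x += stride) {
            start_pipeline(x, program, &k);  // one stride of pixels per call
        }
    }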


@@ -11,6 +11,18 @@
// This file is generated semi-automatically with this command:
// $ src/jumper/build_stages.py
static const unsigned int aarch64_sk_start_pipeline[] = {
0xf8408423, // ldr x3, [x1],#8
0x6f00e400, // movi v0.2d, #0x0
0x6f00e401, // movi v1.2d, #0x0
0x6f00e402, // movi v2.2d, #0x0
0x6f00e403, // movi v3.2d, #0x0
0x6f00e404, // movi v4.2d, #0x0
0x6f00e405, // movi v5.2d, #0x0
0x6f00e406, // movi v6.2d, #0x0
0x6f00e407, // movi v7.2d, #0x0
0xd61f0060, // br x3
};
static const unsigned int aarch64_sk_just_return[] = {
0xd65f03c0, // ret
};
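
Read alongside SkJumper_stages.cpp at the bottom of this commit, these ten instructions map line for line onto the C source: the post-indexed ldr is load_and_inc() popping the first stage pointer, the eight movi instructions are the F v{} zero-initialization of v0-v7 (the eight vector arguments every stage passes along), and br x3 is the tail call into the chain. Roughly (stand-in types, illustrative rather than the real file):

    #include <cstddef>

    struct K;
    typedef float F __attribute__((vector_size(16)));  // one NEON register
    using Stage = void(size_t, void**, const K*, F,F,F,F, F,F,F,F);

    extern "C" void start_pipeline_equivalent(size_t x, void** program, const K* k) {
        auto next = (Stage*)*program++;         // ldr x3, [x1], #8
        F v{};                                  // movi v0.2d .. v7.2d, #0x0
        next(x, program, k, v,v,v,v, v,v,v,v);  // br x3  (tail call)
    }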
@@ -507,6 +519,18 @@ static const unsigned int aarch64_sk_linear_gradient_2stops[] = {
0x4eb01e00, // mov v0.16b, v16.16b
0xd61f0060, // br x3
};
static const unsigned int armv7_sk_start_pipeline[] = {
0xe4913004, // ldr r3, [r1], #4
0xf2800010, // vmov.i32 d0, #0
0xf2801010, // vmov.i32 d1, #0
0xf2802010, // vmov.i32 d2, #0
0xf2803010, // vmov.i32 d3, #0
0xf2804010, // vmov.i32 d4, #0
0xf2805010, // vmov.i32 d5, #0
0xf2806010, // vmov.i32 d6, #0
0xf2807010, // vmov.i32 d7, #0
0xe12fff13, // bx r3
};
static const unsigned int armv7_sk_just_return[] = {
0xe12fff1e, // bx lr
};
@@ -1066,9 +1090,62 @@ static const unsigned int armv7_sk_linear_gradient_2stops[] = {
0xf22001b0, // vorr d0, d16, d16
0xe12fff1c, // bx ip
};
static const unsigned char hsw_sk_start_pipeline[] = {
0x48,0xad, // lods %ds:(%rsi),%rax
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4
0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5
0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6
0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7
0xff,0xe0, // jmpq *%rax
};
static const unsigned char hsw_sk_start_pipeline_ms[] = {
0x56, // push %rsi
0x57, // push %rdi
0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp
0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps %xmm15,0x90(%rsp)
0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps %xmm14,0x80(%rsp)
0xc5,0x78,0x29,0x6c,0x24,0x70, // vmovaps %xmm13,0x70(%rsp)
0xc5,0x78,0x29,0x64,0x24,0x60, // vmovaps %xmm12,0x60(%rsp)
0xc5,0x78,0x29,0x5c,0x24,0x50, // vmovaps %xmm11,0x50(%rsp)
0xc5,0x78,0x29,0x54,0x24,0x40, // vmovaps %xmm10,0x40(%rsp)
0xc5,0x78,0x29,0x4c,0x24,0x30, // vmovaps %xmm9,0x30(%rsp)
0xc5,0x78,0x29,0x44,0x24,0x20, // vmovaps %xmm8,0x20(%rsp)
0xc5,0xf8,0x29,0x7c,0x24,0x10, // vmovaps %xmm7,0x10(%rsp)
0xc5,0xf8,0x29,0x34,0x24, // vmovaps %xmm6,(%rsp)
0x48,0x89,0xd6, // mov %rdx,%rsi
0x48,0xad, // lods %ds:(%rsi),%rax
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4
0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5
0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6
0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7
0x48,0x89,0xcf, // mov %rcx,%rdi
0x4c,0x89,0xc2, // mov %r8,%rdx
0xff,0xd0, // callq *%rax
0xc5,0xf8,0x28,0x34,0x24, // vmovaps (%rsp),%xmm6
0xc5,0xf8,0x28,0x7c,0x24,0x10, // vmovaps 0x10(%rsp),%xmm7
0xc5,0x78,0x28,0x44,0x24,0x20, // vmovaps 0x20(%rsp),%xmm8
0xc5,0x78,0x28,0x4c,0x24,0x30, // vmovaps 0x30(%rsp),%xmm9
0xc5,0x78,0x28,0x54,0x24,0x40, // vmovaps 0x40(%rsp),%xmm10
0xc5,0x78,0x28,0x5c,0x24,0x50, // vmovaps 0x50(%rsp),%xmm11
0xc5,0x78,0x28,0x64,0x24,0x60, // vmovaps 0x60(%rsp),%xmm12
0xc5,0x78,0x28,0x6c,0x24,0x70, // vmovaps 0x70(%rsp),%xmm13
0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps 0x80(%rsp),%xmm14
0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps 0x90(%rsp),%xmm15
0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp
0x5f, // pop %rdi
0x5e, // pop %rsi
0xc5,0xf8,0x77, // vzeroupper
0xc3, // retq
};
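
This _ms variant is the Windows shim described in the commit message. The Microsoft x64 ABI passes the first three integer arguments in rcx, rdx, and r8 where System V uses rdi, rsi, and rdx, and it treats xmm6-xmm15 as callee-saved where System V treats every xmm register as volatile. So the shim saves those ten registers, moves the arguments into their System V slots (the mov %rdx,%rsi / mov %rcx,%rdi / mov %r8,%rdx instructions above), enters the pipeline with callq rather than a tail jump so there is somewhere to return to, then restores and returns. On the C side the whole shim is one attribute; this mirrors sk_start_pipeline_ms in SkJumper_stages.cpp at the bottom of this commit:

    // The compiler generates all of the save/restore and register shuffling
    // above from just this wrapper.
    __attribute__((ms_abi))
    extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
        sk_start_pipeline(x, program, k);  // runs under the System V ABI
    }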
static const unsigned char hsw_sk_just_return[] = {
0xc5,0xf8,0x77, // vzeroupper
0xc3, // retq
};
static const unsigned char hsw_sk_seed_shader[] = {
@@ -1514,6 +1591,60 @@ static const unsigned char hsw_sk_linear_gradient_2stops[] = {
0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
0xff,0xe0, // jmpq *%rax
};
static const unsigned char sse41_sk_start_pipeline[] = {
0x48,0xad, // lods %ds:(%rsi),%rax
0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
0x0f,0x57,0xed, // xorps %xmm5,%xmm5
0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
0x0f,0x57,0xff, // xorps %xmm7,%xmm7
0xff,0xe0, // jmpq *%rax
};
static const unsigned char sse41_sk_start_pipeline_ms[] = {
0x56, // push %rsi
0x57, // push %rdi
0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp
0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp)
0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp)
0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp)
0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp)
0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp)
0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp)
0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp)
0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp)
0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
0x48,0x89,0xd6, // mov %rdx,%rsi
0x48,0xad, // lods %ds:(%rsi),%rax
0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
0x0f,0x57,0xed, // xorps %xmm5,%xmm5
0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
0x0f,0x57,0xff, // xorps %xmm7,%xmm7
0x48,0x89,0xcf, // mov %rcx,%rdi
0x4c,0x89,0xc2, // mov %r8,%rdx
0xff,0xd0, // callq *%rax
0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8
0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9
0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10
0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11
0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12
0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13
0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14
0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15
0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp
0x5f, // pop %rdi
0x5e, // pop %rsi
0xc3, // retq
};
static const unsigned char sse41_sk_just_return[] = {
0xc3, // retq
};
@@ -2145,6 +2276,60 @@ static const unsigned char sse41_sk_linear_gradient_2stops[] = {
0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0
0xff,0xe0, // jmpq *%rax
};
static const unsigned char sse2_sk_start_pipeline[] = {
0x48,0xad, // lods %ds:(%rsi),%rax
0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
0x0f,0x57,0xed, // xorps %xmm5,%xmm5
0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
0x0f,0x57,0xff, // xorps %xmm7,%xmm7
0xff,0xe0, // jmpq *%rax
};
static const unsigned char sse2_sk_start_pipeline_ms[] = {
0x56, // push %rsi
0x57, // push %rdi
0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp
0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp)
0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp)
0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp)
0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp)
0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp)
0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp)
0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp)
0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp)
0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
0x48,0x89,0xd6, // mov %rdx,%rsi
0x48,0xad, // lods %ds:(%rsi),%rax
0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
0x0f,0x57,0xed, // xorps %xmm5,%xmm5
0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
0x0f,0x57,0xff, // xorps %xmm7,%xmm7
0x48,0x89,0xcf, // mov %rcx,%rdi
0x4c,0x89,0xc2, // mov %r8,%rdx
0xff,0xd0, // callq *%rax
0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8
0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9
0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10
0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11
0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12
0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13
0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14
0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15
0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp
0x5f, // pop %rdi
0x5e, // pop %rsi
0xc3, // retq
};
static const unsigned char sse2_sk_just_return[] = {
0xc3, // retq
};


@@ -200,13 +200,24 @@ static void* load_and_inc(void**& program) {
 static void name##_k(size_t& x, void* ctx, K* k,                      \
                      F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 
-// A glue Stage to end the tail call chain, finally returning to the caller.
-extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {
-#if defined(JUMPER) && defined(__AVX2__)
-    _mm256_zeroupper();
-#endif
-}
+// Some glue stages that don't fit the normal pattern of stages.
+
+extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
+    auto next = (Stage*)load_and_inc(program);
+
+    F v{};   // TODO: faster uninitialized?
+    next(x,program,k, v,v,v,v, v,v,v,v);
+}
+
+#if defined(JUMPER) && defined(__x86_64__)
+__attribute__((ms_abi))
+extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
+    sk_start_pipeline(x,program,k);
+}
+#endif
+
+// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
+extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
 
 // We can now define Stages!
 
 // Some things to keep in mind while writing Stages:
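
To close the loop: each stage pops the next function pointer off program and tail-calls it, so control threads through the entire pipeline in one chain and ends at sk_just_return, which unwinds straight back to sk_start_pipeline and from there to run_with_jumper. A self-contained sketch of one stage written out by hand (sk_premultiply is a hypothetical name; real stages are written against the name##_k helper shown above):

    #include <cstddef>

    struct K;
    typedef float F __attribute__((vector_size(16)));
    using Stage = void(size_t, void**, K*, F,F,F,F, F,F,F,F);

    static void* load_and_inc(void**& program) { return *program++; }

    // Do this stage's work on the pixel lanes, then hand off to whatever is next.
    extern "C" void sk_premultiply(size_t x, void** program, K* k,
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {
        r *= a;  g *= a;  b *= a;
        auto next = (Stage*)load_and_inc(program);
        next(x, program, k, r,g,b,a, dr,dg,db,da);  // compiled as a tail call
    }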