From 78594b26ddff7f8773e957e4107ef57c8d7a1bd8 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Thu, 16 Feb 2017 10:17:00 -0500 Subject: [PATCH] SkJumper: everyone gets a start_pipeline(). Windows needs this as a shim to switch to the System V ABI and back. Other platforms need it too, if only to make UBSAN happy about calling functions through the correct function pointers. One day maybe we can move the looping logic inside start_pipeline? Change-Id: I47d9ef48752becc6c43fc052b12a540c157bcaaa Reviewed-on: https://skia-review.googlesource.com/8542 Reviewed-by: Herb Derby Commit-Queue: Mike Klein --- src/jumper/SkJumper.cpp | 71 +++++++----- src/jumper/SkJumper_generated.h | 189 +++++++++++++++++++++++++++++++- src/jumper/SkJumper_stages.cpp | 21 +++- 3 files changed, 246 insertions(+), 35 deletions(-) diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 3b27ffe81e..2ed95a1e0e 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -21,11 +21,6 @@ static const SkJumper_constants kConstants = { 0x77800000, 0x07800000, 0x04000400, // fp16 <-> fp32 }; -using JumperStage = void(size_t, void**, const SkJumper_constants*); -// Jumper stages actually pass around 8 floating point vectors too. -// They're designed to work when those vectors start unintialized, -// so we don't need to mention them here. - #define STAGES(M) \ M(seed_shader) \ M(constant_color) \ @@ -57,8 +52,12 @@ using JumperStage = void(size_t, void**, const SkJumper_constants*); // Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o. extern "C" { - JumperStage sk_just_return; -#define M(st) JumperStage sk_##st; + void sk_start_pipeline(size_t, void**, const SkJumper_constants*); + + // We use void() as a convenient stand-in for the real stage function type. + // We never call these directly, so we don't really need to know their real types. + void sk_just_return(void); +#define M(st) void sk_##st(void); STAGES(M) #undef M } @@ -123,33 +122,51 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { // We'll look for the best vector instruction set and stride we can use. size_t stride = 0; void* (*lookup)(SkRasterPipeline::StockStage) = nullptr; + void* start_pipeline = nullptr; void* just_return = nullptr; #if defined(__aarch64__) - stride = 4; - lookup = aarch64_lookup; - just_return = (void*)aarch64_sk_just_return; + stride = 4; + lookup = aarch64_lookup; + start_pipeline = (void*)aarch64_sk_start_pipeline; + just_return = (void*)aarch64_sk_just_return; #elif defined(__ARM_NEON__) if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) { - stride = 2; - lookup = armv7_lookup; - just_return = (void*)armv7_sk_just_return; + stride = 2; + lookup = armv7_lookup; + start_pipeline = (void*)armv7_sk_start_pipeline; + just_return = (void*)armv7_sk_just_return; } #elif defined(__x86_64__) || defined(_M_X64) - stride = 4; - lookup = sse2_lookup; - just_return = (void*)sse2_sk_just_return; + stride = 4; + lookup = sse2_lookup; + start_pipeline = (void*)sse2_sk_start_pipeline; + just_return = (void*)sse2_sk_just_return; if (SkCpu::Supports(SkCpu::SSE41)) { - stride = 4; - lookup = sse41_lookup; - just_return = (void*)sse41_sk_just_return; + stride = 4; + lookup = sse41_lookup; + start_pipeline = (void*)sse41_sk_start_pipeline; + just_return = (void*)sse41_sk_just_return; } if (SkCpu::Supports(SkCpu::HSW)) { - stride = 8; - lookup = hsw_lookup; - just_return = (void*)hsw_sk_just_return; + stride = 8; + lookup = hsw_lookup; + start_pipeline = (void*)hsw_sk_start_pipeline; + just_return = (void*)hsw_sk_just_return; + } +#endif + +#if defined(_MSC_VER) + if (start_pipeline == (void*)sse2_sk_start_pipeline) { + start_pipeline = (void*)sse2_sk_start_pipeline_ms; + } + if (start_pipeline == (void*)sse41_sk_start_pipeline) { + start_pipeline = (void*)sse41_sk_start_pipeline_ms; + } + if (start_pipeline == (void*)hsw_sk_start_pipeline) { + start_pipeline = (void*)hsw_sk_start_pipeline_ms; } #endif @@ -170,10 +187,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { } *ip = (void*)just_return; - ip = program.get(); - auto start = (JumperStage*)*ip++; + auto start = (decltype(&sk_start_pipeline))start_pipeline; while (x + stride <= limit) { - start(x, ip, &kConstants); + start(x, program.get(), &kConstants); x += stride; } } @@ -193,10 +209,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { } *ip = (void*)sk_just_return; - ip = program.get(); - auto start = (JumperStage*)*ip++; + auto start = sk_start_pipeline; while (x + stride <= limit) { - start(x, ip, &kConstants); + start(x, program.get(), &kConstants); x += stride; } } diff --git a/src/jumper/SkJumper_generated.h b/src/jumper/SkJumper_generated.h index ac0dd4855e..cf25d6740d 100644 --- a/src/jumper/SkJumper_generated.h +++ b/src/jumper/SkJumper_generated.h @@ -11,6 +11,18 @@ // This file is generated semi-automatically with this command: // $ src/jumper/build_stages.py +static const unsigned int aarch64_sk_start_pipeline[] = { + 0xf8408423, // ldr x3, [x1],#8 + 0x6f00e400, // movi v0.2d, #0x0 + 0x6f00e401, // movi v1.2d, #0x0 + 0x6f00e402, // movi v2.2d, #0x0 + 0x6f00e403, // movi v3.2d, #0x0 + 0x6f00e404, // movi v4.2d, #0x0 + 0x6f00e405, // movi v5.2d, #0x0 + 0x6f00e406, // movi v6.2d, #0x0 + 0x6f00e407, // movi v7.2d, #0x0 + 0xd61f0060, // br x3 +}; static const unsigned int aarch64_sk_just_return[] = { 0xd65f03c0, // ret }; @@ -507,6 +519,18 @@ static const unsigned int aarch64_sk_linear_gradient_2stops[] = { 0x4eb01e00, // mov v0.16b, v16.16b 0xd61f0060, // br x3 }; +static const unsigned int armv7_sk_start_pipeline[] = { + 0xe4913004, // ldr r3, [r1], #4 + 0xf2800010, // vmov.i32 d0, #0 + 0xf2801010, // vmov.i32 d1, #0 + 0xf2802010, // vmov.i32 d2, #0 + 0xf2803010, // vmov.i32 d3, #0 + 0xf2804010, // vmov.i32 d4, #0 + 0xf2805010, // vmov.i32 d5, #0 + 0xf2806010, // vmov.i32 d6, #0 + 0xf2807010, // vmov.i32 d7, #0 + 0xe12fff13, // bx r3 +}; static const unsigned int armv7_sk_just_return[] = { 0xe12fff1e, // bx lr }; @@ -1066,9 +1090,62 @@ static const unsigned int armv7_sk_linear_gradient_2stops[] = { 0xf22001b0, // vorr d0, d16, d16 0xe12fff1c, // bx ip }; +static const unsigned char hsw_sk_start_pipeline[] = { + 0x48,0xad, // lods %ds:(%rsi),%rax + 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0 + 0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1 + 0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2 + 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3 + 0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4 + 0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5 + 0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6 + 0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7 + 0xff,0xe0, // jmpq *%rax +}; +static const unsigned char hsw_sk_start_pipeline_ms[] = { + 0x56, // push %rsi + 0x57, // push %rdi + 0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp + 0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps %xmm15,0x90(%rsp) + 0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps %xmm14,0x80(%rsp) + 0xc5,0x78,0x29,0x6c,0x24,0x70, // vmovaps %xmm13,0x70(%rsp) + 0xc5,0x78,0x29,0x64,0x24,0x60, // vmovaps %xmm12,0x60(%rsp) + 0xc5,0x78,0x29,0x5c,0x24,0x50, // vmovaps %xmm11,0x50(%rsp) + 0xc5,0x78,0x29,0x54,0x24,0x40, // vmovaps %xmm10,0x40(%rsp) + 0xc5,0x78,0x29,0x4c,0x24,0x30, // vmovaps %xmm9,0x30(%rsp) + 0xc5,0x78,0x29,0x44,0x24,0x20, // vmovaps %xmm8,0x20(%rsp) + 0xc5,0xf8,0x29,0x7c,0x24,0x10, // vmovaps %xmm7,0x10(%rsp) + 0xc5,0xf8,0x29,0x34,0x24, // vmovaps %xmm6,(%rsp) + 0x48,0x89,0xd6, // mov %rdx,%rsi + 0x48,0xad, // lods %ds:(%rsi),%rax + 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0 + 0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1 + 0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2 + 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3 + 0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4 + 0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5 + 0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6 + 0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7 + 0x48,0x89,0xcf, // mov %rcx,%rdi + 0x4c,0x89,0xc2, // mov %r8,%rdx + 0xff,0xd0, // callq *%rax + 0xc5,0xf8,0x28,0x34,0x24, // vmovaps (%rsp),%xmm6 + 0xc5,0xf8,0x28,0x7c,0x24,0x10, // vmovaps 0x10(%rsp),%xmm7 + 0xc5,0x78,0x28,0x44,0x24,0x20, // vmovaps 0x20(%rsp),%xmm8 + 0xc5,0x78,0x28,0x4c,0x24,0x30, // vmovaps 0x30(%rsp),%xmm9 + 0xc5,0x78,0x28,0x54,0x24,0x40, // vmovaps 0x40(%rsp),%xmm10 + 0xc5,0x78,0x28,0x5c,0x24,0x50, // vmovaps 0x50(%rsp),%xmm11 + 0xc5,0x78,0x28,0x64,0x24,0x60, // vmovaps 0x60(%rsp),%xmm12 + 0xc5,0x78,0x28,0x6c,0x24,0x70, // vmovaps 0x70(%rsp),%xmm13 + 0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps 0x80(%rsp),%xmm14 + 0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps 0x90(%rsp),%xmm15 + 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp + 0x5f, // pop %rdi + 0x5e, // pop %rsi + 0xc5,0xf8,0x77, // vzeroupper + 0xc3, // retq +}; static const unsigned char hsw_sk_just_return[] = { - 0xc5,0xf8,0x77, // vzeroupper - 0xc5,0xf8,0x77, // vzeroupper 0xc3, // retq }; static const unsigned char hsw_sk_seed_shader[] = { @@ -1514,6 +1591,60 @@ static const unsigned char hsw_sk_linear_gradient_2stops[] = { 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0 0xff,0xe0, // jmpq *%rax }; +static const unsigned char sse41_sk_start_pipeline[] = { + 0x48,0xad, // lods %ds:(%rsi),%rax + 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0 + 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1 + 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2 + 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3 + 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4 + 0x0f,0x57,0xed, // xorps %xmm5,%xmm5 + 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6 + 0x0f,0x57,0xff, // xorps %xmm7,%xmm7 + 0xff,0xe0, // jmpq *%rax +}; +static const unsigned char sse41_sk_start_pipeline_ms[] = { + 0x56, // push %rsi + 0x57, // push %rdi + 0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp + 0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp) + 0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp) + 0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp) + 0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp) + 0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp) + 0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp) + 0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp) + 0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp) + 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp) + 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp) + 0x48,0x89,0xd6, // mov %rdx,%rsi + 0x48,0xad, // lods %ds:(%rsi),%rax + 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0 + 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1 + 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2 + 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3 + 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4 + 0x0f,0x57,0xed, // xorps %xmm5,%xmm5 + 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6 + 0x0f,0x57,0xff, // xorps %xmm7,%xmm7 + 0x48,0x89,0xcf, // mov %rcx,%rdi + 0x4c,0x89,0xc2, // mov %r8,%rdx + 0xff,0xd0, // callq *%rax + 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6 + 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7 + 0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8 + 0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9 + 0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10 + 0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11 + 0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12 + 0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13 + 0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14 + 0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15 + 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp + 0x5f, // pop %rdi + 0x5e, // pop %rsi + 0xc3, // retq +}; static const unsigned char sse41_sk_just_return[] = { 0xc3, // retq }; @@ -2145,6 +2276,60 @@ static const unsigned char sse41_sk_linear_gradient_2stops[] = { 0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0 0xff,0xe0, // jmpq *%rax }; +static const unsigned char sse2_sk_start_pipeline[] = { + 0x48,0xad, // lods %ds:(%rsi),%rax + 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0 + 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1 + 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2 + 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3 + 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4 + 0x0f,0x57,0xed, // xorps %xmm5,%xmm5 + 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6 + 0x0f,0x57,0xff, // xorps %xmm7,%xmm7 + 0xff,0xe0, // jmpq *%rax +}; +static const unsigned char sse2_sk_start_pipeline_ms[] = { + 0x56, // push %rsi + 0x57, // push %rdi + 0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp + 0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp) + 0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp) + 0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp) + 0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp) + 0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp) + 0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp) + 0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp) + 0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp) + 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp) + 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp) + 0x48,0x89,0xd6, // mov %rdx,%rsi + 0x48,0xad, // lods %ds:(%rsi),%rax + 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0 + 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1 + 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2 + 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3 + 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4 + 0x0f,0x57,0xed, // xorps %xmm5,%xmm5 + 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6 + 0x0f,0x57,0xff, // xorps %xmm7,%xmm7 + 0x48,0x89,0xcf, // mov %rcx,%rdi + 0x4c,0x89,0xc2, // mov %r8,%rdx + 0xff,0xd0, // callq *%rax + 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6 + 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7 + 0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8 + 0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9 + 0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10 + 0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11 + 0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12 + 0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13 + 0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14 + 0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15 + 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp + 0x5f, // pop %rdi + 0x5e, // pop %rsi + 0xc3, // retq +}; static const unsigned char sse2_sk_just_return[] = { 0xc3, // retq }; diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index fa771c03e6..0c310515e3 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -200,13 +200,24 @@ static void* load_and_inc(void**& program) { static void name##_k(size_t& x, void* ctx, K* k, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) -// A glue Stage to end the tail call chain, finally returning to the caller. -extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) { -#if defined(JUMPER) && defined(__AVX2__) - _mm256_zeroupper(); -#endif +// Some glue stages that don't fit the normal pattern of stages. + +extern "C" void sk_start_pipeline(size_t x, void** program, K* k) { + auto next = (Stage*)load_and_inc(program); + F v{}; // TODO: faster uninitialized? + next(x,program,k, v,v,v,v, v,v,v,v); } +#if defined(JUMPER) && defined(__x86_64__) + __attribute__((ms_abi)) + extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) { + sk_start_pipeline(x,program,k); + } +#endif + +// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller). +extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {} + // We can now define Stages! // Some things to keep in mind while writing Stages: