Move looping logic into start_pipeline().

This should be a big win on Windows, but I haven't timed there yet.
On my Mac, it's a solid 2% speedup.

PS1 was insufficiently ambitious, but here it is for posterity:
    No need to vzeroupper twice on Windows.

    On Windows start_pipeline() will vzeroupper,
    so no need to do it in just_return().

Change-Id: I099320b95da85900a60ce96fdb7a216a36db1858
Reviewed-on: https://skia-review.googlesource.com/8821
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2017-02-21 16:50:52 -05:00 committed by Skia Commit-Bot
parent c663953504
commit 9ef63754a7
4 changed files with 248 additions and 42 deletions

View File

@ -65,23 +65,23 @@ using StageFn = void(void);
extern "C" {
#if defined(__aarch64__)
void ASM(start_pipeline,aarch64)(size_t, void**, K*);
size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
STAGES(M)
#undef M
#elif defined(__arm__)
void ASM(start_pipeline,vfp4)(size_t, void**, K*);
size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
STAGES(M)
#undef M
#elif defined(__x86_64__) || defined(_M_X64)
void ASM(start_pipeline,hsw )(size_t, void**, K*);
void ASM(start_pipeline,sse41)(size_t, void**, K*);
void ASM(start_pipeline,sse2 )(size_t, void**, K*);
size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t);
size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
StageFn ASM(just_return,hsw),
ASM(just_return,sse41),
@ -99,7 +99,7 @@ extern "C" {
#endif
// Portable, single-pixel stages.
void sk_start_pipeline(size_t, void**, K*);
size_t sk_start_pipeline(size_t, void**, K*, size_t);
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
STAGES(M)
@ -171,7 +171,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
auto build_and_run = [&](size_t stride,
StageFn* (*lookup)(SkRasterPipeline::StockStage),
StageFn* just_return,
void (*start_pipeline)(size_t, void**, K*)) {
size_t (*start_pipeline)(size_t, void**, K*, size_t)) {
if (x + stride <= limit) {
void** ip = program.get();
for (auto&& st : fStages) {
@ -184,10 +184,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
}
*ip = (void*)just_return;
while (x + stride <= limit) {
start_pipeline(x, program.get(), &kConstants);
x += stride;
}
x = start_pipeline(x, program.get(), &kConstants, limit);
}
return true;
};

View File

@ -12,7 +12,21 @@
.globl _sk_start_pipeline_aarch64
_sk_start_pipeline_aarch64:
.long 0xf8408423 // ldr x3, [x1],#8
.long 0xa9bc5ff8 // stp x24, x23, [sp,#-64]!
.long 0xa90157f6 // stp x22, x21, [sp,#16]
.long 0xa9024ff4 // stp x20, x19, [sp,#32]
.long 0xa9037bfd // stp x29, x30, [sp,#48]
.long 0x9100c3fd // add x29, sp, #0x30
.long 0xaa0103f3 // mov x19, x1
.long 0xf8408677 // ldr x23, [x19],#8
.long 0xaa0003f6 // mov x22, x0
.long 0xaa0303f4 // mov x20, x3
.long 0xaa0203f5 // mov x21, x2
.long 0x910012c8 // add x8, x22, #0x4
.long 0xeb14011f // cmp x8, x20
.long 0x54000069 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
.long 0xaa1603e0 // mov x0, x22
.long 0x14000012 // b 80 <sk_start_pipeline_aarch64+0x80>
.long 0x6f00e400 // movi v0.2d, #0x0
.long 0x6f00e401 // movi v1.2d, #0x0
.long 0x6f00e402 // movi v2.2d, #0x0
@ -21,7 +35,20 @@ _sk_start_pipeline_aarch64:
.long 0x6f00e405 // movi v5.2d, #0x0
.long 0x6f00e406 // movi v6.2d, #0x0
.long 0x6f00e407 // movi v7.2d, #0x0
.long 0xd61f0060 // br x3
.long 0xaa1603e0 // mov x0, x22
.long 0xaa1303e1 // mov x1, x19
.long 0xaa1503e2 // mov x2, x21
.long 0xd63f02e0 // blr x23
.long 0x910022c8 // add x8, x22, #0x8
.long 0x910012c0 // add x0, x22, #0x4
.long 0xeb14011f // cmp x8, x20
.long 0xaa0003f6 // mov x22, x0
.long 0x54fffe09 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
.long 0xa9437bfd // ldp x29, x30, [sp,#48]
.long 0xa9424ff4 // ldp x20, x19, [sp,#32]
.long 0xa94157f6 // ldp x22, x21, [sp,#16]
.long 0xa8c45ff8 // ldp x24, x23, [sp],#64
.long 0xd65f03c0 // ret
.globl _sk_just_return_aarch64
_sk_just_return_aarch64:
@ -551,16 +578,32 @@ _sk_linear_gradient_2stops_aarch64:
.globl _sk_start_pipeline_vfp4
_sk_start_pipeline_vfp4:
.long 0xe4913004 // ldr r3, [r1], #4
.long 0xe92d41f0 // push {r4, r5, r6, r7, r8, lr}
.long 0xe1a07001 // mov r7, r1
.long 0xe1a04000 // mov r4, r0
.long 0xe1a05003 // mov r5, r3
.long 0xe1a08002 // mov r8, r2
.long 0xe4976004 // ldr r6, [r7], #4
.long 0xe2840002 // add r0, r4, #2
.long 0xea00000d // b 58 <sk_start_pipeline_vfp4+0x58>
.long 0xf2800010 // vmov.i32 d0, #0
.long 0xe1a00004 // mov r0, r4
.long 0xf2801010 // vmov.i32 d1, #0
.long 0xe1a01007 // mov r1, r7
.long 0xf2802010 // vmov.i32 d2, #0
.long 0xe1a02008 // mov r2, r8
.long 0xf2803010 // vmov.i32 d3, #0
.long 0xf2804010 // vmov.i32 d4, #0
.long 0xf2805010 // vmov.i32 d5, #0
.long 0xf2806010 // vmov.i32 d6, #0
.long 0xf2807010 // vmov.i32 d7, #0
.long 0xe12fff13 // bx r3
.long 0xe12fff36 // blx r6
.long 0xe2840004 // add r0, r4, #4
.long 0xe2844002 // add r4, r4, #2
.long 0xe1500005 // cmp r0, r5
.long 0x9affffef // bls 20 <sk_start_pipeline_vfp4+0x20>
.long 0xe1a00004 // mov r0, r4
.long 0xe8bd81f0 // pop {r4, r5, r6, r7, r8, pc}
.globl _sk_just_return_vfp4
_sk_just_return_vfp4:
@ -1152,7 +1195,22 @@ _sk_linear_gradient_2stops_vfp4:
.globl _sk_start_pipeline_hsw
_sk_start_pipeline_hsw:
.byte 0x41,0x57 // push %r15
.byte 0x41,0x56 // push %r14
.byte 0x41,0x55 // push %r13
.byte 0x41,0x54 // push %r12
.byte 0x53 // push %rbx
.byte 0x49,0x89,0xcf // mov %rcx,%r15
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0x49,0x89,0xc4 // mov %rax,%r12
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
.byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_hsw+0x28>
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
.byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_hsw+0x64>
.byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
.byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
.byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
@ -1161,7 +1219,22 @@ _sk_start_pipeline_hsw:
.byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
.byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
.byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
.byte 0xff,0xe0 // jmpq *%rax
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
.byte 0x4c,0x89,0xee // mov %r13,%rsi
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
.byte 0x41,0xff,0xd4 // callq *%r12
.byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
.byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
.byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_hsw+0x28>
.byte 0x5b // pop %rbx
.byte 0x41,0x5c // pop %r12
.byte 0x41,0x5d // pop %r13
.byte 0x41,0x5e // pop %r14
.byte 0x41,0x5f // pop %r15
.byte 0xc5,0xf8,0x77 // vzeroupper
.byte 0xc3 // retq
.globl _sk_just_return_hsw
_sk_just_return_hsw:
@ -1640,7 +1713,22 @@ _sk_linear_gradient_2stops_hsw:
.globl _sk_start_pipeline_sse41
_sk_start_pipeline_sse41:
.byte 0x41,0x57 // push %r15
.byte 0x41,0x56 // push %r14
.byte 0x41,0x55 // push %r13
.byte 0x41,0x54 // push %r12
.byte 0x53 // push %rbx
.byte 0x49,0x89,0xcf // mov %rcx,%r15
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0x49,0x89,0xc4 // mov %rax,%r12
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse41+0x28>
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
.byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse41+0x5c>
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
@ -1649,7 +1737,21 @@ _sk_start_pipeline_sse41:
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
.byte 0xff,0xe0 // jmpq *%rax
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
.byte 0x4c,0x89,0xee // mov %r13,%rsi
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
.byte 0x41,0xff,0xd4 // callq *%r12
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
.byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
.byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse41+0x28>
.byte 0x5b // pop %rbx
.byte 0x41,0x5c // pop %r12
.byte 0x41,0x5d // pop %r13
.byte 0x41,0x5e // pop %r14
.byte 0x41,0x5f // pop %r15
.byte 0xc3 // retq
.globl _sk_just_return_sse41
_sk_just_return_sse41:
@ -2312,7 +2414,22 @@ _sk_linear_gradient_2stops_sse41:
.globl _sk_start_pipeline_sse2
_sk_start_pipeline_sse2:
.byte 0x41,0x57 // push %r15
.byte 0x41,0x56 // push %r14
.byte 0x41,0x55 // push %r13
.byte 0x41,0x54 // push %r12
.byte 0x53 // push %rbx
.byte 0x49,0x89,0xcf // mov %rcx,%r15
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0x49,0x89,0xc4 // mov %rax,%r12
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse2+0x28>
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
.byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse2+0x5c>
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
@ -2321,7 +2438,21 @@ _sk_start_pipeline_sse2:
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
.byte 0xff,0xe0 // jmpq *%rax
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
.byte 0x4c,0x89,0xee // mov %r13,%rsi
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
.byte 0x41,0xff,0xd4 // callq *%r12
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
.byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
.byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse2+0x28>
.byte 0x5b // pop %rbx
.byte 0x41,0x5c // pop %r12
.byte 0x41,0x5d // pop %r13
.byte 0x41,0x5e // pop %r14
.byte 0x41,0x5f // pop %r15
.byte 0xc3 // retq
.globl _sk_just_return_sse2
_sk_just_return_sse2:

View File

@ -10,9 +10,14 @@ _text SEGMENT
PUBLIC _sk_start_pipeline_hsw
_sk_start_pipeline_hsw LABEL PROC
DB 65,87 ; push %r15
DB 65,86 ; push %r14
DB 65,85 ; push %r13
DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
DB 83 ; push %rbx
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp)
DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp)
DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp)
@ -23,8 +28,18 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp)
DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp)
DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp)
DB 77,137,207 ; mov %r9,%r15
DB 77,137,198 ; mov %r8,%r14
DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
DB 73,137,196 ; mov %rax,%r12
DB 73,137,245 ; mov %rsi,%r13
DB 72,141,67,8 ; lea 0x8(%rbx),%rax
DB 76,57,248 ; cmp %r15,%rax
DB 118,5 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
DB 72,137,216 ; mov %rbx,%rax
DB 235,60 ; jmp b1 <_sk_start_pipeline_hsw+0xb1>
DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
@ -33,9 +48,15 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
DB 72,137,207 ; mov %rcx,%rdi
DB 76,137,194 ; mov %r8,%rdx
DB 255,208 ; callq *%rax
DB 72,137,223 ; mov %rbx,%rdi
DB 76,137,238 ; mov %r13,%rsi
DB 76,137,242 ; mov %r14,%rdx
DB 65,255,212 ; callq *%r12
DB 72,141,67,8 ; lea 0x8(%rbx),%rax
DB 72,131,195,16 ; add $0x10,%rbx
DB 76,57,251 ; cmp %r15,%rbx
DB 72,137,195 ; mov %rax,%rbx
DB 118,196 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6
DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7
DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8
@ -46,15 +67,19 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13
DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14
DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
DB 65,92 ; pop %r12
DB 65,93 ; pop %r13
DB 65,94 ; pop %r14
DB 65,95 ; pop %r15
DB 197,248,119 ; vzeroupper
DB 195 ; retq
PUBLIC _sk_just_return_hsw
_sk_just_return_hsw LABEL PROC
DB 197,248,119 ; vzeroupper
DB 195 ; retq
PUBLIC _sk_seed_shader_hsw
@ -529,9 +554,14 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
PUBLIC _sk_start_pipeline_sse41
_sk_start_pipeline_sse41 LABEL PROC
DB 65,87 ; push %r15
DB 65,86 ; push %r14
DB 65,85 ; push %r13
DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
DB 83 ; push %rbx
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
@ -542,8 +572,18 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
DB 77,137,207 ; mov %r9,%r15
DB 77,137,198 ; mov %r8,%r14
DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
DB 73,137,196 ; mov %rax,%r12
DB 73,137,245 ; mov %rsi,%r13
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
DB 76,57,248 ; cmp %r15,%rax
DB 118,5 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
DB 72,137,216 ; mov %rbx,%rax
DB 235,52 ; jmp a7 <_sk_start_pipeline_sse41+0xa7>
DB 15,87,192 ; xorps %xmm0,%xmm0
DB 15,87,201 ; xorps %xmm1,%xmm1
DB 15,87,210 ; xorps %xmm2,%xmm2
@ -552,9 +592,15 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 15,87,237 ; xorps %xmm5,%xmm5
DB 15,87,246 ; xorps %xmm6,%xmm6
DB 15,87,255 ; xorps %xmm7,%xmm7
DB 72,137,207 ; mov %rcx,%rdi
DB 76,137,194 ; mov %r8,%rdx
DB 255,208 ; callq *%rax
DB 72,137,223 ; mov %rbx,%rdi
DB 76,137,238 ; mov %r13,%rsi
DB 76,137,242 ; mov %r14,%rdx
DB 65,255,212 ; callq *%r12
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
DB 72,131,195,8 ; add $0x8,%rbx
DB 76,57,251 ; cmp %r15,%rbx
DB 72,137,195 ; mov %rax,%rbx
DB 118,204 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
DB 15,40,52,36 ; movaps (%rsp),%xmm6
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
@ -565,9 +611,14 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
DB 65,92 ; pop %r12
DB 65,93 ; pop %r13
DB 65,94 ; pop %r14
DB 65,95 ; pop %r15
DB 195 ; retq
PUBLIC _sk_just_return_sse41
@ -1231,9 +1282,14 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC
PUBLIC _sk_start_pipeline_sse2
_sk_start_pipeline_sse2 LABEL PROC
DB 65,87 ; push %r15
DB 65,86 ; push %r14
DB 65,85 ; push %r13
DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
DB 83 ; push %rbx
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
@ -1244,8 +1300,18 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
DB 77,137,207 ; mov %r9,%r15
DB 77,137,198 ; mov %r8,%r14
DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
DB 73,137,196 ; mov %rax,%r12
DB 73,137,245 ; mov %rsi,%r13
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
DB 76,57,248 ; cmp %r15,%rax
DB 118,5 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
DB 72,137,216 ; mov %rbx,%rax
DB 235,52 ; jmp a7 <_sk_start_pipeline_sse2+0xa7>
DB 15,87,192 ; xorps %xmm0,%xmm0
DB 15,87,201 ; xorps %xmm1,%xmm1
DB 15,87,210 ; xorps %xmm2,%xmm2
@ -1254,9 +1320,15 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 15,87,237 ; xorps %xmm5,%xmm5
DB 15,87,246 ; xorps %xmm6,%xmm6
DB 15,87,255 ; xorps %xmm7,%xmm7
DB 72,137,207 ; mov %rcx,%rdi
DB 76,137,194 ; mov %r8,%rdx
DB 255,208 ; callq *%rax
DB 72,137,223 ; mov %rbx,%rdi
DB 76,137,238 ; mov %r13,%rsi
DB 76,137,242 ; mov %r14,%rdx
DB 65,255,212 ; callq *%r12
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
DB 72,131,195,8 ; add $0x8,%rbx
DB 76,57,251 ; cmp %r15,%rbx
DB 72,137,195 ; mov %rax,%rbx
DB 118,204 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
DB 15,40,52,36 ; movaps (%rsp),%xmm6
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
@ -1267,9 +1339,14 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
DB 65,92 ; pop %r12
DB 65,93 ; pop %r13
DB 65,94 ; pop %r14
DB 65,95 ; pop %r15
DB 195 ; retq
PUBLIC _sk_just_return_sse2

View File

@ -219,18 +219,19 @@ static void* load_and_inc(void**& program) {
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
auto next = (Stage*)load_and_inc(program);
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{}; // TODO: faster uninitialized?
next(x,program,k, v,v,v,v, v,v,v,v);
size_t stride = sizeof(F) / sizeof(float);
auto start = (Stage*)load_and_inc(program);
while (x + stride <= limit) {
start(x,program,k, v,v,v,v, v,v,v,v);
x += stride;
}
return x;
}
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
#if defined(JUMPER) && defined(__AVX2__)
asm("vzeroupper");
#endif
}
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
// We can now define Stages!