Move looping logic into start_pipeline().
This should be a big win on Windows, but I haven't timed there yet. On my Mac, it's a solid 2% speedup. PS1 was insufficiently ambitious, but for posterity, it was this: No need to vzeroupper twice on Windows. On Windows start_pipeline() will vzeroupper, so no need to do it in just_return(). Change-Id: I099320b95da85900a60ce96fdb7a216a36db1858 Reviewed-on: https://skia-review.googlesource.com/8821 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
parent
c663953504
commit
9ef63754a7
@ -65,23 +65,23 @@ using StageFn = void(void);
|
||||
extern "C" {
|
||||
|
||||
#if defined(__aarch64__)
|
||||
void ASM(start_pipeline,aarch64)(size_t, void**, K*);
|
||||
size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t);
|
||||
StageFn ASM(just_return,aarch64);
|
||||
#define M(st) StageFn ASM(st,aarch64);
|
||||
STAGES(M)
|
||||
#undef M
|
||||
|
||||
#elif defined(__arm__)
|
||||
void ASM(start_pipeline,vfp4)(size_t, void**, K*);
|
||||
size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t);
|
||||
StageFn ASM(just_return,vfp4);
|
||||
#define M(st) StageFn ASM(st,vfp4);
|
||||
STAGES(M)
|
||||
#undef M
|
||||
|
||||
#elif defined(__x86_64__) || defined(_M_X64)
|
||||
void ASM(start_pipeline,hsw )(size_t, void**, K*);
|
||||
void ASM(start_pipeline,sse41)(size_t, void**, K*);
|
||||
void ASM(start_pipeline,sse2 )(size_t, void**, K*);
|
||||
size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t);
|
||||
size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
|
||||
size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
|
||||
|
||||
StageFn ASM(just_return,hsw),
|
||||
ASM(just_return,sse41),
|
||||
@ -99,7 +99,7 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// Portable, single-pixel stages.
|
||||
void sk_start_pipeline(size_t, void**, K*);
|
||||
size_t sk_start_pipeline(size_t, void**, K*, size_t);
|
||||
StageFn sk_just_return;
|
||||
#define M(st) StageFn sk_##st;
|
||||
STAGES(M)
|
||||
@ -171,7 +171,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
|
||||
auto build_and_run = [&](size_t stride,
|
||||
StageFn* (*lookup)(SkRasterPipeline::StockStage),
|
||||
StageFn* just_return,
|
||||
void (*start_pipeline)(size_t, void**, K*)) {
|
||||
size_t (*start_pipeline)(size_t, void**, K*, size_t)) {
|
||||
if (x + stride <= limit) {
|
||||
void** ip = program.get();
|
||||
for (auto&& st : fStages) {
|
||||
@ -184,10 +184,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
|
||||
}
|
||||
*ip = (void*)just_return;
|
||||
|
||||
while (x + stride <= limit) {
|
||||
start_pipeline(x, program.get(), &kConstants);
|
||||
x += stride;
|
||||
}
|
||||
x = start_pipeline(x, program.get(), &kConstants, limit);
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
@ -12,7 +12,21 @@
|
||||
|
||||
.globl _sk_start_pipeline_aarch64
|
||||
_sk_start_pipeline_aarch64:
|
||||
.long 0xf8408423 // ldr x3, [x1],#8
|
||||
.long 0xa9bc5ff8 // stp x24, x23, [sp,#-64]!
|
||||
.long 0xa90157f6 // stp x22, x21, [sp,#16]
|
||||
.long 0xa9024ff4 // stp x20, x19, [sp,#32]
|
||||
.long 0xa9037bfd // stp x29, x30, [sp,#48]
|
||||
.long 0x9100c3fd // add x29, sp, #0x30
|
||||
.long 0xaa0103f3 // mov x19, x1
|
||||
.long 0xf8408677 // ldr x23, [x19],#8
|
||||
.long 0xaa0003f6 // mov x22, x0
|
||||
.long 0xaa0303f4 // mov x20, x3
|
||||
.long 0xaa0203f5 // mov x21, x2
|
||||
.long 0x910012c8 // add x8, x22, #0x4
|
||||
.long 0xeb14011f // cmp x8, x20
|
||||
.long 0x54000069 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
|
||||
.long 0xaa1603e0 // mov x0, x22
|
||||
.long 0x14000012 // b 80 <sk_start_pipeline_aarch64+0x80>
|
||||
.long 0x6f00e400 // movi v0.2d, #0x0
|
||||
.long 0x6f00e401 // movi v1.2d, #0x0
|
||||
.long 0x6f00e402 // movi v2.2d, #0x0
|
||||
@ -21,7 +35,20 @@ _sk_start_pipeline_aarch64:
|
||||
.long 0x6f00e405 // movi v5.2d, #0x0
|
||||
.long 0x6f00e406 // movi v6.2d, #0x0
|
||||
.long 0x6f00e407 // movi v7.2d, #0x0
|
||||
.long 0xd61f0060 // br x3
|
||||
.long 0xaa1603e0 // mov x0, x22
|
||||
.long 0xaa1303e1 // mov x1, x19
|
||||
.long 0xaa1503e2 // mov x2, x21
|
||||
.long 0xd63f02e0 // blr x23
|
||||
.long 0x910022c8 // add x8, x22, #0x8
|
||||
.long 0x910012c0 // add x0, x22, #0x4
|
||||
.long 0xeb14011f // cmp x8, x20
|
||||
.long 0xaa0003f6 // mov x22, x0
|
||||
.long 0x54fffe09 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
|
||||
.long 0xa9437bfd // ldp x29, x30, [sp,#48]
|
||||
.long 0xa9424ff4 // ldp x20, x19, [sp,#32]
|
||||
.long 0xa94157f6 // ldp x22, x21, [sp,#16]
|
||||
.long 0xa8c45ff8 // ldp x24, x23, [sp],#64
|
||||
.long 0xd65f03c0 // ret
|
||||
|
||||
.globl _sk_just_return_aarch64
|
||||
_sk_just_return_aarch64:
|
||||
@ -551,16 +578,32 @@ _sk_linear_gradient_2stops_aarch64:
|
||||
|
||||
.globl _sk_start_pipeline_vfp4
|
||||
_sk_start_pipeline_vfp4:
|
||||
.long 0xe4913004 // ldr r3, [r1], #4
|
||||
.long 0xe92d41f0 // push {r4, r5, r6, r7, r8, lr}
|
||||
.long 0xe1a07001 // mov r7, r1
|
||||
.long 0xe1a04000 // mov r4, r0
|
||||
.long 0xe1a05003 // mov r5, r3
|
||||
.long 0xe1a08002 // mov r8, r2
|
||||
.long 0xe4976004 // ldr r6, [r7], #4
|
||||
.long 0xe2840002 // add r0, r4, #2
|
||||
.long 0xea00000d // b 58 <sk_start_pipeline_vfp4+0x58>
|
||||
.long 0xf2800010 // vmov.i32 d0, #0
|
||||
.long 0xe1a00004 // mov r0, r4
|
||||
.long 0xf2801010 // vmov.i32 d1, #0
|
||||
.long 0xe1a01007 // mov r1, r7
|
||||
.long 0xf2802010 // vmov.i32 d2, #0
|
||||
.long 0xe1a02008 // mov r2, r8
|
||||
.long 0xf2803010 // vmov.i32 d3, #0
|
||||
.long 0xf2804010 // vmov.i32 d4, #0
|
||||
.long 0xf2805010 // vmov.i32 d5, #0
|
||||
.long 0xf2806010 // vmov.i32 d6, #0
|
||||
.long 0xf2807010 // vmov.i32 d7, #0
|
||||
.long 0xe12fff13 // bx r3
|
||||
.long 0xe12fff36 // blx r6
|
||||
.long 0xe2840004 // add r0, r4, #4
|
||||
.long 0xe2844002 // add r4, r4, #2
|
||||
.long 0xe1500005 // cmp r0, r5
|
||||
.long 0x9affffef // bls 20 <sk_start_pipeline_vfp4+0x20>
|
||||
.long 0xe1a00004 // mov r0, r4
|
||||
.long 0xe8bd81f0 // pop {r4, r5, r6, r7, r8, pc}
|
||||
|
||||
.globl _sk_just_return_vfp4
|
||||
_sk_just_return_vfp4:
|
||||
@ -1152,7 +1195,22 @@ _sk_linear_gradient_2stops_vfp4:
|
||||
|
||||
.globl _sk_start_pipeline_hsw
|
||||
_sk_start_pipeline_hsw:
|
||||
.byte 0x41,0x57 // push %r15
|
||||
.byte 0x41,0x56 // push %r14
|
||||
.byte 0x41,0x55 // push %r13
|
||||
.byte 0x41,0x54 // push %r12
|
||||
.byte 0x53 // push %rbx
|
||||
.byte 0x49,0x89,0xcf // mov %rcx,%r15
|
||||
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
|
||||
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0x49,0x89,0xc4 // mov %rax,%r12
|
||||
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
|
||||
.byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
|
||||
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
|
||||
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_hsw+0x28>
|
||||
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
|
||||
.byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_hsw+0x64>
|
||||
.byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
|
||||
.byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
|
||||
.byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
|
||||
@ -1161,7 +1219,22 @@ _sk_start_pipeline_hsw:
|
||||
.byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
|
||||
.byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
|
||||
.byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
|
||||
.byte 0x4c,0x89,0xee // mov %r13,%rsi
|
||||
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
|
||||
.byte 0x41,0xff,0xd4 // callq *%r12
|
||||
.byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
|
||||
.byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx
|
||||
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
|
||||
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
|
||||
.byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_hsw+0x28>
|
||||
.byte 0x5b // pop %rbx
|
||||
.byte 0x41,0x5c // pop %r12
|
||||
.byte 0x41,0x5d // pop %r13
|
||||
.byte 0x41,0x5e // pop %r14
|
||||
.byte 0x41,0x5f // pop %r15
|
||||
.byte 0xc5,0xf8,0x77 // vzeroupper
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_hsw
|
||||
_sk_just_return_hsw:
|
||||
@ -1640,7 +1713,22 @@ _sk_linear_gradient_2stops_hsw:
|
||||
|
||||
.globl _sk_start_pipeline_sse41
|
||||
_sk_start_pipeline_sse41:
|
||||
.byte 0x41,0x57 // push %r15
|
||||
.byte 0x41,0x56 // push %r14
|
||||
.byte 0x41,0x55 // push %r13
|
||||
.byte 0x41,0x54 // push %r12
|
||||
.byte 0x53 // push %rbx
|
||||
.byte 0x49,0x89,0xcf // mov %rcx,%r15
|
||||
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
|
||||
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0x49,0x89,0xc4 // mov %rax,%r12
|
||||
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
|
||||
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
|
||||
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
|
||||
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse41+0x28>
|
||||
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
|
||||
.byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse41+0x5c>
|
||||
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
|
||||
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
|
||||
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
|
||||
@ -1649,7 +1737,21 @@ _sk_start_pipeline_sse41:
|
||||
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
|
||||
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
|
||||
.byte 0x4c,0x89,0xee // mov %r13,%rsi
|
||||
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
|
||||
.byte 0x41,0xff,0xd4 // callq *%r12
|
||||
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
|
||||
.byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
|
||||
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
|
||||
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
|
||||
.byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse41+0x28>
|
||||
.byte 0x5b // pop %rbx
|
||||
.byte 0x41,0x5c // pop %r12
|
||||
.byte 0x41,0x5d // pop %r13
|
||||
.byte 0x41,0x5e // pop %r14
|
||||
.byte 0x41,0x5f // pop %r15
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_sse41
|
||||
_sk_just_return_sse41:
|
||||
@ -2312,7 +2414,22 @@ _sk_linear_gradient_2stops_sse41:
|
||||
|
||||
.globl _sk_start_pipeline_sse2
|
||||
_sk_start_pipeline_sse2:
|
||||
.byte 0x41,0x57 // push %r15
|
||||
.byte 0x41,0x56 // push %r14
|
||||
.byte 0x41,0x55 // push %r13
|
||||
.byte 0x41,0x54 // push %r12
|
||||
.byte 0x53 // push %rbx
|
||||
.byte 0x49,0x89,0xcf // mov %rcx,%r15
|
||||
.byte 0x49,0x89,0xd6 // mov %rdx,%r14
|
||||
.byte 0x48,0x89,0xfb // mov %rdi,%rbx
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0x49,0x89,0xc4 // mov %rax,%r12
|
||||
.byte 0x49,0x89,0xf5 // mov %rsi,%r13
|
||||
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
|
||||
.byte 0x4c,0x39,0xf8 // cmp %r15,%rax
|
||||
.byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse2+0x28>
|
||||
.byte 0x48,0x89,0xd8 // mov %rbx,%rax
|
||||
.byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse2+0x5c>
|
||||
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
|
||||
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
|
||||
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
|
||||
@ -2321,7 +2438,21 @@ _sk_start_pipeline_sse2:
|
||||
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
|
||||
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
.byte 0x48,0x89,0xdf // mov %rbx,%rdi
|
||||
.byte 0x4c,0x89,0xee // mov %r13,%rsi
|
||||
.byte 0x4c,0x89,0xf2 // mov %r14,%rdx
|
||||
.byte 0x41,0xff,0xd4 // callq *%r12
|
||||
.byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
|
||||
.byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
|
||||
.byte 0x4c,0x39,0xfb // cmp %r15,%rbx
|
||||
.byte 0x48,0x89,0xc3 // mov %rax,%rbx
|
||||
.byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse2+0x28>
|
||||
.byte 0x5b // pop %rbx
|
||||
.byte 0x41,0x5c // pop %r12
|
||||
.byte 0x41,0x5d // pop %r13
|
||||
.byte 0x41,0x5e // pop %r14
|
||||
.byte 0x41,0x5f // pop %r15
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_sse2
|
||||
_sk_just_return_sse2:
|
||||
|
@ -10,9 +10,14 @@ _text SEGMENT
|
||||
|
||||
PUBLIC _sk_start_pipeline_hsw
|
||||
_sk_start_pipeline_hsw LABEL PROC
|
||||
DB 65,87 ; push %r15
|
||||
DB 65,86 ; push %r14
|
||||
DB 65,85 ; push %r13
|
||||
DB 65,84 ; push %r12
|
||||
DB 86 ; push %rsi
|
||||
DB 87 ; push %rdi
|
||||
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
|
||||
DB 83 ; push %rbx
|
||||
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
|
||||
DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp)
|
||||
DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp)
|
||||
DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp)
|
||||
@ -23,8 +28,18 @@ _sk_start_pipeline_hsw LABEL PROC
|
||||
DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp)
|
||||
DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp)
|
||||
DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp)
|
||||
DB 77,137,207 ; mov %r9,%r15
|
||||
DB 77,137,198 ; mov %r8,%r14
|
||||
DB 72,137,203 ; mov %rcx,%rbx
|
||||
DB 72,137,214 ; mov %rdx,%rsi
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 73,137,196 ; mov %rax,%r12
|
||||
DB 73,137,245 ; mov %rsi,%r13
|
||||
DB 72,141,67,8 ; lea 0x8(%rbx),%rax
|
||||
DB 76,57,248 ; cmp %r15,%rax
|
||||
DB 118,5 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
|
||||
DB 72,137,216 ; mov %rbx,%rax
|
||||
DB 235,60 ; jmp b1 <_sk_start_pipeline_hsw+0xb1>
|
||||
DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
|
||||
DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
|
||||
DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
|
||||
@ -33,9 +48,15 @@ _sk_start_pipeline_hsw LABEL PROC
|
||||
DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
|
||||
DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
|
||||
DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
|
||||
DB 72,137,207 ; mov %rcx,%rdi
|
||||
DB 76,137,194 ; mov %r8,%rdx
|
||||
DB 255,208 ; callq *%rax
|
||||
DB 72,137,223 ; mov %rbx,%rdi
|
||||
DB 76,137,238 ; mov %r13,%rsi
|
||||
DB 76,137,242 ; mov %r14,%rdx
|
||||
DB 65,255,212 ; callq *%r12
|
||||
DB 72,141,67,8 ; lea 0x8(%rbx),%rax
|
||||
DB 72,131,195,16 ; add $0x10,%rbx
|
||||
DB 76,57,251 ; cmp %r15,%rbx
|
||||
DB 72,137,195 ; mov %rax,%rbx
|
||||
DB 118,196 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
|
||||
DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6
|
||||
DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7
|
||||
DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8
|
||||
@ -46,15 +67,19 @@ _sk_start_pipeline_hsw LABEL PROC
|
||||
DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13
|
||||
DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14
|
||||
DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15
|
||||
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
|
||||
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
|
||||
DB 91 ; pop %rbx
|
||||
DB 95 ; pop %rdi
|
||||
DB 94 ; pop %rsi
|
||||
DB 65,92 ; pop %r12
|
||||
DB 65,93 ; pop %r13
|
||||
DB 65,94 ; pop %r14
|
||||
DB 65,95 ; pop %r15
|
||||
DB 197,248,119 ; vzeroupper
|
||||
DB 195 ; retq
|
||||
|
||||
PUBLIC _sk_just_return_hsw
|
||||
_sk_just_return_hsw LABEL PROC
|
||||
DB 197,248,119 ; vzeroupper
|
||||
DB 195 ; retq
|
||||
|
||||
PUBLIC _sk_seed_shader_hsw
|
||||
@ -529,9 +554,14 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
|
||||
|
||||
PUBLIC _sk_start_pipeline_sse41
|
||||
_sk_start_pipeline_sse41 LABEL PROC
|
||||
DB 65,87 ; push %r15
|
||||
DB 65,86 ; push %r14
|
||||
DB 65,85 ; push %r13
|
||||
DB 65,84 ; push %r12
|
||||
DB 86 ; push %rsi
|
||||
DB 87 ; push %rdi
|
||||
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
|
||||
DB 83 ; push %rbx
|
||||
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
|
||||
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
|
||||
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
|
||||
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
|
||||
@ -542,8 +572,18 @@ _sk_start_pipeline_sse41 LABEL PROC
|
||||
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
|
||||
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
|
||||
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
|
||||
DB 77,137,207 ; mov %r9,%r15
|
||||
DB 77,137,198 ; mov %r8,%r14
|
||||
DB 72,137,203 ; mov %rcx,%rbx
|
||||
DB 72,137,214 ; mov %rdx,%rsi
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 73,137,196 ; mov %rax,%r12
|
||||
DB 73,137,245 ; mov %rsi,%r13
|
||||
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
|
||||
DB 76,57,248 ; cmp %r15,%rax
|
||||
DB 118,5 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
|
||||
DB 72,137,216 ; mov %rbx,%rax
|
||||
DB 235,52 ; jmp a7 <_sk_start_pipeline_sse41+0xa7>
|
||||
DB 15,87,192 ; xorps %xmm0,%xmm0
|
||||
DB 15,87,201 ; xorps %xmm1,%xmm1
|
||||
DB 15,87,210 ; xorps %xmm2,%xmm2
|
||||
@ -552,9 +592,15 @@ _sk_start_pipeline_sse41 LABEL PROC
|
||||
DB 15,87,237 ; xorps %xmm5,%xmm5
|
||||
DB 15,87,246 ; xorps %xmm6,%xmm6
|
||||
DB 15,87,255 ; xorps %xmm7,%xmm7
|
||||
DB 72,137,207 ; mov %rcx,%rdi
|
||||
DB 76,137,194 ; mov %r8,%rdx
|
||||
DB 255,208 ; callq *%rax
|
||||
DB 72,137,223 ; mov %rbx,%rdi
|
||||
DB 76,137,238 ; mov %r13,%rsi
|
||||
DB 76,137,242 ; mov %r14,%rdx
|
||||
DB 65,255,212 ; callq *%r12
|
||||
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
|
||||
DB 72,131,195,8 ; add $0x8,%rbx
|
||||
DB 76,57,251 ; cmp %r15,%rbx
|
||||
DB 72,137,195 ; mov %rax,%rbx
|
||||
DB 118,204 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
|
||||
DB 15,40,52,36 ; movaps (%rsp),%xmm6
|
||||
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
|
||||
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
|
||||
@ -565,9 +611,14 @@ _sk_start_pipeline_sse41 LABEL PROC
|
||||
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
|
||||
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
|
||||
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
|
||||
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
|
||||
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
|
||||
DB 91 ; pop %rbx
|
||||
DB 95 ; pop %rdi
|
||||
DB 94 ; pop %rsi
|
||||
DB 65,92 ; pop %r12
|
||||
DB 65,93 ; pop %r13
|
||||
DB 65,94 ; pop %r14
|
||||
DB 65,95 ; pop %r15
|
||||
DB 195 ; retq
|
||||
|
||||
PUBLIC _sk_just_return_sse41
|
||||
@ -1231,9 +1282,14 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC
|
||||
|
||||
PUBLIC _sk_start_pipeline_sse2
|
||||
_sk_start_pipeline_sse2 LABEL PROC
|
||||
DB 65,87 ; push %r15
|
||||
DB 65,86 ; push %r14
|
||||
DB 65,85 ; push %r13
|
||||
DB 65,84 ; push %r12
|
||||
DB 86 ; push %rsi
|
||||
DB 87 ; push %rdi
|
||||
DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
|
||||
DB 83 ; push %rbx
|
||||
DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
|
||||
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
|
||||
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
|
||||
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
|
||||
@ -1244,8 +1300,18 @@ _sk_start_pipeline_sse2 LABEL PROC
|
||||
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
|
||||
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
|
||||
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
|
||||
DB 77,137,207 ; mov %r9,%r15
|
||||
DB 77,137,198 ; mov %r8,%r14
|
||||
DB 72,137,203 ; mov %rcx,%rbx
|
||||
DB 72,137,214 ; mov %rdx,%rsi
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 73,137,196 ; mov %rax,%r12
|
||||
DB 73,137,245 ; mov %rsi,%r13
|
||||
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
|
||||
DB 76,57,248 ; cmp %r15,%rax
|
||||
DB 118,5 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
|
||||
DB 72,137,216 ; mov %rbx,%rax
|
||||
DB 235,52 ; jmp a7 <_sk_start_pipeline_sse2+0xa7>
|
||||
DB 15,87,192 ; xorps %xmm0,%xmm0
|
||||
DB 15,87,201 ; xorps %xmm1,%xmm1
|
||||
DB 15,87,210 ; xorps %xmm2,%xmm2
|
||||
@ -1254,9 +1320,15 @@ _sk_start_pipeline_sse2 LABEL PROC
|
||||
DB 15,87,237 ; xorps %xmm5,%xmm5
|
||||
DB 15,87,246 ; xorps %xmm6,%xmm6
|
||||
DB 15,87,255 ; xorps %xmm7,%xmm7
|
||||
DB 72,137,207 ; mov %rcx,%rdi
|
||||
DB 76,137,194 ; mov %r8,%rdx
|
||||
DB 255,208 ; callq *%rax
|
||||
DB 72,137,223 ; mov %rbx,%rdi
|
||||
DB 76,137,238 ; mov %r13,%rsi
|
||||
DB 76,137,242 ; mov %r14,%rdx
|
||||
DB 65,255,212 ; callq *%r12
|
||||
DB 72,141,67,4 ; lea 0x4(%rbx),%rax
|
||||
DB 72,131,195,8 ; add $0x8,%rbx
|
||||
DB 76,57,251 ; cmp %r15,%rbx
|
||||
DB 72,137,195 ; mov %rax,%rbx
|
||||
DB 118,204 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
|
||||
DB 15,40,52,36 ; movaps (%rsp),%xmm6
|
||||
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
|
||||
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
|
||||
@ -1267,9 +1339,14 @@ _sk_start_pipeline_sse2 LABEL PROC
|
||||
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
|
||||
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
|
||||
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
|
||||
DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
|
||||
DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
|
||||
DB 91 ; pop %rbx
|
||||
DB 95 ; pop %rdi
|
||||
DB 94 ; pop %rsi
|
||||
DB 65,92 ; pop %r12
|
||||
DB 65,93 ; pop %r13
|
||||
DB 65,94 ; pop %r14
|
||||
DB 65,95 ; pop %r15
|
||||
DB 195 ; retq
|
||||
|
||||
PUBLIC _sk_just_return_sse2
|
||||
|
@ -219,18 +219,19 @@ static void* load_and_inc(void**& program) {
|
||||
#if defined(JUMPER) && defined(WIN)
|
||||
__attribute__((ms_abi))
|
||||
#endif
|
||||
extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
|
||||
auto next = (Stage*)load_and_inc(program);
|
||||
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
|
||||
F v{}; // TODO: faster uninitialized?
|
||||
next(x,program,k, v,v,v,v, v,v,v,v);
|
||||
size_t stride = sizeof(F) / sizeof(float);
|
||||
auto start = (Stage*)load_and_inc(program);
|
||||
while (x + stride <= limit) {
|
||||
start(x,program,k, v,v,v,v, v,v,v,v);
|
||||
x += stride;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
|
||||
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
|
||||
#if defined(JUMPER) && defined(__AVX2__)
|
||||
asm("vzeroupper");
|
||||
#endif
|
||||
}
|
||||
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
|
||||
|
||||
// We can now define Stages!
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user