From d9e82256e8591e7da1e2afde1e4c6c49c48ddc96 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Wed, 22 Feb 2017 14:17:32 -0500 Subject: [PATCH] SkJumper: set_rgb and swap_rb swap_rb is a big limiting factor on Windows and Linux. set_rgb just happened to be nearby and easy. Change-Id: Ic529c7578eeb278476821090127fa8fb1f70c04f Reviewed-on: https://skia-review.googlesource.com/8859 Reviewed-by: Herb Derby Commit-Queue: Mike Klein --- src/jumper/SkJumper.cpp | 2 + src/jumper/SkJumper_generated.S | 101 ++++++++++++++++++++++++++++ src/jumper/SkJumper_generated_win.S | 60 +++++++++++++++++ src/jumper/SkJumper_stages.cpp | 12 ++++ 4 files changed, 175 insertions(+) diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 3c7aca6057..cdcbb891f0 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -37,6 +37,8 @@ static K kConstants = { M(clamp_0) \ M(clamp_1) \ M(clamp_a) \ + M(set_rgb) \ + M(swap_rb) \ M(swap) \ M(move_src_dst) \ M(move_dst_src) \ diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index af76618632..5d7ec003a2 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -171,6 +171,24 @@ _sk_clamp_a_aarch64: .long 0x4ea3f442 // fmin v2.4s, v2.4s, v3.4s .long 0xd61f0060 // br x3 +.globl _sk_set_rgb_aarch64 +_sk_set_rgb_aarch64: + .long 0xa8c10c28 // ldp x8, x3, [x1],#16 + .long 0x91002109 // add x9, x8, #0x8 + .long 0x4ddfc900 // ld1r {v0.4s}, [x8], #4 + .long 0x4d40c922 // ld1r {v2.4s}, [x9] + .long 0x4d40c901 // ld1r {v1.4s}, [x8] + .long 0xd61f0060 // br x3 + +.globl _sk_swap_rb_aarch64 +_sk_swap_rb_aarch64: + .long 0xf9400423 // ldr x3, [x1,#8] + .long 0x4ea01c10 // mov v16.16b, v0.16b + .long 0x91004021 // add x1, x1, #0x10 + .long 0x4ea21c40 // mov v0.16b, v2.16b + .long 0x4eb01e02 // mov v2.16b, v16.16b + .long 0xd61f0060 // br x3 + .globl _sk_swap_aarch64 _sk_swap_aarch64: .long 0xf9400423 // ldr x3, [x1,#8] @@ -762,6 +780,29 @@ _sk_clamp_a_vfp4: .long 0xf2222f03 // vmin.f32 d2, d2, d3 .long 0xe12fff13 // bx r3 +.globl _sk_set_rgb_vfp4 +_sk_set_rgb_vfp4: + .long 0xe92d4800 // push {fp, lr} + .long 0xe591e000 // ldr lr, [r1] + .long 0xe591c004 // ldr ip, [r1, #4] + .long 0xe2811008 // add r1, r1, #8 + .long 0xe28e3008 // add r3, lr, #8 + .long 0xf4ae0c9f // vld1.32 {d0[]}, [lr :32] + .long 0xf4a32c9f // vld1.32 {d2[]}, [r3 :32] + .long 0xe28e3004 // add r3, lr, #4 + .long 0xf4a31c9f // vld1.32 {d1[]}, [r3 :32] + .long 0xe8bd4800 // pop {fp, lr} + .long 0xe12fff1c // bx ip + +.globl _sk_swap_rb_vfp4 +_sk_swap_rb_vfp4: + .long 0xeef00b40 // vmov.f64 d16, d0 + .long 0xe5913004 // ldr r3, [r1, #4] + .long 0xe2811008 // add r1, r1, #8 + .long 0xeeb00b42 // vmov.f64 d0, d2 + .long 0xeeb02b60 // vmov.f64 d2, d16 + .long 0xe12fff13 // bx r3 + .globl _sk_swap_vfp4 _sk_swap_vfp4: .long 0xeef00b43 // vmov.f64 d16, d3 @@ -1414,6 +1455,24 @@ _sk_clamp_a_hsw: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_set_rgb_hsw +_sk_set_rgb_hsw: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_rb_hsw +_sk_swap_rb_hsw: + .byte 0xc5,0x7c,0x28,0xc0 // vmovaps %ymm0,%ymm8 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc2 // vmovaps %ymm2,%ymm0 + .byte 0xc5,0x7c,0x29,0xc2 // vmovaps %ymm8,%ymm2 + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_swap_hsw _sk_swap_hsw: .byte 0xc5,0x7c,0x28,0xc3 // vmovaps %ymm3,%ymm8 @@ -1972,6 +2031,27 @@ _sk_clamp_a_sse41: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_set_rgb_sse41 +_sk_set_rgb_sse41: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xf3,0x0f,0x10,0x00 // movss (%rax),%xmm0 + .byte 0xf3,0x0f,0x10,0x48,0x04 // movss 0x4(%rax),%xmm1 + .byte 0x0f,0xc6,0xc0,0x00 // shufps $0x0,%xmm0,%xmm0 + .byte 0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm1,%xmm1 + .byte 0xf3,0x0f,0x10,0x50,0x08 // movss 0x8(%rax),%xmm2 + .byte 0x0f,0xc6,0xd2,0x00 // shufps $0x0,%xmm2,%xmm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_rb_sse41 +_sk_swap_rb_sse41: + .byte 0x44,0x0f,0x28,0xc0 // movaps %xmm0,%xmm8 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x0f,0x28,0xc2 // movaps %xmm2,%xmm0 + .byte 0x41,0x0f,0x28,0xd0 // movaps %xmm8,%xmm2 + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_swap_sse41 _sk_swap_sse41: .byte 0x44,0x0f,0x28,0xc3 // movaps %xmm3,%xmm8 @@ -2697,6 +2777,27 @@ _sk_clamp_a_sse2: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_set_rgb_sse2 +_sk_set_rgb_sse2: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xf3,0x0f,0x10,0x00 // movss (%rax),%xmm0 + .byte 0xf3,0x0f,0x10,0x48,0x04 // movss 0x4(%rax),%xmm1 + .byte 0x0f,0xc6,0xc0,0x00 // shufps $0x0,%xmm0,%xmm0 + .byte 0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm1,%xmm1 + .byte 0xf3,0x0f,0x10,0x50,0x08 // movss 0x8(%rax),%xmm2 + .byte 0x0f,0xc6,0xd2,0x00 // shufps $0x0,%xmm2,%xmm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_rb_sse2 +_sk_swap_rb_sse2: + .byte 0x44,0x0f,0x28,0xc0 // movaps %xmm0,%xmm8 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x0f,0x28,0xc2 // movaps %xmm2,%xmm0 + .byte 0x41,0x0f,0x28,0xd0 // movaps %xmm8,%xmm2 + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_swap_sse2 _sk_swap_sse2: .byte 0x44,0x0f,0x28,0xc3 // movaps %xmm3,%xmm8 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index d681d24c04..d2078b6b4f 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -190,6 +190,24 @@ _sk_clamp_a_hsw LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_set_rgb_hsw +_sk_set_rgb_hsw LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,226,125,24,0 ; vbroadcastss (%rax),%ymm0 + DB 196,226,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm1 + DB 196,226,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_swap_rb_hsw +_sk_swap_rb_hsw LABEL PROC + DB 197,124,40,192 ; vmovaps %ymm0,%ymm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,40,194 ; vmovaps %ymm2,%ymm0 + DB 197,124,41,194 ; vmovaps %ymm8,%ymm2 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_swap_hsw _sk_swap_hsw LABEL PROC DB 197,124,40,195 ; vmovaps %ymm3,%ymm8 @@ -775,6 +793,27 @@ _sk_clamp_a_sse41 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_set_rgb_sse41 +_sk_set_rgb_sse41 LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 243,15,16,0 ; movss (%rax),%xmm0 + DB 243,15,16,72,4 ; movss 0x4(%rax),%xmm1 + DB 15,198,192,0 ; shufps $0x0,%xmm0,%xmm0 + DB 15,198,201,0 ; shufps $0x0,%xmm1,%xmm1 + DB 243,15,16,80,8 ; movss 0x8(%rax),%xmm2 + DB 15,198,210,0 ; shufps $0x0,%xmm2,%xmm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_swap_rb_sse41 +_sk_swap_rb_sse41 LABEL PROC + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 15,40,194 ; movaps %xmm2,%xmm0 + DB 65,15,40,208 ; movaps %xmm8,%xmm2 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_swap_sse41 _sk_swap_sse41 LABEL PROC DB 68,15,40,195 ; movaps %xmm3,%xmm8 @@ -1527,6 +1566,27 @@ _sk_clamp_a_sse2 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_set_rgb_sse2 +_sk_set_rgb_sse2 LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 243,15,16,0 ; movss (%rax),%xmm0 + DB 243,15,16,72,4 ; movss 0x4(%rax),%xmm1 + DB 15,198,192,0 ; shufps $0x0,%xmm0,%xmm0 + DB 15,198,201,0 ; shufps $0x0,%xmm1,%xmm1 + DB 243,15,16,80,8 ; movss 0x8(%rax),%xmm2 + DB 15,198,210,0 ; shufps $0x0,%xmm2,%xmm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_swap_rb_sse2 +_sk_swap_rb_sse2 LABEL PROC + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 15,40,194 ; movaps %xmm2,%xmm0 + DB 65,15,40,208 ; movaps %xmm8,%xmm2 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_swap_sse2 _sk_swap_sse2 LABEL PROC DB 68,15,40,195 ; movaps %xmm3,%xmm8 diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index 0a5d702551..a691f2bc62 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -321,6 +321,18 @@ STAGE(clamp_a) { b = min(b, a); } +STAGE(set_rgb) { + auto rgb = (const float*)ctx; + r = rgb[0]; + g = rgb[1]; + b = rgb[2]; +} +STAGE(swap_rb) { + auto tmp = r; + r = b; + b = tmp; +} + STAGE(swap) { auto swap = [](F& v, F& dv) { auto tmp = v;