jumper, add load_f32()
Change-Id: I71d85ffe29bc11678ff1e696fa4a2c93d0b4fcbe Reviewed-on: https://skia-review.googlesource.com/11446 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
8a823faeba
commit
14987ebb97
@ -99,6 +99,7 @@ static K kConstants = {
|
||||
M(store_f16) \
|
||||
M(load_u16_be) \
|
||||
M(store_u16_be) \
|
||||
M(load_f32) \
|
||||
M(store_f32) \
|
||||
M(luminance_to_alpha) \
|
||||
M(matrix_2x3) \
|
||||
|
@ -1678,6 +1678,15 @@ _sk_store_u16_be_aarch64:
|
||||
.long 0x91004021 // add x1, x1, #0x10
|
||||
.long 0xd61f0060 // br x3
|
||||
|
||||
HIDDEN _sk_load_f32_aarch64
|
||||
.globl _sk_load_f32_aarch64
|
||||
_sk_load_f32_aarch64:
|
||||
.long 0xa8c10c28 // ldp x8, x3, [x1], #16
|
||||
.long 0xf9400108 // ldr x8, [x8]
|
||||
.long 0x8b001108 // add x8, x8, x0, lsl #4
|
||||
.long 0x4c400900 // ld4 {v0.4s-v3.4s}, [x8]
|
||||
.long 0xd61f0060 // br x3
|
||||
|
||||
HIDDEN _sk_store_f32_aarch64
|
||||
.globl _sk_store_f32_aarch64
|
||||
_sk_store_f32_aarch64:
|
||||
@ -3760,6 +3769,16 @@ _sk_store_u16_be_vfp4:
|
||||
.long 0x477fff00 // .word 0x477fff00
|
||||
.long 0x477fff00 // .word 0x477fff00
|
||||
|
||||
HIDDEN _sk_load_f32_vfp4
|
||||
.globl _sk_load_f32_vfp4
|
||||
_sk_load_f32_vfp4:
|
||||
.long 0xe8911008 // ldm r1, {r3, ip}
|
||||
.long 0xe2811008 // add r1, r1, #8
|
||||
.long 0xe5933000 // ldr r3, [r3]
|
||||
.long 0xe0833200 // add r3, r3, r0, lsl #4
|
||||
.long 0xf423008f // vld4.32 {d0-d3}, [r3]
|
||||
.long 0xe12fff1c // bx ip
|
||||
|
||||
HIDDEN _sk_store_f32_vfp4
|
||||
.globl _sk_store_f32_vfp4
|
||||
_sk_store_f32_vfp4:
|
||||
@ -5768,7 +5787,7 @@ _sk_load_4444_hsw:
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 233,255,255,255,225 // jmpq ffffffffe2001650 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff563>
|
||||
.byte 233,255,255,255,225 // jmpq ffffffffe2001650 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4a4>
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
@ -6190,6 +6209,56 @@ _sk_store_u16_be_hsw:
|
||||
.byte 196,65,121,214,68,248,48 // vmovq %xmm8,0x30(%r8,%rdi,8)
|
||||
.byte 235,174 // jmp 1c57 <_sk_store_u16_be_hsw+0xef>
|
||||
|
||||
HIDDEN _sk_load_f32_hsw
|
||||
.globl _sk_load_f32_hsw
|
||||
_sk_load_f32_hsw:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 72,131,249,7 // cmp $0x7,%rcx
|
||||
.byte 119,110 // ja 1d1f <_sk_load_f32_hsw+0x76>
|
||||
.byte 76,139,0 // mov (%rax),%r8
|
||||
.byte 76,141,12,189,0,0,0,0 // lea 0x0(,%rdi,4),%r9
|
||||
.byte 76,141,21,133,0,0,0 // lea 0x85(%rip),%r10 # 1d48 <_sk_load_f32_hsw+0x9f>
|
||||
.byte 73,99,4,138 // movslq (%r10,%rcx,4),%rax
|
||||
.byte 76,1,208 // add %r10,%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 196,3,125,24,68,136,112,1 // vinsertf128 $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
|
||||
.byte 196,131,125,24,92,136,96,1 // vinsertf128 $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
|
||||
.byte 196,131,125,24,76,136,80,1 // vinsertf128 $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
|
||||
.byte 196,131,125,24,84,136,64,1 // vinsertf128 $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
|
||||
.byte 196,129,121,16,68,136,48 // vmovupd 0x30(%r8,%r9,4),%xmm0
|
||||
.byte 196,195,125,13,192,12 // vblendpd $0xc,%ymm8,%ymm0,%ymm0
|
||||
.byte 196,1,121,16,68,136,32 // vmovupd 0x20(%r8,%r9,4),%xmm8
|
||||
.byte 196,99,61,13,203,12 // vblendpd $0xc,%ymm3,%ymm8,%ymm9
|
||||
.byte 196,129,121,16,92,136,16 // vmovupd 0x10(%r8,%r9,4),%xmm3
|
||||
.byte 196,99,101,13,209,12 // vblendpd $0xc,%ymm1,%ymm3,%ymm10
|
||||
.byte 196,129,121,16,12,136 // vmovupd (%r8,%r9,4),%xmm1
|
||||
.byte 196,227,117,13,202,12 // vblendpd $0xc,%ymm2,%ymm1,%ymm1
|
||||
.byte 196,193,116,20,210 // vunpcklps %ymm10,%ymm1,%ymm2
|
||||
.byte 196,193,116,21,218 // vunpckhps %ymm10,%ymm1,%ymm3
|
||||
.byte 197,180,20,200 // vunpcklps %ymm0,%ymm9,%ymm1
|
||||
.byte 197,52,21,192 // vunpckhps %ymm0,%ymm9,%ymm8
|
||||
.byte 197,237,20,193 // vunpcklpd %ymm1,%ymm2,%ymm0
|
||||
.byte 197,237,21,201 // vunpckhpd %ymm1,%ymm2,%ymm1
|
||||
.byte 196,193,101,20,208 // vunpcklpd %ymm8,%ymm3,%ymm2
|
||||
.byte 196,193,101,21,216 // vunpckhpd %ymm8,%ymm3,%ymm3
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 144 // nop
|
||||
.byte 132,255 // test %bh,%bh
|
||||
.byte 255 // (bad)
|
||||
.byte 255,203 // dec %ebx
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 190,255,255,255,177 // mov $0xb1ffffff,%esi
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255,164,255,255,255,156,255 // jmpq *-0x630001(%rdi,%rdi,8)
|
||||
.byte 255 // (bad)
|
||||
.byte 255,148,255,255,255,140,255 // callq *-0x730001(%rdi,%rdi,8)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // .byte 0xff
|
||||
|
||||
HIDDEN _sk_store_f32_hsw
|
||||
.globl _sk_store_f32_hsw
|
||||
_sk_store_f32_hsw:
|
||||
@ -6205,7 +6274,7 @@ _sk_store_f32_hsw:
|
||||
.byte 196,65,37,20,196 // vunpcklpd %ymm12,%ymm11,%ymm8
|
||||
.byte 196,65,37,21,220 // vunpckhpd %ymm12,%ymm11,%ymm11
|
||||
.byte 72,133,201 // test %rcx,%rcx
|
||||
.byte 117,55 // jne 1d16 <_sk_store_f32_hsw+0x6d>
|
||||
.byte 117,55 // jne 1dd5 <_sk_store_f32_hsw+0x6d>
|
||||
.byte 196,67,45,24,225,1 // vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
|
||||
.byte 196,67,61,24,235,1 // vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
|
||||
.byte 196,67,45,6,201,49 // vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
|
||||
@ -6218,22 +6287,22 @@ _sk_store_f32_hsw:
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 196,65,121,17,20,128 // vmovupd %xmm10,(%r8,%rax,4)
|
||||
.byte 72,131,249,1 // cmp $0x1,%rcx
|
||||
.byte 116,240 // je 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 116,240 // je 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,65,121,17,76,128,16 // vmovupd %xmm9,0x10(%r8,%rax,4)
|
||||
.byte 72,131,249,3 // cmp $0x3,%rcx
|
||||
.byte 114,227 // jb 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 114,227 // jb 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,65,121,17,68,128,32 // vmovupd %xmm8,0x20(%r8,%rax,4)
|
||||
.byte 116,218 // je 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 116,218 // je 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,65,121,17,92,128,48 // vmovupd %xmm11,0x30(%r8,%rax,4)
|
||||
.byte 72,131,249,5 // cmp $0x5,%rcx
|
||||
.byte 114,205 // jb 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 114,205 // jb 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,67,125,25,84,128,64,1 // vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
|
||||
.byte 116,195 // je 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 116,195 // je 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,67,125,25,76,128,80,1 // vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
|
||||
.byte 72,131,249,7 // cmp $0x7,%rcx
|
||||
.byte 114,181 // jb 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 114,181 // jb 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
.byte 196,67,125,25,68,128,96,1 // vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
|
||||
.byte 235,171 // jmp 1d12 <_sk_store_f32_hsw+0x69>
|
||||
.byte 235,171 // jmp 1dd1 <_sk_store_f32_hsw+0x69>
|
||||
|
||||
HIDDEN _sk_clamp_x_hsw
|
||||
.globl _sk_clamp_x_hsw
|
||||
@ -9016,6 +9085,57 @@ _sk_store_u16_be_avx:
|
||||
.byte 196,65,121,214,68,248,48 // vmovq %xmm8,0x30(%r8,%rdi,8)
|
||||
.byte 235,174 // jmp 255a <_sk_store_u16_be_avx+0xf6>
|
||||
|
||||
HIDDEN _sk_load_f32_avx
|
||||
.globl _sk_load_f32_avx
|
||||
_sk_load_f32_avx:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 72,131,249,7 // cmp $0x7,%rcx
|
||||
.byte 119,110 // ja 2622 <_sk_load_f32_avx+0x76>
|
||||
.byte 76,139,0 // mov (%rax),%r8
|
||||
.byte 76,141,12,189,0,0,0,0 // lea 0x0(,%rdi,4),%r9
|
||||
.byte 76,141,21,134,0,0,0 // lea 0x86(%rip),%r10 # 264c <_sk_load_f32_avx+0xa0>
|
||||
.byte 73,99,4,138 // movslq (%r10,%rcx,4),%rax
|
||||
.byte 76,1,208 // add %r10,%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 196,3,125,24,68,136,112,1 // vinsertf128 $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
|
||||
.byte 196,131,125,24,92,136,96,1 // vinsertf128 $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
|
||||
.byte 196,131,125,24,76,136,80,1 // vinsertf128 $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
|
||||
.byte 196,131,125,24,84,136,64,1 // vinsertf128 $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
|
||||
.byte 196,129,121,16,68,136,48 // vmovupd 0x30(%r8,%r9,4),%xmm0
|
||||
.byte 196,195,125,13,192,12 // vblendpd $0xc,%ymm8,%ymm0,%ymm0
|
||||
.byte 196,1,121,16,68,136,32 // vmovupd 0x20(%r8,%r9,4),%xmm8
|
||||
.byte 196,99,61,13,203,12 // vblendpd $0xc,%ymm3,%ymm8,%ymm9
|
||||
.byte 196,129,121,16,92,136,16 // vmovupd 0x10(%r8,%r9,4),%xmm3
|
||||
.byte 196,99,101,13,209,12 // vblendpd $0xc,%ymm1,%ymm3,%ymm10
|
||||
.byte 196,129,121,16,12,136 // vmovupd (%r8,%r9,4),%xmm1
|
||||
.byte 196,227,117,13,202,12 // vblendpd $0xc,%ymm2,%ymm1,%ymm1
|
||||
.byte 196,193,116,20,210 // vunpcklps %ymm10,%ymm1,%ymm2
|
||||
.byte 196,193,116,21,218 // vunpckhps %ymm10,%ymm1,%ymm3
|
||||
.byte 197,180,20,200 // vunpcklps %ymm0,%ymm9,%ymm1
|
||||
.byte 197,52,21,192 // vunpckhps %ymm0,%ymm9,%ymm8
|
||||
.byte 197,237,20,193 // vunpcklpd %ymm1,%ymm2,%ymm0
|
||||
.byte 197,237,21,201 // vunpckhpd %ymm1,%ymm2,%ymm1
|
||||
.byte 196,193,101,20,208 // vunpcklpd %ymm8,%ymm3,%ymm2
|
||||
.byte 196,193,101,21,216 // vunpckhpd %ymm8,%ymm3,%ymm3
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 102,144 // xchg %ax,%ax
|
||||
.byte 131,255,255 // cmp $0xffffffff,%edi
|
||||
.byte 255,202 // dec %edx
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 189,255,255,255,176 // mov $0xb0ffffff,%ebp
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255,163,255,255,255,155 // jmpq *-0x64000001(%rbx)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255,147,255,255,255,139 // callq *-0x74000001(%rbx)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // (bad)
|
||||
.byte 255 // .byte 0xff
|
||||
|
||||
HIDDEN _sk_store_f32_avx
|
||||
.globl _sk_store_f32_avx
|
||||
_sk_store_f32_avx:
|
||||
@ -9031,7 +9151,7 @@ _sk_store_f32_avx:
|
||||
.byte 196,65,37,20,196 // vunpcklpd %ymm12,%ymm11,%ymm8
|
||||
.byte 196,65,37,21,220 // vunpckhpd %ymm12,%ymm11,%ymm11
|
||||
.byte 72,133,201 // test %rcx,%rcx
|
||||
.byte 117,55 // jne 2619 <_sk_store_f32_avx+0x6d>
|
||||
.byte 117,55 // jne 26d9 <_sk_store_f32_avx+0x6d>
|
||||
.byte 196,67,45,24,225,1 // vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
|
||||
.byte 196,67,61,24,235,1 // vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
|
||||
.byte 196,67,45,6,201,49 // vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
|
||||
@ -9044,22 +9164,22 @@ _sk_store_f32_avx:
|
||||
.byte 255,224 // jmpq *%rax
|
||||
.byte 196,65,121,17,20,128 // vmovupd %xmm10,(%r8,%rax,4)
|
||||
.byte 72,131,249,1 // cmp $0x1,%rcx
|
||||
.byte 116,240 // je 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 116,240 // je 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,65,121,17,76,128,16 // vmovupd %xmm9,0x10(%r8,%rax,4)
|
||||
.byte 72,131,249,3 // cmp $0x3,%rcx
|
||||
.byte 114,227 // jb 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 114,227 // jb 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,65,121,17,68,128,32 // vmovupd %xmm8,0x20(%r8,%rax,4)
|
||||
.byte 116,218 // je 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 116,218 // je 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,65,121,17,92,128,48 // vmovupd %xmm11,0x30(%r8,%rax,4)
|
||||
.byte 72,131,249,5 // cmp $0x5,%rcx
|
||||
.byte 114,205 // jb 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 114,205 // jb 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,67,125,25,84,128,64,1 // vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
|
||||
.byte 116,195 // je 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 116,195 // je 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,67,125,25,76,128,80,1 // vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
|
||||
.byte 72,131,249,7 // cmp $0x7,%rcx
|
||||
.byte 114,181 // jb 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 114,181 // jb 26d5 <_sk_store_f32_avx+0x69>
|
||||
.byte 196,67,125,25,68,128,96,1 // vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
|
||||
.byte 235,171 // jmp 2615 <_sk_store_f32_avx+0x69>
|
||||
.byte 235,171 // jmp 26d5 <_sk_store_f32_avx+0x69>
|
||||
|
||||
HIDDEN _sk_clamp_x_avx
|
||||
.globl _sk_clamp_x_avx
|
||||
@ -11340,6 +11460,32 @@ _sk_store_u16_be_sse41:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
|
||||
HIDDEN _sk_load_f32_sse41
|
||||
.globl _sk_load_f32_sse41
|
||||
_sk_load_f32_sse41:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 72,139,0 // mov (%rax),%rax
|
||||
.byte 72,137,249 // mov %rdi,%rcx
|
||||
.byte 72,193,225,4 // shl $0x4,%rcx
|
||||
.byte 68,15,16,4,8 // movups (%rax,%rcx,1),%xmm8
|
||||
.byte 15,16,68,8,16 // movups 0x10(%rax,%rcx,1),%xmm0
|
||||
.byte 15,16,92,8,32 // movups 0x20(%rax,%rcx,1),%xmm3
|
||||
.byte 68,15,16,76,8,48 // movups 0x30(%rax,%rcx,1),%xmm9
|
||||
.byte 65,15,40,208 // movaps %xmm8,%xmm2
|
||||
.byte 15,20,208 // unpcklps %xmm0,%xmm2
|
||||
.byte 15,40,203 // movaps %xmm3,%xmm1
|
||||
.byte 65,15,20,201 // unpcklps %xmm9,%xmm1
|
||||
.byte 68,15,21,192 // unpckhps %xmm0,%xmm8
|
||||
.byte 65,15,21,217 // unpckhps %xmm9,%xmm3
|
||||
.byte 15,40,194 // movaps %xmm2,%xmm0
|
||||
.byte 102,15,20,193 // unpcklpd %xmm1,%xmm0
|
||||
.byte 15,18,202 // movhlps %xmm2,%xmm1
|
||||
.byte 65,15,40,208 // movaps %xmm8,%xmm2
|
||||
.byte 102,15,20,211 // unpcklpd %xmm3,%xmm2
|
||||
.byte 65,15,18,216 // movhlps %xmm8,%xmm3
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
|
||||
HIDDEN _sk_store_f32_sse41
|
||||
.globl _sk_store_f32_sse41
|
||||
_sk_store_f32_sse41:
|
||||
@ -13771,6 +13917,32 @@ _sk_store_u16_be_sse2:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
|
||||
HIDDEN _sk_load_f32_sse2
|
||||
.globl _sk_load_f32_sse2
|
||||
_sk_load_f32_sse2:
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 72,139,0 // mov (%rax),%rax
|
||||
.byte 72,137,249 // mov %rdi,%rcx
|
||||
.byte 72,193,225,4 // shl $0x4,%rcx
|
||||
.byte 68,15,16,4,8 // movups (%rax,%rcx,1),%xmm8
|
||||
.byte 15,16,68,8,16 // movups 0x10(%rax,%rcx,1),%xmm0
|
||||
.byte 15,16,92,8,32 // movups 0x20(%rax,%rcx,1),%xmm3
|
||||
.byte 68,15,16,76,8,48 // movups 0x30(%rax,%rcx,1),%xmm9
|
||||
.byte 65,15,40,208 // movaps %xmm8,%xmm2
|
||||
.byte 15,20,208 // unpcklps %xmm0,%xmm2
|
||||
.byte 15,40,203 // movaps %xmm3,%xmm1
|
||||
.byte 65,15,20,201 // unpcklps %xmm9,%xmm1
|
||||
.byte 68,15,21,192 // unpckhps %xmm0,%xmm8
|
||||
.byte 65,15,21,217 // unpckhps %xmm9,%xmm3
|
||||
.byte 15,40,194 // movaps %xmm2,%xmm0
|
||||
.byte 102,15,20,193 // unpcklpd %xmm1,%xmm0
|
||||
.byte 15,18,202 // movhlps %xmm2,%xmm1
|
||||
.byte 65,15,40,208 // movaps %xmm8,%xmm2
|
||||
.byte 102,15,20,211 // unpcklpd %xmm3,%xmm2
|
||||
.byte 65,15,18,216 // movhlps %xmm8,%xmm3
|
||||
.byte 72,173 // lods %ds:(%rsi),%rax
|
||||
.byte 255,224 // jmpq *%rax
|
||||
|
||||
HIDDEN _sk_store_f32_sse2
|
||||
.globl _sk_store_f32_sse2
|
||||
_sk_store_f32_sse2:
|
||||
|
@ -1607,7 +1607,7 @@ _sk_load_4444_hsw LABEL PROC
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 233,255,255,255,225 ; jmpq ffffffffe20016ec <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff563>
|
||||
DB 233,255,255,255,225 ; jmpq ffffffffe20016ec <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4a4>
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
@ -2022,6 +2022,55 @@ _sk_store_u16_be_hsw LABEL PROC
|
||||
DB 196,65,121,214,68,248,48 ; vmovq %xmm8,0x30(%r8,%rdi,8)
|
||||
DB 235,174 ; jmp 1cf3 <_sk_store_u16_be_hsw+0xef>
|
||||
|
||||
PUBLIC _sk_load_f32_hsw
|
||||
_sk_load_f32_hsw LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 72,131,249,7 ; cmp $0x7,%rcx
|
||||
DB 119,110 ; ja 1dbb <_sk_load_f32_hsw+0x76>
|
||||
DB 76,139,0 ; mov (%rax),%r8
|
||||
DB 76,141,12,189,0,0,0,0 ; lea 0x0(,%rdi,4),%r9
|
||||
DB 76,141,21,133,0,0,0 ; lea 0x85(%rip),%r10 # 1de4 <_sk_load_f32_hsw+0x9f>
|
||||
DB 73,99,4,138 ; movslq (%r10,%rcx,4),%rax
|
||||
DB 76,1,208 ; add %r10,%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 196,3,125,24,68,136,112,1 ; vinsertf128 $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
|
||||
DB 196,131,125,24,92,136,96,1 ; vinsertf128 $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
|
||||
DB 196,131,125,24,76,136,80,1 ; vinsertf128 $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
|
||||
DB 196,131,125,24,84,136,64,1 ; vinsertf128 $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
|
||||
DB 196,129,121,16,68,136,48 ; vmovupd 0x30(%r8,%r9,4),%xmm0
|
||||
DB 196,195,125,13,192,12 ; vblendpd $0xc,%ymm8,%ymm0,%ymm0
|
||||
DB 196,1,121,16,68,136,32 ; vmovupd 0x20(%r8,%r9,4),%xmm8
|
||||
DB 196,99,61,13,203,12 ; vblendpd $0xc,%ymm3,%ymm8,%ymm9
|
||||
DB 196,129,121,16,92,136,16 ; vmovupd 0x10(%r8,%r9,4),%xmm3
|
||||
DB 196,99,101,13,209,12 ; vblendpd $0xc,%ymm1,%ymm3,%ymm10
|
||||
DB 196,129,121,16,12,136 ; vmovupd (%r8,%r9,4),%xmm1
|
||||
DB 196,227,117,13,202,12 ; vblendpd $0xc,%ymm2,%ymm1,%ymm1
|
||||
DB 196,193,116,20,210 ; vunpcklps %ymm10,%ymm1,%ymm2
|
||||
DB 196,193,116,21,218 ; vunpckhps %ymm10,%ymm1,%ymm3
|
||||
DB 197,180,20,200 ; vunpcklps %ymm0,%ymm9,%ymm1
|
||||
DB 197,52,21,192 ; vunpckhps %ymm0,%ymm9,%ymm8
|
||||
DB 197,237,20,193 ; vunpcklpd %ymm1,%ymm2,%ymm0
|
||||
DB 197,237,21,201 ; vunpckhpd %ymm1,%ymm2,%ymm1
|
||||
DB 196,193,101,20,208 ; vunpcklpd %ymm8,%ymm3,%ymm2
|
||||
DB 196,193,101,21,216 ; vunpckhpd %ymm8,%ymm3,%ymm3
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 144 ; nop
|
||||
DB 132,255 ; test %bh,%bh
|
||||
DB 255 ; (bad)
|
||||
DB 255,203 ; dec %ebx
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 190,255,255,255,177 ; mov $0xb1ffffff,%esi
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255,164,255,255,255,156,255 ; jmpq *-0x630001(%rdi,%rdi,8)
|
||||
DB 255 ; (bad)
|
||||
DB 255,148,255,255,255,140,255 ; callq *-0x730001(%rdi,%rdi,8)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; .byte 0xff
|
||||
|
||||
PUBLIC _sk_store_f32_hsw
|
||||
_sk_store_f32_hsw LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
@ -2036,7 +2085,7 @@ _sk_store_f32_hsw LABEL PROC
|
||||
DB 196,65,37,20,196 ; vunpcklpd %ymm12,%ymm11,%ymm8
|
||||
DB 196,65,37,21,220 ; vunpckhpd %ymm12,%ymm11,%ymm11
|
||||
DB 72,133,201 ; test %rcx,%rcx
|
||||
DB 117,55 ; jne 1db2 <_sk_store_f32_hsw+0x6d>
|
||||
DB 117,55 ; jne 1e71 <_sk_store_f32_hsw+0x6d>
|
||||
DB 196,67,45,24,225,1 ; vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
|
||||
DB 196,67,61,24,235,1 ; vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
|
||||
DB 196,67,45,6,201,49 ; vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
|
||||
@ -2049,22 +2098,22 @@ _sk_store_f32_hsw LABEL PROC
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 196,65,121,17,20,128 ; vmovupd %xmm10,(%r8,%rax,4)
|
||||
DB 72,131,249,1 ; cmp $0x1,%rcx
|
||||
DB 116,240 ; je 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 116,240 ; je 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,65,121,17,76,128,16 ; vmovupd %xmm9,0x10(%r8,%rax,4)
|
||||
DB 72,131,249,3 ; cmp $0x3,%rcx
|
||||
DB 114,227 ; jb 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 114,227 ; jb 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,65,121,17,68,128,32 ; vmovupd %xmm8,0x20(%r8,%rax,4)
|
||||
DB 116,218 ; je 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 116,218 ; je 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,65,121,17,92,128,48 ; vmovupd %xmm11,0x30(%r8,%rax,4)
|
||||
DB 72,131,249,5 ; cmp $0x5,%rcx
|
||||
DB 114,205 ; jb 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 114,205 ; jb 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,67,125,25,84,128,64,1 ; vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
|
||||
DB 116,195 ; je 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 116,195 ; je 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,67,125,25,76,128,80,1 ; vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
|
||||
DB 72,131,249,7 ; cmp $0x7,%rcx
|
||||
DB 114,181 ; jb 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 114,181 ; jb 1e6d <_sk_store_f32_hsw+0x69>
|
||||
DB 196,67,125,25,68,128,96,1 ; vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
|
||||
DB 235,171 ; jmp 1dae <_sk_store_f32_hsw+0x69>
|
||||
DB 235,171 ; jmp 1e6d <_sk_store_f32_hsw+0x69>
|
||||
|
||||
PUBLIC _sk_clamp_x_hsw
|
||||
_sk_clamp_x_hsw LABEL PROC
|
||||
@ -4804,6 +4853,56 @@ _sk_store_u16_be_avx LABEL PROC
|
||||
DB 196,65,121,214,68,248,48 ; vmovq %xmm8,0x30(%r8,%rdi,8)
|
||||
DB 235,174 ; jmp 25f6 <_sk_store_u16_be_avx+0xf6>
|
||||
|
||||
PUBLIC _sk_load_f32_avx
|
||||
_sk_load_f32_avx LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 72,131,249,7 ; cmp $0x7,%rcx
|
||||
DB 119,110 ; ja 26be <_sk_load_f32_avx+0x76>
|
||||
DB 76,139,0 ; mov (%rax),%r8
|
||||
DB 76,141,12,189,0,0,0,0 ; lea 0x0(,%rdi,4),%r9
|
||||
DB 76,141,21,134,0,0,0 ; lea 0x86(%rip),%r10 # 26e8 <_sk_load_f32_avx+0xa0>
|
||||
DB 73,99,4,138 ; movslq (%r10,%rcx,4),%rax
|
||||
DB 76,1,208 ; add %r10,%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 196,3,125,24,68,136,112,1 ; vinsertf128 $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
|
||||
DB 196,131,125,24,92,136,96,1 ; vinsertf128 $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
|
||||
DB 196,131,125,24,76,136,80,1 ; vinsertf128 $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
|
||||
DB 196,131,125,24,84,136,64,1 ; vinsertf128 $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
|
||||
DB 196,129,121,16,68,136,48 ; vmovupd 0x30(%r8,%r9,4),%xmm0
|
||||
DB 196,195,125,13,192,12 ; vblendpd $0xc,%ymm8,%ymm0,%ymm0
|
||||
DB 196,1,121,16,68,136,32 ; vmovupd 0x20(%r8,%r9,4),%xmm8
|
||||
DB 196,99,61,13,203,12 ; vblendpd $0xc,%ymm3,%ymm8,%ymm9
|
||||
DB 196,129,121,16,92,136,16 ; vmovupd 0x10(%r8,%r9,4),%xmm3
|
||||
DB 196,99,101,13,209,12 ; vblendpd $0xc,%ymm1,%ymm3,%ymm10
|
||||
DB 196,129,121,16,12,136 ; vmovupd (%r8,%r9,4),%xmm1
|
||||
DB 196,227,117,13,202,12 ; vblendpd $0xc,%ymm2,%ymm1,%ymm1
|
||||
DB 196,193,116,20,210 ; vunpcklps %ymm10,%ymm1,%ymm2
|
||||
DB 196,193,116,21,218 ; vunpckhps %ymm10,%ymm1,%ymm3
|
||||
DB 197,180,20,200 ; vunpcklps %ymm0,%ymm9,%ymm1
|
||||
DB 197,52,21,192 ; vunpckhps %ymm0,%ymm9,%ymm8
|
||||
DB 197,237,20,193 ; vunpcklpd %ymm1,%ymm2,%ymm0
|
||||
DB 197,237,21,201 ; vunpckhpd %ymm1,%ymm2,%ymm1
|
||||
DB 196,193,101,20,208 ; vunpcklpd %ymm8,%ymm3,%ymm2
|
||||
DB 196,193,101,21,216 ; vunpckhpd %ymm8,%ymm3,%ymm3
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 102,144 ; xchg %ax,%ax
|
||||
DB 131,255,255 ; cmp $0xffffffff,%edi
|
||||
DB 255,202 ; dec %edx
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 189,255,255,255,176 ; mov $0xb0ffffff,%ebp
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255,163,255,255,255,155 ; jmpq *-0x64000001(%rbx)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255,147,255,255,255,139 ; callq *-0x74000001(%rbx)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; (bad)
|
||||
DB 255 ; .byte 0xff
|
||||
|
||||
PUBLIC _sk_store_f32_avx
|
||||
_sk_store_f32_avx LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
@ -4818,7 +4917,7 @@ _sk_store_f32_avx LABEL PROC
|
||||
DB 196,65,37,20,196 ; vunpcklpd %ymm12,%ymm11,%ymm8
|
||||
DB 196,65,37,21,220 ; vunpckhpd %ymm12,%ymm11,%ymm11
|
||||
DB 72,133,201 ; test %rcx,%rcx
|
||||
DB 117,55 ; jne 26b5 <_sk_store_f32_avx+0x6d>
|
||||
DB 117,55 ; jne 2775 <_sk_store_f32_avx+0x6d>
|
||||
DB 196,67,45,24,225,1 ; vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
|
||||
DB 196,67,61,24,235,1 ; vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
|
||||
DB 196,67,45,6,201,49 ; vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
|
||||
@ -4831,22 +4930,22 @@ _sk_store_f32_avx LABEL PROC
|
||||
DB 255,224 ; jmpq *%rax
|
||||
DB 196,65,121,17,20,128 ; vmovupd %xmm10,(%r8,%rax,4)
|
||||
DB 72,131,249,1 ; cmp $0x1,%rcx
|
||||
DB 116,240 ; je 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 116,240 ; je 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,65,121,17,76,128,16 ; vmovupd %xmm9,0x10(%r8,%rax,4)
|
||||
DB 72,131,249,3 ; cmp $0x3,%rcx
|
||||
DB 114,227 ; jb 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 114,227 ; jb 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,65,121,17,68,128,32 ; vmovupd %xmm8,0x20(%r8,%rax,4)
|
||||
DB 116,218 ; je 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 116,218 ; je 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,65,121,17,92,128,48 ; vmovupd %xmm11,0x30(%r8,%rax,4)
|
||||
DB 72,131,249,5 ; cmp $0x5,%rcx
|
||||
DB 114,205 ; jb 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 114,205 ; jb 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,67,125,25,84,128,64,1 ; vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
|
||||
DB 116,195 ; je 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 116,195 ; je 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,67,125,25,76,128,80,1 ; vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
|
||||
DB 72,131,249,7 ; cmp $0x7,%rcx
|
||||
DB 114,181 ; jb 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 114,181 ; jb 2771 <_sk_store_f32_avx+0x69>
|
||||
DB 196,67,125,25,68,128,96,1 ; vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
|
||||
DB 235,171 ; jmp 26b1 <_sk_store_f32_avx+0x69>
|
||||
DB 235,171 ; jmp 2771 <_sk_store_f32_avx+0x69>
|
||||
|
||||
PUBLIC _sk_clamp_x_avx
|
||||
_sk_clamp_x_avx LABEL PROC
|
||||
@ -7088,6 +7187,31 @@ _sk_store_u16_be_sse41 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
|
||||
PUBLIC _sk_load_f32_sse41
|
||||
_sk_load_f32_sse41 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 72,139,0 ; mov (%rax),%rax
|
||||
DB 72,137,249 ; mov %rdi,%rcx
|
||||
DB 72,193,225,4 ; shl $0x4,%rcx
|
||||
DB 68,15,16,4,8 ; movups (%rax,%rcx,1),%xmm8
|
||||
DB 15,16,68,8,16 ; movups 0x10(%rax,%rcx,1),%xmm0
|
||||
DB 15,16,92,8,32 ; movups 0x20(%rax,%rcx,1),%xmm3
|
||||
DB 68,15,16,76,8,48 ; movups 0x30(%rax,%rcx,1),%xmm9
|
||||
DB 65,15,40,208 ; movaps %xmm8,%xmm2
|
||||
DB 15,20,208 ; unpcklps %xmm0,%xmm2
|
||||
DB 15,40,203 ; movaps %xmm3,%xmm1
|
||||
DB 65,15,20,201 ; unpcklps %xmm9,%xmm1
|
||||
DB 68,15,21,192 ; unpckhps %xmm0,%xmm8
|
||||
DB 65,15,21,217 ; unpckhps %xmm9,%xmm3
|
||||
DB 15,40,194 ; movaps %xmm2,%xmm0
|
||||
DB 102,15,20,193 ; unpcklpd %xmm1,%xmm0
|
||||
DB 15,18,202 ; movhlps %xmm2,%xmm1
|
||||
DB 65,15,40,208 ; movaps %xmm8,%xmm2
|
||||
DB 102,15,20,211 ; unpcklpd %xmm3,%xmm2
|
||||
DB 65,15,18,216 ; movhlps %xmm8,%xmm3
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
|
||||
PUBLIC _sk_store_f32_sse41
|
||||
_sk_store_f32_sse41 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
@ -9477,6 +9601,31 @@ _sk_store_u16_be_sse2 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
|
||||
PUBLIC _sk_load_f32_sse2
|
||||
_sk_load_f32_sse2 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 72,139,0 ; mov (%rax),%rax
|
||||
DB 72,137,249 ; mov %rdi,%rcx
|
||||
DB 72,193,225,4 ; shl $0x4,%rcx
|
||||
DB 68,15,16,4,8 ; movups (%rax,%rcx,1),%xmm8
|
||||
DB 15,16,68,8,16 ; movups 0x10(%rax,%rcx,1),%xmm0
|
||||
DB 15,16,92,8,32 ; movups 0x20(%rax,%rcx,1),%xmm3
|
||||
DB 68,15,16,76,8,48 ; movups 0x30(%rax,%rcx,1),%xmm9
|
||||
DB 65,15,40,208 ; movaps %xmm8,%xmm2
|
||||
DB 15,20,208 ; unpcklps %xmm0,%xmm2
|
||||
DB 15,40,203 ; movaps %xmm3,%xmm1
|
||||
DB 65,15,20,201 ; unpcklps %xmm9,%xmm1
|
||||
DB 68,15,21,192 ; unpckhps %xmm0,%xmm8
|
||||
DB 65,15,21,217 ; unpckhps %xmm9,%xmm3
|
||||
DB 15,40,194 ; movaps %xmm2,%xmm0
|
||||
DB 102,15,20,193 ; unpcklpd %xmm1,%xmm0
|
||||
DB 15,18,202 ; movhlps %xmm2,%xmm1
|
||||
DB 65,15,40,208 ; movaps %xmm8,%xmm2
|
||||
DB 102,15,20,211 ; unpcklpd %xmm3,%xmm2
|
||||
DB 65,15,18,216 ; movhlps %xmm8,%xmm3
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
DB 255,224 ; jmpq *%rax
|
||||
|
||||
PUBLIC _sk_store_f32_sse2
|
||||
_sk_store_f32_sse2 LABEL PROC
|
||||
DB 72,173 ; lods %ds:(%rsi),%rax
|
||||
|
@ -673,6 +673,10 @@ STAGE(store_u16_be) {
|
||||
store4((uint16_t*)ptr,tail, R,G,B,A);
|
||||
}
|
||||
|
||||
STAGE(load_f32) {
|
||||
auto ptr = *(const float**)ctx + 4*x;
|
||||
load4(ptr,tail, &r,&g,&b,&a);
|
||||
}
|
||||
STAGE(store_f32) {
|
||||
auto ptr = *(float**)ctx + 4*x;
|
||||
store4(ptr,tail, r,g,b,a);
|
||||
|
@ -53,6 +53,13 @@
|
||||
ptr[2] = b;
|
||||
ptr[3] = a;
|
||||
}
|
||||
|
||||
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
|
||||
*r = ptr[0];
|
||||
*g = ptr[1];
|
||||
*b = ptr[2];
|
||||
*a = ptr[3];
|
||||
}
|
||||
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
|
||||
ptr[0] = r;
|
||||
ptr[1] = g;
|
||||
@ -106,6 +113,14 @@
|
||||
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
|
||||
vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
|
||||
}
|
||||
|
||||
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
|
||||
float32x4x4_t rgba = vld4q_f32(ptr);
|
||||
*r = rgba.val[0];
|
||||
*g = rgba.val[1];
|
||||
*b = rgba.val[2];
|
||||
*a = rgba.val[3];
|
||||
}
|
||||
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
|
||||
vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
|
||||
}
|
||||
@ -164,6 +179,14 @@
|
||||
vst4_lane_u16(ptr + 0, rgba, 0);
|
||||
vst4_lane_u16(ptr + 4, rgba, 1);
|
||||
}
|
||||
|
||||
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
|
||||
float32x2x4_t rgba = vld4_f32(ptr);
|
||||
*r = rgba.val[0];
|
||||
*g = rgba.val[1];
|
||||
*b = rgba.val[2];
|
||||
*a = rgba.val[3];
|
||||
}
|
||||
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
|
||||
vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
|
||||
}
|
||||
@ -285,6 +308,31 @@
|
||||
_mm_storeu_si128((__m128i*)ptr + 3, _67);
|
||||
}
|
||||
}
|
||||
|
||||
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
|
||||
F _04, _15, _26, _37;
|
||||
|
||||
switch (tail) {
|
||||
case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
|
||||
case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
|
||||
case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
|
||||
case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
|
||||
case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
|
||||
case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
|
||||
case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
|
||||
case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
|
||||
}
|
||||
|
||||
F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
|
||||
ba0145 = _mm256_unpackhi_ps(_04,_15),
|
||||
rg2367 = _mm256_unpacklo_ps(_26,_37),
|
||||
ba2367 = _mm256_unpackhi_ps(_26,_37);
|
||||
|
||||
*r = _mm256_unpacklo_pd(rg0145, rg2367);
|
||||
*g = _mm256_unpackhi_pd(rg0145, rg2367);
|
||||
*b = _mm256_unpacklo_pd(ba0145, ba2367);
|
||||
*a = _mm256_unpackhi_pd(ba0145, ba2367);
|
||||
}
|
||||
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
|
||||
F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
|
||||
rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
|
||||
@ -408,6 +456,18 @@
|
||||
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
|
||||
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
|
||||
}
|
||||
|
||||
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
|
||||
auto _0 = _mm_loadu_ps(ptr+ 0),
|
||||
_1 = _mm_loadu_ps(ptr+ 4),
|
||||
_2 = _mm_loadu_ps(ptr+ 8),
|
||||
_3 = _mm_loadu_ps(ptr+12);
|
||||
_MM_TRANSPOSE4_PS(_0,_1,_2,_3);
|
||||
*r = _0;
|
||||
*g = _1;
|
||||
*b = _2;
|
||||
*a = _3;
|
||||
}
|
||||
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
|
||||
_MM_TRANSPOSE4_PS(r,g,b,a);
|
||||
_mm_storeu_ps(ptr+ 0, r);
|
||||
|
Loading…
Reference in New Issue
Block a user