SkSplicer: implement load_tables and matrix_3x4
These are enough to splice interesting SkColorSpaceXform pipelines. SkSplicer_stages.cpp is similar to but still intentionally distinct from SkRasterPipeline_opts. I hope to unify them next week. unaligned_load() is nothing tricky... just a little refactor. Change-Id: I05d0fc38dac985aa351d88776ecc14d2457f2124 Reviewed-on: https://skia-review.googlesource.com/7022 Reviewed-by: Matt Sarett <msarett@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
2873c767fd
commit
69f98c738c
@ -293,10 +293,12 @@ namespace {
|
||||
case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break;
|
||||
case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break;
|
||||
case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break;
|
||||
case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break;
|
||||
case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break;
|
||||
case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break;
|
||||
case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break;
|
||||
case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break;
|
||||
case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break;
|
||||
|
||||
// No joy (probably just not yet implemented).
|
||||
default:
|
||||
|
@ -222,6 +222,58 @@ static const unsigned int kSplice_scale_u8[] = {
|
||||
0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
|
||||
0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
|
||||
};
|
||||
static const unsigned int kSplice_load_tables[] = {
|
||||
0xa9402848, // ldp x8, x10, [x2]
|
||||
0xd37ef409, // lsl x9, x0, #2
|
||||
0x4d40c860, // ld1r {v0.4s}, [x3]
|
||||
0x3ce96903, // ldr q3, [x8,x9]
|
||||
0xa9412448, // ldp x8, x9, [x2,#16]
|
||||
0x4e231c01, // and v1.16b, v0.16b, v3.16b
|
||||
0x1e26002e, // fmov w14, s1
|
||||
0x6f380462, // ushr v2.4s, v3.4s, #8
|
||||
0x6f300470, // ushr v16.4s, v3.4s, #16
|
||||
0x8b2e494e, // add x14, x10, w14, uxtw #2
|
||||
0x0e0c3c2b, // mov w11, v1.s[1]
|
||||
0x0e143c2c, // mov w12, v1.s[2]
|
||||
0x0e1c3c2d, // mov w13, v1.s[3]
|
||||
0x4e221c01, // and v1.16b, v0.16b, v2.16b
|
||||
0x4e301c02, // and v2.16b, v0.16b, v16.16b
|
||||
0x0d4081c0, // ld1 {v0.s}[0], [x14]
|
||||
0x1e26002e, // fmov w14, s1
|
||||
0x8b2e490e, // add x14, x8, w14, uxtw #2
|
||||
0x8b2b494b, // add x11, x10, w11, uxtw #2
|
||||
0xbc6c5950, // ldr s16, [x10,w12,uxtw #2]
|
||||
0xbc6d5951, // ldr s17, [x10,w13,uxtw #2]
|
||||
0x0e0c3c2a, // mov w10, v1.s[1]
|
||||
0x0e143c2c, // mov w12, v1.s[2]
|
||||
0x0e1c3c2d, // mov w13, v1.s[3]
|
||||
0x0d4081c1, // ld1 {v1.s}[0], [x14]
|
||||
0x0d409160, // ld1 {v0.s}[1], [x11]
|
||||
0xbc6c5912, // ldr s18, [x8,w12,uxtw #2]
|
||||
0x0e143c4c, // mov w12, v2.s[2]
|
||||
0x1e26004e, // fmov w14, s2
|
||||
0xbc6c5933, // ldr s19, [x9,w12,uxtw #2]
|
||||
0x8b2e492c, // add x12, x9, w14, uxtw #2
|
||||
0x8b2a490a, // add x10, x8, w10, uxtw #2
|
||||
0x0e0c3c4f, // mov w15, v2.s[1]
|
||||
0x0e1c3c4b, // mov w11, v2.s[3]
|
||||
0x0d408182, // ld1 {v2.s}[0], [x12]
|
||||
0x0d409141, // ld1 {v1.s}[1], [x10]
|
||||
0x6e140600, // mov v0.s[2], v16.s[0]
|
||||
0xbc6d5910, // ldr s16, [x8,w13,uxtw #2]
|
||||
0x8b2f492a, // add x10, x9, w15, uxtw #2
|
||||
0x0d409142, // ld1 {v2.s}[1], [x10]
|
||||
0x6e140641, // mov v1.s[2], v18.s[0]
|
||||
0x6e1c0620, // mov v0.s[3], v17.s[0]
|
||||
0xbc6b5931, // ldr s17, [x9,w11,uxtw #2]
|
||||
0x6e1c0601, // mov v1.s[3], v16.s[0]
|
||||
0xbd400c70, // ldr s16, [x3,#12]
|
||||
0x6f280463, // ushr v3.4s, v3.4s, #24
|
||||
0x6e140662, // mov v2.s[2], v19.s[0]
|
||||
0x4e21d863, // scvtf v3.4s, v3.4s
|
||||
0x6e1c0622, // mov v2.s[3], v17.s[0]
|
||||
0x4f909063, // fmul v3.4s, v3.4s, v16.s[0]
|
||||
};
|
||||
static const unsigned int kSplice_load_8888[] = {
|
||||
0xf9400048, // ldr x8, [x2]
|
||||
0xd37ef409, // lsl x9, x0, #2
|
||||
@ -281,6 +333,33 @@ static const unsigned int kSplice_store_f16[] = {
|
||||
0x0e216873, // fcvtn v19.4h, v3.4s
|
||||
0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
|
||||
};
|
||||
static const unsigned int kSplice_matrix_3x4[] = {
|
||||
0xaa0203e8, // mov x8, x2
|
||||
0x91009049, // add x9, x2, #0x24
|
||||
0x4ddfc913, // ld1r {v19.4s}, [x8], #4
|
||||
0x4d40c930, // ld1r {v16.4s}, [x9]
|
||||
0x9100a049, // add x9, x2, #0x28
|
||||
0x4d40c931, // ld1r {v17.4s}, [x9]
|
||||
0x2d435454, // ldp s20, s21, [x2,#24]
|
||||
0x9100b049, // add x9, x2, #0x2c
|
||||
0xbd402056, // ldr s22, [x2,#32]
|
||||
0x4d40c932, // ld1r {v18.4s}, [x9]
|
||||
0x4f941050, // fmla v16.4s, v2.4s, v20.s[0]
|
||||
0x4f951051, // fmla v17.4s, v2.4s, v21.s[0]
|
||||
0x2d415454, // ldp s20, s21, [x2,#8]
|
||||
0x4f961052, // fmla v18.4s, v2.4s, v22.s[0]
|
||||
0x2d425842, // ldp s2, s22, [x2,#16]
|
||||
0x4f951030, // fmla v16.4s, v1.4s, v21.s[0]
|
||||
0xbd400115, // ldr s21, [x8]
|
||||
0x4f821031, // fmla v17.4s, v1.4s, v2.s[0]
|
||||
0x4f961032, // fmla v18.4s, v1.4s, v22.s[0]
|
||||
0x4e20ce70, // fmla v16.4s, v19.4s, v0.4s
|
||||
0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
|
||||
0x4f941012, // fmla v18.4s, v0.4s, v20.s[0]
|
||||
0x4eb01e00, // mov v0.16b, v16.16b
|
||||
0x4eb11e21, // mov v1.16b, v17.16b
|
||||
0x4eb21e42, // mov v2.16b, v18.16b
|
||||
};
|
||||
|
||||
#elif defined(__ARM_NEON__)
|
||||
|
||||
@ -505,6 +584,44 @@ static const unsigned int kSplice_scale_u8[] = {
|
||||
0xe28dd008, // add sp, sp, #8
|
||||
0xecbd8b02, // vpop {d8}
|
||||
};
|
||||
static const unsigned int kSplice_load_tables[] = {
|
||||
0xe92d41f0, // push {r4, r5, r6, r7, r8, lr}
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
|
||||
0xe08c5100, // add r5, ip, r0, lsl #2
|
||||
0xe592e004, // ldr lr, [r2, #4]
|
||||
0xedd51b00, // vldr d17, [r5]
|
||||
0xf24021b1, // vand d18, d16, d17
|
||||
0xe592800c, // ldr r8, [r2, #12]
|
||||
0xf3f83031, // vshr.u32 d19, d17, #8
|
||||
0xe5924008, // ldr r4, [r2, #8]
|
||||
0xed931a03, // vldr s2, [r3, #12]
|
||||
0xee325b90, // vmov.32 r5, d18[1]
|
||||
0xee126b90, // vmov.32 r6, d18[0]
|
||||
0xf3f02031, // vshr.u32 d18, d17, #16
|
||||
0xf24021b2, // vand d18, d16, d18
|
||||
0xf24001b3, // vand d16, d16, d19
|
||||
0xee127b90, // vmov.32 r7, d18[0]
|
||||
0xe08e5105, // add r5, lr, r5, lsl #2
|
||||
0xe08e6106, // add r6, lr, r6, lsl #2
|
||||
0xedd50a00, // vldr s1, [r5]
|
||||
0xee325b90, // vmov.32 r5, d18[1]
|
||||
0xed960a00, // vldr s0, [r6]
|
||||
0xee306b90, // vmov.32 r6, d16[1]
|
||||
0xe0887107, // add r7, r8, r7, lsl #2
|
||||
0xe088c105, // add ip, r8, r5, lsl #2
|
||||
0xee105b90, // vmov.32 r5, d16[0]
|
||||
0xf3e80031, // vshr.u32 d16, d17, #24
|
||||
0xe0846106, // add r6, r4, r6, lsl #2
|
||||
0xeddc2a00, // vldr s5, [ip]
|
||||
0xf3fb0620, // vcvt.f32.s32 d16, d16
|
||||
0xed972a00, // vldr s4, [r7]
|
||||
0xf2a039c1, // vmul.f32 d3, d16, d1[0]
|
||||
0xedd61a00, // vldr s3, [r6]
|
||||
0xe0846105, // add r6, r4, r5, lsl #2
|
||||
0xed961a00, // vldr s2, [r6]
|
||||
0xe8bd41f0, // pop {r4, r5, r6, r7, r8, lr}
|
||||
};
|
||||
static const unsigned int kSplice_load_8888[] = {
|
||||
0xe592c000, // ldr ip, [r2]
|
||||
0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
|
||||
@ -581,6 +698,43 @@ static const unsigned int kSplice_store_f16[] = {
|
||||
0xe08cc180, // add ip, ip, r0, lsl #3
|
||||
0xf44c084f, // vst2.16 {d16-d17}, [ip]
|
||||
};
|
||||
static const unsigned int kSplice_matrix_3x4[] = {
|
||||
0xe282c020, // add ip, r2, #32
|
||||
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
|
||||
0xe282c02c, // add ip, r2, #44
|
||||
0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
|
||||
0xe282c01c, // add ip, r2, #28
|
||||
0xf2420c33, // vfma.f32 d16, d2, d19
|
||||
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
|
||||
0xe282c018, // add ip, r2, #24
|
||||
0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
|
||||
0xe282c024, // add ip, r2, #36
|
||||
0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
|
||||
0xe282c028, // add ip, r2, #40
|
||||
0xf2421c32, // vfma.f32 d17, d2, d18
|
||||
0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
|
||||
0xe282c010, // add ip, r2, #16
|
||||
0xf2422c34, // vfma.f32 d18, d2, d20
|
||||
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
|
||||
0xe282c00c, // add ip, r2, #12
|
||||
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
|
||||
0xe282c014, // add ip, r2, #20
|
||||
0xf2411c34, // vfma.f32 d17, d1, d20
|
||||
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
|
||||
0xf2410c34, // vfma.f32 d16, d1, d20
|
||||
0xe282c004, // add ip, r2, #4
|
||||
0xf2412c33, // vfma.f32 d18, d1, d19
|
||||
0xf4e23c9f, // vld1.32 {d19[]}, [r2 :32]
|
||||
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
|
||||
0xe282c008, // add ip, r2, #8
|
||||
0xf2401c33, // vfma.f32 d17, d0, d19
|
||||
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
|
||||
0xf2400c33, // vfma.f32 d16, d0, d19
|
||||
0xf2402c34, // vfma.f32 d18, d0, d20
|
||||
0xf22101b1, // vorr d0, d17, d17
|
||||
0xf22021b0, // vorr d2, d16, d16
|
||||
0xf22211b2, // vorr d1, d18, d18
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
@ -747,6 +901,30 @@ static const unsigned char kSplice_scale_u8[] = {
|
||||
0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
|
||||
0xc5,0xbc,0x59,0xdb, // vmulps %ymm3,%ymm8,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_load_tables[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
|
||||
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
|
||||
0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2
|
||||
0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1
|
||||
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
|
||||
0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8
|
||||
0xc4,0x41,0x7c,0x28,0xc8, // vmovaps %ymm8,%ymm9
|
||||
0xc4,0xc2,0x35,0x92,0x04,0x88, // vgatherdps %ymm9,(%r8,%ymm1,4),%ymm0
|
||||
0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
|
||||
0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
|
||||
0xc5,0x6c,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm9
|
||||
0xc4,0x41,0x7c,0x28,0xd0, // vmovaps %ymm8,%ymm10
|
||||
0xc4,0xa2,0x2d,0x92,0x0c,0x88, // vgatherdps %ymm10,(%rax,%ymm9,4),%ymm1
|
||||
0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
|
||||
0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9
|
||||
0xc4,0x41,0x6c,0x54,0xc9, // vandps %ymm9,%ymm2,%ymm9
|
||||
0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
|
||||
0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
|
||||
0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
|
||||
0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
|
||||
0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_load_8888[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
|
||||
@ -828,6 +1006,32 @@ static const unsigned char kSplice_store_f16[] = {
|
||||
0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10,%xmm8,%xmm8
|
||||
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8)
|
||||
};
|
||||
static const unsigned char kSplice_matrix_3x4[] = {
|
||||
0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
|
||||
0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10
|
||||
0xc4,0x62,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm11
|
||||
0xc4,0x62,0x7d,0x18,0x42,0x24, // vbroadcastss 0x24(%rdx),%ymm8
|
||||
0xc4,0x42,0x6d,0xb8,0xc3, // vfmadd231ps %ymm11,%ymm2,%ymm8
|
||||
0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8
|
||||
0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8
|
||||
0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10
|
||||
0xc4,0x62,0x7d,0x18,0x5a,0x10, // vbroadcastss 0x10(%rdx),%ymm11
|
||||
0xc4,0x62,0x7d,0x18,0x62,0x1c, // vbroadcastss 0x1c(%rdx),%ymm12
|
||||
0xc4,0x62,0x7d,0x18,0x4a,0x28, // vbroadcastss 0x28(%rdx),%ymm9
|
||||
0xc4,0x42,0x6d,0xb8,0xcc, // vfmadd231ps %ymm12,%ymm2,%ymm9
|
||||
0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9
|
||||
0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9
|
||||
0xc4,0x62,0x7d,0x18,0x5a,0x08, // vbroadcastss 0x8(%rdx),%ymm11
|
||||
0xc4,0x62,0x7d,0x18,0x62,0x14, // vbroadcastss 0x14(%rdx),%ymm12
|
||||
0xc4,0x62,0x7d,0x18,0x6a,0x20, // vbroadcastss 0x20(%rdx),%ymm13
|
||||
0xc4,0x62,0x7d,0x18,0x52,0x2c, // vbroadcastss 0x2c(%rdx),%ymm10
|
||||
0xc4,0x42,0x6d,0xb8,0xd5, // vfmadd231ps %ymm13,%ymm2,%ymm10
|
||||
0xc4,0x42,0x75,0xb8,0xd4, // vfmadd231ps %ymm12,%ymm1,%ymm10
|
||||
0xc4,0x42,0x7d,0xb8,0xd3, // vfmadd231ps %ymm11,%ymm0,%ymm10
|
||||
0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
|
||||
0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
|
||||
0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -33,6 +33,7 @@
|
||||
AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
|
||||
AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
|
||||
|
||||
AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
|
||||
#elif defined(__ARM_NEON__)
|
||||
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
|
||||
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
|
||||
@ -53,6 +54,7 @@
|
||||
AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
|
||||
AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
|
||||
|
||||
AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
|
||||
#else
|
||||
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
|
||||
#error On x86, compile with -mavx2 -mfma -mf16c.
|
||||
@ -72,11 +74,20 @@
|
||||
AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
|
||||
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
|
||||
AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
|
||||
|
||||
AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
|
||||
#endif
|
||||
|
||||
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
|
||||
AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
|
||||
|
||||
template <typename T, typename P>
|
||||
AI static T unaligned_load(const P* p) {
|
||||
T v;
|
||||
memcpy(&v, p, sizeof(v));
|
||||
return v;
|
||||
}
|
||||
|
||||
// We'll be compiling this file to an object file, then extracting parts of it into
|
||||
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
|
||||
// On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
|
||||
@ -241,8 +252,7 @@ STAGE(to_srgb) {
|
||||
STAGE(scale_u8) {
|
||||
auto ptr = *(const uint8_t**)ctx + x;
|
||||
|
||||
U8 scales;
|
||||
memcpy(&scales, ptr, sizeof(scales));
|
||||
auto scales = unaligned_load<U8>(ptr);
|
||||
auto c = cast(expand(scales)) * k->_1_255;
|
||||
|
||||
r = r * c;
|
||||
@ -251,12 +261,24 @@ STAGE(scale_u8) {
|
||||
a = a * c;
|
||||
}
|
||||
|
||||
STAGE(load_tables) {
|
||||
struct Ctx {
|
||||
const uint32_t* src;
|
||||
const float *r, *g, *b;
|
||||
};
|
||||
auto c = (const Ctx*)ctx;
|
||||
|
||||
auto px = unaligned_load<U32>(c->src + x);
|
||||
r = gather(c->r, (px ) & k->_0x000000ff);
|
||||
g = gather(c->g, (px >> 8) & k->_0x000000ff);
|
||||
b = gather(c->b, (px >> 16) & k->_0x000000ff);
|
||||
a = cast( (px >> 24)) * k->_1_255;
|
||||
}
|
||||
|
||||
STAGE(load_8888) {
|
||||
auto ptr = *(const uint32_t**)ctx + x;
|
||||
|
||||
U32 px;
|
||||
memcpy(&px, ptr, sizeof(px));
|
||||
|
||||
auto px = unaligned_load<U32>(ptr);
|
||||
r = cast((px ) & k->_0x000000ff) * k->_1_255;
|
||||
g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
|
||||
b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
|
||||
@ -347,3 +369,14 @@ STAGE(store_f16) {
|
||||
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
|
||||
#endif
|
||||
}
|
||||
|
||||
STAGE(matrix_3x4) {
|
||||
auto m = (const float*)ctx;
|
||||
|
||||
auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
|
||||
G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
|
||||
B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
|
||||
r = R;
|
||||
g = G;
|
||||
b = B;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user