SkSplicer: implement load_tables and matrix_3x4

These are enough to splice interesting SkColorSpaceXform pipelines.

SkSplicer_stages.cpp is similar to but still intentionally distinct from
SkRasterPipeline_opts.  I hope to unify them next week.

unaligned_load() is nothing tricky... just a little refactor.

Change-Id: I05d0fc38dac985aa351d88776ecc14d2457f2124
Reviewed-on: https://skia-review.googlesource.com/7022
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2017-01-13 13:18:44 -05:00 committed by Skia Commit-Bot
parent 2873c767fd
commit 69f98c738c
3 changed files with 244 additions and 5 deletions

View File

@ -293,10 +293,12 @@ namespace {
case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break;
case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break;
case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break;
case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break;
case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break;
case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break;
case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break;
case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break;
case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break;
// No joy (probably just not yet implemented).
default:

View File

@ -222,6 +222,58 @@ static const unsigned int kSplice_scale_u8[] = {
0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
};
// Pre-assembled AArch64 (A64/NEON) machine code for the load_tables stage:
// one 32-bit instruction word per entry, with its disassembly in the trailing
// comment.  SkSplicer copies these words verbatim into the JIT buffer via
// splice(&buf, kSplice_load_tables).
// NOTE(review): per the commit message, this array is extracted from the
// compiled SkSplicer_stages.cpp object file — do not hand-edit; regenerate.
static const unsigned int kSplice_load_tables[] = {
0xa9402848, // ldp x8, x10, [x2]
0xd37ef409, // lsl x9, x0, #2
0x4d40c860, // ld1r {v0.4s}, [x3]
0x3ce96903, // ldr q3, [x8,x9]
0xa9412448, // ldp x8, x9, [x2,#16]
0x4e231c01, // and v1.16b, v0.16b, v3.16b
0x1e26002e, // fmov w14, s1
0x6f380462, // ushr v2.4s, v3.4s, #8
0x6f300470, // ushr v16.4s, v3.4s, #16
0x8b2e494e, // add x14, x10, w14, uxtw #2
0x0e0c3c2b, // mov w11, v1.s[1]
0x0e143c2c, // mov w12, v1.s[2]
0x0e1c3c2d, // mov w13, v1.s[3]
0x4e221c01, // and v1.16b, v0.16b, v2.16b
0x4e301c02, // and v2.16b, v0.16b, v16.16b
0x0d4081c0, // ld1 {v0.s}[0], [x14]
0x1e26002e, // fmov w14, s1
0x8b2e490e, // add x14, x8, w14, uxtw #2
0x8b2b494b, // add x11, x10, w11, uxtw #2
0xbc6c5950, // ldr s16, [x10,w12,uxtw #2]
0xbc6d5951, // ldr s17, [x10,w13,uxtw #2]
0x0e0c3c2a, // mov w10, v1.s[1]
0x0e143c2c, // mov w12, v1.s[2]
0x0e1c3c2d, // mov w13, v1.s[3]
0x0d4081c1, // ld1 {v1.s}[0], [x14]
0x0d409160, // ld1 {v0.s}[1], [x11]
0xbc6c5912, // ldr s18, [x8,w12,uxtw #2]
0x0e143c4c, // mov w12, v2.s[2]
0x1e26004e, // fmov w14, s2
0xbc6c5933, // ldr s19, [x9,w12,uxtw #2]
0x8b2e492c, // add x12, x9, w14, uxtw #2
0x8b2a490a, // add x10, x8, w10, uxtw #2
0x0e0c3c4f, // mov w15, v2.s[1]
0x0e1c3c4b, // mov w11, v2.s[3]
0x0d408182, // ld1 {v2.s}[0], [x12]
0x0d409141, // ld1 {v1.s}[1], [x10]
0x6e140600, // mov v0.s[2], v16.s[0]
0xbc6d5910, // ldr s16, [x8,w13,uxtw #2]
0x8b2f492a, // add x10, x9, w15, uxtw #2
0x0d409142, // ld1 {v2.s}[1], [x10]
0x6e140641, // mov v1.s[2], v18.s[0]
0x6e1c0620, // mov v0.s[3], v17.s[0]
0xbc6b5931, // ldr s17, [x9,w11,uxtw #2]
0x6e1c0601, // mov v1.s[3], v16.s[0]
0xbd400c70, // ldr s16, [x3,#12]
0x6f280463, // ushr v3.4s, v3.4s, #24
0x6e140662, // mov v2.s[2], v19.s[0]
0x4e21d863, // scvtf v3.4s, v3.4s
0x6e1c0622, // mov v2.s[3], v17.s[0]
0x4f909063, // fmul v3.4s, v3.4s, v16.s[0]
};
static const unsigned int kSplice_load_8888[] = {
0xf9400048, // ldr x8, [x2]
0xd37ef409, // lsl x9, x0, #2
@ -281,6 +333,33 @@ static const unsigned int kSplice_store_f16[] = {
0x0e216873, // fcvtn v19.4h, v3.4s
0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
};
// Pre-assembled AArch64 (A64/NEON) machine code for the matrix_3x4 stage
// (3x4 color matrix applied with fmla), one instruction word per entry.
// Spliced verbatim into the JIT buffer by SkSplicer.
// NOTE(review): machine-generated from compiled SkSplicer_stages.cpp —
// do not hand-edit; regenerate instead.
static const unsigned int kSplice_matrix_3x4[] = {
0xaa0203e8, // mov x8, x2
0x91009049, // add x9, x2, #0x24
0x4ddfc913, // ld1r {v19.4s}, [x8], #4
0x4d40c930, // ld1r {v16.4s}, [x9]
0x9100a049, // add x9, x2, #0x28
0x4d40c931, // ld1r {v17.4s}, [x9]
0x2d435454, // ldp s20, s21, [x2,#24]
0x9100b049, // add x9, x2, #0x2c
0xbd402056, // ldr s22, [x2,#32]
0x4d40c932, // ld1r {v18.4s}, [x9]
0x4f941050, // fmla v16.4s, v2.4s, v20.s[0]
0x4f951051, // fmla v17.4s, v2.4s, v21.s[0]
0x2d415454, // ldp s20, s21, [x2,#8]
0x4f961052, // fmla v18.4s, v2.4s, v22.s[0]
0x2d425842, // ldp s2, s22, [x2,#16]
0x4f951030, // fmla v16.4s, v1.4s, v21.s[0]
0xbd400115, // ldr s21, [x8]
0x4f821031, // fmla v17.4s, v1.4s, v2.s[0]
0x4f961032, // fmla v18.4s, v1.4s, v22.s[0]
0x4e20ce70, // fmla v16.4s, v19.4s, v0.4s
0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
0x4f941012, // fmla v18.4s, v0.4s, v20.s[0]
0x4eb01e00, // mov v0.16b, v16.16b
0x4eb11e21, // mov v1.16b, v17.16b
0x4eb21e42, // mov v2.16b, v18.16b
};
#elif defined(__ARM_NEON__)
@ -505,6 +584,44 @@ static const unsigned int kSplice_scale_u8[] = {
0xe28dd008, // add sp, sp, #8
0xecbd8b02, // vpop {d8}
};
// Pre-assembled ARMv7 (NEON, 2-lane) machine code for the load_tables stage,
// one instruction word per entry with disassembly in the trailing comments.
// Note the push/pop pair preserving r4-r8,lr around the gathers.
// NOTE(review): machine-generated from compiled SkSplicer_stages.cpp —
// do not hand-edit; regenerate instead.
static const unsigned int kSplice_load_tables[] = {
0xe92d41f0, // push {r4, r5, r6, r7, r8, lr}
0xe592c000, // ldr ip, [r2]
0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
0xe08c5100, // add r5, ip, r0, lsl #2
0xe592e004, // ldr lr, [r2, #4]
0xedd51b00, // vldr d17, [r5]
0xf24021b1, // vand d18, d16, d17
0xe592800c, // ldr r8, [r2, #12]
0xf3f83031, // vshr.u32 d19, d17, #8
0xe5924008, // ldr r4, [r2, #8]
0xed931a03, // vldr s2, [r3, #12]
0xee325b90, // vmov.32 r5, d18[1]
0xee126b90, // vmov.32 r6, d18[0]
0xf3f02031, // vshr.u32 d18, d17, #16
0xf24021b2, // vand d18, d16, d18
0xf24001b3, // vand d16, d16, d19
0xee127b90, // vmov.32 r7, d18[0]
0xe08e5105, // add r5, lr, r5, lsl #2
0xe08e6106, // add r6, lr, r6, lsl #2
0xedd50a00, // vldr s1, [r5]
0xee325b90, // vmov.32 r5, d18[1]
0xed960a00, // vldr s0, [r6]
0xee306b90, // vmov.32 r6, d16[1]
0xe0887107, // add r7, r8, r7, lsl #2
0xe088c105, // add ip, r8, r5, lsl #2
0xee105b90, // vmov.32 r5, d16[0]
0xf3e80031, // vshr.u32 d16, d17, #24
0xe0846106, // add r6, r4, r6, lsl #2
0xeddc2a00, // vldr s5, [ip]
0xf3fb0620, // vcvt.f32.s32 d16, d16
0xed972a00, // vldr s4, [r7]
0xf2a039c1, // vmul.f32 d3, d16, d1[0]
0xedd61a00, // vldr s3, [r6]
0xe0846105, // add r6, r4, r5, lsl #2
0xed961a00, // vldr s2, [r6]
0xe8bd41f0, // pop {r4, r5, r6, r7, r8, lr}
};
static const unsigned int kSplice_load_8888[] = {
0xe592c000, // ldr ip, [r2]
0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
@ -581,6 +698,43 @@ static const unsigned int kSplice_store_f16[] = {
0xe08cc180, // add ip, ip, r0, lsl #3
0xf44c084f, // vst2.16 {d16-d17}, [ip]
};
// Pre-assembled ARMv7 (NEON VFPv4, vfma) machine code for the matrix_3x4
// stage, one instruction word per entry with disassembly in the comments.
// NOTE(review): machine-generated from compiled SkSplicer_stages.cpp —
// do not hand-edit; regenerate instead.
static const unsigned int kSplice_matrix_3x4[] = {
0xe282c020, // add ip, r2, #32
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
0xe282c02c, // add ip, r2, #44
0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
0xe282c01c, // add ip, r2, #28
0xf2420c33, // vfma.f32 d16, d2, d19
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
0xe282c018, // add ip, r2, #24
0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
0xe282c024, // add ip, r2, #36
0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
0xe282c028, // add ip, r2, #40
0xf2421c32, // vfma.f32 d17, d2, d18
0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
0xe282c010, // add ip, r2, #16
0xf2422c34, // vfma.f32 d18, d2, d20
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
0xe282c00c, // add ip, r2, #12
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
0xe282c014, // add ip, r2, #20
0xf2411c34, // vfma.f32 d17, d1, d20
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
0xf2410c34, // vfma.f32 d16, d1, d20
0xe282c004, // add ip, r2, #4
0xf2412c33, // vfma.f32 d18, d1, d19
0xf4e23c9f, // vld1.32 {d19[]}, [r2 :32]
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
0xe282c008, // add ip, r2, #8
0xf2401c33, // vfma.f32 d17, d0, d19
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
0xf2400c33, // vfma.f32 d16, d0, d19
0xf2402c34, // vfma.f32 d18, d0, d20
0xf22101b1, // vorr d0, d17, d17
0xf22021b0, // vorr d2, d16, d16
0xf22211b2, // vorr d1, d18, d18
};
#else
@ -747,6 +901,30 @@ static const unsigned char kSplice_scale_u8[] = {
0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
0xc5,0xbc,0x59,0xdb, // vmulps %ymm3,%ymm8,%ymm3
};
// Pre-assembled x86-64 AVX2 machine code for the load_tables stage, raw
// instruction bytes with disassembly in the trailing comments.  Uses
// vgatherdps for the three table lookups; note the all-ones mask built
// with vcmpeqps (vgatherdps clobbers its mask, hence the vmovaps copies).
// NOTE(review): machine-generated from compiled SkSplicer_stages.cpp —
// do not hand-edit; regenerate instead.
static const unsigned char kSplice_load_tables[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2
0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8
0xc4,0x41,0x7c,0x28,0xc8, // vmovaps %ymm8,%ymm9
0xc4,0xc2,0x35,0x92,0x04,0x88, // vgatherdps %ymm9,(%r8,%ymm1,4),%ymm0
0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
0xc5,0x6c,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm9
0xc4,0x41,0x7c,0x28,0xd0, // vmovaps %ymm8,%ymm10
0xc4,0xa2,0x2d,0x92,0x0c,0x88, // vgatherdps %ymm10,(%rax,%ymm9,4),%ymm1
0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9
0xc4,0x41,0x6c,0x54,0xc9, // vandps %ymm9,%ymm2,%ymm9
0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
};
static const unsigned char kSplice_load_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
@ -828,6 +1006,32 @@ static const unsigned char kSplice_store_f16[] = {
0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10,%xmm8,%xmm8
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8)
};
// Pre-assembled x86-64 AVX2+FMA machine code for the matrix_3x4 stage, raw
// instruction bytes with disassembly in the trailing comments: each output
// channel is a broadcast translate term accumulated with three vfmadd231ps.
// NOTE(review): machine-generated from compiled SkSplicer_stages.cpp —
// do not hand-edit; regenerate instead.
static const unsigned char kSplice_matrix_3x4[] = {
0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10
0xc4,0x62,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm11
0xc4,0x62,0x7d,0x18,0x42,0x24, // vbroadcastss 0x24(%rdx),%ymm8
0xc4,0x42,0x6d,0xb8,0xc3, // vfmadd231ps %ymm11,%ymm2,%ymm8
0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8
0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8
0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10
0xc4,0x62,0x7d,0x18,0x5a,0x10, // vbroadcastss 0x10(%rdx),%ymm11
0xc4,0x62,0x7d,0x18,0x62,0x1c, // vbroadcastss 0x1c(%rdx),%ymm12
0xc4,0x62,0x7d,0x18,0x4a,0x28, // vbroadcastss 0x28(%rdx),%ymm9
0xc4,0x42,0x6d,0xb8,0xcc, // vfmadd231ps %ymm12,%ymm2,%ymm9
0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9
0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9
0xc4,0x62,0x7d,0x18,0x5a,0x08, // vbroadcastss 0x8(%rdx),%ymm11
0xc4,0x62,0x7d,0x18,0x62,0x14, // vbroadcastss 0x14(%rdx),%ymm12
0xc4,0x62,0x7d,0x18,0x6a,0x20, // vbroadcastss 0x20(%rdx),%ymm13
0xc4,0x62,0x7d,0x18,0x52,0x2c, // vbroadcastss 0x2c(%rdx),%ymm10
0xc4,0x42,0x6d,0xb8,0xd5, // vfmadd231ps %ymm13,%ymm2,%ymm10
0xc4,0x42,0x75,0xb8,0xd4, // vfmadd231ps %ymm12,%ymm1,%ymm10
0xc4,0x42,0x7d,0xb8,0xd3, // vfmadd231ps %ymm11,%ymm0,%ymm10
0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2
};
#endif

View File

@ -33,6 +33,7 @@
AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@ -53,6 +54,7 @@
AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
@ -72,11 +74,20 @@
AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
#endif
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
// Read a T from memory that may not be suitably aligned for T.  The memcpy
// is the portable way to express an unaligned load; compilers lower it to a
// plain load instruction, so there is no runtime copy in optimized builds.
template <typename T, typename P>
AI static T unaligned_load(const P* p) {
    T loaded;
    memcpy(&loaded, p, sizeof(loaded));
    return loaded;
}
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
// On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
@ -241,8 +252,7 @@ STAGE(to_srgb) {
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
U8 scales;
memcpy(&scales, ptr, sizeof(scales));
auto scales = unaligned_load<U8>(ptr);
auto c = cast(expand(scales)) * k->_1_255;
r = r * c;
@ -251,12 +261,24 @@ STAGE(scale_u8) {
a = a * c;
}
// Load packed 8888 pixels, then run r,g,b through per-channel float lookup
// tables; a is just scaled to [0,1].  ctx points at a source pointer followed
// by the three table pointers (presumably 256-entry tables — confirm with the
// SkColorSpaceXform caller).
STAGE(load_tables) {
    struct Tables {
        const uint32_t* src;       // packed 8888 pixels
        const float *r, *g, *b;    // one lookup table per color channel
    };
    auto tables = (const Tables*)ctx;

    auto px   = unaligned_load<U32>(tables->src + x);
    auto mask = k->_0x000000ff;
    r = gather(tables->r, (px >>  0) & mask);
    g = gather(tables->g, (px >>  8) & mask);
    b = gather(tables->b, (px >> 16) & mask);
    a = cast(px >> 24) * k->_1_255;
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
U32 px;
memcpy(&px, ptr, sizeof(px));
auto px = unaligned_load<U32>(ptr);
r = cast((px ) & k->_0x000000ff) * k->_1_255;
g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@ -347,3 +369,14 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#endif
}
// Multiply (r,g,b) by a 3x4 matrix stored column-major as 12 floats:
// columns 0-2 scale r,g,b respectively, column 3 (indices 9-11) translates.
// Alpha passes through untouched.
STAGE(matrix_3x4) {
    auto mtx = (const float*)ctx;

    // All three outputs are computed before any channel is overwritten, and
    // the fma nesting is kept exactly as-is so float results don't change.
    auto outR = fma(r, mtx[0], fma(g, mtx[3], fma(b, mtx[6], mtx[ 9])));
    auto outG = fma(r, mtx[1], fma(g, mtx[4], fma(b, mtx[7], mtx[10])));
    auto outB = fma(r, mtx[2], fma(g, mtx[5], fma(b, mtx[8], mtx[11])));

    r = outR;
    g = outG;
    b = outB;
}