Streamline x86 u8 -> fixed15 math.
We can use SSE's 16 bit mul-hi to get a very good approximation to the ideal multiplier. This lets us trim several instructions. This removes the need for the constant 0x0001 and instead uses 0x8081. I've reordered the constants so that 0x8000 comes first, which helps trim an instruction here and there on ARM. Change-Id: I3d490c802df39a89424230c4cfc491f52210c275 Reviewed-on: https://skia-review.googlesource.com/7282 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
c37e61455d
commit
f61bc4d229
@ -44,7 +44,7 @@ namespace {
|
||||
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
|
||||
};
|
||||
static const SkSplicer_constants_lowp kConstants_lowp = {
|
||||
0x0001, 0x8000,
|
||||
0x8000, 0x8081,
|
||||
};
|
||||
|
||||
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
|
||||
|
@ -29,8 +29,7 @@ static const unsigned int kSplice_plus__lowp[] = {
|
||||
0x6e670c63, // uqadd v3.8h, v3.8h, v7.8h
|
||||
};
|
||||
static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x4d40c470, // ld1r {v16.8h}, [x3]
|
||||
0x6e632e10, // uqsub v16.8h, v16.8h, v3.8h
|
||||
0x6e70b491, // sqrdmulh v17.8h, v4.8h, v16.8h
|
||||
0x4e241e12, // and v18.16b, v16.16b, v4.16b
|
||||
@ -54,8 +53,7 @@ static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0x6e630e83, // uqadd v3.8h, v20.8h, v3.8h
|
||||
};
|
||||
static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x4d40c470, // ld1r {v16.8h}, [x3]
|
||||
0x6e672e10, // uqsub v16.8h, v16.8h, v7.8h
|
||||
0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
|
||||
0x4e201e12, // and v18.16b, v16.16b, v0.16b
|
||||
@ -79,16 +77,14 @@ static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0x6e670e87, // uqadd v7.8h, v20.8h, v7.8h
|
||||
};
|
||||
static const unsigned int kSplice_clamp_1_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x4d40c470, // ld1r {v16.8h}, [x3]
|
||||
0x6e706c00, // umin v0.8h, v0.8h, v16.8h
|
||||
0x6e706c21, // umin v1.8h, v1.8h, v16.8h
|
||||
0x6e706c42, // umin v2.8h, v2.8h, v16.8h
|
||||
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
|
||||
};
|
||||
static const unsigned int kSplice_clamp_a_lowp[] = {
|
||||
0x91000868, // add x8, x3, #0x2
|
||||
0x4d40c510, // ld1r {v16.8h}, [x8]
|
||||
0x4d40c470, // ld1r {v16.8h}, [x3]
|
||||
0x6e706c63, // umin v3.8h, v3.8h, v16.8h
|
||||
0x6e636c00, // umin v0.8h, v0.8h, v3.8h
|
||||
0x6e636c21, // umin v1.8h, v1.8h, v3.8h
|
||||
@ -197,8 +193,7 @@ static const unsigned int kSplice_plus__lowp[] = {
|
||||
0xf3133017, // vqadd.u16 d3, d3, d7
|
||||
};
|
||||
static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf4e30c5f, // vld1.16 {d16[]}, [r3 :16]
|
||||
0xf3500293, // vqsub.u16 d16, d16, d3
|
||||
0xf3541b20, // vqrdmulh.s16 d17, d4, d16
|
||||
0xf3552b20, // vqrdmulh.s16 d18, d5, d16
|
||||
@ -222,8 +217,7 @@ static const unsigned int kSplice_srcover_lowp[] = {
|
||||
0xf3143093, // vqadd.u16 d3, d20, d3
|
||||
};
|
||||
static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf4e30c5f, // vld1.16 {d16[]}, [r3 :16]
|
||||
0xf3500297, // vqsub.u16 d16, d16, d7
|
||||
0xf3501b20, // vqrdmulh.s16 d17, d0, d16
|
||||
0xf3512b20, // vqrdmulh.s16 d18, d1, d16
|
||||
@ -247,16 +241,14 @@ static const unsigned int kSplice_dstover_lowp[] = {
|
||||
0xf3147097, // vqadd.u16 d7, d20, d7
|
||||
};
|
||||
static const unsigned int kSplice_clamp_1_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf4e30c5f, // vld1.16 {d16[]}, [r3 :16]
|
||||
0xf3100630, // vmin.u16 d0, d0, d16
|
||||
0xf3111630, // vmin.u16 d1, d1, d16
|
||||
0xf3122630, // vmin.u16 d2, d2, d16
|
||||
0xf3133630, // vmin.u16 d3, d3, d16
|
||||
};
|
||||
static const unsigned int kSplice_clamp_a_lowp[] = {
|
||||
0xe283c002, // add ip, r3, #2
|
||||
0xf4ec0c5f, // vld1.16 {d16[]}, [ip :16]
|
||||
0xf4e30c5f, // vld1.16 {d16[]}, [r3 :16]
|
||||
0xf3133630, // vmin.u16 d3, d3, d16
|
||||
0xf3100613, // vmin.u16 d0, d0, d3
|
||||
0xf3111613, // vmin.u16 d1, d1, d3
|
||||
@ -376,7 +368,7 @@ static const unsigned char kSplice_plus__lowp[] = {
|
||||
0xc5,0xe5,0xdd,0xdf, // vpaddusw %ymm7,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_srcover_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0x62,0x7d,0x79,0x01, // vpbroadcastw (%rcx),%ymm8
|
||||
0xc5,0x3d,0xd9,0xc3, // vpsubusw %ymm3,%ymm8,%ymm8
|
||||
0xc4,0x42,0x5d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm4,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
@ -392,7 +384,7 @@ static const unsigned char kSplice_srcover_lowp[] = {
|
||||
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_dstover_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0x62,0x7d,0x79,0x01, // vpbroadcastw (%rcx),%ymm8
|
||||
0xc5,0x3d,0xd9,0xc7, // vpsubusw %ymm7,%ymm8,%ymm8
|
||||
0xc4,0x42,0x7d,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm0,%ymm9
|
||||
0xc4,0x42,0x7d,0x1d,0xc9, // vpabsw %ymm9,%ymm9
|
||||
@ -408,14 +400,14 @@ static const unsigned char kSplice_dstover_lowp[] = {
|
||||
0xc5,0xbd,0xdd,0xff, // vpaddusw %ymm7,%ymm8,%ymm7
|
||||
};
|
||||
static const unsigned char kSplice_clamp_1_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0x62,0x7d,0x79,0x01, // vpbroadcastw (%rcx),%ymm8
|
||||
0xc4,0xc2,0x7d,0x3a,0xc0, // vpminuw %ymm8,%ymm0,%ymm0
|
||||
0xc4,0xc2,0x75,0x3a,0xc8, // vpminuw %ymm8,%ymm1,%ymm1
|
||||
0xc4,0xc2,0x6d,0x3a,0xd0, // vpminuw %ymm8,%ymm2,%ymm2
|
||||
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_clamp_a_lowp[] = {
|
||||
0xc4,0x62,0x7d,0x79,0x41,0x02, // vpbroadcastw 0x2(%rcx),%ymm8
|
||||
0xc4,0x62,0x7d,0x79,0x01, // vpbroadcastw (%rcx),%ymm8
|
||||
0xc4,0xc2,0x65,0x3a,0xd8, // vpminuw %ymm8,%ymm3,%ymm3
|
||||
0xc4,0xe2,0x7d,0x3a,0xc3, // vpminuw %ymm3,%ymm0,%ymm0
|
||||
0xc4,0xe2,0x75,0x3a,0xcb, // vpminuw %ymm3,%ymm1,%ymm1
|
||||
@ -458,13 +450,9 @@ static const unsigned char kSplice_premul_lowp[] = {
|
||||
static const unsigned char kSplice_scale_u8_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
0xc4,0x62,0x7d,0x30,0x04,0x38, // vpmovzxbw (%rax,%rdi,1),%ymm8
|
||||
0xc4,0xc1,0x35,0x71,0xf0,0x07, // vpsllw $0x7,%ymm8,%ymm9
|
||||
0xc4,0xc1,0x2d,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm8,%ymm10
|
||||
0xc4,0x41,0x35,0xdd,0xca, // vpaddusw %ymm10,%ymm9,%ymm9
|
||||
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
|
||||
0xc4,0x41,0x3d,0xdd,0xc2, // vpaddusw %ymm10,%ymm8,%ymm8
|
||||
0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
|
||||
0xc4,0x41,0x35,0xdd,0xc0, // vpaddusw %ymm8,%ymm9,%ymm8
|
||||
0xc4,0xc1,0x3d,0x71,0xf0,0x08, // vpsllw $0x8,%ymm8,%ymm8
|
||||
0xc4,0x62,0x7d,0x79,0x49,0x02, // vpbroadcastw 0x2(%rcx),%ymm9
|
||||
0xc4,0x41,0x3d,0xe4,0xc1, // vpmulhuw %ymm9,%ymm8,%ymm8
|
||||
0xc4,0xc2,0x7d,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm0,%ymm0
|
||||
0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
|
||||
0xc4,0xc2,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm1
|
||||
@ -494,37 +482,21 @@ static const unsigned char kSplice_load_8888_lowp[] = {
|
||||
0xc5,0x39,0x68,0xc1, // vpunpckhbw %xmm1,%xmm8,%xmm8
|
||||
0xc5,0xe9,0x6c,0xc3, // vpunpcklqdq %xmm3,%xmm2,%xmm0
|
||||
0xc4,0xe2,0x7d,0x30,0xc0, // vpmovzxbw %xmm0,%ymm0
|
||||
0xc5,0xf5,0x71,0xf0,0x07, // vpsllw $0x7,%ymm0,%ymm1
|
||||
0xc5,0xad,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm0,%ymm10
|
||||
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
|
||||
0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
|
||||
0xc4,0xc1,0x7d,0xdd,0xc2, // vpaddusw %ymm10,%ymm0,%ymm0
|
||||
0xc5,0xfd,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm0,%ymm0
|
||||
0xc5,0xf5,0xdd,0xc0, // vpaddusw %ymm0,%ymm1,%ymm0
|
||||
0xc5,0xfd,0x71,0xf0,0x08, // vpsllw $0x8,%ymm0,%ymm0
|
||||
0xc4,0x62,0x7d,0x79,0x51,0x02, // vpbroadcastw 0x2(%rcx),%ymm10
|
||||
0xc4,0xc1,0x7d,0xe4,0xc2, // vpmulhuw %ymm10,%ymm0,%ymm0
|
||||
0xc5,0xe9,0x6d,0xcb, // vpunpckhqdq %xmm3,%xmm2,%xmm1
|
||||
0xc4,0xe2,0x7d,0x30,0xc9, // vpmovzxbw %xmm1,%ymm1
|
||||
0xc5,0xed,0x71,0xf1,0x07, // vpsllw $0x7,%ymm1,%ymm2
|
||||
0xc5,0xe5,0x71,0xd1,0x01, // vpsrlw $0x1,%ymm1,%ymm3
|
||||
0xc5,0xed,0xdd,0xd3, // vpaddusw %ymm3,%ymm2,%ymm2
|
||||
0xc4,0xc1,0x75,0xdd,0xca, // vpaddusw %ymm10,%ymm1,%ymm1
|
||||
0xc5,0xf5,0x71,0xd1,0x08, // vpsrlw $0x8,%ymm1,%ymm1
|
||||
0xc5,0xed,0xdd,0xc9, // vpaddusw %ymm1,%ymm2,%ymm1
|
||||
0xc5,0xf5,0x71,0xf1,0x08, // vpsllw $0x8,%ymm1,%ymm1
|
||||
0xc4,0xc1,0x75,0xe4,0xca, // vpmulhuw %ymm10,%ymm1,%ymm1
|
||||
0xc4,0xc1,0x31,0x6c,0xd0, // vpunpcklqdq %xmm8,%xmm9,%xmm2
|
||||
0xc4,0xe2,0x7d,0x30,0xd2, // vpmovzxbw %xmm2,%ymm2
|
||||
0xc5,0xe5,0x71,0xf2,0x07, // vpsllw $0x7,%ymm2,%ymm3
|
||||
0xc5,0xa5,0x71,0xd2,0x01, // vpsrlw $0x1,%ymm2,%ymm11
|
||||
0xc4,0xc1,0x65,0xdd,0xdb, // vpaddusw %ymm11,%ymm3,%ymm3
|
||||
0xc4,0xc1,0x6d,0xdd,0xd2, // vpaddusw %ymm10,%ymm2,%ymm2
|
||||
0xc5,0xed,0x71,0xd2,0x08, // vpsrlw $0x8,%ymm2,%ymm2
|
||||
0xc5,0xe5,0xdd,0xd2, // vpaddusw %ymm2,%ymm3,%ymm2
|
||||
0xc5,0xed,0x71,0xf2,0x08, // vpsllw $0x8,%ymm2,%ymm2
|
||||
0xc4,0xc1,0x6d,0xe4,0xd2, // vpmulhuw %ymm10,%ymm2,%ymm2
|
||||
0xc4,0xc1,0x31,0x6d,0xd8, // vpunpckhqdq %xmm8,%xmm9,%xmm3
|
||||
0xc4,0xe2,0x7d,0x30,0xdb, // vpmovzxbw %xmm3,%ymm3
|
||||
0xc5,0xbd,0x71,0xf3,0x07, // vpsllw $0x7,%ymm3,%ymm8
|
||||
0xc5,0xb5,0x71,0xd3,0x01, // vpsrlw $0x1,%ymm3,%ymm9
|
||||
0xc4,0x41,0x3d,0xdd,0xc1, // vpaddusw %ymm9,%ymm8,%ymm8
|
||||
0xc4,0xc1,0x65,0xdd,0xda, // vpaddusw %ymm10,%ymm3,%ymm3
|
||||
0xc5,0xe5,0x71,0xd3,0x08, // vpsrlw $0x8,%ymm3,%ymm3
|
||||
0xc5,0xbd,0xdd,0xdb, // vpaddusw %ymm3,%ymm8,%ymm3
|
||||
0xc5,0xe5,0x71,0xf3,0x08, // vpsllw $0x8,%ymm3,%ymm3
|
||||
0xc4,0xc1,0x65,0xe4,0xda, // vpmulhuw %ymm10,%ymm3,%ymm3
|
||||
};
|
||||
static const unsigned char kSplice_store_8888_lowp[] = {
|
||||
0x48,0x8b,0x02, // mov (%rdx),%rax
|
||||
|
@ -41,8 +41,8 @@ struct SkSplicer_constants {
|
||||
};
|
||||
|
||||
struct SkSplicer_constants_lowp {
|
||||
uint16_t _0x0001; // 0x0001 == 1 == epsilon
|
||||
uint16_t _1; // 0x8000 == 32768 == 1.0
|
||||
uint16_t _0x8081; // 0x8081 == 32897, closest value to 32768 * (256/255).
|
||||
};
|
||||
|
||||
#endif//SkSplicer_shared_DEFINED
|
||||
|
@ -124,9 +124,11 @@ using K = const SkSplicer_constants_lowp;
|
||||
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
|
||||
|
||||
static F from_u8(U8 u8, K* k) {
|
||||
// Nothing too interesting here. We follow the stock SkFixed15 formula.
|
||||
// Ideally we'd multiply by 32768/255 = 128.50196...
|
||||
// We can approximate that very cheaply as 256*32897/65536 = 128.50391...
|
||||
// 0 and 255 map to 0 and 32768 correctly, and the max error is 1 (on about 1/4 of values).
|
||||
F u16 = _mm256_cvtepu8_epi16(u8);
|
||||
return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
|
||||
return _mm256_mulhi_epu16(u16 << 8, F(k->_0x8081));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user