baby steps into 16-bit ops

I figure the easiest way to expose 16-bit operations is to expose 16x2
pair operations... this means we can continue to always work with the
same size vector.

Switching from 32-bit multiplies to 16-bit multiplies is going to
deliver the most oomph... they cost roughly half what 32-bit multiplies
do on x86.

Speed now:
    I32_SWAR: 0.27 ns/px
    I32:      0.43 ns/px
    F32:      0.76 ns/px
    RP:       0.8  ns/px
    Opts:     0.2  ns/px

Change-Id: I8350c71722a9bde714ba18f97b8687fe35cc749f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220709
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
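The gist, outside of skvm: treat each 32-bit lane as two independent 16-bit lanes, so vectors keep one size while the multiplier gets cheaper. A standalone scalar sketch of what one lane of the new mul_i16x2 op computes (illustrative only, not Skia code):

    #include <cstdint>
    #include <cstdio>

    // One 32-bit lane viewed as a pair of 16-bit lanes: multiply the halves
    // independently, keeping the low 16 bits of each product (what one lane
    // pair of x86 vpmullw does; vpmulld would be a single 32x32 multiply).
    static uint32_t mul_i16x2(uint32_t x, uint32_t y) {
        uint32_t lo = (uint16_t)((x & 0xffff) * (y & 0xffff));
        uint32_t hi = (uint16_t)((x >> 16)    * (y >> 16));
        return (hi << 16) | lo;
    }

    int main() {
        // Two 8-bit color channels times a 9-bit inverse alpha: inputs and
        // products all fit in 16 bits, so the pair multiply is exact here.
        uint32_t rb   = (0x34u << 16) | 0x12u;  // b=0x34, r=0x12 in the halves
        uint32_t invA = (0x80u << 16) | 0x80u;  // 128 broadcast to both halves
        printf("%08x\n", mul_i16x2(rb, invA)); // prints 1a000900
        return 0;
    }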
parent d9a6511024
commit 3538908983
@@ -574,23 +574,23 @@ r2 = load32 arg(0)
 r3 = extract r2 0 r0
 r4 = extract r2 8 r0
 r5 = extract r2 16 r0
-r2 = extract r2 24 r0
+r2 = shr r2 24
 r6 = load32 arg(1)
 r7 = extract r6 0 r0
 r8 = extract r6 8 r0
 r9 = extract r6 16 r0
-r6 = extract r6 24 r0
+r6 = shr r6 24
 r10 = sub_i32 r1 r2
-r7 = mul_i32 r7 r10
+r7 = mul_i16x2 r7 r10
 r7 = shr r7 8
 r7 = add_i32 r3 r7
-r8 = mul_i32 r8 r10
+r8 = mul_i16x2 r8 r10
 r8 = shr r8 8
 r8 = add_i32 r4 r8
-r9 = mul_i32 r9 r10
+r9 = mul_i16x2 r9 r10
 r9 = shr r9 8
 r9 = add_i32 r5 r9
-r10 = mul_i32 r6 r10
+r10 = mul_i16x2 r6 r10
 r10 = shr r10 8
 r10 = add_i32 r2 r10
 r8 = pack r7 r8 8
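Why the first pair of changes works: extract is a shift followed by a mask (assuming extract(x, bits, z) computes (x >> bits) & z, consistent with how the builders below use it), and for the top byte of a 32-bit value the mask is a no-op, so a plain shr saves the bit_and. A scalar sketch:

    #include <cstdint>
    #include <cassert>

    // Assumed scalar semantics of skvm's extract: shift down, then mask.
    static uint32_t extract(uint32_t x, int bits, uint32_t z) {
        return (x >> bits) & z;
    }

    int main() {
        uint32_t rgba = 0xdeadbeef;
        // Shifting down 24 bits leaves only 8 bits, so '& 0xff' changes nothing:
        assert(extract(rgba, 24, 0xff) == (rgba >> 24));
        return 0;
    }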
@@ -599,27 +599,28 @@ r10 = pack r8 r10 16
 store32 arg(1) r10

 I32 (SWAR) 8888 over 8888
-8 registers, 21 instructions:
+8 registers, 22 instructions:
 r0 = splat FF00FF (2.3418409e-38)
 r1 = splat 100 (3.5873241e-43)
 r2 = splat FF00FF00 (-1.7146522e+38)
 loop:
 r3 = load32 arg(0)
 r4 = extract r3 0 r0
-r3 = extract r3 8 r0
-r5 = load32 arg(1)
-r6 = extract r5 0 r0
-r5 = extract r5 8 r0
-r7 = shr r3 16
-r7 = sub_i32 r1 r7
-r6 = mul_i32 r6 r7
-r6 = shr r6 8
-r7 = mul_i32 r5 r7
-r6 = bit_and r6 r0
-r7 = bit_and r7 r2
-r6 = add_i32 r4 r6
-r3 = shl r3 8
-r3 = add_i32 r3 r7
-r3 = bit_or r6 r3
-store32 arg(1) r3
+r5 = extract r3 8 r0
+r3 = shr r3 24
+r6 = load32 arg(1)
+r7 = extract r6 0 r0
+r6 = extract r6 8 r0
+r3 = sub_i32 r1 r3
+r3 = pack r3 r3 16
+r7 = mul_i16x2 r7 r3
+r7 = shr r7 8
+r3 = mul_i16x2 r6 r3
+r7 = bit_and r7 r0
+r3 = bit_and r3 r2
+r7 = add_i32 r4 r7
+r5 = shl r5 8
+r5 = add_i32 r5 r3
+r5 = bit_or r7 r5
+store32 arg(1) r5
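A legend for the SWAR program above: each 32-bit lane keeps r and b in the low bytes of its two 16-bit halves, g and a one byte higher, and the new code additionally peels off alpha with a plain shift. A scalar model of the loads (same extract assumption as above):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t rgba = 0x44332211;              // a=0x44 b=0x33 g=0x22 r=0x11
        uint32_t rb = (rgba >> 0) & 0x00ff00ff;  // extract rgba 0  -> 0x00330011
        uint32_t ga = (rgba >> 8) & 0x00ff00ff;  // extract rgba 8  -> 0x00440022
        uint32_t a  =  rgba >> 24;               // shr rgba 24     -> 0x00000044
        printf("%08x %08x %08x\n", rb, ga, a);
        return 0;
    }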
@@ -218,6 +218,8 @@ namespace skvm {
     I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
     I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }

+    I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
+
     I32 Builder::bit_and(I32 x, I32 y) { return {this->push(Op::bit_and, x.id, y.id)}; }
     I32 Builder::bit_or (I32 x, I32 y) { return {this->push(Op::bit_or , x.id, y.id)}; }
     I32 Builder::bit_xor(I32 x, I32 y) { return {this->push(Op::bit_xor, x.id, y.id)}; }
@@ -312,6 +314,8 @@ namespace skvm {
     case Op::sub_i32: write(o, V{id}, "= sub_i32", V{x}, V{y}); break;
     case Op::mul_i32: write(o, V{id}, "= mul_i32", V{x}, V{y}); break;

+    case Op::mul_i16x2: write(o, V{id}, "= mul_i16x2", V{x}, V{y}); break;
+
     case Op::bit_and: write(o, V{id}, "= bit_and", V{x}, V{y}); break;
     case Op::bit_or : write(o, V{id}, "= bit_or" , V{x}, V{y}); break;
     case Op::bit_xor: write(o, V{id}, "= bit_xor", V{x}, V{y}); break;
@@ -365,6 +369,8 @@ namespace skvm {
     case Op::sub_i32: write(o, R{d}, "= sub_i32", R{x}, R{y.id}); break;
     case Op::mul_i32: write(o, R{d}, "= mul_i32", R{x}, R{y.id}); break;

+    case Op::mul_i16x2: write(o, R{d}, "= mul_i16x2", R{x}, R{y.id}); break;
+
     case Op::bit_and: write(o, R{d}, "= bit_and", R{x}, R{y.id}); break;
     case Op::bit_or : write(o, R{d}, "= bit_or" , R{x}, R{y.id}); break;
     case Op::bit_xor: write(o, R{d}, "= bit_xor", R{x}, R{y.id}); break;
@@ -469,6 +475,8 @@ namespace skvm {
     case Op::sub_i32: vpsubd (r[d], r[x], r[y.id]); break;
     case Op::mul_i32: vpmulld(r[d], r[x], r[y.id]); break;

+    case Op::mul_i16x2: vpmullw(r[d], r[x], r[y.id]); break;
+
     case Op::bit_and: vandps(r[d], r[x], r[y.id]); break;
     case Op::bit_or : vorps (r[d], r[x], r[y.id]); break;
     case Op::bit_xor: vxorps(r[d], r[x], r[y.id]); break;
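For reference, the two instructions the JIT now chooses between, written as AVX2 intrinsics (a sketch assuming 256-bit ymm registers; the register width isn't shown in this diff):

    #include <immintrin.h>

    // Op::mul_i32 -> vpmulld: one 32x32->32 multiply per 32-bit lane.
    static __m256i mul_i32  (__m256i x, __m256i y) { return _mm256_mullo_epi32(x, y); }

    // Op::mul_i16x2 -> vpmullw: two 16x16->16 multiplies per 32-bit lane,
    // the cheaper instruction on typical x86 cores.
    static __m256i mul_i16x2(__m256i x, __m256i y) { return _mm256_mullo_epi16(x, y); }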
@@ -20,7 +20,7 @@ namespace skvm {
     load8, load32,
     splat,
     add_f32, sub_f32, mul_f32, div_f32, mad_f32,
-    add_i32, sub_i32, mul_i32,
+    add_i32, sub_i32, mul_i32, mul_i16x2,
     bit_and, bit_or, bit_xor,
     shl, shr, sra,
     extract,
@@ -99,6 +99,8 @@ namespace skvm {
     I32 sub(I32 x, I32 y);
     I32 mul(I32 x, I32 y);

+    I32 mul_16x2(I32 x, I32 y);
+
     I32 bit_and(I32 x, I32 y);
     I32 bit_or (I32 x, I32 y);
     I32 bit_xor(I32 x, I32 y);
@@ -28,6 +28,8 @@ namespace SK_OPTS_NS {
     using U32 = skvx::Vec<K, uint32_t>;
     using U8  = skvx::Vec<K, uint8_t>;

+    using I16x2 = skvx::Vec<2*K, int16_t>;
+
     union Slot {
         I32 i32;
         U32 u32;
@@ -124,6 +126,10 @@ namespace SK_OPTS_NS {
     CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y.id).i32; break;
     CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y.id).i32; break;

+    CASE(Op::mul_i16x2):
+        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x   ).i32) *
+                                      skvx::bit_pun<I16x2>(r(y.id).i32) ); break;
+
     CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y.id).i32; break;
     CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y.id).i32; break;
     CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y.id).i32; break;
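skvx::bit_pun reinterprets a vector's bits as another type of the same size, which is how the interpreter runs one I32 vector through a 2x-wider 16-bit multiply. A minimal scalar analogue (memcpy-based punning, little-endian lane order assumed; not skvx's actual implementation):

    #include <cstdint>
    #include <cstring>

    // Reinterpret the bits of Src as Dst; sizes must match (cf. skvx::bit_pun).
    template <typename Dst, typename Src>
    static Dst bit_pun(const Src& src) {
        static_assert(sizeof(Dst) == sizeof(Src), "size mismatch");
        Dst dst;
        std::memcpy(&dst, &src, sizeof(Dst));
        return dst;
    }

    int main() {
        struct I16x2 { int16_t lo, hi; };  // stand-in for skvx::Vec<2,int16_t>
        uint32_t x = 0x00340012, y = 0x00800080;
        I16x2 a = bit_pun<I16x2>(x), b = bit_pun<I16x2>(y);
        I16x2 p = { (int16_t)(a.lo * b.lo), (int16_t)(a.hi * b.hi) };
        return bit_pun<uint32_t>(p) == 0x1a000900 ? 0 : 1;  // exits 0
    }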
@@ -100,7 +100,7 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
         *r = extract(rgba,  0, splat(0xff));
         *g = extract(rgba,  8, splat(0xff));
         *b = extract(rgba, 16, splat(0xff));
-        *a = extract(rgba, 24, splat(0xff));
+        *a = shr    (rgba, 24);
     };

     skvm::I32 r,g,b,a;
@@ -115,11 +115,18 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
     //                  == (d*(255-a+1) )/256
     //                  == (d*(256-a )  )/256

+    // We're doing 8x8 bit multiplies in 32-bit lanes.
+    // Since the inputs and results both fit in 16 bits,
+    // we can use mul_16x2, which tends to be faster than mul.
+    //
+    // (The top 2 zero bytes of the inputs will also multiply
+    // with each other to produce zero... perfect.)
     skvm::I32 invA = sub(splat(256), a);
-    r = add(r, shr(mul(dr, invA), 8));
-    g = add(g, shr(mul(dg, invA), 8));
-    b = add(b, shr(mul(db, invA), 8));
-    a = add(a, shr(mul(da, invA), 8));
+    r = add(r, shr(mul_16x2(dr, invA), 8));
+    g = add(g, shr(mul_16x2(dg, invA), 8));
+    b = add(b, shr(mul_16x2(db, invA), 8));
+    a = add(a, shr(mul_16x2(da, invA), 8));

     r = pack(r, g, 8);
     b = pack(b, a, 8);
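The comment block leans on d*(255-a)/255 ≈ d*(256-a)/256, which replaces the awkward divide-by-255 with a shift. A standalone check of the precision cost over all 8-bit inputs:

    #include <algorithm>
    #include <cstdlib>
    #include <cstdio>

    int main() {
        int worst = 0;
        for (int a = 0; a < 256; a++)
        for (int d = 0; d < 256; d++) {
            int exact  = (d * (255 - a) + 127) / 255;  // rounded exact scale
            int approx = (d * (256 - a)) >> 8;         // shift-friendly version
            worst = std::max(worst, std::abs(exact - approx));
        }
        printf("worst error: %d\n", worst);  // prints 1: off by at most one LSB
        return 0;
    }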
@@ -132,24 +139,27 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
     dst = arg(1);

     auto load = [&](skvm::Arg ptr,
-                    skvm::I32* rb, skvm::I32* ga) {
+                    skvm::I32* rb, skvm::I32* ga, skvm::I32* a) {
         skvm::I32 rgba = load32(ptr);
         *rb = extract(rgba, 0, splat(0x00ff00ff));
         *ga = extract(rgba, 8, splat(0x00ff00ff));
+        * a = shr    (rgba, 24);
     };

-    skvm::I32 rb, ga;
-    load(src, &rb, &ga);
+    skvm::I32 rb, ga, a;
+    load(src, &rb, &ga, &a);

-    skvm::I32 drb, dga;
-    load(dst, &drb, &dga);
+    skvm::I32 drb, dga, da;
+    load(dst, &drb, &dga, &da);

-    // Same approximation as above.
-    skvm::I32 invA = sub(splat(256),
-                         shr(ga, 16));
+    // Same approximation as above,
+    // but this time we make sure to use both i16 multiplies to our benefit,
+    // one for r/g, the other for b/a simultaneously.
+    skvm::I32 invA   = sub(splat(256), a),
+              invAx2 = pack(invA, invA, 16);

-    skvm::I32 RB = shr(mul(drb, invA), 8),  // 8 high bits of results shifted back down.
-              GA =     mul(dga, invA);      // Keep high bits of results in high lanes.
+    skvm::I32 RB = shr(mul_16x2(drb, invAx2), 8),  // 8 high bits of results shifted back down.
+              GA =     mul_16x2(dga, invAx2);      // Keep high bits of results in high lanes.
     RB = bit_and(RB, splat(0x00ff00ff));  // Mask off any low bits remaining.
     GA = bit_and(GA, splat(0xff00ff00));  // Ditto.
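The payoff of the SWAR version: pack(invA, invA, 16) broadcasts the 9-bit inverse alpha into both 16-bit halves (assuming pack(x, y, bits) computes x | (y << bits), which is how the programs above appear to use it), so each mul_16x2 scales two channels at once, and the g/a products come out with their useful bits already sitting in the high bytes. A scalar model of one lane:

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t a      = 0x40;
        uint32_t invA   = 256 - a;              // 0xc0
        uint32_t invAx2 = invA | (invA << 16);  // pack(invA, invA, 16)

        uint32_t dga = 0x00440022;  // g=0x22, a=0x44 in the halves' low bytes
        // mul_16x2: the two 16-bit halves multiplied independently.
        uint32_t GA  = ((uint32_t)(uint16_t)((dga >> 16)    * (invAx2 >> 16)) << 16)
                     |           (uint16_t)((dga & 0xffff) * (invAx2 & 0xffff));
        // Each product's top 8 bits land in its half's high byte, so masking
        // with 0xff00ff00 is all that's needed, no shift, as in the program above.
        printf("%08x\n", GA & 0xff00ff00);  // prints 33001900
        return 0;
    }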