baby steps into 16-bit ops

I figure the easiest way to expose 16-bit operations
is as 16x2 pair operations... that way we can keep
working with the same size vector everywhere.
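
Concretely, per 32-bit lane a 16x2 multiply just treats the lane as two
independent 16-bit halves. Here's a standalone C++ sketch of that idea
(illustrative only, not Skia API; the interpreter case in this CL does
the same thing across whole vectors with skvx::bit_pun):

   #include <cstdint>
   #include <cstring>

   // Multiply a 32-bit value as two independent 16-bit halves, keeping
   // the results in place so the overall width never changes.
   static inline uint32_t mul_i16x2(uint32_t x, uint32_t y) {
       uint16_t xs[2], ys[2], rs[2];
       std::memcpy(xs, &x, sizeof xs);
       std::memcpy(ys, &y, sizeof ys);
       rs[0] = (uint16_t)((uint32_t)xs[0] * ys[0]);   // first  16-bit pair
       rs[1] = (uint16_t)((uint32_t)xs[1] * ys[1]);   // second 16-bit pair
       uint32_t r;
       std::memcpy(&r, rs, sizeof r);
       return r;
   }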

Switching from 32-bit multiplies to 16-bit multiplies
is going to deliver the most oomph... they cost roughly
half what 32-bit multiplies do on x86.
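
On AVX2 that's the difference between vpmulld and vpmullw, which is the
swap the JIT makes in this CL. A rough standalone illustration with
intrinsics (not the JIT code itself; exact uop counts and latencies vary
by microarchitecture, so "roughly half" is approximate):

   #include <immintrin.h>   // requires AVX2 (-mavx2)

   // 8x 32-bit multiplies per 256-bit register: vpmulld, typically the
   // slower option (often 2 uops / ~10-cycle latency on Intel cores).
   static inline __m256i mul_i32(__m256i x, __m256i y) {
       return _mm256_mullo_epi32(x, y);
   }

   // 16x 16-bit multiplies over the same 256 bits: vpmullw, typically
   // 1 uop / ~5-cycle latency, i.e. roughly half the cost.
   static inline __m256i mul_i16x2(__m256i x, __m256i y) {
       return _mm256_mullo_epi16(x, y);
   }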

Speed now:

   I32_SWAR: 0.27 ns/px
   I32:      0.43 ns/px
   F32:      0.76 ns/px
   RP:       0.8  ns/px
   Opts:     0.2  ns/px

Change-Id: I8350c71722a9bde714ba18f97b8687fe35cc749f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220709
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Mike Klein 2019-06-13 11:29:26 -05:00, committed by Skia Commit-Bot
parent d9a6511024
commit 3538908983
5 changed files with 66 additions and 39 deletions

@@ -574,23 +574,23 @@ r2 = load32 arg(0)
r3 = extract r2 0 r0
r4 = extract r2 8 r0
r5 = extract r2 16 r0
r2 = extract r2 24 r0
r2 = shr r2 24
r6 = load32 arg(1)
r7 = extract r6 0 r0
r8 = extract r6 8 r0
r9 = extract r6 16 r0
r6 = extract r6 24 r0
r6 = shr r6 24
r10 = sub_i32 r1 r2
r7 = mul_i32 r7 r10
r7 = mul_i16x2 r7 r10
r7 = shr r7 8
r7 = add_i32 r3 r7
r8 = mul_i32 r8 r10
r8 = mul_i16x2 r8 r10
r8 = shr r8 8
r8 = add_i32 r4 r8
r9 = mul_i32 r9 r10
r9 = mul_i16x2 r9 r10
r9 = shr r9 8
r9 = add_i32 r5 r9
r10 = mul_i32 r6 r10
r10 = mul_i16x2 r6 r10
r10 = shr r10 8
r10 = add_i32 r2 r10
r8 = pack r7 r8 8
@@ -599,27 +599,28 @@ r10 = pack r8 r10 16
store32 arg(1) r10
I32 (SWAR) 8888 over 8888
8 registers, 21 instructions:
8 registers, 22 instructions:
r0 = splat FF00FF (2.3418409e-38)
r1 = splat 100 (3.5873241e-43)
r2 = splat FF00FF00 (-1.7146522e+38)
loop:
r3 = load32 arg(0)
r4 = extract r3 0 r0
r3 = extract r3 8 r0
r5 = load32 arg(1)
r6 = extract r5 0 r0
r5 = extract r5 8 r0
r7 = shr r3 16
r7 = sub_i32 r1 r7
r6 = mul_i32 r6 r7
r6 = shr r6 8
r7 = mul_i32 r5 r7
r6 = bit_and r6 r0
r7 = bit_and r7 r2
r6 = add_i32 r4 r6
r3 = shl r3 8
r3 = add_i32 r3 r7
r3 = bit_or r6 r3
store32 arg(1) r3
r5 = extract r3 8 r0
r3 = shr r3 24
r6 = load32 arg(1)
r7 = extract r6 0 r0
r6 = extract r6 8 r0
r3 = sub_i32 r1 r3
r3 = pack r3 r3 16
r7 = mul_i16x2 r7 r3
r7 = shr r7 8
r3 = mul_i16x2 r6 r3
r7 = bit_and r7 r0
r3 = bit_and r3 r2
r7 = add_i32 r4 r7
r5 = shl r5 8
r5 = add_i32 r5 r3
r5 = bit_or r7 r5
store32 arg(1) r5

@@ -218,6 +218,8 @@ namespace skvm {
I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }
I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
I32 Builder::bit_and(I32 x, I32 y) { return {this->push(Op::bit_and, x.id, y.id)}; }
I32 Builder::bit_or (I32 x, I32 y) { return {this->push(Op::bit_or , x.id, y.id)}; }
I32 Builder::bit_xor(I32 x, I32 y) { return {this->push(Op::bit_xor, x.id, y.id)}; }
@@ -312,6 +314,8 @@ namespace skvm {
case Op::sub_i32: write(o, V{id}, "= sub_i32", V{x}, V{y}); break;
case Op::mul_i32: write(o, V{id}, "= mul_i32", V{x}, V{y}); break;
case Op::mul_i16x2: write(o, V{id}, "= mul_i16x2", V{x}, V{y}); break;
case Op::bit_and: write(o, V{id}, "= bit_and", V{x}, V{y}); break;
case Op::bit_or : write(o, V{id}, "= bit_or" , V{x}, V{y}); break;
case Op::bit_xor: write(o, V{id}, "= bit_xor", V{x}, V{y}); break;
@@ -365,6 +369,8 @@ namespace skvm {
case Op::sub_i32: write(o, R{d}, "= sub_i32", R{x}, R{y.id}); break;
case Op::mul_i32: write(o, R{d}, "= mul_i32", R{x}, R{y.id}); break;
case Op::mul_i16x2: write(o, R{d}, "= mul_i16x2", R{x}, R{y.id}); break;
case Op::bit_and: write(o, R{d}, "= bit_and", R{x}, R{y.id}); break;
case Op::bit_or : write(o, R{d}, "= bit_or" , R{x}, R{y.id}); break;
case Op::bit_xor: write(o, R{d}, "= bit_xor", R{x}, R{y.id}); break;
@@ -469,6 +475,8 @@ namespace skvm {
case Op::sub_i32: vpsubd (r[d], r[x], r[y.id]); break;
case Op::mul_i32: vpmulld(r[d], r[x], r[y.id]); break;
case Op::mul_i16x2: vpmullw(r[d], r[x], r[y.id]); break;
case Op::bit_and: vandps(r[d], r[x], r[y.id]); break;
case Op::bit_or : vorps (r[d], r[x], r[y.id]); break;
case Op::bit_xor: vxorps(r[d], r[x], r[y.id]); break;

@@ -20,7 +20,7 @@ namespace skvm {
load8, load32,
splat,
add_f32, sub_f32, mul_f32, div_f32, mad_f32,
add_i32, sub_i32, mul_i32,
add_i32, sub_i32, mul_i32, mul_i16x2,
bit_and, bit_or, bit_xor,
shl, shr, sra,
extract,
@@ -99,6 +99,8 @@ namespace skvm {
I32 sub(I32 x, I32 y);
I32 mul(I32 x, I32 y);
I32 mul_16x2(I32 x, I32 y);
I32 bit_and(I32 x, I32 y);
I32 bit_or (I32 x, I32 y);
I32 bit_xor(I32 x, I32 y);

@@ -28,6 +28,8 @@ namespace SK_OPTS_NS {
using U32 = skvx::Vec<K, uint32_t>;
using U8 = skvx::Vec<K, uint8_t>;
using I16x2 = skvx::Vec<2*K, int16_t>;
union Slot {
I32 i32;
U32 u32;
@@ -124,6 +126,10 @@ namespace SK_OPTS_NS {
CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y.id).i32; break;
CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y.id).i32; break;
CASE(Op::mul_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x ).i32) *
skvx::bit_pun<I16x2>(r(y.id).i32) ); break;
CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y.id).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y.id).i32; break;
CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y.id).i32; break;

@@ -100,7 +100,7 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
*r = extract(rgba, 0, splat(0xff));
*g = extract(rgba, 8, splat(0xff));
*b = extract(rgba, 16, splat(0xff));
*a = extract(rgba, 24, splat(0xff));
*a = shr (rgba, 24);
};
skvm::I32 r,g,b,a;
@@ -115,11 +115,18 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
// == (d*(255-a+1) )/256
// == (d*(256-a ) )/256
// We're doing 8x8 bit multiplies in 32-bit lanes.
// Since the inputs and results both fit in 16 bits,
// we can use mul_16x2, which tends to be faster than mul.
//
// (The top 2 zero bytes of the inputs will also multiply
// with each other to produce zero... perfect.)
skvm::I32 invA = sub(splat(256), a);
r = add(r, shr(mul(dr, invA), 8));
g = add(g, shr(mul(dg, invA), 8));
b = add(b, shr(mul(db, invA), 8));
a = add(a, shr(mul(da, invA), 8));
r = add(r, shr(mul_16x2(dr, invA), 8));
g = add(g, shr(mul_16x2(dg, invA), 8));
b = add(b, shr(mul_16x2(db, invA), 8));
a = add(a, shr(mul_16x2(da, invA), 8));
r = pack(r, g, 8);
b = pack(b, a, 8);
@@ -132,24 +139,27 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
dst = arg(1);
auto load = [&](skvm::Arg ptr,
skvm::I32* rb, skvm::I32* ga) {
skvm::I32* rb, skvm::I32* ga, skvm::I32* a) {
skvm::I32 rgba = load32(ptr);
*rb = extract(rgba, 0, splat(0x00ff00ff));
*ga = extract(rgba, 8, splat(0x00ff00ff));
* a = shr (rgba, 24);
};
skvm::I32 rb, ga;
load(src, &rb, &ga);
skvm::I32 rb, ga, a;
load(src, &rb, &ga, &a);
skvm::I32 drb, dga;
load(dst, &drb, &dga);
skvm::I32 drb, dga, da;
load(dst, &drb, &dga, &da);
// Same approximation as above.
skvm::I32 invA = sub(splat(256),
shr(ga, 16));
// Same approximation as above,
// but this time we make sure to use both i16 multiplies to our benefit,
// one for r/g, the other for b/a simultaneously.
skvm::I32 invA = sub(splat(256), a),
invAx2 = pack(invA, invA, 16);
skvm::I32 RB = shr(mul(drb, invA), 8), // 8 high bits of results shifted back down.
GA = mul(dga, invA); // Keep high bits of results in high lanes.
skvm::I32 RB = shr(mul_16x2(drb, invAx2), 8), // 8 high bits of results shifted back down.
GA = mul_16x2(dga, invAx2); // Keep high bits of results in high lanes.
RB = bit_and(RB, splat(0x00ff00ff)); // Mask off any low bits remaining.
GA = bit_and(GA, splat(0xff00ff00)); // Ditto.