remove mul_unorm8/mad_unorm8
I just kind of remembered that if we're doing (x*y + x)/256, where x is a destination channel and y is 255 - sa, then you can get the "+x" for free by multiplying by 256 - sa instead:

    d * (255 - sa) + d  ==  d * (255 - sa + 1)  ==  d * (256 - sa)

Duh. This is a trick we play in a lot of legacy code, and I've just now realized it's exactly equivalent to the trick I want to play here... sigh. Folding this math in kind of makes mul_unorm8/mad_unorm8 moot.

Speed's getting good:

    I32_SWAR: 0.3  ns/px
    I32     : 0.55 ns/px
    F32     : 0.8  ns/px
    RP      : 0.8  ns/px
    Opts    : 0.2  ns/px

Change-Id: I4d10db51ea80a3258c36e97b6b334ad253804613 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220708 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
ed8d13089d
commit
821f5e8dfe
@ -566,50 +566,60 @@ r13 = pack r11 r13 16
|
||||
store32 arg(1) r13
|
||||
|
||||
I32 8888 over 8888
|
||||
10 registers, 20 instructions:
|
||||
11 registers, 29 instructions:
|
||||
r0 = splat FF (3.5733111e-43)
|
||||
loop:
|
||||
r1 = load32 arg(0)
|
||||
r2 = extract r1 0 r0
|
||||
r3 = extract r1 8 r0
|
||||
r4 = extract r1 16 r0
|
||||
r1 = extract r1 24 r0
|
||||
r5 = load32 arg(1)
|
||||
r6 = extract r5 0 r0
|
||||
r7 = extract r5 8 r0
|
||||
r8 = extract r5 16 r0
|
||||
r5 = extract r5 24 r0
|
||||
r9 = sub_i32 r0 r1
|
||||
r6 = mad_unorm8 r6 r9 r2
|
||||
r7 = mad_unorm8 r7 r9 r3
|
||||
r8 = mad_unorm8 r8 r9 r4
|
||||
r9 = mad_unorm8 r5 r9 r1
|
||||
r7 = pack r6 r7 8
|
||||
r9 = pack r8 r9 8
|
||||
r9 = pack r7 r9 16
|
||||
store32 arg(1) r9
|
||||
|
||||
I32 (SWAR) 8888 over 8888
|
||||
8 registers, 20 instructions:
|
||||
r0 = splat FF00FF (2.3418409e-38)
|
||||
r1 = splat FF (3.5733111e-43)
|
||||
r1 = splat 100 (3.5873241e-43)
|
||||
loop:
|
||||
r2 = load32 arg(0)
|
||||
r3 = extract r2 0 r0
|
||||
r2 = extract r2 8 r0
|
||||
r4 = load32 arg(1)
|
||||
r5 = extract r4 0 r0
|
||||
r4 = extract r4 8 r0
|
||||
r6 = shr r2 16
|
||||
r6 = sub_i32 r1 r6
|
||||
r7 = mul_i32 r5 r6
|
||||
r7 = add_i32 r7 r5
|
||||
r7 = extract r7 8 r0
|
||||
r4 = extract r2 8 r0
|
||||
r5 = extract r2 16 r0
|
||||
r2 = extract r2 24 r0
|
||||
r6 = load32 arg(1)
|
||||
r7 = extract r6 0 r0
|
||||
r8 = extract r6 8 r0
|
||||
r9 = extract r6 16 r0
|
||||
r6 = extract r6 24 r0
|
||||
r10 = sub_i32 r1 r2
|
||||
r7 = mul_i32 r7 r10
|
||||
r7 = shr r7 8
|
||||
r7 = add_i32 r3 r7
|
||||
r6 = mul_i32 r4 r6
|
||||
r6 = add_i32 r6 r4
|
||||
r6 = extract r6 8 r0
|
||||
r6 = add_i32 r2 r6
|
||||
r6 = pack r7 r6 8
|
||||
store32 arg(1) r6
|
||||
r8 = mul_i32 r8 r10
|
||||
r8 = shr r8 8
|
||||
r8 = add_i32 r4 r8
|
||||
r9 = mul_i32 r9 r10
|
||||
r9 = shr r9 8
|
||||
r9 = add_i32 r5 r9
|
||||
r10 = mul_i32 r6 r10
|
||||
r10 = shr r10 8
|
||||
r10 = add_i32 r2 r10
|
||||
r8 = pack r7 r8 8
|
||||
r10 = pack r9 r10 8
|
||||
r10 = pack r8 r10 16
|
||||
store32 arg(1) r10
|
||||
|
||||
I32 (SWAR) 8888 over 8888
|
||||
8 registers, 21 instructions:
|
||||
r0 = splat FF00FF (2.3418409e-38)
|
||||
r1 = splat 100 (3.5873241e-43)
|
||||
r2 = splat FF00FF00 (-1.7146522e+38)
|
||||
loop:
|
||||
r3 = load32 arg(0)
|
||||
r4 = extract r3 0 r0
|
||||
r3 = extract r3 8 r0
|
||||
r5 = load32 arg(1)
|
||||
r6 = extract r5 0 r0
|
||||
r5 = extract r5 8 r0
|
||||
r7 = shr r3 16
|
||||
r7 = sub_i32 r1 r7
|
||||
r6 = mul_i32 r6 r7
|
||||
r6 = shr r6 8
|
||||
r7 = mul_i32 r5 r7
|
||||
r6 = bit_and r6 r0
|
||||
r7 = bit_and r7 r2
|
||||
r6 = add_i32 r4 r6
|
||||
r3 = shl r3 8
|
||||
r3 = add_i32 r3 r7
|
||||
r3 = bit_or r6 r3
|
||||
store32 arg(1) r3
|
||||
|
||||
|
@ -226,11 +226,6 @@ namespace skvm {
|
||||
I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr, x.id,NA,NA, bits)}; }
|
||||
I32 Builder::sra(I32 x, int bits) { return {this->push(Op::sra, x.id,NA,NA, bits)}; }
|
||||
|
||||
I32 Builder::mul_unorm8(I32 x, I32 y) { return {this->push(Op::mul_unorm8, x.id, y.id)}; }
|
||||
I32 Builder::mad_unorm8(I32 x, I32 y, I32 z) {
|
||||
return {this->push(Op::mad_unorm8, x.id, y.id, z.id)};
|
||||
}
|
||||
|
||||
I32 Builder::extract(I32 x, int bits, I32 z) {
|
||||
return {this->push(Op::extract, x.id,NA,z.id, bits,0)};
|
||||
}
|
||||
@ -325,9 +320,6 @@ namespace skvm {
|
||||
case Op::shr: write(o, V{id}, "= shr", V{x}, Shift{immy}); break;
|
||||
case Op::sra: write(o, V{id}, "= sra", V{x}, Shift{immy}); break;
|
||||
|
||||
case Op::mul_unorm8: write(o, V{id}, "= mul_unorm8", V{x}, V{y} ); break;
|
||||
case Op::mad_unorm8: write(o, V{id}, "= mad_unorm8", V{x}, V{y}, V{z}); break;
|
||||
|
||||
case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{immy}, V{z}); break;
|
||||
case Op::pack: write(o, V{id}, "= pack", V{x}, V{y}, Shift{immz}); break;
|
||||
|
||||
@ -381,9 +373,6 @@ namespace skvm {
|
||||
case Op::shr: write(o, R{d}, "= shr", R{x}, Shift{y.imm}); break;
|
||||
case Op::sra: write(o, R{d}, "= sra", R{x}, Shift{y.imm}); break;
|
||||
|
||||
case Op::mul_unorm8: write(o, R{d}, "= mul_unorm8", R{x}, R{y.id} ); break;
|
||||
case Op::mad_unorm8: write(o, R{d}, "= mad_unorm8", R{x}, R{y.id}, R{z.id}); break;
|
||||
|
||||
case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break;
|
||||
case Op::pack: write(o, R{d}, "= pack", R{x}, R{y.id}, Shift{z.imm}); break;
|
||||
|
||||
@ -488,17 +477,6 @@ namespace skvm {
|
||||
case Op::shr: vpsrld(r[d], r[x], y.imm); break;
|
||||
case Op::sra: vpsrad(r[d], r[x], y.imm); break;
|
||||
|
||||
case Op::mul_unorm8: vpmulld(tmp, r[x], r[y.id]);
|
||||
vpaddd (tmp, tmp, r[x]);
|
||||
vpsrad (r[d],tmp, 8);
|
||||
break;
|
||||
|
||||
case Op::mad_unorm8: vpmulld(tmp, r[x], r[y.id]);
|
||||
vpaddd (tmp, tmp, r[x]);
|
||||
vpsrad (tmp, tmp, 8);
|
||||
vpaddd (r[d],tmp, r[z.id]);
|
||||
break;
|
||||
|
||||
case Op::extract: if (y.imm) {
|
||||
vpsrld(tmp, r[x], y.imm);
|
||||
vandps(r[d], tmp, r[z.id]);
|
||||
|
@ -23,7 +23,6 @@ namespace skvm {
|
||||
add_i32, sub_i32, mul_i32,
|
||||
bit_and, bit_or, bit_xor,
|
||||
shl, shr, sra,
|
||||
mul_unorm8, mad_unorm8,
|
||||
extract,
|
||||
pack,
|
||||
to_f32, to_i32,
|
||||
@ -86,8 +85,9 @@ namespace skvm {
|
||||
I32 load8 (Arg ptr);
|
||||
I32 load32(Arg ptr);
|
||||
|
||||
I32 splat(int n);
|
||||
F32 splat(float f);
|
||||
I32 splat(int n);
|
||||
I32 splat(unsigned u) { return this->splat((int)u); }
|
||||
F32 splat(float f);
|
||||
|
||||
F32 add(F32 x, F32 y);
|
||||
F32 sub(F32 x, F32 y);
|
||||
@ -107,15 +107,8 @@ namespace skvm {
|
||||
I32 shr(I32 x, int bits);
|
||||
I32 sra(I32 x, int bits);
|
||||
|
||||
I32 mul_unorm8(I32 x, I32 y); // (x*y+x)/256, approximating (x*y+127)/255.
|
||||
I32 mad_unorm8(I32 x, I32 y, I32 z); // mul_unorm8(x,y) + z
|
||||
|
||||
I32 extract(I32 x, int bits, I32 z); // (x >> bits) & z
|
||||
|
||||
// Interlace bits from x and y as if x | (y << bits),
|
||||
// assuming no bits from x and (y << bits) collide with each other, (x & (y << bits)) == 0.
|
||||
// (This allows implementation with SSE punpckl?? or NEON vzip.?? instructions.)
|
||||
I32 pack(I32 x, I32 y, int bits);
|
||||
I32 pack (I32 x, I32 y, int bits); // x | (y << bits)
|
||||
|
||||
F32 to_f32(I32 x);
|
||||
I32 to_i32(F32 x);
|
||||
|
@ -132,11 +132,6 @@ namespace SK_OPTS_NS {
|
||||
CASE(Op::sra): r(d).i32 = r(x).i32 >> y.imm; break;
|
||||
CASE(Op::shr): r(d).u32 = r(x).u32 >> y.imm; break;
|
||||
|
||||
CASE(Op::mul_unorm8):
|
||||
r(d).u32 = (r(x).u32 * r(y.id).u32 + r(x).u32)/256 ; break;
|
||||
CASE(Op::mad_unorm8):
|
||||
r(d).u32 = (r(x).u32 * r(y.id).u32 + r(x).u32)/256 + r(z.id).u32; break;
|
||||
|
||||
CASE(Op::extract): r(d).u32 = (r(x).u32 >> y.imm) & r(z.id).u32; break;
|
||||
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y.id).u32 << z.imm); break;
|
||||
|
||||
|
@ -109,11 +109,17 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
|
||||
skvm::I32 dr,dg,db,da;
|
||||
load(dst, &dr,&dg,&db,&da);
|
||||
|
||||
skvm::I32 invA = sub(splat(0xff), a);
|
||||
r = mad_unorm8(dr, invA, r);
|
||||
g = mad_unorm8(dg, invA, g);
|
||||
b = mad_unorm8(db, invA, b);
|
||||
a = mad_unorm8(da, invA, a);
|
||||
// (xy + x)/256 is a good approximation of (xy + 127)/255
|
||||
//
|
||||
// == (d*(255-a) + d)/256
|
||||
// == (d*(255-a+1) )/256
|
||||
// == (d*(256-a ) )/256
|
||||
|
||||
skvm::I32 invA = sub(splat(256), a);
|
||||
r = add(r, shr(mul(dr, invA), 8));
|
||||
g = add(g, shr(mul(dg, invA), 8));
|
||||
b = add(b, shr(mul(db, invA), 8));
|
||||
a = add(a, shr(mul(da, invA), 8));
|
||||
|
||||
r = pack(r, g, 8);
|
||||
b = pack(b, a, 8);
|
||||
@ -132,22 +138,23 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
|
||||
*ga = extract(rgba, 8, splat(0x00ff00ff));
|
||||
};
|
||||
|
||||
auto mul_unorm8_SWAR = [&](skvm::I32 x, skvm::I32 y) {
|
||||
// As above, assuming x is two SWAR bytes in lanes 0 and 2, and y is a byte.
|
||||
skvm::I32 _255 = splat(0x00ff00ff);
|
||||
return extract(add(mul(x, y), x), 8, _255);
|
||||
};
|
||||
|
||||
skvm::I32 rb, ga;
|
||||
load(src, &rb, &ga);
|
||||
|
||||
skvm::I32 drb, dga;
|
||||
load(dst, &drb, &dga);
|
||||
|
||||
skvm::I32 _255 = splat(0xff),
|
||||
invA = sub(_255, shr(ga, 16));
|
||||
rb = add(rb, mul_unorm8_SWAR(drb, invA));
|
||||
ga = add(ga, mul_unorm8_SWAR(dga, invA));
|
||||
// Same approximation as above.
|
||||
skvm::I32 invA = sub(splat(256),
|
||||
shr(ga, 16));
|
||||
|
||||
store32(dst, pack(rb, ga, 8));
|
||||
skvm::I32 RB = shr(mul(drb, invA), 8), // 8 high bits of results shifted back down.
|
||||
GA = mul(dga, invA); // Keep high bits of results in high lanes.
|
||||
RB = bit_and(RB, splat(0x00ff00ff)); // Mask off any low bits remaining.
|
||||
GA = bit_and(GA, splat(0xff00ff00)); // Ditto.
|
||||
|
||||
rb = add( rb , RB); // src += dst*invA
|
||||
ga = add(shl(ga, 8), GA);
|
||||
|
||||
store32(dst, bit_or(rb,ga));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user