add bit_clear

I was just reading the ARM docs and realized that their BIC ("BIt
Clear") is the same as SSE's ANDN ("AND Not") instruction.  It's kind of
a neat little tool to have lying around... comes up more than you'd
think, and it's sometimes the clearest way to express what you're doing,
as in the changed program here where the comment is "mask away the low
bits".  That's a bit_clear with a mask for what you want to clear away!

And the real reason to write this up is that I want to have a CL to
point to that shows how to add an instruction top to bottom.

Change-Id: I99690ed9c1009427b3986955e7ae6264de4d215c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/223120
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
This commit is contained in:
Mike Klein 2019-06-23 20:35:28 -04:00 committed by Skia Commit-Bot
parent 81eb6323a9
commit 2b7b2a2331
6 changed files with 50 additions and 38 deletions

View File

@ -632,22 +632,21 @@ r10 = pack r8 r10 16
store32 arg(1) r10
I32 (SWAR) 8888 over 8888
7 registers, 16 instructions:
6 registers, 15 instructions:
r0 = splat 1000100 (2.3510604e-38)
r1 = splat FF00FF (2.3418409e-38)
r2 = splat FF00FF00 (-1.7146522e+38)
loop:
r3 = load32 arg(0)
r4 = bytes r3 404
r4 = sub_i16x2 r0 r4
r5 = load32 arg(1)
r6 = bit_and r5 r1
r2 = load32 arg(0)
r3 = bytes r2 404
r3 = sub_i16x2 r0 r3
r4 = load32 arg(1)
r5 = bit_and r4 r1
r4 = shr_i16x2 r4 8
r5 = mul_i16x2 r5 r3
r5 = shr_i16x2 r5 8
r6 = mul_i16x2 r6 r4
r6 = shr_i16x2 r6 8
r4 = mul_i16x2 r5 r4
r4 = bit_and r4 r2
r4 = bit_or r6 r4
r4 = add_i32 r3 r4
store32 arg(1) r4
r3 = mul_i16x2 r4 r3
r3 = bit_clear r3 r1
r3 = bit_or r5 r3
r3 = add_i32 r2 r3
store32 arg(1) r3

View File

@ -244,9 +244,10 @@ namespace skvm {
I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
I32 Builder::shr_16x2(I32 x, int bits) { return {this->push(Op::shr_i16x2, x.id,NA,NA, bits)}; }
I32 Builder::bit_and(I32 x, I32 y) { return {this->push(Op::bit_and, x.id, y.id)}; }
I32 Builder::bit_or (I32 x, I32 y) { return {this->push(Op::bit_or , x.id, y.id)}; }
I32 Builder::bit_xor(I32 x, I32 y) { return {this->push(Op::bit_xor, x.id, y.id)}; }
I32 Builder::bit_and (I32 x, I32 y) { return {this->push(Op::bit_and , x.id, y.id)}; }
I32 Builder::bit_or (I32 x, I32 y) { return {this->push(Op::bit_or , x.id, y.id)}; }
I32 Builder::bit_xor (I32 x, I32 y) { return {this->push(Op::bit_xor , x.id, y.id)}; }
I32 Builder::bit_clear(I32 x, I32 y) { return {this->push(Op::bit_clear, x.id, y.id)}; }
I32 Builder::shl(I32 x, int bits) { return {this->push(Op::shl, x.id,NA,NA, bits)}; }
I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr, x.id,NA,NA, bits)}; }
@ -347,9 +348,10 @@ namespace skvm {
case Op::mul_i16x2: write(o, V{id}, "= mul_i16x2", V{x}, V{y}); break;
case Op::shr_i16x2: write(o, V{id}, "= shr_i16x2", V{x}, Shift{imm}); break;
case Op::bit_and: write(o, V{id}, "= bit_and", V{x}, V{y}); break;
case Op::bit_or : write(o, V{id}, "= bit_or" , V{x}, V{y}); break;
case Op::bit_xor: write(o, V{id}, "= bit_xor", V{x}, V{y}); break;
case Op::bit_and : write(o, V{id}, "= bit_and" , V{x}, V{y}); break;
case Op::bit_or : write(o, V{id}, "= bit_or" , V{x}, V{y}); break;
case Op::bit_xor : write(o, V{id}, "= bit_xor" , V{x}, V{y}); break;
case Op::bit_clear: write(o, V{id}, "= bit_clear", V{x}, V{y}); break;
case Op::shl: write(o, V{id}, "= shl", V{x}, Shift{imm}); break;
case Op::shr: write(o, V{id}, "= shr", V{x}, Shift{imm}); break;
@ -407,9 +409,10 @@ namespace skvm {
case Op::mul_i16x2: write(o, R{d}, "= mul_i16x2", R{x}, R{y}); break;
case Op::shr_i16x2: write(o, R{d}, "= shr_i16x2", R{x}, Shift{imm}); break;
case Op::bit_and: write(o, R{d}, "= bit_and", R{x}, R{y}); break;
case Op::bit_or : write(o, R{d}, "= bit_or" , R{x}, R{y}); break;
case Op::bit_xor: write(o, R{d}, "= bit_xor", R{x}, R{y}); break;
case Op::bit_and : write(o, R{d}, "= bit_and" , R{x}, R{y}); break;
case Op::bit_or : write(o, R{d}, "= bit_or" , R{x}, R{y}); break;
case Op::bit_xor : write(o, R{d}, "= bit_xor" , R{x}, R{y}); break;
case Op::bit_clear: write(o, R{d}, "= bit_clear", R{x}, R{y}); break;
case Op::shl: write(o, R{d}, "= shl", R{x}, Shift{imm}); break;
case Op::shr: write(o, R{d}, "= shr", R{x}, Shift{imm}); break;
@ -596,9 +599,10 @@ namespace skvm {
void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }
void Assembler::vpand(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
void Assembler::vpor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
void Assembler::vpxor(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xef, dst,x,y); }
void Assembler::vpand (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
void Assembler::vpor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
void Assembler::vpxor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xef, dst,x,y); }
void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
void Assembler::vaddps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x58, dst,x,y); }
void Assembler::vsubps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5c, dst,x,y); }
@ -860,9 +864,10 @@ namespace skvm {
case Op::mul_i16x2: a.vpmullw(r(d), r(x), r(y)); break;
case Op::shr_i16x2: a.vpsrlw (r(d), r(x), imm); break;
case Op::bit_and: a.vpand(r(d), r(x), r(y)); break;
case Op::bit_or : a.vpor (r(d), r(x), r(y)); break;
case Op::bit_xor: a.vpxor(r(d), r(x), r(y)); break;
case Op::bit_and : a.vpand (r(d), r(x), r(y)); break;
case Op::bit_or : a.vpor (r(d), r(x), r(y)); break;
case Op::bit_xor : a.vpxor (r(d), r(x), r(y)); break;
case Op::bit_clear: a.vpandn(r(d), r(y), r(x)); break; // N.B. passing y then x.
case Op::shl: a.vpslld(r(d), r(x), imm); break;
case Op::shr: a.vpsrld(r(d), r(x), imm); break;

View File

@ -56,7 +56,7 @@ namespace skvm {
using DstEqXOpY = void(Ymm dst, Ymm x, Ymm y);
DstEqXOpY vpaddd, vpsubd, vpmulld,
vpsubw, vpmullw,
vpand, vpor, vpxor,
vpand, vpor, vpxor, vpandn,
vaddps, vsubps, vmulps, vdivps,
vfmadd132ps, vfmadd213ps, vfmadd231ps,
vpackusdw, vpackuswb;
@ -117,7 +117,7 @@ namespace skvm {
add_f32, sub_f32, mul_f32, div_f32, mad_f32,
add_i32, sub_i32, mul_i32,
sub_i16x2, mul_i16x2, shr_i16x2,
bit_and, bit_or, bit_xor,
bit_and, bit_or, bit_xor, bit_clear,
shl, shr, sra,
extract,
pack,
@ -211,9 +211,10 @@ namespace skvm {
I32 mul_16x2(I32 x, I32 y);
I32 shr_16x2(I32 x, int bits);
I32 bit_and(I32 x, I32 y);
I32 bit_or (I32 x, I32 y);
I32 bit_xor(I32 x, I32 y);
I32 bit_and (I32 x, I32 y);
I32 bit_or (I32 x, I32 y);
I32 bit_xor (I32 x, I32 y);
I32 bit_clear(I32 x, I32 y); // x & ~y
I32 shl(I32 x, int bits);
I32 shr(I32 x, int bits);

View File

@ -138,9 +138,10 @@ namespace SK_OPTS_NS {
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
break;
CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;

View File

@ -362,4 +362,10 @@ DEF_TEST(SkVM_Assembler, r) {
0xc5, 0x79, 0xd6, 0b00'111'010,
});
test_asm(r, [&](A& a) {
a.vpandn(A::ymm3, A::ymm12, A::ymm2);
},{
0xc5, 0x9d, 0xdf, 0xda,
});
}

View File

@ -190,7 +190,7 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
rb = shr_16x2(mul_16x2(rb, invAx2), 8); // Put the high 8 bits back in the low lane.
ga = mul_16x2(ga, invAx2); // Keep the high 8 bits up high...
ga = bit_and(ga, splat(0xff00ff00)); // ...and mask off the low bits.
ga = bit_clear(ga, splat(0x00ff00ff)); // ...and mask off the low bits.
store32(dst, add(s, bit_or(rb, ga)));
}