remove little-used bit_clear() and bytes()

bit_clear() is just another bit_and(),
and bytes() is a way of expressing pshufb
that we never really use (yet).

Can always add them back later, but there's
some extra complexity to think about for each
that I'd like to not think about now:

  - common sub-expression elimination between bit_and and bit_clear
  - large constant management JIT'ing bytes

Change-Id: I3a54afa963231fec1d5de949acc647e3430ed0d8
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281557
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2020-04-03 12:01:24 -05:00 committed by Skia Commit-Bot
parent b1cef9d6f6
commit cca2acfb77
6 changed files with 113 additions and 278 deletions

View File

@ -607,38 +607,42 @@ loop:
32 store32 arg(1) r7
I32 8888 over 8888
29 values (originally 29):
33 values (originally 33):
v0 = load32 arg(0)
v1 = shr_i32 v0 24
↑ v2 = splat 100 (3.5873241e-43)
v3 = sub_i32 v2 v1
v4 = load32 arg(1)
v5 = bytes v4 3
v6 = mul_i16x2 v5 v3
v7 = shr_i32 v6 8
v8 = bytes v0 3
v9 = add_i32 v8 v7
v10 = shr_i32 v4 24
v11 = mul_i16x2 v10 v3
v12 = shr_i32 v11 8
v13 = add_i32 v1 v12
v14 = pack v9 v13 8
v15 = bytes v4 2
v16 = mul_i16x2 v15 v3
v17 = shr_i32 v16 8
v18 = bytes v0 2
v19 = add_i32 v18 v17
↑ v20 = splat FF (3.5733111e-43)
v21 = bit_and v4 v20
v22 = mul_i16x2 v21 v3
v23 = shr_i32 v22 8
v24 = bit_and v0 v20
v25 = add_i32 v24 v23
v26 = pack v25 v19 8
v27 = pack v26 v14 16
store32 arg(1) v27
v5 = shr_i32 v4 16
↑ v6 = splat FF (3.5733111e-43)
v7 = bit_and v6 v5
v8 = mul_i16x2 v7 v3
v9 = shr_i32 v8 8
v10 = shr_i32 v0 16
v11 = bit_and v6 v10
v12 = add_i32 v11 v9
v13 = shr_i32 v4 24
v14 = mul_i16x2 v13 v3
v15 = shr_i32 v14 8
v16 = add_i32 v1 v15
v17 = pack v12 v16 8
v18 = shr_i32 v4 8
v19 = bit_and v6 v18
v20 = mul_i16x2 v19 v3
v21 = shr_i32 v20 8
v22 = shr_i32 v0 8
v23 = bit_and v6 v22
v24 = add_i32 v23 v21
v25 = bit_and v6 v4
v26 = mul_i16x2 v25 v3
v27 = shr_i32 v26 8
v28 = bit_and v6 v0
v29 = add_i32 v28 v27
v30 = pack v29 v24 8
v31 = pack v30 v17 16
store32 arg(1) v31
8 registers, 29 instructions:
8 registers, 33 instructions:
0 r0 = splat 100 (3.5873241e-43)
1 r1 = splat FF (3.5733111e-43)
loop:
@ -646,65 +650,79 @@ loop:
3 r3 = shr_i32 r2 24
4 r4 = sub_i32 r0 r3
5 r5 = load32 arg(1)
6 r6 = bytes r5 3
7 r6 = mul_i16x2 r6 r4
8 r6 = shr_i32 r6 8
9 r7 = bytes r2 3
10 r6 = add_i32 r7 r6
11 r7 = shr_i32 r5 24
12 r7 = mul_i16x2 r7 r4
13 r7 = shr_i32 r7 8
14 r7 = add_i32 r3 r7
15 r7 = pack r6 r7 8
16 r6 = bytes r5 2
17 r6 = mul_i16x2 r6 r4
18 r6 = shr_i32 r6 8
19 r3 = bytes r2 2
20 r6 = add_i32 r3 r6
21 r5 = bit_and r5 r1
22 r4 = mul_i16x2 r5 r4
23 r4 = shr_i32 r4 8
24 r2 = bit_and r2 r1
25 r4 = add_i32 r2 r4
26 r6 = pack r4 r6 8
27 r7 = pack r6 r7 16
28 store32 arg(1) r7
6 r6 = shr_i32 r5 16
7 r6 = bit_and r1 r6
8 r6 = mul_i16x2 r6 r4
9 r6 = shr_i32 r6 8
10 r7 = shr_i32 r2 16
11 r7 = bit_and r1 r7
12 r6 = add_i32 r7 r6
13 r7 = shr_i32 r5 24
14 r7 = mul_i16x2 r7 r4
15 r7 = shr_i32 r7 8
16 r7 = add_i32 r3 r7
17 r7 = pack r6 r7 8
18 r6 = shr_i32 r5 8
19 r6 = bit_and r1 r6
20 r6 = mul_i16x2 r6 r4
21 r6 = shr_i32 r6 8
22 r3 = shr_i32 r2 8
23 r3 = bit_and r1 r3
24 r6 = add_i32 r3 r6
25 r5 = bit_and r1 r5
26 r4 = mul_i16x2 r5 r4
27 r4 = shr_i32 r4 8
28 r2 = bit_and r1 r2
29 r4 = add_i32 r2 r4
30 r6 = pack r4 r6 8
31 r7 = pack r6 r7 16
32 store32 arg(1) r7
I32 (SWAR) 8888 over 8888
15 values (originally 15):
20 values (originally 21):
v0 = load32 arg(0)
v1 = bytes v0 404
↑ v2 = splat 1000100 (2.3510604e-38)
v3 = sub_i16x2 v2 v1
v4 = load32 arg(1)
v5 = shr_i16x2 v4 8
v6 = mul_i16x2 v5 v3
↑ v7 = splat FF00FF (2.3418409e-38)
v8 = bit_clear v6 v7
v9 = bit_and v4 v7
v10 = mul_i16x2 v9 v3
v11 = shr_i16x2 v10 8
v12 = bit_or v11 v8
v13 = add_i32 v0 v12
store32 arg(1) v13
v1 = shr_i32 v0 8
↑ v2 = splat FF0000 (2.3418052e-38)
v3 = bit_and v2 v1
v4 = shr_i32 v0 24
v5 = bit_or v4 v3
↑ v6 = splat 1000100 (2.3510604e-38)
v7 = sub_i16x2 v6 v5
v8 = load32 arg(1)
v9 = shr_i16x2 v8 8
v10 = mul_i16x2 v9 v7
↑ v11 = splat FF00FF00 (-1.7146522e+38)
v12 = bit_and v10 v11
↑ v13 = splat FF00FF (2.3418409e-38)
v14 = bit_and v8 v13
v15 = mul_i16x2 v14 v7
v16 = shr_i16x2 v15 8
v17 = bit_or v16 v12
v18 = add_i32 v0 v17
store32 arg(1) v18
6 registers, 15 instructions:
0 r0 = splat 1000100 (2.3510604e-38)
1 r1 = splat FF00FF (2.3418409e-38)
8 registers, 20 instructions:
0 r0 = splat FF0000 (2.3418052e-38)
1 r1 = splat 1000100 (2.3510604e-38)
2 r2 = splat FF00FF00 (-1.7146522e+38)
3 r3 = splat FF00FF (2.3418409e-38)
loop:
2 r2 = load32 arg(0)
3 r3 = bytes r2 404
4 r3 = sub_i16x2 r0 r3
5 r4 = load32 arg(1)
6 r5 = shr_i16x2 r4 8
7 r5 = mul_i16x2 r5 r3
8 r5 = bit_clear r5 r1
9 r4 = bit_and r4 r1
10 r3 = mul_i16x2 r4 r3
11 r3 = shr_i16x2 r3 8
12 r5 = bit_or r3 r5
13 r5 = add_i32 r2 r5
14 store32 arg(1) r5
4 r4 = load32 arg(0)
5 r5 = shr_i32 r4 8
6 r5 = bit_and r0 r5
7 r6 = shr_i32 r4 24
8 r5 = bit_or r6 r5
9 r5 = sub_i16x2 r1 r5
10 r6 = load32 arg(1)
11 r7 = shr_i16x2 r6 8
12 r7 = mul_i16x2 r7 r5
13 r7 = bit_and r7 r2
14 r6 = bit_and r6 r3
15 r5 = mul_i16x2 r6 r5
16 r5 = shr_i16x2 r5 8
17 r7 = bit_or r5 r7
18 r7 = add_i32 r4 r7
19 store32 arg(1) r7
6 values (originally 6):
↟ v0 = splat 2 (2.8025969e-45)

View File

@ -284,14 +284,12 @@ namespace skvm {
case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y} ); break;
case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y} ); break;
case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y} ); break;
case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y} ); break;
case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
case Op::bytes: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
case Op::pack: write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break;
case Op::floor: write(o, V{id}, "=", op, V{x}); break;
@ -404,14 +402,12 @@ namespace skvm {
case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y} ); break;
case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y} ); break;
case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y} ); break;
case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y} ); break;
case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
case Op::bytes: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
case Op::pack: write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break;
case Op::floor: write(o, R{d}, "=", op, R{x}); break;
@ -425,8 +421,8 @@ namespace skvm {
std::vector<Instruction> specialize_for_jit(std::vector<Instruction> program) {
// We could use a temporary Builder to let new Instructions participate in common
// sub-expression elimination, but there's only a tiny chance of hitting anything valuable
// with the specializations we've got today. Worth keeping in mind for the future though.
// sub-expression elimination, but we'll never hit anything valuable with the
// specializations we've got today. Worth keeping in mind for the future though.
for (Val i = 0; i < (Val)program.size(); i++) {
#if defined(SK_CPU_X86)
Instruction& inst = program[i];
@ -465,13 +461,6 @@ namespace skvm {
inst.y = NA;
inst.immy = bits;
} break;
case Op::bit_clear:
if (int bits; is_imm(inst.y, &bits)) {
inst.op = Op::bit_and_imm;
inst.y = NA;
inst.immy = ~bits;
} break;
}
#endif
}
@ -1020,14 +1009,6 @@ namespace skvm {
if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y
return {this, push(Op::bit_xor, x.id, y.id)};
}
I32 Builder::bit_clear(I32 x, I32 y) {
int X,Y;
if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&~Y); }
if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x
if (this->isImm(y.id,~0)) { return this->splat(0); } // (x & ~true) == false
if (this->isImm(x.id, 0)) { return this->splat(0); } // (false & ~y) == false
return {this, push(Op::bit_clear, x.id, y.id)};
}
I32 Builder::select(I32 x, I32 y, I32 z) {
int X,Y,Z;
@ -1048,10 +1029,6 @@ namespace skvm {
return {this, push(Op::pack, x.id,y.id,NA, 0,bits)};
}
I32 Builder::bytes(I32 x, int control) {
return {this, push(Op::bytes, x.id,NA,NA, control)};
}
F32 Builder::floor(F32 x) {
float X;
if (this->allImm(x.id,&X)) { return this->splat(floorf(X)); }
@ -2440,7 +2417,6 @@ namespace skvm {
case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break;
case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break;
case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break;
case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
@ -2548,36 +2524,6 @@ namespace skvm {
case Op::gte_i16x2:
vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y]))));
break;
case Op::bytes: {
int N = vals[x]->getType()->isVectorTy() ? K : 1;
uint32_t off = 0;
auto nibble_to_mask = [&](uint8_t n) -> uint32_t {
switch (n) {
case 0: return 4*N; // Select any byte in the second (zero) arg.
case 1: return off + 0; // 1st byte in this arg.
case 2: return off + 1; // 2nd ...
case 3: return off + 2; // 3rd ...
case 4: return off + 3; // 4th byte in this arg.
}
SkUNREACHABLE;
return 0;
};
std::vector<uint32_t> mask(N*4);
for (int i = 0; i < N; i++) {
mask[4*i+0] = nibble_to_mask( (immy >> 0) & 0xf );
mask[4*i+1] = nibble_to_mask( (immy >> 4) & 0xf );
mask[4*i+2] = nibble_to_mask( (immy >> 8) & 0xf );
mask[4*i+3] = nibble_to_mask( (immy >> 12) & 0xf );
off += 4;
}
llvm::Value* input = b->CreateBitCast(vals[x], I8x4);
llvm::Value* zero = llvm::Constant::getNullValue(I8x4);
vals[i] = I(b->CreateShuffleVector(input, zero, mask));
} break;
}
return true;
};
@ -2930,39 +2876,6 @@ namespace skvm {
#if defined(SKVM_JIT)
// Just so happens that we can translate the immediate control for our bytes() op
// to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
static void bytes_control(int imm, int mask[4]) {
auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
// 0 -> 0xff, Fill with zero
// 1 -> 0x00, Select byte 0
// 2 -> 0x01, " 1
// 3 -> 0x02, " 2
// 4 -> 0x03, " 3
return n - 1;
};
uint8_t control[] = {
nibble_to_vpshufb( (imm >> 0) & 0xf ),
nibble_to_vpshufb( (imm >> 4) & 0xf ),
nibble_to_vpshufb( (imm >> 8) & 0xf ),
nibble_to_vpshufb( (imm >> 12) & 0xf ),
};
for (int i = 0; i < 4; i++) {
mask[i] = (int)control[0] << 0
| (int)control[1] << 8
| (int)control[2] << 16
| (int)control[3] << 24;
// Update each byte that refers to a byte index by 4 to
// point into the next 32-bit lane, but leave any 0xff
// that fills with zero alone.
control[0] += control[0] == 0xff ? 0 : 4;
control[1] += control[1] == 0xff ? 0 : 4;
control[2] += control[2] == 0xff ? 0 : 4;
control[3] += control[3] == 0xff ? 0 : 4;
}
}
bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
const bool try_hoisting,
Assembler* a) const {
@ -3013,38 +2926,9 @@ namespace skvm {
A::Label label;
Reg reg;
};
SkTHashMap<int, LabelAndReg> constants, // All constants share the same pool.
bytes_masks; // These vary per-lane.
SkTHashMap<int, LabelAndReg> constants; // All constants share the same pool.
LabelAndReg iota; // Exists _only_ to vary per-lane.
auto warmup = [&](Val id) {
const OptimizedInstruction& inst = instructions[id];
switch (inst.op) {
default: break;
case Op::bytes: if (!bytes_masks.find(inst.immy)) {
bytes_masks.set(inst.immy, {});
if (try_hoisting) {
// vpshufb can always work with the mask from memory,
// but it helps to hoist the mask to a register for tbl.
#if defined(__aarch64__)
LabelAndReg* entry = bytes_masks.find(inst.immy);
if (int found = __builtin_ffs(avail)) {
entry->reg = (Reg)(found-1);
avail ^= 1 << entry->reg;
a->ldrq(entry->reg, &entry->label);
} else {
return false;
}
#endif
}
}
break;
}
return true;
};
auto emit = [&](Val id, bool scalar) {
const OptimizedInstruction& inst = instructions[id];
@ -3312,7 +3196,6 @@ namespace skvm {
case Op::bit_and : a->vpand (dst(), r[x], r[y]); break;
case Op::bit_or : a->vpor (dst(), r[x], r[y]); break;
case Op::bit_xor : a->vpxor (dst(), r[x], r[y]); break;
case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break; // N.B. Y then X.
case Op::select : a->vpblendvb(dst(), r[z], r[y], r[x]); break;
case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break;
@ -3340,9 +3223,6 @@ namespace skvm {
case Op::trunc : a->vcvttps2dq(dst(), r[x]); break;
case Op::round : a->vcvtps2dq (dst(), r[x]); break;
case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(immy)->label);
break;
#elif defined(__aarch64__)
case Op::assert_true: {
a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector.
@ -3440,7 +3320,6 @@ namespace skvm {
case Op::bit_and : a->and16b(dst(), r[x], r[y]); break;
case Op::bit_or : a->orr16b(dst(), r[x], r[y]); break;
case Op::bit_xor : a->eor16b(dst(), r[x], r[y]); break;
case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
case Op::select: // bsl16b is x = x ? y : z
if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x], r[y], r[z]); }
@ -3467,12 +3346,6 @@ namespace skvm {
case Op::round: a->fcvtns4s(dst(), r[x]); break;
// TODO: fcvtns.4s rounds to nearest even.
// I think we actually want frintx -> fcvtzs to round to current mode.
case Op::bytes:
if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(immy)->reg); }
else { a->ldrq(tmp(), &bytes_masks.find(immy)->label);
a->tbl (dst(), r[x], tmp()); }
break;
#endif
}
@ -3506,9 +3379,6 @@ namespace skvm {
done;
for (Val id = 0; id < (Val)instructions.size(); id++) {
if (!warmup(id)) {
return false;
}
if (hoisted(id) && !emit(id, /*scalar=*/false)) {
return false;
}
@ -3568,18 +3438,6 @@ namespace skvm {
}
});
bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
// One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
a->align(4);
a->label(&entry->label);
int mask[4];
bytes_control(imm, mask);
a->bytes(mask, sizeof(mask));
#if defined(__x86_64__)
a->bytes(mask, sizeof(mask));
#endif
});
if (!iota.label.references.empty()) {
a->align(4);
a->label(&iota.label);

View File

@ -326,11 +326,10 @@ namespace skvm {
M(bit_and) \
M(bit_or) \
M(bit_xor) \
M(bit_clear) \
M(bit_and_imm) \
M(bit_or_imm) \
M(bit_xor_imm) \
M(select) M(bytes) M(pack) \
M(select) M(pack) \
// End of SKVM_OPS
enum class Op : int {
@ -626,7 +625,6 @@ namespace skvm {
I32 bit_and (I32, I32); I32 bit_and (I32a x, I32a y) { return bit_and (_(x), _(y)); }
I32 bit_or (I32, I32); I32 bit_or (I32a x, I32a y) { return bit_or (_(x), _(y)); }
I32 bit_xor (I32, I32); I32 bit_xor (I32a x, I32a y) { return bit_xor (_(x), _(y)); }
I32 bit_clear(I32, I32); I32 bit_clear(I32a x, I32a y) { return bit_clear(_(x), _(y)); }
I32 min(I32 x, I32 y) { return select(lt(x,y), x, y); }
I32 max(I32 x, I32 y) { return select(gt(x,y), x, y); }
@ -643,29 +641,6 @@ namespace skvm {
I32 select(I32a cond, I32a t, I32a f) { return select(_(cond), _(t), _(f)); }
F32 select(I32a cond, F32a t, F32a f) { return select(_(cond), _(t), _(f)); }
// More complex operations...
// Shuffle the bytes in x according to each nibble of control, as if
//
// uint8_t bytes[] = {
// 0,
// ((uint32_t)x ) & 0xff,
// ((uint32_t)x >> 8) & 0xff,
// ((uint32_t)x >> 16) & 0xff,
// ((uint32_t)x >> 24) & 0xff,
// };
// return (uint32_t)bytes[(control >> 0) & 0xf] << 0
// | (uint32_t)bytes[(control >> 4) & 0xf] << 8
// | (uint32_t)bytes[(control >> 8) & 0xf] << 16
// | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
//
// So, e.g.,
// - bytes(x, 0x1111) splats the low byte of x to all four bytes
// - bytes(x, 0x4321) is x, an identity
// - bytes(x, 0x0000) is 0
// - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
I32 bytes (I32 x, int control);
I32 extract(I32 x, int bits, I32 z); // (x>>bits) & z
I32 pack (I32 x, I32 y, int bits); // x | (y << bits), assuming (x & (y << bits)) == 0
@ -975,8 +950,6 @@ namespace skvm {
static inline I32 select(I32 cond, I32a t, I32a f) { return cond->select(cond,t,f); }
static inline F32 select(I32 cond, F32a t, F32a f) { return cond->select(cond,t,f); }
static inline I32 bytes(I32 x, int control) { return x->bytes(x,control); }
static inline I32 extract(I32 x, int bits, I32a z) { return x->extract(x,bits,z); }
static inline I32 extract(int x, int bits, I32 z) { return z->extract(x,bits,z); }
static inline I32 pack (I32 x, I32a y, int bits) { return x->pack (x,y,bits); }

View File

@ -252,27 +252,12 @@ namespace SK_OPTS_NS {
CASE(Op::bit_and ): r(d).i32 = r(x).i32 & r(y).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
CASE(Op::bit_xor ): r(d).i32 = r(x).i32 ^ r(y).i32; break;
CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
break;
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
CASE(Op::bytes): {
const U32 table[] = {
0,
(r(x).u32 ) & 0xff,
(r(x).u32 >> 8) & 0xff,
(r(x).u32 >> 16) & 0xff,
(r(x).u32 >> 24) & 0xff,
};
r(d).u32 = table[(immy >> 0) & 0xf] << 0
| table[(immy >> 4) & 0xf] << 8
| table[(immy >> 8) & 0xf] << 16
| table[(immy >> 12) & 0xf] << 24;
} break;
CASE(Op::floor): r(d).f32 = skvx::floor(r(x).f32) ; break;
CASE(Op::to_f32): r(d).f32 = skvx::cast<float>( r(x).i32 ); break;
CASE(Op::trunc): r(d).i32 = skvx::cast<int> ( r(x).f32 ); break;

View File

@ -522,10 +522,10 @@ DEF_TEST(SkVM_bitops, r) {
skvm::I32 x = b.load32(ptr);
x = b.bit_and (x, b.splat(0xf1)); // 0x40
x = b.bit_or (x, b.splat(0x80)); // 0xc0
x = b.bit_xor (x, b.splat(0xfe)); // 0x3e
x = b.bit_clear(x, b.splat(0x30)); // 0x0e
x = b.bit_and (x, b.splat( 0xf1)); // 0x40
x = b.bit_or (x, b.splat( 0x80)); // 0xc0
x = b.bit_xor (x, b.splat( 0xfe)); // 0x3e
x = b.bit_and (x, b.splat(~0x30)); // 0x0e
x = b.shl(x, 28); // 0xe000'0000
x = b.sra(x, 28); // 0xffff'fffe

View File

@ -125,10 +125,10 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
auto load = [&](skvm::Arg ptr,
skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
skvm::I32 rgba = load32(ptr);
*r = bit_and(rgba, splat(0xff));
*g = bytes (rgba, 0x0002);
*b = bytes (rgba, 0x0003);
*a = shr (rgba, 24);
*r = extract(rgba, 0, splat(0xff));
*g = extract(rgba, 8, splat(0xff));
*b = extract(rgba, 16, splat(0xff));
*a = extract(rgba, 24, splat(0xff));
};
skvm::I32 r,g,b,a;
@ -169,7 +169,8 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
// The s += d*invA adds won't overflow,
// so we don't have to unpack s beyond grabbing the alpha channel.
skvm::I32 s = load32(src),
ax2 = bytes(s, 0x0404); // rgba -> a0a0
ax2 = extract(s, 24, splat(0x000000ff))
| extract(s, 8, splat(0x00ff0000));
// We'll use the same approximation math as above, this time making sure to
// use both i16 multiplies to our benefit, one for r/g, the other for b/a.
@ -181,7 +182,7 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
rb = shr_16x2(mul_16x2(rb, invAx2), 8); // Put the high 8 bits back in the low lane.
ga = mul_16x2(ga, invAx2); // Keep the high 8 bits up high...
ga = bit_clear(ga, splat(0x00ff00ff)); // ...and mask off the low bits.
ga = bit_and(ga, splat(0xff00ff00)); // ...and mask off the low bits.
store32(dst, add(s, bit_or(rb, ga)));
}