diff --git a/resources/SkVMTest.expected b/resources/SkVMTest.expected index 5de1464f5e..fdd4a60825 100644 --- a/resources/SkVMTest.expected +++ b/resources/SkVMTest.expected @@ -607,38 +607,42 @@ loop: 32 store32 arg(1) r7 I32 8888 over 8888 -29 values (originally 29): +33 values (originally 33): v0 = load32 arg(0) v1 = shr_i32 v0 24 ↑ v2 = splat 100 (3.5873241e-43) v3 = sub_i32 v2 v1 v4 = load32 arg(1) - v5 = bytes v4 3 - v6 = mul_i16x2 v5 v3 - v7 = shr_i32 v6 8 - v8 = bytes v0 3 - v9 = add_i32 v8 v7 - v10 = shr_i32 v4 24 - v11 = mul_i16x2 v10 v3 - v12 = shr_i32 v11 8 - v13 = add_i32 v1 v12 - v14 = pack v9 v13 8 - v15 = bytes v4 2 - v16 = mul_i16x2 v15 v3 - v17 = shr_i32 v16 8 - v18 = bytes v0 2 - v19 = add_i32 v18 v17 -↑ v20 = splat FF (3.5733111e-43) - v21 = bit_and v4 v20 - v22 = mul_i16x2 v21 v3 - v23 = shr_i32 v22 8 - v24 = bit_and v0 v20 - v25 = add_i32 v24 v23 - v26 = pack v25 v19 8 - v27 = pack v26 v14 16 - store32 arg(1) v27 + v5 = shr_i32 v4 16 +↑ v6 = splat FF (3.5733111e-43) + v7 = bit_and v6 v5 + v8 = mul_i16x2 v7 v3 + v9 = shr_i32 v8 8 + v10 = shr_i32 v0 16 + v11 = bit_and v6 v10 + v12 = add_i32 v11 v9 + v13 = shr_i32 v4 24 + v14 = mul_i16x2 v13 v3 + v15 = shr_i32 v14 8 + v16 = add_i32 v1 v15 + v17 = pack v12 v16 8 + v18 = shr_i32 v4 8 + v19 = bit_and v6 v18 + v20 = mul_i16x2 v19 v3 + v21 = shr_i32 v20 8 + v22 = shr_i32 v0 8 + v23 = bit_and v6 v22 + v24 = add_i32 v23 v21 + v25 = bit_and v6 v4 + v26 = mul_i16x2 v25 v3 + v27 = shr_i32 v26 8 + v28 = bit_and v6 v0 + v29 = add_i32 v28 v27 + v30 = pack v29 v24 8 + v31 = pack v30 v17 16 + store32 arg(1) v31 -8 registers, 29 instructions: +8 registers, 33 instructions: 0 r0 = splat 100 (3.5873241e-43) 1 r1 = splat FF (3.5733111e-43) loop: @@ -646,65 +650,79 @@ loop: 3 r3 = shr_i32 r2 24 4 r4 = sub_i32 r0 r3 5 r5 = load32 arg(1) -6 r6 = bytes r5 3 -7 r6 = mul_i16x2 r6 r4 -8 r6 = shr_i32 r6 8 -9 r7 = bytes r2 3 -10 r6 = add_i32 r7 r6 -11 r7 = shr_i32 r5 24 -12 r7 = mul_i16x2 r7 r4 -13 r7 = shr_i32 r7 8 -14 r7 = add_i32 r3 r7 -15 r7 = pack r6 r7 8 -16 r6 = bytes r5 2 -17 r6 = mul_i16x2 r6 r4 -18 r6 = shr_i32 r6 8 -19 r3 = bytes r2 2 -20 r6 = add_i32 r3 r6 -21 r5 = bit_and r5 r1 -22 r4 = mul_i16x2 r5 r4 -23 r4 = shr_i32 r4 8 -24 r2 = bit_and r2 r1 -25 r4 = add_i32 r2 r4 -26 r6 = pack r4 r6 8 -27 r7 = pack r6 r7 16 -28 store32 arg(1) r7 +6 r6 = shr_i32 r5 16 +7 r6 = bit_and r1 r6 +8 r6 = mul_i16x2 r6 r4 +9 r6 = shr_i32 r6 8 +10 r7 = shr_i32 r2 16 +11 r7 = bit_and r1 r7 +12 r6 = add_i32 r7 r6 +13 r7 = shr_i32 r5 24 +14 r7 = mul_i16x2 r7 r4 +15 r7 = shr_i32 r7 8 +16 r7 = add_i32 r3 r7 +17 r7 = pack r6 r7 8 +18 r6 = shr_i32 r5 8 +19 r6 = bit_and r1 r6 +20 r6 = mul_i16x2 r6 r4 +21 r6 = shr_i32 r6 8 +22 r3 = shr_i32 r2 8 +23 r3 = bit_and r1 r3 +24 r6 = add_i32 r3 r6 +25 r5 = bit_and r1 r5 +26 r4 = mul_i16x2 r5 r4 +27 r4 = shr_i32 r4 8 +28 r2 = bit_and r1 r2 +29 r4 = add_i32 r2 r4 +30 r6 = pack r4 r6 8 +31 r7 = pack r6 r7 16 +32 store32 arg(1) r7 I32 (SWAR) 8888 over 8888 -15 values (originally 15): +20 values (originally 21): v0 = load32 arg(0) - v1 = bytes v0 404 -↑ v2 = splat 1000100 (2.3510604e-38) - v3 = sub_i16x2 v2 v1 - v4 = load32 arg(1) - v5 = shr_i16x2 v4 8 - v6 = mul_i16x2 v5 v3 -↑ v7 = splat FF00FF (2.3418409e-38) - v8 = bit_clear v6 v7 - v9 = bit_and v4 v7 - v10 = mul_i16x2 v9 v3 - v11 = shr_i16x2 v10 8 - v12 = bit_or v11 v8 - v13 = add_i32 v0 v12 - store32 arg(1) v13 + v1 = shr_i32 v0 8 +↑ v2 = splat FF0000 (2.3418052e-38) + v3 = bit_and v2 v1 + v4 = shr_i32 v0 24 + v5 = bit_or v4 v3 +↑ v6 = splat 1000100 (2.3510604e-38) + v7 = sub_i16x2 v6 v5 + v8 = load32 arg(1) + v9 = shr_i16x2 v8 8 + v10 = mul_i16x2 v9 v7 +↑ v11 = splat FF00FF00 (-1.7146522e+38) + v12 = bit_and v10 v11 +↑ v13 = splat FF00FF (2.3418409e-38) + v14 = bit_and v8 v13 + v15 = mul_i16x2 v14 v7 + v16 = shr_i16x2 v15 8 + v17 = bit_or v16 v12 + v18 = add_i32 v0 v17 + store32 arg(1) v18 -6 registers, 15 instructions: -0 r0 = splat 1000100 (2.3510604e-38) -1 r1 = splat FF00FF (2.3418409e-38) +8 registers, 20 instructions: +0 r0 = splat FF0000 (2.3418052e-38) +1 r1 = splat 1000100 (2.3510604e-38) +2 r2 = splat FF00FF00 (-1.7146522e+38) +3 r3 = splat FF00FF (2.3418409e-38) loop: -2 r2 = load32 arg(0) -3 r3 = bytes r2 404 -4 r3 = sub_i16x2 r0 r3 -5 r4 = load32 arg(1) -6 r5 = shr_i16x2 r4 8 -7 r5 = mul_i16x2 r5 r3 -8 r5 = bit_clear r5 r1 -9 r4 = bit_and r4 r1 -10 r3 = mul_i16x2 r4 r3 -11 r3 = shr_i16x2 r3 8 -12 r5 = bit_or r3 r5 -13 r5 = add_i32 r2 r5 -14 store32 arg(1) r5 +4 r4 = load32 arg(0) +5 r5 = shr_i32 r4 8 +6 r5 = bit_and r0 r5 +7 r6 = shr_i32 r4 24 +8 r5 = bit_or r6 r5 +9 r5 = sub_i16x2 r1 r5 +10 r6 = load32 arg(1) +11 r7 = shr_i16x2 r6 8 +12 r7 = mul_i16x2 r7 r5 +13 r7 = bit_and r7 r2 +14 r6 = bit_and r6 r3 +15 r5 = mul_i16x2 r6 r5 +16 r5 = shr_i16x2 r5 8 +17 r7 = bit_or r5 r7 +18 r7 = add_i32 r4 r7 +19 store32 arg(1) r7 6 values (originally 6): ↟ v0 = splat 2 (2.8025969e-45) diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp index 3ccfb46aa8..f1c8c14583 100644 --- a/src/core/SkVM.cpp +++ b/src/core/SkVM.cpp @@ -284,14 +284,12 @@ namespace skvm { case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y} ); break; case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y} ); break; case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y} ); break; - case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y} ); break; case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break; case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; - case Op::bytes: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; case Op::pack: write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break; case Op::floor: write(o, V{id}, "=", op, V{x}); break; @@ -404,14 +402,12 @@ namespace skvm { case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y} ); break; case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y} ); break; case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y} ); break; - case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y} ); break; case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break; case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; - case Op::bytes: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; case Op::pack: write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break; case Op::floor: write(o, R{d}, "=", op, R{x}); break; @@ -425,8 +421,8 @@ namespace skvm { std::vector specialize_for_jit(std::vector program) { // We could use a temporary Builder to let new Instructions participate in common - // sub-expression elimination, but there's only a tiny chance of hitting anything valuable - // with the specializations we've got today. Worth keeping in mind for the future though. + // sub-expression elimination, but we'll never hit anything valuable with the + // specializations we've got today. Worth keeping in mind for the future though. for (Val i = 0; i < (Val)program.size(); i++) { #if defined(SK_CPU_X86) Instruction& inst = program[i]; @@ -465,13 +461,6 @@ namespace skvm { inst.y = NA; inst.immy = bits; } break; - - case Op::bit_clear: - if (int bits; is_imm(inst.y, &bits)) { - inst.op = Op::bit_and_imm; - inst.y = NA; - inst.immy = ~bits; - } break; } #endif } @@ -1020,14 +1009,6 @@ namespace skvm { if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y return {this, push(Op::bit_xor, x.id, y.id)}; } - I32 Builder::bit_clear(I32 x, I32 y) { - int X,Y; - if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&~Y); } - if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x - if (this->isImm(y.id,~0)) { return this->splat(0); } // (x & ~true) == false - if (this->isImm(x.id, 0)) { return this->splat(0); } // (false & ~y) == false - return {this, push(Op::bit_clear, x.id, y.id)}; - } I32 Builder::select(I32 x, I32 y, I32 z) { int X,Y,Z; @@ -1048,10 +1029,6 @@ namespace skvm { return {this, push(Op::pack, x.id,y.id,NA, 0,bits)}; } - I32 Builder::bytes(I32 x, int control) { - return {this, push(Op::bytes, x.id,NA,NA, control)}; - } - F32 Builder::floor(F32 x) { float X; if (this->allImm(x.id,&X)) { return this->splat(floorf(X)); } @@ -2440,7 +2417,6 @@ namespace skvm { case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; - case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break; @@ -2548,36 +2524,6 @@ namespace skvm { case Op::gte_i16x2: vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y])))); break; - - case Op::bytes: { - int N = vals[x]->getType()->isVectorTy() ? K : 1; - - uint32_t off = 0; - auto nibble_to_mask = [&](uint8_t n) -> uint32_t { - switch (n) { - case 0: return 4*N; // Select any byte in the second (zero) arg. - case 1: return off + 0; // 1st byte in this arg. - case 2: return off + 1; // 2nd ... - case 3: return off + 2; // 3rd ... - case 4: return off + 3; // 4th byte in this arg. - } - SkUNREACHABLE; - return 0; - }; - - std::vector mask(N*4); - for (int i = 0; i < N; i++) { - mask[4*i+0] = nibble_to_mask( (immy >> 0) & 0xf ); - mask[4*i+1] = nibble_to_mask( (immy >> 4) & 0xf ); - mask[4*i+2] = nibble_to_mask( (immy >> 8) & 0xf ); - mask[4*i+3] = nibble_to_mask( (immy >> 12) & 0xf ); - off += 4; - } - - llvm::Value* input = b->CreateBitCast(vals[x], I8x4); - llvm::Value* zero = llvm::Constant::getNullValue(I8x4); - vals[i] = I(b->CreateShuffleVector(input, zero, mask)); - } break; } return true; }; @@ -2930,39 +2876,6 @@ namespace skvm { #if defined(SKVM_JIT) - // Just so happens that we can translate the immediate control for our bytes() op - // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl! - static void bytes_control(int imm, int mask[4]) { - auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t { - // 0 -> 0xff, Fill with zero - // 1 -> 0x00, Select byte 0 - // 2 -> 0x01, " 1 - // 3 -> 0x02, " 2 - // 4 -> 0x03, " 3 - return n - 1; - }; - uint8_t control[] = { - nibble_to_vpshufb( (imm >> 0) & 0xf ), - nibble_to_vpshufb( (imm >> 4) & 0xf ), - nibble_to_vpshufb( (imm >> 8) & 0xf ), - nibble_to_vpshufb( (imm >> 12) & 0xf ), - }; - for (int i = 0; i < 4; i++) { - mask[i] = (int)control[0] << 0 - | (int)control[1] << 8 - | (int)control[2] << 16 - | (int)control[3] << 24; - - // Update each byte that refers to a byte index by 4 to - // point into the next 32-bit lane, but leave any 0xff - // that fills with zero alone. - control[0] += control[0] == 0xff ? 0 : 4; - control[1] += control[1] == 0xff ? 0 : 4; - control[2] += control[2] == 0xff ? 0 : 4; - control[3] += control[3] == 0xff ? 0 : 4; - } - } - bool Program::jit(const std::vector& instructions, const bool try_hoisting, Assembler* a) const { @@ -3013,38 +2926,9 @@ namespace skvm { A::Label label; Reg reg; }; - SkTHashMap constants, // All constants share the same pool. - bytes_masks; // These vary per-lane. + SkTHashMap constants; // All constants share the same pool. LabelAndReg iota; // Exists _only_ to vary per-lane. - auto warmup = [&](Val id) { - const OptimizedInstruction& inst = instructions[id]; - - switch (inst.op) { - default: break; - - case Op::bytes: if (!bytes_masks.find(inst.immy)) { - bytes_masks.set(inst.immy, {}); - if (try_hoisting) { - // vpshufb can always work with the mask from memory, - // but it helps to hoist the mask to a register for tbl. - #if defined(__aarch64__) - LabelAndReg* entry = bytes_masks.find(inst.immy); - if (int found = __builtin_ffs(avail)) { - entry->reg = (Reg)(found-1); - avail ^= 1 << entry->reg; - a->ldrq(entry->reg, &entry->label); - } else { - return false; - } - #endif - } - } - break; - } - return true; - }; - auto emit = [&](Val id, bool scalar) { const OptimizedInstruction& inst = instructions[id]; @@ -3312,7 +3196,6 @@ namespace skvm { case Op::bit_and : a->vpand (dst(), r[x], r[y]); break; case Op::bit_or : a->vpor (dst(), r[x], r[y]); break; case Op::bit_xor : a->vpxor (dst(), r[x], r[y]); break; - case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break; // N.B. Y then X. case Op::select : a->vpblendvb(dst(), r[z], r[y], r[x]); break; case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break; @@ -3340,9 +3223,6 @@ namespace skvm { case Op::trunc : a->vcvttps2dq(dst(), r[x]); break; case Op::round : a->vcvtps2dq (dst(), r[x]); break; - case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(immy)->label); - break; - #elif defined(__aarch64__) case Op::assert_true: { a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector. @@ -3440,7 +3320,6 @@ namespace skvm { case Op::bit_and : a->and16b(dst(), r[x], r[y]); break; case Op::bit_or : a->orr16b(dst(), r[x], r[y]); break; case Op::bit_xor : a->eor16b(dst(), r[x], r[y]); break; - case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break; case Op::select: // bsl16b is x = x ? y : z if (avail & (1<bsl16b( r[x], r[y], r[z]); } @@ -3467,12 +3346,6 @@ namespace skvm { case Op::round: a->fcvtns4s(dst(), r[x]); break; // TODO: fcvtns.4s rounds to nearest even. // I think we actually want frintx -> fcvtzs to round to current mode. - - case Op::bytes: - if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(immy)->reg); } - else { a->ldrq(tmp(), &bytes_masks.find(immy)->label); - a->tbl (dst(), r[x], tmp()); } - break; #endif } @@ -3506,9 +3379,6 @@ namespace skvm { done; for (Val id = 0; id < (Val)instructions.size(); id++) { - if (!warmup(id)) { - return false; - } if (hoisted(id) && !emit(id, /*scalar=*/false)) { return false; } @@ -3568,18 +3438,6 @@ namespace skvm { } }); - bytes_masks.foreach([&](int imm, LabelAndReg* entry) { - // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb. - a->align(4); - a->label(&entry->label); - int mask[4]; - bytes_control(imm, mask); - a->bytes(mask, sizeof(mask)); - #if defined(__x86_64__) - a->bytes(mask, sizeof(mask)); - #endif - }); - if (!iota.label.references.empty()) { a->align(4); a->label(&iota.label); diff --git a/src/core/SkVM.h b/src/core/SkVM.h index eea3c9b975..2a45fdd489 100644 --- a/src/core/SkVM.h +++ b/src/core/SkVM.h @@ -326,11 +326,10 @@ namespace skvm { M(bit_and) \ M(bit_or) \ M(bit_xor) \ - M(bit_clear) \ M(bit_and_imm) \ M(bit_or_imm) \ M(bit_xor_imm) \ - M(select) M(bytes) M(pack) \ + M(select) M(pack) \ // End of SKVM_OPS enum class Op : int { @@ -626,7 +625,6 @@ namespace skvm { I32 bit_and (I32, I32); I32 bit_and (I32a x, I32a y) { return bit_and (_(x), _(y)); } I32 bit_or (I32, I32); I32 bit_or (I32a x, I32a y) { return bit_or (_(x), _(y)); } I32 bit_xor (I32, I32); I32 bit_xor (I32a x, I32a y) { return bit_xor (_(x), _(y)); } - I32 bit_clear(I32, I32); I32 bit_clear(I32a x, I32a y) { return bit_clear(_(x), _(y)); } I32 min(I32 x, I32 y) { return select(lt(x,y), x, y); } I32 max(I32 x, I32 y) { return select(gt(x,y), x, y); } @@ -643,29 +641,6 @@ namespace skvm { I32 select(I32a cond, I32a t, I32a f) { return select(_(cond), _(t), _(f)); } F32 select(I32a cond, F32a t, F32a f) { return select(_(cond), _(t), _(f)); } - // More complex operations... - - // Shuffle the bytes in x according to each nibble of control, as if - // - // uint8_t bytes[] = { - // 0, - // ((uint32_t)x ) & 0xff, - // ((uint32_t)x >> 8) & 0xff, - // ((uint32_t)x >> 16) & 0xff, - // ((uint32_t)x >> 24) & 0xff, - // }; - // return (uint32_t)bytes[(control >> 0) & 0xf] << 0 - // | (uint32_t)bytes[(control >> 4) & 0xf] << 8 - // | (uint32_t)bytes[(control >> 8) & 0xf] << 16 - // | (uint32_t)bytes[(control >> 12) & 0xf] << 24; - // - // So, e.g., - // - bytes(x, 0x1111) splats the low byte of x to all four bytes - // - bytes(x, 0x4321) is x, an identity - // - bytes(x, 0x0000) is 0 - // - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern. - I32 bytes (I32 x, int control); - I32 extract(I32 x, int bits, I32 z); // (x>>bits) & z I32 pack (I32 x, I32 y, int bits); // x | (y << bits), assuming (x & (y << bits)) == 0 @@ -975,8 +950,6 @@ namespace skvm { static inline I32 select(I32 cond, I32a t, I32a f) { return cond->select(cond,t,f); } static inline F32 select(I32 cond, F32a t, F32a f) { return cond->select(cond,t,f); } - static inline I32 bytes(I32 x, int control) { return x->bytes(x,control); } - static inline I32 extract(I32 x, int bits, I32a z) { return x->extract(x,bits,z); } static inline I32 extract(int x, int bits, I32 z) { return z->extract(x,bits,z); } static inline I32 pack (I32 x, I32a y, int bits) { return x->pack (x,y,bits); } diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h index 377e551e88..74dcc6f1ba 100644 --- a/src/opts/SkVM_opts.h +++ b/src/opts/SkVM_opts.h @@ -252,27 +252,12 @@ namespace SK_OPTS_NS { CASE(Op::bit_and ): r(d).i32 = r(x).i32 & r(y).i32; break; CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break; CASE(Op::bit_xor ): r(d).i32 = r(x).i32 ^ r(y).i32; break; - CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break; CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32); break; CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << immz); break; - CASE(Op::bytes): { - const U32 table[] = { - 0, - (r(x).u32 ) & 0xff, - (r(x).u32 >> 8) & 0xff, - (r(x).u32 >> 16) & 0xff, - (r(x).u32 >> 24) & 0xff, - }; - r(d).u32 = table[(immy >> 0) & 0xf] << 0 - | table[(immy >> 4) & 0xf] << 8 - | table[(immy >> 8) & 0xf] << 16 - | table[(immy >> 12) & 0xf] << 24; - } break; - CASE(Op::floor): r(d).f32 = skvx::floor(r(x).f32) ; break; CASE(Op::to_f32): r(d).f32 = skvx::cast( r(x).i32 ); break; CASE(Op::trunc): r(d).i32 = skvx::cast ( r(x).f32 ); break; diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp index 6ac0e87332..1f1869138f 100644 --- a/tests/SkVMTest.cpp +++ b/tests/SkVMTest.cpp @@ -522,10 +522,10 @@ DEF_TEST(SkVM_bitops, r) { skvm::I32 x = b.load32(ptr); - x = b.bit_and (x, b.splat(0xf1)); // 0x40 - x = b.bit_or (x, b.splat(0x80)); // 0xc0 - x = b.bit_xor (x, b.splat(0xfe)); // 0x3e - x = b.bit_clear(x, b.splat(0x30)); // 0x0e + x = b.bit_and (x, b.splat( 0xf1)); // 0x40 + x = b.bit_or (x, b.splat( 0x80)); // 0xc0 + x = b.bit_xor (x, b.splat( 0xfe)); // 0x3e + x = b.bit_and (x, b.splat(~0x30)); // 0x0e x = b.shl(x, 28); // 0xe000'0000 x = b.sra(x, 28); // 0xffff'fffe diff --git a/tools/SkVMBuilders.cpp b/tools/SkVMBuilders.cpp index 634b769298..f4a3d3001a 100644 --- a/tools/SkVMBuilders.cpp +++ b/tools/SkVMBuilders.cpp @@ -125,10 +125,10 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() { auto load = [&](skvm::Arg ptr, skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) { skvm::I32 rgba = load32(ptr); - *r = bit_and(rgba, splat(0xff)); - *g = bytes (rgba, 0x0002); - *b = bytes (rgba, 0x0003); - *a = shr (rgba, 24); + *r = extract(rgba, 0, splat(0xff)); + *g = extract(rgba, 8, splat(0xff)); + *b = extract(rgba, 16, splat(0xff)); + *a = extract(rgba, 24, splat(0xff)); }; skvm::I32 r,g,b,a; @@ -169,7 +169,8 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() { // The s += d*invA adds won't overflow, // so we don't have to unpack s beyond grabbing the alpha channel. skvm::I32 s = load32(src), - ax2 = bytes(s, 0x0404); // rgba -> a0a0 + ax2 = extract(s, 24, splat(0x000000ff)) + | extract(s, 8, splat(0x00ff0000)); // We'll use the same approximation math as above, this time making sure to // use both i16 multiplies to our benefit, one for r/g, the other for b/a. @@ -181,7 +182,7 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() { rb = shr_16x2(mul_16x2(rb, invAx2), 8); // Put the high 8 bits back in the low lane. ga = mul_16x2(ga, invAx2); // Keep the high 8 bits up high... - ga = bit_clear(ga, splat(0x00ff00ff)); // ...and mask off the low bits. + ga = bit_and(ga, splat(0xff00ff00)); // ...and mask off the low bits. store32(dst, add(s, bit_or(rb, ga))); }