proposed: add bytes() op
I'm staring at this assembly,

    vmovups (%rsi), %ymm3
    vpsrld  $24, %ymm3, %ymm4
    vpslld  $16, %ymm4, %ymm15
    vorps   %ymm4, %ymm15, %ymm4
    vpsubw  %ymm4, %ymm0, %ymm4

just knowing that it could be

    vmovups (%rsi), %ymm3
    vpshufb 0x??(%rip), %ymm3, %ymm4
    vpsubw  %ymm4, %ymm0, %ymm4

That is, instead of shifting, shifting, and bit-oring to create the 0a0a
scale factor from ymm3, we could just byte-shuffle it directly using some
pre-baked control pattern (stored at the end of the program like other
constants). pshufb lets you arbitrarily remix bytes from its argument and
zero bytes, and NEON has a similar family of vtbl instructions, even
including that same feature of injecting zeroes.

I think I've got this working, and the speedup is great: from 0.19 to
0.16 ns/px for I32_SWAR, and from 0.43 to 0.38 ns/px for I32.

Change-Id: Iab850275e826b4187f0efc9495a4b9eab4402c38
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220871
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
commit 342b1b2753
parent e5c1f97de1
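For orientation before the diff, here is a minimal scalar sketch of what one
32-bit lane of the proposed bytes() op computes, mirroring the Builder comment
and interpreter case added below. The standalone function and its name are
illustrative only, not part of the skvm API.

#include <cstdint>

// Hypothetical scalar model of bytes(x, control), one lane at a time.
// Each nibble of control picks a source byte of x (1..4, low to high);
// a 0 nibble writes a zero byte.  Nibbles above 4 are not used.
static uint32_t bytes_scalar(uint32_t x, int control) {
    const uint32_t table[] = {
        0,                    // nibble 0 -> fill with zero
        (x >>  0) & 0xff,     // nibble 1 -> byte 0 of x
        (x >>  8) & 0xff,     // nibble 2 -> byte 1 of x
        (x >> 16) & 0xff,     // nibble 3 -> byte 2 of x
        (x >> 24) & 0xff,     // nibble 4 -> byte 3 of x
    };
    return table[(control >>  0) & 0xf] <<  0
         | table[(control >>  4) & 0xf] <<  8
         | table[(control >>  8) & 0xf] << 16
         | table[(control >> 12) & 0xf] << 24;
}

// E.g. bytes_scalar(rgba, 0x0404) builds the a0a0 scale factor mentioned
// above, bytes_scalar(x, 0x4321) is the identity, and bytes_scalar(rgba,
// 0x0002) moves the green byte into the low byte.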
@@ -571,14 +571,14 @@ r0 = splat FF (3.5733111e-43)
 r1 = splat 100 (3.5873241e-43)
 loop:
 r2 = load32 arg(0)
-r3 = extract r2 0 r0
-r4 = extract r2 8 r0
-r5 = extract r2 16 r0
+r3 = bit_and r2 r0
+r4 = bytes r2 2
+r5 = bytes r2 3
 r2 = shr r2 24
 r6 = load32 arg(1)
-r7 = extract r6 0 r0
-r8 = extract r6 8 r0
-r9 = extract r6 16 r0
+r7 = bit_and r6 r0
+r8 = bytes r6 2
+r9 = bytes r6 3
 r6 = shr r6 24
 r10 = sub_i32 r1 r2
 r7 = mul_i16x2 r7 r10
@@ -599,14 +599,13 @@ r10 = pack r8 r10 16
 store32 arg(1) r10
 
 I32 (SWAR) 8888 over 8888
-7 registers, 17 instructions:
+7 registers, 16 instructions:
 r0 = splat 1000100 (2.3510604e-38)
 r1 = splat FF00FF (2.3418409e-38)
 r2 = splat FF00FF00 (-1.7146522e+38)
 loop:
 r3 = load32 arg(0)
-r4 = shr r3 24
-r4 = pack r4 r4 16
+r4 = bytes r3 404
 r4 = sub_i16x2 r0 r4
 r5 = load32 arg(1)
 r6 = bit_and r5 r1
@@ -239,6 +239,10 @@ namespace skvm {
         return {this->push(Op::pack, x.id,y.id,NA, 0,bits)};
     }
 
+    I32 Builder::bytes(I32 x, int control) {
+        return {this->push(Op::bytes, x.id,NA,NA, control)};
+    }
+
     F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
     I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }
 
@@ -248,6 +252,7 @@ namespace skvm {
     struct R     { ID  id;   };
     struct Shift { int bits; };
     struct Splat { int bits; };
+    struct Hex   { int bits; };
 
     static void write(SkWStream* o, const char* s) {
         o->writeText(s);
@@ -277,6 +282,9 @@ namespace skvm {
         o->writeScalarAsText(f);
         write(o, ")");
     }
+    static void write(SkWStream* o, Hex h) {
+        o->writeHexAsText(h.bits);
+    }
 
     template <typename T, typename... Ts>
     static void write(SkWStream* o, T first, Ts... rest) {
@@ -332,6 +340,8 @@ namespace skvm {
            case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{immy}, V{z}); break;
            case Op::pack:    write(o, V{id}, "= pack",    V{x}, V{y}, Shift{immz}); break;
 
+           case Op::bytes: write(o, V{id}, "= bytes", V{x}, Hex{immy}); break;
+
            case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break;
            case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break;
        }
@@ -389,6 +399,8 @@ namespace skvm {
            case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break;
            case Op::pack:    write(o, R{d}, "= pack",    R{x}, R{y.id}, Shift{z.imm}); break;
 
+           case Op::bytes: write(o, R{d}, "= bytes", R{x}, Hex{y.imm}); break;
+
            case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
            case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
        }
@@ -430,8 +442,15 @@ namespace skvm {
     #endif
 
-        // Label / 4-byte values we need to write after ret.
-        std::vector<std::pair<Xbyak::Label, int>> splats;
+        // Label / N-byte values we need to write after ret.
+        struct Data4  { Xbyak::Label label; int bits   ; };
+        struct Data32 { Xbyak::Label label; int bits[8]; };
+        std::vector<Data4 > data4;
+        std::vector<Data32> data32;
+
+        // Map from our bytes() control y.imm to index in data32;
+        // no need to splat out duplicate bytes for the same control.
+        std::unordered_map<int, int> vpshufb_masks;
 
         for (int i = 0; i < (int)instructions.size(); i++) {
             if (i == loop) {
@@ -468,8 +487,8 @@ namespace skvm {
                case Op::load8:  vpmovzxbd(r[d], ptr[arg[y.imm]]); break;
                case Op::load32: vmovups  (r[d], ptr[arg[y.imm]]); break;
 
-               case Op::splat: splats.emplace_back(Xbyak::Label(), y.imm);
-                               vbroadcastss(r[d], ptr[rip + splats.back().first]);
+               case Op::splat: data4.push_back(Data4{Xbyak::Label(), y.imm});
+                               vbroadcastss(r[d], ptr[rip + data4.back().label]);
                                break;
 
                case Op::add_f32: vaddps(r[d], r[x], r[y.id]); break;
@@ -514,6 +533,47 @@ namespace skvm {
 
                case Op::to_f32: vcvtdq2ps (r[d], r[x]); break;
                case Op::to_i32: vcvttps2dq(r[d], r[x]); break;
 
+               case Op::bytes: {
+                   if (vpshufb_masks.end() == vpshufb_masks.find(y.imm)) {
+                       // Translate bytes()'s control nibbles to vpshufb's control bytes.
+                       auto nibble_to_vpshufb = [](unsigned n) -> uint8_t {
+                           return n == 0 ? 0xff    // Fill with zero.
+                                         : n-1;    // Select n'th 1-indexed byte.
+                       };
+                       uint8_t control[] = {
+                           nibble_to_vpshufb( (y.imm >>  0) & 0xf ),
+                           nibble_to_vpshufb( (y.imm >>  4) & 0xf ),
+                           nibble_to_vpshufb( (y.imm >>  8) & 0xf ),
+                           nibble_to_vpshufb( (y.imm >> 12) & 0xf ),
+                       };
+
+                       // Now, vpshufb is one of those weird AVX instructions
+                       // that does everything in 2 128-bit chunks, so we'll
+                       // only really need 4 distinct values to write in our pattern:
+                       int p[4];
+                       for (int i = 0; i < 4; i++) {
+                           p[i] = (int)control[0] <<  0
+                                | (int)control[1] <<  8
+                                | (int)control[2] << 16
+                                | (int)control[3] << 24;
+
+                           // Update each byte that refers to a byte index by 4 to
+                           // point into the next 32-bit lane, but leave any 0xff
+                           // that fills with zero alone.
+                           control[0] += control[0] == 0xff ? 0 : 4;
+                           control[1] += control[1] == 0xff ? 0 : 4;
+                           control[2] += control[2] == 0xff ? 0 : 4;
+                           control[3] += control[3] == 0xff ? 0 : 4;
+                       }
+
+                       // Notice, same patterns for top 4 32-bit lanes as bottom.
+                       data32.push_back(Data32{Xbyak::Label(), {p[0], p[1], p[2], p[3],
+                                                                p[0], p[1], p[2], p[3]}});
+                       vpshufb_masks[y.imm] = data32.size() - 1;
+                   }
+                   vpshufb(r[d], r[x], ptr[rip + data32[vpshufb_masks[y.imm]].label]);
+               } break;
            }
        }
 
@@ -526,10 +586,17 @@ namespace skvm {
        vzeroupper();
        ret();
 
-       for (auto splat : splats) {
+       for (auto data : data4) {
            align(4);
-           L(splat.first);
-           dd(splat.second);
+           L(data.label);
+           dd(data.bits);
        }
+       for (auto data : data32) {
+           align(32);
+           L(data.label);
+           for (int i = 0; i < 8; i++) {
+               dd(data.bits[i]);
+           }
+       }
    }
 };
 
@@ -26,6 +26,7 @@ namespace skvm {
        shl, shr, sra,
        extract,
        pack,
+       bytes,
        to_f32, to_i32,
    };
 
@@ -115,6 +116,28 @@ namespace skvm {
        I32 extract(I32 x, int bits, I32 z);   // (x >> bits) & z
        I32 pack   (I32 x, I32 y, int bits);   // x | (y << bits)
 
+       // Shuffle the bytes in x according to each nibble of control, as if
+       //
+       //    uint8_t bytes[] = {
+       //        0,
+       //        ((uint32_t)x      ) & 0xff,
+       //        ((uint32_t)x >>  8) & 0xff,
+       //        ((uint32_t)x >> 16) & 0xff,
+       //        ((uint32_t)x >> 24) & 0xff,
+       //    };
+       //    return (uint32_t)bytes[(control >>  0) & 0xf] <<  0
+       //         | (uint32_t)bytes[(control >>  4) & 0xf] <<  8
+       //         | (uint32_t)bytes[(control >>  8) & 0xf] << 16
+       //         | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
+       //
+       // So, e.g.,
+       //    - bytes(x, 0x1111) splats the low byte of x to all four bytes
+       //    - bytes(x, 0x4321) is x, an identity
+       //    - bytes(x, 0x0000) is 0
+       //    - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
+       //
+       I32 bytes(I32 x, int control);
+
        F32 to_f32(I32 x);
        I32 to_i32(F32 x);
 
@@ -148,6 +148,20 @@ namespace SK_OPTS_NS {
            CASE(Op::extract): r(d).u32 = (r(x).u32 >> y.imm) & r(z.id).u32; break;
            CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y.id).u32 << z.imm); break;
 
+           CASE(Op::bytes): {
+               const U32 table[] = {
+                   0,
+                   (r(x).u32      ) & 0xff,
+                   (r(x).u32 >>  8) & 0xff,
+                   (r(x).u32 >> 16) & 0xff,
+                   (r(x).u32 >> 24) & 0xff,
+               };
+               r(d).u32 = table[(y.imm >>  0) & 0xf] <<  0
+                        | table[(y.imm >>  4) & 0xf] <<  8
+                        | table[(y.imm >>  8) & 0xf] << 16
+                        | table[(y.imm >> 12) & 0xf] << 24;
+           } break;
+
            CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
            CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
 #undef CASE
@@ -97,9 +97,9 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
    auto load = [&](skvm::Arg ptr,
                    skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
        skvm::I32 rgba = load32(ptr);
-       *r = extract(rgba,  0, splat(0xff));
-       *g = extract(rgba,  8, splat(0xff));
-       *b = extract(rgba, 16, splat(0xff));
+       *r = bit_and(rgba, splat(0xff));
+       *g = bytes  (rgba, 0x0002);
+       *b = bytes  (rgba, 0x0003);
        *a = shr    (rgba, 24);
    };
 
@@ -141,12 +141,11 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
    // The s += d*invA adds won't overflow,
    // so we don't have to unpack s beyond grabbing the alpha channel.
    skvm::I32 s = load32(src),
-             a = shr(s, 24);
+           ax2 = bytes(s, 0x0404);  // rgba -> a0a0
 
    // We'll use the same approximation math as above, this time making sure to
    // use both i16 multiplies to our benefit, one for r/g, the other for b/a.
-   skvm::I32    ax2 = pack(a,a,16),
-             invAx2 = sub_16x2(splat(0x01000100), ax2);
+   skvm::I32 invAx2 = sub_16x2(splat(0x01000100), ax2);
 
    skvm::I32 d = load32(dst),
             rb = bit_and (d, splat(0x00ff00ff)),