proposed: add bytes() op

I'm staring at this assembly,

	vmovups	(%rsi), %ymm3
	vpsrld	$24, %ymm3, %ymm4
	vpslld	$16, %ymm4, %ymm15
	vorps	%ymm4, %ymm15, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

Just knowing that could be

	vmovups	(%rsi), %ymm3
	vpshufb	 0x??(%rip), %ymm3, %ymm4
	vpsubw	%ymm4, %ymm0, %ymm4

That is, instead of shifting, shifting, and bit-oring
to create the 0a0a scale factor from ymm3, we could just
byte shuffle directly using some pre-baked control pattern
(stored at the end of the program like other constants)

pshufb lets you arbitrarily remix bytes from its argument and
inject zero bytes, and NEON has a similar family of vtbl instructions,
even including that same feature of injecting zeroes.

I think I've got this working, and the speedup is great,
from 0.19 to 0.16 ns/px for I32_SWAR, and
from 0.43 to 0.38 ns/px for I32.

Change-Id: Iab850275e826b4187f0efc9495a4b9eab4402c38
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220871
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
This commit is contained in:
Mike Klein 2019-06-13 16:43:18 -05:00 committed by Skia Commit-Bot
parent e5c1f97de1
commit 342b1b2753
5 changed files with 124 additions and 22 deletions

View File

@@ -571,14 +571,14 @@ r0 = splat FF (3.5733111e-43)
r1 = splat 100 (3.5873241e-43)
loop:
r2 = load32 arg(0)
r3 = extract r2 0 r0
r4 = extract r2 8 r0
r5 = extract r2 16 r0
r3 = bit_and r2 r0
r4 = bytes r2 2
r5 = bytes r2 3
r2 = shr r2 24
r6 = load32 arg(1)
r7 = extract r6 0 r0
r8 = extract r6 8 r0
r9 = extract r6 16 r0
r7 = bit_and r6 r0
r8 = bytes r6 2
r9 = bytes r6 3
r6 = shr r6 24
r10 = sub_i32 r1 r2
r7 = mul_i16x2 r7 r10
@@ -599,14 +599,13 @@ r10 = pack r8 r10 16
store32 arg(1) r10
I32 (SWAR) 8888 over 8888
7 registers, 17 instructions:
7 registers, 16 instructions:
r0 = splat 1000100 (2.3510604e-38)
r1 = splat FF00FF (2.3418409e-38)
r2 = splat FF00FF00 (-1.7146522e+38)
loop:
r3 = load32 arg(0)
r4 = shr r3 24
r4 = pack r4 r4 16
r4 = bytes r3 404
r4 = sub_i16x2 r0 r4
r5 = load32 arg(1)
r6 = bit_and r5 r1

View File

@@ -239,6 +239,10 @@ namespace skvm {
return {this->push(Op::pack, x.id,y.id,NA, 0,bits)};
}
// Emit an Op::bytes instruction: shuffle the bytes of x according to the
// packed control nibbles (0 selects a zero byte, n selects x's (n-1)'th byte).
I32 Builder::bytes(I32 x, int control) {
    // The control pattern rides along in the immy slot; y and z are unused.
    auto id = this->push(Op::bytes, x.id, NA, NA, control);
    return {id};
}
F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }
@@ -248,6 +252,7 @@ namespace skvm {
struct R { ID id; };
struct Shift { int bits; };
struct Splat { int bits; };
struct Hex { int bits; };
static void write(SkWStream* o, const char* s) {
o->writeText(s);
@@ -277,6 +282,9 @@ namespace skvm {
o->writeScalarAsText(f);
write(o, ")");
}
// Dump a Hex-wrapped immediate (e.g. a bytes() control pattern) as hex text.
static void write(SkWStream* o, Hex hex) {
    o->writeHexAsText(hex.bits);
}
template <typename T, typename... Ts>
static void write(SkWStream* o, T first, Ts... rest) {
@@ -332,6 +340,8 @@ namespace skvm {
case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{immy}, V{z}); break;
case Op::pack: write(o, V{id}, "= pack", V{x}, V{y}, Shift{immz}); break;
case Op::bytes: write(o, V{id}, "= bytes", V{x}, Hex{immy}); break;
case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break;
case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break;
}
@@ -389,6 +399,8 @@ namespace skvm {
case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break;
case Op::pack: write(o, R{d}, "= pack", R{x}, R{y.id}, Shift{z.imm}); break;
case Op::bytes: write(o, R{d}, "= bytes", R{x}, Hex{y.imm}); break;
case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
}
@@ -430,8 +442,15 @@ namespace skvm {
#endif
// Label / 4-byte values we need to write after ret.
std::vector<std::pair<Xbyak::Label, int>> splats;
// Label / N-byte values we need to write after ret.
struct Data4 { Xbyak::Label label; int bits ; };
struct Data32 { Xbyak::Label label; int bits[8]; };
std::vector<Data4 > data4;
std::vector<Data32> data32;
// Map from our bytes() control y.imm to index in data32;
// no need to splat out duplicate bytes for the same control.
std::unordered_map<int, int> vpshufb_masks;
for (int i = 0; i < (int)instructions.size(); i++) {
if (i == loop) {
@@ -468,8 +487,8 @@
case Op::load8: vpmovzxbd(r[d], ptr[arg[y.imm]]); break;
case Op::load32: vmovups (r[d], ptr[arg[y.imm]]); break;
case Op::splat: splats.emplace_back(Xbyak::Label(), y.imm);
vbroadcastss(r[d], ptr[rip + splats.back().first]);
case Op::splat: data4.push_back(Data4{Xbyak::Label(), y.imm});
vbroadcastss(r[d], ptr[rip + data4.back().label]);
break;
case Op::add_f32: vaddps(r[d], r[x], r[y.id]); break;
@@ -514,6 +533,47 @@
case Op::to_f32: vcvtdq2ps (r[d], r[x]); break;
case Op::to_i32: vcvttps2dq(r[d], r[x]); break;
case Op::bytes: {
if (vpshufb_masks.end() == vpshufb_masks.find(y.imm)) {
// Translate bytes()'s control nibbles to vpshufb's control bytes.
auto nibble_to_vpshufb = [](unsigned n) -> uint8_t {
return n == 0 ? 0xff // Fill with zero.
: n-1; // Select n'th 1-indexed byte.
};
uint8_t control[] = {
nibble_to_vpshufb( (y.imm >> 0) & 0xf ),
nibble_to_vpshufb( (y.imm >> 4) & 0xf ),
nibble_to_vpshufb( (y.imm >> 8) & 0xf ),
nibble_to_vpshufb( (y.imm >> 12) & 0xf ),
};
// Now, vpshufb is one of those weird AVX instructions
// that does everything in 2 128-bit chunks, so we'll
// only really need 4 distinct values to write in our pattern:
int p[4];
for (int i = 0; i < 4; i++) {
p[i] = (int)control[0] << 0
| (int)control[1] << 8
| (int)control[2] << 16
| (int)control[3] << 24;
// Update each byte that refers to a byte index by 4 to
// point into the next 32-bit lane, but leave any 0xff
// that fills with zero alone.
control[0] += control[0] == 0xff ? 0 : 4;
control[1] += control[1] == 0xff ? 0 : 4;
control[2] += control[2] == 0xff ? 0 : 4;
control[3] += control[3] == 0xff ? 0 : 4;
}
// Notice, same patterns for top 4 32-bit lanes as bottom.
data32.push_back(Data32{Xbyak::Label(), {p[0], p[1], p[2], p[3],
p[0], p[1], p[2], p[3]}});
vpshufb_masks[y.imm] = data32.size() - 1;
}
vpshufb(r[d], r[x], ptr[rip + data32[vpshufb_masks[y.imm]].label]);
} break;
}
}
@@ -526,10 +586,17 @@ namespace skvm {
vzeroupper();
ret();
for (auto splat : splats) {
for (auto data : data4) {
align(4);
L(splat.first);
dd(splat.second);
L(data.label);
dd(data.bits);
}
for (auto data : data32) {
align(32);
L(data.label);
for (int i = 0; i < 8; i++) {
dd(data.bits[i]);
}
}
}
};

View File

@@ -26,6 +26,7 @@ namespace skvm {
shl, shr, sra,
extract,
pack,
bytes,
to_f32, to_i32,
};
@@ -115,6 +116,28 @@ namespace skvm {
I32 extract(I32 x, int bits, I32 z); // (x >> bits) & z
I32 pack (I32 x, I32 y, int bits); // x | (y << bits)
// Shuffle the bytes in x according to each nibble of control, as if
//
// uint8_t bytes[] = {
// 0,
// ((uint32_t)x ) & 0xff,
// ((uint32_t)x >> 8) & 0xff,
// ((uint32_t)x >> 16) & 0xff,
// ((uint32_t)x >> 24) & 0xff,
// };
// return (uint32_t)bytes[(control >> 0) & 0xf] << 0
// | (uint32_t)bytes[(control >> 4) & 0xf] << 8
// | (uint32_t)bytes[(control >> 8) & 0xf] << 16
// | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
//
// So, e.g.,
// - bytes(x, 0x1111) splats the low byte of x to all four bytes
// - bytes(x, 0x4321) is x, an identity
// - bytes(x, 0x0000) is 0
// - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
//
I32 bytes(I32 x, int control);
F32 to_f32(I32 x);
I32 to_i32(F32 x);

View File

@@ -148,6 +148,20 @@ namespace SK_OPTS_NS {
CASE(Op::extract): r(d).u32 = (r(x).u32 >> y.imm) & r(z.id).u32; break;
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y.id).u32 << z.imm); break;
CASE(Op::bytes): {
const U32 table[] = {
0,
(r(x).u32 ) & 0xff,
(r(x).u32 >> 8) & 0xff,
(r(x).u32 >> 16) & 0xff,
(r(x).u32 >> 24) & 0xff,
};
r(d).u32 = table[(y.imm >> 0) & 0xf] << 0
| table[(y.imm >> 4) & 0xf] << 8
| table[(y.imm >> 8) & 0xf] << 16
| table[(y.imm >> 12) & 0xf] << 24;
} break;
CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
#undef CASE

View File

@@ -97,9 +97,9 @@ SrcoverBuilder_I32::SrcoverBuilder_I32() {
auto load = [&](skvm::Arg ptr,
skvm::I32* r, skvm::I32* g, skvm::I32* b, skvm::I32* a) {
skvm::I32 rgba = load32(ptr);
*r = extract(rgba, 0, splat(0xff));
*g = extract(rgba, 8, splat(0xff));
*b = extract(rgba, 16, splat(0xff));
*r = bit_and(rgba, splat(0xff));
*g = bytes (rgba, 0x0002);
*b = bytes (rgba, 0x0003);
*a = shr (rgba, 24);
};
@@ -141,12 +141,11 @@ SrcoverBuilder_I32_SWAR::SrcoverBuilder_I32_SWAR() {
// The s += d*invA adds won't overflow,
// so we don't have to unpack s beyond grabbing the alpha channel.
skvm::I32 s = load32(src),
a = shr(s, 24);
ax2 = bytes(s, 0x0404); // rgba -> a0a0
// We'll use the same approximation math as above, this time making sure to
// use both i16 multiplies to our benefit, one for r/g, the other for b/a.
skvm::I32 ax2 = pack(a,a,16),
invAx2 = sub_16x2(splat(0x01000100), ax2);
skvm::I32 invAx2 = sub_16x2(splat(0x01000100), ax2);
skvm::I32 d = load32(dst),
rb = bit_and (d, splat(0x00ff00ff)),