finish up arm64 ops
Some small refactoring to consolidate redundant opcode building.

Oddly, I think I've got better codegen than what Clang would do here. Clang doesn't generate uxtl-based code to unpack 8-bit to 32-bit, instead preferring to load each byte one at a time and insert them one at a time.

Me:
    ldr   s0, [x0]
    uxtl  v0.8h, v0.8b
    uxtl  v0.4s, v0.8h

Clang:
    ldrb  w8, [x0]
    ldrb  w9, [x0, #1]
    ldrb  w10, [x0, #2]
    ldrb  w11, [x0, #3]
    fmov  s0, w8
    mov   v0.s[1], w9
    mov   v0.s[2], w10
    mov   v0.s[3], w11

Change-Id: I0fdf5c6cdcde6a4eb9290936284fd3ffcb2159f6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224821
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
e0b2dafeb6
commit
1fa149a713
@ -741,36 +741,34 @@ namespace skvm {
|
||||
|
||||
// Emits TBL via the 3-argument op() encoder. The opcode is passed as an
// 11-bit high part and a 6-bit low part, which sit on either side of m in
// the instruction word.
void Assembler::tbl(V d, V n, V m) {
    const uint32_t hi = 0b0'1'001110'00'0;
    const uint32_t lo = 0b0'00'0'00;
    this->op(hi, m, lo, n, d);
}
|
||||
|
||||
void Assembler::shift(uint32_t op, int imm, V n, V d) {
|
||||
this->word( (op & 22_mask) << 10
|
||||
| imm << 16 // imm is embedded inside op, bit size depends on op
|
||||
| (n & 5_mask) << 5
|
||||
| (d & 5_mask) << 0);
|
||||
void Assembler::op(uint32_t op22, int imm, V n, V d) {
|
||||
this->word( (op22 & 22_mask) << 10
|
||||
| imm << 16 // imm is embedded inside op, bit size depends on op
|
||||
| (n & 5_mask) << 5
|
||||
| (d & 5_mask) << 0);
|
||||
}
|
||||
|
||||
void Assembler::shl4s(V d, V n, int imm) {
|
||||
this->shift(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
|
||||
this->op(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
|
||||
}
|
||||
void Assembler::sshr4s(V d, V n, int imm) {
|
||||
this->shift(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
|
||||
this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
|
||||
}
|
||||
void Assembler::ushr4s(V d, V n, int imm) {
|
||||
this->shift(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
|
||||
this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
|
||||
}
|
||||
void Assembler::ushr8h(V d, V n, int imm) {
|
||||
this->shift(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
|
||||
this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
|
||||
}
|
||||
|
||||
void Assembler::scvtf4s(V d, V n) {
|
||||
this->word(0b0'1'0'01110'0'0'10000'11101'10 << 10
|
||||
| (n & 5_mask) << 5
|
||||
| (d & 5_mask) << 0);
|
||||
}
|
||||
void Assembler::fcvtzs4s(V d, V n) {
|
||||
this->word(0b0'1'0'01110'1'0'10000'1101'1'10 << 10
|
||||
| (n & 5_mask) << 5
|
||||
| (d & 5_mask) << 0);
|
||||
}
|
||||
// d = scvtf(n): int -> float.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::scvtf4s(V d, V n) {
    const uint32_t opcode = 0b0'1'0'01110'0'0'10000'11101'10;
    this->op(opcode, n, d);
}
|
||||
// d = fcvtzs(n): truncating float -> int.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::fcvtzs4s(V d, V n) {
    const uint32_t opcode = 0b0'1'0'01110'1'0'10000'1101'1'10;
    this->op(opcode, n, d);
}
|
||||
|
||||
// d = xtn(n): narrows u32 -> u16.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::xtns2h(V d, V n) {
    const uint32_t opcode = 0b0'0'0'01110'01'10000'10010'10;
    this->op(opcode, n, d);
}
|
||||
// d = xtn(n): narrows u16 -> u8.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::xtnh2b(V d, V n) {
    const uint32_t opcode = 0b0'0'0'01110'00'10000'10010'10;
    this->op(opcode, n, d);
}
|
||||
|
||||
// d = uxtl(n): widens u8 -> u16.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::uxtlb2h(V d, V n) {
    const uint32_t opcode = 0b0'0'1'011110'0001'000'10100'1;
    this->op(opcode, n, d);
}
|
||||
// d = uxtl(n): widens u16 -> u32.
// Encoded with the 2-argument op() helper (22-bit opcode, no immediate).
void Assembler::uxtlh2s(V d, V n) {
    const uint32_t opcode = 0b0'0'1'011110'0010'000'10100'1;
    this->op(opcode, n, d);
}
|
||||
|
||||
void Assembler::ret(X n) {
|
||||
this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
|
||||
@ -784,7 +782,7 @@ namespace skvm {
|
||||
| (d & 5_mask) << 0);
|
||||
}
|
||||
void Assembler::subs(X d, X n, int imm12) {
|
||||
this->word( 0b1'1'1'10001'00 << 22
|
||||
this->word( 0b1'1'1'10001'00 << 22
|
||||
| (imm12 & 12_mask) << 10
|
||||
| (n & 5_mask) << 5
|
||||
| (d & 5_mask) << 0);
|
||||
@ -798,17 +796,11 @@ namespace skvm {
|
||||
| 0b0'0001 << 0);
|
||||
}
|
||||
|
||||
void Assembler::ldrq(V dst, X src) {
|
||||
this->word( 0b00'111'1'01'11'000000000000 << 10
|
||||
| (src & 5_mask) << 5
|
||||
| (dst & 5_mask) << 0);
|
||||
}
|
||||
// 128-bit load: dst = *src.
// Uses the op(uint32_t, X, V) overload, so the address register takes the
// "n" slot of the instruction word and dst takes the "d" slot.
void Assembler::ldrq(V dst, X src) {
    const uint32_t opcode = 0b00'111'1'01'11'000000000000;
    this->op(opcode, src, dst);
}
|
||||
// 32-bit load: dst[0] = *src.
// Uses the op(uint32_t, X, V) overload, so the address register takes the
// "n" slot of the instruction word and dst takes the "d" slot.
void Assembler::ldrs(V dst, X src) {
    const uint32_t opcode = 0b10'111'1'01'01'000000000000;
    this->op(opcode, src, dst);
}
|
||||
|
||||
void Assembler::strq(V src, X dst) {
|
||||
this->word( 0b00'111'1'01'10'000000000000 << 10
|
||||
| (dst & 5_mask) << 5
|
||||
| (src & 5_mask) << 0);
|
||||
}
|
||||
// 128-bit store: *dst = src.
// Uses the op(uint32_t, X, V) overload, so the address register takes the
// "n" slot of the instruction word and the stored vector takes the "d" slot.
void Assembler::strq(V src, X dst) {
    const uint32_t opcode = 0b00'111'1'01'10'000000000000;
    this->op(opcode, dst, src);
}
|
||||
// 32-bit store: *dst = src[0].
// Uses the op(uint32_t, X, V) overload, so the address register takes the
// "n" slot of the instruction word and the stored vector takes the "d" slot.
void Assembler::strs(V src, X dst) {
    const uint32_t opcode = 0b10'111'1'01'00'000000000000;
    this->op(opcode, dst, src);
}
|
||||
|
||||
void Assembler::ldrq(V dst, Label l) {
|
||||
const int imm19 = (l.offset - here().offset) / 4;
|
||||
@ -1095,11 +1087,16 @@ namespace skvm {
|
||||
z = inst.z;
|
||||
int imm = inst.imm;
|
||||
switch (op) {
|
||||
#define TODO if (0) SkDebugf("op %d\n", op); return 0
|
||||
case Op::store8: TODO;
|
||||
case Op::store8: a.xtns2h(r(tmp), r(x));
|
||||
a.xtnh2b(r(tmp), r(tmp));
|
||||
a.strs (r(tmp), arg[imm]);
|
||||
break;
|
||||
case Op::store32: a.strq(r(x), arg[imm]); break;
|
||||
|
||||
case Op::load8: TODO;
|
||||
case Op::load8: a.ldrs (r(tmp), arg[imm]);
|
||||
a.uxtlb2h(r(tmp), r(tmp));
|
||||
a.uxtlh2s(r(d) , r(tmp));
|
||||
break;
|
||||
case Op::load32: a.ldrq(r(d), arg[imm]); break;
|
||||
|
||||
case Op::splat: a.ldrq(r(d), *splats.find(imm)); break;
|
||||
|
@ -118,7 +118,12 @@ namespace skvm {
|
||||
|
||||
// d = op(n)
|
||||
using DOpN = void(V d, V n);
|
||||
DOpN scvtf4s, fcvtzs4s;
|
||||
DOpN scvtf4s, // int -> float
|
||||
fcvtzs4s, // truncate float -> int
|
||||
xtns2h, // u32 -> u16
|
||||
xtnh2b, // u16 -> u8
|
||||
uxtlb2h, // u8 -> u16
|
||||
uxtlh2s; // u16 -> u32
|
||||
|
||||
// TODO: both these platforms support rounding float->int (vcvtps2dq, fcvtns.4s)... use?
|
||||
|
||||
@ -129,7 +134,9 @@ namespace skvm {
|
||||
|
||||
void ldrq(V dst, Label); // 128-bit PC-relative load
|
||||
void ldrq(V dst, X src); // 128-bit dst = *src
|
||||
void ldrs(V dst, X src); // 32-bit dst[0] = *src
|
||||
void strq(V src, X dst); // 128-bit *dst = src
|
||||
void strs(V src, X dst); // 32-bit *dst = src[0]
|
||||
|
||||
private:
|
||||
// dst = op(dst, imm)
|
||||
@ -155,12 +162,14 @@ namespace skvm {
|
||||
// *ptr = ymm or ymm = *ptr, depending on opcode.
|
||||
void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);
|
||||
|
||||
// General layout top to bottom is:
|
||||
// Opcode for 3-arguments ops is split between hi and lo:
|
||||
// [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
|
||||
// where the opcode is split between hi and lo.
|
||||
void op(uint32_t hi, V m, uint32_t lo, V n, V d);
|
||||
|
||||
void shift(uint32_t op, int imm, V n, V d);
|
||||
// 2-argument ops, with or without an immediate.
|
||||
void op(uint32_t op22, int imm, V n, V d);
|
||||
// 2-argument op with no immediate: forwards to the immediate form with a
// zero immediate.
void op(uint32_t op22, V n, V d) {
    const int noImm = 0;
    this->op(op22, noImm, n, d);
}
|
||||
void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
|
||||
|
||||
uint8_t* fCode;
|
||||
size_t fSize;
|
||||
|
@ -492,4 +492,22 @@ DEF_TEST(SkVM_Assembler, r) {
|
||||
0x00, 0x01, 0xc0, 0x3d,
|
||||
0x00, 0x01, 0x80, 0x3d,
|
||||
});
|
||||
|
||||
test_asm(r, [&](A& a) {
|
||||
a.xtns2h(A::v0, A::v0);
|
||||
a.xtnh2b(A::v0, A::v0);
|
||||
a.strs (A::v0, A::x0);
|
||||
|
||||
a.ldrs (A::v0, A::x0);
|
||||
a.uxtlb2h(A::v0, A::v0);
|
||||
a.uxtlh2s(A::v0, A::v0);
|
||||
},{
|
||||
0x00,0x28,0x61,0x0e,
|
||||
0x00,0x28,0x21,0x0e,
|
||||
0x00,0x00,0x00,0xbd,
|
||||
|
||||
0x00,0x00,0x40,0xbd,
|
||||
0x00,0xa4,0x08,0x2f,
|
||||
0x00,0xa4,0x10,0x2f,
|
||||
});
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user