finish up arm64 ops

Some small refactoring to common up redundant opcode building.

Oddly, I think I've got better codegen than what Clang would do here.
Clang doesn't generate uxtl-based code to unpack 8-bit to 32-bit,
instead preferring to load each byte one at a time and insert them one
at a time.

Me:
    ldr  s0, [x0]
    uxtl v0.8h, v0.8b
    uxtl v0.4s, v0.4h

Clang:
    ldrb  w8,  [x0]
    ldrb  w9,  [x0, #1]
    ldrb  w10, [x0, #2]
    ldrb  w11, [x0, #3]
    fmov  s0,      w8
    mov   v0.s[1], w9
    mov   v0.s[2], w10
    mov   v0.s[3], w11

Change-Id: I0fdf5c6cdcde6a4eb9290936284fd3ffcb2159f6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224821
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2019-07-01 11:18:08 -05:00 committed by Skia Commit-Bot
parent e0b2dafeb6
commit 1fa149a713
3 changed files with 61 additions and 37 deletions

View File

@ -741,36 +741,34 @@ namespace skvm {
// tbl: encoded through the 3-operand op() helper, which takes the opcode in two
// pieces (hi, lo) with the m register passed between them.
void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
void Assembler::shift(uint32_t op, int imm, V n, V d) {
this->word( (op & 22_mask) << 10
| imm << 16 // imm is embedded inside op, bit size depends on op
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
// Shared encoder for 2-operand vector instructions, with an optional immediate.
// The value handed to word() is assembled as:
//   op22 (masked to 22 bits) shifted up to start at bit 10,
//   imm  OR'd in starting at bit 16, overlapping the op22 field,
//   n    (masked to 5 bits) at bits 9..5,
//   d    (masked to 5 bits) at bits 4..0.
// imm is NOT masked here; the shift helpers in this file mask it themselves
// (e.g. imm&31) before calling, and immediate-free ops pass 0.
void Assembler::op(uint32_t op22, int imm, V n, V d) {
this->word( (op22 & 22_mask) << 10
| imm << 16 // imm is embedded inside op, bit size depends on op
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
}
void Assembler::shl4s(V d, V n, int imm) {
this->shift(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
this->op(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
}
void Assembler::sshr4s(V d, V n, int imm) {
this->shift(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
}
void Assembler::ushr4s(V d, V n, int imm) {
this->shift(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
}
void Assembler::ushr8h(V d, V n, int imm) {
this->shift(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
}
void Assembler::scvtf4s(V d, V n) {
this->word(0b0'1'0'01110'0'0'10000'11101'10 << 10
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
}
void Assembler::fcvtzs4s(V d, V n) {
this->word(0b0'1'0'01110'1'0'10000'1101'1'10 << 10
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
}
// 2-operand vector ops, d = op(n). Each forwards a fixed 22-bit opcode plus the
// n and d registers to op(), with no immediate.
//   scvtf4s  : int -> float
//   fcvtzs4s : float -> int, truncating
//   xtns2h   : narrow 32-bit lanes to 16-bit
//   xtnh2b   : narrow 16-bit lanes to 8-bit
//   uxtlb2h  : widen 8-bit lanes to 16-bit (unsigned)
//   uxtlh2s  : widen 16-bit lanes to 32-bit (unsigned)
void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
void Assembler::ret(X n) {
this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
@ -784,7 +782,7 @@ namespace skvm {
| (d & 5_mask) << 0);
}
void Assembler::subs(X d, X n, int imm12) {
this->word( 0b1'1'1'10001'00 << 22
this->word( 0b1'1'1'10001'00 << 22
| (imm12 & 12_mask) << 10
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
@ -798,17 +796,11 @@ namespace skvm {
| 0b0'0001 << 0);
}
void Assembler::ldrq(V dst, X src) {
this->word( 0b00'111'1'01'11'000000000000 << 10
| (src & 5_mask) << 5
| (dst & 5_mask) << 0);
}
// Loads through the pointer in src: ldrq is the 128-bit load, ldrs the 32-bit one.
// The twelve trailing zero bits of each opcode are the offset field, left at zero.
// The pointer register is passed in op()'s n position and the vector in d.
void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
void Assembler::strq(V src, X dst) {
this->word( 0b00'111'1'01'10'000000000000 << 10
| (dst & 5_mask) << 5
| (src & 5_mask) << 0);
}
// Stores through the pointer in dst: strq is the 128-bit store, strs the 32-bit one.
// Note the argument order passed to op(): the pointer register (dst) goes in the
// n position and the vector being stored (src) goes in the d position.
void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
void Assembler::ldrq(V dst, Label l) {
const int imm19 = (l.offset - here().offset) / 4;
@ -1095,11 +1087,16 @@ namespace skvm {
z = inst.z;
int imm = inst.imm;
switch (op) {
#define TODO if (0) SkDebugf("op %d\n", op); return 0
case Op::store8: TODO;
case Op::store8: a.xtns2h(r(tmp), r(x));
a.xtnh2b(r(tmp), r(tmp));
a.strs (r(tmp), arg[imm]);
break;
case Op::store32: a.strq(r(x), arg[imm]); break;
case Op::load8: TODO;
case Op::load8: a.ldrs (r(tmp), arg[imm]);
a.uxtlb2h(r(tmp), r(tmp));
a.uxtlh2s(r(d) , r(tmp));
break;
case Op::load32: a.ldrq(r(d), arg[imm]); break;
case Op::splat: a.ldrq(r(d), *splats.find(imm)); break;

View File

@ -118,7 +118,12 @@ namespace skvm {
// d = op(n)
using DOpN = void(V d, V n);
DOpN scvtf4s, fcvtzs4s;
DOpN scvtf4s, // int -> float
fcvtzs4s, // truncate float -> int
xtns2h, // u32 -> u16
xtnh2b, // u16 -> u8
uxtlb2h, // u8 -> u16
uxtlh2s; // u16 -> u32
// TODO: both these platforms support rounding float->int (vcvtps2dq, fcvtns.4s)... use?
@ -129,7 +134,9 @@ namespace skvm {
void ldrq(V dst, Label); // 128-bit PC-relative load
void ldrq(V dst, X src); // 128-bit dst = *src
void ldrs(V dst, X src); // 32-bit dst[0] = *src
void strq(V src, X dst); // 128-bit *dst = src
void strs(V src, X dst); // 32-bit *dst = src[0]
private:
// dst = op(dst, imm)
@ -155,12 +162,14 @@ namespace skvm {
// *ptr = ymm or ymm = *ptr, depending on opcode.
void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);
// General layout top to bottom is:
// Opcode for 3-arguments ops is split between hi and lo:
// [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
// where the opcode is split between hi and lo.
void op(uint32_t hi, V m, uint32_t lo, V n, V d);
void shift(uint32_t op, int imm, V n, V d);
// 2-argument ops, with or without an immediate.
//   (op22, imm, n, d) : general form; imm is OR'd into the opcode bits.
//   (op22, n, d)      : no immediate; forwards to the general form with imm = 0.
//   (op22, x, v)      : pointer-register form used by the load/store encoders;
//                       x is cast to V so it can occupy the n slot, imm = 0.
void op(uint32_t op22, int imm, V n, V d);
void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }
void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
uint8_t* fCode;
size_t fSize;

View File

@ -492,4 +492,22 @@ DEF_TEST(SkVM_Assembler, r) {
0x00, 0x01, 0xc0, 0x3d,
0x00, 0x01, 0x80, 0x3d,
});
// Encodings for the narrowing-store and widening-load sequences used by the
// 8-bit store/load ops. Each row of expected bytes is one instruction word,
// least-significant byte first.
test_asm(r, [&](A& a) {
a.xtns2h(A::v0, A::v0);
a.xtnh2b(A::v0, A::v0);
a.strs (A::v0, A::x0);
a.ldrs (A::v0, A::x0);
a.uxtlb2h(A::v0, A::v0);
a.uxtlh2s(A::v0, A::v0);
},{
0x00,0x28,0x61,0x0e, // xtn  v0.4h, v0.4s
0x00,0x28,0x21,0x0e, // xtn  v0.8b, v0.8h
0x00,0x00,0x00,0xbd, // str  s0, [x0]
0x00,0x00,0x40,0xbd, // ldr  s0, [x0]
0x00,0xa4,0x08,0x2f, // uxtl v0.8h, v0.8b
0x00,0xa4,0x10,0x2f, // uxtl v0.4s, v0.4h
});
}