[x64] Implement 256-bit assembly for v(p)broadcast*
Bug: v8:12228 Change-Id: I434b07e3d7a2e270dc7dd26950b9dd047eb46a56 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3219944 Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Yolanda Chen <yolanda.chen@intel.com> Cr-Commit-Position: refs/heads/main@{#77446}
This commit is contained in:
parent
dfbd9edb87
commit
83a58b70e6
@ -3441,21 +3441,24 @@ void Assembler::vmovshdup(XMMRegister dst, XMMRegister src) {
|
||||
emit_sse_operand(dst, src);
|
||||
}
|
||||
|
||||
// Emits vbroadcastss with an XMMRegister destination and an Operand source.
// DCHECKs that AVX is enabled, then writes a VEX prefix (kL128, k66, k0F38,
// kW0), the opcode byte 0x18 and the dst/src operand encoding.
// xmm0 is passed as the middle register argument of emit_vex_prefix --
// presumably a placeholder because this instruction has no second source.
void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
|
||||
DCHECK(IsEnabled(AVX));
|
||||
EnsureSpace ensure_space(this);
|
||||
emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
|
||||
emit(0x18);
|
||||
emit_sse_operand(dst, src);
|
||||
}
|
||||
|
||||
// Emits vbroadcastss with an XMMRegister destination and an XMMRegister
// source. This register-source form DCHECKs AVX2 (not just AVX), then writes
// a VEX prefix (kL128, k66, k0F38, kW0), the opcode byte 0x18 and the dst/src
// operand encoding.
// xmm0 is passed as the middle register argument of emit_vex_prefix --
// presumably a placeholder because this instruction has no second source.
void Assembler::vbroadcastss(XMMRegister dst, XMMRegister src) {
|
||||
DCHECK(IsEnabled(AVX2));
|
||||
EnsureSpace ensure_space(this);
|
||||
emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
|
||||
emit(0x18);
|
||||
emit_sse_operand(dst, src);
|
||||
}
|
||||
// BROADCASTSS(SIMDRegister, length) defines the two Assembler::vbroadcastss
// overloads for one destination register type:
//   - (SIMDRegister dst, Operand src): DCHECKs AVX.
//   - (SIMDRegister dst, XMMRegister src): DCHECKs AVX2.
// Both write a VEX prefix with vector length k##length (plus k66, k0F38,
// kW0), the opcode byte 0x18 and the dst/src operand encoding.
// The macro is instantiated below for XMMRegister/L128 and YMMRegister/L256
// and then undefined.
#define BROADCASTSS(SIMDRegister, length) \
|
||||
void Assembler::vbroadcastss(SIMDRegister dst, Operand src) { \
|
||||
DCHECK(IsEnabled(AVX)); \
|
||||
EnsureSpace ensure_space(this); \
|
||||
emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0); \
|
||||
emit(0x18); \
|
||||
emit_sse_operand(dst, src); \
|
||||
} \
|
||||
void Assembler::vbroadcastss(SIMDRegister dst, XMMRegister src) { \
|
||||
DCHECK(IsEnabled(AVX2)); \
|
||||
EnsureSpace ensure_space(this); \
|
||||
emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0); \
|
||||
emit(0x18); \
|
||||
emit_sse_operand(dst, src); \
|
||||
}
|
||||
BROADCASTSS(XMMRegister, L128)
|
||||
BROADCASTSS(YMMRegister, L256)
|
||||
#undef BROADCASTSS
|
||||
|
||||
void Assembler::fma_instr(byte op, XMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2, VectorLength l, SIMDPrefix pp,
|
||||
@ -3652,17 +3655,6 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
|
||||
emit_sse_operand(dst, src2);
|
||||
}
|
||||
|
||||
// Emits a VEX-encoded instruction whose three operands are YMM registers.
//   op             - opcode byte written after the VEX prefix.
//   dst/src1/src2  - registers passed to emit_vex_prefix; dst and src2 are
//                    also passed to emit_sse_operand.
//   pp, m, w       - SIMDPrefix, LeadingOpcode and VexW values forwarded to
//                    emit_vex_prefix.
//   feature        - CPU feature that must be enabled; DCHECKed to be AVX or
//                    AVX2.
// The vector length passed to emit_vex_prefix is fixed at kL256.
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
|
||||
YMMRegister src2, SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature) {
|
||||
DCHECK(IsEnabled(feature));
|
||||
DCHECK(feature == AVX || feature == AVX2);
|
||||
EnsureSpace ensure_space(this);
|
||||
emit_vex_prefix(dst, src1, src2, kL256, pp, m, w);
|
||||
emit(op);
|
||||
emit_sse_operand(dst, src2);
|
||||
}
|
||||
|
||||
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature) {
|
||||
@ -3674,9 +3666,9 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
|
||||
emit_sse_operand(dst, src2);
|
||||
}
|
||||
|
||||
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature) {
|
||||
template <typename Reg1, typename Reg2, typename Op>
|
||||
void Assembler::vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature) {
|
||||
DCHECK(IsEnabled(feature));
|
||||
DCHECK(feature == AVX || feature == AVX2);
|
||||
EnsureSpace ensure_space(this);
|
||||
@ -3685,6 +3677,19 @@ void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
|
||||
emit_sse_operand(dst, src2);
|
||||
}
|
||||
|
||||
// Explicit instantiations of the Assembler::vinstr template. All four have a
// YMMRegister destination; the (src1, src2) pairs are, in order:
// (YMMRegister, YMMRegister), (XMMRegister, XMMRegister),
// (YMMRegister, Operand) and (XMMRegister, Operand).
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
|
||||
byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
|
||||
byte op, YMMRegister dst, XMMRegister src1, XMMRegister src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
|
||||
byte op, YMMRegister dst, YMMRegister src1, Operand src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
|
||||
byte op, YMMRegister dst, XMMRegister src1, Operand src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
|
||||
void Assembler::vps(byte op, XMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2) {
|
||||
DCHECK(IsEnabled(AVX));
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "src/base/export-template.h"
|
||||
#include "src/codegen/assembler.h"
|
||||
#include "src/codegen/cpu-features.h"
|
||||
#include "src/codegen/label.h"
|
||||
@ -930,14 +931,12 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
|
||||
// Non-template vinstr overloads taking an opcode byte, a destination, two
// sources (the second either a register or an Operand), a SIMDPrefix, a
// LeadingOpcode, a VexW and a CpuFeature. The XMMRegister overloads default
// `feature` to AVX; the YMMRegister overloads default it to AVX2.
void vinstr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
|
||||
void vinstr(byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature = AVX2);
|
||||
void vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
|
||||
void vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature = AVX2);
|
||||
|
||||
// Generic vinstr, templated on the destination (Reg1), first-source (Reg2)
// and second-source (Op) types. `feature` defaults to AVX2.
template <typename Reg1, typename Reg2, typename Op>
|
||||
void vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature = AVX2);
|
||||
|
||||
// SSE instructions
|
||||
void sse_instr(XMMRegister dst, XMMRegister src, byte escape, byte opcode);
|
||||
@ -1290,6 +1289,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
void vmovshdup(XMMRegister dst, XMMRegister src);
|
||||
// vbroadcastss overloads: the destination is an XMMRegister or a YMMRegister,
// and the source is either an Operand or an XMMRegister.
void vbroadcastss(XMMRegister dst, Operand src);
|
||||
void vbroadcastss(XMMRegister dst, XMMRegister src);
|
||||
void vbroadcastss(YMMRegister dst, Operand src);
|
||||
void vbroadcastss(YMMRegister dst, XMMRegister src);
|
||||
|
||||
void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
VectorLength l, SIMDPrefix pp, LeadingOpcode m, VexW w);
|
||||
@ -1735,11 +1736,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
|
||||
// AVX2 instructions
|
||||
// AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) defines emitter
// method(s) named `instr` that forward to vinstr with opcode 0x##opcode, xmm0
// as the middle register argument, SIMD prefix k##prefix, leading opcode
// k##escape1##escape2, kW0 and the AVX2 feature.
// NOTE(review): this span is a rendered diff hunk, so it shows both the
// XMMRegister-only overloads and the templated (Reg dst, Op src) form --
// confirm which lines belong to the current revision.
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) \
|
||||
void instr(XMMRegister dst, XMMRegister src) { \
|
||||
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
|
||||
AVX2); \
|
||||
} \
|
||||
void instr(XMMRegister dst, Operand src) { \
|
||||
template <typename Reg, typename Op> \
|
||||
void instr(Reg dst, Op src) { \
|
||||
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
|
||||
AVX2); \
|
||||
}
|
||||
@ -2418,6 +2416,23 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
#endif
|
||||
};
|
||||
|
||||
// Explicit instantiation declarations (extern template) for
// Assembler::vinstr. All four have a YMMRegister destination; the
// (src1, src2) pairs are, in order: (YMMRegister, YMMRegister),
// (XMMRegister, XMMRegister), (YMMRegister, Operand) and
// (XMMRegister, Operand).
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
|
||||
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
|
||||
YMMRegister src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
|
||||
void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2, SIMDPrefix pp,
|
||||
LeadingOpcode m, VexW w, CpuFeature feature);
|
||||
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
|
||||
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
|
||||
Operand src2, SIMDPrefix pp, LeadingOpcode m,
|
||||
VexW w, CpuFeature feature);
|
||||
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
|
||||
void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
|
||||
Operand src2, SIMDPrefix pp, LeadingOpcode m,
|
||||
VexW w, CpuFeature feature);
|
||||
|
||||
// Helper class that ensures that there is enough space for generating
|
||||
// instructions and relocation information. The constructor makes
|
||||
// sure that there is enough space and (in debug mode) the destructor
|
||||
|
@ -186,7 +186,7 @@
|
||||
|
||||
#define SSE4_2_INSTRUCTION_LIST(V) V(pcmpgtq, 66, 0F, 38, 37)
|
||||
|
||||
// These require AVX2, and we only define the VEX-128 versions.
|
||||
// These require AVX2.
|
||||
#define AVX2_BROADCAST_LIST(V) \
|
||||
V(vpbroadcastd, 66, 0F, 38, 58) \
|
||||
V(vpbroadcastb, 66, 0F, 38, 78) \
|
||||
|
@ -891,7 +891,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
|
||||
switch (opcode) {
|
||||
case 0x18:
|
||||
AppendToBuffer("vbroadcastss %s,", NameOfAVXRegister(regop));
|
||||
current += PrintRightAVXOperand(current);
|
||||
current += PrintRightXMMOperand(current);
|
||||
break;
|
||||
case 0x98:
|
||||
AppendToBuffer("vfmadd132p%c %s,%s,", float_size_code(),
|
||||
@ -1017,7 +1017,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
|
||||
// DISASSEMBLE_AVX2_BROADCAST expands to one `case 0x<code>` per entry of
// AVX2_BROADCAST_LIST: it appends the instruction name and the AVX register
// name of `regop` to the buffer, prints the right-hand operand, and adds the
// value returned by the print helper to `current`. The macro is undefined
// right after the list is expanded.
// NOTE(review): this span is a rendered diff hunk, so both the
// PrintRightAVXOperand and PrintRightXMMOperand lines appear -- confirm which
// one belongs to the current revision.
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code) \
|
||||
case 0x##code: \
|
||||
AppendToBuffer("" #instruction " %s,", NameOfAVXRegister(regop)); \
|
||||
current += PrintRightAVXOperand(current); \
|
||||
current += PrintRightXMMOperand(current); \
|
||||
break;
|
||||
AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
|
||||
#undef DISASSEMBLE_AVX2_BROADCAST
|
||||
|
@ -2536,6 +2536,7 @@ TEST(AssemblerX64Regmove256bit) {
|
||||
__ vmovdqu(ymm10, ymm11);
|
||||
__ vmovdqu(ymm9, Operand(rbx, rcx, times_4, 10000));
|
||||
__ vmovdqu(Operand(rbx, rcx, times_4, 10000), ymm0);
|
||||
__ vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000));
|
||||
|
||||
CodeDesc desc;
|
||||
masm.GetCode(isolate, &desc);
|
||||
@ -2558,11 +2559,15 @@ TEST(AssemblerX64Regmove256bit) {
|
||||
// vmovdqu ymm9,YMMWORD PTR [rbx+rcx*4+0x2710]
|
||||
0xC5, 0x7E, 0x6F, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00,
|
||||
// vmovdqu YMMWORD PTR [rbx+rcx*4+0x2710],ymm0
|
||||
0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00};
|
||||
0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00,
|
||||
|
||||
// vbroadcastss ymm7, DWORD PTR [rbx+rcx*4+0x2710]
|
||||
0xc4, 0xe2, 0x7d, 0x18, 0xbc, 0x8b, 0x10, 0x27, 0x00,
|
||||
0x00};
|
||||
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
|
||||
}
|
||||
|
||||
TEST(AssemblerX64LaneOp256bit) {
|
||||
TEST(AssemblerX64AVX2Op256bit) {
|
||||
if (!CpuFeatures::IsSupported(AVX2)) return;
|
||||
CcTest::InitializeVM();
|
||||
v8::HandleScope scope(CcTest::isolate());
|
||||
@ -2581,6 +2586,11 @@ TEST(AssemblerX64LaneOp256bit) {
|
||||
__ vpblendw(ymm2, ymm3, Operand(rbx, rcx, times_4, 10000), 23);
|
||||
__ vpalignr(ymm10, ymm11, ymm12, 4);
|
||||
__ vpalignr(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000), 4);
|
||||
__ vbroadcastss(ymm7, xmm0);
|
||||
__ vpbroadcastb(ymm2, xmm1);
|
||||
__ vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000));
|
||||
__ vpbroadcastw(ymm15, xmm4);
|
||||
__ vpbroadcastw(ymm5, Operand(rbx, rcx, times_4, 10000));
|
||||
|
||||
CodeDesc desc;
|
||||
masm.GetCode(isolate, &desc);
|
||||
@ -2611,7 +2621,17 @@ TEST(AssemblerX64LaneOp256bit) {
|
||||
// vpalignr ymm10, ymm11, ymm12, 4
|
||||
0xC4, 0x43, 0x25, 0x0F, 0xD4, 0x04,
|
||||
// vpalignr ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710], 4
|
||||
0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04};
|
||||
0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04,
|
||||
// vbroadcastss ymm7, xmm0
|
||||
0xc4, 0xe2, 0x7d, 0x18, 0xf8,
|
||||
// vpbroadcastb ymm2, xmm1
|
||||
0xc4, 0xe2, 0x7d, 0x78, 0xd1,
|
||||
// vpbroadcastb ymm3, BYTE PTR [rbx+rcx*4+0x2710]
|
||||
0xc4, 0xe2, 0x7d, 0x78, 0x9c, 0x8b, 0x10, 0x27, 0x00, 0x00,
|
||||
// vpbroadcastw ymm15, xmm4
|
||||
0xc4, 0x62, 0x7d, 0x79, 0xfc,
|
||||
// vpbroadcastw ymm5, WORD PTR [rbx+rcx*4+0x2710]
|
||||
0xc4, 0xe2, 0x7d, 0x79, 0xac, 0x8b, 0x10, 0x27, 0x00, 0x00};
|
||||
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
|
||||
}
|
||||
|
||||
|
@ -1312,14 +1312,36 @@ UNINITIALIZED_TEST(DisasmX64CheckOutputAVX) {
|
||||
// Exercises YMM-register instructions through a DisassemblerTester.
// Returns immediately when AVX is not supported. The broadcast cases after
// the second IsSupported check run only when AVX2 is supported, inside their
// own CpuFeatureScope.
// NOTE(review): COMPARE is defined outside this view -- presumably it
// assembles the given instruction and checks the hex bytes and disassembly
// text in the string; confirm against its definition.
// NOTE(review): this span is a rendered diff hunk; the unscoped AVX cases and
// the braced AVX block repeat the same checks (old and new sides) -- confirm
// which lines belong to the current revision.
UNINITIALIZED_TEST(DisasmX64YMMRegister) {
|
||||
if (!CpuFeatures::IsSupported(AVX)) return;
|
||||
DisassemblerTester t;
|
||||
CpuFeatureScope fscope(t.assm(), AVX);
|
||||
|
||||
// Short immediate instructions
|
||||
COMPARE("c5fd6fc1 vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
|
||||
COMPARE("c5f77cc2 vhaddps ymm0,ymm1,ymm2",
|
||||
vhaddps(ymm0, ymm1, ymm2));
|
||||
COMPARE("c5f77c848b10270000 vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
|
||||
vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
|
||||
{
|
||||
CpuFeatureScope fscope(t.assm(), AVX);
|
||||
|
||||
// Short immediate instructions
|
||||
COMPARE("c5fd6fc1 vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
|
||||
COMPARE("c5f77cc2 vhaddps ymm0,ymm1,ymm2",
|
||||
vhaddps(ymm0, ymm1, ymm2));
|
||||
COMPARE("c5f77c848b10270000 vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
|
||||
vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
|
||||
COMPARE("c4e27d18bc8b10270000 vbroadcastss ymm7,[rbx+rcx*4+0x2710]",
|
||||
vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000)));
|
||||
}
|
||||
|
||||
if (!CpuFeatures::IsSupported(AVX2)) return;
|
||||
{
|
||||
CpuFeatureScope fscope(t.assm(), AVX2);
|
||||
|
||||
// Short immediate instructions
|
||||
COMPARE("c4e27d18d1 vbroadcastss ymm2,xmm1",
|
||||
vbroadcastss(ymm2, xmm1));
|
||||
COMPARE("c4e27d789c8b10270000 vpbroadcastb ymm3,[rbx+rcx*4+0x2710]",
|
||||
vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000)));
|
||||
COMPARE("c4e27d79d3 vpbroadcastw ymm2,xmm3",
|
||||
vpbroadcastw(ymm2, xmm3));
|
||||
COMPARE("c4c27d58f8 vpbroadcastd ymm7,xmm8",
|
||||
vpbroadcastd(ymm7, xmm8));
|
||||
COMPARE("c4627d588c8b10270000 vpbroadcastd ymm9,[rbx+rcx*4+0x2710]",
|
||||
vpbroadcastd(ymm9, Operand(rbx, rcx, times_4, 10000)));
|
||||
}
|
||||
}
|
||||
|
||||
#undef __
|
||||
|
Loading…
Reference in New Issue
Block a user