[wasm-simd][x64][avx2] Improve codegen for load{8,16}_splat
Detect AVX2 support and use vpbroadcastb or vpbroadcastw. No new assembler helpers required because we are only emitting the VEX-128 versions of these instructions. Bug: v8:11258 Change-Id: Ic50178daa6fc8fe767dfc788e61e67538066bdea Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2596582 Commit-Queue: Zhi An Ng <zhin@chromium.org> Reviewed-by: Bill Budge <bbudge@chromium.org> Cr-Commit-Position: refs/heads/master@{#71866}
This commit is contained in:
parent
fccbf1b876
commit
c9560d1dbf
@ -3535,9 +3535,10 @@ void Assembler::vmovhps(Operand dst, XMMRegister src) {
|
||||
}
|
||||
|
||||
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2, SIMDPrefix pp, LeadingOpcode m,
|
||||
VexW w) {
|
||||
DCHECK(IsEnabled(AVX));
|
||||
XMMRegister src2, SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature) {
|
||||
DCHECK(IsEnabled(feature));
|
||||
DCHECK(feature == AVX || feature == AVX2);
|
||||
EnsureSpace ensure_space(this);
|
||||
emit_vex_prefix(dst, src1, src2, kLIG, pp, m, w);
|
||||
emit(op);
|
||||
@ -3545,8 +3546,10 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
|
||||
}
|
||||
|
||||
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w) {
|
||||
DCHECK(IsEnabled(AVX));
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w,
|
||||
CpuFeature feature) {
|
||||
DCHECK(IsEnabled(feature));
|
||||
DCHECK(feature == AVX || feature == AVX2);
|
||||
EnsureSpace ensure_space(this);
|
||||
emit_vex_prefix(dst, src1, src2, kLIG, pp, m, w);
|
||||
emit(op);
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "src/codegen/assembler.h"
|
||||
#include "src/codegen/cpu-features.h"
|
||||
#include "src/codegen/label.h"
|
||||
#include "src/codegen/x64/constants-x64.h"
|
||||
#include "src/codegen/x64/fma-instr.h"
|
||||
@ -945,9 +946,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
void movmskps(Register dst, XMMRegister src);
|
||||
|
||||
void vinstr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w);
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
|
||||
void vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w);
|
||||
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
|
||||
|
||||
// SSE instructions
|
||||
void sse_instr(XMMRegister dst, XMMRegister src, byte escape, byte opcode);
|
||||
@ -1669,6 +1670,19 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
void vpd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
void vpd(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
|
||||
|
||||
// AVX2 instructions
// For every entry of AVX2_BROADCAST_LIST this declares two overloads named
// after the instruction: one with an XMM register source and one with a
// memory Operand source. Both forward to vinstr() with the entry's opcode,
// SIMD prefix and escape bytes, VEX.W = kW0, and AVX2 as the required CPU
// feature. xmm0 is passed in the src1 slot.
// NOTE(review): presumably xmm0 is used because these instructions take no
// first-source operand -- confirm against the VEX encoding.
|
||||
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) \
|
||||
void instr(XMMRegister dst, XMMRegister src) { \
|
||||
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
|
||||
AVX2); \
|
||||
} \
|
||||
void instr(XMMRegister dst, Operand src) { \
|
||||
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
|
||||
AVX2); \
|
||||
}
|
||||
AVX2_BROADCAST_LIST(AVX2_INSTRUCTION)
|
||||
#undef AVX2_INSTRUCTION
|
||||
|
||||
// BMI instruction
|
||||
void andnq(Register dst, Register src1, Register src2) {
|
||||
bmi1q(0xf2, dst, src1, src2);
|
||||
|
@ -180,4 +180,9 @@
|
||||
|
||||
#define SSE4_2_INSTRUCTION_LIST(V) V(pcmpgtq, 66, 0F, 38, 37)
|
||||
|
||||
// These require AVX2, and we only define the VEX-128 versions.
// Each entry has the form V(mnemonic, prefix, escape1, escape2, opcode); the
// last four columns are hexadecimal byte values written without the 0x.
|
||||
#define AVX2_BROADCAST_LIST(V) \
|
||||
V(vpbroadcastb, 66, 0F, 38, 78) \
|
||||
V(vpbroadcastw, 66, 0F, 38, 79)
|
||||
|
||||
#endif // V8_CODEGEN_X64_SSE_INSTR_H_
|
||||
|
@ -3850,17 +3850,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
case kX64S128Load8Splat: {
|
||||
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
__ Pshufb(dst, kScratchDoubleReg);
|
||||
if (CpuFeatures::IsSupported(AVX2)) {
|
||||
CpuFeatureScope avx2_scope(tasm(), AVX2);
|
||||
__ vpbroadcastb(dst, i.MemoryOperand());
|
||||
} else {
|
||||
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
__ Pshufb(dst, kScratchDoubleReg);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kX64S128Load16Splat: {
|
||||
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pshuflw(dst, dst, uint8_t{0});
|
||||
__ Punpcklqdq(dst, dst);
|
||||
if (CpuFeatures::IsSupported(AVX2)) {
|
||||
CpuFeatureScope avx2_scope(tasm(), AVX2);
|
||||
__ vpbroadcastw(dst, i.MemoryOperand());
|
||||
} else {
|
||||
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
|
||||
__ Pshuflw(dst, dst, uint8_t{0});
|
||||
__ Punpcklqdq(dst, dst);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kX64S128Load32Splat: {
|
||||
|
@ -971,6 +971,14 @@ int DisassemblerX64::AVXInstruction(byte* data) {
|
||||
SSE4_UNOP_INSTRUCTION_LIST(DECLARE_SSE_UNOP_AVX_DIS_CASE)
|
||||
#undef DECLARE_SSE_UNOP_AVX_DIS_CASE
|
||||
|
||||
// Expands to one switch case per AVX2_BROADCAST_LIST entry, keyed on the
// entry's opcode byte. Each case prints the mnemonic followed by the
// destination XMM register (regop), then prints the right-hand XMM/memory
// operand and advances `current` by the value PrintRightXMMOperand returns.
// The prefix and escape columns of the list entry are ignored here.
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code) \
|
||||
case 0x##code: \
|
||||
AppendToBuffer("" #instruction " %s,", NameOfXMMRegister(regop)); \
|
||||
current += PrintRightXMMOperand(current); \
|
||||
break;
|
||||
AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
|
||||
#undef DISASSEMBLE_AVX2_BROADCAST
|
||||
|
||||
default:
|
||||
UnimplementedInstruction();
|
||||
}
|
||||
|
@ -847,6 +847,18 @@ TEST(DisasmX64) {
|
||||
}
|
||||
}
|
||||
|
||||
// AVX2 instructions.
// Emits every AVX2_BROADCAST_LIST instruction twice -- once with an XMM
// register source and once with a memory operand source. Nothing is emitted
// when the host CPU does not report AVX2 support.
// NOTE(review): EMIT_AVX2_BROADCAST is not #undef'd in the lines visible
// here -- confirm whether that is intentional.
|
||||
{
|
||||
if (CpuFeatures::IsSupported(AVX2)) {
|
||||
CpuFeatureScope scope(&assm, AVX2);
|
||||
#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
|
||||
notUsed4) \
|
||||
__ instruction(xmm0, xmm1); \
|
||||
__ instruction(xmm0, Operand(rbx, rcx, times_4, 10000));
|
||||
AVX2_BROADCAST_LIST(EMIT_AVX2_BROADCAST)
|
||||
}
|
||||
||||
}
|
||||
|
||||
// FMA3 instruction
|
||||
{
|
||||
if (CpuFeatures::IsSupported(FMA3)) {
|
||||
|
Loading…
Reference in New Issue
Block a user