[wasm-simd][x64][avx2] Improve codegen for load{8,16}_splat

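When AVX2 is supported, emit a single vpbroadcastb (load8_splat) or vpbroadcastw (load16_splat) instead of the multi-instruction SSE sequence, which is kept as the fallback.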

No new assembler helpers required because we are only emitting the
VEX-128 versions of these instructions.

Bug: v8:11258
Change-Id: Ic50178daa6fc8fe767dfc788e61e67538066bdea
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2596582
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71866}
This commit is contained in:
Zhi An Ng 2020-12-23 01:06:45 +00:00 committed by Commit Bot
parent fccbf1b876
commit c9560d1dbf
6 changed files with 65 additions and 13 deletions

View File

@ -3535,9 +3535,10 @@ void Assembler::vmovhps(Operand dst, XMMRegister src) {
}
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2, SIMDPrefix pp, LeadingOpcode m,
VexW w) {
DCHECK(IsEnabled(AVX));
XMMRegister src2, SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature) {
DCHECK(IsEnabled(feature));
DCHECK(feature == AVX || feature == AVX2);
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kLIG, pp, m, w);
emit(op);
@ -3545,8 +3546,10 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
}
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w) {
DCHECK(IsEnabled(AVX));
SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature) {
DCHECK(IsEnabled(feature));
DCHECK(feature == AVX || feature == AVX2);
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kLIG, pp, m, w);
emit(op);

View File

@ -43,6 +43,7 @@
#include <vector>
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/label.h"
#include "src/codegen/x64/constants-x64.h"
#include "src/codegen/x64/fma-instr.h"
@ -945,9 +946,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movmskps(Register dst, XMMRegister src);
void vinstr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
SIMDPrefix pp, LeadingOpcode m, VexW w);
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
void vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w);
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
// SSE instructions
void sse_instr(XMMRegister dst, XMMRegister src, byte escape, byte opcode);
@ -1669,6 +1670,19 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vpd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vpd(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
// AVX2 instructions
// AVX2_INSTRUCTION stamps out two emitters per instruction: one taking an
// XMMRegister source and one taking a memory Operand source. Both forward to
// vinstr() with opcode byte 0x<opcode>, SIMD prefix k<prefix>, leading opcode
// k<escape1><escape2>, kW0, and AVX2 as the required CPU feature.
// xmm0 is passed in vinstr's src1 slot.
// NOTE(review): presumably a placeholder because these instructions take a
// single source operand -- confirm against the VEX encoding.
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) \
void instr(XMMRegister dst, XMMRegister src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
} \
void instr(XMMRegister dst, Operand src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
}
// Instantiate the emitters for every entry of AVX2_BROADCAST_LIST, then drop
// the helper macro so it does not leak out of this header.
AVX2_BROADCAST_LIST(AVX2_INSTRUCTION)
#undef AVX2_INSTRUCTION
// BMI instruction
void andnq(Register dst, Register src1, Register src2) {
bmi1q(0xf2, dst, src1, src2);

View File

@ -180,4 +180,9 @@
#define SSE4_2_INSTRUCTION_LIST(V) V(pcmpgtq, 66, 0F, 38, 37)
// These require AVX2, and we only define the VEX-128 versions.
// X-macro table: each entry is V(mnemonic, prefix, escape1, escape2, opcode).
// The four trailing fields are bare hex digit pairs; consumers token-paste
// them (e.g. 0x##opcode, k##prefix) rather than using them as numbers.
#define AVX2_BROADCAST_LIST(V) \
V(vpbroadcastb, 66, 0F, 38, 78) \
V(vpbroadcastw, 66, 0F, 38, 79)
#endif // V8_CODEGEN_X64_SSE_INSTR_H_

View File

@ -3850,17 +3850,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kX64S128Load8Splat: {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
XMMRegister dst = i.OutputSimd128Register();
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(dst, kScratchDoubleReg);
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(tasm(), AVX2);
__ vpbroadcastb(dst, i.MemoryOperand());
} else {
__ Pinsrb(dst, dst, i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(dst, kScratchDoubleReg);
}
break;
}
case kX64S128Load16Splat: {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
XMMRegister dst = i.OutputSimd128Register();
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
__ Pshuflw(dst, dst, uint8_t{0});
__ Punpcklqdq(dst, dst);
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(tasm(), AVX2);
__ vpbroadcastw(dst, i.MemoryOperand());
} else {
__ Pinsrw(dst, dst, i.MemoryOperand(), 0);
__ Pshuflw(dst, dst, uint8_t{0});
__ Punpcklqdq(dst, dst);
}
break;
}
case kX64S128Load32Splat: {

View File

@ -971,6 +971,14 @@ int DisassemblerX64::AVXInstruction(byte* data) {
SSE4_UNOP_INSTRUCTION_LIST(DECLARE_SSE_UNOP_AVX_DIS_CASE)
#undef DECLARE_SSE_UNOP_AVX_DIS_CASE
// Expands to one `case` per AVX2_BROADCAST_LIST entry, keyed on the entry's
// opcode byte (0x<code>). Each case prints the mnemonic followed by the name
// of the XMM register selected by `regop`, then appends the right-hand
// operand via PrintRightXMMOperand and advances `current` by its return
// value. The three middle list fields (_1, _2, _3) are unused here.
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code) \
case 0x##code: \
AppendToBuffer("" #instruction " %s,", NameOfXMMRegister(regop)); \
current += PrintRightXMMOperand(current); \
break;
AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
#undef DISASSEMBLE_AVX2_BROADCAST
default:
UnimplementedInstruction();
}

View File

@ -847,6 +847,18 @@ TEST(DisasmX64) {
}
}
// AVX2 instructions.
{
  // Emitted only when CpuFeatures reports AVX2 support; the CpuFeatureScope
  // enables AVX2 on the assembler for the duration of this block.
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope scope(&assm, AVX2);
// For every AVX2_BROADCAST_LIST entry, emit both the register-source form and
// the memory-source form. Only the mnemonic field of each entry is used.
#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
                            notUsed4)                                  \
  __ instruction(xmm0, xmm1);                                          \
  __ instruction(xmm0, Operand(rbx, rcx, times_4, 10000));
    AVX2_BROADCAST_LIST(EMIT_AVX2_BROADCAST)
// Undefine the helper right after use so it does not leak into the rest of
// the translation unit.
#undef EMIT_AVX2_BROADCAST
  }
}
// FMA3 instruction
{
if (CpuFeatures::IsSupported(FMA3)) {