[wasm-simd] AVX codegen for load splat

Bug: v8:9886
Change-Id: I321e93d02971c6ba568d9d7c52d464ffc2754665
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1929837
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65277}
This commit is contained in:
Ng Zhi An 2019-12-02 01:20:32 -08:00 committed by Commit Bot
parent 2fb290d79a
commit 83fc8559fa
6 changed files with 71 additions and 13 deletions

View File

@ -3517,6 +3517,23 @@ void Assembler::movmskps(Register dst, XMMRegister src) {
}
// AVX instructions
// Emits the AVX instruction vmovddup with a memory source, using a 128-bit
// VEX prefix (F2 / 0F / WIG) followed by opcode byte 0x12 and the operand
// bytes. AVX must be enabled on this assembler (checked via DCHECK).
void Assembler::vmovddup(XMMRegister dst, Operand src) {
  constexpr byte kOpcode = 0x12;
  DCHECK(IsEnabled(AVX));
  EnsureSpace space_guard(this);
  // xmm0 is passed as the second register argument of the VEX prefix;
  // presumably a placeholder, since this instruction has a single source.
  emit_vex_prefix(dst, xmm0, src, kL128, kF2, k0F, kWIG);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// Emits the AVX instruction vbroadcastss with a memory source, using a
// 128-bit VEX prefix (66 / 0F38 / W0) followed by opcode byte 0x18 and the
// operand bytes. AVX must be enabled on this assembler (checked via DCHECK).
void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
  constexpr byte kOpcode = 0x18;
  DCHECK(IsEnabled(AVX));
  EnsureSpace space_guard(this);
  // xmm0 is passed as the second register argument of the VEX prefix;
  // presumably a placeholder, since this instruction has a single source.
  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
void Assembler::vfmasd(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
DCHECK(IsEnabled(FMA3));

View File

@ -1116,6 +1116,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movlhps(XMMRegister dst, XMMRegister src);
// AVX instruction
// vmovddup: only a memory-source (Operand) form is declared here.
void vmovddup(XMMRegister dst, Operand src);
// vbroadcastss: only a memory-source (Operand) form is declared here.
void vbroadcastss(XMMRegister dst, Operand src);
// vfmadd132sd: thin forwarder to vfmasd, selecting the instruction through
// opcode byte 0x99.
void vfmadd132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  constexpr byte kOpcode = 0x99;
  vfmasd(kOpcode, dst, src1, src2);
}
@ -1628,6 +1630,14 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x70, dst, xmm0, src, k66, k0F, kWIG);
emit(imm8);
}
// vpshuflw with a register source: emits the VEX-encoded instruction
// (F2 / 0F / WIG, opcode 0x70) and then the 8-bit shuffle control.
void vpshuflw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
  constexpr byte kOpcode = 0x70;
  vinstr(kOpcode, dst, xmm0, src, kF2, k0F, kWIG);
  // The immediate is the final byte of the encoding.
  emit(imm8);
}
// vpshuflw with a memory source: emits the VEX-encoded instruction
// (F2 / 0F / WIG, opcode 0x70) and then the 8-bit shuffle control.
void vpshuflw(XMMRegister dst, Operand src, uint8_t imm8) {
  constexpr byte kOpcode = 0x70;
  vinstr(kOpcode, dst, xmm0, src, kF2, k0F, kWIG);
  // The immediate is the final byte of the encoding.
  emit(imm8);
}
void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vps(byte op, XMMRegister dst, XMMRegister src1, Operand src2);

View File

@ -116,6 +116,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
// AVX_OP_SSE3(macro_name, name) defines a variadic member template
// `macro_name(dst, args...)` that builds an AvxHelper carrying SSE3 as its
// optional CPU feature and calls its emit<> with two candidates:
// Assembler::v<name> and Assembler::<name>.
// NOTE(review): AvxHelper is not visible here; presumably it picks the
// v-prefixed AVX form when AVX is available and otherwise the plain form
// under the SSE3 feature. Confirm what happens when neither is supported.
#define AVX_OP_SSE3(macro_name, name) \
template <typename Dst, typename... Args> \
void macro_name(Dst dst, Args... args) { \
AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE3)} \
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
#define AVX_OP_SSSE3(macro_name, name) \
template <typename Dst, typename... Args> \
void macro_name(Dst dst, Args... args) { \
@ -195,6 +202,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Each line instantiates one wrapper method: the first argument is the
// wrapper's name, the second is the Assembler instruction it dispatches to
// (together with its v-prefixed counterpart). The macro suffix (_SSE3,
// _SSSE3, _SSE4_1) names the CPU feature handed to the helper.
AVX_OP(Subps, subps)
AVX_OP(Mulps, mulps)
AVX_OP(Divps, divps)
AVX_OP(Pshuflw, pshuflw)
AVX_OP(Punpcklqdq, punpcklqdq)
AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSSE3(Pshufb, pshufb)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSE4_1(Pmulld, pmulld)

View File

@ -3660,31 +3660,34 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64S8x16LoadSplat: {
// Splat of one byte read from memory: the byte is inserted into lane 0 and a
// byte shuffle with an all-zero control mask then selects byte 0 for every
// lane of the output register.
// The trap bookkeeping is given the pc offset before the load is emitted.
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
// Lower-case mnemonics: the plain assembler instructions.
__ pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
// XOR of the scratch register with itself produces the all-zero mask.
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
__ pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
// Capitalised mnemonics: the same three steps through the macro-assembler
// wrappers (presumably selecting the AVX encodings when available).
__ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kX64S16x8LoadSplat: {
// Splat of one 16-bit word read from memory: insert it into lane 0, shuffle
// the low four words with control 0 so each takes word 0, then duplicate the
// low quadword into the high quadword by unpacking the register with itself.
// The trap bookkeeping is given the pc offset before the load is emitted.
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
// Lower-case mnemonics: the plain assembler instructions.
__ pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(), 0);
__ punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
// Capitalised mnemonics: the same three steps through the macro-assembler
// wrappers (presumably selecting the AVX encodings when available).
__ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
// The shuffle control is cast to uint8_t explicitly; presumably so that the
// wrapper's deduced argument type matches the uint8_t imm8 parameter of the
// assembler overloads.
__ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
static_cast<uint8_t>(0));
__ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
break;
}
case kX64S32x4LoadSplat: {
// Splat of one 32-bit value read from memory into all four lanes.
// The trap bookkeeping is given the pc offset before the load is emitted.
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
// TODO(v8:9886): AVX codegen
// Two-instruction form: scalar load into lane 0, then a shuffle with
// control 0 so that every lane takes lane 0.
__ movss(i.OutputSimd128Register(), i.MemoryOperand());
__ shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
static_cast<byte>(0));
// With AVX available the broadcast is one instruction reading straight from
// memory; the feature scope enables AVX emission on the assembler.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
} else {
// Without AVX: the load-then-shuffle pair through the wrappers.
__ Movss(i.OutputSimd128Register(), i.MemoryOperand());
__ Shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
static_cast<byte>(0));
}
break;
}
case kX64S64x2LoadSplat: {
// Splat of one 64-bit value read from memory into both lanes.
// The trap bookkeeping is given the pc offset before the load is emitted.
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
// TODO(v8:9886): AVX codegen
// Two-instruction form: scalar load into the low lane, then unpack the
// register with itself to copy the low quadword into the high one.
__ movsd(i.OutputSimd128Register(), i.MemoryOperand());
__ punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
// Single-instruction form: load and duplicate directly from memory.
// NOTE(review): the Movddup wrapper is declared with SSE3 as its required
// feature, whereas the two-instruction form above does not go through a
// feature-gated wrapper. Confirm behaviour on CPUs with neither AVX nor SSE3.
__ Movddup(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kX64I16x8Load8x8S: {

View File

@ -819,6 +819,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
int mod, regop, rm, vvvv = vex_vreg();
get_modrm(*current, &mod, &regop, &rm);
switch (opcode) {
case 0x18: {
  // vbroadcastss: print the mnemonic with the register taken from the ModR/M
  // reg field, then the right-hand operand, and step past the bytes that
  // operand consumed.
  AppendToBuffer("vbroadcastss %s,", NameOfXMMRegister(regop));
  const auto operand_len = PrintRightXMMOperand(current);
  current += operand_len;
  break;
}
case 0x99:
AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(),
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
@ -1067,6 +1071,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
}
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x12: {
  // vmovddup: print the mnemonic with the register taken from the ModR/M reg
  // field, then the right-hand operand, and step past the bytes that operand
  // consumed.
  AppendToBuffer("vmovddup %s,", NameOfXMMRegister(regop));
  const auto operand_len = PrintRightXMMOperand(current);
  current += operand_len;
  break;
}
case 0x2A:
AppendToBuffer("%s %s,%s,", vex_w() ? "vcvtqsi2sd" : "vcvtlsi2sd",
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
@ -1126,6 +1134,11 @@ int DisassemblerX64::AVXInstruction(byte* data) {
AppendToBuffer("vlddqu %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x70: {
  // vpshuflw: print the mnemonic with the register taken from the ModR/M reg
  // field, then the right-hand operand, then the trailing immediate byte in
  // hex. The cursor moves past the operand first and past the immediate last.
  AppendToBuffer("vpshuflw %s,", NameOfXMMRegister(regop));
  const auto operand_len = PrintRightXMMOperand(current);
  current += operand_len;
  const byte imm8 = *current;
  current++;
  AppendToBuffer(",0x%x", imm8);
  break;
}
case 0x7C:
AppendToBuffer("vhaddps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));

View File

@ -763,7 +763,12 @@ TEST(DisasmX64) {
__ vpinsrd(xmm1, xmm2, rax, 2);
__ vpinsrd(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 2);
__ vpshufd(xmm1, xmm2, 85);
__ vpshuflw(xmm1, xmm2, 85);
__ vpshuflw(xmm1, Operand(rbx, rcx, times_4, 10000), 85);
__ vshufps(xmm3, xmm2, xmm3, 3);
__ vmovddup(xmm1, Operand(rbx, rcx, times_4, 10000));
__ vbroadcastss(xmm1, Operand(rbx, rcx, times_4, 10000));
}
}