[wasm-simd][x64][avx2] Optimize f32x4.splat

When AVX2 is available, we can use vbroadcastss. On AVX, use vshufps,
since it is non-destructive. On SSE, shufps is 1 byte shorter than
pshufd, so use it when dst aliases src; otherwise fall back to the
non-destructive pshufd.
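
At a glance, the chosen sequences (a sketch using the assembler calls
from the diff below; dst/src stand for the splat's output and input
registers):

  __ vbroadcastss(dst, src);     // AVX2: reg-reg broadcast of lane 0
  __ vshufps(dst, src, src, 0);  // AVX: non-destructive shuffle
  __ shufps(dst, src, 0);        // SSE, dst == src: 1 byte shorter
  __ pshufd(dst, src, 0);        // SSE, dst != src: non-destructive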

FIXED=b/175364402

Change-Id: I5bd10914579d8db012192a9c04f7b0038ec1c812
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2599849
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71964}
Author: Zhi An Ng (2020-12-23 05:21:46 +0000), committed by Commit Bot
Parent: 451926117e
Commit: ffc832becf
5 changed files with 35 additions and 5 deletions

@@ -3423,6 +3423,14 @@ void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::vbroadcastss(XMMRegister dst, XMMRegister src) {
+  DCHECK(IsEnabled(AVX2));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
+  emit(0x18);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::fma_instr(byte op, XMMRegister dst, XMMRegister src1,
                           XMMRegister src2, VectorLength l, SIMDPrefix pp,
                           LeadingOpcode m, VexW w) {
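
Note: the register-source form of vbroadcastss (VEX.128.66.0F38.W0 18 /r)
is an AVX2 instruction, while the memory-source form above requires only
AVX; hence the new overload DCHECKs that AVX2 is enabled.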

@@ -1299,6 +1299,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vmovddup(XMMRegister dst, Operand src);
   void vmovshdup(XMMRegister dst, XMMRegister src);
   void vbroadcastss(XMMRegister dst, Operand src);
+  void vbroadcastss(XMMRegister dst, XMMRegister src);
   void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                  VectorLength l, SIMDPrefix pp, LeadingOpcode m, VexW w);

@@ -2522,8 +2522,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F32x4Splat: {
-      __ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0),
-                i.InputDoubleRegister(0), 0);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputDoubleRegister(0);
+      if (CpuFeatures::IsSupported(AVX2)) {
+        CpuFeatureScope avx2_scope(tasm(), AVX2);
+        __ vbroadcastss(dst, src);
+      } else if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        __ vshufps(dst, src, src, 0);
+      } else {
+        if (dst == src) {
+          // 1 byte shorter than pshufd.
+          __ shufps(dst, src, 0);
+        } else {
+          __ pshufd(dst, src, 0);
+        }
+      }
       break;
     }
     case kX64F32x4ExtractLane: {
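
On the bare-SSE path, pshufd lets the splat write straight to a distinct
destination register, avoiding an extra movaps when dst != src; shufps is
still preferred when the registers alias, since its encoding is 1 byte
shorter. (pshufd is an integer-domain shuffle, which can cost a bypass
delay between float ops on some microarchitectures, but it is only used
when register allocation did not already place dst and src together.)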

@@ -3010,9 +3010,8 @@ void InstructionSelector::VisitF64x2Splat(Node* node) {
 void InstructionSelector::VisitF32x4Splat(Node* node) {
   X64OperandGenerator g(this);
-  InstructionOperand dst =
-      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
-  Emit(kX64F32x4Splat, dst, g.UseRegister(node->InputAt(0)));
+  Emit(kX64F32x4Splat, g.DefineAsRegister(node),
+       g.UseRegister(node->InputAt(0)));
 }
 
 #define SIMD_VISIT_EXTRACT_LANE(Type, Sign, Op) \
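
Because the code generator now handles dst != src on every path (via
pshufd on bare SSE), the instruction selector no longer needs
DefineSameAsFirst for the non-AVX case and can unconditionally give the
splat its own output register.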

@@ -851,6 +851,14 @@ TEST(DisasmX64) {
     }
   }
 
+  // AVX2 instruction
+  {
+    if (CpuFeatures::IsSupported(AVX2)) {
+      CpuFeatureScope scope(&assm, AVX2);
+      __ vbroadcastss(xmm1, xmm2);
+    }
+  }
+
   // AVX2 instructions.
   {
     if (CpuFeatures::IsSupported(AVX2)) {
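
As with the other feature-gated blocks in DisasmX64, the new block emits
vbroadcastss only when the host supports AVX2, so the test stays portable;
the disassembler is then run over the assembled buffer.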