[wasm-simd][x64][avx2] Optimize f32x4.splat
When AVX2 is available, we can use vbroadcastss. On AVX, use vshufps, since it is non-destructive. On SSE, shufps is 1 byte shorter than pshufd, so it is used when the destination register is the same as the source.

FIXED=b/175364402
Change-Id: I5bd10914579d8db012192a9c04f7b0038ec1c812
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2599849
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71964}
commit ffc832becf
parent 451926117e
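As background, here is a minimal sketch (not part of the commit) of what f32x4.splat computes on each instruction-set tier, written with the standard <immintrin.h> intrinsics rather than V8's assembler:

#include <immintrin.h>

// f32x4.splat replicates one f32 value into all four lanes of a 128-bit vector.
__m128 f32x4_splat_avx2(__m128 x) {
  return _mm_broadcastss_ps(x);  // compiles to vbroadcastss on AVX2
}

__m128 f32x4_splat_sse(__m128 x) {
  // shufps/vshufps with imm8 == 0 selects lane 0 for every output lane.
  return _mm_shuffle_ps(x, x, 0);
}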
@@ -3423,6 +3423,14 @@ void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::vbroadcastss(XMMRegister dst, XMMRegister src) {
+  DCHECK(IsEnabled(AVX2));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
+  emit(0x18);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::fma_instr(byte op, XMMRegister dst, XMMRegister src1,
                           XMMRegister src2, VectorLength l, SIMDPrefix pp,
                           LeadingOpcode m, VexW w) {
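For reference, the arguments above (kL128, k66, k0F38, kW0, opcode 0x18) correspond to the register-to-register form VEX.128.66.0F38.W0 18 /r. A hedged sketch of the bytes this should produce for vbroadcastss xmm1, xmm2, assuming both registers are below xmm8 so no extension bits are needed (the disassembler test further down is what actually checks V8's output):

// Assumed encoding, shown for illustration only.
const unsigned char kVbroadcastssXmm1Xmm2[] = {
    0xC4, 0xE2, 0x79,  // three-byte VEX prefix: map 0F38, pp = 66, L = 128, W = 0
    0x18,              // vbroadcastss opcode
    0xCA,              // ModRM: mod = 11, reg = xmm1, rm = xmm2
};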
@@ -1299,6 +1299,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vmovddup(XMMRegister dst, Operand src);
   void vmovshdup(XMMRegister dst, XMMRegister src);
   void vbroadcastss(XMMRegister dst, Operand src);
+  void vbroadcastss(XMMRegister dst, XMMRegister src);
 
   void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                  VectorLength l, SIMDPrefix pp, LeadingOpcode m, VexW w);
@@ -2522,8 +2522,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F32x4Splat: {
-      __ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0),
-                i.InputDoubleRegister(0), 0);
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputDoubleRegister(0);
+      if (CpuFeatures::IsSupported(AVX2)) {
+        CpuFeatureScope avx2_scope(tasm(), AVX2);
+        __ vbroadcastss(dst, src);
+      } else if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        __ vshufps(dst, src, src, 0);
+      } else {
+        if (dst == src) {
+          // 1 byte shorter than pshufd.
+          __ shufps(dst, src, 0);
+        } else {
+          __ pshufd(dst, src, 0);
+        }
+      }
       break;
     }
     case kX64F32x4ExtractLane: {
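A short illustration of the "1 byte shorter than pshufd" comment in the SSE fallback above; the byte sequences are my own reading of the Intel SDM encodings, not something stated in the commit:

// With dst == src == xmm0 and imm8 == 0:
const unsigned char kShufpsXmm0Xmm0[] = {0x0F, 0xC6, 0xC0, 0x00};        // 4 bytes
const unsigned char kPshufdXmm0Xmm0[] = {0x66, 0x0F, 0x70, 0xC0, 0x00};  // 5 bytes (mandatory 66 prefix)
// shufps overwrites its first source operand, so when dst != src the code
// still has to fall back to the longer pshufd.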
@@ -3010,9 +3010,8 @@ void InstructionSelector::VisitF64x2Splat(Node* node) {
 
 void InstructionSelector::VisitF32x4Splat(Node* node) {
   X64OperandGenerator g(this);
-  InstructionOperand dst =
-      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
-  Emit(kX64F32x4Splat, dst, g.UseRegister(node->InputAt(0)));
+  Emit(kX64F32x4Splat, g.DefineAsRegister(node),
+       g.UseRegister(node->InputAt(0)));
 }
 
 #define SIMD_VISIT_EXTRACT_LANE(Type, Sign, Op) \
@@ -851,6 +851,14 @@ TEST(DisasmX64) {
     }
   }
 
+  // AVX2 instruction
+  {
+    if (CpuFeatures::IsSupported(AVX2)) {
+      CpuFeatureScope scope(&assm, AVX2);
+      __ vbroadcastss(xmm1, xmm2);
+    }
+  }
+
   // AVX2 instructions.
   {
     if (CpuFeatures::IsSupported(AVX2)) {