diff --git a/src/codegen/x64/assembler-x64.cc b/src/codegen/x64/assembler-x64.cc
index 15012dbc73..55a776b911 100644
--- a/src/codegen/x64/assembler-x64.cc
+++ b/src/codegen/x64/assembler-x64.cc
@@ -2757,6 +2757,15 @@ void Assembler::movdqu(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  emit(0xF3);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x6F);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::pinsrw(XMMRegister dst, Register src, uint8_t imm8) {
   EnsureSpace ensure_space(this);
   emit(0x66);
diff --git a/src/codegen/x64/assembler-x64.h b/src/codegen/x64/assembler-x64.h
index 0fc0b10970..dfae40d299 100644
--- a/src/codegen/x64/assembler-x64.h
+++ b/src/codegen/x64/assembler-x64.h
@@ -1160,6 +1160,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void movdqu(Operand dst, XMMRegister src);
   void movdqu(XMMRegister dst, Operand src);
+  void movdqu(XMMRegister dst, XMMRegister src);
 
   void movapd(XMMRegister dst, XMMRegister src);
   void movupd(XMMRegister dst, Operand src);
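Note on the new overload: the register-to-register movdqu reuses the F3 0F 6F encoding of the existing load form; only the ModR/M byte changes, and emit_rex_64 still emits a REX.W prefix as the memory variants do (the processor ignores the W bit for this opcode). A minimal standalone sketch of the expected bytes for movdqu(xmm1, xmm0); this is an illustration derived from the code above, not V8 code, and the 0xC8 ModR/M value follows from mod=11, reg=001 (xmm1), rm=000 (xmm0):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // F3 = MOVDQU mandatory prefix, 48 = REX.W from emit_rex_64,
      // 0F 6F = opcode (xmm <- xmm/m128), C8 = ModR/M (register-direct).
      const uint8_t expected[] = {0xF3, 0x48, 0x0F, 0x6F, 0xC8};
      for (uint8_t b : expected) std::printf("%02X ", b);
      std::printf("\n");  // prints: F3 48 0F 6F C8
      return 0;
    }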
diff --git a/src/codegen/x64/macro-assembler-x64.cc b/src/codegen/x64/macro-assembler-x64.cc
index e64f625620..30ade3f886 100644
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@@ -1761,17 +1761,71 @@ void TurboAssembler::Pextrb(Register dst, XMMRegister src, int8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
+namespace {
+
+template <typename Src>
+using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Src, uint8_t);
+template <typename Src>
+using NoAvxFn = void (Assembler::*)(XMMRegister, Src, uint8_t);
+
+template <typename Src>
+void PinsrHelper(Assembler* assm, AvxFn<Src> avx, NoAvxFn<Src> noavx,
+                 XMMRegister dst, XMMRegister src1, Src src2, uint8_t imm8,
+                 base::Optional<CpuFeature> feature = base::nullopt) {
   if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrd(dst, dst, src, imm8);
-    return;
-  } else if (CpuFeatures::IsSupported(SSE4_1)) {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrd(dst, src, imm8);
+    CpuFeatureScope scope(assm, AVX);
+    (assm->*avx)(dst, src1, src2, imm8);
     return;
   }
-  Movd(kScratchDoubleReg, src);
+
+  if (dst != src1) {
+    assm->movdqu(dst, src1);
+  }
+  if (feature.has_value()) {
+    DCHECK(CpuFeatures::IsSupported(*feature));
+    CpuFeatureScope scope(assm, *feature);
+    (assm->*noavx)(dst, src2, imm8);
+  } else {
+    (assm->*noavx)(dst, src2, imm8);
+  }
+}
+}  // namespace
+
+void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
+}
+
+void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
+}
+
+void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
+              imm8);
+}
+
+void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
+              imm8);
+}
+
+void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  // Need a fallback when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
+  // only by Wasm SIMD, which requires SSE4_1 already.
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
+                imm8, base::Optional<CpuFeature>(SSE4_1));
+    return;
+  }
+
+  Movd(kScratchDoubleReg, src2);
   if (imm8 == 1) {
     punpckldq(dst, kScratchDoubleReg);
   } else {
@@ -1780,17 +1834,17 @@ void TurboAssembler::Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrd(dst, dst, src, imm8);
-    return;
-  } else if (CpuFeatures::IsSupported(SSE4_1)) {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrd(dst, src, imm8);
+void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  // Need a fallback when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
+  // only by Wasm SIMD, which requires SSE4_1 already.
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
+                imm8, base::Optional<CpuFeature>(SSE4_1));
     return;
   }
-  Movd(kScratchDoubleReg, src);
+
+  Movd(kScratchDoubleReg, src2);
   if (imm8 == 1) {
     punpckldq(dst, kScratchDoubleReg);
   } else {
@@ -1799,54 +1853,24 @@ void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrw(XMMRegister dst, Register src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrw(dst, dst, src, imm8);
-    return;
-  } else {
-    DCHECK(CpuFeatures::IsSupported(SSE4_1));
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrw(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrd(XMMRegister dst, Register src2, uint8_t imm8) {
+  Pinsrd(dst, dst, src2, imm8);
 }
 
-void TurboAssembler::Pinsrw(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrw(dst, dst, src, imm8);
-    return;
-  } else {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrw(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8) {
+  Pinsrd(dst, dst, src2, imm8);
 }
 
-void TurboAssembler::Pinsrb(XMMRegister dst, Register src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrb(dst, dst, src, imm8);
-    return;
-  } else {
-    DCHECK(CpuFeatures::IsSupported(SSE4_1));
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrb(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrq, &Assembler::pinsrq, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
 }
 
-void TurboAssembler::Pinsrb(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrb(dst, dst, src, imm8);
-    return;
-  } else {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrb(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrq, &Assembler::pinsrq, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
 }
 
 void TurboAssembler::Psllq(XMMRegister dst, byte imm8) {
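The helper captures one pattern: AVX provides non-destructive three-operand vpinsr* forms, while the legacy SSE pinsr* instructions overwrite their destination, so the SSE path must first copy src1 into dst. A small standalone model of that dispatch (stand-in printf code, not the V8 API; register numbers are arbitrary):

    #include <cstdio>

    // Models PinsrHelper's choice for, e.g., Pinsrw(dst, src1, src2, imm8).
    void PinsrwModel(int dst, int src1, bool have_avx) {
      if (have_avx) {
        std::printf("vpinsrw xmm%d, xmm%d, <src2>, imm\n", dst, src1);
        return;
      }
      if (dst != src1) std::printf("movdqu xmm%d, xmm%d\n", dst, src1);
      std::printf("pinsrw xmm%d, <src2>, imm\n", dst);
    }

    int main() {
      PinsrwModel(1, 2, true);   // vpinsrw xmm1, xmm2, <src2>, imm
      PinsrwModel(1, 2, false);  // movdqu xmm1, xmm2 / pinsrw xmm1, <src2>, imm
      PinsrwModel(3, 3, false);  // pinsrw xmm3, <src2>, imm (no copy needed)
      return 0;
    }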
diff --git a/src/codegen/x64/macro-assembler-x64.h b/src/codegen/x64/macro-assembler-x64.h
index abf262b550..77a04e3bb5 100644
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@@ -517,12 +517,17 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void Pextrd(Register dst, XMMRegister src, int8_t imm8);
   void Pextrw(Register dst, XMMRegister src, int8_t imm8);
   void Pextrb(Register dst, XMMRegister src, int8_t imm8);
-  void Pinsrd(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
-  void Pinsrw(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
-  void Pinsrb(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrb(XMMRegister dst, Operand src, uint8_t imm8);
+
+  void Pinsrb(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrw(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, Register src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8);
+  void Pinsrq(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
 
   void Psllq(XMMRegister dst, int imm8) { Psllq(dst, static_cast<byte>(imm8)); }
   void Psllq(XMMRegister dst, byte imm8);
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 7f7ab2e2f1..02f1b5b231 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -2975,11 +2975,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I32x4ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrd(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrd(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrd(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3090,8 +3091,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Maxps(dst, tmp2);
       // scratch: float representation of max_signed
       __ Pcmpeqd(tmp2, tmp2);
-      __ Psrld(tmp2, uint8_t{1}); // 0x7fffffff
-      __ Cvtdq2ps(tmp2, tmp2); // 0x4f000000
+      __ Psrld(tmp2, uint8_t{1});  // 0x7fffffff
+      __ Cvtdq2ps(tmp2, tmp2);     // 0x4f000000
       // tmp: convert (src-max_signed).
       // Positive overflow lanes -> 0x7FFFFFFF
       // Negative lanes -> 0
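For callers, the old destructive two-operand Pinsrd signatures survive as thin wrappers that pass dst as both destination and source, so existing call sites keep their semantics. A usage sketch, assuming a TurboAssembler* masm on an SSE4.1-capable CPU (illustrative, not code from this patch):

    // Legacy wrapper: insert eax into lane 1 of xmm0, in place.
    masm->Pinsrd(xmm0, rax, 1);        // expands to Pinsrd(xmm0, xmm0, rax, 1)

    // New form: read lanes from xmm1, write the result to xmm0.
    masm->Pinsrd(xmm0, xmm1, rax, 1);
    // AVX:     vpinsrd xmm0, xmm1, eax, 1   (single instruction)
    // SSE4.1:  movdqu xmm0, xmm1            (copy, only when xmm0 != xmm1)
    //          pinsrd xmm0, eax, 1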
@@ -3204,11 +3205,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I16x8ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrw(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrw(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrw(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrw(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3395,11 +3397,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrb(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrb(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrb(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrb(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3750,17 +3753,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64S128Load8Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      __ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ Pinsrb(dst, dst, i.MemoryOperand(), 0);
       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ Pshufb(dst, kScratchDoubleReg);
       break;
     }
     case kX64S128Load16Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      __ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
-      __ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
-                 uint8_t{0});
-      __ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
+      XMMRegister dst = i.OutputSimd128Register();
+      __ Pinsrw(dst, dst, i.MemoryOperand(), 0);
+      __ Pshuflw(dst, dst, uint8_t{0});
+      __ Punpcklqdq(dst, dst);
       break;
     }
     case kX64S128Load32Splat: {
diff --git a/src/wasm/baseline/x64/liftoff-assembler-x64.h b/src/wasm/baseline/x64/liftoff-assembler-x64.h
index 18fb7a4f7a..40683f4700 100644
--- a/src/wasm/baseline/x64/liftoff-assembler-x64.h
+++ b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@@ -2292,11 +2292,11 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
   } else {
     DCHECK_EQ(LoadTransformationKind::kSplat, transform);
     if (memtype == MachineType::Int8()) {
-      Pinsrb(dst.fp(), src_op, 0);
+      Pinsrb(dst.fp(), dst.fp(), src_op, 0);
       Pxor(kScratchDoubleReg, kScratchDoubleReg);
       Pshufb(dst.fp(), kScratchDoubleReg);
     } else if (memtype == MachineType::Int16()) {
-      Pinsrw(dst.fp(), src_op, 0);
+      Pinsrw(dst.fp(), dst.fp(), src_op, 0);
       Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
       Punpcklqdq(dst.fp(), dst.fp());
     } else if (memtype == MachineType::Int32()) {
diff --git a/test/cctest/test-disasm-x64.cc b/test/cctest/test-disasm-x64.cc
index ecd89680dc..0a23df4784 100644
--- a/test/cctest/test-disasm-x64.cc
+++ b/test/cctest/test-disasm-x64.cc
@@ -400,6 +400,7 @@ TEST(DisasmX64) {
   __ movdqa(Operand(rsp, 12), xmm0);
   __ movdqu(xmm0, Operand(rsp, 12));
   __ movdqu(Operand(rsp, 12), xmm0);
+  __ movdqu(xmm1, xmm0);
   __ shufps(xmm0, xmm9, 0x0);
   __ ucomiss(xmm0, xmm1);
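A closing note on the splat sequences in the code-generator and Liftoff changes above: Load8Splat inserts the byte into lane 0 and then uses pshufb with an all-zero mask so every lane selects byte 0, while Load16Splat broadcasts word 0 across the low four words with pshuflw and duplicates the low quadword with punpcklqdq. A scalar model of the 16-bit case (illustration only; 0xBEEF stands in for the loaded word):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t lanes[8] = {0};
      lanes[0] = 0xBEEF;  // Pinsrw(dst, dst, mem, 0): word -> lane 0
      for (int i = 1; i < 4; ++i) lanes[i] = lanes[0];      // Pshuflw(dst, dst, 0)
      for (int i = 0; i < 4; ++i) lanes[4 + i] = lanes[i];  // Punpcklqdq(dst, dst)
      for (uint16_t w : lanes) std::printf("%04X ", w);     // BEEF x 8
      std::printf("\n");
      return 0;
    }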