[x64] Refactor pinsrb family of instructions

The existing macro assembler defines Pinsrb, which expects 3 arguments:

- XMMRegister dst
- Register/Operand src
- uint8_t imm

It overwrites dst in place, inserting src at the lane specified by imm.

That means we cannot use the AVX version, which takes 4 arguments and
does not overwrite dst.

This refactoring defines the 4-argument AVX version instead; if AVX is
not supported, we fall back to the SSE version and ensure that the
value is first copied into dst.
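
Concretely, the 4-argument form dispatches roughly like this (a
condensed sketch of the PinsrHelper pattern added in this change; the
template plumbing and the optional CpuFeature handling are elided):

  void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1,
                              Register src2, uint8_t imm8) {
    if (CpuFeatures::IsSupported(AVX)) {
      // The AVX form is non-destructive: dst = src1 with lane imm8 replaced.
      CpuFeatureScope scope(this, AVX);
      vpinsrb(dst, src1, src2, imm8);
      return;
    }
    // The SSE form overwrites dst in place, so copy src1 over first.
    if (dst != src1) movdqu(dst, src1);
    CpuFeatureScope scope(this, SSE4_1);
    pinsrb(dst, src2, imm8);
  }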

For convenience, we define a 3-argument overload that passes dst as
both destination and first source. This replicates the SSE behavior,
so that not all callers have to be updated.
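
For example, the 3-argument Pinsrd (taken from the diff below) simply
forwards with dst duplicated:

  void TurboAssembler::Pinsrd(XMMRegister dst, Register src2, uint8_t imm8) {
    Pinsrd(dst, dst, src2, imm8);
  }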

Bug: v8:10975, v8:10933
Change-Id: I6f9b9d37fa08d3f5cff4f040ae7d5e1f0cf36455
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2444096
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70392}
Ng Zhi An <zhin@chromium.org>, 2020-10-07 15:45:10 -07:00, committed by Commit Bot
parent f996d50d62
commit e30c50f3bf
7 changed files with 128 additions and 84 deletions


@@ -2757,6 +2757,15 @@ void Assembler::movdqu(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  emit(0xF3);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x6F);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::pinsrw(XMMRegister dst, Register src, uint8_t imm8) {
   EnsureSpace ensure_space(this);
   emit(0x66);


@@ -1160,6 +1160,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void movdqu(Operand dst, XMMRegister src);
   void movdqu(XMMRegister dst, Operand src);
+  void movdqu(XMMRegister dst, XMMRegister src);
   void movapd(XMMRegister dst, XMMRegister src);
   void movupd(XMMRegister dst, Operand src);


@@ -1761,17 +1761,71 @@ void TurboAssembler::Pextrb(Register dst, XMMRegister src, int8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
+namespace {
+template <typename Src>
+using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Src, uint8_t);
+template <typename Src>
+using NoAvxFn = void (Assembler::*)(XMMRegister, Src, uint8_t);
+
+template <typename Src>
+void PinsrHelper(Assembler* assm, AvxFn<Src> avx, NoAvxFn<Src> noavx,
+                 XMMRegister dst, XMMRegister src1, Src src2, uint8_t imm8,
+                 base::Optional<CpuFeature> feature = base::nullopt) {
   if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrd(dst, dst, src, imm8);
+    CpuFeatureScope scope(assm, AVX);
+    (assm->*avx)(dst, src1, src2, imm8);
     return;
-  } else if (CpuFeatures::IsSupported(SSE4_1)) {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrd(dst, src, imm8);
-    return;
   }
-  Movd(kScratchDoubleReg, src);
+
+  if (dst != src1) {
+    assm->movdqu(dst, src1);
+  }
+
+  if (feature.has_value()) {
+    DCHECK(CpuFeatures::IsSupported(*feature));
+    CpuFeatureScope scope(assm, *feature);
+    (assm->*noavx)(dst, src2, imm8);
+  } else {
+    (assm->*noavx)(dst, src2, imm8);
+  }
+}
+}  // namespace
+
+void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
+}
+
+void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
+}
+
+void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
+              imm8);
+}
+
+void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
+              imm8);
+}
+
+void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  // Need a fall back when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
+  // only by Wasm SIMD, which requires SSE4_1 already.
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
+                imm8, base::Optional<CpuFeature>(SSE4_1));
+    return;
+  }
+
+  Movd(kScratchDoubleReg, src2);
   if (imm8 == 1) {
     punpckldq(dst, kScratchDoubleReg);
   } else {
@@ -1780,17 +1834,17 @@ void TurboAssembler::Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrd(dst, dst, src, imm8);
-    return;
-  } else if (CpuFeatures::IsSupported(SSE4_1)) {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrd(dst, src, imm8);
-    return;
-  }
-  Movd(kScratchDoubleReg, src);
+void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  // Need a fall back when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
+  // only by Wasm SIMD, which requires SSE4_1 already.
+  if (CpuFeatures::IsSupported(SSE4_1)) {
+    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
+                imm8, base::Optional<CpuFeature>(SSE4_1));
+    return;
+  }
+
+  Movd(kScratchDoubleReg, src2);
   if (imm8 == 1) {
     punpckldq(dst, kScratchDoubleReg);
   } else {
@@ -1799,54 +1853,24 @@ void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
   }
 }
 
-void TurboAssembler::Pinsrw(XMMRegister dst, Register src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrw(dst, dst, src, imm8);
-    return;
-  } else {
-    DCHECK(CpuFeatures::IsSupported(SSE4_1));
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrw(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrd(XMMRegister dst, Register src2, uint8_t imm8) {
+  Pinsrd(dst, dst, src2, imm8);
 }
 
-void TurboAssembler::Pinsrw(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrw(dst, dst, src, imm8);
-    return;
-  } else {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrw(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8) {
+  Pinsrd(dst, dst, src2, imm8);
 }
 
-void TurboAssembler::Pinsrb(XMMRegister dst, Register src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrb(dst, dst, src, imm8);
-    return;
-  } else {
-    DCHECK(CpuFeatures::IsSupported(SSE4_1));
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrb(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Register src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrq, &Assembler::pinsrq, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
}
 
-void TurboAssembler::Pinsrb(XMMRegister dst, Operand src, uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrb(dst, dst, src, imm8);
-    return;
-  } else {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrb(dst, src, imm8);
-    return;
-  }
+void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2,
+                            uint8_t imm8) {
+  PinsrHelper(this, &Assembler::vpinsrq, &Assembler::pinsrq, dst, src1, src2,
+              imm8, base::Optional<CpuFeature>(SSE4_1));
 }
 
 void TurboAssembler::Psllq(XMMRegister dst, byte imm8) {


@@ -517,12 +517,17 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void Pextrd(Register dst, XMMRegister src, int8_t imm8);
   void Pextrw(Register dst, XMMRegister src, int8_t imm8);
   void Pextrb(Register dst, XMMRegister src, int8_t imm8);
-  void Pinsrd(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
-  void Pinsrw(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
-  void Pinsrb(XMMRegister dst, Register src, uint8_t imm8);
-  void Pinsrb(XMMRegister dst, Operand src, uint8_t imm8);
+  void Pinsrb(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrw(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, Register src2, uint8_t imm8);
+  void Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8);
+  void Pinsrq(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
+  void Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
   void Psllq(XMMRegister dst, int imm8) { Psllq(dst, static_cast<byte>(imm8)); }
   void Psllq(XMMRegister dst, byte imm8);


@@ -2975,11 +2975,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I32x4ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrd(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrd(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrd(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3090,8 +3091,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Maxps(dst, tmp2);
       // scratch: float representation of max_signed
       __ Pcmpeqd(tmp2, tmp2);
       __ Psrld(tmp2, uint8_t{1});  // 0x7fffffff
       __ Cvtdq2ps(tmp2, tmp2);     // 0x4f000000
       // tmp: convert (src-max_signed).
       // Positive overflow lanes -> 0x7FFFFFFF
       // Negative lanes -> 0
@@ -3204,11 +3205,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I16x8ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrw(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrw(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrw(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrw(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3395,11 +3397,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
    case kX64I8x16ReplaceLane: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
       if (HasRegisterInput(instr, 2)) {
-        __ Pinsrb(i.OutputSimd128Register(), i.InputRegister(2),
-                  i.InputInt8(1));
+        __ Pinsrb(dst, src, i.InputRegister(2), i.InputInt8(1));
       } else {
-        __ Pinsrb(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
+        __ Pinsrb(dst, src, i.InputOperand(2), i.InputInt8(1));
       }
       break;
     }
@@ -3750,17 +3753,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64S128Load8Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      __ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
+      XMMRegister dst = i.OutputSimd128Register();
+      __ Pinsrb(dst, dst, i.MemoryOperand(), 0);
       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ Pshufb(dst, kScratchDoubleReg);
       break;
     }
     case kX64S128Load16Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      __ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
-      __ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
-                 uint8_t{0});
-      __ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
+      XMMRegister dst = i.OutputSimd128Register();
+      __ Pinsrw(dst, dst, i.MemoryOperand(), 0);
+      __ Pshuflw(dst, dst, uint8_t{0});
+      __ Punpcklqdq(dst, dst);
       break;
     }
     case kX64S128Load32Splat: {


@@ -2292,11 +2292,11 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
   } else {
     DCHECK_EQ(LoadTransformationKind::kSplat, transform);
     if (memtype == MachineType::Int8()) {
-      Pinsrb(dst.fp(), src_op, 0);
+      Pinsrb(dst.fp(), dst.fp(), src_op, 0);
       Pxor(kScratchDoubleReg, kScratchDoubleReg);
       Pshufb(dst.fp(), kScratchDoubleReg);
     } else if (memtype == MachineType::Int16()) {
-      Pinsrw(dst.fp(), src_op, 0);
+      Pinsrw(dst.fp(), dst.fp(), src_op, 0);
       Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
       Punpcklqdq(dst.fp(), dst.fp());
     } else if (memtype == MachineType::Int32()) {


@@ -400,6 +400,7 @@ TEST(DisasmX64) {
   __ movdqa(Operand(rsp, 12), xmm0);
   __ movdqu(xmm0, Operand(rsp, 12));
   __ movdqu(Operand(rsp, 12), xmm0);
+  __ movdqu(xmm1, xmm0);
   __ shufps(xmm0, xmm9, 0x0);
   __ ucomiss(xmm0, xmm1);