Reland "[wasm-simd][ia32] Prototype sign select"

This is a reland of 716dae3ae0

Original change's description:
> [wasm-simd][ia32] Prototype sign select
>
> The implementation is the same as on x64.
>
> Bug: v8:10983
> Change-Id: I2654ce4a627ca5cc6c759051ab9034c528d9f25a
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2567194
> Reviewed-by: Bill Budge <bbudge@chromium.org>
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#71606}

Bug: v8:10983
Change-Id: I05af92ec2d3531dd2e0d27353cc665967fb5c387
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2574001
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71627}
This commit is contained in:
Zhi An Ng 2020-12-07 01:12:16 +00:00 committed by Commit Bot
parent d5a2c24444
commit 7e67c9a8e1
11 changed files with 177 additions and 4 deletions

View File

@ -2973,6 +2973,24 @@ void Assembler::vpshufd(XMMRegister dst, Operand src, uint8_t shuffle) {
EMIT(shuffle);
}
// Emits the AVX variable blend with opcode 0x4A (66 prefix, 0F3A escape, W0)
// through vinstr. The mask register is not covered by vinstr's operands; it is
// appended as one extra byte carrying the register code in its upper four bits.
void Assembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                          XMMRegister mask) {
  const int mask_byte = mask.code() << 4;
  vinstr(0x4A, dst, src1, src2, k66, k0F3A, kW0);
  EMIT(mask_byte);
}
// Emits the AVX variable blend with opcode 0x4B (66 prefix, 0F3A escape, W0)
// through vinstr. The mask register is not covered by vinstr's operands; it is
// appended as one extra byte carrying the register code in its upper four bits.
void Assembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                          XMMRegister mask) {
  const int mask_byte = mask.code() << 4;
  vinstr(0x4B, dst, src1, src2, k66, k0F3A, kW0);
  EMIT(mask_byte);
}
// Emits the AVX variable byte blend with opcode 0x4C (66 prefix, 0F3A escape,
// W0) through vinstr. The mask register is not covered by vinstr's operands; it
// is appended as one extra byte carrying the register code in its upper four
// bits.
void Assembler::vpblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                          XMMRegister mask) {
  const int mask_byte = mask.code() << 4;
  vinstr(0x4C, dst, src1, src2, k66, k0F3A, kW0);
  EMIT(mask_byte);
}
void Assembler::vpblendw(XMMRegister dst, XMMRegister src1, Operand src2,
uint8_t mask) {
vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG);

View File

@ -1406,6 +1406,13 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
void vpshufd(XMMRegister dst, Operand src, uint8_t shuffle);
// AVX variable blends. Each takes the blend mask as an explicit fourth
// register operand (|mask|) in addition to the destination and two sources.
void vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
void vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
void vpblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
void vpblendw(XMMRegister dst, XMMRegister src1, XMMRegister src2,
uint8_t mask) {
vpblendw(dst, src1, Operand(src2), mask);
@ -1692,6 +1699,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
SSE4_INSTRUCTION_LIST(DECLARE_SSE4_INSTRUCTION)
SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE4_INSTRUCTION)
// SSE4.1 variable blends (opcodes 66 0F 38 14/15/10). They are declared with
// the same two-operand shape as the other SSE4 instructions; the blend mask is
// not an explicit operand (it is taken implicitly from xmm0).
DECLARE_SSE4_INSTRUCTION(blendvps, 66, 0F, 38, 14)
DECLARE_SSE4_INSTRUCTION(blendvpd, 66, 0F, 38, 15)
DECLARE_SSE4_INSTRUCTION(pblendvb, 66, 0F, 38, 10)
#undef DECLARE_SSE4_INSTRUCTION
#define DECLARE_SSE34_AVX_INSTRUCTION(instruction, prefix, escape1, escape2, \

View File

@ -505,6 +505,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Pshufb(dst, src, Operand(mask));
}
void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
// Register-source convenience overload: wraps |src| in an Operand and forwards
// to the Operand-taking Pblendw with the same |imm8|.
void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
Pblendw(dst, Operand(src), imm8);
}

View File

@ -535,6 +535,32 @@ class OutOfLineRecordWrite final : public OutOfLineCode {
} \
} while (false)
// Helper macro to help define signselect opcodes. This should not be used for
// i16x8.signselect, because there is no native word-sized blend instruction.
// We choose a helper macro here instead of a macro-assembler function because
// the blend instruction requires xmm0 as an implicit argument, and the codegen
// relies on xmm0 being the scratch register, so we can freely overwrite it as
// required.
//
// BLEND_OP is the SSE4.1 mnemonic (e.g. pblendvb); the AVX mnemonic is formed
// by pasting a "v" prefix onto it. Instruction inputs 0 and 1 are the two
// sources and input 2 is the mask.
//  - With AVX: emits the four-operand v<BLEND_OP>(dst, src1, src2, mask).
//  - Otherwise (SSE4.1): requires dst == src1, copies the mask into xmm0
//    unless it is already there, then emits the two-operand
//    BLEND_OP(dst, src2).
#define ASSEMBLE_SIMD_SIGN_SELECT(BLEND_OP) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src1 = i.InputSimd128Register(0); \
XMMRegister src2 = i.InputSimd128Register(1); \
XMMRegister mask = i.InputSimd128Register(2); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ v##BLEND_OP(dst, src1, src2, mask); \
} else { \
CpuFeatureScope scope(tasm(), SSE4_1); \
DCHECK_EQ(dst, src1); \
DCHECK_EQ(kScratchDoubleReg, xmm0); \
if (mask != xmm0) { \
__ movaps(xmm0, mask); \
} \
__ BLEND_OP(dst, src2); \
} \
} while (false)
void CodeGenerator::AssembleDeconstructFrame() {
__ mov(esp, ebp);
__ pop(ebp);
@ -2158,6 +2184,37 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kIA32I8x16SignSelect: {
ASSEMBLE_SIMD_SIGN_SELECT(pblendvb);
break;
}
case kIA32I16x8SignSelect: {
// i16x8 cannot use ASSEMBLE_SIMD_SIGN_SELECT (no word-sized blend). Instead
// the sign of every 16-bit mask lane is expanded to fill the whole lane in the
// scratch register, and the byte blend is driven by that expanded mask.
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src1 = i.InputSimd128Register(0);
XMMRegister src2 = i.InputSimd128Register(1);
XMMRegister mask = i.InputSimd128Register(2);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Arithmetic shift right by 15 replicates each word's sign bit across the
// word, leaving the original mask register untouched.
__ vpsraw(kScratchDoubleReg, mask, 15);
__ vpblendvb(dst, src1, src2, kScratchDoubleReg);
} else {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
DCHECK_EQ(dst, src1);
DCHECK_EQ(kScratchDoubleReg, xmm0);
// scratch = (0 > mask) per word, i.e. all-ones in lanes where the mask is
// negative and zero elsewhere.
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
__ pcmpgtw(kScratchDoubleReg, mask);
// The two-operand pblendvb reads its mask implicitly from xmm0, which is the
// scratch register (checked above).
__ pblendvb(dst, src2);
}
break;
}
case kIA32I32x4SignSelect: {
ASSEMBLE_SIMD_SIGN_SELECT(blendvps);
break;
}
case kIA32I64x2SignSelect: {
ASSEMBLE_SIMD_SIGN_SELECT(blendvpd);
break;
}
case kSSEF32x4Splat: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
XMMRegister dst = i.OutputSimd128Register();

View File

@ -150,6 +150,7 @@ namespace compiler {
V(IA32I64x2ShrU) \
V(IA32I64x2BitMask) \
V(IA32I64x2Eq) \
V(IA32I64x2SignSelect) \
V(SSEF32x4Splat) \
V(AVXF32x4Splat) \
V(SSEF32x4ExtractLane) \
@ -236,6 +237,7 @@ namespace compiler {
V(IA32I32x4Abs) \
V(IA32I32x4BitMask) \
V(IA32I32x4DotI16x8S) \
V(IA32I32x4SignSelect) \
V(IA32I16x8Splat) \
V(IA32I16x8ExtractLaneS) \
V(IA32I16x8SConvertI8x16Low) \
@ -289,6 +291,7 @@ namespace compiler {
V(IA32I16x8RoundingAverageU) \
V(IA32I16x8Abs) \
V(IA32I16x8BitMask) \
V(IA32I16x8SignSelect) \
V(IA32I8x16Splat) \
V(IA32I8x16ExtractLaneS) \
V(IA32Pinsrb) \
@ -342,6 +345,7 @@ namespace compiler {
V(IA32I8x16RoundingAverageU) \
V(IA32I8x16Abs) \
V(IA32I8x16BitMask) \
V(IA32I8x16SignSelect) \
V(IA32S128Const) \
V(IA32S128Zero) \
V(IA32S128AllOnes) \

View File

@ -129,6 +129,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I64x2ShrU:
case kIA32I64x2BitMask:
case kIA32I64x2Eq:
case kIA32I64x2SignSelect:
case kSSEF32x4Splat:
case kAVXF32x4Splat:
case kSSEF32x4ExtractLane:
@ -215,6 +216,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I32x4Abs:
case kIA32I32x4BitMask:
case kIA32I32x4DotI16x8S:
case kIA32I32x4SignSelect:
case kIA32I16x8Splat:
case kIA32I16x8ExtractLaneS:
case kIA32I16x8SConvertI8x16Low:
@ -268,6 +270,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I16x8RoundingAverageU:
case kIA32I16x8Abs:
case kIA32I16x8BitMask:
case kIA32I16x8SignSelect:
case kIA32I8x16Splat:
case kIA32I8x16ExtractLaneS:
case kIA32Pinsrb:
@ -321,6 +324,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I8x16RoundingAverageU:
case kIA32I8x16Abs:
case kIA32I8x16BitMask:
case kIA32I8x16SignSelect:
case kIA32S128Const:
case kIA32S128Zero:
case kIA32S128AllOnes:

View File

@ -2993,6 +2993,44 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) {
VisitPminOrPmax(this, node, kIA32F64x2Pmax);
}
namespace {
// Shared lowering for the four signselect visitors; emits |opcode| with the
// node's first two inputs swapped and the mask as the third input.
//
// signselect(x, y, -1) = x
// pblendvb(dst, x, y, -1) = dst <- y, so we need to swap x and y.
void VisitSignSelect(InstructionSelector* selector, Node* node,
                     ArchOpcode opcode) {
  IA32OperandGenerator g(selector);
  Node* const x = node->InputAt(0);
  Node* const y = node->InputAt(1);
  Node* const mask = node->InputAt(2);

  if (!selector->IsSupported(AVX)) {
    // We would like to fix the mask to be xmm0, since that is what
    // pblendvb/blendvps/blendvpd uses as an implicit operand. However, xmm0 is
    // also scratch register, so our mask values can be overwritten. Instead,
    // we manually move the mask to xmm0 inside codegen.
    selector->Emit(opcode, g.DefineSameAsFirst(node), g.UseRegister(y),
                   g.UseRegister(x), g.UseRegister(mask));
    return;
  }

  // With AVX the output is defined as a plain register rather than being tied
  // to the first input.
  selector->Emit(opcode, g.DefineAsRegister(node), g.UseRegister(y),
                 g.UseRegister(x), g.UseRegister(mask));
}
} // namespace
// Selects kIA32I8x16SignSelect for |node| via the shared VisitSignSelect
// helper.
void InstructionSelector::VisitI8x16SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I8x16SignSelect);
}
// Selects kIA32I16x8SignSelect for |node| via the shared VisitSignSelect
// helper.
void InstructionSelector::VisitI16x8SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I16x8SignSelect);
}
// Selects kIA32I32x4SignSelect for |node| via the shared VisitSignSelect
// helper.
void InstructionSelector::VisitI32x4SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I32x4SignSelect);
}
// Selects kIA32I64x2SignSelect for |node| via the shared VisitSignSelect
// helper.
void InstructionSelector::VisitI64x2SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I64x2SignSelect);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {

View File

@ -2826,13 +2826,13 @@ void InstructionSelector::VisitLoadLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitStoreLane(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
// TODO(v8:10983) Prototyping sign select.
void InstructionSelector::VisitI8x16SignSelect(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8SignSelect(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4SignSelect(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2SignSelect(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && \
!V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_MIPS

View File

@ -856,6 +856,24 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer(",%d", Imm8(current));
current++;
break;
case 0x4A:
  // vblendvps dst, src1, src2/m128, mask. The assembler appends one byte after
  // the ModR/M operand that carries the mask register in its upper four bits.
  AppendToBuffer("vblendvps %s,%s,", NameOfXMMRegister(regop),
                 NameOfXMMRegister(vvvv));
  current += PrintRightXMMOperand(current);
  AppendToBuffer(",%s", NameOfXMMRegister(*current >> 4));
  // Step over the mask byte so it is counted as part of this instruction
  // instead of being decoded as the start of the next one.
  current++;
  break;
case 0x4B:
  // vblendvpd dst, src1, src2/m128, mask. Opcode 0x4B is the packed-double
  // variant (0x4A is vblendvps). The assembler appends one byte after the
  // ModR/M operand that carries the mask register in its upper four bits.
  AppendToBuffer("vblendvpd %s,%s,", NameOfXMMRegister(regop),
                 NameOfXMMRegister(vvvv));
  current += PrintRightXMMOperand(current);
  AppendToBuffer(",%s", NameOfXMMRegister(*current >> 4));
  // Step over the mask byte so it is counted as part of this instruction
  // instead of being decoded as the start of the next one.
  current++;
  break;
case 0x4C:
  // vpblendvb dst, src1, src2/m128, mask. The assembler appends one byte after
  // the ModR/M operand that carries the mask register in its upper four bits.
  AppendToBuffer("vpblendvb %s,%s,", NameOfXMMRegister(regop),
                 NameOfXMMRegister(vvvv));
  current += PrintRightXMMOperand(current);
  AppendToBuffer(",%s", NameOfXMMRegister(*current >> 4));
  // Step over the mask byte so it is counted as part of this instruction
  // instead of being decoded as the start of the next one.
  current++;
  break;
default:
UnimplementedInstruction();
}
@ -2189,6 +2207,21 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
SSE4_INSTRUCTION_LIST(SSE34_DIS_CASE)
SSE4_RM_INSTRUCTION_LIST(SSE34_DIS_CASE)
#undef SSE34_DIS_CASE
case 0x10:
AppendToBuffer("pblendvb %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
AppendToBuffer(",xmm0");
break;
case 0x14:
AppendToBuffer("blendvps %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
AppendToBuffer(",xmm0");
break;
case 0x15:
  // 66 0F 38 15 is blendvpd (0x14 is blendvps); xmm0 is the implicit mask.
  AppendToBuffer("blendvpd %s,", NameOfXMMRegister(regop));
  data += PrintRightXMMOperand(data);
  AppendToBuffer(",xmm0");
  break;
default:
UnimplementedInstruction();
}

View File

@ -627,6 +627,10 @@ TEST(DisasmIa320) {
__ pinsrd(xmm1, Operand(edx, 4), 0);
__ extractps(eax, xmm1, 0);
__ blendvps(xmm3, xmm1);
__ blendvpd(xmm3, xmm1);
__ pblendvb(xmm3, xmm1);
SSE4_INSTRUCTION_LIST(EMIT_SSE34_INSTR)
SSE4_RM_INSTRUCTION_LIST(EMIT_SSE34_INSTR)
}
@ -786,6 +790,10 @@ TEST(DisasmIa320) {
__ vpinsrd(xmm0, xmm1, eax, 0);
__ vpinsrd(xmm0, xmm1, Operand(edx, 4), 0);
__ vblendvps(xmm3, xmm1, xmm4, xmm6);
__ vblendvpd(xmm3, xmm1, xmm4, xmm6);
__ vpblendvb(xmm3, xmm1, xmm4, xmm6);
__ vcvtdq2ps(xmm1, xmm0);
__ vcvtdq2ps(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vcvttps2dq(xmm1, xmm0);

View File

@ -765,7 +765,7 @@ WASM_SIMD_TEST(F32x4Le) {
RunF32x4CompareOpTest(execution_tier, lower_simd, kExprF32x4Le, LessEqual);
}
#if V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
// TODO(v8:10983) Prototyping sign select.
template <typename T>
void RunSignSelect(TestExecutionTier execution_tier, LowerSimd lower_simd,
@ -822,7 +822,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2SignSelect) {
RunSignSelect<int64_t>(execution_tier, lower_simd, kExprI64x2SignSelect,
kExprI64x2Splat, mask);
}
#endif // V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X
WASM_SIMD_TEST_NO_LOWERING(F32x4Qfma) {