[wasm-simd] Merge replace lane ops into pinsr{b,w,d,q}

The only one that doesn't use a pinsr* is f32x4, which uses insertps, so
that is kept as it is.

Bug: v8:10933
Change-Id: I7442668812c674d4242949e13ef595978290bc8d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2458787
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70493}
This commit is contained in:
Ng Zhi An 2020-10-12 16:07:14 -07:00 committed by Commit Bot
parent d2ab873de9
commit 99e252bae9
4 changed files with 58 additions and 102 deletions

View File

@ -647,6 +647,26 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
} \
} while (false)
// Emits the pinsr-family instruction ASM_INSTR for the current instruction.
// Input 0 is the source vector and input 1 is the lane index. The value to
// insert is taken from a memory operand starting at input 2 when the
// instruction has an addressing mode; otherwise it is input 2 itself, which
// may be an FP register (copied through kScratchRegister first), a register
// operand, or any other operand.
#define ASSEMBLE_PINSR(ASM_INSTR) \
  do { \
    EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset()); \
    XMMRegister result = i.OutputSimd128Register(); \
    XMMRegister source = i.InputSimd128Register(0); \
    uint8_t lane = i.InputUint8(1); \
    if (HasAddressingMode(instr)) { \
      __ ASM_INSTR(result, source, i.MemoryOperand(2), lane); \
    } else if (instr->InputAt(2)->IsFPRegister()) { \
      __ Movq(kScratchRegister, i.InputDoubleRegister(2)); \
      __ ASM_INSTR(result, source, kScratchRegister, lane); \
    } else if (instr->InputAt(2)->IsRegister()) { \
      __ ASM_INSTR(result, source, i.InputRegister(2), lane); \
    } else { \
      __ ASM_INSTR(result, source, i.InputOperand(2), lane); \
    } \
  } while (false)
void CodeGenerator::AssembleDeconstructFrame() {
unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
__ movq(rsp, rbp);
@ -2354,16 +2374,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kX64F64x2ReplaceLane: {
if (instr->InputAt(2)->IsFPRegister()) {
__ Movq(kScratchRegister, i.InputDoubleRegister(2));
__ Pinsrq(i.OutputSimd128Register(), kScratchRegister, i.InputUint8(1));
} else {
__ Pinsrq(i.OutputSimd128Register(), i.InputOperand(2),
i.InputUint8(1));
}
break;
}
case kX64F64x2ExtractLane: {
__ Pextrq(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1));
__ Movq(i.OutputDoubleRegister(), kScratchRegister);
@ -2718,16 +2728,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pextrq(i.OutputRegister(), i.InputSimd128Register(0), i.InputInt8(1));
break;
}
case kX64I64x2ReplaceLane: {
if (HasRegisterInput(instr, 2)) {
__ Pinsrq(i.OutputSimd128Register(), i.InputRegister(2),
i.InputUint8(1));
} else {
__ Pinsrq(i.OutputSimd128Register(), i.InputOperand(2),
i.InputUint8(1));
}
break;
}
case kX64I64x2Neg: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
@ -2826,16 +2826,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pextrd(i.OutputRegister(), i.InputSimd128Register(0), i.InputInt8(1));
break;
}
case kX64I32x4ReplaceLane: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (HasRegisterInput(instr, 2)) {
__ Pinsrd(dst, src, i.InputRegister(2), i.InputInt8(1));
} else {
__ Pinsrd(dst, src, i.InputOperand(2), i.InputInt8(1));
}
break;
}
case kX64I32x4SConvertF32x4: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister dst = i.OutputSimd128Register();
@ -3056,16 +3046,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ movsxwl(dst, dst);
break;
}
case kX64I16x8ReplaceLane: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (HasRegisterInput(instr, 2)) {
__ Pinsrw(dst, src, i.InputRegister(2), i.InputInt8(1));
} else {
__ Pinsrw(dst, src, i.InputOperand(2), i.InputInt8(1));
}
break;
}
case kX64I16x8SConvertI8x16Low: {
__ Pmovsxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
@ -3248,52 +3228,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ movsxbl(dst, dst);
break;
}
case kX64I8x16ReplaceLane: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (HasRegisterInput(instr, 2)) {
__ Pinsrb(dst, src, i.InputRegister(2), i.InputInt8(1));
} else {
__ Pinsrb(dst, src, i.InputOperand(2), i.InputInt8(1));
}
break;
}
case kX64Pinsrb: {
// TODO(zhin): consolidate this opcode with the other usages, like
// ReplaceLane, by implementing support when this has no addressing mode.
DCHECK(HasAddressingMode(instr));
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
size_t offset = 0;
Operand mem = i.MemoryOperand(&offset);
__ Pinsrb(i.OutputSimd128Register(), i.InputSimd128Register(offset + 1),
mem, i.InputUint8(offset));
ASSEMBLE_PINSR(Pinsrb);
break;
}
case kX64Pinsrw: {
DCHECK(HasAddressingMode(instr));
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
size_t offset = 0;
Operand mem = i.MemoryOperand(&offset);
__ Pinsrw(i.OutputSimd128Register(), i.InputSimd128Register(offset + 1),
mem, i.InputUint8(offset));
ASSEMBLE_PINSR(Pinsrw);
break;
}
case kX64Pinsrd: {
DCHECK(HasAddressingMode(instr));
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
size_t offset = 0;
Operand mem = i.MemoryOperand(&offset);
__ Pinsrd(i.OutputSimd128Register(), i.InputSimd128Register(offset + 1),
mem, i.InputUint8(offset));
ASSEMBLE_PINSR(Pinsrd);
break;
}
case kX64Pinsrq: {
DCHECK(HasAddressingMode(instr));
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
size_t offset = 0;
Operand mem = i.MemoryOperand(&offset);
__ Pinsrq(i.OutputSimd128Register(), i.InputSimd128Register(offset + 1),
mem, i.InputUint8(offset));
ASSEMBLE_PINSR(Pinsrq);
break;
}
case kX64I8x16SConvertI16x8: {

View File

@ -156,7 +156,6 @@ namespace compiler {
V(X64Peek) \
V(X64F64x2Splat) \
V(X64F64x2ExtractLane) \
V(X64F64x2ReplaceLane) \
V(X64F64x2Abs) \
V(X64F64x2Neg) \
V(X64F64x2Sqrt) \
@ -203,7 +202,6 @@ namespace compiler {
V(X64F32x4Round) \
V(X64I64x2Splat) \
V(X64I64x2ExtractLane) \
V(X64I64x2ReplaceLane) \
V(X64I64x2Neg) \
V(X64I64x2BitMask) \
V(X64I64x2Shl) \
@ -215,7 +213,6 @@ namespace compiler {
V(X64I64x2ShrU) \
V(X64I32x4Splat) \
V(X64I32x4ExtractLane) \
V(X64I32x4ReplaceLane) \
V(X64I32x4SConvertF32x4) \
V(X64I32x4SConvertI16x8Low) \
V(X64I32x4SConvertI16x8High) \
@ -246,7 +243,6 @@ namespace compiler {
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneU) \
V(X64I16x8ExtractLaneS) \
V(X64I16x8ReplaceLane) \
V(X64I16x8SConvertI8x16Low) \
V(X64I16x8SConvertI8x16High) \
V(X64I16x8Neg) \
@ -281,7 +277,6 @@ namespace compiler {
V(X64I8x16Splat) \
V(X64I8x16ExtractLaneU) \
V(X64I8x16ExtractLaneS) \
V(X64I8x16ReplaceLane) \
V(X64Pinsrb) \
V(X64Pinsrw) \
V(X64Pinsrd) \

View File

@ -126,9 +126,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64Lea:
case kX64Dec32:
case kX64Inc32:
case kX64Pinsrb:
case kX64Pinsrw:
case kX64Pinsrd:
case kX64Pinsrq:
case kX64F64x2Splat:
case kX64F64x2ExtractLane:
case kX64F64x2ReplaceLane:
case kX64F64x2Abs:
case kX64F64x2Neg:
case kX64F64x2Sqrt:
@ -175,7 +178,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64F32x4Round:
case kX64I64x2Splat:
case kX64I64x2ExtractLane:
case kX64I64x2ReplaceLane:
case kX64I64x2Neg:
case kX64I64x2BitMask:
case kX64I64x2Shl:
@ -187,7 +189,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I64x2ShrU:
case kX64I32x4Splat:
case kX64I32x4ExtractLane:
case kX64I32x4ReplaceLane:
case kX64I32x4SConvertF32x4:
case kX64I32x4SConvertI16x8Low:
case kX64I32x4SConvertI16x8High:
@ -218,7 +219,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I16x8Splat:
case kX64I16x8ExtractLaneU:
case kX64I16x8ExtractLaneS:
case kX64I16x8ReplaceLane:
case kX64I16x8SConvertI8x16Low:
case kX64I16x8SConvertI8x16High:
case kX64I16x8Neg:
@ -253,11 +253,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I8x16Splat:
case kX64I8x16ExtractLaneU:
case kX64I8x16ExtractLaneS:
case kX64I8x16ReplaceLane:
case kX64Pinsrb:
case kX64Pinsrw:
case kX64Pinsrd:
case kX64Pinsrq:
case kX64I8x16SConvertI16x8:
case kX64I8x16Neg:
case kX64I8x16Shl:

View File

@ -361,17 +361,19 @@ void InstructionSelector::VisitLoadLane(Node* node) {
X64OperandGenerator g(this);
InstructionOperand outputs[] = {g.DefineAsRegister(node)};
// GetEffectiveAddressMemoryOperand uses up to 3 inputs, 4th is laneidx, 5th
// is the value node.
// Input 0 is value node, 1 is lane idx, and GetEffectiveAddressMemoryOperand
// uses up to 3 inputs. This ordering is consistent with other operations that
// use the same opcode.
InstructionOperand inputs[5];
size_t input_count = 0;
inputs[input_count++] = g.UseRegister(node->InputAt(2));
inputs[input_count++] = g.UseImmediate(params.laneidx);
AddressingMode mode =
g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
opcode |= AddressingModeField::encode(mode);
inputs[input_count++] = g.UseImmediate(params.laneidx);
inputs[input_count++] = g.UseRegister(node->InputAt(2));
DCHECK_GE(5, input_count);
// x64 supports unaligned loads.
@ -2963,15 +2965,31 @@ SIMD_VISIT_EXTRACT_LANE(I8x16, U)
SIMD_VISIT_EXTRACT_LANE(I8x16, S)
#undef SIMD_VISIT_EXTRACT_LANE
#define VISIT_SIMD_REPLACE_LANE(Type) \
void InstructionSelector::Visit##Type##ReplaceLane(Node* node) { \
X64OperandGenerator g(this); \
int32_t lane = OpParameter<int32_t>(node->op()); \
Emit(kX64##Type##ReplaceLane, g.DefineSameAsFirst(node), \
g.UseRegister(node->InputAt(0)), g.UseImmediate(lane), \
g.Use(node->InputAt(1))); \
// Selects kX64F32x4ReplaceLane for an F32x4 replace-lane node. The output is
// defined with DefineSameAsFirst, input 0 is the source vector in a register,
// input 1 is the lane index as an immediate, and input 2 is the replacement
// value as an unconstrained operand.
void InstructionSelector::VisitF32x4ReplaceLane(Node* node) {
  X64OperandGenerator g(this);
  const int32_t lane_index = OpParameter<int32_t>(node->op());
  auto result = g.DefineSameAsFirst(node);
  auto source = g.UseRegister(node->InputAt(0));
  auto lane = g.UseImmediate(lane_index);
  auto value = g.Use(node->InputAt(1));
  Emit(kX64F32x4ReplaceLane, result, source, lane, value);
}
// Defines InstructionSelector::Visit<TYPE>ReplaceLane. The generated visitor
// emits OPCODE with the output in a register, the source vector (node input 0)
// in a register, the lane index as an immediate, and the replacement value
// (node input 1) as an unconstrained operand.
#define VISIT_SIMD_REPLACE_LANE(TYPE, OPCODE) \
  void InstructionSelector::Visit##TYPE##ReplaceLane(Node* node) { \
    X64OperandGenerator g(this); \
    const int32_t lane_index = OpParameter<int32_t>(node->op()); \
    Emit(OPCODE, g.DefineAsRegister(node), \
         g.UseRegister(node->InputAt(0)), \
         g.UseImmediate(lane_index), \
         g.Use(node->InputAt(1))); \
  }
SIMD_TYPES(VISIT_SIMD_REPLACE_LANE)
// Table of (SIMD type, opcode) pairs: applies V to each type together with the
// kX64Pinsr{q,d,w,b} opcode listed for its replace-lane operation. F64x2 and
// I64x2 both map to kX64Pinsrq.
#define SIMD_TYPES_FOR_REPLACE_LANE(V) \
V(F64x2, kX64Pinsrq) \
V(I64x2, kX64Pinsrq) \
V(I32x4, kX64Pinsrd) \
V(I16x8, kX64Pinsrw) \
V(I8x16, kX64Pinsrb)
SIMD_TYPES_FOR_REPLACE_LANE(VISIT_SIMD_REPLACE_LANE)
#undef SIMD_TYPES_FOR_REPLACE_LANE
#undef VISIT_SIMD_REPLACE_LANE
#define VISIT_SIMD_SHIFT(Opcode) \