[ia32][wasm-simd] Optimize and unify f32x4.extract_lane SSE and AVX ops
Change the codegen for f32x4.extract_lane from shufps to insertps. The two instructions have the same performance, but shufps has a false dependency on dst (it shuffles dst and src, even though we do not care about the previous contents of dst at all). We then merge the SSE and AVX opcodes into a single opcode.

Bug: v8:11217
Change-Id: I7cdbf486573ce3a19881df84400a9c7e09c3ee48
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2585259
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71748}
This commit is contained in:
parent
3ea458bea2
commit
754cb03cee
@ -2343,26 +2343,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kSSEF32x4ExtractLane: {
|
||||
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
|
||||
XMMRegister dst = i.OutputFloatRegister();
|
||||
int8_t lane = i.InputInt8(1);
|
||||
if (lane != 0) {
|
||||
DCHECK_LT(lane, 4);
|
||||
__ shufps(dst, dst, lane);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kAVXF32x4ExtractLane: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
case kIA32F32x4ExtractLane: {
|
||||
XMMRegister dst = i.OutputFloatRegister();
|
||||
XMMRegister src = i.InputSimd128Register(0);
|
||||
int8_t lane = i.InputInt8(1);
|
||||
if (lane == 0) {
|
||||
if (dst != src) __ vmovaps(dst, src);
|
||||
} else {
|
||||
uint8_t lane = i.InputUint8(1);
|
||||
DCHECK_LT(lane, 4);
|
||||
__ vshufps(dst, src, src, lane);
|
||||
if (lane == 0 && dst == src) {
|
||||
break;
|
||||
}
|
||||
|
||||
uint8_t zmask = 0xE; // Zero top 3 lanes.
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
// Use src for both operands to avoid false-dependency on dst.
|
||||
__ vinsertps(dst, src, src, zmask | (lane << 6));
|
||||
} else {
|
||||
CpuFeatureScope sse_scope(tasm(), SSE4_1);
|
||||
__ insertps(dst, src, zmask | (lane << 6));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -155,8 +155,7 @@ namespace compiler {
|
||||
V(IA32I64x2ExtMulLowI32x4U) \
|
||||
V(IA32I64x2ExtMulHighI32x4U) \
|
||||
V(IA32F32x4Splat) \
|
||||
V(SSEF32x4ExtractLane) \
|
||||
V(AVXF32x4ExtractLane) \
|
||||
V(IA32F32x4ExtractLane) \
|
||||
V(IA32Insertps) \
|
||||
V(IA32F32x4SConvertI32x4) \
|
||||
V(IA32F32x4UConvertI32x4) \
|
||||
|
@ -134,8 +134,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kIA32I64x2ExtMulLowI32x4U:
|
||||
case kIA32I64x2ExtMulHighI32x4U:
|
||||
case kIA32F32x4Splat:
|
||||
case kSSEF32x4ExtractLane:
|
||||
case kAVXF32x4ExtractLane:
|
||||
case kIA32F32x4ExtractLane:
|
||||
case kIA32Insertps:
|
||||
case kIA32F32x4SConvertI32x4:
|
||||
case kIA32F32x4UConvertI32x4:
|
||||
|
@ -2431,7 +2431,11 @@ void InstructionSelector::VisitF32x4Splat(Node* node) {
|
||||
}
|
||||
|
||||
// Instruction selection for the f32x4.extract_lane node on ia32.
//
// NOTE(review): this listing is a rendered diff whose +/- markers were lost,
// so it shows both revisions at once. Going by the commit message (SSE and
// AVX opcodes merged) and the hunk header (7 lines before, 11 after), the
// VisitRRISimd(...) call with the kAVX/kSSE opcode pair is presumably the
// removed line and the explicit Emit(kIA32F32x4ExtractLane, ...) sequence is
// its replacement -- confirm against the repository before relying on this.
//
// In the Emit-based form: input 0 of the node is requested in a register, the
// int32_t operator parameter (the lane index consumed by the code generator)
// is passed as an immediate, and the result is defined in a register.
void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
|
||||
VisitRRISimd(this, node, kAVXF32x4ExtractLane, kSSEF32x4ExtractLane);
|
||||
IA32OperandGenerator g(this);
|
||||
InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
|
||||
InstructionOperand operand1 =
|
||||
g.UseImmediate(OpParameter<int32_t>(node->op()));
|
||||
Emit(kIA32F32x4ExtractLane, g.DefineAsRegister(node), operand0, operand1);
|
||||
}
|
||||
|
||||
void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
|
||||
|
Loading…
Reference in New Issue
Block a user