[ia32][wasm-simd] Optimize and unify f32x4.extract_lane SSE and AVX ops

Change the codegen for f32x4.extract_lane from shufps to insertps. They
have the same performance, but shufps has a false dependency on dst (it
shuffles dst and src, but we don't care about dst at all).

We then merge the SSE and AVX opcodes into a single kIA32F32x4ExtractLane opcode.

Bug: v8:11217
Change-Id: I7cdbf486573ce3a19881df84400a9c7e09c3ee48
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2585259
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71748}
This commit is contained in:
Zhi An Ng 2020-12-14 01:32:37 +00:00 committed by Commit Bot
parent 3ea458bea2
commit 754cb03cee
4 changed files with 21 additions and 22 deletions

View File

@ -2343,26 +2343,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} }
break; break;
} }
case kSSEF32x4ExtractLane: { case kIA32F32x4ExtractLane: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
XMMRegister dst = i.OutputFloatRegister();
int8_t lane = i.InputInt8(1);
if (lane != 0) {
DCHECK_LT(lane, 4);
__ shufps(dst, dst, lane);
}
break;
}
case kAVXF32x4ExtractLane: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputFloatRegister(); XMMRegister dst = i.OutputFloatRegister();
XMMRegister src = i.InputSimd128Register(0); XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1); uint8_t lane = i.InputUint8(1);
if (lane == 0) {
if (dst != src) __ vmovaps(dst, src);
} else {
DCHECK_LT(lane, 4); DCHECK_LT(lane, 4);
__ vshufps(dst, src, src, lane); if (lane == 0 && dst == src) {
break;
}
uint8_t zmask = 0xE; // Zero top 3 lanes.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Use src for both operands to avoid false-dependency on dst.
__ vinsertps(dst, src, src, zmask | (lane << 6));
} else {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ insertps(dst, src, zmask | (lane << 6));
} }
break; break;
} }

View File

@ -155,8 +155,7 @@ namespace compiler {
V(IA32I64x2ExtMulLowI32x4U) \ V(IA32I64x2ExtMulLowI32x4U) \
V(IA32I64x2ExtMulHighI32x4U) \ V(IA32I64x2ExtMulHighI32x4U) \
V(IA32F32x4Splat) \ V(IA32F32x4Splat) \
V(SSEF32x4ExtractLane) \ V(IA32F32x4ExtractLane) \
V(AVXF32x4ExtractLane) \
V(IA32Insertps) \ V(IA32Insertps) \
V(IA32F32x4SConvertI32x4) \ V(IA32F32x4SConvertI32x4) \
V(IA32F32x4UConvertI32x4) \ V(IA32F32x4UConvertI32x4) \

View File

@ -134,8 +134,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I64x2ExtMulLowI32x4U: case kIA32I64x2ExtMulLowI32x4U:
case kIA32I64x2ExtMulHighI32x4U: case kIA32I64x2ExtMulHighI32x4U:
case kIA32F32x4Splat: case kIA32F32x4Splat:
case kSSEF32x4ExtractLane: case kIA32F32x4ExtractLane:
case kAVXF32x4ExtractLane:
case kIA32Insertps: case kIA32Insertps:
case kIA32F32x4SConvertI32x4: case kIA32F32x4SConvertI32x4:
case kIA32F32x4UConvertI32x4: case kIA32F32x4UConvertI32x4:

View File

@ -2431,7 +2431,11 @@ void InstructionSelector::VisitF32x4Splat(Node* node) {
} }
void InstructionSelector::VisitF32x4ExtractLane(Node* node) { void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
VisitRRISimd(this, node, kAVXF32x4ExtractLane, kSSEF32x4ExtractLane); IA32OperandGenerator g(this);
InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
InstructionOperand operand1 =
g.UseImmediate(OpParameter<int32_t>(node->op()));
Emit(kIA32F32x4ExtractLane, g.DefineAsRegister(node), operand0, operand1);
} }
void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) { void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {