[ia32][wasm-simd] Optimize and unify f32x4.extract_lane SSE and AVX ops

Change the codegen for f32x4.extract_lane from shufps to insertps. The two instructions have the same performance, but shufps has a false dependency on dst (it shuffles dst and src, but we don't care about dst at all). We then merge the SSE and AVX opcodes into a single IA32 opcode.

Bug: v8:11217
Change-Id: I7cdbf486573ce3a19881df84400a9c7e09c3ee48
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2585259
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71748}
2020-12-14 01:32:37 +00:00 · 2020-12-14 01:32:37 +00:00 · 754cb03cee
commit 754cb03cee
parent 3ea458bea2
4 changed files with 21 additions and 22 deletions
--- a/src/compiler/backend/ia32/code-generator-ia32.cc
+++ b/src/compiler/backend/ia32/code-generator-ia32.cc
@ -2343,26 +2343,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      }
      break;
    }
-    case kSSEF32x4ExtractLane: {
-      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
-      XMMRegister dst = i.OutputFloatRegister();
-      int8_t lane = i.InputInt8(1);
-      if (lane != 0) {
-        DCHECK_LT(lane, 4);
-        __ shufps(dst, dst, lane);
-      }
-      break;
-    }
-    case kAVXF32x4ExtractLane: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
+    case kIA32F32x4ExtractLane: {
      XMMRegister dst = i.OutputFloatRegister();
      XMMRegister src = i.InputSimd128Register(0);
-      int8_t lane = i.InputInt8(1);
-      if (lane == 0) {
-        if (dst != src) __ vmovaps(dst, src);
+      uint8_t lane = i.InputUint8(1);
+      DCHECK_LT(lane, 4);
+      if (lane == 0 && dst == src) {
+        break;
+      }
+
+      uint8_t zmask = 0xE;  // Zero top 3 lanes.
+      if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        // Use src for both operands to avoid false-dependency on dst.
+        __ vinsertps(dst, src, src, zmask | (lane << 6));
      } else {
-        DCHECK_LT(lane, 4);
-        __ vshufps(dst, src, src, lane);
+        CpuFeatureScope sse_scope(tasm(), SSE4_1);
+        __ insertps(dst, src, zmask | (lane << 6));
      }
      break;
    }
--- a/src/compiler/backend/ia32/instruction-codes-ia32.h
+++ b/src/compiler/backend/ia32/instruction-codes-ia32.h
@ -155,8 +155,7 @@ namespace compiler {
  V(IA32I64x2ExtMulLowI32x4U)      \
  V(IA32I64x2ExtMulHighI32x4U)     \
  V(IA32F32x4Splat)                \
-  V(SSEF32x4ExtractLane)           \
-  V(AVXF32x4ExtractLane)           \
+  V(IA32F32x4ExtractLane)          \
  V(IA32Insertps)                  \
  V(IA32F32x4SConvertI32x4)        \
  V(IA32F32x4UConvertI32x4)        \
--- a/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
@ -134,8 +134,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kIA32I64x2ExtMulLowI32x4U:
    case kIA32I64x2ExtMulHighI32x4U:
    case kIA32F32x4Splat:
-    case kSSEF32x4ExtractLane:
-    case kAVXF32x4ExtractLane:
+    case kIA32F32x4ExtractLane:
    case kIA32Insertps:
    case kIA32F32x4SConvertI32x4:
    case kIA32F32x4UConvertI32x4:
--- a/src/compiler/backend/ia32/instruction-selector-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-selector-ia32.cc
@ -2431,7 +2431,11 @@ void InstructionSelector::VisitF32x4Splat(Node* node) {
 }

 void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
-  VisitRRISimd(this, node, kAVXF32x4ExtractLane, kSSEF32x4ExtractLane);
+  IA32OperandGenerator g(this);
+  InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
+  InstructionOperand operand1 =
+      g.UseImmediate(OpParameter<int32_t>(node->op()));
+  Emit(kIA32F32x4ExtractLane, g.DefineAsRegister(node), operand0, operand1);
 }

 void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {