[wasm-simd][ia32] Prototype i32x4.dot_i16x8_s

This implements I32x4DotI16x8S for ia32. Also fixes instruction-selector for SIMD ops, they should all set operand1 to be a register, since we do not have memory alignment yet. Bug: v8:10583 Change-Id: Id273816efd5eea128580f3f7bde533a8e1b2435d Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2231031 Commit-Queue: Zhi An Ng <zhin@chromium.org> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Cr-Commit-Position: refs/heads/master@{#68444}
2020-06-18 16:45:24 -07:00 · 2020-06-18 16:45:24 -07:00 · 08ccfb2002
commit 08ccfb2002
parent 1c39569e2e
8 changed files with 37 additions and 10 deletions
--- a/src/codegen/ia32/macro-assembler-ia32.h
+++ b/src/codegen/ia32/macro-assembler-ia32.h
@ -385,6 +385,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  AVX_PACKED_OP3(Psrlq, psrlq)
  AVX_PACKED_OP3(Psraw, psraw)
  AVX_PACKED_OP3(Psrad, psrad)
+  AVX_PACKED_OP3(Pmaddwd, pmaddwd)
  AVX_PACKED_OP3(Paddd, paddd)
  AVX_PACKED_OP3(Paddq, paddq)
  AVX_PACKED_OP3(Psubq, psubq)
--- a/src/codegen/ia32/sse-instr.h
+++ b/src/codegen/ia32/sse-instr.h
@ -9,6 +9,7 @@
  V(packsswb, 66, 0F, 63)        \
  V(packssdw, 66, 0F, 6B)        \
  V(packuswb, 66, 0F, 67)        \
+  V(pmaddwd, 66, 0F, F5)         \
  V(paddb, 66, 0F, FC)           \
  V(paddw, 66, 0F, FD)           \
  V(paddd, 66, 0F, FE)           \
--- a/src/compiler/backend/ia32/code-generator-ia32.cc
+++ b/src/compiler/backend/ia32/code-generator-ia32.cc
@ -2807,6 +2807,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
      break;
    }
+    case kIA32I32x4DotI16x8S: {
+      __ Pmaddwd(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                 i.InputSimd128Register(1));
+      break;
+    }
    case kIA32I16x8Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
--- a/src/compiler/backend/ia32/instruction-codes-ia32.h
+++ b/src/compiler/backend/ia32/instruction-codes-ia32.h
@ -234,6 +234,7 @@ namespace compiler {
  V(AVXI32x4GeU)                   \
  V(IA32I32x4Abs)                  \
  V(IA32I32x4BitMask)              \
+  V(IA32I32x4DotI16x8S)            \
  V(IA32I16x8Splat)                \
  V(IA32I16x8ExtractLaneU)         \
  V(IA32I16x8ExtractLaneS)         \
--- a/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
@ -215,6 +215,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kAVXI32x4GeU:
    case kIA32I32x4Abs:
    case kIA32I32x4BitMask:
+    case kIA32I32x4DotI16x8S:
    case kIA32I16x8Splat:
    case kIA32I16x8ExtractLaneU:
    case kIA32I16x8ExtractLaneS:
--- a/src/compiler/backend/ia32/instruction-selector-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-selector-ia32.cc
@ -277,6 +277,23 @@ void VisitRRSimd(InstructionSelector* selector, Node* node,
  }
 }

+// TODO(v8:9198): Like VisitRROFloat, but for SIMD. SSE requires operand1 to be
+// a register as we don't have memory alignment yet. For AVX, memory operands
+// are fine, but can have performance issues if not aligned to 16/32 bytes
+// (based on load size), see SDM Vol 1, chapter 14.9
+void VisitRROSimd(InstructionSelector* selector, Node* node,
+                  ArchOpcode avx_opcode, ArchOpcode sse_opcode) {
+  IA32OperandGenerator g(selector);
+  InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
+  if (selector->IsSupported(AVX)) {
+    selector->Emit(avx_opcode, g.DefineAsRegister(node), operand0,
+                   g.Use(node->InputAt(1)));
+  } else {
+    selector->Emit(sse_opcode, g.DefineSameAsFirst(node), operand0,
+                   g.UseRegister(node->InputAt(1)));
+  }
+}
+
 void VisitRRISimd(InstructionSelector* selector, Node* node,
                  ArchOpcode opcode) {
  IA32OperandGenerator g(selector);
@ -2109,6 +2126,7 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
 #define SIMD_BINOP_UNIFIED_SSE_AVX_LIST(V) \
  V(I64x2Add)                              \
  V(I64x2Sub)                              \
+  V(I32x4DotI16x8S)                        \
  V(I16x8RoundingAverageU)                 \
  V(I8x16RoundingAverageU)

@ -2422,17 +2440,17 @@ SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
 #undef VISIT_SIMD_ALLTRUE
 #undef SIMD_ALLTRUE_LIST

-#define VISIT_SIMD_BINOP(Opcode)                           \
-  void InstructionSelector::Visit##Opcode(Node* node) {    \
-    VisitRROFloat(this, node, kAVX##Opcode, kSSE##Opcode); \
+#define VISIT_SIMD_BINOP(Opcode)                          \
+  void InstructionSelector::Visit##Opcode(Node* node) {   \
+    VisitRROSimd(this, node, kAVX##Opcode, kSSE##Opcode); \
  }
 SIMD_BINOP_LIST(VISIT_SIMD_BINOP)
 #undef VISIT_SIMD_BINOP
 #undef SIMD_BINOP_LIST

-#define VISIT_SIMD_BINOP_UNIFIED_SSE_AVX(Opcode)             \
-  void InstructionSelector::Visit##Opcode(Node* node) {      \
-    VisitRROFloat(this, node, kIA32##Opcode, kIA32##Opcode); \
+#define VISIT_SIMD_BINOP_UNIFIED_SSE_AVX(Opcode)            \
+  void InstructionSelector::Visit##Opcode(Node* node) {     \
+    VisitRROSimd(this, node, kIA32##Opcode, kIA32##Opcode); \
  }
 SIMD_BINOP_UNIFIED_SSE_AVX_LIST(VISIT_SIMD_BINOP_UNIFIED_SSE_AVX)
 #undef VISIT_SIMD_BINOP_UNIFIED_SSE_AVX
--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@ -2705,10 +2705,10 @@ void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X
        // && !V8_TARGET_ARCH_IA32

-#if !V8_TARGET_ARCH_X64
+#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
 // TODO(v8:10583) Prototype i32x4.dot_i16x8_s
 void InstructionSelector::VisitI32x4DotI16x8S(Node* node) { UNIMPLEMENTED(); }
-#endif  // !V8_TARGET_ARCH_X64
+#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32

 void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }

--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -2317,7 +2317,7 @@ WASM_SIMD_TEST(I16x8RoundingAverageU) {
 }

 // TODO(v8:10583) Prototype i32x4.dot_i16x8_s
-#if V8_TARGET_ARCH_X64
+#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
 WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) {
  FLAG_SCOPE(wasm_simd_post_mvp);

@ -2344,7 +2344,7 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) {
    }
  }
 }
-#endif  // V8_TARGET_ARCH_X64
+#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32

 void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                         WasmOpcode opcode, Int16ShiftOp expected_op) {