From df54d51148b4914b838a485d58d8892ca8bab2c1 Mon Sep 17 00:00:00 2001
From: Ng Zhi An
Date: Wed, 26 Jun 2019 10:31:38 -0700
Subject: [PATCH] [wasm simd] Implement I64x2Splat on x64

Bug: v8:8460
Change-Id: Id159c81cd2d25924be96e49c64073e154ef32e6a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1667867
Reviewed-by: Bill Budge
Reviewed-by: Deepti Gandluri
Reviewed-by: Michael Starzinger
Auto-Submit: Zhi An Ng
Commit-Queue: Zhi An Ng
Cr-Commit-Position: refs/heads/master@{#62475}
---
 src/codegen/x64/assembler-x64.cc              | 14 +++++++++++++
 src/codegen/x64/assembler-x64.h               |  1 +
 src/compiler/backend/instruction-selector.cc  |  6 ++++++
 .../backend/x64/code-generator-x64.cc         | 10 +++++++++
 .../backend/x64/instruction-codes-x64.h       |  1 +
 .../backend/x64/instruction-scheduler-x64.cc  |  1 +
 .../backend/x64/instruction-selector-x64.cc   |  1 +
 src/compiler/machine-operator.cc              |  1 +
 src/compiler/machine-operator.h               |  1 +
 src/compiler/opcodes.h                        |  1 +
 src/compiler/simd-scalar-lowering.cc          | 14 ++++++++++++-
 src/compiler/simd-scalar-lowering.h           |  8 ++++++-
 src/compiler/wasm-compiler.cc                 |  2 ++
 src/wasm/wasm-interpreter.cc                  |  1 +
 src/wasm/wasm-opcodes.cc                      |  2 ++
 src/wasm/wasm-opcodes.h                       |  2 ++
 src/wasm/wasm-value.h                         |  1 +
 test/cctest/wasm/test-run-wasm-simd.cc        | 21 +++++++++++++++++++
 18 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/src/codegen/x64/assembler-x64.cc b/src/codegen/x64/assembler-x64.cc
index 3236b0f52c..6d42c18e67 100644
--- a/src/codegen/x64/assembler-x64.cc
+++ b/src/codegen/x64/assembler-x64.cc
@@ -2883,6 +2883,18 @@ void Assembler::movd(Register dst, XMMRegister src) {
 }
 
 void Assembler::movq(XMMRegister dst, Register src) {
+  // Mixing AVX and non-AVX is expensive, catch those cases
+  DCHECK(!IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit(0x66);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x6E);
+  emit_sse_operand(dst, src);
+}
+
+void Assembler::movq(XMMRegister dst, Operand src) {
+  // Mixing AVX and non-AVX is expensive, catch those cases
   DCHECK(!IsEnabled(AVX));
   EnsureSpace ensure_space(this);
   emit(0x66);
@@ -2893,6 +2905,7 @@ void Assembler::movq(XMMRegister dst, Register src) {
 }
 
 void Assembler::movq(Register dst, XMMRegister src) {
+  // Mixing AVX and non-AVX is expensive, catch those cases
   DCHECK(!IsEnabled(AVX));
   EnsureSpace ensure_space(this);
   emit(0x66);
@@ -2903,6 +2916,7 @@ void Assembler::movq(Register dst, XMMRegister src) {
 }
 
 void Assembler::movq(XMMRegister dst, XMMRegister src) {
+  // Mixing AVX and non-AVX is expensive, catch those cases
   DCHECK(!IsEnabled(AVX));
   EnsureSpace ensure_space(this);
   if (dst.low_bits() == 4) {
diff --git a/src/codegen/x64/assembler-x64.h b/src/codegen/x64/assembler-x64.h
index dc6acb67f4..0ec9a13046 100644
--- a/src/codegen/x64/assembler-x64.h
+++ b/src/codegen/x64/assembler-x64.h
@@ -969,6 +969,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void movd(XMMRegister dst, Operand src);
   void movd(Register dst, XMMRegister src);
   void movq(XMMRegister dst, Register src);
+  void movq(XMMRegister dst, Operand src);
   void movq(Register dst, XMMRegister src);
   void movq(XMMRegister dst, XMMRegister src);
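Note on the encoding above: 66 REX.W 0F 6E is the SSE2 form of MOVQ xmm, r/m64, and the new Operand overload reuses the same byte sequence so the splat can read its input straight from memory. As a rough reference, here is a standalone C++ model of the register-source emit sequence; the 0-15 integer register codes and the ModRM construction are simplified stand-ins for V8's XMMRegister/Register types:

    #include <cstdint>
    #include <vector>

    // Models Assembler::movq(XMMRegister, Register): 66 REX.W 0F 6E /r.
    // `xmm` and `gp` are assumed 0-15 hardware register codes.
    std::vector<uint8_t> EncodeMovqXmmFromGp(int xmm, int gp) {
      std::vector<uint8_t> code;
      code.push_back(0x66);       // operand-size prefix (selects the SSE form)
      uint8_t rex = 0x48;         // REX.W: 64-bit move, like emit_rex_64
      if (xmm >= 8) rex |= 0x04;  // REX.R extends the ModRM reg field
      if (gp >= 8) rex |= 0x01;   // REX.B extends the ModRM r/m field
      code.push_back(rex);
      code.push_back(0x0F);
      code.push_back(0x6E);       // MOVD/MOVQ; REX.W picks the 64-bit form
      // ModRM with mod=11 (register direct), reg=xmm, r/m=gp, mirroring
      // emit_sse_operand for a register operand.
      code.push_back(0xC0 | ((xmm & 7) << 3) | (gp & 7));
      return code;
    }

For example, EncodeMovqXmmFromGp(0, 0) yields 66 48 0F 6E C0, i.e. movq xmm0, rax.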
diff --git a/src/compiler/backend/instruction-selector.cc b/src/compiler/backend/instruction-selector.cc
index d7fa926854..a8868ec6e8 100644
--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@@ -1849,6 +1849,8 @@ void InstructionSelector::VisitNode(Node* node) {
       return MarkAsSimd128(node), VisitF32x4Lt(node);
     case IrOpcode::kF32x4Le:
       return MarkAsSimd128(node), VisitF32x4Le(node);
+    case IrOpcode::kI64x2Splat:
+      return MarkAsSimd128(node), VisitI64x2Splat(node);
     case IrOpcode::kI32x4Splat:
       return MarkAsSimd128(node), VisitI32x4Splat(node);
     case IrOpcode::kI32x4ExtractLane:
@@ -2492,6 +2494,10 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {
 #endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_PPC
         // !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_S390
 
+#if !V8_TARGET_ARCH_X64
+void InstructionSelector::VisitI64x2Splat(Node* node) { UNIMPLEMENTED(); }
+#endif  // !V8_TARGET_ARCH_X64
+
 void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
 
 void InstructionSelector::VisitParameter(Node* node) {
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 04ccf7d391..74034c3a13 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -2400,6 +2400,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ cmpleps(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
+    case kX64I64x2Splat: {
+      XMMRegister dst = i.OutputSimd128Register();
+      if (instr->InputAt(0)->IsRegister()) {
+        __ movq(dst, i.InputRegister(0));
+      } else {
+        __ movq(dst, i.InputOperand(0));
+      }
+      __ pshufd(dst, dst, 0x44);
+      break;
+    }
     case kX64I32x4Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       if (instr->InputAt(0)->IsRegister()) {
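The kX64I64x2Splat case first uses movq to place the 64-bit input in the low quadword of dst (zeroing the upper half), then duplicates that quadword with pshufd and the shuffle immediate 0x44 (binary 01 00 01 00, i.e. dword lanes 0,1,0,1). A scalar sketch of what the two instructions compute, assuming a little-endian host and a plain struct in place of an XMM register:

    #include <cstdint>
    #include <cstring>

    struct Vec128 { uint32_t lane[4]; };  // stand-in for an XMM register

    Vec128 I64x2SplatModel(uint64_t x) {
      Vec128 dst = {{0, 0, 0, 0}};
      std::memcpy(dst.lane, &x, sizeof(x));  // movq: x into lanes 0-1, rest zero
      const int imm = 0x44;                  // pshufd selector: lanes 0,1,0,1
      Vec128 out;
      for (int i = 0; i < 4; i++) {
        out.lane[i] = dst.lane[(imm >> (2 * i)) & 3];
      }
      return out;  // both 64-bit halves now hold x
    }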
diff --git a/src/compiler/backend/x64/instruction-codes-x64.h b/src/compiler/backend/x64/instruction-codes-x64.h
index 57ef26dbd7..bf92cef51a 100644
--- a/src/compiler/backend/x64/instruction-codes-x64.h
+++ b/src/compiler/backend/x64/instruction-codes-x64.h
@@ -177,6 +177,7 @@ namespace compiler {
   V(X64F32x4Ne)                    \
   V(X64F32x4Lt)                    \
   V(X64F32x4Le)                    \
+  V(X64I64x2Splat)                 \
   V(X64I32x4Splat)                 \
   V(X64I32x4ExtractLane)           \
   V(X64I32x4ReplaceLane)           \
diff --git a/src/compiler/backend/x64/instruction-scheduler-x64.cc b/src/compiler/backend/x64/instruction-scheduler-x64.cc
index 9d48e9175a..5670fa3b67 100644
--- a/src/compiler/backend/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/backend/x64/instruction-scheduler-x64.cc
@@ -143,6 +143,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64F32x4Ne:
     case kX64F32x4Lt:
     case kX64F32x4Le:
+    case kX64I64x2Splat:
     case kX64I32x4Splat:
     case kX64I32x4ExtractLane:
     case kX64I32x4ReplaceLane:
diff --git a/src/compiler/backend/x64/instruction-selector-x64.cc b/src/compiler/backend/x64/instruction-selector-x64.cc
index 8e6336f576..ead00b6f71 100644
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@@ -2668,6 +2668,7 @@ void InstructionSelector::VisitS128Zero(Node* node) {
          g.Use(node->InputAt(0)));                       \
   }
 SIMD_TYPES(VISIT_SIMD_SPLAT)
+VISIT_SIMD_SPLAT(I64x2)
 #undef VISIT_SIMD_SPLAT
 
 #define VISIT_SIMD_EXTRACT_LANE(Type) \
diff --git a/src/compiler/machine-operator.cc b/src/compiler/machine-operator.cc
index 2fc8c9a542..58b251e69a 100644
--- a/src/compiler/machine-operator.cc
+++ b/src/compiler/machine-operator.cc
@@ -262,6 +262,7 @@ MachineType AtomicOpType(Operator const* op) {
   V(F32x4Ne, Operator::kCommutative, 2, 0, 1)                 \
   V(F32x4Lt, Operator::kNoProperties, 2, 0, 1)                \
   V(F32x4Le, Operator::kNoProperties, 2, 0, 1)                \
+  V(I64x2Splat, Operator::kNoProperties, 1, 0, 1)             \
   V(I32x4Splat, Operator::kNoProperties, 1, 0, 1)             \
   V(I32x4SConvertF32x4, Operator::kNoProperties, 1, 0, 1)     \
   V(I32x4SConvertI16x8Low, Operator::kNoProperties, 1, 0, 1)  \
diff --git a/src/compiler/machine-operator.h b/src/compiler/machine-operator.h
index ceab67decd..b843b6fa41 100644
--- a/src/compiler/machine-operator.h
+++ b/src/compiler/machine-operator.h
@@ -489,6 +489,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* F32x4Lt();
   const Operator* F32x4Le();
 
+  const Operator* I64x2Splat();
   const Operator* I32x4Splat();
   const Operator* I32x4ExtractLane(int32_t);
   const Operator* I32x4ReplaceLane(int32_t);
diff --git a/src/compiler/opcodes.h b/src/compiler/opcodes.h
index ccc69f7ef7..74869f4f71 100644
--- a/src/compiler/opcodes.h
+++ b/src/compiler/opcodes.h
@@ -745,6 +745,7 @@
   V(F32x4Le)                 \
   V(F32x4Gt)                 \
   V(F32x4Ge)                 \
+  V(I64x2Splat)              \
   V(I32x4Splat)              \
   V(I32x4ExtractLane)        \
   V(I32x4ReplaceLane)        \
diff --git a/src/compiler/simd-scalar-lowering.cc b/src/compiler/simd-scalar-lowering.cc
index 862f363e63..3db19a6a6d 100644
--- a/src/compiler/simd-scalar-lowering.cc
+++ b/src/compiler/simd-scalar-lowering.cc
@@ -16,6 +16,7 @@ namespace internal {
 namespace compiler {
 namespace {
 
+static const int kNumLanes64 = 2;
 static const int kNumLanes32 = 4;
 static const int kNumLanes16 = 8;
 static const int kNumLanes8 = 16;
@@ -76,6 +77,8 @@ void SimdScalarLowering::LowerGraph() {
   }
 }
 
+#define FOREACH_INT64X2_OPCODE(V) V(I64x2Splat)
+
 #define FOREACH_INT32X4_OPCODE(V) \
   V(I32x4Splat)                   \
   V(I32x4ExtractLane)             \
@@ -208,6 +211,8 @@ void SimdScalarLowering::LowerGraph() {
 MachineType SimdScalarLowering::MachineTypeFrom(SimdType simdType) {
   switch (simdType) {
+    case SimdType::kInt64x2:
+      return MachineType::Int64();
     case SimdType::kFloat32x4:
       return MachineType::Float32();
     case SimdType::kInt32x4:
@@ -223,6 +228,10 @@ MachineType SimdScalarLowering::MachineTypeFrom(SimdType simdType) {
 void SimdScalarLowering::SetLoweredType(Node* node, Node* output) {
   switch (node->opcode()) {
 #define CASE_STMT(name) case IrOpcode::k##name:
+    FOREACH_INT64X2_OPCODE(CASE_STMT) {
+      replacements_[node->id()].type = SimdType::kInt64x2;
+      break;
+    }
     FOREACH_INT32X4_OPCODE(CASE_STMT)
     case IrOpcode::kReturn:
     case IrOpcode::kParameter:
@@ -326,7 +335,9 @@ static int GetReturnCountAfterLoweringSimd128(
 
 int SimdScalarLowering::NumLanes(SimdType type) {
   int num_lanes = 0;
-  if (type == SimdType::kFloat32x4 || type == SimdType::kInt32x4) {
+  if (type == SimdType::kInt64x2) {
+    num_lanes = kNumLanes64;
+  } else if (type == SimdType::kFloat32x4 || type == SimdType::kInt32x4) {
     num_lanes = kNumLanes32;
   } else if (type == SimdType::kInt16x8) {
     num_lanes = kNumLanes16;
@@ -1223,6 +1234,7 @@ void SimdScalarLowering::LowerNode(Node* node) {
       LowerUnaryOp(node, SimdType::kInt32x4, machine()->RoundUint32ToFloat32());
       break;
     }
+    case IrOpcode::kI64x2Splat:
     case IrOpcode::kI32x4Splat:
     case IrOpcode::kF32x4Splat:
     case IrOpcode::kI16x8Splat:
diff --git a/src/compiler/simd-scalar-lowering.h b/src/compiler/simd-scalar-lowering.h
index 01ea195bdc..6de48ebccf 100644
--- a/src/compiler/simd-scalar-lowering.h
+++ b/src/compiler/simd-scalar-lowering.h
@@ -32,7 +32,13 @@ class SimdScalarLowering {
 
  private:
   enum class State : uint8_t { kUnvisited, kOnStack, kVisited };
-  enum class SimdType : uint8_t { kFloat32x4, kInt32x4, kInt16x8, kInt8x16 };
+  enum class SimdType : uint8_t {
+    kInt64x2,
+    kFloat32x4,
+    kInt32x4,
+    kInt16x8,
+    kInt8x16
+  };
 
 #if defined(V8_TARGET_BIG_ENDIAN)
   static constexpr int kLaneOffsets[16] = {15, 14, 13, 12, 11, 10, 9, 8,
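On targets without SIMD support, SimdScalarLowering rewrites each s128 value into kNumLanes64 = 2 scalar int64 nodes (per the NumLanes and MachineTypeFrom changes above); for a splat, every replacement is simply the scalar input. A rough value-level stand-in for what LowerNode produces for kI64x2Splat — the real code wires up graph Node* replacements rather than values:

    #include <array>
    #include <cstdint>

    constexpr int kNumLanes64 = 2;

    // Value-level sketch: an i64x2 splat lowers to two identical int64 "lanes".
    std::array<int64_t, kNumLanes64> LowerI64x2SplatModel(int64_t input) {
      std::array<int64_t, kNumLanes64> replacements;
      for (int64_t& lane : replacements) lane = input;
      return replacements;
    }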
diff --git a/src/compiler/wasm-compiler.cc b/src/compiler/wasm-compiler.cc
index 0b212c9fb1..027f2b928e 100644
--- a/src/compiler/wasm-compiler.cc
+++ b/src/compiler/wasm-compiler.cc
@@ -4055,6 +4055,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
     case wasm::kExprF32x4Ge:
       return graph()->NewNode(mcgraph()->machine()->F32x4Le(), inputs[1],
                               inputs[0]);
+    case wasm::kExprI64x2Splat:
+      return graph()->NewNode(mcgraph()->machine()->I64x2Splat(), inputs[0]);
     case wasm::kExprI32x4Splat:
       return graph()->NewNode(mcgraph()->machine()->I32x4Splat(), inputs[0]);
     case wasm::kExprI32x4SConvertF32x4:
diff --git a/src/wasm/wasm-interpreter.cc b/src/wasm/wasm-interpreter.cc
index 35f769bb28..c6ec2c77ea 100644
--- a/src/wasm/wasm-interpreter.cc
+++ b/src/wasm/wasm-interpreter.cc
@@ -2134,6 +2134,7 @@ class ThreadImpl {
         Push(WasmValue(Simd128(s)));            \
         return true;                            \
       }
+      SPLAT_CASE(I64x2, int2, int64_t, 2)
       SPLAT_CASE(I32x4, int4, int32_t, 4)
       SPLAT_CASE(F32x4, float4, float, 4)
       SPLAT_CASE(I16x8, int8, int32_t, 8)
diff --git a/src/wasm/wasm-opcodes.cc b/src/wasm/wasm-opcodes.cc
index 88b9e90381..3b55561a9b 100644
--- a/src/wasm/wasm-opcodes.cc
+++ b/src/wasm/wasm-opcodes.cc
@@ -25,6 +25,7 @@ namespace wasm {
 #define CASE_REF_OP(name, str) CASE_OP(Ref##name, "ref." str)
 #define CASE_F32x4_OP(name, str) CASE_OP(F32x4##name, "f32x4." str)
 #define CASE_I32x4_OP(name, str) CASE_OP(I32x4##name, "i32x4." str)
+#define CASE_I64x2_OP(name, str) CASE_OP(I64x2##name, "i64x2." str)
 #define CASE_I16x8_OP(name, str) CASE_OP(I16x8##name, "i16x8." str)
 #define CASE_I8x16_OP(name, str) CASE_OP(I8x16##name, "i8x16." str)
 #define CASE_S128_OP(name, str) CASE_OP(S128##name, "s128." str)
@@ -252,6 +253,7 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
     CASE_SIGN_OP(SIMDI, Ge, "ge")
     CASE_SIGN_OP(SIMDI, Shr, "shr")
     CASE_SIMDI_OP(Shl, "shl")
+    CASE_I64x2_OP(Splat, "splat")
     CASE_I32x4_OP(AddHoriz, "add_horizontal")
     CASE_I16x8_OP(AddHoriz, "add_horizontal")
     CASE_SIGN_OP(I16x8, AddSaturate, "add_saturate")
diff --git a/src/wasm/wasm-opcodes.h b/src/wasm/wasm-opcodes.h
index 3f58b72636..ccbe6b3810 100644
--- a/src/wasm/wasm-opcodes.h
+++ b/src/wasm/wasm-opcodes.h
@@ -272,6 +272,7 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, bool hasBigIntFeature);
   V(I8x16Splat, 0xfd04, s_i)    \
   V(I16x8Splat, 0xfd08, s_i)    \
   V(I32x4Splat, 0xfd0c, s_i)    \
+  V(I64x2Splat, 0xfd0f, s_l)    \
   V(F32x4Splat, 0xfd12, s_f)    \
   V(I8x16Eq, 0xfd18, s_ss)      \
   V(I8x16Ne, 0xfd19, s_ss)      \
@@ -560,6 +561,7 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, bool hasBigIntFeature);
   V(s_f, kWasmS128, kWasmF32)              \
   V(s_ss, kWasmS128, kWasmS128, kWasmS128) \
   V(s_i, kWasmS128, kWasmI32)              \
+  V(s_l, kWasmS128, kWasmI64)              \
   V(s_si, kWasmS128, kWasmS128, kWasmI32)  \
   V(i_s, kWasmI32, kWasmS128)              \
   V(v_is, kWasmStmt, kWasmI32, kWasmS128)  \
diff --git a/src/wasm/wasm-value.h b/src/wasm/wasm-value.h
index 8c7571b72f..457b2fefd8 100644
--- a/src/wasm/wasm-value.h
+++ b/src/wasm/wasm-value.h
@@ -17,6 +17,7 @@ namespace wasm {
 
 #define FOREACH_SIMD_TYPE(V)  \
   V(float, float4, f32x4, 4)  \
+  V(int64_t, int2, i64x2, 2)  \
   V(int32_t, int4, i32x4, 4)  \
   V(int16_t, int8, i16x8, 8)  \
   V(int8_t, int16, i8x16, 16)
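Per the opcode table above, i64x2.splat is 0xfd0f with the new s_l signature (one i64 in, one s128 out). Assuming 0xfd is the SIMD prefix byte and 0x0f the sub-opcode, the body of a function that splats its first i64 local would be encoded roughly as:

    #include <cstdint>

    // Hypothetical wire bytes for: (func (param i64) (result v128)
    //                                 local.get 0
    //                                 i64x2.splat)
    const uint8_t kI64x2SplatBody[] = {
        0x20, 0x00,  // local.get 0   -- push the i64 parameter
        0xfd, 0x0f,  // i64x2.splat   -- 0xfd prefix, 0x0f sub-opcode
    };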
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc
index 74a401725e..9325e00978 100644
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@@ -289,6 +289,7 @@ T Sqrt(T a) {
 #define WASM_SIMD_F32x4_REPLACE_LANE(lane, x, y) \
   x, y, WASM_SIMD_OP(kExprF32x4ReplaceLane), TO_BYTE(lane)
 
+#define WASM_SIMD_I64x2_SPLAT(x) WASM_SIMD_SPLAT(I64x2, x)
 #define WASM_SIMD_I32x4_SPLAT(x) WASM_SIMD_SPLAT(I32x4, x)
 #define WASM_SIMD_I32x4_EXTRACT_LANE(lane, x) \
   x, WASM_SIMD_OP(kExprI32x4ExtractLane), TO_BYTE(lane)
@@ -680,6 +681,26 @@ WASM_SIMD_TEST(F32x4Le) {
   RunF32x4CompareOpTest(execution_tier, lower_simd, kExprF32x4Le, LessEqual);
 }
 
+#if V8_TARGET_ARCH_X64
+WASM_SIMD_TEST(I64x2Splat) {
+  WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
+  // Set up a global to hold output vector.
+  int64_t* g = r.builder().AddGlobal<int64_t>(kWasmS128);
+  byte param1 = 0;
+  BUILD(r, WASM_SET_GLOBAL(0, WASM_SIMD_I64x2_SPLAT(WASM_GET_LOCAL(param1))),
+        WASM_ONE);
+
+  FOR_INT64_INPUTS(x) {
+    r.Call(x);
+    int64_t expected = x;
+    for (int i = 0; i < 2; i++) {
+      int64_t actual = ReadLittleEndianValue<int64_t>(&g[i]);
+      CHECK_EQ(actual, expected);
+    }
+  }
+}
+#endif  // V8_TARGET_ARCH_X64
+
 WASM_SIMD_TEST(I32x4Splat) {
   WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
   // Set up a global to hold output vector.
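The test above splats every FOR_INT64_INPUTS value into an s128 global and reads both 8-byte lanes back with ReadLittleEndianValue<int64_t>. A minimal model of that check loop, with memcpy standing in for ReadLittleEndianValue (equivalent only on a little-endian host):

    #include <cstdint>
    #include <cstring>

    // Checks that both 64-bit lanes of the 16-byte global equal `expected`,
    // mirroring the verification loop in the I64x2Splat test.
    bool CheckI64x2SplatResult(const uint8_t (&global)[16], int64_t expected) {
      for (int i = 0; i < 2; i++) {
        int64_t actual;
        std::memcpy(&actual, global + 8 * i, sizeof(actual));  // read lane i
        if (actual != expected) return false;
      }
      return true;
    }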