[wasm-simd] Implement v8x16.swizzle for x64

Bug: v8:8460 Change-Id: I79ae753f15aaa91a2154bd7078a1cdb9f3e049f1 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1822497 Reviewed-by: Michael Starzinger <mstarzinger@chromium.org> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#64201}
2019-10-09 10:25:33 -07:00 · 2019-10-09 10:25:33 -07:00 · 3fdc88defb
commit 3fdc88defb
parent c4d90a74e4
16 changed files with 157 additions and 0 deletions
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@ -1822,6 +1822,16 @@ void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
  }
 }

+void TurboAssembler::Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
+  if (!CpuFeatures::IsSupported(AVX)) {  // No AVX: use the plain encoding.
+    DCHECK(!IsEnabled(AVX));
+    pshufd(dst, src, shuffle);
+    return;
+  }
+  CpuFeatureScope scope(this, AVX);  // AVX available: emit the v-prefixed form.
+  vpshufd(dst, src, shuffle);
+}
+
 void TurboAssembler::Lzcntl(Register dst, Register src) {
  if (CpuFeatures::IsSupported(LZCNT)) {
    CpuFeatureScope scope(this, LZCNT);
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@ -154,6 +154,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  AVX_OP(Sqrtsd, sqrtsd)
  AVX_OP(Ucomiss, ucomiss)
  AVX_OP(Ucomisd, ucomisd)
+  AVX_OP(Pshufb, pshufb)
+  AVX_OP(Paddusb, paddusb)

 #undef AVX_OP

@ -375,6 +377,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  void Pslld(XMMRegister dst, byte imm8);
  void Psrld(XMMRegister dst, byte imm8);

+  void Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle);
+
  void CompareRoot(Register with, RootIndex index);
  void CompareRoot(Operand with, RootIndex index);

--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@ -2151,6 +2151,8 @@ void InstructionSelector::VisitNode(Node* node) {
      return MarkAsSimd128(node), VisitS128Not(node);
    case IrOpcode::kS128Select:
      return MarkAsSimd128(node), VisitS128Select(node);
+    case IrOpcode::kS8x16Swizzle:
+      return MarkAsSimd128(node), VisitS8x16Swizzle(node);
    case IrOpcode::kS8x16Shuffle:
      return MarkAsSimd128(node), VisitS8x16Shuffle(node);
    case IrOpcode::kS1x2AnyTrue:
@ -2666,6 +2668,7 @@ void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitS8x16Swizzle(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_X64

 void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -3580,6 +3580,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ xorps(dst, i.InputSimd128Register(2));
      break;
    }
+    case kX64S8x16Swizzle: {
+      CpuFeatureScope sse_scope(tasm(), SSSE3);
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister mask = i.TempSimd128Register(0);

+      // Out-of-range indices should return 0: add 112 (saturating) so any index
+      // > 15 ends up >= 128 (top bit set), and pshufb will zero that lane.
+      __ Move(mask, static_cast<uint32_t>(0x70707070));  // 0x70 == 112 per byte.
+      __ Pshufd(mask, mask, 0x0);
+      __ Paddusb(mask, i.InputSimd128Register(1));  // mask = indices + 112.
+      __ Pshufb(dst, mask);
+      break;
+    }
    case kX64S8x16Shuffle: {
      XMMRegister dst = i.OutputSimd128Register();
      Register tmp = i.TempRegister(0);
--- a/src/compiler/backend/x64/instruction-codes-x64.h
+++ b/src/compiler/backend/x64/instruction-codes-x64.h
@ -306,6 +306,7 @@ namespace compiler {
  V(X64S128Or)                            \
  V(X64S128Xor)                           \
  V(X64S128Select)                        \
+  V(X64S8x16Swizzle)                      \
  V(X64S8x16Shuffle)                      \
  V(X64S32x4Swizzle)                      \
  V(X64S32x4Shuffle)                      \
--- a/src/compiler/backend/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/backend/x64/instruction-scheduler-x64.cc
@ -281,6 +281,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64S1x4AllTrue:
    case kX64S1x8AnyTrue:
    case kX64S1x8AllTrue:
+    case kX64S8x16Swizzle:
    case kX64S8x16Shuffle:
    case kX64S32x4Swizzle:
    case kX64S32x4Shuffle:
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@ -3306,6 +3306,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
  Emit(opcode, 1, &dst, input_count, inputs, temp_count, temps);
 }

+void InstructionSelector::VisitS8x16Swizzle(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand temps[] = {g.TempSimd128Register()};  // Index mask temp.
+  Emit(kX64S8x16Swizzle, g.DefineSameAsFirst(node),  // Output reuses input 0.
+       g.UseRegister(node->InputAt(0)), g.UseUniqueRegister(node->InputAt(1)),
+       arraysize(temps), temps);
+}
+
 // static
 MachineOperatorBuilder::Flags
 InstructionSelector::SupportedMachineOperatorFlags() {
--- a/src/compiler/machine-operator.cc
+++ b/src/compiler/machine-operator.cc
@ -402,6 +402,7 @@ MachineType AtomicOpType(Operator const* op) {
  V(S1x8AllTrue, Operator::kNoProperties, 1, 0, 1)                            \
  V(S1x16AnyTrue, Operator::kNoProperties, 1, 0, 1)                           \
  V(S1x16AllTrue, Operator::kNoProperties, 1, 0, 1)                           \
+  V(S8x16Swizzle, Operator::kNoProperties, 2, 0, 1)                           \
  V(StackPointerGreaterThan, Operator::kNoProperties, 1, 0, 1)

 // The format is:
--- a/src/compiler/machine-operator.h
+++ b/src/compiler/machine-operator.h
@ -644,6 +644,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
  const Operator* S128Not();
  const Operator* S128Select();

+  const Operator* S8x16Swizzle();
  const Operator* S8x16Shuffle(const uint8_t shuffle[16]);

  const Operator* S1x2AnyTrue();
--- a/src/compiler/opcodes.h
+++ b/src/compiler/opcodes.h
@ -916,6 +916,7 @@
  V(S128Or)                     \
  V(S128Xor)                    \
  V(S128Select)                 \
+  V(S8x16Swizzle)               \
  V(S8x16Shuffle)               \
  V(S1x2AnyTrue)                \
  V(S1x2AllTrue)                \
--- a/src/compiler/simd-scalar-lowering.cc
+++ b/src/compiler/simd-scalar-lowering.cc
@ -211,6 +211,7 @@ void SimdScalarLowering::LowerGraph() {
  V(I8x16LeS)                     \
  V(I8x16LtU)                     \
  V(I8x16LeU)                     \
+  V(S8x16Swizzle)                 \
  V(S8x16Shuffle)

 MachineType SimdScalarLowering::MachineTypeFrom(SimdType simdType) {
@ -1392,6 +1393,45 @@ void SimdScalarLowering::LowerNode(Node* node) {
      ReplaceNode(node, rep_node, num_lanes);
      break;
    }
+    case IrOpcode::kS8x16Swizzle: {
+      DCHECK_EQ(2, node->InputCount());
+      Node** rep_left = GetReplacementsWithType(node->InputAt(0), rep_type);
+      Node** indices = GetReplacementsWithType(node->InputAt(1), rep_type);
+      Node** rep_nodes = zone()->NewArray<Node*>(num_lanes);
+      Node* stack_slot = graph()->NewNode(
+          machine()->StackSlot(MachineRepresentation::kSimd128));

+      // Spill every lane of the first input to the stack slot, one byte each.
+      const Operator* store_op = machine()->Store(
+          StoreRepresentation(MachineRepresentation::kWord8, kNoWriteBarrier));
+      Node* effect_input = graph()->start();
+      for (int i = num_lanes - 1; i >= 0; i--) {
+        // All stores must happen before any of the loads below, so chain them
+        // through the effect edge: each store depends on the one before it.
+        Node* store =
+            graph()->NewNode(store_op, stack_slot, mcgraph_->Int32Constant(i),
+                             rep_left[i], effect_input, graph()->start());
+        effect_input = store;
+      }

+      for (int i = num_lanes - 1; i >= 0; i--) {
+        // Only load a lane when its index is < num_lanes, otherwise the lane
+        // is 0. The unsigned compare also rejects negative indices.
+        Diamond d(graph(), common(),
+                  graph()->NewNode(machine()->Uint32LessThan(), indices[i],
+                                   mcgraph_->Int32Constant(num_lanes)));

+        Node* load =
+            graph()->NewNode(machine()->Load(LoadRepresentation::Uint8()),
+                             stack_slot, indices[i], effect_input, d.if_true);

+        rep_nodes[i] = d.Phi(MachineRepresentation::kWord8, load,
+                             mcgraph_->Int32Constant(0));
+      }

+      ReplaceNode(node, rep_nodes, num_lanes);
+      break;
+    }
    case IrOpcode::kS8x16Shuffle: {
      DCHECK_EQ(2, node->InputCount());
      const uint8_t* shuffle = S8x16ShuffleOf(node->op());
--- a/src/compiler/wasm-compiler.cc
+++ b/src/compiler/wasm-compiler.cc
@ -4472,6 +4472,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
      return graph()->NewNode(mcgraph()->machine()->S1x16AnyTrue(), inputs[0]);
    case wasm::kExprS1x16AllTrue:
      return graph()->NewNode(mcgraph()->machine()->S1x16AllTrue(), inputs[0]);
+    case wasm::kExprS8x16Swizzle:
+      return graph()->NewNode(mcgraph()->machine()->S8x16Swizzle(), inputs[0],
+                              inputs[1]);
    default:
      FATAL_UNSUPPORTED_OPCODE(opcode);
  }
--- a/src/wasm/wasm-interpreter.cc
+++ b/src/wasm/wasm-interpreter.cc
@ -2629,6 +2629,18 @@ class ThreadImpl {
        ADD_HORIZ_CASE(F32x4AddHoriz, f32x4, float4, 4)
        ADD_HORIZ_CASE(I16x8AddHoriz, i16x8, int8, 8)
 #undef ADD_HORIZ_CASE
+      case kExprS8x16Swizzle: {
+        int16 indices = Pop().to_s128().to_i8x16();  // Lane selectors.
+        int16 src = Pop().to_s128().to_i8x16();      // Bytes to select from.
+        int16 result;
+        for (size_t i = 0; i < kSimd128Size; ++i) {
+          int idx = indices.val[LANE(i, src)];
+          // Selectors outside [0, kSimd128Size) produce a zero byte.
+          bool in_range = idx >= 0 && idx < kSimd128Size;
+          result.val[LANE(i, src)] = in_range ? src.val[LANE(idx, src)] : 0;
+        }
+        Push(WasmValue(Simd128(result)));
+        return true;
+      }
      case kExprS8x16Shuffle: {
        Simd8x16ShuffleImmediate<Decoder::kNoValidate> imm(decoder,
                                                           code->at(pc));
--- a/src/wasm/wasm-opcodes.cc
+++ b/src/wasm/wasm-opcodes.cc
@ -306,6 +306,7 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
    CASE_S128_OP(Xor, "xor")
    CASE_S128_OP(Not, "not")
    CASE_S128_OP(Select, "select")
+    CASE_S8x16_OP(Swizzle, "swizzle")
    CASE_S8x16_OP(Shuffle, "shuffle")
    CASE_S1x2_OP(AnyTrue, "any_true")
    CASE_S1x2_OP(AllTrue, "all_true")
--- a/src/wasm/wasm-opcodes.h
+++ b/src/wasm/wasm-opcodes.h
@ -420,6 +420,7 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
  V(I32x4UConvertF32x4, 0xfdac, s_s)     \
  V(F32x4SConvertI32x4, 0xfdaf, s_s)     \
  V(F32x4UConvertI32x4, 0xfdb0, s_s)     \
+  V(S8x16Swizzle, 0xfdc0, s_ss)          \
  V(I8x16SConvertI16x8, 0xfdc6, s_ss)    \
  V(I8x16UConvertI16x8, 0xfdc7, s_ss)    \
  V(I16x8SConvertI32x4, 0xfdc8, s_ss)    \
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -2687,6 +2687,62 @@ WASM_SIMD_TEST(S8x16Concat) {
  }
 }

+#ifdef V8_TARGET_ARCH_X64
+struct SwizzleTestArgs {
+  const Shuffle input;     // Bytes the swizzle selects from.
+  const Shuffle indices;   // Per-lane selector into |input|.
+  const Shuffle expected;  // Expected result for |input| and |indices|.
+};
+
+static constexpr SwizzleTestArgs swizzle_test_args[] = {
+    {{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+     {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},  // reversal
+     {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
+    {{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+     {15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7},  // interleaved
+     {0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8}},
+    {{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+     {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
+     {15, 13, 11, 9, 7, 5, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0}},  // idx >= 16 -> 0
+    // All indices are out of range (too large or negative), so every lane is 0.
+    {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+     {16, 17, 18, 19, 20, 124, 125, 126, 127, -1, -2, -3, -4, -5, -6, -7},
+     {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}};

+static constexpr Vector<const SwizzleTestArgs> swizzle_test_vector =
+    ArrayVector(swizzle_test_args);
+
+WASM_SIMD_TEST(S8x16Swizzle) {
+  // RunBinaryLaneOpTest sets up the two globals as consecutive integers,
+  // [0-15] and [16-31]. Using [0-15] as the indices would not sufficiently
+  // test swizzle since the result is a no-op, and using [16-31] results in all
+  // 0s, so this test feeds the hand-written vectors in swizzle_test_args.
+  WasmRunner<int32_t> r(execution_tier, lower_simd);
+  static const int kElems = kSimd128Size / sizeof(uint8_t);
+  uint8_t* dst = r.builder().AddGlobal<uint8_t>(kWasmS128);   // global 0
+  uint8_t* src0 = r.builder().AddGlobal<uint8_t>(kWasmS128);  // global 1
+  uint8_t* src1 = r.builder().AddGlobal<uint8_t>(kWasmS128);  // global 2
+  BUILD(
+      r,
+      WASM_SET_GLOBAL(0, WASM_SIMD_BINOP(kExprS8x16Swizzle, WASM_GET_GLOBAL(1),
+                                         WASM_GET_GLOBAL(2))),
+      WASM_ONE);

+  for (const SwizzleTestArgs& si : swizzle_test_vector) {
+    for (int i = 0; i < kElems; i++) {
+      WriteLittleEndianValue<uint8_t>(&src0[i], si.input[i]);
+      WriteLittleEndianValue<uint8_t>(&src1[i], si.indices[i]);
+    }

+    CHECK_EQ(1, r.Call());

+    for (int i = 0; i < kElems; i++) {
+      CHECK_EQ(ReadLittleEndianValue<uint8_t>(&dst[i]), si.expected[i]);
+    }
+  }
+}
+#endif  // V8_TARGET_ARCH_X64
+
 // Combine 3 shuffles a, b, and c by applying both a and b and then applying c
 // to those two results.
 Shuffle Combine(const Shuffle& a, const Shuffle& b, const Shuffle& c) {