[x64][wasm-simd] Pattern match 32x4 rotate

Code like:

  x = wasm_v32x4_shuffle(x, x, 1, 2, 3, 0);

is currently matched by S8x16Concat, which lowers to two instructions:

  movapd xmm_dst, xmm_src
  palignr xmm_dst, xmm_src, 0x4

There is a special case after an S8x16Concat is matched:

- is_swizzle, the inputs are the same
- it is a 32x4 shuffle (offset % 4 == 0)

Which can have a better codegen:

- (dst == src) shufps dst, src, 0b00111001
- (dst != src) pshufd dst, src, 0b00111001

Add a new simd shuffle matcher which will match 32x4 rotate, and
construct the appropriate indices referring to the 32x4 elements.

The 32x4Swizzle matching correctly generates pshufd for the given
example. However, this matching happens after
S8x16Concat, so we get the palignr first. We could move the pattern
matching cases around, but it will lead to some cases
where it would have matched an S8x16Concat, but now matches a
S32x4Shuffle instead, leading to worse codegen.

Note: we also pattern match on 32x4Swizzle, which correctly generates
pshufd for this case.
Change-Id: Ie3aca53bbc06826be2cf49632de4c24ec73d0a9a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589062
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71754}
This commit is contained in:
Zhi An Ng 2020-12-14 23:31:33 +00:00 committed by Commit Bot
parent e327fe6944
commit 7c98abdb78
7 changed files with 57 additions and 9 deletions

View File

@ -3906,6 +3906,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kX64S32x4Rotate: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
uint8_t mask = i.InputUint8(1);
if (dst == src) {
// 1-byte shorter encoding than pshufd.
__ Shufps(dst, src, mask);
} else {
__ Pshufd(dst, src, mask);
}
break;
}
case kX64S32x4Swizzle: {
DCHECK_EQ(2, instr->InputCount());
ASSEMBLE_SIMD_IMM_INSTR(Pshufd, i.OutputSimd128Register(), 0,

View File

@ -350,6 +350,7 @@ namespace compiler {
V(X64S128Load32x2U) \
V(X64S128Store32Lane) \
V(X64S128Store64Lane) \
V(X64S32x4Rotate) \
V(X64S32x4Swizzle) \
V(X64S32x4Shuffle) \
V(X64S16x8Blend) \

View File

@ -312,6 +312,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64V16x8AllTrue:
case kX64I8x16Swizzle:
case kX64I8x16Shuffle:
case kX64S32x4Rotate:
case kX64S32x4Swizzle:
case kX64S32x4Shuffle:
case kX64S16x8Blend:

View File

@ -3465,15 +3465,22 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
int index;
const ShuffleEntry* arch_shuffle;
if (wasm::SimdShuffle::TryMatchConcat(shuffle, &offset)) {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = false; // SSE requires same-as-first.
// TODO(v8:9608): also see v8:9083
src1_needs_reg = true;
opcode = kX64S8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
if (wasm::SimdShuffle::TryMatch32x4Rotate(shuffle, shuffle32x4,
is_swizzle)) {
uint8_t shuffle_mask = wasm::SimdShuffle::PackShuffle4(shuffle32x4);
opcode = kX64S32x4Rotate;
imms[imm_count++] = shuffle_mask;
} else {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = false; // SSE requires same-as-first.
// TODO(v8:9608): also see v8:9083
src1_needs_reg = true;
opcode = kX64S8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
}
} else if (TryMatchArchShuffle(shuffle, arch_shuffles,
arraysize(arch_shuffles), is_swizzle,
&arch_shuffle)) {

View File

@ -58,6 +58,25 @@ bool SimdShuffle::TryMatchIdentity(const uint8_t* shuffle) {
return true;
}
// Tries to match an 8x16 byte shuffle to a rotation of the 32-bit lanes of a
// single input (only meaningful when is_swizzle, i.e. both inputs are the
// same). On success, writes the rotated lane indices [r, r+1, r+2, r+3]
// (mod 4) into shuffle32x4 and returns true. Matching this separately from
// the generic concat (palignr) pattern allows better codegen: one
// shufps/pshufd instead of movapd + palignr.
bool SimdShuffle::TryMatch32x4Rotate(const uint8_t* shuffle,
                                     uint8_t* shuffle32x4, bool is_swizzle) {
  uint8_t offset;
  bool is_concat = TryMatchConcat(shuffle, &offset);
  // Bail out unless this is a self-concat whose byte offset points at the low
  // byte of a 32x4 element. Note that `offset` is only written when
  // TryMatchConcat succeeds, so is_concat must be checked first — reading
  // `offset` otherwise would be reading an uninitialized value.
  if (!is_concat || !is_swizzle || offset % 4 != 0) {
    return false;
  }
  // Offset 0 is the identity shuffle; TryMatchConcat should never match it.
  DCHECK_NE(offset, 0);
  // Since we already have a concat shuffle, the byte indices go
  // [offset, ..., 15, 0, ...]; translate the byte offset into a 32-bit lane
  // rotation and emit the rotated lane indices.
  uint8_t offset_32 = offset / 4;
  for (int i = 0; i < 4; i++) {
    shuffle32x4[i] = (offset_32 + i) % 4;
  }
  return true;
}
bool SimdShuffle::TryMatch32x4Shuffle(const uint8_t* shuffle,
uint8_t* shuffle32x4) {
for (int i = 0; i < 4; ++i) {

View File

@ -51,6 +51,12 @@ class V8_EXPORT_PRIVATE SimdShuffle {
return true;
}
// Tries to match a 32x4 rotate, only makes sense if the inputs are equal
// (is_swizzle). A rotation is a shuffle like [1, 2, 3, 0]. This will always
// match a Concat, but can have better codegen.
static bool TryMatch32x4Rotate(const uint8_t* shuffle, uint8_t* shuffle32x4,
bool is_swizzle);
// Tries to match an 8x16 byte shuffle to an equivalent 32x4 shuffle. If
// successful, it writes the 32x4 shuffle word indices. E.g.
// [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15] == [0 2 1 3]

View File

@ -2951,6 +2951,7 @@ void RunShuffleOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
V(S32x4TransposeRight) \
V(S32x2Reverse) \
V(S32x4Irregular) \
V(S32x4Rotate) \
V(S16x8Dup) \
V(S16x8ZipLeft) \
V(S16x8ZipRight) \
@ -3003,6 +3004,7 @@ ShuffleMap test_shuffles = {
{{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}}},
{kS32x4Irregular,
{{0, 1, 2, 3, 16, 17, 18, 19, 16, 17, 18, 19, 20, 21, 22, 23}}},
{kS32x4Rotate, {{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3}}},
{kS16x8Dup,
{{18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19}}},
{kS16x8ZipLeft, {{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}}},