[x64][wasm-simd] Pattern match 32x4 rotate

Code like:

  x = wasm_v32x4_shuffle(x, x, 1, 2, 3, 0);

is currently matched by S8x16Concat, which lowers to two instructions:

  movapd xmm_dst, xmm_src
  palignr xmm_dst, xmm_src, 0x4

There is a special case after a S8x16Concat is matched:

- is_swizzle, the inputs are the same
- it is a 32x4 shuffle (offset % 4 == 0)

Which can have a better codegen:

- (dst == src) shufps dst, src, 0b00111001
- (dst != src) pshufd dst, src, 0b00111001

Add a new simd shuffle matcher which will match 32x4 rotate, and
construct the appropriate indices referring to the 32x4 elements.

Note: we also pattern match on 32x4Swizzle, which correctly generates
pshufd for the given example. However, this matching happens after
S8x16Concat, so we get the palignr first. We could move the pattern
matching cases around, but it will lead to some cases where
it would have matched a S8x16Concat, but now matches a
S32x4shuffle instead, leading to worse codegen.

Change-Id: Ie3aca53bbc06826be2cf49632de4c24ec73d0a9a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589062
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71754}
This commit is contained in:
Zhi An Ng 2020-12-14 23:31:33 +00:00 committed by Commit Bot
parent e327fe6944
commit 7c98abdb78
7 changed files with 57 additions and 9 deletions

View File

@ -3906,6 +3906,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kX64S32x4Rotate: {
  // Single-input 32x4 lane rotate; input 1 is the packed 8-bit lane mask.
  const XMMRegister result_reg = i.OutputSimd128Register();
  const XMMRegister input_reg = i.InputSimd128Register(0);
  const uint8_t lane_mask = i.InputUint8(1);
  if (result_reg == input_reg) {
    // In-place case: shufps has a 1-byte shorter encoding than pshufd.
    __ Shufps(result_reg, input_reg, lane_mask);
  } else {
    // Distinct registers: pshufd writes the destination without a prior move.
    __ Pshufd(result_reg, input_reg, lane_mask);
  }
  break;
}
case kX64S32x4Swizzle: {
DCHECK_EQ(2, instr->InputCount());
ASSEMBLE_SIMD_IMM_INSTR(Pshufd, i.OutputSimd128Register(), 0,

View File

@ -350,6 +350,7 @@ namespace compiler {
V(X64S128Load32x2U) \
V(X64S128Store32Lane) \
V(X64S128Store64Lane) \
V(X64S32x4Rotate) \
V(X64S32x4Swizzle) \
V(X64S32x4Shuffle) \
V(X64S16x8Blend) \

View File

@ -312,6 +312,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64V16x8AllTrue:
case kX64I8x16Swizzle:
case kX64I8x16Shuffle:
case kX64S32x4Rotate:
case kX64S32x4Swizzle:
case kX64S32x4Shuffle:
case kX64S16x8Blend:

View File

@ -3465,6 +3465,12 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
int index;
const ShuffleEntry* arch_shuffle;
if (wasm::SimdShuffle::TryMatchConcat(shuffle, &offset)) {
if (wasm::SimdShuffle::TryMatch32x4Rotate(shuffle, shuffle32x4,
is_swizzle)) {
uint8_t shuffle_mask = wasm::SimdShuffle::PackShuffle4(shuffle32x4);
opcode = kX64S32x4Rotate;
imms[imm_count++] = shuffle_mask;
} else {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case.
@ -3474,6 +3480,7 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
opcode = kX64S8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
}
} else if (TryMatchArchShuffle(shuffle, arch_shuffles,
arraysize(arch_shuffles), is_swizzle,
&arch_shuffle)) {

View File

@ -58,6 +58,25 @@ bool SimdShuffle::TryMatchIdentity(const uint8_t* shuffle) {
return true;
}
// Tries to match a byte shuffle that rotates the four 32-bit lanes of a single
// input (e.g. [1, 2, 3, 0] in 32x4 lane indices). On success, writes the four
// 32x4 lane indices to |shuffle32x4| and returns true. Returns false if the
// shuffle is not a concat, if |is_swizzle| is false, or if the concat offset is
// not a multiple of 4 bytes; |shuffle32x4| is left untouched in that case.
bool SimdShuffle::TryMatch32x4Rotate(const uint8_t* shuffle,
                                     uint8_t* shuffle32x4, bool is_swizzle) {
  // Initialized so that it is never read while indeterminate, should
  // TryMatchConcat leave it unwritten on a failed match.
  uint8_t offset = 0;
  // Bail out before inspecting |offset|: it is only meaningful once the
  // concat match has succeeded.
  if (!TryMatchConcat(shuffle, &offset)) {
    return false;
  }
  DCHECK_NE(offset, 0);  // 0 is identity, it should not be matched.
  // Since we already have a concat shuffle, we know that the indices go from:
  // [ offset, ..., 15, 0, ... ], it suffices to check that the offset points
  // to the low byte of a 32x4 element.
  if (!is_swizzle || offset % 4 != 0) {
    return false;
  }
  // Convert the byte offset to a 32x4 lane offset and emit the rotated lane
  // indices, wrapping around after lane 3.
  const uint8_t offset_32 = offset / 4;
  for (int i = 0; i < 4; i++) {
    shuffle32x4[i] = (offset_32 + i) % 4;
  }
  return true;
}
bool SimdShuffle::TryMatch32x4Shuffle(const uint8_t* shuffle,
uint8_t* shuffle32x4) {
for (int i = 0; i < 4; ++i) {

View File

@ -51,6 +51,12 @@ class V8_EXPORT_PRIVATE SimdShuffle {
return true;
}
// Tries to match a 32x4 rotate, i.e. a shuffle like [1, 2, 3, 0] in 32x4 lane
// indices. This only makes sense if the inputs are equal, so it returns false
// unless is_swizzle is true. A shuffle that matches here will always match a
// Concat as well, but a rotate can have better codegen. On success, writes
// the four 32x4 lane indices to shuffle32x4 and returns true.
static bool TryMatch32x4Rotate(const uint8_t* shuffle, uint8_t* shuffle32x4,
bool is_swizzle);
// Tries to match an 8x16 byte shuffle to an equivalent 32x4 shuffle. If
// successful, it writes the 32x4 shuffle word indices. E.g.
// [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15] == [0 2 1 3]

View File

@ -2951,6 +2951,7 @@ void RunShuffleOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
V(S32x4TransposeRight) \
V(S32x2Reverse) \
V(S32x4Irregular) \
V(S32x4Rotate) \
V(S16x8Dup) \
V(S16x8ZipLeft) \
V(S16x8ZipRight) \
@ -3003,6 +3004,7 @@ ShuffleMap test_shuffles = {
{{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}}},
{kS32x4Irregular,
{{0, 1, 2, 3, 16, 17, 18, 19, 16, 17, 18, 19, 20, 21, 22, 23}}},
{kS32x4Rotate, {{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3}}},
{kS16x8Dup,
{{18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19}}},
{kS16x8ZipLeft, {{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}}},