[wasm-simd][x64][ia32] Optimize swizzle with constant indices

When swizzle is called with a v128.const node as its indices operand, we can
check at compile time that every index byte is either in bounds or, if it is
out of bounds, already has its top bit set. Such indices match pshufb's
behavior exactly, so we can omit the paddusb (and the load of the external
reference it needs).

Bug: v8:10992
Change-Id: I5479a9eb92ebcfc12bedff5efd3e72bb4a43ff40
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2766222
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Georg Neis <neis@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73583}
This commit is contained in:
Ng Zhi An 2021-03-19 09:50:27 -07:00 committed by Commit Bot
parent 656f35ab6c
commit 8c9213a191
15 changed files with 158 additions and 8 deletions

View File

@ -1272,7 +1272,12 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
XMMRegister mask, XMMRegister scratch,
Register tmp) {
Register tmp, bool omit_add) {
if (omit_add) {
  // The instruction selector has established that every index byte in |mask|
  // is either in range or already has its top bit set, so pshufb can consume
  // |mask| directly. |scratch| has not been written at this point and must
  // not be used as the shuffle control.
  Pshufb(dst, src, mask);
  return;
}
// Out-of-range indices should return 0, add 112 so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
Operand op = ExternalReferenceAsOperand(

View File

@ -758,7 +758,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp);
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
XMMRegister scratch, Register tmp);
XMMRegister scratch, Register tmp, bool omit_add = false);
void Push(Register src) { push(src); }
void Push(Operand src) { push(src); }

View File

@ -2701,7 +2701,14 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
}
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
XMMRegister mask) {
XMMRegister mask, bool omit_add) {
if (omit_add) {
  // We have determined that the indices are immediates, and they are either
  // within bounds, or the top bit is set, so we can omit the add and shuffle
  // by |mask| directly. kScratchDoubleReg has not been loaded on this path,
  // so it must not be used as the shuffle control.
  Pshufb(dst, src, mask);
  return;
}
// Out-of-range indices should return 0, add 112 so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
Operand op = ExternalReferenceAsOperand(

View File

@ -645,7 +645,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask);
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
bool omit_add = false);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);

View File

@ -3624,7 +3624,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32I8x16Swizzle: {
__ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
i.TempRegister(0));
i.TempRegister(0), MiscField::decode(instr->opcode()));
break;
}
case kIA32I8x16Shuffle: {

View File

@ -3002,9 +3002,20 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) { UNREACHABLE(); }
#endif // V8_ENABLE_WEBASSEMBLY
void InstructionSelector::VisitI8x16Swizzle(Node* node) {
InstructionCode op = kIA32I8x16Swizzle;
auto m = V128ConstMatcher(node->InputAt(1));
if (m.HasResolvedValue()) {
// If the indices vector is a const, check if they are in range, or if the
// top bit is set, then we can avoid the paddusb in the codegen and simply
// emit a pshufb.
auto imms = m.ResolvedValue().immediate();
op |= MiscField::encode(wasm::SimdSwizzle::AllInRangeOrTopBitSet(imms));
}
IA32OperandGenerator g(this);
InstructionOperand temps[] = {g.TempRegister()};
Emit(kIA32I8x16Swizzle,
Emit(op,
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
arraysize(temps), temps);

View File

@ -860,6 +860,7 @@ class V8_EXPORT_PRIVATE Instruction final {
FlagsCondition flags_condition() const {
return FlagsConditionField::decode(opcode());
}
int misc() const { return MiscField::decode(opcode()); }
static Instruction* New(Zone* zone, InstructionCode opcode) {
return New(zone, opcode, 0, nullptr, 0, nullptr, 0, nullptr);

View File

@ -15,6 +15,7 @@
#include "src/compiler/backend/code-generator-impl.h"
#include "src/compiler/backend/code-generator.h"
#include "src/compiler/backend/gap-resolver.h"
#include "src/compiler/backend/instruction-codes.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/osr.h"
#include "src/heap/memory-chunk.h"
@ -3691,8 +3692,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I8x16Swizzle: {
bool omit_add = MiscField::decode(instr->opcode());
__ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
i.InputSimd128Register(1), omit_add);
break;
}
case kX64I8x16Shuffle: {

View File

@ -3544,12 +3544,27 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
void InstructionSelector::VisitI8x16Shuffle(Node* node) { UNREACHABLE(); }
#endif // V8_ENABLE_WEBASSEMBLY
#if V8_ENABLE_WEBASSEMBLY
// Selects the x64 I8x16Swizzle instruction for |node|. Input 0 is the vector
// being swizzled and input 1 holds the indices. When input 1 is an S128
// constant, the result of SimdSwizzle::AllInRangeOrTopBitSet on its bytes is
// encoded into the opcode's MiscField so the code generator can skip the
// paddusb.
void InstructionSelector::VisitI8x16Swizzle(Node* node) {
InstructionCode op = kX64I8x16Swizzle;
auto m = V128ConstMatcher(node->InputAt(1));
if (m.HasResolvedValue()) {
// If the indices vector is a const, check if they are in range, or if the
// top bit is set, then we can avoid the paddusb in the codegen and simply
// emit a pshufb.
auto imms = m.ResolvedValue().immediate();
op |= MiscField::encode(wasm::SimdSwizzle::AllInRangeOrTopBitSet(imms));
}
X64OperandGenerator g(this);
Emit(kX64I8x16Swizzle,
Emit(op,
// With AVX the output may use any register; otherwise it is constrained to
// be the same register as the first input.
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
}
#else
void InstructionSelector::VisitI8x16Swizzle(Node* node) { UNREACHABLE(); }
#endif // V8_ENABLE_WEBASSEMBLY
namespace {
void VisitPminOrPmax(InstructionSelector* selector, Node* node,

View File

@ -13,6 +13,7 @@
#include "src/codegen/external-reference.h"
#include "src/common/globals.h"
#include "src/compiler/common-operator.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node.h"
#include "src/compiler/operator.h"
#include "src/numbers/double.h"
@ -169,6 +170,8 @@ using Int32Matcher = IntMatcher<int32_t, IrOpcode::kInt32Constant>;
using Uint32Matcher = IntMatcher<uint32_t, IrOpcode::kInt32Constant>;
using Int64Matcher = IntMatcher<int64_t, IrOpcode::kInt64Constant>;
using Uint64Matcher = IntMatcher<uint64_t, IrOpcode::kInt64Constant>;
// Matches IrOpcode::kS128Const nodes; the resolved value is the node's
// S128ImmediateParameter.
using V128ConstMatcher =
ValueMatcher<S128ImmediateParameter, IrOpcode::kS128Const>;
#if V8_HOST_ARCH_32_BIT
using IntPtrMatcher = Int32Matcher;
using UintPtrMatcher = Uint32Matcher;

View File

@ -864,6 +864,9 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
}
// SIMD operations.
// Adds a node for the S128Const machine operator built from the 16 bytes in
// |value|.
Node* S128Const(const uint8_t value[16]) {
  const auto* const_op = machine()->S128Const(value);
  return AddNode(const_op);
}
Node* I64x2Splat(Node* a) { return AddNode(machine()->I64x2Splat(), a); }
Node* I64x2SplatI32Pair(Node* a, Node* b) {
return AddNode(machine()->I64x2SplatI32Pair(), a, b);

View File

@ -161,6 +161,12 @@ void SimdShuffle::Pack16Lanes(uint32_t* dst, const uint8_t* shuffle) {
}
}
bool SimdSwizzle::AllInRangeOrTopBitSet(
std::array<uint8_t, kSimd128Size> shuffle) {
return std::all_of(shuffle.begin(), shuffle.end(),
[](auto i) { return (i < kSimd128Size) || (i & 0x80); });
}
} // namespace wasm
} // namespace internal
} // namespace v8

View File

@ -95,6 +95,14 @@ class V8_EXPORT_PRIVATE SimdShuffle {
// Packs 16 bytes of shuffle into an array of 4 uint32_t.
static void Pack16Lanes(uint32_t* dst, const uint8_t* shuffle);
};
// Static helpers for analyzing constant swizzle index vectors.
class V8_EXPORT_PRIVATE SimdSwizzle {
public:
// Returns true if every byte of |shuffle| is either in range
// (< kSimd128Size) or, when it is out of range, has its top bit set.
static bool AllInRangeOrTopBitSet(std::array<uint8_t, kSimd128Size> shuffle);
};
} // namespace wasm
} // namespace internal
} // namespace v8

View File

@ -886,6 +886,50 @@ TEST_F(InstructionSelectorTest, SIMDSplatZero) {
}
}
// One parameterized test case for swizzle with constant indices.
struct SwizzleConstants {
// Bytes used to build the S128 constant passed as the swizzle indices.
uint8_t shuffle[kSimd128Size];
// Expected misc() value of the selected swizzle instruction.
bool omit_add;
};
// Test cases: index vectors paired with whether the add is expected to be
// omitted.
static constexpr SwizzleConstants kSwizzleConstants[] = {
{
// all lanes < kSimd128Size
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
true,
},
{
// lanes that are >= kSimd128Size have top bit set
{12, 13, 14, 15, 0x90, 0x91, 0x92, 0x93, 0xA0, 0xA1, 0xA2, 0xA3, 0xFC,
0xFD, 0xFE, 0xFF},
true,
},
{
// some lanes are >= kSimd128Size without the top bit set, so the add
// cannot be omitted
{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
false,
},
};
using InstructionSelectorSIMDSwizzleConstantTest =
InstructionSelectorTestWithParam<SwizzleConstants>;
// Checks that a swizzle whose indices are an S128 constant selects
// kIA32I8x16Swizzle, with misc() matching the expected omit_add flag.
TEST_P(InstructionSelectorSIMDSwizzleConstantTest, SimdSwizzleConstant) {
  const SwizzleConstants test_case = GetParam();
  StreamBuilder builder(this, MachineType::Simd128(), MachineType::Simd128());
  Node* const indices = builder.S128Const(test_case.shuffle);
  Node* const input = builder.Parameter(0);
  builder.Return(
      builder.AddNode(builder.machine()->I8x16Swizzle(), input, indices));
  Stream stream = builder.Build();

  // Two instructions are expected; index 1 is the swizzle.
  ASSERT_EQ(2U, stream.size());
  ASSERT_EQ(kIA32I8x16Swizzle, stream[1]->arch_opcode());
  ASSERT_EQ(test_case.omit_add, stream[1]->misc());
  ASSERT_EQ(1U, stream[0]->OutputCount());
}
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDSwizzleConstantTest,
::testing::ValuesIn(kSwizzleConstants));
} // namespace compiler
} // namespace internal
} // namespace v8

View File

@ -2204,6 +2204,50 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
::testing::ValuesIn(kArchShuffles));
#endif // V8_ENABLE_WEBASSEMBLY
// One parameterized test case for swizzle with constant indices.
struct SwizzleConstants {
// Bytes used to build the S128 constant passed as the swizzle indices.
uint8_t shuffle[kSimd128Size];
// Expected misc() value of the selected swizzle instruction.
bool omit_add;
};
// Test cases: index vectors paired with whether the add is expected to be
// omitted.
static constexpr SwizzleConstants kSwizzleConstants[] = {
{
// all lanes < kSimd128Size
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
true,
},
{
// lanes that are >= kSimd128Size have top bit set
{12, 13, 14, 15, 0x90, 0x91, 0x92, 0x93, 0xA0, 0xA1, 0xA2, 0xA3, 0xFC,
0xFD, 0xFE, 0xFF},
true,
},
{
// some lanes are >= kSimd128Size without the top bit set, so the add
// cannot be omitted
{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
false,
},
};
using InstructionSelectorSIMDSwizzleConstantTest =
InstructionSelectorTestWithParam<SwizzleConstants>;
// Checks that a swizzle whose indices are an S128 constant selects
// kX64I8x16Swizzle, with misc() matching the expected omit_add flag.
TEST_P(InstructionSelectorSIMDSwizzleConstantTest, SimdSwizzleConstant) {
  const SwizzleConstants test_case = GetParam();
  StreamBuilder builder(this, MachineType::Simd128(), MachineType::Simd128());
  Node* const indices = builder.S128Const(test_case.shuffle);
  Node* const input = builder.Parameter(0);
  builder.Return(
      builder.AddNode(builder.machine()->I8x16Swizzle(), input, indices));
  Stream stream = builder.Build();

  // Two instructions are expected; index 1 is the swizzle.
  ASSERT_EQ(2U, stream.size());
  ASSERT_EQ(kX64I8x16Swizzle, stream[1]->arch_opcode());
  ASSERT_EQ(test_case.omit_add, stream[1]->misc());
  ASSERT_EQ(1U, stream[0]->OutputCount());
}
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDSwizzleConstantTest,
::testing::ValuesIn(kSwizzleConstants));
} // namespace compiler
} // namespace internal
} // namespace v8