[wasm-relaxed-simd][x64] Prototype relaxed lane selects

Adds 4 instructions: i8x16, i16x8, i32x4 and i64x2 relaxed lane select.
These instructions only guarantee results when every mask lane is entirely
set or entirely unset, so vpblendvb will give correct results for all of
them.

Bug: v8:12284
Change-Id: I76959a23f2d97de8ecc3bef43d138184484e3c4d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3207006
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77401}
This commit is contained in:
Ng Zhi An 2021-10-11 15:57:47 -07:00 committed by V8 LUCI CQ
parent 33634d76ec
commit ee3b4eadab
15 changed files with 218 additions and 9 deletions

View File

@ -99,6 +99,19 @@ void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
}
}
// Emits a variable byte blend of |src1| and |src2| controlled by |mask|.
//
// With AVX the four-operand vpblendvb is emitted and no register constraints
// are checked. Otherwise the two-operand SSE4.1 pblendvb is used: the mask is
// an implicit xmm0 operand and the destination doubles as the first source,
// so |mask| must be xmm0 and |dst| must equal |src1| (both DCHECKed).
void SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
                                    XMMRegister src2, XMMRegister mask) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // SSE4.1 fallback: only |dst| and |src2| are encoded explicitly.
    CpuFeatureScope sse4_scope(this, SSE4_1);
    DCHECK_EQ(mask, xmm0);
    DCHECK_EQ(dst, src1);
    pblendvb(dst, src2);
    return;
  }

  CpuFeatureScope avx_scope(this, AVX);
  vpblendvb(dst, src1, src2, mask);
}
void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
XMMRegister src2, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {

View File

@ -50,6 +50,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void Movhps(XMMRegister dst, XMMRegister src1, Operand src2);
void Movlps(XMMRegister dst, XMMRegister src1, Operand src2);
// Variable byte blend of src1 and src2 under mask. Uses vpblendvb when AVX is
// available; the SSE4.1 fallback requires mask == xmm0 and dst == src1.
void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
template <typename Op>
void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
uint32_t* load_pc_offset = nullptr) {

View File

@ -2352,6 +2352,14 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsWord32(node), VisitI16x8AllTrue(node);
case IrOpcode::kI8x16AllTrue:
return MarkAsWord32(node), VisitI8x16AllTrue(node);
case IrOpcode::kI8x16RelaxedLaneSelect:
return MarkAsSimd128(node), VisitI8x16RelaxedLaneSelect(node);
case IrOpcode::kI16x8RelaxedLaneSelect:
return MarkAsSimd128(node), VisitI16x8RelaxedLaneSelect(node);
case IrOpcode::kI32x4RelaxedLaneSelect:
return MarkAsSimd128(node), VisitI32x4RelaxedLaneSelect(node);
case IrOpcode::kI64x2RelaxedLaneSelect:
return MarkAsSimd128(node), VisitI64x2RelaxedLaneSelect(node);
default:
FATAL("Unexpected operator #%d:%s @ node #%d", node->opcode(),
node->op()->mnemonic(), node->id());
@ -2765,6 +2773,21 @@ void InstructionSelector::VisitF32x4Qfms(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X && !V8_TARGET_ARCH_PPC64
#if !V8_TARGET_ARCH_X64
// Placeholder visitors for targets other than x64 (see the enclosing
// !V8_TARGET_ARCH_X64 guard): relaxed lane select has no lowering there, so
// each visitor hits UNIMPLEMENTED() if it is ever reached.
void InstructionSelector::VisitI8x16RelaxedLaneSelect(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8RelaxedLaneSelect(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4RelaxedLaneSelect(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2RelaxedLaneSelect(Node* node) {
UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_X64
// FinishRegion is selected by delegating to EmitIdentity(node).
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
void InstructionSelector::VisitParameter(Node* node) {

View File

@ -4069,6 +4069,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_ALL_TRUE(Pcmpeqb);
break;
}
case kX64Pblendvb: {
__ Pblendvb(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.InputSimd128Register(2));
break;
}
case kAtomicStoreWord8: {
ASSEMBLE_SEQ_CST_STORE(MachineRepresentation::kWord8);
break;

View File

@ -392,6 +392,7 @@ namespace compiler {
V(X64I32x4AllTrue) \
V(X64I16x8AllTrue) \
V(X64I8x16AllTrue) \
V(X64Pblendvb) \
V(X64Word64AtomicAddUint64) \
V(X64Word64AtomicSubUint64) \
V(X64Word64AtomicAndUint64) \

View File

@ -345,6 +345,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64S8x2Reverse:
case kX64V128AnyTrue:
case kX64I8x16AllTrue:
case kX64Pblendvb:
return (instr->addressing_mode() == kMode_None)
? kNoOpcodeFlags
: kIsLoadOperation | kHasSideEffect;

View File

@ -3706,8 +3706,53 @@ void InstructionSelector::VisitI8x16Swizzle(Node* node) {
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
}
namespace {

// Shared lowering for the i8x16/i16x8/i32x4/i64x2 relaxed lane selects.
// pblendvb is a correct implementation for every lane width, see
// https://github.com/WebAssembly/relaxed-simd/issues/17.
void VisitRelaxedLaneSelect(InstructionSelector* selector, Node* node) {
  X64OperandGenerator g(selector);
  Node* const lhs = node->InputAt(0);
  Node* const rhs = node->InputAt(1);
  Node* const mask = node->InputAt(2);
  const bool has_avx = selector->IsSupported(AVX);

  // pblendvb copies src2 when mask is set, opposite from Wasm semantics, so
  // the two value inputs are passed in swapped order.
  // Without AVX, the SSE4.1 pblendvb overwrites its first operand and requires
  // xmm0 to hold the mask as an implicit operand.
  selector->Emit(
      kX64Pblendvb,
      has_avx ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
      g.UseRegister(rhs), g.UseRegister(lhs),
      has_avx ? g.UseRegister(mask) : g.UseFixed(mask, xmm0));
}

}  // namespace
// Every lane width forwards to the same VisitRelaxedLaneSelect helper; the
// visitors differ only in the opcode they are dispatched for.
void InstructionSelector::VisitI8x16RelaxedLaneSelect(Node* node) {
VisitRelaxedLaneSelect(this, node);
}
void InstructionSelector::VisitI16x8RelaxedLaneSelect(Node* node) {
VisitRelaxedLaneSelect(this, node);
}
void InstructionSelector::VisitI32x4RelaxedLaneSelect(Node* node) {
VisitRelaxedLaneSelect(this, node);
}
void InstructionSelector::VisitI64x2RelaxedLaneSelect(Node* node) {
VisitRelaxedLaneSelect(this, node);
}
#else
// Stubs for the #else branch of the conditional closed by the
// V8_ENABLE_WEBASSEMBLY #endif below (presumably the build configuration
// without WebAssembly -- confirm against the opening #if). Each visitor is
// marked UNREACHABLE().
void InstructionSelector::VisitI8x16Swizzle(Node* node) { UNREACHABLE(); }
void InstructionSelector::VisitI8x16RelaxedLaneSelect(Node* node) {
UNREACHABLE();
}
void InstructionSelector::VisitI16x8RelaxedLaneSelect(Node* node) {
UNREACHABLE();
}
void InstructionSelector::VisitI32x4RelaxedLaneSelect(Node* node) {
UNREACHABLE();
}
void InstructionSelector::VisitI64x2RelaxedLaneSelect(Node* node) {
UNREACHABLE();
}
#endif // V8_ENABLE_WEBASSEMBLY
namespace {

View File

@ -597,7 +597,11 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(I64x2AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(I32x4AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(I16x8AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(I8x16AllTrue, Operator::kNoProperties, 1, 0, 1)
V(I8x16AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(I8x16RelaxedLaneSelect, Operator::kNoProperties, 3, 0, 1) \
V(I16x8RelaxedLaneSelect, Operator::kNoProperties, 3, 0, 1) \
V(I32x4RelaxedLaneSelect, Operator::kNoProperties, 3, 0, 1) \
V(I64x2RelaxedLaneSelect, Operator::kNoProperties, 3, 0, 1)
// The format is:
// V(Name, properties, value_input_count, control_input_count, output_count)

View File

@ -914,6 +914,12 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I16x8AllTrue();
const Operator* I8x16AllTrue();
// Relaxed SIMD operators.
// Each lane select operator takes three value inputs and produces one output;
// the x64 instruction selector treats the third input as the mask.
const Operator* I8x16RelaxedLaneSelect();
const Operator* I16x8RelaxedLaneSelect();
const Operator* I32x4RelaxedLaneSelect();
const Operator* I64x2RelaxedLaneSelect();
// load [base + index]
const Operator* Load(LoadRepresentation rep);
const Operator* LoadImmutable(LoadRepresentation rep);

View File

@ -981,6 +981,10 @@
V(S128Select) \
V(S128AndNot) \
V(I8x16Swizzle) \
V(I8x16RelaxedLaneSelect) \
V(I16x8RelaxedLaneSelect) \
V(I32x4RelaxedLaneSelect) \
V(I64x2RelaxedLaneSelect) \
V(I8x16Shuffle) \
V(V128AnyTrue) \
V(I64x2AllTrue) \

View File

@ -5116,6 +5116,18 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprI8x16RelaxedSwizzle:
return graph()->NewNode(mcgraph()->machine()->I8x16Swizzle(true),
inputs[0], inputs[1]);
case wasm::kExprI8x16RelaxedLaneSelect:
return graph()->NewNode(mcgraph()->machine()->I8x16RelaxedLaneSelect(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI16x8RelaxedLaneSelect:
return graph()->NewNode(mcgraph()->machine()->I16x8RelaxedLaneSelect(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI32x4RelaxedLaneSelect:
return graph()->NewNode(mcgraph()->machine()->I32x4RelaxedLaneSelect(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI64x2RelaxedLaneSelect:
return graph()->NewNode(mcgraph()->machine()->I64x2RelaxedLaneSelect(),
inputs[0], inputs[1], inputs[2]);
default:
FATAL_UNSUPPORTED_OPCODE(opcode);
}

View File

@ -368,6 +368,10 @@ constexpr const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_SIMDF_OP(Qfma, "qfma")
CASE_SIMDF_OP(Qfms, "qfms")
CASE_I8x16_OP(RelaxedSwizzle, "relaxed_swizzle");
CASE_I8x16_OP(RelaxedLaneSelect, "relaxed_laneselect");
CASE_I16x8_OP(RelaxedLaneSelect, "relaxed_laneselect");
CASE_I32x4_OP(RelaxedLaneSelect, "relaxed_laneselect");
CASE_I64x2_OP(RelaxedLaneSelect, "relaxed_laneselect");
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")

View File

@ -514,14 +514,18 @@ bool V8_EXPORT_PRIVATE IsJSCompatibleSignature(const FunctionSig* sig,
V(F64x2ConvertLowI32x4S, 0xfdfe, s_s) \
V(F64x2ConvertLowI32x4U, 0xfdff, s_s)
#define FOREACH_RELAXED_SIMD_OPCODE(V) \
V(I8x16RelaxedSwizzle, 0xfda2, s_ss) \
V(F32x4Qfma, 0xfdaf, s_sss) \
V(F32x4Qfms, 0xfdb0, s_sss) \
V(F64x2Qfma, 0xfdcf, s_sss) \
V(F64x2Qfms, 0xfdd0, s_sss) \
V(F32x4RecipApprox, 0xfdd2, s_s) \
V(F32x4RecipSqrtApprox, 0xfdd3, s_s)
#define FOREACH_RELAXED_SIMD_OPCODE(V) \
V(I8x16RelaxedSwizzle, 0xfda2, s_ss) \
V(I8x16RelaxedLaneSelect, 0xfdb2, s_sss) \
V(I16x8RelaxedLaneSelect, 0xfdb3, s_sss) \
V(I32x4RelaxedLaneSelect, 0xfdd2, s_sss) \
V(I64x2RelaxedLaneSelect, 0xfdd3, s_sss) \
V(F32x4Qfma, 0xfdaf, s_sss) \
V(F32x4Qfms, 0xfdb0, s_sss) \
V(F64x2Qfma, 0xfdcf, s_sss) \
V(F64x2Qfms, 0xfdd0, s_sss) \
V(F32x4RecipApprox, 0xfda5, s_s) \
V(F32x4RecipSqrtApprox, 0xfda6, s_s)
#define FOREACH_SIMD_1_OPERAND_1_PARAM_OPCODE(V) \
V(I8x16ExtractLaneS, 0xfd15, _) \

View File

@ -3,6 +3,7 @@
// found in the LICENSE file.
#include "src/base/overflowing-math.h"
#include "src/common/globals.h"
#include "src/wasm/compilation-environment.h"
#include "test/cctest/cctest.h"
#include "test/cctest/wasm/wasm-run-utils.h"
@ -252,6 +253,85 @@ WASM_RELAXED_SIMD_TEST(I8x16RelaxedSwizzle) {
CHECK_EQ(LANE(dst, i), i);
}
}
namespace {

// Helper to convert an array of T into an array of uint8_t to be used as v128
// constants. Reads N elements from |src| and writes each one in little-endian
// byte order; bytes not covered by those N elements are zero.
template <typename T, size_t N = kSimd128Size / sizeof(T)>
std::array<uint8_t, kSimd128Size> as_uint8(const T* src) {
  // Guard against an explicitly supplied N that would write past the array.
  static_assert(N <= kSimd128Size / sizeof(T),
                "N elements of T must fit into a 128-bit vector");
  // Value-initialize so that a smaller-than-default N never leaves
  // indeterminate trailing bytes in the returned constant.
  std::array<uint8_t, kSimd128Size> arr{};
  for (size_t i = 0; i < N; i++) {
    WriteLittleEndianValue<T>(bit_cast<T*>(&arr[0]) + i, src[i]);
  }
  return arr;
}

// Builds a function that applies |laneselect| to the v128 constants made from
// |v1|, |v2| and the mask |s|, stores the result in global 0 and returns 1.
// After calling it, every one of the kElems result lanes is compared against
// |expected|.
template <typename T, int kElems>
void RelaxedLaneSelectTest(TestExecutionTier execution_tier, const T v1[kElems],
                           const T v2[kElems], const T s[kElems],
                           const T expected[kElems], WasmOpcode laneselect) {
  auto lhs = as_uint8<T>(v1);
  auto rhs = as_uint8<T>(v2);
  auto mask = as_uint8<T>(s);
  WasmRunner<int32_t> r(execution_tier);
  T* dst = r.builder().AddGlobal<T>(kWasmS128);
  BUILD(r,
        WASM_GLOBAL_SET(0, WASM_SIMD_OPN(laneselect, WASM_SIMD_CONSTANT(lhs),
                                         WASM_SIMD_CONSTANT(rhs),
                                         WASM_SIMD_CONSTANT(mask))),
        WASM_ONE);

  CHECK_EQ(1, r.Call());
  for (int i = 0; i < kElems; i++) {
    CHECK_EQ(expected[i], LANE(dst, i));
  }
}

}  // namespace
// Mask lanes alternate between 0 and 0xFF: lanes with an all-ones mask expect
// the v1 lane, lanes with a zero mask expect the v2 lane.
WASM_RELAXED_SIMD_TEST(I8x16RelaxedLaneSelect) {
constexpr int kElems = 16;
constexpr uint8_t v1[kElems] = {0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15};
constexpr uint8_t v2[kElems] = {16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31};
constexpr uint8_t s[kElems] = {0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF};
constexpr uint8_t expected[kElems] = {16, 1, 18, 3, 20, 5, 22, 7,
24, 9, 26, 11, 28, 13, 30, 15};
RelaxedLaneSelectTest<uint8_t, kElems>(execution_tier, v1, v2, s, expected,
kExprI8x16RelaxedLaneSelect);
}
// Mask lanes alternate between 0 and 0xFFFF: lanes with an all-ones mask
// expect the v1 lane, lanes with a zero mask expect the v2 lane.
WASM_RELAXED_SIMD_TEST(I16x8RelaxedLaneSelect) {
  constexpr int kElems = 8;
  // Inputs are compile-time constants, matching |expected| and the i8x16 test.
  constexpr uint16_t v1[kElems] = {0, 1, 2, 3, 4, 5, 6, 7};
  constexpr uint16_t v2[kElems] = {8, 9, 10, 11, 12, 13, 14, 15};
  constexpr uint16_t s[kElems] = {0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF};
  constexpr uint16_t expected[kElems] = {8, 1, 10, 3, 12, 5, 14, 7};
  RelaxedLaneSelectTest<uint16_t, kElems>(execution_tier, v1, v2, s, expected,
                                          kExprI16x8RelaxedLaneSelect);
}
// Mask lanes alternate between 0 and 0xFFFF'FFFF: lanes with an all-ones mask
// expect the v1 lane, lanes with a zero mask expect the v2 lane.
WASM_RELAXED_SIMD_TEST(I32x4RelaxedLaneSelect) {
  constexpr int kElems = 4;
  // Inputs are compile-time constants, matching |expected| and the i8x16 test.
  constexpr uint32_t v1[kElems] = {0, 1, 2, 3};
  constexpr uint32_t v2[kElems] = {4, 5, 6, 7};
  constexpr uint32_t s[kElems] = {0, 0xFFFF'FFFF, 0, 0xFFFF'FFFF};
  constexpr uint32_t expected[kElems] = {4, 1, 6, 3};
  RelaxedLaneSelectTest<uint32_t, kElems>(execution_tier, v1, v2, s, expected,
                                          kExprI32x4RelaxedLaneSelect);
}
// Lane 0 has a zero mask and expects the v2 lane; lane 1 has an all-ones mask
// and expects the v1 lane.
WASM_RELAXED_SIMD_TEST(I64x2RelaxedLaneSelect) {
  constexpr int kElems = 2;
  // Inputs are compile-time constants, matching |expected| and the i8x16 test.
  constexpr uint64_t v1[kElems] = {0, 1};
  constexpr uint64_t v2[kElems] = {2, 3};
  constexpr uint64_t s[kElems] = {0, 0xFFFF'FFFF'FFFF'FFFF};
  constexpr uint64_t expected[kElems] = {2, 1};
  RelaxedLaneSelectTest<uint64_t, kElems>(execution_tier, v1, v2, s, expected,
                                          kExprI64x2RelaxedLaneSelect);
}
#endif // V8_TARGET_ARCH_X64
#undef WASM_RELAXED_SIMD_TEST

View File

@ -2724,6 +2724,10 @@ class WasmInterpreterInternals {
PACK_CASE(I8x16SConvertI16x8, int8, i16x8, int16, 16, int8_t)
PACK_CASE(I8x16UConvertI16x8, int8, i16x8, int16, 16, uint8_t)
#undef PACK_CASE
case kExprI8x16RelaxedLaneSelect:
case kExprI16x8RelaxedLaneSelect:
case kExprI32x4RelaxedLaneSelect:
case kExprI64x2RelaxedLaneSelect:
case kExprS128Select: {
int4 bool_val = Pop().to_s128().to_i32x4();
int4 v2 = Pop().to_s128().to_i32x4();