[wasm-simd][arm64] Prototype extended pairwise additions

Prototype i32x4.extadd_pairwise_i16x8_{s,u} and
i16x8.extadd_pairwise_i8x16_{s,u} (names not confirmed) on ARM64 and
interpreter. With a simple test case.

Bug: v8:11086
Change-Id: If1ffc04e179e86ca5cc209bf9ef9d337298e3cc2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2513872
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71068}
This commit is contained in:
Zhi An Ng 2020-11-02 09:16:52 +00:00 committed by Commit Bot
parent 2fed939052
commit 062ba7e78f
14 changed files with 223 additions and 34 deletions

View File

@ -1139,6 +1139,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kArm64Mul32:
__ Mul(i.OutputRegister32(), i.InputRegister32(0), i.InputRegister32(1));
break;
case kArm64Saddlp: {
VectorFormat dst_f = VectorFormatFillQ(MiscField::decode(opcode));
VectorFormat src_f = VectorFormatHalfWidthDoubleLanes(dst_f);
__ Saddlp(i.OutputSimd128Register().Format(dst_f),
i.InputSimd128Register(0).Format(src_f));
break;
}
case kArm64Uaddlp: {
VectorFormat dst_f = VectorFormatFillQ(MiscField::decode(opcode));
VectorFormat src_f = VectorFormatHalfWidthDoubleLanes(dst_f);
__ Uaddlp(i.OutputSimd128Register().Format(dst_f),
i.InputSimd128Register(0).Format(src_f));
break;
}
case kArm64Smull: {
if (instr->InputAt(0)->IsRegister()) {
__ Smull(i.OutputRegister(), i.InputRegister32(0),

View File

@ -35,12 +35,14 @@ namespace compiler {
V(Arm64Eor32) \
V(Arm64Eon) \
V(Arm64Eon32) \
V(Arm64Saddlp) \
V(Arm64Sub) \
V(Arm64Sub32) \
V(Arm64Mul) \
V(Arm64Mul32) \
V(Arm64Smull) \
V(Arm64Smull2) \
V(Arm64Uaddlp) \
V(Arm64Umull) \
V(Arm64Umull2) \
V(Arm64Madd) \

View File

@ -36,12 +36,14 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64Eor32:
case kArm64Eon:
case kArm64Eon32:
case kArm64Saddlp:
case kArm64Sub:
case kArm64Sub32:
case kArm64Mul:
case kArm64Mul32:
case kArm64Smull:
case kArm64Smull2:
case kArm64Uaddlp:
case kArm64Umull:
case kArm64Umull2:
case kArm64Madd:

View File

@ -1708,6 +1708,31 @@ void InstructionSelector::VisitI64x2ExtMulHighI32x4U(Node* node) {
VisitExtMul(this, kArm64Umull2, node, 64);
}
namespace {
// Common lowering for the four extended-pairwise-add visitors below: the
// destination lane size (16 or 32 bits) is packed into the instruction's
// MiscField so the code generator can recover the vector formats, then the
// node is emitted as a plain one-input, one-output instruction.
void VisitExtAddPairwise(InstructionSelector* selector, ArchOpcode opcode,
                         Node* node, int dst_lane_size) {
  InstructionCode code = opcode | MiscField::encode(dst_lane_size);
  VisitRR(selector, code, node);
}
}  // namespace
// i32x4.extadd_pairwise_i16x8_s: pairwise add of adjacent signed 16-bit
// lanes into 32-bit lanes; lowered to ARM64 SADDLP with a 32-bit dst lane.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
  VisitExtAddPairwise(this, kArm64Saddlp, node, 32);
}
// i32x4.extadd_pairwise_i16x8_u: unsigned variant; lowered to UADDLP.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
  VisitExtAddPairwise(this, kArm64Uaddlp, node, 32);
}
// i16x8.extadd_pairwise_i8x16_s: signed 8-bit pairs into 16-bit lanes.
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
  VisitExtAddPairwise(this, kArm64Saddlp, node, 16);
}
// i16x8.extadd_pairwise_i8x16_u: unsigned 8-bit pairs into 16-bit lanes.
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
  VisitExtAddPairwise(this, kArm64Uaddlp, node, 16);
}
void InstructionSelector::VisitInt32MulHigh(Node* node) {
Arm64OperandGenerator g(this);
InstructionOperand const smull_operand = g.TempRegister();

View File

@ -2097,6 +2097,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI32x4ExtMulHighI16x8U(node);
case IrOpcode::kI32x4SignSelect:
return MarkAsSimd128(node), VisitI32x4SignSelect(node);
case IrOpcode::kI32x4ExtAddPairwiseI16x8S:
return MarkAsSimd128(node), VisitI32x4ExtAddPairwiseI16x8S(node);
case IrOpcode::kI32x4ExtAddPairwiseI16x8U:
return MarkAsSimd128(node), VisitI32x4ExtAddPairwiseI16x8U(node);
case IrOpcode::kI16x8Splat:
return MarkAsSimd128(node), VisitI16x8Splat(node);
case IrOpcode::kI16x8ExtractLaneU:
@ -2179,6 +2183,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI16x8ExtMulHighI8x16U(node);
case IrOpcode::kI16x8SignSelect:
return MarkAsSimd128(node), VisitI16x8SignSelect(node);
case IrOpcode::kI16x8ExtAddPairwiseI8x16S:
return MarkAsSimd128(node), VisitI16x8ExtAddPairwiseI8x16S(node);
case IrOpcode::kI16x8ExtAddPairwiseI8x16U:
return MarkAsSimd128(node), VisitI16x8ExtAddPairwiseI8x16U(node);
case IrOpcode::kI8x16Splat:
return MarkAsSimd128(node), VisitI8x16Splat(node);
case IrOpcode::kI8x16ExtractLaneU:
@ -2772,6 +2780,20 @@ void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
UNIMPLEMENTED();
}
// TODO(v8:11086) Prototype extended pairwise add.
// Fallback stubs for targets without a lowering yet — as of this change only
// ARM64 implements the extended pairwise additions (see the !V8_TARGET_ARCH_ARM64
// guard below), so all other architectures abort here.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
  UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64

View File

@ -463,6 +463,8 @@ ShiftKind ShiftKindOf(Operator const* op) {
V(I32x4ExtMulLowI16x8U, Operator::kCommutative, 2, 0, 1) \
V(I32x4ExtMulHighI16x8U, Operator::kCommutative, 2, 0, 1) \
V(I32x4SignSelect, Operator::kNoProperties, 3, 0, 1) \
V(I32x4ExtAddPairwiseI16x8S, Operator::kNoProperties, 1, 0, 1) \
V(I32x4ExtAddPairwiseI16x8U, Operator::kNoProperties, 1, 0, 1) \
V(I16x8Splat, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16Low, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16High, Operator::kNoProperties, 1, 0, 1) \
@ -501,6 +503,8 @@ ShiftKind ShiftKindOf(Operator const* op) {
V(I16x8ExtMulLowI8x16U, Operator::kCommutative, 2, 0, 1) \
V(I16x8ExtMulHighI8x16U, Operator::kCommutative, 2, 0, 1) \
V(I16x8SignSelect, Operator::kNoProperties, 3, 0, 1) \
V(I16x8ExtAddPairwiseI8x16S, Operator::kNoProperties, 1, 0, 1) \
V(I16x8ExtAddPairwiseI8x16U, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Splat, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Neg, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Shl, Operator::kNoProperties, 2, 0, 1) \

View File

@ -716,6 +716,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I32x4ExtMulLowI16x8U();
const Operator* I32x4ExtMulHighI16x8U();
const Operator* I32x4SignSelect();
const Operator* I32x4ExtAddPairwiseI16x8S();
const Operator* I32x4ExtAddPairwiseI16x8U();
const Operator* I16x8Splat();
const Operator* I16x8ExtractLaneU(int32_t);
@ -759,6 +761,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I16x8ExtMulLowI8x16U();
const Operator* I16x8ExtMulHighI8x16U();
const Operator* I16x8SignSelect();
const Operator* I16x8ExtAddPairwiseI8x16S();
const Operator* I16x8ExtAddPairwiseI8x16U();
const Operator* I8x16Splat();
const Operator* I8x16ExtractLaneU(int32_t);

View File

@ -875,6 +875,8 @@
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I32x4SignSelect) \
V(I32x4ExtAddPairwiseI16x8S) \
V(I32x4ExtAddPairwiseI16x8U) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
@ -920,6 +922,8 @@
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I16x8SignSelect) \
V(I16x8ExtAddPairwiseI8x16S) \
V(I16x8ExtAddPairwiseI8x16U) \
V(I8x16Splat) \
V(I8x16ExtractLaneU) \
V(I8x16ExtractLaneS) \

View File

@ -4793,6 +4793,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprI32x4SignSelect:
return graph()->NewNode(mcgraph()->machine()->I32x4SignSelect(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI32x4ExtAddPairwiseI16x8S:
return graph()->NewNode(mcgraph()->machine()->I32x4ExtAddPairwiseI16x8S(),
inputs[0]);
case wasm::kExprI32x4ExtAddPairwiseI16x8U:
return graph()->NewNode(mcgraph()->machine()->I32x4ExtAddPairwiseI16x8U(),
inputs[0]);
case wasm::kExprI16x8Splat:
return graph()->NewNode(mcgraph()->machine()->I16x8Splat(), inputs[0]);
case wasm::kExprI16x8SConvertI8x16Low:
@ -4915,6 +4921,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprI16x8SignSelect:
return graph()->NewNode(mcgraph()->machine()->I16x8SignSelect(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI16x8ExtAddPairwiseI8x16S:
return graph()->NewNode(mcgraph()->machine()->I16x8ExtAddPairwiseI8x16S(),
inputs[0]);
case wasm::kExprI16x8ExtAddPairwiseI8x16U:
return graph()->NewNode(mcgraph()->machine()->I16x8ExtAddPairwiseI8x16U(),
inputs[0]);
case wasm::kExprI8x16Splat:
return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
case wasm::kExprI8x16Neg:

View File

@ -217,6 +217,21 @@ Wide MultiplyLong(Narrow a, Narrow b) {
return static_cast<Wide>(a) * static_cast<Wide>(b);
}
// Widening addition: adds two Narrow values and returns their exact sum as a
// Wide value, so the result can never overflow.
// Wide is the first template parameter so callers spell out only Wide and let
// Narrow be deduced from the arguments.
template <typename Wide, typename Narrow>
Wide AddLong(Narrow lhs, Narrow rhs) {
  static_assert(sizeof(Wide) == 2 * sizeof(Narrow), "only twice as long");
  static_assert(
      std::is_integral<Wide>::value && std::is_integral<Narrow>::value,
      "only integral types");
  static_assert(std::is_signed<Wide>::value == std::is_signed<Narrow>::value,
                "both must have same signedness");
  // Widen first, then add, so the addition happens in the Wide domain.
  Wide widened_lhs = static_cast<Wide>(lhs);
  Wide widened_rhs = static_cast<Wide>(rhs);
  return widened_lhs + widened_rhs;
}
// Helper macros for defining a contiguous sequence of field offset constants.
// Example: (backslashes at the ends of respective lines of this multi-line
// macro definition are omitted here to please the compiler)

View File

@ -357,6 +357,9 @@ constexpr const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_SIMDI_OP(SignSelect, "signselect")
CASE_I64x2_OP(SignSelect, "signselect")
CASE_SIGN_OP(I32x4, ExtAddPairwiseI16x8, "extadd_pairwise_i16x8")
CASE_SIGN_OP(I16x8, ExtAddPairwiseI8x16, "extadd_pairwise_i8x16")
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")
CASE_INT_OP(AtomicWait, "atomic.wait")

View File

@ -471,40 +471,44 @@ bool V8_EXPORT_PRIVATE IsJSCompatibleSignature(const FunctionSig* sig,
V(S128Store32Lane, 0xfd5e, v_is) \
V(S128Store64Lane, 0xfd5f, v_is)
#define FOREACH_SIMD_POST_MVP_OPCODE(V) \
V(I8x16Mul, 0xfd75, s_ss) \
V(I8x16Popcnt, 0xfd7c, s_s) \
V(I8x16SignSelect, 0xfd7d, s_sss) \
V(I16x8SignSelect, 0xfd7e, s_sss) \
V(I32x4SignSelect, 0xfd7f, s_sss) \
V(I64x2SignSelect, 0xfd94, s_sss) \
V(I16x8Q15MulRSatS, 0xfd9c, s_ss) \
V(I16x8ExtMulLowI8x16S, 0xfd9a, s_ss) \
V(I16x8ExtMulHighI8x16S, 0xfd9d, s_ss) \
V(I16x8ExtMulLowI8x16U, 0xfd9e, s_ss) \
V(I16x8ExtMulHighI8x16U, 0xfd9f, s_ss) \
V(I32x4ExtMulLowI16x8S, 0xfdbb, s_ss) \
V(I32x4ExtMulHighI16x8S, 0xfdbd, s_ss) \
V(I32x4ExtMulLowI16x8U, 0xfdbe, s_ss) \
V(I32x4ExtMulHighI16x8U, 0xfdbf, s_ss) \
V(I64x2ExtMulLowI32x4S, 0xfdd2, s_ss) \
V(I64x2ExtMulHighI32x4S, 0xfdd3, s_ss) \
V(I64x2ExtMulLowI32x4U, 0xfdd6, s_ss) \
V(I64x2ExtMulHighI32x4U, 0xfdd7, s_ss) \
V(I64x2Eq, 0xfdc0, s_ss) \
V(F32x4Qfma, 0xfdb4, s_sss) \
V(I64x2BitMask, 0xfdc4, i_s) \
V(I64x2SConvertI32x4Low, 0xfdc7, s_s) \
V(I64x2SConvertI32x4High, 0xfdc8, s_s) \
V(I64x2UConvertI32x4Low, 0xfdc9, s_s) \
V(I64x2UConvertI32x4High, 0xfdca, s_s) \
V(F32x4Qfms, 0xfdd4, s_sss) \
V(F64x2Qfma, 0xfdfe, s_sss) \
V(F64x2Qfms, 0xfdff, s_sss) \
V(I16x8AddHoriz, 0xfdaf, s_ss) \
V(I32x4AddHoriz, 0xfdb0, s_ss) \
V(F32x4AddHoriz, 0xfdb2, s_ss) \
V(F32x4RecipApprox, 0xfdb3, s_s) \
#define FOREACH_SIMD_POST_MVP_OPCODE(V) \
V(I8x16Mul, 0xfd75, s_ss) \
V(I8x16Popcnt, 0xfd7c, s_s) \
V(I8x16SignSelect, 0xfd7d, s_sss) \
V(I16x8SignSelect, 0xfd7e, s_sss) \
V(I32x4SignSelect, 0xfd7f, s_sss) \
V(I64x2SignSelect, 0xfd94, s_sss) \
V(I16x8Q15MulRSatS, 0xfd9c, s_ss) \
V(I16x8ExtMulLowI8x16S, 0xfd9a, s_ss) \
V(I16x8ExtMulHighI8x16S, 0xfd9d, s_ss) \
V(I16x8ExtMulLowI8x16U, 0xfd9e, s_ss) \
V(I16x8ExtMulHighI8x16U, 0xfd9f, s_ss) \
V(I32x4ExtMulLowI16x8S, 0xfdbb, s_ss) \
V(I32x4ExtMulHighI16x8S, 0xfdbd, s_ss) \
V(I32x4ExtMulLowI16x8U, 0xfdbe, s_ss) \
V(I32x4ExtMulHighI16x8U, 0xfdbf, s_ss) \
V(I64x2ExtMulLowI32x4S, 0xfdd2, s_ss) \
V(I64x2ExtMulHighI32x4S, 0xfdd3, s_ss) \
V(I64x2ExtMulLowI32x4U, 0xfdd6, s_ss) \
V(I64x2ExtMulHighI32x4U, 0xfdd7, s_ss) \
V(I32x4ExtAddPairwiseI16x8S, 0xfda5, s_s) \
V(I32x4ExtAddPairwiseI16x8U, 0xfda6, s_s) \
V(I16x8ExtAddPairwiseI8x16S, 0xfdc2, s_s) \
V(I16x8ExtAddPairwiseI8x16U, 0xfdc3, s_s) \
V(I64x2Eq, 0xfdc0, s_ss) \
V(F32x4Qfma, 0xfdb4, s_sss) \
V(I64x2BitMask, 0xfdc4, i_s) \
V(I64x2SConvertI32x4Low, 0xfdc7, s_s) \
V(I64x2SConvertI32x4High, 0xfdc8, s_s) \
V(I64x2UConvertI32x4Low, 0xfdc9, s_s) \
V(I64x2UConvertI32x4High, 0xfdca, s_s) \
V(F32x4Qfms, 0xfdd4, s_sss) \
V(F64x2Qfma, 0xfdfe, s_sss) \
V(F64x2Qfms, 0xfdff, s_sss) \
V(I16x8AddHoriz, 0xfdaf, s_ss) \
V(I32x4AddHoriz, 0xfdb0, s_ss) \
V(F32x4AddHoriz, 0xfdb2, s_ss) \
V(F32x4RecipApprox, 0xfdb3, s_s) \
V(F32x4RecipSqrtApprox, 0xfdbc, s_s)
#define FOREACH_SIMD_1_OPERAND_1_PARAM_OPCODE(V) \

View File

@ -1876,6 +1876,57 @@ WASM_SIMD_TEST(S128Not) {
[](int32_t x) { return ~x; });
}
#if V8_TARGET_ARCH_ARM64
// TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u}
// Shared driver for the extended-pairwise-add tests: builds a module that
// splats a Narrow scalar across a vector, applies |ext_add_pairwise|, stores
// the result in a global, and checks that every Wide lane equals the widened
// sum x + x.
template <typename Narrow, typename Wide>
void RunExtAddPairwiseTest(TestExecutionTier execution_tier,
                           LowerSimd lower_simd, WasmOpcode ext_add_pairwise,
                           WasmOpcode splat) {
  // These opcodes are post-MVP, so enable the flag for the test's duration.
  FLAG_SCOPE(wasm_simd_post_mvp);
  constexpr int num_lanes = kSimd128Size / sizeof(Wide);
  WasmRunner<int32_t, Narrow> r(execution_tier, lower_simd);
  Wide* g = r.builder().template AddGlobal<Wide>(kWasmS128);
  // TODO(v8:11086) We splat the same value, so pairwise adding ends up adding
  // the same value to itself, consider a more complicated test, like having 2
  // vectors, and shuffling them.
  BUILD(r, WASM_GET_LOCAL(0), WASM_SIMD_OP(splat),
        WASM_SIMD_OP(ext_add_pairwise), kExprGlobalSet, 0, WASM_ONE);
  for (Narrow x : compiler::ValueHelper::GetVector<Narrow>()) {
    r.Call(x);
    // Every adjacent pair is (x, x), so each widened lane must be x + x;
    // AddLong widens before adding, so overflow cannot occur.
    Wide expected = AddLong<Wide>(x, x);
    for (int i = 0; i < num_lanes; i++) {
      CHECK_EQ(expected, ReadLittleEndianValue<Wide>(&g[i]));
    }
  }
}
// One test per opcode/signedness combination; each pairs the pairwise-add
// opcode with the splat opcode of its (narrower) source lane type.
WASM_SIMD_TEST_NO_LOWERING(I32x4ExtAddPairwiseI16x8S) {
  RunExtAddPairwiseTest<int16_t, int32_t>(execution_tier, lower_simd,
                                          kExprI32x4ExtAddPairwiseI16x8S,
                                          kExprI16x8Splat);
}
WASM_SIMD_TEST_NO_LOWERING(I32x4ExtAddPairwiseI16x8U) {
  RunExtAddPairwiseTest<uint16_t, uint32_t>(execution_tier, lower_simd,
                                            kExprI32x4ExtAddPairwiseI16x8U,
                                            kExprI16x8Splat);
}
WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16S) {
  RunExtAddPairwiseTest<int8_t, int16_t>(execution_tier, lower_simd,
                                         kExprI16x8ExtAddPairwiseI8x16S,
                                         kExprI8x16Splat);
}
WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) {
  RunExtAddPairwiseTest<uint8_t, uint16_t>(execution_tier, lower_simd,
                                           kExprI16x8ExtAddPairwiseI8x16U,
                                           kExprI8x16Splat);
}
#endif // V8_TARGET_ARCH_ARM64
void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int32BinOp expected_op) {
WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);

View File

@ -2789,6 +2789,18 @@ class WasmInterpreterInternals {
case kExprI64x2SignSelect: {
return DoSimdSignSelect<int2>();
}
case kExprI32x4ExtAddPairwiseI16x8S: {
return DoSimdExtAddPairwise<int4, int8, int32_t, int16_t>();
}
case kExprI32x4ExtAddPairwiseI16x8U: {
return DoSimdExtAddPairwise<int4, int8, uint32_t, uint16_t>();
}
case kExprI16x8ExtAddPairwiseI8x16S: {
return DoSimdExtAddPairwise<int8, int16, int16_t, int8_t>();
}
case kExprI16x8ExtAddPairwiseI8x16U: {
return DoSimdExtAddPairwise<int8, int16, uint16_t, uint8_t>();
}
default:
return false;
}
@ -2924,6 +2936,21 @@ class WasmInterpreterInternals {
return true;
}
// Interpreter implementation of the extended pairwise additions: destination
// lane i receives the widened sum of source lanes 2*i and 2*i + 1.
// DstSimdType/SrcSimdType are the interpreter's lane-array representations
// (e.g. int4/int8); Wide/Narrow are the matching scalar element types and
// must share signedness (enforced by AddLong's static_asserts).
template <typename DstSimdType, typename SrcSimdType, typename Wide,
          typename Narrow>
bool DoSimdExtAddPairwise() {
  // Lane count of the (wider-element) destination vector.
  constexpr int lanes = kSimd128Size / sizeof(DstSimdType::val[0]);
  auto v = Pop().to_s128().to<SrcSimdType>();
  DstSimdType res;
  for (int i = 0; i < lanes; ++i) {
    // NOTE(review): LANE presumably maps logical lane index to storage order
    // (endianness handling) — its definition is not visible here; confirm.
    res.val[LANE(i, res)] =
        AddLong<Wide>(static_cast<Narrow>(v.val[LANE(i * 2, v)]),
                      static_cast<Narrow>(v.val[LANE(i * 2 + 1, v)]));
  }
  Push(WasmValue(Simd128(res)));
  return true;
}
// Check if our control stack (frames_) exceeds the limit. Trigger stack
// overflow if it does, and unwinding the current frame.
// Returns true if execution can continue, false if the stack was fully