[wasm-simd][arm64] Bitmask instructions

Implement i8x16.bitmask, i16x8.bitmask, and i32x4.bitmask in the
interpreter and on arm64.

These operations are behind the wasm_simd_post_mvp flag, as we are only
prototyping them to evaluate performance. The codegen is based on the
guidance at https://github.com/WebAssembly/simd/pull/201.
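
For context, each bitmask instruction narrows a 128-bit vector to a scalar
i32 by gathering the sign bit (most significant bit) of every lane into the
low bits of the result, one bit per lane. A minimal scalar sketch of the
i32x4 case, in plain C++ (the function name is illustrative, not V8 API):

#include <cstdint>

// Reference semantics for i32x4.bitmask: bit i of the result is the sign
// bit of lane i; all higher bits of the result are zero.
int32_t I32x4BitMaskRef(const int32_t lanes[4]) {
  int32_t res = 0;
  for (int i = 0; i < 4; ++i) {
    if (lanes[i] < 0) res |= 1 << i;
  }
  return res;
}

The i16x8 and i8x16 variants are analogous and produce 8-bit and 16-bit
masks respectively.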

Bug: v8:10308
Change-Id: I835aa8a23e677a00ee7897c1c31a028850e238a9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2099451
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66793}

@@ -2128,6 +2128,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I32x4GtU, Cmhi, 4S);
SIMD_BINOP_CASE(kArm64I32x4GeU, Cmhs, 4S);
SIMD_UNOP_CASE(kArm64I32x4Abs, Abs, 4S);
case kArm64I32x4BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
__ Sshr(tmp.V4S(), src.V4S(), 31);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Movi(mask.V2D(), 0x0000'0008'0000'0004, 0x0000'0002'0000'0001);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Addv(tmp.S(), tmp.V4S());
__ Mov(dst.W(), tmp.V4S(), 0);
break;
}
case kArm64I16x8Splat: {
__ Dup(i.OutputSimd128Register().V8H(), i.InputRegister32(0));
break;
@@ -2229,6 +2244,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I16x8GeU, Cmhs, 8H);
SIMD_BINOP_CASE(kArm64I16x8RoundingAverageU, Urhadd, 8H);
SIMD_UNOP_CASE(kArm64I16x8Abs, Abs, 8H);
case kArm64I16x8BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
__ Sshr(tmp.V8H(), src.V8H(), 15);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Movi(mask.V2D(), 0x0080'0040'0020'0010, 0x0008'0004'0002'0001);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Addv(tmp.H(), tmp.V8H());
__ Mov(dst.W(), tmp.V8H(), 0);
break;
}
case kArm64I8x16Splat: {
__ Dup(i.OutputSimd128Register().V16B(), i.InputRegister32(0));
break;
@@ -2318,6 +2348,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I8x16GeU, Cmhs, 16B);
SIMD_BINOP_CASE(kArm64I8x16RoundingAverageU, Urhadd, 16B);
SIMD_UNOP_CASE(kArm64I8x16Abs, Abs, 16B);
case kArm64I8x16BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Sshr(tmp.V16B(), src.V16B(), 7);
__ Movi(mask.V2D(), 0x8040'2010'0804'0201);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Ext(mask.V16B(), tmp.V16B(), tmp.V16B(), 8);
__ Zip1(tmp.V16B(), tmp.V16B(), mask.V16B());
__ Addv(tmp.H(), tmp.V8H());
__ Mov(dst.W(), tmp.V8H(), 0);
break;
}
case kArm64S128Zero: {
__ Movi(i.OutputSimd128Register().V16B(), 0);
break;
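
All three cases above follow the same shape: Sshr by (lane width - 1)
smears each lane's sign bit, turning negative lanes into all-ones and
non-negative lanes into all-zeros; the AND against the Movi constant keeps
only bit i in lane i; and Addv horizontally adds the lanes so the surviving
bits accumulate into a single scalar. The i8x16 case needs the extra
Ext/Zip1 pair because the byte-sized Movi pattern repeats every 8 lanes:
bytes 0-7 and 8-15 each yield an 8-bit half-mask, and Zip1 interleaves the
two halves into 8 halfword lanes so a single Addv over 8H produces
low_half | (high_half << 8). A scalar model of both sequences, in plain
C++ (function names are illustrative, not V8 API):

#include <cstdint>

// Models Sshr(31) + And + Addv for i32x4.bitmask.
int32_t I32x4BitMaskModel(const int32_t src[4]) {
  const int32_t kMask[4] = {1, 2, 4, 8};  // the Movi constant: lane i = 1 << i
  int32_t acc = 0;
  for (int i = 0; i < 4; ++i) {
    acc += (src[i] >> 31) & kMask[i];  // all-ones or zero, masked to bit i
  }
  return acc;
}

// Models Sshr(7) + And + Ext/Zip1 + Addv(8H) for i8x16.bitmask.
int32_t I8x16BitMaskModel(const int8_t src[16]) {
  int32_t acc = 0;
  for (int i = 0; i < 16; ++i) {
    int bit = src[i] < 0 ? 1 << (i % 8) : 0;  // Movi pattern repeats per 8 bytes
    acc += i < 8 ? bit : bit << 8;            // Zip1 pairs byte i with byte i + 8
  }
  return acc;
}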


@@ -253,6 +253,7 @@ namespace compiler {
V(Arm64I32x4GtU) \
V(Arm64I32x4GeU) \
V(Arm64I32x4Abs) \
V(Arm64I32x4BitMask) \
V(Arm64I16x8Splat) \
V(Arm64I16x8ExtractLaneU) \
V(Arm64I16x8ExtractLaneS) \
@@ -287,6 +288,7 @@ namespace compiler {
V(Arm64I16x8GeU) \
V(Arm64I16x8RoundingAverageU) \
V(Arm64I16x8Abs) \
V(Arm64I16x8BitMask) \
V(Arm64I8x16Splat) \
V(Arm64I8x16ExtractLaneU) \
V(Arm64I8x16ExtractLaneS) \
@@ -316,6 +318,7 @@ namespace compiler {
V(Arm64I8x16GeU) \
V(Arm64I8x16RoundingAverageU) \
V(Arm64I8x16Abs) \
V(Arm64I8x16BitMask) \
V(Arm64S128Zero) \
V(Arm64S128Dup) \
V(Arm64S128And) \


@@ -223,6 +223,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I32x4GtU:
case kArm64I32x4GeU:
case kArm64I32x4Abs:
case kArm64I32x4BitMask:
case kArm64I16x8Splat:
case kArm64I16x8ExtractLaneU:
case kArm64I16x8ExtractLaneS:
@@ -257,6 +258,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I16x8GeU:
case kArm64I16x8RoundingAverageU:
case kArm64I16x8Abs:
case kArm64I16x8BitMask:
case kArm64I8x16Splat:
case kArm64I8x16ExtractLaneU:
case kArm64I8x16ExtractLaneS:
@@ -286,6 +288,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I8x16GeU:
case kArm64I8x16RoundingAverageU:
case kArm64I8x16Abs:
case kArm64I8x16BitMask:
case kArm64S128Zero:
case kArm64S128Dup:
case kArm64S128And:


@@ -3367,6 +3367,29 @@ VISIT_SIMD_QFMOP(F32x4Qfma)
VISIT_SIMD_QFMOP(F32x4Qfms)
#undef VISIT_SIMD_QFMOP
namespace {
template <ArchOpcode opcode>
void VisitBitMask(InstructionSelector* selector, Node* node) {
Arm64OperandGenerator g(selector);
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
selector->Emit(opcode, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
}
} // namespace
void InstructionSelector::VisitI8x16BitMask(Node* node) {
VisitBitMask<kArm64I8x16BitMask>(this, node);
}
void InstructionSelector::VisitI16x8BitMask(Node* node) {
VisitBitMask<kArm64I16x8BitMask>(this, node);
}
void InstructionSelector::VisitI32x4BitMask(Node* node) {
VisitBitMask<kArm64I32x4BitMask>(this, node);
}
namespace {
struct ShuffleEntry {


@@ -2025,6 +2025,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI32x4GeU(node);
case IrOpcode::kI32x4Abs:
return MarkAsSimd128(node), VisitI32x4Abs(node);
case IrOpcode::kI32x4BitMask:
return MarkAsWord32(node), VisitI32x4BitMask(node);
case IrOpcode::kI16x8Splat:
return MarkAsSimd128(node), VisitI16x8Splat(node);
case IrOpcode::kI16x8ExtractLaneU:
@@ -2093,6 +2095,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI16x8RoundingAverageU(node);
case IrOpcode::kI16x8Abs:
return MarkAsSimd128(node), VisitI16x8Abs(node);
case IrOpcode::kI16x8BitMask:
return MarkAsWord32(node), VisitI16x8BitMask(node);
case IrOpcode::kI8x16Splat:
return MarkAsSimd128(node), VisitI8x16Splat(node);
case IrOpcode::kI8x16ExtractLaneU:
@@ -2151,6 +2155,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI8x16RoundingAverageU(node);
case IrOpcode::kI8x16Abs:
return MarkAsSimd128(node), VisitI8x16Abs(node);
case IrOpcode::kI8x16BitMask:
return MarkAsWord32(node), VisitI8x16BitMask(node);
case IrOpcode::kS128Zero:
return MarkAsSimd128(node), VisitS128Zero(node);
case IrOpcode::kS128And:
@@ -2628,6 +2634,12 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
#if !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
void InstructionSelector::VisitParameter(Node* node) {


@@ -390,6 +390,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I32x4GtU, Operator::kNoProperties, 2, 0, 1) \
V(I32x4GeU, Operator::kNoProperties, 2, 0, 1) \
V(I32x4Abs, Operator::kNoProperties, 1, 0, 1) \
V(I32x4BitMask, Operator::kNoProperties, 1, 0, 1) \
V(I16x8Splat, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16Low, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16High, Operator::kNoProperties, 1, 0, 1) \
@@ -421,6 +422,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I16x8GeU, Operator::kNoProperties, 2, 0, 1) \
V(I16x8RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
V(I16x8Abs, Operator::kNoProperties, 1, 0, 1) \
V(I16x8BitMask, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Splat, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Neg, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Shl, Operator::kNoProperties, 2, 0, 1) \
@@ -447,6 +449,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I8x16GeU, Operator::kNoProperties, 2, 0, 1) \
V(I8x16RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
V(I8x16Abs, Operator::kNoProperties, 1, 0, 1) \
V(I8x16BitMask, Operator::kNoProperties, 1, 0, 1) \
V(S128Load, Operator::kNoProperties, 2, 0, 1) \
V(S128Store, Operator::kNoProperties, 3, 0, 1) \
V(S128Zero, Operator::kNoProperties, 0, 0, 1) \


@@ -630,6 +630,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I32x4GtU();
const Operator* I32x4GeU();
const Operator* I32x4Abs();
const Operator* I32x4BitMask();
const Operator* I16x8Splat();
const Operator* I16x8ExtractLaneU(int32_t);
@@ -666,6 +667,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I16x8GeU();
const Operator* I16x8RoundingAverageU();
const Operator* I16x8Abs();
const Operator* I16x8BitMask();
const Operator* I8x16Splat();
const Operator* I8x16ExtractLaneU(int32_t);
@@ -697,6 +699,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I8x16GeU();
const Operator* I8x16RoundingAverageU();
const Operator* I8x16Abs();
const Operator* I8x16BitMask();
const Operator* S128Load();
const Operator* S128Store();


@@ -840,6 +840,7 @@
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
@@ -878,6 +879,7 @@
V(I16x8GeU) \
V(I16x8RoundingAverageU) \
V(I16x8Abs) \
V(I16x8BitMask) \
V(I8x16Splat) \
V(I8x16ExtractLaneU) \
V(I8x16ExtractLaneS) \
@@ -911,6 +913,7 @@
V(I8x16GeU) \
V(I8x16RoundingAverageU) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Load) \
V(S128Store) \
V(S128Zero) \


@@ -4368,6 +4368,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[1]);
case wasm::kExprI32x4Abs:
return graph()->NewNode(mcgraph()->machine()->I32x4Abs(), inputs[0]);
case wasm::kExprI32x4BitMask:
return graph()->NewNode(mcgraph()->machine()->I32x4BitMask(), inputs[0]);
case wasm::kExprI16x8Splat:
return graph()->NewNode(mcgraph()->machine()->I16x8Splat(), inputs[0]);
case wasm::kExprI16x8SConvertI8x16Low:
@@ -4470,6 +4472,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[0], inputs[1]);
case wasm::kExprI16x8Abs:
return graph()->NewNode(mcgraph()->machine()->I16x8Abs(), inputs[0]);
case wasm::kExprI16x8BitMask:
return graph()->NewNode(mcgraph()->machine()->I16x8BitMask(), inputs[0]);
case wasm::kExprI8x16Splat:
return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
case wasm::kExprI8x16Neg:
@@ -4557,6 +4561,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[0], inputs[1]);
case wasm::kExprI8x16Abs:
return graph()->NewNode(mcgraph()->machine()->I8x16Abs(), inputs[0]);
case wasm::kExprI8x16BitMask:
return graph()->NewNode(mcgraph()->machine()->I8x16BitMask(), inputs[0]);
case wasm::kExprS128And:
return graph()->NewNode(mcgraph()->machine()->S128And(), inputs[0],
inputs[1]);


@@ -26,6 +26,7 @@
#include "src/wasm/wasm-limits.h"
#include "src/wasm/wasm-module.h"
#include "src/wasm/wasm-objects-inl.h"
#include "src/wasm/wasm-opcodes.h"
#include "src/zone/accounting-allocator.h"
#include "src/zone/zone-containers.h"
@@ -2379,6 +2380,26 @@ class ThreadImpl {
UNOP_CASE(I8x16Neg, i8x16, int16, 16, base::NegateWithWraparound(a))
UNOP_CASE(I8x16Abs, i8x16, int16, 16, std::abs(a))
#undef UNOP_CASE
// The cast to double in the call to signbit works around an MSVC issue;
// see https://github.com/microsoft/STL/issues/519.
#define BITMASK_CASE(op, name, stype, count) \
case kExpr##op: { \
WasmValue v = Pop(); \
stype s = v.to_s128().to_##name(); \
int32_t res = 0; \
for (size_t i = 0; i < count; ++i) { \
bool sign = std::signbit(static_cast<double>(s.val[i])); \
res |= (sign << i); \
} \
Push(WasmValue(res)); \
return true; \
}
BITMASK_CASE(I8x16BitMask, i8x16, int16, 16)
BITMASK_CASE(I16x8BitMask, i16x8, int8, 8)
BITMASK_CASE(I32x4BitMask, i32x4, int4, 4)
#undef BITMASK_CASE
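
Expanded by hand for the i32x4 case (a mechanical expansion of the macro
above, reformatted for readability), the interpreter logic reads:

case kExprI32x4BitMask: {
  WasmValue v = Pop();
  int4 s = v.to_s128().to_i32x4();
  int32_t res = 0;
  for (size_t i = 0; i < 4; ++i) {
    // signbit is true exactly for negative lanes; the double cast is the
    // MSVC workaround noted above.
    bool sign = std::signbit(static_cast<double>(s.val[i]));
    res |= (sign << i);
  }
  Push(WasmValue(res));
  return true;
}
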
#define CMPOP_CASE(op, name, stype, out_stype, count, expr) \
case kExpr##op: { \
WasmValue v2 = Pop(); \


@@ -317,6 +317,10 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_I16x8_OP(Abs, "abs")
CASE_I32x4_OP(Abs, "abs")
CASE_I8x16_OP(BitMask, "bitmask")
CASE_I16x8_OP(BitMask, "bitmask")
CASE_I32x4_OP(BitMask, "bitmask")
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")
CASE_INT_OP(AtomicWait, "atomic.wait")


@@ -453,6 +453,9 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(I16x8AddHoriz, 0xfdbd, s_ss) \
V(I32x4AddHoriz, 0xfdbe, s_ss) \
V(F32x4AddHoriz, 0xfdbf, s_ss) \
V(I8x16BitMask, 0xfde4, i_s) \
V(I16x8BitMask, 0xfde5, i_s) \
V(I32x4BitMask, 0xfde6, i_s) \
V(F32x4RecipApprox, 0xfdee, s_s) \
V(F32x4RecipSqrtApprox, 0xfdef, s_s)


@@ -1658,6 +1658,68 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
}
}
#if V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI8x16BitMask, WASM_GET_LOCAL(value1)));
FOR_INT8_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xFFFE : 0x0002;
CHECK_EQ(actual, expected);
}
}
WASM_SIMD_TEST_NO_LOWERING(I16x8BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI16x8BitMask, WASM_GET_LOCAL(value1)));
FOR_INT16_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xFE : 2;
CHECK_EQ(actual, expected);
}
}
WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI32x4BitMask, WASM_GET_LOCAL(value1)));
FOR_INT32_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xE : 2;
CHECK_EQ(actual, expected);
}
}
#endif // V8_TARGET_ARCH_ARM64
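
The expected values in these tests follow directly from the lane setup:
lane 0 is forced to 0 (bit 0 clear), lane 1 to -1 (bit 1 set), and every
remaining lane carries the sign of the splatted input. A small sketch of
that arithmetic in plain C++ (the helper is illustrative, not part of the
test infrastructure):

// Expected mask for a vector with lane 0 = 0, lane 1 = -1, and lanes
// 2..lane_count-1 all equal to the input value.
int32_t ExpectedBitMask(int lane_count, bool input_is_negative) {
  int32_t all_lanes = (1 << lane_count) - 1;          // e.g. 0xFFFF for i8x16
  return input_is_negative ? (all_lanes & ~1) : 0x2;  // 0xFFFE/0xFE/0xE vs 2
}
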
WASM_SIMD_TEST(I8x16Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
// Set up a global to hold output vector.