[wasm-simd] Implement v8x16.swizzle for x64

Bug: v8:8460
Change-Id: I79ae753f15aaa91a2154bd7078a1cdb9f3e049f1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1822497
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64201}
This commit is contained in:
Ng Zhi An 2019-10-09 10:25:33 -07:00 committed by Commit Bot
parent c4d90a74e4
commit 3fdc88defb
16 changed files with 157 additions and 0 deletions

View File

@ -1822,6 +1822,16 @@ void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
}
}
// Emits a doubleword shuffle of |src| into |dst| controlled by |shuffle|,
// choosing the VEX-encoded vpshufd when the CPU supports AVX and the legacy
// pshufd encoding otherwise.
void TurboAssembler::Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // Legacy encoding path: AVX must not be enabled on this assembler.
    DCHECK(!IsEnabled(AVX));
    pshufd(dst, src, shuffle);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpshufd(dst, src, shuffle);
}
void TurboAssembler::Lzcntl(Register dst, Register src) {
if (CpuFeatures::IsSupported(LZCNT)) {
CpuFeatureScope scope(this, LZCNT);

View File

@ -154,6 +154,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Sqrtsd, sqrtsd)
AVX_OP(Ucomiss, ucomiss)
AVX_OP(Ucomisd, ucomisd)
AVX_OP(Pshufb, pshufb)
AVX_OP(Paddusb, paddusb)
#undef AVX_OP
@ -375,6 +377,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pslld(XMMRegister dst, byte imm8);
void Psrld(XMMRegister dst, byte imm8);
void Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle);
void CompareRoot(Register with, RootIndex index);
void CompareRoot(Operand with, RootIndex index);

View File

@ -2151,6 +2151,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitS128Not(node);
case IrOpcode::kS128Select:
return MarkAsSimd128(node), VisitS128Select(node);
case IrOpcode::kS8x16Swizzle:
return MarkAsSimd128(node), VisitS8x16Swizzle(node);
case IrOpcode::kS8x16Shuffle:
return MarkAsSimd128(node), VisitS8x16Shuffle(node);
case IrOpcode::kS1x2AnyTrue:
@ -2666,6 +2668,7 @@ void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitS8x16Swizzle(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }

View File

@ -3580,6 +3580,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ xorps(dst, i.InputSimd128Register(2));
break;
}
case kX64S8x16Swizzle: {
// Byte swizzle: permutes the bytes of input 0 using the per-byte indices held
// in input 1. The SSSE3 feature scope is needed for the pshufb emitted below.
CpuFeatureScope sse_scope(tasm(), SSSE3);
// The output register is expected to alias input 0, so dst already holds the
// bytes to be permuted.
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister dst = i.OutputSimd128Register();
XMMRegister mask = i.TempSimd128Register(0);
// Out-of-range indices must produce 0. pshufb zeroes a lane whenever the top
// bit of its control byte is set, so add 0x70 (112) to every index using
// unsigned saturating addition: indices 0..15 become 0x70..0x7F (top bit
// clear, low nibble unchanged), while any index > 15 becomes >= 0x80 (top bit
// set), clamping at 0xFF rather than wrapping around.
__ Move(mask, static_cast<uint32_t>(0x70707070));
// Shuffle control 0 replicates the low doubleword, so that every byte of
// mask holds 0x70.
__ Pshufd(mask, mask, 0x0);
__ Paddusb(mask, i.InputSimd128Register(1));
__ Pshufb(dst, mask);
break;
}
case kX64S8x16Shuffle: {
XMMRegister dst = i.OutputSimd128Register();
Register tmp = i.TempRegister(0);

View File

@ -306,6 +306,7 @@ namespace compiler {
V(X64S128Or) \
V(X64S128Xor) \
V(X64S128Select) \
V(X64S8x16Swizzle) \
V(X64S8x16Shuffle) \
V(X64S32x4Swizzle) \
V(X64S32x4Shuffle) \

View File

@ -281,6 +281,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64S1x4AllTrue:
case kX64S1x8AnyTrue:
case kX64S1x8AllTrue:
case kX64S8x16Swizzle:
case kX64S8x16Shuffle:
case kX64S32x4Swizzle:
case kX64S32x4Shuffle:

View File

@ -3306,6 +3306,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
Emit(opcode, 1, &dst, input_count, inputs, temp_count, temps);
}
// Selects the x64 byte-swizzle instruction for |node|. The result is defined
// in the same register as input 0, input 1 is placed in a unique register,
// and one SIMD temp register is reserved for the code generator.
void InstructionSelector::VisitS8x16Swizzle(Node* node) {
  X64OperandGenerator g(this);
  InstructionOperand scratch[] = {g.TempSimd128Register()};
  InstructionOperand result = g.DefineSameAsFirst(node);
  InstructionOperand table = g.UseRegister(node->InputAt(0));
  InstructionOperand indices = g.UseUniqueRegister(node->InputAt(1));
  Emit(kX64S8x16Swizzle, result, table, indices, arraysize(scratch), scratch);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {

View File

@ -402,6 +402,7 @@ MachineType AtomicOpType(Operator const* op) {
V(S1x8AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(S1x16AnyTrue, Operator::kNoProperties, 1, 0, 1) \
V(S1x16AllTrue, Operator::kNoProperties, 1, 0, 1) \
V(S8x16Swizzle, Operator::kNoProperties, 2, 0, 1) \
V(StackPointerGreaterThan, Operator::kNoProperties, 1, 0, 1)
// The format is:

View File

@ -644,6 +644,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* S128Not();
const Operator* S128Select();
const Operator* S8x16Swizzle();
const Operator* S8x16Shuffle(const uint8_t shuffle[16]);
const Operator* S1x2AnyTrue();

View File

@ -916,6 +916,7 @@
V(S128Or) \
V(S128Xor) \
V(S128Select) \
V(S8x16Swizzle) \
V(S8x16Shuffle) \
V(S1x2AnyTrue) \
V(S1x2AllTrue) \

View File

@ -211,6 +211,7 @@ void SimdScalarLowering::LowerGraph() {
V(I8x16LeS) \
V(I8x16LtU) \
V(I8x16LeU) \
V(S8x16Swizzle) \
V(S8x16Shuffle)
MachineType SimdScalarLowering::MachineTypeFrom(SimdType simdType) {
@ -1392,6 +1393,45 @@ void SimdScalarLowering::LowerNode(Node* node) {
ReplaceNode(node, rep_node, num_lanes);
break;
}
case IrOpcode::kS8x16Swizzle: {
// Scalar lowering of swizzle: spill every lane of input 0 into a stack slot,
// then compute each output lane by loading the byte addressed by the matching
// index lane of input 1, or by producing 0 when that index is out of range.
DCHECK_EQ(2, node->InputCount());
Node** rep_left = GetReplacementsWithType(node->InputAt(0), rep_type);
Node** indices = GetReplacementsWithType(node->InputAt(1), rep_type);
Node** rep_nodes = zone()->NewArray<Node*>(num_lanes);
Node* stack_slot = graph()->NewNode(
machine()->StackSlot(MachineRepresentation::kSimd128));
// Store all num_lanes lane values into the stack slot, one byte each, at
// byte offset i for lane i.
const Operator* store_op = machine()->Store(
StoreRepresentation(MachineRepresentation::kWord8, kNoWriteBarrier));
Node* effect_input = graph()->start();
for (int i = num_lanes - 1; i >= 0; i--) {
// All stores must happen before any of the loads below, so chain them
// through their effect inputs: each store takes the store created in the
// previous iteration (the first one takes graph start) as its effect input.
Node* store =
graph()->NewNode(store_op, stack_slot, mcgraph_->Int32Constant(i),
rep_left[i], effect_input, graph()->start());
effect_input = store;
}
// effect_input now refers to the last store in the chain; every load below
// uses it as its effect input.
for (int i = num_lanes - 1; i >= 0; i--) {
// Only select a lane when the index is < num_lanes, otherwise produce 0 for
// that lane. The unsigned comparison also rejects negative indices.
Diamond d(graph(), common(),
graph()->NewNode(machine()->Uint32LessThan(), indices[i],
mcgraph_->Int32Constant(num_lanes)));
// The load is control-dependent on the in-range branch of the diamond.
Node* load =
graph()->NewNode(machine()->Load(LoadRepresentation::Uint8()),
stack_slot, indices[i], effect_input, d.if_true);
rep_nodes[i] = d.Phi(MachineRepresentation::kWord8, load,
mcgraph_->Int32Constant(0));
}
ReplaceNode(node, rep_nodes, num_lanes);
break;
}
case IrOpcode::kS8x16Shuffle: {
DCHECK_EQ(2, node->InputCount());
const uint8_t* shuffle = S8x16ShuffleOf(node->op());

View File

@ -4472,6 +4472,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
return graph()->NewNode(mcgraph()->machine()->S1x16AnyTrue(), inputs[0]);
case wasm::kExprS1x16AllTrue:
return graph()->NewNode(mcgraph()->machine()->S1x16AllTrue(), inputs[0]);
case wasm::kExprS8x16Swizzle:
return graph()->NewNode(mcgraph()->machine()->S8x16Swizzle(), inputs[0],
inputs[1]);
default:
FATAL_UNSUPPORTED_OPCODE(opcode);
}

View File

@ -2629,6 +2629,18 @@ class ThreadImpl {
ADD_HORIZ_CASE(F32x4AddHoriz, f32x4, float4, 4)
ADD_HORIZ_CASE(I16x8AddHoriz, i16x8, int8, 8)
#undef ADD_HORIZ_CASE
case kExprS8x16Swizzle: {
  // Operands are popped in reverse order: the index vector was pushed last,
  // the vector of bytes to select from was pushed first.
  int16 selectors = Pop().to_s128().to_i8x16();
  int16 table = Pop().to_s128().to_i8x16();
  int16 result;
  for (size_t i = 0; i < kSimd128Size; ++i) {
    // Widen the selector to int so that a negative lane value is rejected by
    // the range check below.
    int index = selectors.val[LANE(i, table)];
    bool in_range = index < kSimd128Size && index >= 0;
    // Out-of-range selectors yield 0 for that lane.
    result.val[LANE(i, table)] = in_range ? table.val[LANE(index, table)] : 0;
  }
  Push(WasmValue(Simd128(result)));
  return true;
}
case kExprS8x16Shuffle: {
Simd8x16ShuffleImmediate<Decoder::kNoValidate> imm(decoder,
code->at(pc));

View File

@ -306,6 +306,7 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_S128_OP(Xor, "xor")
CASE_S128_OP(Not, "not")
CASE_S128_OP(Select, "select")
CASE_S8x16_OP(Swizzle, "swizzle")
CASE_S8x16_OP(Shuffle, "shuffle")
CASE_S1x2_OP(AnyTrue, "any_true")
CASE_S1x2_OP(AllTrue, "all_true")

View File

@ -420,6 +420,7 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(I32x4UConvertF32x4, 0xfdac, s_s) \
V(F32x4SConvertI32x4, 0xfdaf, s_s) \
V(F32x4UConvertI32x4, 0xfdb0, s_s) \
V(S8x16Swizzle, 0xfdc0, s_ss) \
V(I8x16SConvertI16x8, 0xfdc6, s_ss) \
V(I8x16UConvertI16x8, 0xfdc7, s_ss) \
V(I16x8SConvertI32x4, 0xfdc8, s_ss) \

View File

@ -2687,6 +2687,62 @@ WASM_SIMD_TEST(S8x16Concat) {
}
}
#ifdef V8_TARGET_ARCH_X64
// One swizzle test case: the bytes to select from, the per-lane selector
// indices, and the bytes the swizzle is expected to produce.
struct SwizzleTestArgs {
// Written to the first operand global of the test.
const Shuffle input;
// Written to the second operand global of the test.
const Shuffle indices;
// Compared lane by lane against the result global after the call.
const Shuffle expected;
};
// Swizzle test cases. In every case expected[i] equals input[indices[i]] when
// indices[i] is within [0, 15], and 0 otherwise.
static constexpr SwizzleTestArgs swizzle_test_args[] = {
// Descending input selected with descending indices yields ascending bytes.
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
// Indices alternating between the two ends of the input.
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
{15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7},
{0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8}},
// Even indices 0..30: the first eight are in range, the last eight are not
// and therefore select 0.
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
{15, 13, 11, 9, 7, 5, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0}},
// All indices are out of range (16 or above, or negative), so every lane of
// the result is 0.
{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{16, 17, 18, 19, 20, 124, 125, 126, 127, -1, -2, -3, -4, -5, -6, -7},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}};
// View over the table above, iterated by the S8x16Swizzle test.
static constexpr Vector<const SwizzleTestArgs> swizzle_test_vector =
ArrayVector(swizzle_test_args);
WASM_SIMD_TEST(S8x16Swizzle) {
  // RunBinaryLaneOpTest fills its two globals with the consecutive integers
  // [0-15] and [16-31]. Neither is a useful index vector for swizzle: [0-15]
  // makes the operation a no-op and [16-31] produces all 0s. This test
  // therefore drives the operation from an explicit table of cases.
  WasmRunner<int32_t> r(execution_tier, lower_simd);
  static const int kLanes = kSimd128Size / sizeof(uint8_t);
  // Global 0 receives the result; globals 1 and 2 hold the two operands.
  uint8_t* result = r.builder().AddGlobal<uint8_t>(kWasmS128);
  uint8_t* table = r.builder().AddGlobal<uint8_t>(kWasmS128);
  uint8_t* selectors = r.builder().AddGlobal<uint8_t>(kWasmS128);
  BUILD(
      r,
      WASM_SET_GLOBAL(0, WASM_SIMD_BINOP(kExprS8x16Swizzle, WASM_GET_GLOBAL(1),
                                         WASM_GET_GLOBAL(2))),
      WASM_ONE);

  for (const SwizzleTestArgs& args : swizzle_test_vector) {
    // Load this case's operands into the two source globals.
    for (int lane = 0; lane < kLanes; ++lane) {
      WriteLittleEndianValue<uint8_t>(&table[lane], args.input[lane]);
      WriteLittleEndianValue<uint8_t>(&selectors[lane], args.indices[lane]);
    }

    CHECK_EQ(1, r.Call());

    // Every lane of the result global must match the expected bytes.
    for (int lane = 0; lane < kLanes; ++lane) {
      CHECK_EQ(ReadLittleEndianValue<uint8_t>(&result[lane]),
               args.expected[lane]);
    }
  }
}
#endif // V8_TARGET_ARCH_X64
// Combine 3 shuffles a, b, and c by applying both a and b and then applying c
// to those two results.
Shuffle Combine(const Shuffle& a, const Shuffle& b, const Shuffle& c) {