[wasm-simd] Implement v8x16.swizzle for x64
Bug: v8:8460
Change-Id: I79ae753f15aaa91a2154bd7078a1cdb9f3e049f1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1822497
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64201}
This commit is contained in:
parent
c4d90a74e4
commit
3fdc88defb
@ -1822,6 +1822,16 @@ void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a shuffle of |src| into |dst| selected by the immediate |shuffle|.
// The VEX-encoded vpshufd is used when CpuFeatures reports AVX support;
// otherwise the legacy pshufd encoding is emitted.
void TurboAssembler::Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // Without AVX support the assembler must not have AVX enabled either.
    DCHECK(!IsEnabled(AVX));
    pshufd(dst, src, shuffle);
    return;
  }

  CpuFeatureScope avx_scope(this, AVX);
  vpshufd(dst, src, shuffle);
}
|
||||
|
||||
void TurboAssembler::Lzcntl(Register dst, Register src) {
|
||||
if (CpuFeatures::IsSupported(LZCNT)) {
|
||||
CpuFeatureScope scope(this, LZCNT);
|
||||
|
@ -154,6 +154,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
AVX_OP(Sqrtsd, sqrtsd)
|
||||
AVX_OP(Ucomiss, ucomiss)
|
||||
AVX_OP(Ucomisd, ucomisd)
|
||||
AVX_OP(Pshufb, pshufb)
|
||||
AVX_OP(Paddusb, paddusb)
|
||||
|
||||
#undef AVX_OP
|
||||
|
||||
@ -375,6 +377,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
void Pslld(XMMRegister dst, byte imm8);
|
||||
void Psrld(XMMRegister dst, byte imm8);
|
||||
|
||||
void Pshufd(XMMRegister dst, XMMRegister src, uint8_t shuffle);
|
||||
|
||||
void CompareRoot(Register with, RootIndex index);
|
||||
void CompareRoot(Operand with, RootIndex index);
|
||||
|
||||
|
@ -2151,6 +2151,8 @@ void InstructionSelector::VisitNode(Node* node) {
|
||||
return MarkAsSimd128(node), VisitS128Not(node);
|
||||
case IrOpcode::kS128Select:
|
||||
return MarkAsSimd128(node), VisitS128Select(node);
|
||||
case IrOpcode::kS8x16Swizzle:
|
||||
return MarkAsSimd128(node), VisitS8x16Swizzle(node);
|
||||
case IrOpcode::kS8x16Shuffle:
|
||||
return MarkAsSimd128(node), VisitS8x16Shuffle(node);
|
||||
case IrOpcode::kS1x2AnyTrue:
|
||||
@ -2666,6 +2668,7 @@ void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitS8x16Swizzle(Node* node) { UNIMPLEMENTED(); }
|
||||
#endif // !V8_TARGET_ARCH_X64
|
||||
|
||||
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
|
||||
|
@ -3580,6 +3580,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ xorps(dst, i.InputSimd128Register(2));
|
||||
break;
|
||||
}
|
||||
case kX64S8x16Swizzle: {
|
||||
CpuFeatureScope sse_scope(tasm(), SSSE3);
|
||||
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister mask = i.TempSimd128Register(0);
|
||||
|
||||
// Out-of-range indices must yield 0: adding 112 with unsigned saturation
|
||||
// moves any index > 15 to >= 128 (top bit set), so pshufb zeroes that lane.
|
||||
__ Move(mask, static_cast<uint32_t>(0x70707070));
|
||||
__ Pshufd(mask, mask, 0x0);
|
||||
__ Paddusb(mask, i.InputSimd128Register(1));
|
||||
__ Pshufb(dst, mask);
|
||||
break;
|
||||
}
|
||||
case kX64S8x16Shuffle: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
Register tmp = i.TempRegister(0);
|
||||
|
@ -306,6 +306,7 @@ namespace compiler {
|
||||
V(X64S128Or) \
|
||||
V(X64S128Xor) \
|
||||
V(X64S128Select) \
|
||||
V(X64S8x16Swizzle) \
|
||||
V(X64S8x16Shuffle) \
|
||||
V(X64S32x4Swizzle) \
|
||||
V(X64S32x4Shuffle) \
|
||||
|
@ -281,6 +281,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kX64S1x4AllTrue:
|
||||
case kX64S1x8AnyTrue:
|
||||
case kX64S1x8AllTrue:
|
||||
case kX64S8x16Swizzle:
|
||||
case kX64S8x16Shuffle:
|
||||
case kX64S32x4Swizzle:
|
||||
case kX64S32x4Shuffle:
|
||||
|
@ -3306,6 +3306,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
|
||||
Emit(opcode, 1, &dst, input_count, inputs, temp_count, temps);
|
||||
}
|
||||
|
||||
// Selects kX64S8x16Swizzle for |node|. The result is defined in the same
// register as input 0, input 1 is requested in a unique register, and one
// SIMD128 temp register is reserved for the code generator.
void InstructionSelector::VisitS8x16Swizzle(Node* node) {
  X64OperandGenerator g(this);
  InstructionOperand temps[] = {g.TempSimd128Register()};

  InstructionOperand result = g.DefineSameAsFirst(node);
  InstructionOperand first_input = g.UseRegister(node->InputAt(0));
  InstructionOperand second_input = g.UseUniqueRegister(node->InputAt(1));

  Emit(kX64S8x16Swizzle, result, first_input, second_input, arraysize(temps),
       temps);
}
|
||||
|
||||
// static
|
||||
MachineOperatorBuilder::Flags
|
||||
InstructionSelector::SupportedMachineOperatorFlags() {
|
||||
|
@ -402,6 +402,7 @@ MachineType AtomicOpType(Operator const* op) {
|
||||
V(S1x8AllTrue, Operator::kNoProperties, 1, 0, 1) \
|
||||
V(S1x16AnyTrue, Operator::kNoProperties, 1, 0, 1) \
|
||||
V(S1x16AllTrue, Operator::kNoProperties, 1, 0, 1) \
|
||||
V(S8x16Swizzle, Operator::kNoProperties, 2, 0, 1) \
|
||||
V(StackPointerGreaterThan, Operator::kNoProperties, 1, 0, 1)
|
||||
|
||||
// The format is:
|
||||
|
@ -644,6 +644,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
|
||||
const Operator* S128Not();
|
||||
const Operator* S128Select();
|
||||
|
||||
const Operator* S8x16Swizzle();
|
||||
const Operator* S8x16Shuffle(const uint8_t shuffle[16]);
|
||||
|
||||
const Operator* S1x2AnyTrue();
|
||||
|
@ -916,6 +916,7 @@
|
||||
V(S128Or) \
|
||||
V(S128Xor) \
|
||||
V(S128Select) \
|
||||
V(S8x16Swizzle) \
|
||||
V(S8x16Shuffle) \
|
||||
V(S1x2AnyTrue) \
|
||||
V(S1x2AllTrue) \
|
||||
|
@ -211,6 +211,7 @@ void SimdScalarLowering::LowerGraph() {
|
||||
V(I8x16LeS) \
|
||||
V(I8x16LtU) \
|
||||
V(I8x16LeU) \
|
||||
V(S8x16Swizzle) \
|
||||
V(S8x16Shuffle)
|
||||
|
||||
MachineType SimdScalarLowering::MachineTypeFrom(SimdType simdType) {
|
||||
@ -1392,6 +1393,45 @@ void SimdScalarLowering::LowerNode(Node* node) {
|
||||
ReplaceNode(node, rep_node, num_lanes);
|
||||
break;
|
||||
}
|
||||
case IrOpcode::kS8x16Swizzle: {
|
||||
DCHECK_EQ(2, node->InputCount());
|
||||
Node** rep_left = GetReplacementsWithType(node->InputAt(0), rep_type);
|
||||
Node** indices = GetReplacementsWithType(node->InputAt(1), rep_type);
|
||||
Node** rep_nodes = zone()->NewArray<Node*>(num_lanes);
|
||||
Node* stack_slot = graph()->NewNode(
|
||||
machine()->StackSlot(MachineRepresentation::kSimd128));
|
||||
|
||||
// Push all num_lanes values into stack slot.
|
||||
const Operator* store_op = machine()->Store(
|
||||
StoreRepresentation(MachineRepresentation::kWord8, kNoWriteBarrier));
|
||||
Node* effect_input = graph()->start();
|
||||
for (int i = num_lanes - 1; i >= 0; i--) {
|
||||
// We want all the stores to happen before any of the loads below, so
|
||||
// chain them: each store takes the previously created store as its effect.
|
||||
Node* store =
|
||||
graph()->NewNode(store_op, stack_slot, mcgraph_->Int32Constant(i),
|
||||
rep_left[i], effect_input, graph()->start());
|
||||
effect_input = store;
|
||||
}
|
||||
|
||||
for (int i = num_lanes - 1; i >= 0; i--) {
|
||||
// Only select lane when index is < num_lanes, otherwise write 0 to
|
||||
// lane. Use Uint32 to take care of negative indices.
|
||||
Diamond d(graph(), common(),
|
||||
graph()->NewNode(machine()->Uint32LessThan(), indices[i],
|
||||
mcgraph_->Int32Constant(num_lanes)));
|
||||
|
||||
Node* load =
|
||||
graph()->NewNode(machine()->Load(LoadRepresentation::Uint8()),
|
||||
stack_slot, indices[i], effect_input, d.if_true);
|
||||
|
||||
rep_nodes[i] = d.Phi(MachineRepresentation::kWord8, load,
|
||||
mcgraph_->Int32Constant(0));
|
||||
}
|
||||
|
||||
ReplaceNode(node, rep_nodes, num_lanes);
|
||||
break;
|
||||
}
|
||||
case IrOpcode::kS8x16Shuffle: {
|
||||
DCHECK_EQ(2, node->InputCount());
|
||||
const uint8_t* shuffle = S8x16ShuffleOf(node->op());
|
||||
|
@ -4472,6 +4472,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
|
||||
return graph()->NewNode(mcgraph()->machine()->S1x16AnyTrue(), inputs[0]);
|
||||
case wasm::kExprS1x16AllTrue:
|
||||
return graph()->NewNode(mcgraph()->machine()->S1x16AllTrue(), inputs[0]);
|
||||
case wasm::kExprS8x16Swizzle:
|
||||
return graph()->NewNode(mcgraph()->machine()->S8x16Swizzle(), inputs[0],
|
||||
inputs[1]);
|
||||
default:
|
||||
FATAL_UNSUPPORTED_OPCODE(opcode);
|
||||
}
|
||||
|
@ -2629,6 +2629,18 @@ class ThreadImpl {
|
||||
ADD_HORIZ_CASE(F32x4AddHoriz, f32x4, float4, 4)
|
||||
ADD_HORIZ_CASE(I16x8AddHoriz, i16x8, int8, 8)
|
||||
#undef ADD_HORIZ_CASE
|
||||
case kExprS8x16Swizzle: {
|
||||
int16 v2 = Pop().to_s128().to_i8x16();
|
||||
int16 v1 = Pop().to_s128().to_i8x16();
|
||||
int16 res;
|
||||
for (size_t i = 0; i < kSimd128Size; ++i) {
|
||||
int lane = v2.val[LANE(i, v1)];
|
||||
res.val[LANE(i, v1)] =
|
||||
lane < kSimd128Size && lane >= 0 ? v1.val[LANE(lane, v1)] : 0;
|
||||
}
|
||||
Push(WasmValue(Simd128(res)));
|
||||
return true;
|
||||
}
|
||||
case kExprS8x16Shuffle: {
|
||||
Simd8x16ShuffleImmediate<Decoder::kNoValidate> imm(decoder,
|
||||
code->at(pc));
|
||||
|
@ -306,6 +306,7 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
|
||||
CASE_S128_OP(Xor, "xor")
|
||||
CASE_S128_OP(Not, "not")
|
||||
CASE_S128_OP(Select, "select")
|
||||
CASE_S8x16_OP(Swizzle, "swizzle")
|
||||
CASE_S8x16_OP(Shuffle, "shuffle")
|
||||
CASE_S1x2_OP(AnyTrue, "any_true")
|
||||
CASE_S1x2_OP(AllTrue, "all_true")
|
||||
|
@ -420,6 +420,7 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
|
||||
V(I32x4UConvertF32x4, 0xfdac, s_s) \
|
||||
V(F32x4SConvertI32x4, 0xfdaf, s_s) \
|
||||
V(F32x4UConvertI32x4, 0xfdb0, s_s) \
|
||||
V(S8x16Swizzle, 0xfdc0, s_ss) \
|
||||
V(I8x16SConvertI16x8, 0xfdc6, s_ss) \
|
||||
V(I8x16UConvertI16x8, 0xfdc7, s_ss) \
|
||||
V(I16x8SConvertI32x4, 0xfdc8, s_ss) \
|
||||
|
@ -2687,6 +2687,62 @@ WASM_SIMD_TEST(S8x16Concat) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef V8_TARGET_ARCH_X64
|
||||
struct SwizzleTestArgs {
|
||||
const Shuffle input;
|
||||
const Shuffle indices;
|
||||
const Shuffle expected;
|
||||
};
|
||||
|
||||
static constexpr SwizzleTestArgs swizzle_test_args[] = {
|
||||
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
||||
{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
|
||||
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
||||
{15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7},
|
||||
{0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8}},
|
||||
{{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
|
||||
{15, 13, 11, 9, 7, 5, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0}},
|
||||
// all indices are out of range
|
||||
{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{16, 17, 18, 19, 20, 124, 125, 126, 127, -1, -2, -3, -4, -5, -6, -7},
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}};
|
||||
|
||||
static constexpr Vector<const SwizzleTestArgs> swizzle_test_vector =
|
||||
ArrayVector(swizzle_test_args);
|
||||
|
||||
// Runs S8x16Swizzle over every entry of swizzle_test_vector: global 1 holds
// the input bytes, global 2 holds the indices, and the result is written to
// global 0 and compared byte-by-byte against the expected vector.
WASM_SIMD_TEST(S8x16Swizzle) {
  // RunBinaryLaneOpTest sets up the two globals to be consecutive integers,
  // [0-15] and [16-31]. Using [0-15] as the indices would not sufficiently
  // test swizzle since the expected result is a no-op, and using [16-31]
  // would result in all 0s. Hence this test drives its own globals from an
  // explicit table instead.
  WasmRunner<int32_t> r(execution_tier, lower_simd);
  static const int kElems = kSimd128Size / sizeof(uint8_t);
  uint8_t* dst = r.builder().AddGlobal<uint8_t>(kWasmS128);
  uint8_t* src0 = r.builder().AddGlobal<uint8_t>(kWasmS128);
  uint8_t* src1 = r.builder().AddGlobal<uint8_t>(kWasmS128);
  BUILD(
      r,
      WASM_SET_GLOBAL(0, WASM_SIMD_BINOP(kExprS8x16Swizzle, WASM_GET_GLOBAL(1),
                                         WASM_GET_GLOBAL(2))),
      WASM_ONE);

  // Bind by const reference: each entry holds three Shuffle members and does
  // not need to be copied per iteration.
  for (const SwizzleTestArgs& si : swizzle_test_vector) {
    // Populate both operand globals for this case.
    for (int i = 0; i < kElems; i++) {
      WriteLittleEndianValue<uint8_t>(&src0[i], si.input[i]);
      WriteLittleEndianValue<uint8_t>(&src1[i], si.indices[i]);
    }

    CHECK_EQ(1, r.Call());

    // Every result byte must match the expected vector.
    for (int i = 0; i < kElems; i++) {
      CHECK_EQ(ReadLittleEndianValue<uint8_t>(&dst[i]), si.expected[i]);
    }
  }
}
|
||||
#endif // V8_TARGET_ARCH_X64
|
||||
|
||||
// Combine 3 shuffles a, b, and c by applying both a and b and then applying c
|
||||
// to those two results.
|
||||
Shuffle Combine(const Shuffle& a, const Shuffle& b, const Shuffle& c) {
|
||||
|
Loading…
Reference in New Issue
Block a user