[wasm-simd] Support returning Simd128 on caller's stack

In Liftoff, we were missing kS128 cases in the stack load/store helpers.

For the x64 and ARM64 instruction selectors, the calculation of
reverse_slot was incorrect for 128-bit values (a sketch follows this
list):

- reverse_slot += 2 (the size of a 128-bit value: 2 pointers) happened
  before the Peek was emitted
- this copies from slot -2 into the register
- but the value starts at slot -1; it occupies slots -1 and -2
- we end up copying slot -2 (the most significant half) into the
  register, and also slot -3, which is where rsi was stored (the Wasm
  instance address)
- the test ends up with a different result every time
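
To make the off-by-one concrete, here is a minimal standalone sketch
(not V8 code; the slot numbering follows the bullets above, and the
helper name is hypothetical):

  // A 16-byte load "at slot n" also covers slot n - 1, the next slot
  // further away from the frame pointer.
  #include <cstdio>

  void CoverageOfS128LoadAt(int slot) {
    std::printf("16-byte load at slot %d covers slots %d and %d\n", slot,
                slot, slot - 1);
  }

  int main() {
    CoverageOfS128LoadAt(-2);  // buggy: covers -2 and -3; slot -3 holds
                               // the Wasm instance address (rsi)
    CoverageOfS128LoadAt(-1);  // fixed: covers -1 and -2, the whole value
    return 0;
  }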

The calculation of reverse_slot is changed so that all architectures
follow the same scheme, emitting the Peek at the slot where the value
starts (which is what ia32 and ARM already effectively did):

- start with reverse_slot = 1 in the instruction selector
- emit the Peek for the current reverse_slot (the code-generator now
  uses the incoming slot as-is instead of adding 1)
- only after emitting the Peek, advance reverse_slot by the result's
  size in pointers (2 for a 128-bit value); see the sketch after this
  list
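
A minimal standalone sketch of the fixed walk (not V8 code; the result
sizes are hypothetical, e.g. two word-sized results followed by two
128-bit results):

  #include <cstdio>
  #include <vector>

  int main() {
    std::vector<int> sizes_in_pointers = {1, 1, 2, 2};
    int reverse_slot = 1;  // start at 1; the code generator no longer adds 1
    for (int size : sizes_in_pointers) {
      // Emit the Peek at the slot where this result *starts*...
      std::printf("peek at reverse_slot %d (size %d)\n", reverse_slot, size);
      // ...and advance only afterwards, so a 2-slot value is read from its
      // first slot rather than one past it.
      reverse_slot += size;
    }
    return 0;
  }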

The fixes for x64 and ARM64 are in both the instruction-selector and
the code-generator.

ia32 and ARM didn't support kSimd128 values here yet; it was only a
missing case in the code-generator, so add that in.

For ARM, the codegen is more involved: vld1 does not support addressing
with an offset, so we have to add the offset to fp in a scratch
register first.

Also add a test for returning multiple v128 values. V128 is not exposed
to JavaScript, so we use a Wasm function call and then an involved
chain of extract-lanes, returning 10 i32 values which we then verify.
For each v128 we extract the first and last lane of the i32x4 value, in
order to catch bugs where we write or read a wrong stack slot (off by
1). Note that the expected values come back in reverse order (4 down to
0), because each local.tee consumes the value on top of the Wasm value
stack, which is the last return value; see the sketch after this
paragraph.
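
A standalone sketch of that expectation (not part of the test; it just
reproduces the values checked by the assertEquals at the bottom of the
new test file):

  // The callee returns i32x4.splat(0) .. i32x4.splat(4); the caller pops
  // from the top of the value stack (splat(4) first) and extracts lanes
  // 0 and 3 of each value.
  #include <cstdio>

  int main() {
    for (int v = 4; v >= 0; --v) {
      std::printf("%d %d ", v, v);  // lane 0 and lane 3 of i32x4.splat(v)
    }
    std::printf("\n");  // prints: 4 4 3 3 2 2 1 1 0 0
    return 0;
  }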

The simd-scalar-lowering for kCall previously handled only a single
s128 return. We adopt the way i64-lowering handles kCall, so that it
can now handle any kind of call with s128 in the descriptor; see the
sketch after this paragraph.
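
A standalone illustration of the index remapping (not V8 code; it
mirrors the GetReturnIndexAfterLowering logic in the diff below for a
hypothetical call returning (i32, s128, f64) — each s128 expands into
kNumLanes32 (4) word-32 returns, shifting every later return index by 3):

  #include <cstdio>
  #include <vector>

  enum class Rep { kWord32, kFloat64, kSimd128 };
  constexpr int kNumLanes32 = 4;

  int GetReturnIndexAfterLowering(const std::vector<Rep>& returns,
                                  int old_index) {
    int result = old_index;
    for (int i = 0; i < old_index; ++i) {
      if (returns[i] == Rep::kSimd128) result += kNumLanes32 - 1;
    }
    return result;
  }

  int main() {
    std::vector<Rep> returns = {Rep::kWord32, Rep::kSimd128, Rep::kFloat64};
    for (int i = 0; i < 3; ++i) {
      std::printf("return %d -> projection %d\n", i,
                  GetReturnIndexAfterLowering(returns, i));
    }
    // Prints 0 -> 0, 1 -> 1, 2 -> 5: the s128 occupies projections 1..4.
    return 0;
  }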

Bug: v8:10794
Bug: chromium:1115230
Change-Id: I2ccdd55f6292bc5794be78053b27e14da8cce70e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2355189
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69439}
Authored by Ng Zhi An, 2020-08-17 13:29:48 -07:00; committed by Commit Bot
Parent: a575608555
Commit: 360c9294a8
12 changed files with 206 additions and 46 deletions

src/compiler/backend/arm/code-generator-arm.cc

@@ -1828,17 +1828,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmPeek: {
-      // The incoming value is 0-based, but we need a 1-based value.
-      int reverse_slot = i.InputInt32(0) + 1;
+      int reverse_slot = i.InputInt32(0);
       int offset =
           FrameSlotToFPOffset(frame()->GetTotalFrameSlotCount() - reverse_slot);
       if (instr->OutputAt(0)->IsFPRegister()) {
         LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ vldr(i.OutputDoubleRegister(), MemOperand(fp, offset));
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
+        } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ vldr(i.OutputFloatRegister(), MemOperand(fp, offset));
+        } else {
+          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+          UseScratchRegisterScope temps(tasm());
+          Register scratch = temps.Acquire();
+          __ add(scratch, fp, Operand(offset));
+          __ vld1(Neon8, NeonListOperand(i.OutputSimd128Register()),
+                  NeonMemOperand(scratch));
         }
       } else {
         __ ldr(i.OutputRegister(), MemOperand(fp, offset));

src/compiler/backend/arm/instruction-selector-arm.cc

@@ -1667,7 +1667,7 @@ void InstructionSelector::EmitPrepareResults(
                                              Node* node) {
   ArmOperandGenerator g(this);

-  int reverse_slot = 0;
+  int reverse_slot = 1;
   for (PushParameter output : *results) {
     if (!output.location.IsCallerFrameSlot()) continue;
     // Skip any alignment holes in nodes.
@@ -1677,6 +1677,8 @@ void InstructionSelector::EmitPrepareResults(
       MarkAsFloat32(output.node);
     } else if (output.location.GetType() == MachineType::Float64()) {
       MarkAsFloat64(output.node);
+    } else if (output.location.GetType() == MachineType::Simd128()) {
+      MarkAsSimd128(output.node);
     }
     Emit(kArmPeek, g.DefineAsRegister(output.node),
          g.UseImmediate(reverse_slot));

src/compiler/backend/arm64/code-generator-arm64.cc

@@ -1358,9 +1358,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ Ldr(i.OutputDoubleRegister(), MemOperand(fp, offset));
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
+        } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ Ldr(i.OutputFloatRegister(), MemOperand(fp, offset));
+        } else {
+          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+          __ Ldr(i.OutputSimd128Register(), MemOperand(fp, offset));
         }
       } else {
         __ Ldr(i.OutputRegister(), MemOperand(fp, offset));

src/compiler/backend/arm64/instruction-selector-arm64.cc

@@ -1909,22 +1909,25 @@ void InstructionSelector::EmitPrepareResults(
                                              Node* node) {
   Arm64OperandGenerator g(this);

-  int reverse_slot = 0;
+  int reverse_slot = 1;
   for (PushParameter output : *results) {
     if (!output.location.IsCallerFrameSlot()) continue;
-    reverse_slot += output.location.GetSizeInPointers();
     // Skip any alignment holes in nodes.
-    if (output.node == nullptr) continue;
-    DCHECK(!call_descriptor->IsCFunctionCall());
-    if (output.location.GetType() == MachineType::Float32()) {
-      MarkAsFloat32(output.node);
-    } else if (output.location.GetType() == MachineType::Float64()) {
-      MarkAsFloat64(output.node);
+    if (output.node != nullptr) {
+      DCHECK(!call_descriptor->IsCFunctionCall());
+      if (output.location.GetType() == MachineType::Float32()) {
+        MarkAsFloat32(output.node);
+      } else if (output.location.GetType() == MachineType::Float64()) {
+        MarkAsFloat64(output.node);
+      } else if (output.location.GetType() == MachineType::Simd128()) {
+        MarkAsSimd128(output.node);
+      }
+      Emit(kArm64Peek, g.DefineAsRegister(output.node),
+           g.UseImmediate(reverse_slot));
     }
-    Emit(kArm64Peek, g.DefineAsRegister(output.node),
-         g.UseImmediate(reverse_slot));
+    reverse_slot += output.location.GetSizeInPointers();
   }
 }

src/compiler/backend/ia32/code-generator-ia32.cc

@@ -1845,16 +1845,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32Peek: {
-      int reverse_slot = i.InputInt32(0) + 1;
+      int reverse_slot = i.InputInt32(0);
       int offset =
           FrameSlotToFPOffset(frame()->GetTotalFrameSlotCount() - reverse_slot);
       if (instr->OutputAt(0)->IsFPRegister()) {
         LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ movsd(i.OutputDoubleRegister(), Operand(ebp, offset));
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
+        } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ movss(i.OutputFloatRegister(), Operand(ebp, offset));
+        } else {
+          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+          __ movdqu(i.OutputSimd128Register(), Operand(ebp, offset));
         }
       } else {
         __ mov(i.OutputRegister(), Operand(ebp, offset));

src/compiler/backend/ia32/instruction-selector-ia32.cc

@@ -1269,7 +1269,7 @@ void InstructionSelector::EmitPrepareResults(
                                              Node* node) {
   IA32OperandGenerator g(this);

-  int reverse_slot = 0;
+  int reverse_slot = 1;
   for (PushParameter output : *results) {
     if (!output.location.IsCallerFrameSlot()) continue;
     // Skip any alignment holes in nodes.
@@ -1279,6 +1279,8 @@ void InstructionSelector::EmitPrepareResults(
       MarkAsFloat32(output.node);
     } else if (output.location.GetType() == MachineType::Float64()) {
       MarkAsFloat64(output.node);
+    } else if (output.location.GetType() == MachineType::Simd128()) {
+      MarkAsSimd128(output.node);
     }
     Emit(kIA32Peek, g.DefineAsRegister(output.node),
          g.UseImmediate(reverse_slot));

src/compiler/backend/x64/code-generator-x64.cc

@@ -2357,9 +2357,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ Movsd(i.OutputDoubleRegister(), Operand(rbp, offset));
-        } else {
-          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
+        } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ Movss(i.OutputFloatRegister(), Operand(rbp, offset));
+        } else {
+          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
+          __ Movdqu(i.OutputSimd128Register(), Operand(rbp, offset));
         }
       } else {
         __ movq(i.OutputRegister(), Operand(rbp, offset));

src/compiler/backend/x64/instruction-selector-x64.cc

@@ -1685,21 +1685,24 @@ void InstructionSelector::EmitPrepareResults(
                                              Node* node) {
   X64OperandGenerator g(this);

-  int reverse_slot = 0;
+  int reverse_slot = 1;
   for (PushParameter output : *results) {
     if (!output.location.IsCallerFrameSlot()) continue;
-    reverse_slot += output.location.GetSizeInPointers();
     // Skip any alignment holes in nodes.
-    if (output.node == nullptr) continue;
-    DCHECK(!call_descriptor->IsCFunctionCall());
-    if (output.location.GetType() == MachineType::Float32()) {
-      MarkAsFloat32(output.node);
-    } else if (output.location.GetType() == MachineType::Float64()) {
-      MarkAsFloat64(output.node);
+    if (output.node != nullptr) {
+      DCHECK(!call_descriptor->IsCFunctionCall());
+      if (output.location.GetType() == MachineType::Float32()) {
+        MarkAsFloat32(output.node);
+      } else if (output.location.GetType() == MachineType::Float64()) {
+        MarkAsFloat64(output.node);
+      } else if (output.location.GetType() == MachineType::Simd128()) {
+        MarkAsSimd128(output.node);
+      }
+      InstructionOperand result = g.DefineAsRegister(output.node);
+      InstructionOperand slot = g.UseImmediate(reverse_slot);
+      Emit(kX64Peek, 1, &result, 1, &slot);
     }
-    InstructionOperand result = g.DefineAsRegister(output.node);
-    InstructionOperand slot = g.UseImmediate(reverse_slot);
-    Emit(kX64Peek, 1, &result, 1, &slot);
+    reverse_slot += output.location.GetSizeInPointers();
   }
 }

src/compiler/simd-scalar-lowering.cc

@@ -405,6 +405,24 @@ static int GetReturnCountAfterLoweringSimd128(
   return result;
 }

+int GetReturnIndexAfterLowering(const CallDescriptor* call_descriptor,
+                                int old_index) {
+  int result = old_index;
+  for (int i = 0; i < old_index; ++i) {
+    if (call_descriptor->GetReturnType(i).representation() ==
+        MachineRepresentation::kSimd128) {
+      result += kNumLanes32 - 1;
+    }
+  }
+  return result;
+}
+
+static int GetReturnCountAfterLoweringSimd128(
+    const CallDescriptor* call_descriptor) {
+  return GetReturnIndexAfterLowering(
+      call_descriptor, static_cast<int>(call_descriptor->ReturnCount()));
+}
+
 int SimdScalarLowering::NumLanes(SimdType type) {
   int num_lanes = 0;
   if (type == SimdType::kFloat64x2 || type == SimdType::kInt64x2) {
@@ -1182,23 +1200,46 @@ void SimdScalarLowering::LowerNode(Node* node) {
       // TODO(turbofan): Make wasm code const-correct wrt. CallDescriptor.
       auto call_descriptor =
           const_cast<CallDescriptor*>(CallDescriptorOf(node->op()));
-      if (DefaultLowering(node) ||
-          (call_descriptor->ReturnCount() == 1 &&
-           call_descriptor->GetReturnType(0) == MachineType::Simd128())) {
+      bool returns_require_lowering =
+          GetReturnCountAfterLoweringSimd128(call_descriptor) !=
+          static_cast<int>(call_descriptor->ReturnCount());
+      if (DefaultLowering(node) || returns_require_lowering) {
         // We have to adjust the call descriptor.
         const Operator* op = common()->Call(
             GetI32WasmCallDescriptorForSimd(zone(), call_descriptor));
         NodeProperties::ChangeOp(node, op);
       }
-      if (call_descriptor->ReturnCount() == 1 &&
-          call_descriptor->GetReturnType(0) == MachineType::Simd128()) {
-        // We access the additional return values through projections.
-        Node* rep_node[kNumLanes32];
-        for (int i = 0; i < kNumLanes32; ++i) {
-          rep_node[i] =
-              graph()->NewNode(common()->Projection(i), node, graph()->start());
-        }
-        ReplaceNode(node, rep_node, kNumLanes32);
+      if (!returns_require_lowering) {
+        break;
+      }
+
+      size_t return_arity = call_descriptor->ReturnCount();
+      ZoneVector<Node*> projections(return_arity, zone());
+      NodeProperties::CollectValueProjections(node, projections.data(),
+                                              return_arity);
+      for (size_t old_index = 0, new_index = 0; old_index < return_arity;
+           ++old_index, ++new_index) {
+        Node* use_node = projections[old_index];
+        DCHECK_EQ(ProjectionIndexOf(use_node->op()), old_index);
+        DCHECK_EQ(GetReturnIndexAfterLowering(call_descriptor,
+                                              static_cast<int>(old_index)),
+                  static_cast<int>(new_index));
+        if (new_index != old_index) {
+          NodeProperties::ChangeOp(use_node, common()->Projection(new_index));
+        }
+        if (call_descriptor->GetReturnType(old_index).representation() ==
+            MachineRepresentation::kSimd128) {
+          Node* rep_node[kNumLanes32];
+          for (int i = 0; i < kNumLanes32; ++i) {
+            rep_node[i] = graph()->NewNode(common()->Projection(new_index + i),
+                                           node, graph()->start());
+          }
+          ReplaceNode(use_node, rep_node, kNumLanes32);
+          new_index += kNumLanes32 - 1;
+        }
       }
       break;
     }

src/wasm/baseline/ia32/liftoff-assembler-ia32.h

@@ -79,6 +79,9 @@ inline void Store(LiftoffAssembler* assm, Register base, int32_t offset,
     case ValueType::kF64:
       assm->movsd(dst, src.fp());
       break;
+    case ValueType::kS128:
+      assm->movdqu(dst, src.fp());
+      break;
     default:
       UNREACHABLE();
   }

src/wasm/baseline/x64/liftoff-assembler-x64.h

@@ -93,6 +93,9 @@ inline void Store(LiftoffAssembler* assm, Operand dst, LiftoffRegister src,
     case ValueType::kF64:
      assm->Movsd(dst, src.fp());
       break;
+    case ValueType::kS128:
+      assm->Movdqu(dst, src.fp());
+      break;
     default:
       UNREACHABLE();
   }

test/mjsunit/wasm/multi-value-simd.js

@@ -0,0 +1,92 @@
+// Copyright 2017 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --experimental-wasm-simd --experimental-wasm-mv
+
+load("test/mjsunit/wasm/wasm-module-builder.js");
+
+(function MultiReturnS128Test() {
+  print("MultiReturnS128Test");
+  // Most backends only support 2 fp return registers, so the third v128
+  // onwards will be written to caller stack slots.
+  let builder = new WasmModuleBuilder();
+  let sig_v_sssss = builder.addType(
+      makeSig([], [kWasmS128, kWasmS128, kWasmS128, kWasmS128, kWasmS128]));
+  let sig_iiiiiiiiii_v = builder.addType(
+      makeSig([], [kWasmI32, kWasmI32, kWasmI32, kWasmI32, kWasmI32, kWasmI32,
+                   kWasmI32, kWasmI32, kWasmI32, kWasmI32]));
+
+  let callee = builder.addFunction("callee", sig_v_sssss)
+      .addBody([
+        kExprI32Const, 0,
+        kSimdPrefix, kExprI32x4Splat,
+        kExprI32Const, 1,
+        kSimdPrefix, kExprI32x4Splat,
+        kExprI32Const, 2,
+        kSimdPrefix, kExprI32x4Splat,
+        kExprI32Const, 3,
+        kSimdPrefix, kExprI32x4Splat,
+        kExprI32Const, 4,
+        kSimdPrefix, kExprI32x4Splat,
+        kExprReturn]);
+
+  // For each v128 on the stack, we return the first and last lane. This
+  // helps catch bugs with reading/writing the wrong stack slots.
+  builder.addFunction("main", sig_iiiiiiiiii_v)
+      .addLocals({"i32_count": 10, "s128_count": 1})
+      .addBody([
+        kExprCallFunction, callee.index,
+        kExprLocalTee, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 0,
+        kExprLocalSet, 0,
+        kExprLocalGet, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 3,
+        kExprLocalSet, 1,
+        kExprLocalTee, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 0,
+        kExprLocalSet, 2,
+        kExprLocalGet, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 3,
+        kExprLocalSet, 3,
+        kExprLocalTee, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 0,
+        kExprLocalSet, 4,
+        kExprLocalGet, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 3,
+        kExprLocalSet, 5,
+        kExprLocalTee, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 0,
+        kExprLocalSet, 6,
+        kExprLocalGet, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 3,
+        kExprLocalSet, 7,
+        kExprLocalTee, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 0,
+        kExprLocalSet, 8,
+        kExprLocalGet, 10,
+        kSimdPrefix, kExprI32x4ExtractLane, 3,
+        kExprLocalSet, 9,
+        // Return all the stored locals.
+        kExprLocalGet, 0,
+        kExprLocalGet, 1,
+        kExprLocalGet, 2,
+        kExprLocalGet, 3,
+        kExprLocalGet, 4,
+        kExprLocalGet, 5,
+        kExprLocalGet, 6,
+        kExprLocalGet, 7,
+        kExprLocalGet, 8,
+        kExprLocalGet, 9,
+      ])
+      .exportAs("main");
+
+  let module = new WebAssembly.Module(builder.toBuffer());
+  let instance = new WebAssembly.Instance(module);
+  assertEquals(instance.exports.main(), [4, 4, 3, 3, 2, 2, 1, 1, 0, 0]);
+})();