[wasm] Implement loop peeling

We implement loop peeling for wasm, currently available behind a flag.
Loops are peeled regardless of size.

Bug: v8:11510
Change-Id: Ia4c883abdee83df632b2611584d608c44e3295c8
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3367615
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Manos Koukoutos <manoskouk@chromium.org>
Cr-Commit-Position: refs/heads/main@{#78496}
This commit is contained in:
Manos Koukoutos 2022-01-05 15:30:02 +00:00 committed by V8 LUCI CQ
parent 457827106a
commit 8e9d8e1783
11 changed files with 275 additions and 20 deletions

View File

@ -2249,6 +2249,7 @@ filegroup(
"src/compiler/wasm-compiler.h",
"src/compiler/wasm-escape-analysis.h",
"src/compiler/wasm-inlining.h",
"src/compiler/wasm-loop-peeling.h",
"src/debug/debug-wasm-objects.cc",
"src/debug/debug-wasm-objects.h",
"src/debug/debug-wasm-objects-inl.h",
@ -2634,6 +2635,7 @@ filegroup(
":is_v8_enable_webassembly": [
"src/compiler/int64-lowering.cc",
"src/compiler/wasm-compiler.cc",
"src/compiler/wasm-loop-peeling.cc",
"src/compiler/wasm-escape-analysis.cc",
"src/compiler/wasm-inlining.cc",
],

View File

@ -3435,6 +3435,7 @@ v8_header_set("v8_internal_headers") {
"src/compiler/wasm-compiler.h",
"src/compiler/wasm-escape-analysis.h",
"src/compiler/wasm-inlining.h",
"src/compiler/wasm-loop-peeling.h",
"src/debug/debug-wasm-objects-inl.h",
"src/debug/debug-wasm-objects.h",
"src/trap-handler/trap-handler-internal.h",
@ -3907,6 +3908,7 @@ if (v8_enable_webassembly) {
"src/compiler/wasm-compiler.cc",
"src/compiler/wasm-escape-analysis.cc",
"src/compiler/wasm-inlining.cc",
"src/compiler/wasm-loop-peeling.cc",
]
}

View File

@ -551,7 +551,7 @@ LoopTree* LoopFinder::BuildLoopTree(Graph* graph, TickCounter* tick_counter,
#if V8_ENABLE_WEBASSEMBLY
// static
ZoneUnorderedSet<Node*>* LoopFinder::FindSmallInnermostLoopFromHeader(
Node* loop_header, Zone* zone, size_t max_size) {
Node* loop_header, Zone* zone, size_t max_size, bool calls_are_large) {
auto* visited = zone->New<ZoneUnorderedSet<Node*>>(zone);
std::vector<Node*> queue;
@ -594,13 +594,19 @@ ZoneUnorderedSet<Node*>* LoopFinder::FindSmallInnermostLoopFromHeader(
}
// All uses are outside the loop, do nothing.
break;
// If {calls_are_large}, call nodes are considered to have unbounded size,
// i.e. >max_size, with the exception of certain wasm builtins.
case IrOpcode::kTailCall:
case IrOpcode::kJSWasmCall:
case IrOpcode::kJSCall:
// Call nodes are considered to have unbounded size, i.e. >max_size,
// with the exception of certain wasm builtins.
return nullptr;
if (calls_are_large) return nullptr;
ENQUEUE_USES(use, true)
break;
case IrOpcode::kCall: {
if (!calls_are_large) {
ENQUEUE_USES(use, true);
break;
}
Node* callee = node->InputAt(0);
if (callee->opcode() != IrOpcode::kRelocatableInt32Constant &&
callee->opcode() != IrOpcode::kRelocatableInt64Constant) {

View File

@ -186,11 +186,11 @@ class V8_EXPORT_PRIVATE LoopFinder {
// marked with LoopExit, LoopExitEffect, LoopExitValue, or End nodes.
// Returns {nullptr} if
// 1) the loop size (in graph nodes) exceeds {max_size},
// 2) a function call is found in the loop, excluding calls to a set of wasm
// builtins,
// 2) {calls_are_large} and a function call is found in the loop, excluding
// calls to a set of wasm builtins,
// 3) a nested loop is found in the loop.
static ZoneUnorderedSet<Node*>* FindSmallInnermostLoopFromHeader(
Node* loop_header, Zone* zone, size_t max_size);
Node* loop_header, Zone* zone, size_t max_size, bool calls_are_large);
#endif
};
@ -198,7 +198,7 @@ class V8_EXPORT_PRIVATE LoopFinder {
class NodeCopier {
public:
// {max}: The maximum number of nodes that this copier will track, including
// The original nodes and all copies.
// the original nodes and all copies.
// {p}: A vector that holds the original nodes and all copies.
// {copy_count}: How many times the nodes should be copied.
NodeCopier(Graph* graph, uint32_t max, NodeVector* p, uint32_t copy_count)

View File

@ -99,6 +99,7 @@
#include "src/compiler/wasm-compiler.h"
#include "src/compiler/wasm-escape-analysis.h"
#include "src/compiler/wasm-inlining.h"
#include "src/compiler/wasm-loop-peeling.h"
#include "src/wasm/function-body-decoder.h"
#include "src/wasm/function-compiler.h"
#include "src/wasm/wasm-engine.h"
@ -1680,6 +1681,24 @@ struct WasmInliningPhase {
}
};
namespace {
// Eliminates every LoopExit node that uses the header of one of the loops
// described in {loop_infos}.
void EliminateLoopExits(std::vector<compiler::WasmLoopInfo>* loop_infos) {
  for (WasmLoopInfo& info : *loop_infos) {
    // The exits must not be mutated while uses() is being iterated, so they
    // are gathered into a set first and eliminated in a second pass.
    std::unordered_set<Node*> pending_exits;
    for (Node* user : info.header->uses()) {
      if (user->opcode() != IrOpcode::kLoopExit) continue;
      pending_exits.insert(user);
    }
    for (Node* loop_exit : pending_exits) {
      LoopPeeler::EliminateLoopExit(loop_exit);
    }
  }
}
} // namespace
struct WasmLoopUnrollingPhase {
DECL_PIPELINE_PHASE_CONSTANTS(WasmLoopUnrolling)
@ -1692,7 +1711,7 @@ struct WasmLoopUnrollingPhase {
loop_info.header, temp_zone,
// Only discover the loop until its size is the maximum unrolled
// size for its depth.
maximum_unrollable_size(loop_info.nesting_depth));
maximum_unrollable_size(loop_info.nesting_depth), true);
if (loop == nullptr) continue;
UnrollLoop(loop_info.header, loop, loop_info.nesting_depth,
data->graph(), data->common(), temp_zone,
@ -1700,19 +1719,28 @@ struct WasmLoopUnrollingPhase {
}
}
EliminateLoopExits(loop_infos);
}
};
struct WasmLoopPeelingPhase {
DECL_PIPELINE_PHASE_CONSTANTS(WasmLoopPeeling)
void Run(PipelineData* data, Zone* temp_zone,
std::vector<compiler::WasmLoopInfo>* loop_infos) {
for (WasmLoopInfo& loop_info : *loop_infos) {
std::unordered_set<Node*> loop_exits;
// We collect exits into a set first because we are not allowed to mutate
// them while iterating uses().
for (Node* use : loop_info.header->uses()) {
if (use->opcode() == IrOpcode::kLoopExit) {
loop_exits.insert(use);
}
}
for (Node* use : loop_exits) {
LoopPeeler::EliminateLoopExit(use);
if (loop_info.can_be_innermost) {
ZoneUnorderedSet<Node*>* loop =
LoopFinder::FindSmallInnermostLoopFromHeader(
loop_info.header, temp_zone, std::numeric_limits<size_t>::max(),
false);
if (loop == nullptr) continue;
PeelWasmLoop(loop_info.header, loop, data->graph(), data->common(),
temp_zone, data->source_positions(), data->node_origins());
}
}
// If we are going to unroll later, keep loop exits.
if (!FLAG_wasm_loop_unrolling) EliminateLoopExits(loop_infos);
}
};
#endif // V8_ENABLE_WEBASSEMBLY
@ -3249,6 +3277,10 @@ void Pipeline::GenerateCodeForWasmFunction(
loop_info);
pipeline.RunPrintAndVerify(WasmInliningPhase::phase_name(), true);
}
if (FLAG_wasm_loop_peeling) {
pipeline.Run<WasmLoopPeelingPhase>(loop_info);
pipeline.RunPrintAndVerify(WasmLoopPeelingPhase::phase_name(), true);
}
if (FLAG_wasm_loop_unrolling) {
pipeline.Run<WasmLoopUnrollingPhase>(loop_info);
pipeline.RunPrintAndVerify(WasmLoopUnrollingPhase::phase_name(), true);

View File

@ -0,0 +1,133 @@
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/compiler/wasm-loop-peeling.h"
#include "src/base/small-vector.h"
#include "src/codegen/tick-counter.h"
#include "src/compiler/common-operator.h"
#include "src/compiler/loop-analysis.h"
#include "src/compiler/loop-peeling.h"
namespace v8 {
namespace internal {
namespace compiler {
// Peels one iteration off the loop headed by {loop_node}.
//
// The nodes in {loop} are copied once to form the "peeled iteration". The
// exits of the peeled iteration are merged with the exits of the original
// loop, and the peeled iteration is rewired so that it flows into the original
// loop in place of the loop's former entry input.
//
// {loop_node}: header of the loop; must be a Loop node.
// {loop}: the set of nodes belonging to the loop; must not be null.
// {tmp_zone}: zone backing the temporary vector of copied nodes and the copy.
// {source_positions}, {node_origins}: forwarded to the node copier.
//
// Returns without changing the graph if {loop_node} has fewer than two inputs.
void PeelWasmLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, Graph* graph,
CommonOperatorBuilder* common, Zone* tmp_zone,
SourcePositionTable* source_positions,
NodeOriginTable* node_origins) {
DCHECK_EQ(loop_node->opcode(), IrOpcode::kLoop);
DCHECK_NOT_NULL(loop);
// No back-jump to the loop header means this is not really a loop.
if (loop_node->InputCount() < 2) return;
// The copier tracks the original nodes plus one copy of each, hence twice the
// size of the loop.
uint32_t copied_size = static_cast<uint32_t>(loop->size()) * 2;
NodeVector copied_nodes(tmp_zone);
NodeCopier copier(graph, copied_size, &copied_nodes, 1);
// NOTE(review): the decorator is presumably what attaches source positions to
// the nodes created during the copy -- confirm against SourcePositionTable.
source_positions->AddDecorator();
copier.CopyNodes(graph, tmp_zone, graph->NewNode(common->Dead()),
base::make_iterator_range(loop->begin(), loop->end()),
source_positions, node_origins);
source_positions->RemoveDecorator();
// The copy of the loop header; it heads the peeled iteration.
Node* peeled_iteration_header = copier.map(loop_node);
// The terminator nodes in the copies need to get connected to the graph's end
// node, except Terminate nodes which will be deleted anyway.
for (Node* node : copied_nodes) {
if (IrOpcode::IsGraphTerminator(node->opcode()) &&
node->opcode() != IrOpcode::kTerminate && node->UseCount() == 0) {
NodeProperties::MergeControlToEnd(graph, common, node);
}
}
// Step 1: Create merges for loop exits.
for (Node* node : loop_node->uses()) {
// We do not need the Terminate node for the peeled iteration.
if (node->opcode() == IrOpcode::kTerminate) {
copier.map(node)->Kill();
continue;
}
if (node->opcode() != IrOpcode::kLoopExit) continue;
DCHECK_EQ(node->InputAt(1), loop_node);
// Create a merge node for the peeled iteration and main loop. Skip the
// LoopExit node in the peeled iteration, use its control input instead.
Node* merge_node =
graph->NewNode(common->Merge(2), node, copier.map(node)->InputAt(0));
// Replace all uses of the loop exit with the merge node.
for (Edge use_edge : node->use_edges()) {
Node* use = use_edge.from();
if (loop->count(use) == 1) {
// Uses within the loop will be LoopExitEffects and LoopExitValues.
// Those are used by nodes outside the loop. We need to create phis from
// the main loop and peeled iteration to replace loop exits.
DCHECK(use->opcode() == IrOpcode::kLoopExitEffect ||
use->opcode() == IrOpcode::kLoopExitValue);
const Operator* phi_operator =
use->opcode() == IrOpcode::kLoopExitEffect
? common->EffectPhi(2)
: common->Phi(LoopExitValueRepresentationOf(use->op()), 2);
// Inputs: value from the main loop, value from the peeled iteration (taken
// from before its own loop-exit marker), and the merge as control.
Node* phi = graph->NewNode(phi_operator, use,
copier.map(use)->InputAt(0), merge_node);
// This also redirects the phi's own first input, since the phi is itself a
// use of {use}; that input is restored right below.
use->ReplaceUses(phi);
// Fix the input of phi we just broke.
phi->ReplaceInput(0, use);
copier.map(use)->Kill();
} else if (use != merge_node) {
// For uses outside the loop, simply redirect them to the merge.
use->ReplaceInput(use_edge.index(), merge_node);
}
}
// The copied LoopExit is bypassed by the merge above and no longer needed.
copier.map(node)->Kill();
}
// Step 2: The peeled iteration is not a loop anymore. Any control uses of
// its loop header should now point to its non-recursive input. Any phi uses
// should use the value coming from outside the loop.
for (Edge use_edge : peeled_iteration_header->use_edges()) {
if (NodeProperties::IsPhi(use_edge.from())) {
use_edge.from()->ReplaceUses(use_edge.from()->InputAt(0));
} else {
use_edge.UpdateTo(loop_node->InputAt(0));
}
}
// We are now left with an unconnected subgraph of the peeled Loop node and
// its phi uses.
// Step 3: Rewire the peeled iteration to flow into the main loop.
// We are reusing the Loop node of the peeled iteration and its phis as the
// merge and phis which flow from the peeled iteration into the main loop.
// First, remove the non-recursive input.
peeled_iteration_header->RemoveInput(0);
NodeProperties::ChangeOp(
peeled_iteration_header,
common->Merge(peeled_iteration_header->InputCount()));
// Likewise remove the non-recursive input from each phi of the former loop
// header and shrink the phi's operator to the new input count.
for (Edge use_edge : peeled_iteration_header->use_edges()) {
DCHECK(NodeProperties::IsPhi(use_edge.from()));
use_edge.from()->RemoveInput(0);
const Operator* phi = common->ResizeMergeOrPhi(
use_edge.from()->op(),
use_edge.from()->InputCount() - /* control input */ 1);
NodeProperties::ChangeOp(use_edge.from(), phi);
}
// In the main loop, change inputs to the merge and phis above.
loop_node->ReplaceInput(0, peeled_iteration_header);
for (Edge use_edge : loop_node->use_edges()) {
if (NodeProperties::IsPhi(use_edge.from())) {
use_edge.from()->ReplaceInput(0, copier.map(use_edge.from()));
}
}
}
} // namespace compiler
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,33 @@
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif // !V8_ENABLE_WEBASSEMBLY
#ifndef V8_COMPILER_WASM_LOOP_PEELING_H_
#define V8_COMPILER_WASM_LOOP_PEELING_H_
#include "src/compiler/common-operator.h"
#include "src/compiler/loop-analysis.h"
namespace v8 {
namespace internal {
namespace compiler {
// Loop peeling is an optimization that copies the body of a loop, creating
// a new copy of the body called the "peeled iteration" that represents the
// first iteration. It enables a kind of loop hoisting: repeated computations
// without side-effects in the body of the loop can be computed in the first
// iteration only and reused in the next iterations.
//
// {loop_node}: the header of the loop to peel; must be a Loop node.
// {loop}: the set of nodes that make up the loop; must not be null.
// {graph}, {common}: used to create the new nodes and operators.
// {tmp_zone}: zone for temporary data used while peeling.
// {source_positions}, {node_origins}: passed on to the node copier that
// duplicates the loop's nodes.
//
// Does nothing if {loop_node} has fewer than two inputs, i.e. there is no
// back-jump to the loop header.
void PeelWasmLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, Graph* graph,
CommonOperatorBuilder* common, Zone* tmp_zone,
SourcePositionTable* source_positions,
NodeOriginTable* node_origins);
} // namespace compiler
} // namespace internal
} // namespace v8
#endif // V8_COMPILER_WASM_LOOP_PEELING_H_

View File

@ -1104,6 +1104,7 @@ DEFINE_NEG_IMPLICATION(liftoff_only, wasm_speculative_inlining)
DEFINE_BOOL(wasm_loop_unrolling, true,
"enable loop unrolling for wasm functions")
DEFINE_BOOL(wasm_loop_peeling, false, "enable loop peeling for wasm functions")
DEFINE_BOOL(wasm_fuzzer_gen_test, false,
"generate a test case when running a wasm fuzzer")
DEFINE_IMPLICATION(wasm_fuzzer_gen_test, single_threaded)

View File

@ -372,6 +372,7 @@ class RuntimeCallTimer final {
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, VerifyGraph) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmBaseOptimization) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmInlining) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmLoopPeeling) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmLoopUnrolling) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmOptimization) \
\

View File

@ -1310,7 +1310,9 @@ class WasmGraphBuildingInterface {
// different nodes during inlining. These are Return and TailCall nodes.
// - After IfFailure nodes.
// - When exiting a loop through Delegate.
bool emit_loop_exits() { return FLAG_wasm_loop_unrolling; }
bool emit_loop_exits() {
return FLAG_wasm_loop_unrolling || FLAG_wasm_loop_peeling;
}
void GetNodes(TFNode** nodes, Value* values, size_t count) {
for (size_t i = 0; i < count; ++i) {

View File

@ -62,6 +62,49 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
assertEquals(10, instance.exports.main(10));
})();
// Builds a module in which the loop in "main" calls "fact" on every iteration,
// and "fact" itself contains a loop. main(n) returns
// fact(n) + fact(n - 1) + ... + fact(1).
(function LoopInLoopTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// fact(input): local 0 is the input, local 1 accumulates the result.
let fact = builder.addFunction("fact", kSig_i_i)
.addLocals(kWasmI32, 1)
.addBody([// result = 1;
kExprI32Const, 1, kExprLocalSet, 1,
kExprLoop, kWasmVoid,
kExprLocalGet, 1,
// if input == 1 return result;
kExprLocalGet, 0, kExprI32Const, 1, kExprI32Eq, kExprBrIf, 1,
// result *= input;
kExprLocalGet, 0, kExprI32Mul, kExprLocalSet, 1,
// input -= 1;
kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub,
kExprLocalSet, 0,
kExprBr, 0,
kExprEnd,
kExprUnreachable]);
// main(input): local 0 is the input, local 1 accumulates the sum.
builder.addFunction("main", kSig_i_i)
.addLocals(kWasmI32, 1)
.addBody([
kExprLoop, kWasmVoid,
kExprLocalGet, 1,
// if input == 0 return sum;
kExprLocalGet, 0, kExprI32Const, 0, kExprI32Eq, kExprBrIf, 1,
// sum += fact(input);
kExprLocalGet, 0, kExprCallFunction, fact.index,
kExprI32Add, kExprLocalSet, 1,
// input -= 1;
kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub,
kExprLocalSet, 0,
kExprBr, 0,
kExprEnd,
kExprUnreachable])
.exportAs("main");
let instance = builder.instantiate();
// 4! + 3! + 2! + 1! = 24 + 6 + 2 + 1 = 33.
assertEquals(33, instance.exports.main(4));
})();
(function InfiniteLoopTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();