[wasm] Implement loop peeling

We implement loop peeling for wasm, currently available behind a flag. Loops are peeled regardless of size. Bug: v8:11510 Change-Id: Ia4c883abdee83df632b2611584d608c44e3295c8 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3367615 Reviewed-by: Andreas Haas <ahaas@chromium.org> Reviewed-by: Tobias Tebbi <tebbi@chromium.org> Commit-Queue: Manos Koukoutos <manoskouk@chromium.org> Cr-Commit-Position: refs/heads/main@{#78496}
2022-01-05 15:30:02 +00:00 · 2022-01-05 15:30:02 +00:00 · 8e9d8e1783
commit 8e9d8e1783
parent 457827106a
11 changed files with 275 additions and 20 deletions
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -2249,6 +2249,7 @@ filegroup(
            "src/compiler/wasm-compiler.h",
            "src/compiler/wasm-escape-analysis.h",
            "src/compiler/wasm-inlining.h",
+            "src/compiler/wasm-loop-peeling.h",
            "src/debug/debug-wasm-objects.cc",
            "src/debug/debug-wasm-objects.h",
            "src/debug/debug-wasm-objects-inl.h",
@ -2634,6 +2635,7 @@ filegroup(
        ":is_v8_enable_webassembly": [
            "src/compiler/int64-lowering.cc",
            "src/compiler/wasm-compiler.cc",
+            "src/compiler/wasm-loop-peeling.cc",
            "src/compiler/wasm-escape-analysis.cc",
            "src/compiler/wasm-inlining.cc",
        ],
--- a/BUILD.gn
+++ b/BUILD.gn
@ -3435,6 +3435,7 @@ v8_header_set("v8_internal_headers") {
      "src/compiler/wasm-compiler.h",
      "src/compiler/wasm-escape-analysis.h",
      "src/compiler/wasm-inlining.h",
+      "src/compiler/wasm-loop-peeling.h",
      "src/debug/debug-wasm-objects-inl.h",
      "src/debug/debug-wasm-objects.h",
      "src/trap-handler/trap-handler-internal.h",
@ -3907,6 +3908,7 @@ if (v8_enable_webassembly) {
    "src/compiler/wasm-compiler.cc",
    "src/compiler/wasm-escape-analysis.cc",
    "src/compiler/wasm-inlining.cc",
+    "src/compiler/wasm-loop-peeling.cc",
  ]
 }

--- a/src/compiler/loop-analysis.cc
+++ b/src/compiler/loop-analysis.cc
@ -551,7 +551,7 @@ LoopTree* LoopFinder::BuildLoopTree(Graph* graph, TickCounter* tick_counter,
 #if V8_ENABLE_WEBASSEMBLY
 // static
 ZoneUnorderedSet<Node*>* LoopFinder::FindSmallInnermostLoopFromHeader(
-    Node* loop_header, Zone* zone, size_t max_size) {
+    Node* loop_header, Zone* zone, size_t max_size, bool calls_are_large) {
  auto* visited = zone->New<ZoneUnorderedSet<Node*>>(zone);
  std::vector<Node*> queue;

@ -594,13 +594,19 @@ ZoneUnorderedSet<Node*>* LoopFinder::FindSmallInnermostLoopFromHeader(
        }
        // All uses are outside the loop, do nothing.
        break;
+      // If {calls_are_large}, call nodes are considered to have unbounded size,
+      // i.e. >max_size, with the exception of certain wasm builtins.
      case IrOpcode::kTailCall:
      case IrOpcode::kJSWasmCall:
      case IrOpcode::kJSCall:
-        // Call nodes are considered to have unbounded size, i.e. >max_size,
-        // with the exception of certain wasm builtins.
-        return nullptr;
+        if (calls_are_large) return nullptr;
+        ENQUEUE_USES(use, true)
+        break;
      case IrOpcode::kCall: {
+        if (!calls_are_large) {
+          ENQUEUE_USES(use, true);
+          break;
+        }
        Node* callee = node->InputAt(0);
        if (callee->opcode() != IrOpcode::kRelocatableInt32Constant &&
            callee->opcode() != IrOpcode::kRelocatableInt64Constant) {
--- a/src/compiler/loop-analysis.h
+++ b/src/compiler/loop-analysis.h
@ -186,11 +186,11 @@ class V8_EXPORT_PRIVATE LoopFinder {
  // marked with LoopExit, LoopExitEffect, LoopExitValue, or End nodes.
  // Returns {nullptr} if
  // 1) the loop size (in graph nodes) exceeds {max_size},
-  // 2) a function call is found in the loop, excluding calls to a set of wasm
-  //    builtins,
+  // 2) {calls_are_large} and a function call is found in the loop, excluding
+  //    calls to a set of wasm builtins,
  // 3) a nested loop is found in the loop.
  static ZoneUnorderedSet<Node*>* FindSmallInnermostLoopFromHeader(
-      Node* loop_header, Zone* zone, size_t max_size);
+      Node* loop_header, Zone* zone, size_t max_size, bool calls_are_large);
 #endif
 };

@ -198,7 +198,7 @@ class V8_EXPORT_PRIVATE LoopFinder {
 class NodeCopier {
 public:
  // {max}: The maximum number of nodes that this copier will track, including
-  //        The original nodes and all copies.
+  //        the original nodes and all copies.
  // {p}: A vector that holds the original nodes and all copies.
  // {copy_count}: How many times the nodes should be copied.
  NodeCopier(Graph* graph, uint32_t max, NodeVector* p, uint32_t copy_count)
--- a/src/compiler/pipeline.cc
+++ b/src/compiler/pipeline.cc
@ -99,6 +99,7 @@
 #include "src/compiler/wasm-compiler.h"
 #include "src/compiler/wasm-escape-analysis.h"
 #include "src/compiler/wasm-inlining.h"
+#include "src/compiler/wasm-loop-peeling.h"
 #include "src/wasm/function-body-decoder.h"
 #include "src/wasm/function-compiler.h"
 #include "src/wasm/wasm-engine.h"
@ -1680,6 +1681,24 @@ struct WasmInliningPhase {
  }
 };

+namespace {
+void EliminateLoopExits(std::vector<compiler::WasmLoopInfo>* loop_infos) {
+  for (WasmLoopInfo& info : *loop_infos) {
+    // The exits must not be mutated while iterating uses(), so gather every
+    // LoopExit use of the header first and eliminate them afterwards.
+    std::unordered_set<Node*> pending_exits;
+    for (Node* user : info.header->uses()) {
+      if (user->opcode() != IrOpcode::kLoopExit) continue;
+      pending_exits.insert(user);
+    }
+    for (Node* loop_exit : pending_exits) {
+      LoopPeeler::EliminateLoopExit(loop_exit);
+    }
+  }
+}
+}  // namespace
+
 struct WasmLoopUnrollingPhase {
  DECL_PIPELINE_PHASE_CONSTANTS(WasmLoopUnrolling)

@ -1692,7 +1711,7 @@ struct WasmLoopUnrollingPhase {
                loop_info.header, temp_zone,
                // Only discover the loop until its size is the maximum unrolled
                // size for its depth.
-                maximum_unrollable_size(loop_info.nesting_depth));
+                maximum_unrollable_size(loop_info.nesting_depth), true);
        if (loop == nullptr) continue;
        UnrollLoop(loop_info.header, loop, loop_info.nesting_depth,
                   data->graph(), data->common(), temp_zone,
@ -1700,19 +1719,28 @@ struct WasmLoopUnrollingPhase {
      }
    }

+    EliminateLoopExits(loop_infos);
+  }
+};
+
+struct WasmLoopPeelingPhase {
+  DECL_PIPELINE_PHASE_CONSTANTS(WasmLoopPeeling)
+
+  void Run(PipelineData* data, Zone* temp_zone,
+           std::vector<compiler::WasmLoopInfo>* loop_infos) {
    for (WasmLoopInfo& loop_info : *loop_infos) {
-      std::unordered_set<Node*> loop_exits;
-      // We collect exits into a set first because we are not allowed to mutate
-      // them while iterating uses().
-      for (Node* use : loop_info.header->uses()) {
-        if (use->opcode() == IrOpcode::kLoopExit) {
-          loop_exits.insert(use);
-        }
-      }
-      for (Node* use : loop_exits) {
-        LoopPeeler::EliminateLoopExit(use);
+      if (loop_info.can_be_innermost) {
+        ZoneUnorderedSet<Node*>* loop =
+            LoopFinder::FindSmallInnermostLoopFromHeader(
+                loop_info.header, temp_zone, std::numeric_limits<size_t>::max(),
+                false);  // No size cap; calls do not disqualify a loop.
+        if (loop == nullptr) continue;
+        PeelWasmLoop(loop_info.header, loop, data->graph(), data->common(),
+                     temp_zone, data->source_positions(), data->node_origins());
      }
    }
+    // If unrolling runs afterwards, keep the loop exits; it eliminates them.
+    if (!FLAG_wasm_loop_unrolling) EliminateLoopExits(loop_infos);
  }
 };
 #endif  // V8_ENABLE_WEBASSEMBLY
@ -3249,6 +3277,10 @@ void Pipeline::GenerateCodeForWasmFunction(
                                    loop_info);
    pipeline.RunPrintAndVerify(WasmInliningPhase::phase_name(), true);
  }
+  if (FLAG_wasm_loop_peeling) {
+    pipeline.Run<WasmLoopPeelingPhase>(loop_info);
+    pipeline.RunPrintAndVerify(WasmLoopPeelingPhase::phase_name(), true);
+  }
  if (FLAG_wasm_loop_unrolling) {
    pipeline.Run<WasmLoopUnrollingPhase>(loop_info);
    pipeline.RunPrintAndVerify(WasmLoopUnrollingPhase::phase_name(), true);
--- a/src/compiler/wasm-loop-peeling.cc
+++ b/src/compiler/wasm-loop-peeling.cc
@ -0,0 +1,133 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/compiler/wasm-loop-peeling.h"
+
+#include "src/base/small-vector.h"
+#include "src/codegen/tick-counter.h"
+#include "src/compiler/common-operator.h"
+#include "src/compiler/loop-analysis.h"
+#include "src/compiler/loop-peeling.h"
+
+namespace v8 {
+namespace internal {
+namespace compiler {
+
+void PeelWasmLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, Graph* graph,
+                  CommonOperatorBuilder* common, Zone* tmp_zone,
+                  SourcePositionTable* source_positions,
+                  NodeOriginTable* node_origins) {
+  DCHECK_EQ(loop_node->opcode(), IrOpcode::kLoop);
+  DCHECK_NOT_NULL(loop);
+  // Without a back-edge into the header this is not really a loop; bail out.
+  if (loop_node->InputCount() < 2) return;
+
+  uint32_t copied_size = static_cast<uint32_t>(loop->size()) * 2;
+
+  NodeVector copied_nodes(tmp_zone);
+
+  NodeCopier copier(graph, copied_size, &copied_nodes, 1);  // One copy.
+  source_positions->AddDecorator();
+  copier.CopyNodes(graph, tmp_zone, graph->NewNode(common->Dead()),
+                   base::make_iterator_range(loop->begin(), loop->end()),
+                   source_positions, node_origins);
+  source_positions->RemoveDecorator();
+
+  Node* peeled_iteration_header = copier.map(loop_node);
+
+  // Copied graph terminators without uses must be connected to the graph's end
+  // node. Terminate copies are skipped here; Step 1 kills them instead.
+  for (Node* node : copied_nodes) {
+    if (IrOpcode::IsGraphTerminator(node->opcode()) &&
+        node->opcode() != IrOpcode::kTerminate && node->UseCount() == 0) {
+      NodeProperties::MergeControlToEnd(graph, common, node);
+    }
+  }
+
+  // Step 1: Merge each exit of the main loop with that of the peeled iteration.
+  for (Node* node : loop_node->uses()) {
+    // We do not need the Terminate node for the peeled iteration.
+    if (node->opcode() == IrOpcode::kTerminate) {
+      copier.map(node)->Kill();
+      continue;
+    }
+    if (node->opcode() != IrOpcode::kLoopExit) continue;
+    DCHECK_EQ(node->InputAt(1), loop_node);
+    // Create a merge node for the peeled iteration and main loop. Skip the
+    // LoopExit node in the peeled iteration, use its control input instead.
+    Node* merge_node =
+        graph->NewNode(common->Merge(2), node, copier.map(node)->InputAt(0));
+    // Redirect the uses of the loop exit, depending on where they live.
+    for (Edge use_edge : node->use_edges()) {
+      Node* use = use_edge.from();
+      if (loop->count(use) == 1) {
+        // Uses within the loop will be LoopExitEffects and LoopExitValues.
+        // Those are used by nodes outside the loop. We need to create phis from
+        // the main loop and peeled iteration to replace loop exits.
+        DCHECK(use->opcode() == IrOpcode::kLoopExitEffect ||
+               use->opcode() == IrOpcode::kLoopExitValue);
+        const Operator* phi_operator =
+            use->opcode() == IrOpcode::kLoopExitEffect
+                ? common->EffectPhi(2)
+                : common->Phi(LoopExitValueRepresentationOf(use->op()), 2);
+        Node* phi = graph->NewNode(phi_operator, use,
+                                   copier.map(use)->InputAt(0), merge_node);
+        use->ReplaceUses(phi);
+        // ReplaceUses() above also redirected phi's own input 0; restore it.
+        phi->ReplaceInput(0, use);
+        copier.map(use)->Kill();
+      } else if (use != merge_node) {
+        // For uses outside the loop, simply redirect them to the merge.
+        use->ReplaceInput(use_edge.index(), merge_node);
+      }
+    }
+    copier.map(node)->Kill();
+  }
+
+  // Step 2: The peeled iteration is not a loop anymore. Any control uses of
+  // its loop header should now point to its non-recursive input. Any phi uses
+  // should use the value coming from outside the loop.
+  for (Edge use_edge : peeled_iteration_header->use_edges()) {
+    if (NodeProperties::IsPhi(use_edge.from())) {
+      use_edge.from()->ReplaceUses(use_edge.from()->InputAt(0));
+    } else {
+      use_edge.UpdateTo(loop_node->InputAt(0));
+    }
+  }
+
+  // We are now left with an unconnected subgraph of the peeled Loop node and
+  // its phi uses.
+
+  // Step 3: Rewire the peeled iteration to flow into the main loop.
+
+  // We are reusing the Loop node of the peeled iteration and its phis as the
+  // merge and phis which flow from the peeled iteration into the main loop.
+  // First, remove the non-recursive input.
+  peeled_iteration_header->RemoveInput(0);
+  NodeProperties::ChangeOp(
+      peeled_iteration_header,
+      common->Merge(peeled_iteration_header->InputCount()));
+
+  // Likewise drop the non-recursive input of each phi and resize its op.
+  for (Edge use_edge : peeled_iteration_header->use_edges()) {
+    DCHECK(NodeProperties::IsPhi(use_edge.from()));
+    use_edge.from()->RemoveInput(0);
+    const Operator* phi = common->ResizeMergeOrPhi(
+        use_edge.from()->op(),
+        use_edge.from()->InputCount() - /* control input */ 1);
+    NodeProperties::ChangeOp(use_edge.from(), phi);
+  }
+
+  // In the main loop, change inputs to the merge and phis above.
+  loop_node->ReplaceInput(0, peeled_iteration_header);
+  for (Edge use_edge : loop_node->use_edges()) {
+    if (NodeProperties::IsPhi(use_edge.from())) {
+      use_edge.from()->ReplaceInput(0, copier.map(use_edge.from()));
+    }
+  }
+}
+
+}  // namespace compiler
+}  // namespace internal
+}  // namespace v8
--- a/src/compiler/wasm-loop-peeling.h
+++ b/src/compiler/wasm-loop-peeling.h
@ -0,0 +1,33 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if !V8_ENABLE_WEBASSEMBLY
+#error This header should only be included if WebAssembly is enabled.
+#endif  // !V8_ENABLE_WEBASSEMBLY
+
+#ifndef V8_COMPILER_WASM_LOOP_PEELING_H_
+#define V8_COMPILER_WASM_LOOP_PEELING_H_
+
+#include "src/compiler/common-operator.h"
+#include "src/compiler/loop-analysis.h"
+
+namespace v8 {
+namespace internal {
+namespace compiler {
+
+// Peels the first iteration off the loop headed by {loop_node}, whose nodes
+// are given in {loop}: the body is copied once, and the copy (the "peeled
+// iteration") runs before the main loop. This enables a kind of loop hoisting:
+// repeated side-effect-free computations in the body can be computed in the
+// peeled iteration only and reused in the following iterations.
+void PeelWasmLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, Graph* graph,
+                  CommonOperatorBuilder* common, Zone* tmp_zone,
+                  SourcePositionTable* source_positions,
+                  NodeOriginTable* node_origins);
+
+}  // namespace compiler
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_COMPILER_WASM_LOOP_PEELING_H_
--- a/src/flags/flag-definitions.h
+++ b/src/flags/flag-definitions.h
@ -1104,6 +1104,7 @@ DEFINE_NEG_IMPLICATION(liftoff_only, wasm_speculative_inlining)

 DEFINE_BOOL(wasm_loop_unrolling, true,
            "enable loop unrolling for wasm functions")
+DEFINE_BOOL(wasm_loop_peeling, false, "enable loop peeling for wasm functions")
 DEFINE_BOOL(wasm_fuzzer_gen_test, false,
            "generate a test case when running a wasm fuzzer")
 DEFINE_IMPLICATION(wasm_fuzzer_gen_test, single_threaded)
--- a/src/logging/runtime-call-stats.h
+++ b/src/logging/runtime-call-stats.h
@ -372,6 +372,7 @@ class RuntimeCallTimer final {
  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, VerifyGraph)                     \
  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmBaseOptimization)            \
  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmInlining)                    \
+  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmLoopPeeling)                 \
  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmLoopUnrolling)               \
  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmOptimization)                \
                                                                            \
--- a/src/wasm/graph-builder-interface.cc
+++ b/src/wasm/graph-builder-interface.cc
@ -1310,7 +1310,9 @@ class WasmGraphBuildingInterface {
  //   different nodes during inlining. These are Return and TailCall nodes.
  // - After IfFailure nodes.
  // - When exiting a loop through Delegate.
-  bool emit_loop_exits() { return FLAG_wasm_loop_unrolling; }
+  bool emit_loop_exits() {
+    return FLAG_wasm_loop_unrolling || FLAG_wasm_loop_peeling;
+  }

  void GetNodes(TFNode** nodes, Value* values, size_t count) {
    for (size_t i = 0; i < count; ++i) {
--- a/test/mjsunit/wasm/inlining.js
+++ b/test/mjsunit/wasm/inlining.js
@ -62,6 +62,49 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
  assertEquals(10, instance.exports.main(10));
 })();

+(function LoopInLoopTest() {
+  print(arguments.callee.name);
+  let builder = new WasmModuleBuilder();
+
+  let fact = builder.addFunction("fact", kSig_i_i)
+    .addLocals(kWasmI32, 1)
+    .addBody([// result = 1;
+              kExprI32Const, 1, kExprLocalSet, 1,
+              kExprLoop, kWasmVoid,
+                kExprLocalGet, 1,
+                // if input == 1 return result;
+                kExprLocalGet, 0, kExprI32Const, 1, kExprI32Eq, kExprBrIf, 1,
+                // result *= input;
+                kExprLocalGet, 0, kExprI32Mul, kExprLocalSet, 1,
+                // input -= 1;
+                kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub,
+                kExprLocalSet, 0,
+                kExprBr, 0,
+              kExprEnd,
+              kExprUnreachable]);
+
+  builder.addFunction("main", kSig_i_i)
+    .addLocals(kWasmI32, 1)
+    .addBody([
+      kExprLoop, kWasmVoid,
+        kExprLocalGet, 1,
+        // if input == 0 return sum;
+        kExprLocalGet, 0, kExprI32Const, 0, kExprI32Eq, kExprBrIf, 1,
+        // sum += fact(input);
+        kExprLocalGet, 0, kExprCallFunction, fact.index,
+        kExprI32Add, kExprLocalSet, 1,
+        // input -= 1;
+        kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub,
+        kExprLocalSet, 0,
+        kExprBr, 0,
+      kExprEnd,
+      kExprUnreachable])
+    .exportAs("main");
+
+  let instance = builder.instantiate();
+  assertEquals(33, instance.exports.main(4));  // 4! + 3! + 2! + 1! = 33
+})();
+
 (function InfiniteLoopTest() {
  print(arguments.callee.name);
  let builder = new WasmModuleBuilder();