[turboshaft] Port LateEscapeAnalysis
Bug: v8:12783
Change-Id: Id5fa026d103dc67e05322b725f34186124bc5936
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4054621
Commit-Queue: Darius Mercadier <dmercadier@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Cr-Commit-Position: refs/heads/main@{#84603}
parent: c618a17984
commit: fbcffa62b8
BUILD.bazel

@@ -2898,6 +2898,8 @@ filegroup(
         "src/compiler/turboshaft/index.h",
         "src/compiler/turboshaft/graph-visualizer.cc",
         "src/compiler/turboshaft/graph-visualizer.h",
+        "src/compiler/turboshaft/late-escape-analysis-reducer.h",
+        "src/compiler/turboshaft/late-escape-analysis-reducer.cc",
         "src/compiler/turboshaft/layered-hash-map.h",
         "src/compiler/turboshaft/machine-optimization-reducer.h",
         "src/compiler/turboshaft/memory-optimization.cc",
BUILD.gn (2 additions)
@@ -2955,6 +2955,7 @@ v8_header_set("v8_internal_headers") {
     "src/compiler/turboshaft/graph-visualizer.h",
     "src/compiler/turboshaft/graph.h",
     "src/compiler/turboshaft/index.h",
+    "src/compiler/turboshaft/late-escape-analysis-reducer.h",
     "src/compiler/turboshaft/layered-hash-map.h",
     "src/compiler/turboshaft/machine-optimization-reducer.h",
     "src/compiler/turboshaft/memory-optimization.h",
@@ -4291,6 +4292,7 @@ v8_source_set("v8_turboshaft") {
     "src/compiler/turboshaft/graph-builder.cc",
     "src/compiler/turboshaft/graph-visualizer.cc",
     "src/compiler/turboshaft/graph.cc",
+    "src/compiler/turboshaft/late-escape-analysis-reducer.cc",
    "src/compiler/turboshaft/memory-optimization.cc",
    "src/compiler/turboshaft/operations.cc",
    "src/compiler/turboshaft/optimization-phase.cc",
src/compiler/pipeline.cc

@@ -85,6 +85,7 @@
 #include "src/compiler/turboshaft/graph-builder.h"
 #include "src/compiler/turboshaft/graph-visualizer.h"
 #include "src/compiler/turboshaft/graph.h"
+#include "src/compiler/turboshaft/late-escape-analysis-reducer.h"
 #include "src/compiler/turboshaft/machine-optimization-reducer.h"
 #include "src/compiler/turboshaft/memory-optimization.h"
 #include "src/compiler/turboshaft/optimization-phase.h"
@@ -1964,8 +1965,7 @@ struct LateOptimizationPhase {
   void Run(PipelineData* data, Zone* temp_zone) {
     if (data->HasTurboshaftGraph()) {
-      // TODO(dmercadier,tebbi): port missing reducers (LateEscapeAnalysis and
-      // CommonOperatorReducer) to turboshaft.
+      // TODO(dmercadier,tebbi): add missing CommonOperatorReducer.
       turboshaft::OptimizationPhase<
           turboshaft::VariableReducer, turboshaft::BranchEliminationReducer,
           turboshaft::SelectLoweringReducer,
@@ -1994,8 +1994,8 @@ struct LateOptimizationPhase {
     JSGraphAssembler graph_assembler(data->jsgraph(), temp_zone,
                                      BranchSemantics::kMachine);
     SelectLowering select_lowering(&graph_assembler, data->graph());
-    AddReducer(data, &graph_reducer, &escape_analysis);
+    if (!v8_flags.turboshaft) {
+      AddReducer(data, &graph_reducer, &escape_analysis);
+      AddReducer(data, &graph_reducer, &branch_condition_elimination);
+    }
     AddReducer(data, &graph_reducer, &dead_code_elimination);
@@ -2094,6 +2094,7 @@ struct OptimizeTurboshaftPhase {
     UnparkedScopeIfNeeded scope(data->broker(),
                                 v8_flags.turboshaft_trace_reduction);
     turboshaft::OptimizationPhase<
+        turboshaft::LateEscapeAnalysisReducer,
         turboshaft::MemoryOptimizationReducer, turboshaft::VariableReducer,
         turboshaft::MachineOptimizationReducerSignallingNanImpossible,
         turboshaft::ValueNumberingReducer>::
src/compiler/turboshaft/assembler.h

@@ -53,6 +53,7 @@ class ReducerStack<Assembler, FirstReducer, Reducers...>
 template <class Assembler>
 class ReducerStack<Assembler> {
  public:
+  using AssemblerType = Assembler;
   Assembler& Asm() { return *static_cast<Assembler*>(this); }
 };

@@ -89,6 +90,7 @@ class ReducerBaseForwarder : public Next {
 // (Goto, Branch, Switch, CallAndCatchException), and takes care of updating
 // Block predecessors (and calls the Assembler to maintain split-edge form).
 // ReducerBase is always added by Assembler at the bottom of the reducer stack.
+// It also provides a default ShouldSkipOperation method that returns false.
 template <class Next>
 class ReducerBase : public ReducerBaseForwarder<Next> {
  public:

@@ -193,6 +195,15 @@ class ReducerBase : public ReducerBaseForwarder<Next> {
     Asm().AddPredecessor(saved_current_block, default_case, true);
     return new_opindex;
   }
+
+  template <class Op>
+  bool ShouldSkipOperation(const Op& op, OpIndex) {
+    if (op.saturated_use_count == 0 &&
+        !op.Properties().is_required_when_unused) {
+      return true;
+    }
+    return false;
+  }
 };

 template <class Assembler>
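For context on how this hooks in: Turboshaft reducers are template mixins, each parameterized over the `Next` reducer below it, so a reducer can refine `ShouldSkipOperation` and still defer down the stack to the `ReducerBase` default. The following is a minimal standalone sketch of that composition pattern; the types and the `op == 42` analysis result are made up for illustration, not V8 code:

#include <iostream>

// Minimal model of the mixin chain (not the real V8 types): each reducer
// inherits from the next one and may refine ShouldSkip, delegating upward.
struct Base {
  // Default policy at the bottom of the stack: never skip.
  bool ShouldSkip(int op) { return false; }
};

template <class Next>
struct EscapeAnalysis : Next {
  using Next::ShouldSkip;  // keep Next's overloads visible
  bool ShouldSkip(int op) {
    // Hypothetical analysis result: operation #42 was found removable.
    return op == 42 || Next::ShouldSkip(op);
  }
};

template <class Next>
struct Tracing : Next {
  bool ShouldSkip(int op) {
    bool skip = Next::ShouldSkip(op);
    std::cout << "op " << op << (skip ? " skipped\n" : " kept\n");
    return skip;
  }
};

int main() {
  Tracing<EscapeAnalysis<Base>> stack;
  stack.ShouldSkip(42);  // prints "op 42 skipped"
  stack.ShouldSkip(7);   // prints "op 7 kept"
}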
src/compiler/turboshaft/late-escape-analysis-reducer.cc (new file, 100 lines)
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/turboshaft/late-escape-analysis-reducer.h"

namespace v8::internal::compiler::turboshaft {

void LateEscapeAnalysisAnalyzer::Run() {
  CollectUsesAndAllocations();
  FindRemovableAllocations();
}

void LateEscapeAnalysisAnalyzer::RecordAllocateUse(OpIndex alloc, OpIndex use) {
  auto [it, new_entry] = alloc_uses_.try_emplace(alloc, phase_zone_);
  auto& uses = it->second;
  if (new_entry) {
    uses.reserve(graph_.Get(alloc).saturated_use_count);
  }
  uses.push_back(use);
}

// Collects the Allocate operations and their uses.
void LateEscapeAnalysisAnalyzer::CollectUsesAndAllocations() {
  for (auto& op : graph_.AllOperations()) {
    OpIndex op_index = graph_.Index(op);
    for (OpIndex input : op.inputs()) {
      if (graph_.Get(input).Is<AllocateOp>()) {
        RecordAllocateUse(input, op_index);
      }
    }
    if (op.Is<AllocateOp>()) {
      allocs_.push_back(op_index);
    }
  }
}

void LateEscapeAnalysisAnalyzer::FindRemovableAllocations() {
  while (!allocs_.empty()) {
    OpIndex current_alloc = allocs_.back();
    allocs_.pop_back();

    if (ShouldSkipOperation(current_alloc)) {
      // We are re-visiting an allocation that we've actually already removed.
      continue;
    }

    if (!AllocationIsEscaping(current_alloc)) {
      MarkToRemove(current_alloc);
    }
  }
}

bool LateEscapeAnalysisAnalyzer::AllocationIsEscaping(OpIndex alloc) {
  if (alloc_uses_.find(alloc) == alloc_uses_.end()) return false;
  for (OpIndex use : alloc_uses_.at(alloc)) {
    if (EscapesThroughUse(alloc, use)) return true;
  }
  // We haven't found any non-store use.
  return false;
}

// Returns true if {using_op_idx} is an operation that forces {alloc} to be
// emitted.
bool LateEscapeAnalysisAnalyzer::EscapesThroughUse(OpIndex alloc,
                                                   OpIndex using_op_idx) {
  if (ShouldSkipOperation(alloc)) {
    // {using_op_idx} is an Allocate itself, which has been removed.
    return false;
  }
  const Operation& op = graph_.Get(using_op_idx);
  if (const StoreOp* store_op = op.TryCast<StoreOp>()) {
    // A StoreOp only makes {alloc} escape if it uses {alloc} as the {value} or
    // the {index}. Put otherwise, a StoreOp makes {alloc} escape if it writes
    // {alloc}, but not if it writes **to** {alloc}.
    return store_op->value() == alloc;
  }
  return true;
}

void LateEscapeAnalysisAnalyzer::MarkToRemove(OpIndex alloc) {
  operations_to_skip_.insert(alloc);
  if (alloc_uses_.find(alloc) == alloc_uses_.end()) {
    return;
  }

  // The uses of {alloc} should also be skipped.
  for (OpIndex use : alloc_uses_.at(alloc)) {
    operations_to_skip_.insert(use);
    const StoreOp& store = graph_.Get(use).Cast<StoreOp>();
    if (graph_.Get(store.value()).Is<AllocateOp>()) {
      // This store was storing the result of an allocation. Because we now
      // removed this store, we might be able to remove the other allocation
      // as well.
      allocs_.push_back(store.value());
    }
  }
}

}  // namespace v8::internal::compiler::turboshaft
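To see why MarkToRemove pushes store.value() back onto allocs_: removing a non-escaping allocation also removes its initializing stores, and that can make the allocation whose value was being stored dead in turn. A minimal standalone model of that cascading worklist (toy ids and containers, not the V8 data structures):

#include <iostream>
#include <set>
#include <vector>

// Toy model of FindRemovableAllocations/MarkToRemove: integer operation ids,
// and stores recorded as (base, value) pairs. All shapes here are
// hypothetical simplifications.
struct Store { int id, base, value; };

int main() {
  // Graph: alloc 0 and alloc 1; store 2 writes alloc 1 into alloc 0.
  // Nothing else uses alloc 0, so it is removable; once store 2 goes away,
  // alloc 1 has no live uses either and is removed on a second visit.
  std::vector<int> allocs = {0, 1};
  std::vector<Store> stores = {{2, /*base=*/0, /*value=*/1}};
  std::set<int> skipped;

  std::vector<int> worklist = allocs;
  while (!worklist.empty()) {
    int alloc = worklist.back();
    worklist.pop_back();
    if (skipped.count(alloc)) continue;  // already removed on a prior visit

    // Escapes if some live (non-skipped) store uses it as the *value*.
    bool escapes = false;
    for (const Store& s : stores) {
      if (skipped.count(s.id)) continue;
      if (s.value == alloc) escapes = true;
    }
    if (escapes) continue;

    skipped.insert(alloc);
    // Remove the initializing stores; their stored values may become dead.
    for (const Store& s : stores) {
      if (s.base == alloc && !skipped.count(s.id)) {
        skipped.insert(s.id);
        worklist.push_back(s.value);  // re-examine the stored allocation
      }
    }
  }
  std::cout << "removed ops:";
  for (int op : skipped) std::cout << ' ' << op;  // prints: removed ops: 0 1 2
  std::cout << '\n';
}

Here alloc 1 escapes on its first visit (live store 2 uses it as a value), but once alloc 0 and store 2 are removed, revisiting alloc 1 finds no live use, so all three operations end up eliminated.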
src/compiler/turboshaft/late-escape-analysis-reducer.h (new file, 90 lines)
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_COMPILER_TURBOSHAFT_LATE_ESCAPE_ANALYSIS_REDUCER_H_
#define V8_COMPILER_TURBOSHAFT_LATE_ESCAPE_ANALYSIS_REDUCER_H_

#include "src/compiler/turboshaft/assembler.h"
#include "src/compiler/turboshaft/graph.h"
#include "src/compiler/turboshaft/utils.h"
#include "src/zone/zone-containers.h"
#include "src/zone/zone.h"

namespace v8::internal::compiler::turboshaft {

// LateEscapeAnalysis removes allocations that have no uses besides the stores
// initializing the object.

class LateEscapeAnalysisAnalyzer {
 public:
  LateEscapeAnalysisAnalyzer(const Graph& graph, Zone* zone)
      : graph_(graph),
        phase_zone_(zone),
        alloc_uses_(zone),
        allocs_(zone),
        operations_to_skip_(zone) {}

  void Run();

  bool ShouldSkipOperation(OpIndex index) {
    return operations_to_skip_.count(index) > 0;
  }

 private:
  void RecordAllocateUse(OpIndex alloc, OpIndex use);

  void CollectUsesAndAllocations();
  void FindRemovableAllocations();
  bool AllocationIsEscaping(OpIndex alloc);
  bool EscapesThroughUse(OpIndex alloc, OpIndex using_op_idx);
  void MarkToRemove(OpIndex alloc);

  const Graph& graph_;
  Zone* phase_zone_;

  // {alloc_uses_} records all the uses of each AllocateOp.
  ZoneUnorderedMap<OpIndex, ZoneVector<OpIndex>> alloc_uses_;
  // {allocs_} is filled with all of the AllocateOp of the graph, and then
  // iterated upon to determine which allocations can be removed and which
  // cannot.
  ZoneVector<OpIndex> allocs_;
  // {operations_to_skip_} contains all of the AllocateOp and StoreOp that can
  // be removed.
  ZoneUnorderedSet<OpIndex> operations_to_skip_;
};

template <class Next>
class LateEscapeAnalysisReducer : public Next {
 public:
  using Next::Asm;
  // We need the next line to not shadow Next's (and ReducerBase's in
  // particular) ShouldSkipOperation method.
  using Next::ShouldSkipOperation;

  template <class... Args>
  explicit LateEscapeAnalysisReducer(const std::tuple<Args...>& args)
      : Next(args), analyzer_(Asm().input_graph(), Asm().phase_zone()) {}

  void Analyze() {
    analyzer_.Run();
    Next::Analyze();
  }

  bool ShouldSkipOperation(const StoreOp& op, OpIndex old_idx) {
    return analyzer_.ShouldSkipOperation(old_idx) ||
           Next::ShouldSkipOperation(op, old_idx);
  }

  bool ShouldSkipOperation(const AllocateOp& op, OpIndex old_idx) {
    return analyzer_.ShouldSkipOperation(old_idx) ||
           Next::ShouldSkipOperation(op, old_idx);
  }

 private:
  LateEscapeAnalysisAnalyzer analyzer_;
};

}  // namespace v8::internal::compiler::turboshaft

#endif  // V8_COMPILER_TURBOSHAFT_LATE_ESCAPE_ANALYSIS_REDUCER_H_
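The pair of exact ShouldSkipOperation overloads above, together with `using Next::ShouldSkipOperation;`, means only StoreOp and AllocateOp consult the analyzer; every other operation kind falls through to the generic template further down the stack. A minimal sketch of that overload dispatch (toy types and a hypothetical analysis result, not V8 code):

#include <iostream>

struct StoreOp {};
struct AddOp {};

// Simplified model of the dispatch in LateEscapeAnalysisReducer: the exact
// non-template overload wins over the generic template pulled in from the
// base, so only the intercepted op kinds reach the analyzer.
struct Base {
  template <class Op>
  bool ShouldSkip(const Op&) {
    std::cout << "generic check\n";
    return false;
  }
};

struct EscapeReducer : Base {
  using Base::ShouldSkip;  // keep the generic template visible
  bool ShouldSkip(const StoreOp&) {
    std::cout << "analyzer check for StoreOp\n";
    return true;  // hypothetical: the analyzer marked this store removable
  }
};

int main() {
  EscapeReducer r;
  r.ShouldSkip(StoreOp{});  // exact overload: "analyzer check for StoreOp"
  r.ShouldSkip(AddOp{});    // falls through:  "generic check"
}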
src/compiler/turboshaft/memory-optimization.cc

@@ -7,7 +7,7 @@
 #include "src/codegen/interface-descriptors-inl.h"
 #include "src/compiler/linkage.h"

-namespace v8 ::internal::compiler::turboshaft {
+namespace v8::internal::compiler::turboshaft {

 const TSCallDescriptor* CreateAllocateBuiltinDescriptor(Zone* zone) {
   return TSCallDescriptor::Create(
@@ -19,146 +19,4 @@ const TSCallDescriptor* CreateAllocateBuiltinDescriptor(Zone* zone) {
       zone);
 }

-void MemoryAnalyzer::Run() {
-  block_states[current_block] = BlockState{};
-  BlockIndex end = BlockIndex(input_graph.block_count());
-  while (current_block < end) {
-    state = *block_states[current_block];
-    auto operations_range =
-        input_graph.operations(input_graph.Get(current_block));
-    // Set the next block index here already, to allow it to be changed if
-    // needed.
-    current_block = BlockIndex(current_block.id() + 1);
-    for (const Operation& op : operations_range) {
-      Process(op);
-    }
-  }
-}
-
-void MemoryAnalyzer::Process(const Operation& op) {
-  if (auto* alloc = op.TryCast<AllocateOp>()) {
-    ProcessAllocation(*alloc);
-    return;
-  }
-  if (auto* store = op.TryCast<StoreOp>()) {
-    ProcessStore(input_graph.Index(op), store->base());
-    return;
-  }
-  OpProperties properties = op.Properties();
-  if (properties.can_allocate) {
-    state = BlockState();
-  }
-  if (properties.is_block_terminator) {
-    ProcessBlockTerminator(op);
-  }
-}
-
-// Update the successor block states based on the state of the current block.
-// For loop backedges, we need to re-start the analysis from the loop header
-// unless the backedge state is unchanged.
-void MemoryAnalyzer::ProcessBlockTerminator(const Operation& op) {
-  if (auto* goto_op = op.TryCast<GotoOp>()) {
-    if (input_graph.IsLoopBackedge(*goto_op)) {
-      base::Optional<BlockState>& target_state =
-          block_states[goto_op->destination->index()];
-      BlockState old_state = *target_state;
-      MergeCurrentStateIntoSuccessor(goto_op->destination);
-      if (old_state != *target_state) {
-        // We can never fold allocations inside of the loop into an
-        // allocation before the loop, since this leads to unbounded
-        // allocation size. An unknown `reserved_size` will prevent adding
-        // allocations inside of the loop.
-        target_state->reserved_size = base::nullopt;
-        // Redo the analysis from the beginning of the loop.
-        current_block = goto_op->destination->index();
-      }
-      return;
-    } else if (goto_op->destination->IsLoop()) {
-      // Look ahead to detect allocating loops earlier, avoiding a wrong
-      // speculation resulting in processing the loop twice.
-      for (const Operation& op :
-           input_graph.operations(*goto_op->destination)) {
-        if (op.Properties().can_allocate) {
-          state = BlockState();
-          break;
-        }
-      }
-    }
-  }
-  for (Block* successor : SuccessorBlocks(op)) {
-    MergeCurrentStateIntoSuccessor(successor);
-  }
-}
-
-// We try to merge the new allocation into a previous dominating allocation.
-// We also allow folding allocations across blocks, as long as there is a
-// dominating relationship.
-void MemoryAnalyzer::ProcessAllocation(const AllocateOp& alloc) {
-  if (ShouldSkipOptimizationStep()) return;
-  base::Optional<uint64_t> new_size;
-  if (auto* size = input_graph.Get(alloc.size()).TryCast<ConstantOp>()) {
-    new_size = size->integral();
-  }
-  // If the new allocation has a static size and is of the same type, then we
-  // can fold it into the previous allocation unless the folded allocation would
-  // exceed `kMaxRegularHeapObjectSize`.
-  if (state.last_allocation && new_size.has_value() &&
-      state.reserved_size.has_value() &&
-      alloc.type == state.last_allocation->type &&
-      *new_size <= kMaxRegularHeapObjectSize - *state.reserved_size) {
-    state.reserved_size =
-        static_cast<uint32_t>(*state.reserved_size + *new_size);
-    folded_into[&alloc] = state.last_allocation;
-    uint32_t& max_reserved_size = reserved_size[state.last_allocation];
-    max_reserved_size = std::max(max_reserved_size, *state.reserved_size);
-    return;
-  }
-  state.last_allocation = &alloc;
-  state.reserved_size = base::nullopt;
-  if (new_size.has_value() && *new_size <= kMaxRegularHeapObjectSize) {
-    state.reserved_size = static_cast<uint32_t>(*new_size);
-  }
-  // We might be re-visiting the current block. In this case, we need to remove
-  // an allocation that can no longer be folded.
-  reserved_size.erase(&alloc);
-  folded_into.erase(&alloc);
-}
-
-void MemoryAnalyzer::ProcessStore(OpIndex store, OpIndex object) {
-  if (SkipWriteBarrier(input_graph.Get(object))) {
-    skipped_write_barriers.insert(store);
-  } else {
-    // We might be re-visiting the current block. In this case, we need to
-    // still update the information.
-    skipped_write_barriers.erase(store);
-  }
-}
-
-void MemoryAnalyzer::MergeCurrentStateIntoSuccessor(const Block* successor) {
-  base::Optional<BlockState>& target_state = block_states[successor->index()];
-  if (!target_state.has_value()) {
-    target_state = state;
-    return;
-  }
-  // All predecessors need to have the same last allocation for us to continue
-  // folding into it. This is only true when all the predecessors don't do any
-  // allocations and have the same ancestor that does an allocation (and there
-  // is no allocation on the path from the predecessors to their allocating
-  // common ancestor).
-  if (target_state->last_allocation != state.last_allocation) {
-    target_state = BlockState();
-    return;
-  }
-  // We take the maximum allocation size of all predecessors. If the size is
-  // unknown because it is dynamic, we remember the allocation to eliminate
-  // write barriers.
-  if (target_state->reserved_size.has_value() &&
-      state.reserved_size.has_value()) {
-    target_state->reserved_size =
-        std::max(*target_state->reserved_size, *state.reserved_size);
-  } else {
-    target_state->reserved_size = base::nullopt;
-  }
-}
-
 }  // namespace v8::internal::compiler::turboshaft
src/compiler/turboshaft/memory-optimization.h

@@ -11,7 +11,7 @@
 #include "src/compiler/turboshaft/assembler.h"
 #include "src/compiler/turboshaft/utils.h"

-namespace v8 ::internal::compiler::turboshaft {
+namespace v8::internal::compiler::turboshaft {

 const TSCallDescriptor* CreateAllocateBuiltinDescriptor(Zone* zone);
@@ -27,11 +27,16 @@ const TSCallDescriptor* CreateAllocateBuiltinDescriptor(Zone* zone);
 // to satisfy all subsequent allocations.
 // We can do write barrier elimination across loops if the loop does not contain
 // any potentially allocating operations.
+template <class Assembler>
 struct MemoryAnalyzer {
   Zone* phase_zone;
   const Graph& input_graph;
-  MemoryAnalyzer(Zone* phase_zone, const Graph& input_graph)
-      : phase_zone(phase_zone), input_graph(input_graph) {}
+  Assembler* assembler;
+  MemoryAnalyzer(Zone* phase_zone, const Graph& input_graph,
+                 Assembler* assembler)
+      : phase_zone(phase_zone),
+        input_graph(input_graph),
+        assembler(assembler) {}

   struct BlockState {
     const AllocateOp* last_allocation = nullptr;
@@ -66,12 +71,13 @@ struct MemoryAnalyzer {
   }

   bool IsFoldedAllocation(OpIndex op) {
-    return folded_into.count(input_graph.Get(op).TryCast<AllocateOp>());
+    return folded_into.count(
+        input_graph.Get(op).template TryCast<AllocateOp>());
   }

   base::Optional<uint32_t> ReservedSize(OpIndex alloc) {
-    if (auto it =
-            reserved_size.find(input_graph.Get(alloc).TryCast<AllocateOp>());
+    if (auto it = reserved_size.find(
+            input_graph.Get(alloc).template TryCast<AllocateOp>());
         it != reserved_size.end()) {
       return it->second;
     }
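The `.template TryCast<...>` spellings are needed because MemoryAnalyzer is now itself a class template: when a member template is invoked through an expression of dependent type, the `template` keyword tells the parser that the following `<` opens a template argument list rather than a less-than comparison. A self-contained illustration (toy types, not V8 code):

#include <iostream>

struct Node {
  template <class T>
  T* TryCastTo() { return nullptr; }  // toy member template (always null here)
};

struct ToyGraph {
  Node node;
  Node& Get() { return node; }
};

// Inside a class template, graph.Get() has a dependent type, so the compiler
// cannot know TryCastTo names a member template; 'template' disambiguates.
template <class G>
struct ToyAnalyzer {
  G& graph;
  bool IsNull() { return graph.Get().template TryCastTo<int>() == nullptr; }
};

int main() {
  ToyGraph g;
  ToyAnalyzer<ToyGraph> a{g};
  std::cout << std::boolalpha << a.IsNull() << '\n';  // prints: true
}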
@@ -79,6 +85,7 @@ struct MemoryAnalyzer {
   }

   void Run();
+
   void Process(const Operation& op);
   void ProcessBlockTerminator(const Operation& op);
   void ProcessAllocation(const AllocateOp& alloc);
@@ -103,7 +110,7 @@ class MemoryOptimizationReducer : public Next {
         isolate_(std::get<MemoryOptimizationReducerArgs>(args).isolate) {}

   void Analyze() {
-    analyzer_.emplace(Asm().phase_zone(), Asm().input_graph());
+    analyzer_.emplace(Asm().phase_zone(), Asm().input_graph(), &Asm());
     analyzer_->Run();
     Next::Analyze();
   }
@@ -248,7 +255,7 @@ class MemoryOptimizationReducer : public Next {
   }

  private:
-  base::Optional<MemoryAnalyzer> analyzer_;
+  base::Optional<MemoryAnalyzer<typename Next::AssemblerType>> analyzer_;
   Isolate* isolate_;
   const TSCallDescriptor* allocate_builtin_descriptor_ = nullptr;
@@ -261,6 +268,161 @@ class MemoryOptimizationReducer : public Next {
   }
 };

+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::Run() {
+  block_states[current_block] = BlockState{};
+  BlockIndex end = BlockIndex(input_graph.block_count());
+  while (current_block < end) {
+    state = *block_states[current_block];
+    auto operations_range =
+        input_graph.operations(input_graph.Get(current_block));
+    // Set the next block index here already, to allow it to be changed if
+    // needed.
+    current_block = BlockIndex(current_block.id() + 1);
+    for (const Operation& op : operations_range) {
+      Process(op);
+    }
+  }
+}
+
+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::Process(const Operation& op) {
+  if (assembler->ShouldSkipOperation(op, input_graph.Index(op))) {
+    return;
+  }
+
+  if (auto* alloc = op.TryCast<AllocateOp>()) {
+    ProcessAllocation(*alloc);
+    return;
+  }
+  if (auto* store = op.TryCast<StoreOp>()) {
+    ProcessStore(input_graph.Index(op), store->base());
+    return;
+  }
+  OpProperties properties = op.Properties();
+  if (properties.can_allocate) {
+    state = BlockState();
+  }
+  if (properties.is_block_terminator) {
+    ProcessBlockTerminator(op);
+  }
+}
+
+// Update the successor block states based on the state of the current block.
+// For loop backedges, we need to re-start the analysis from the loop header
+// unless the backedge state is unchanged.
+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::ProcessBlockTerminator(
+    const Operation& op) {
+  if (auto* goto_op = op.TryCast<GotoOp>()) {
+    if (input_graph.IsLoopBackedge(*goto_op)) {
+      base::Optional<BlockState>& target_state =
+          block_states[goto_op->destination->index()];
+      BlockState old_state = *target_state;
+      MergeCurrentStateIntoSuccessor(goto_op->destination);
+      if (old_state != *target_state) {
+        // We can never fold allocations inside of the loop into an
+        // allocation before the loop, since this leads to unbounded
+        // allocation size. An unknown `reserved_size` will prevent adding
+        // allocations inside of the loop.
+        target_state->reserved_size = base::nullopt;
+        // Redo the analysis from the beginning of the loop.
+        current_block = goto_op->destination->index();
+      }
+      return;
+    } else if (goto_op->destination->IsLoop()) {
+      // Look ahead to detect allocating loops earlier, avoiding a wrong
+      // speculation resulting in processing the loop twice.
+      for (const Operation& op :
+           input_graph.operations(*goto_op->destination)) {
+        if (op.Properties().can_allocate &&
+            !assembler->ShouldSkipOperation(op, input_graph.Index(op))) {
+          state = BlockState();
+          break;
+        }
+      }
+    }
+  }
+  for (Block* successor : SuccessorBlocks(op)) {
+    MergeCurrentStateIntoSuccessor(successor);
+  }
+}
+
+// We try to merge the new allocation into a previous dominating allocation.
+// We also allow folding allocations across blocks, as long as there is a
+// dominating relationship.
+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::ProcessAllocation(
+    const AllocateOp& alloc) {
+  if (ShouldSkipOptimizationStep()) return;
+  base::Optional<uint64_t> new_size;
+  if (auto* size =
+          input_graph.Get(alloc.size()).template TryCast<ConstantOp>()) {
+    new_size = size->integral();
+  }
+  // If the new allocation has a static size and is of the same type, then we
+  // can fold it into the previous allocation unless the folded allocation would
+  // exceed `kMaxRegularHeapObjectSize`.
+  if (state.last_allocation && new_size.has_value() &&
+      state.reserved_size.has_value() &&
+      alloc.type == state.last_allocation->type &&
+      *new_size <= kMaxRegularHeapObjectSize - *state.reserved_size) {
+    state.reserved_size =
+        static_cast<uint32_t>(*state.reserved_size + *new_size);
+    folded_into[&alloc] = state.last_allocation;
+    uint32_t& max_reserved_size = reserved_size[state.last_allocation];
+    max_reserved_size = std::max(max_reserved_size, *state.reserved_size);
+    return;
+  }
+  state.last_allocation = &alloc;
+  state.reserved_size = base::nullopt;
+  if (new_size.has_value() && *new_size <= kMaxRegularHeapObjectSize) {
+    state.reserved_size = static_cast<uint32_t>(*new_size);
+  }
+  // We might be re-visiting the current block. In this case, we need to remove
+  // an allocation that can no longer be folded.
+  reserved_size.erase(&alloc);
+  folded_into.erase(&alloc);
+}
+
+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::ProcessStore(OpIndex store,
+                                                    OpIndex object) {
+  if (SkipWriteBarrier(input_graph.Get(object))) {
+    skipped_write_barriers.insert(store);
+  } else {
+    // We might be re-visiting the current block. In this case, we need to
+    // still update the information.
+    skipped_write_barriers.erase(store);
+  }
+}
+
+template <class Assembler>
+inline void MemoryAnalyzer<Assembler>::MergeCurrentStateIntoSuccessor(
+    const Block* successor) {
+  base::Optional<BlockState>& target_state = block_states[successor->index()];
+  if (!target_state.has_value()) {
+    target_state = state;
+    return;
+  }
+  // All predecessors need to have the same last allocation for us to continue
+  // folding into it.
+  if (target_state->last_allocation != state.last_allocation) {
+    target_state = BlockState();
+    return;
+  }
+  // We take the maximum allocation size of all predecessors. If the size is
+  // unknown because it is dynamic, we remember the allocation to eliminate
+  // write barriers.
+  if (target_state->reserved_size.has_value() &&
+      state.reserved_size.has_value()) {
+    target_state->reserved_size =
+        std::max(*target_state->reserved_size, *state.reserved_size);
+  } else {
+    target_state->reserved_size = base::nullopt;
+  }
+}
+
 }  // namespace v8::internal::compiler::turboshaft

 #endif  // V8_COMPILER_TURBOSHAFT_MEMORY_OPTIMIZATION_H_
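As a concrete walkthrough of the folding arithmetic in ProcessAllocation: a second allocation with a static size is folded into the dominating one as long as the combined reservation stays within kMaxRegularHeapObjectSize. A simplified standalone sketch; the constant's value below is an illustrative stand-in, and the same-type and dominance checks are omitted:

#include <cstdint>
#include <iostream>
#include <optional>

// Toy walkthrough of the reserved-size bookkeeping. The value of
// kMaxRegularHeapObjectSize here is an illustrative stand-in; the real
// constant lives in V8's heap layout headers.
constexpr uint32_t kMaxRegularHeapObjectSize = 131072;

struct BlockState {
  bool has_last_allocation = false;
  std::optional<uint32_t> reserved_size;
};

// Returns true if an allocation of {new_size} bytes can be folded into the
// dominating allocation given the current {state}.
bool TryFold(BlockState& state, uint64_t new_size) {
  if (state.has_last_allocation && state.reserved_size.has_value() &&
      new_size <= kMaxRegularHeapObjectSize - *state.reserved_size) {
    *state.reserved_size += static_cast<uint32_t>(new_size);
    return true;  // folded: the first allocation reserves the extra bytes
  }
  state.has_last_allocation = true;
  state.reserved_size =
      new_size <= kMaxRegularHeapObjectSize
          ? std::optional<uint32_t>(static_cast<uint32_t>(new_size))
          : std::nullopt;  // dynamic/huge size: folding stops here
  return false;  // a fresh allocation group starts at this operation
}

int main() {
  BlockState state;
  TryFold(state, 32);                // first allocation: reserves 32 bytes
  bool folded = TryFold(state, 48);  // folded: reservation grows to 80
  std::cout << folded << ' ' << *state.reserved_size << '\n';  // prints: 1 80
}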
src/compiler/turboshaft/optimization-phase.h

@@ -312,15 +312,14 @@ class GraphVisitor {
         assembler().output_graph().next_operation_index();
     USE(first_output_index);
     const Operation& op = input_graph().Get(index);
-    if (op.saturated_use_count == 0 &&
-        !op.Properties().is_required_when_unused) {
-      if constexpr (trace_reduction) TraceOperationUnused();
-      return true;
-    }
     if constexpr (trace_reduction) TraceReductionStart(index);
     OpIndex new_index;
     if (input_block->IsLoop() && op.Is<PhiOp>()) {
       const PhiOp& phi = op.Cast<PhiOp>();
+      if (assembler().ShouldSkipOperation(phi, index)) {
+        if constexpr (trace_reduction) TraceOperationSkipped();
+        return true;
+      }
       new_index = assembler().PendingLoopPhi(MapToNewGraph(phi.inputs()[0]),
                                              phi.rep, phi.inputs()[1]);
       CreateOldToNewMapping(index, new_index);

@@ -329,12 +328,16 @@ class GraphVisitor {
     }
   } else {
     switch (op.opcode) {
-#define EMIT_INSTR_CASE(Name)                                    \
-  case Opcode::k##Name:                                          \
-    new_index = this->Visit##Name(op.Cast<Name##Op>());          \
-    if (CanBeUsedAsInput(op.Cast<Name##Op>())) {                 \
-      CreateOldToNewMapping(index, new_index);                   \
-    }                                                            \
+#define EMIT_INSTR_CASE(Name)                                           \
+  case Opcode::k##Name:                                                 \
+    if (assembler().ShouldSkipOperation(op.Cast<Name##Op>(), index)) {  \
+      if constexpr (trace_reduction) TraceOperationSkipped();           \
+      return true;                                                      \
+    }                                                                   \
+    new_index = this->Visit##Name(op.Cast<Name##Op>());                 \
+    if (CanBeUsedAsInput(op.Cast<Name##Op>())) {                        \
+      CreateOldToNewMapping(index, new_index);                          \
+    }                                                                   \
     break;
       TURBOSHAFT_OPERATION_LIST(EMIT_INSTR_CASE)
 #undef EMIT_INSTR_CASE

@@ -351,7 +354,7 @@ class GraphVisitor {
         << PaddingSpace{5 - CountDecimalDigits(index.id())}
         << OperationPrintStyle{input_graph().Get(index), "#o"} << "\n";
   }
-  void TraceOperationUnused() { std::cout << "╰─> unused\n\n"; }
+  void TraceOperationSkipped() { std::cout << "╰─> skipped\n\n"; }
   void TraceBlockUnreachable() { std::cout << "╰─> unreachable\n\n"; }
   void TraceReductionResult(Block* current_block, OpIndex first_output_index,
                             OpIndex new_index) {