[turboshaft] add basic optimization phase: liveness analysis
Bug: v8:12783
Change-Id: I15cf16bd66a97c33170ca4f1f5e3acc6ff9bf956
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3576129
Auto-Submit: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80618}
This commit is contained in:
parent de877f7497 · commit ecc0bc8f35
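In outline, the new phase is a classic dead-code elimination: a backward liveness pass marks every operation whose value or effect is needed, and a rebuild pass copies only the marked operations into a fresh graph. A minimal sketch of that shape (simplified, illustrative types and names, not the actual V8 classes):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical, simplified operation record: real Turboshaft operations
// carry an opcode, properties, and a variable-length input array.
struct Op {
  bool required_when_unused;     // e.g. stores, calls, control flow
  std::vector<uint32_t> inputs;  // indices of producing operations
};

// Backward pass: an op is live if it must run for its effects or if a
// live op consumes its value. Ops are assumed to be in topological order
// (inputs come before uses), so one reverse sweep suffices for acyclic
// graphs; loops need revisiting, as the LivenessAnalyzer below does.
std::vector<uint8_t> ComputeLiveness(const std::vector<Op>& graph) {
  std::vector<uint8_t> live(graph.size(), 0);
  for (size_t i = graph.size(); i-- > 0;) {
    if (graph[i].required_when_unused) live[i] = 1;
    if (!live[i]) continue;
    for (uint32_t input : graph[i].inputs) live[input] = 1;
  }
  return live;
}
```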
BUILD.bazel
@@ -2828,6 +2828,8 @@ filegroup(
         "src/compiler/turboshaft/graph.h",
         "src/compiler/turboshaft/operations.cc",
         "src/compiler/turboshaft/operations.h",
+        "src/compiler/turboshaft/optimization-phase.cc",
+        "src/compiler/turboshaft/optimization-phase.h",
         "src/compiler/turboshaft/recreate-schedule.cc",
         "src/compiler/turboshaft/recreate-schedule.h",
         "src/compiler/type-cache.cc",
BUILD.gn (2 lines changed)
@@ -2905,6 +2905,7 @@ v8_header_set("v8_internal_headers") {
         "src/compiler/turboshaft/graph-builder.h",
         "src/compiler/turboshaft/graph.h",
         "src/compiler/turboshaft/operations.h",
+        "src/compiler/turboshaft/optimization-phase.h",
         "src/compiler/turboshaft/recreate-schedule.h",
         "src/compiler/type-cache.h",
         "src/compiler/type-narrowing-reducer.h",
@@ -4106,6 +4107,7 @@ v8_source_set("v8_turboshaft") {
         "src/compiler/turboshaft/graph-builder.cc",
         "src/compiler/turboshaft/graph.cc",
         "src/compiler/turboshaft/operations.cc",
+        "src/compiler/turboshaft/optimization-phase.cc",
         "src/compiler/turboshaft/recreate-schedule.cc",
       ]
src/compiler/pipeline.cc
@@ -78,6 +78,7 @@
 #include "src/compiler/turboshaft/assembler.h"
 #include "src/compiler/turboshaft/graph-builder.h"
 #include "src/compiler/turboshaft/graph.h"
+#include "src/compiler/turboshaft/optimization-phase.h"
 #include "src/compiler/turboshaft/recreate-schedule.h"
 #include "src/compiler/type-narrowing-reducer.h"
 #include "src/compiler/typed-optimization.h"
@@ -2011,7 +2012,7 @@ struct BranchConditionDuplicationPhase {
 };
 
 struct BuildTurboshaftPhase {
-  DECL_PIPELINE_PHASE_CONSTANTS(BuildTurboShaft)
+  DECL_PIPELINE_PHASE_CONSTANTS(BuildTurboshaft)
 
   void Run(PipelineData* data, Zone* temp_zone) {
     turboshaft::BuildGraph(data->schedule(), data->graph_zone(), temp_zone,
@@ -2020,6 +2021,16 @@ struct BuildTurboshaftPhase {
   }
 };
 
+struct OptimizeTurboshaftPhase {
+  DECL_PIPELINE_PHASE_CONSTANTS(OptimizeTurboshaft)
+
+  void Run(PipelineData* data, Zone* temp_zone) {
+    turboshaft::OptimizationPhase<
+        turboshaft::LivenessAnalyzer,
+        turboshaft::Assembler>::Run(&data->turboshaft_graph(), temp_zone);
+  }
+};
+
 struct TurboshaftRecreateSchedulePhase {
   DECL_PIPELINE_PHASE_CONSTANTS(TurboshaftRecreateSchedule)
 
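The phase is composed at compile time: the analyzer and assembler are template parameters rather than virtual interfaces, so the per-operation loop can inline the analyzer's `OpIsUsed`. Any type exposing the small interface of `AnalyzerBase` (constructor from graph and zone, `Run`, `OpIsUsed`) should plug in. A hypothetical analyzer, for illustration only (the name is invented; `AnalyzerBase` itself already behaves like this):

```cpp
// Hypothetical analyzer that keeps every operation alive; it satisfies
// the interface OptimizationPhase expects from its Analyzer parameter.
struct KeepEverythingAnalyzer : turboshaft::AnalyzerBase {
  using AnalyzerBase::AnalyzerBase;
  void Run() {}  // nothing to compute
  bool OpIsUsed(turboshaft::OpIndex) const { return true; }
};

// Usage, mirroring OptimizeTurboshaftPhase::Run:
// turboshaft::OptimizationPhase<KeepEverythingAnalyzer,
//                               turboshaft::Assembler>::Run(graph, zone);
```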
@@ -2860,10 +2871,12 @@ bool PipelineImpl::OptimizeGraph(Linkage* linkage) {
     AllowHandleDereference allow_deref;
     CodeTracer::StreamScope tracing_scope(data->GetCodeTracer());
     tracing_scope.stream()
-        << "\n-- TurboShaft Graph ----------------------------\n"
+        << "\n-- Turboshaft Graph ----------------------------\n"
         << data->turboshaft_graph();
   }
 
+  Run<OptimizeTurboshaftPhase>();
+
   Run<TurboshaftRecreateSchedulePhase>(linkage);
   if (data->info()->trace_turbo_graph() || FLAG_trace_turbo_scheduler) {
     UnparkedScopeIfNeeded scope(data->broker());
src/compiler/turboshaft/graph-builder.cc
@@ -170,7 +170,8 @@ void GraphBuilder::Run() {
       DCHECK_EQ(block->SuccessorCount(), 1);
       Block* destination = Map(block->SuccessorAt(0));
       assembler.Goto(destination);
-      if (destination->IsLoop()) {
+      if (destination->IsBound()) {
+        DCHECK(destination->IsLoop());
         FixLoopPhis(destination, target_block);
       }
       break;
src/compiler/turboshaft/graph.h
@@ -22,7 +22,7 @@ namespace v8::internal::compiler::turboshaft {
 class Assembler;
 class VarAssembler;
 
-// `OperationBuffer` is a growable, Zone-allocated buffer to store TurboShaft
+// `OperationBuffer` is a growable, Zone-allocated buffer to store Turboshaft
 // operations. It is part of a `Graph`.
 // The buffer can be seen as an array of 8-byte `OperationStorageSlot` values.
 // The structure is append-only, that is, we only add operations at the end.
@@ -220,8 +220,19 @@ class Block {
     return result;
   }
 
   Block* LastPredecessor() const { return last_predecessor_; }
   Block* NeighboringPredecessor() const { return neighboring_predecessor_; }
   bool HasPredecessors() const { return last_predecessor_ != nullptr; }
 
+  // The block from the previous graph which produced the current block. This is
+  // used for translating phi nodes from the previous graph.
+  void SetOrigin(const Block* origin) {
+    DCHECK_NULL(origin_);
+    DCHECK_NE(origin->graph_, graph_);
+    origin_ = origin;
+  }
+  const Block* Origin() const { return origin_; }
+
   OpIndex begin() const {
     DCHECK(begin_.valid());
     return begin_;
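`Origin()` is what lets the rebuild pass match up predecessor lists when some predecessors were optimized away: `ReducePhi` in the new optimization-phase.h walks the old and new predecessor chains in lockstep and keeps a phi input only when `new_pred->Origin() == old_pred`. A toy rendering of that matching (hypothetical data, forward order instead of the reversed linked lists the real code walks):

```cpp
#include <vector>

// Toy version of the predecessor matching in ReducePhi: each new
// predecessor remembers which old predecessor it came from; inputs whose
// old predecessor has no surviving counterpart are dropped.
std::vector<int> FilterPhiInputs(const std::vector<int>& old_inputs,
                                 const std::vector<int>& old_preds,
                                 const std::vector<int>& new_pred_origins) {
  std::vector<int> result;
  size_t n = 0;  // cursor into new_pred_origins
  for (size_t o = 0; o < old_preds.size(); ++o) {
    if (n < new_pred_origins.size() && new_pred_origins[n] == old_preds[o]) {
      result.push_back(old_inputs[o]);  // predecessor survived: keep input
      ++n;
    }
  }
  return result;
}
```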
@@ -243,6 +254,7 @@ class Block {
   BlockIndex index_ = BlockIndex::Invalid();
   Block* last_predecessor_ = nullptr;
   Block* neighboring_predecessor_ = nullptr;
+  const Block* origin_ = nullptr;
 #ifdef DEBUG
   Graph* graph_ = nullptr;
 #endif
@@ -342,7 +354,7 @@ class Graph {
     return result;
   }
 
-  bool Add(Block* block) {
+  V8_INLINE bool Add(Block* block) {
     DCHECK_EQ(block->graph_, this);
     if (!bound_blocks_.empty() && !block->HasPredecessors()) return false;
     bool deferred = true;
@@ -435,12 +447,16 @@ class Graph {
 
   base::iterator_range<ConstOperationIterator> operations(OpIndex begin,
                                                           OpIndex end) const {
+    DCHECK(begin.valid());
+    DCHECK(end.valid());
     return {ConstOperationIterator(begin, this),
             ConstOperationIterator(end, this)};
   }
 
   base::iterator_range<MutableOperationIterator> operations(OpIndex begin,
                                                             OpIndex end) {
+    DCHECK(begin.valid());
+    DCHECK(end.valid());
     return {MutableOperationIterator(begin, this),
             MutableOperationIterator(end, this)};
   }
src/compiler/turboshaft/operations.h
@@ -220,7 +220,7 @@ struct OpProperties {
   }
 };
 
-// Baseclass for all TurboShaft operations.
+// Baseclass for all Turboshaft operations.
 // The `alignas(OpIndex)` is necessary because it is followed by an array of
 // `OpIndex` inputs.
 struct alignas(OpIndex) Operation {
@@ -308,24 +308,25 @@ struct OperationT : Operation {
 
   static constexpr OpProperties properties() { return Derived::properties; }
 
+  Derived& derived_this() { return *static_cast<Derived*>(this); }
+  const Derived& derived_this() const {
+    return *static_cast<const Derived*>(this);
+  }
+
   // Shadow Operation::inputs to exploit static knowledge about object size.
   base::Vector<OpIndex> inputs() {
     return {reinterpret_cast<OpIndex*>(reinterpret_cast<char*>(this) +
                                        sizeof(Derived)),
-            input_count};
+            derived_this().input_count};
   }
   base::Vector<const OpIndex> inputs() const {
     return {reinterpret_cast<const OpIndex*>(
                 reinterpret_cast<const char*>(this) + sizeof(Derived)),
-            input_count};
+            derived_this().input_count};
   }
 
-  V8_INLINE OpIndex& input(size_t i) {
-    return static_cast<Derived*>(this)->inputs()[i];
-  }
-  V8_INLINE OpIndex input(size_t i) const {
-    return static_cast<const Derived*>(this)->inputs()[i];
-  }
+  V8_INLINE OpIndex& input(size_t i) { return derived_this().inputs()[i]; }
+  V8_INLINE OpIndex input(size_t i) const { return derived_this().inputs()[i]; }
 
   static size_t StorageSlotCount(size_t input_count) {
     // The operation size in bytes is:
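`derived_this()` is the usual CRTP downcast; routing `input_count` through it matters because `FixedArityOperationT` (below) shadows the field with a `static constexpr` member, and name lookup through the derived type picks up the compile-time constant. A standalone sketch of the pattern, with toy names rather than the V8 types:

```cpp
#include <cstdint>
#include <cstdio>

template <class Derived>
struct OperationBase {
  uint16_t input_count;  // runtime value, set for variable-arity ops

  Derived& derived_this() { return *static_cast<Derived*>(this); }

  // Looks up input_count through the derived type: if Derived shadows it
  // with a static constexpr member, the constant is used and the compiler
  // can fold away the load.
  uint16_t EffectiveInputCount() { return derived_this().input_count; }
};

struct AddOp : OperationBase<AddOp> {
  static constexpr uint16_t input_count = 2;  // shadows the base field
};

int main() {
  AddOp add;
  std::printf("%d\n", add.EffectiveInputCount());  // prints 2
}
```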
@@ -373,8 +374,8 @@ struct OperationT : Operation {
 
   bool operator==(const Derived& other) const {
     const Derived& derived = *static_cast<const Derived*>(this);
-    if (derived.inputs() != other.inputs()) return false;
-    return derived.options() == other.options();
+    return derived.inputs() == other.inputs() &&
+           derived.options() == other.options();
   }
   size_t hash_value() const {
     const Derived& derived = *static_cast<const Derived*>(this);
@@ -382,7 +383,7 @@ struct OperationT : Operation {
   }
 
   void PrintOptions(std::ostream& os) const {
-    const auto& options = static_cast<const Derived*>(this)->options();
+    const auto& options = derived_this().options();
     constexpr size_t options_count =
         std::tuple_size<std::remove_reference_t<decltype(options)>>::value;
     if (options_count == 0) {
@@ -411,19 +412,8 @@ struct FixedArityOperationT : OperationT<Derived> {
   // Enable concise base access in derived struct.
   using Base = FixedArityOperationT;
 
-  // Shadow OperationT<Derived>::inputs to exploit static knowledge about input
-  // count.
+  // Shadow Operation::input_count to exploit static knowledge.
   static constexpr uint16_t input_count = InputCount;
-  base::Vector<OpIndex> inputs() {
-    return {reinterpret_cast<OpIndex*>(reinterpret_cast<char*>(this) +
-                                       sizeof(Derived)),
-            InputCount};
-  }
-  base::Vector<const OpIndex> inputs() const {
-    return {reinterpret_cast<const OpIndex*>(
-                reinterpret_cast<const char*>(this) + sizeof(Derived)),
-            InputCount};
-  }
 
   template <class... Args>
   explicit FixedArityOperationT(Args... args)
@@ -434,12 +424,6 @@ struct FixedArityOperationT : OperationT<Derived> {
     ((inputs[i++] = args), ...);
   }
 
-  bool operator==(const Derived& other) const {
-    return std::equal(inputs().begin(), inputs().end(),
-                      other.inputs().begin()) &&
-           static_cast<const Derived*>(this)->options() == other.options();
-  }
-
   // Redefine the input initialization to tell C++ about the static input size.
   template <class... Args>
   static Derived& New(Graph* graph, Args... args) {
@@ -695,6 +679,8 @@ struct PhiOp : OperationT<PhiOp> {
 
   static constexpr OpProperties properties = OpProperties::Pure();
 
+  static constexpr size_t kLoopPhiBackEdgeIndex = 1;
+
   explicit PhiOp(base::Vector<const OpIndex> inputs, MachineRepresentation rep)
       : Base(inputs), rep(rep) {}
   auto options() const { return std::tuple{rep}; }
@@ -705,7 +691,7 @@ struct PhiOp : OperationT<PhiOp> {
 struct PendingLoopPhiOp : FixedArityOperationT<1, PendingLoopPhiOp> {
   MachineRepresentation rep;
   union {
-    // Used when transforming a TurboShaft graph.
+    // Used when transforming a Turboshaft graph.
     // This is not an input because it refers to the old graph.
     OpIndex old_backedge_index = OpIndex::Invalid();
     // Used when translating from sea-of-nodes.
@@ -896,6 +882,8 @@ struct ConstantOp : FixedArityOperationT<0, ConstantOp> {
     }
   }
 
+  auto options() const { return std::tuple{kind, storage}; }
+
   void PrintOptions(std::ostream& os) const;
   size_t hash_value() const {
     switch (kind) {
@@ -985,7 +973,7 @@ struct IndexedLoadOp : FixedArityOperationT<2, IndexedLoadOp> {
         offset(offset) {}
   void PrintOptions(std::ostream& os) const;
   auto options() const {
-    return std::tuple{kind, loaded_rep, element_size_log2, offset};
+    return std::tuple{kind, loaded_rep, offset, element_size_log2};
   }
 };
 
@@ -1047,8 +1035,8 @@ struct IndexedStoreOp : FixedArityOperationT<3, IndexedStoreOp> {
         offset(offset) {}
   void PrintOptions(std::ostream& os) const;
   auto options() const {
-    return std::tuple{kind, stored_rep, write_barrier, element_size_log2,
-                      offset};
+    return std::tuple{kind, stored_rep, write_barrier, offset,
+                      element_size_log2};
   }
 };
 
src/compiler/turboshaft/optimization-phase.cc (new file, 26 lines)
@@ -0,0 +1,26 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/turboshaft/optimization-phase.h"

namespace v8::internal::compiler::turboshaft {

int CountDecimalDigits(uint32_t value) {
  int result = 1;
  while (value > 9) {
    result++;
    value = value / 10;
  }
  return result;
}

std::ostream& operator<<(std::ostream& os, PaddingSpace padding) {
  if (padding.spaces > 10000) return os;
  for (int i = 0; i < padding.spaces; ++i) {
    os << ' ';
  }
  return os;
}

}  // namespace v8::internal::compiler::turboshaft
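These two helpers exist only for the tracing output in the header below: operation ids are padded to a five-column field so the printed operations line up. A quick check of the arithmetic, as a standalone snippet outside V8:

```cpp
#include <cstdint>
#include <cstdio>

// Same digit-count logic as CountDecimalDigits above.
static int Digits(uint32_t v) {
  int n = 1;
  while (v > 9) { ++n; v /= 10; }
  return n;
}

int main() {
  // PaddingSpace{5 - Digits(id)} pads "o<id>:" to a fixed width:
  // id 7 gets 4 spaces of padding; id 12345 gets none.
  std::printf("%d %d\n", 5 - Digits(7), 5 - Digits(12345));  // 4 0
}
```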
src/compiler/turboshaft/optimization-phase.h (new file, 400 lines)
@@ -0,0 +1,400 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_COMPILER_TURBOSHAFT_OPTIMIZATION_PHASE_H_
#define V8_COMPILER_TURBOSHAFT_OPTIMIZATION_PHASE_H_

#include <utility>

#include "src/base/iterator.h"
#include "src/base/logging.h"
#include "src/base/small-vector.h"
#include "src/base/vector.h"
#include "src/compiler/turboshaft/graph.h"
#include "src/compiler/turboshaft/operations.h"

namespace v8::internal::compiler::turboshaft {

int CountDecimalDigits(uint32_t value);
struct PaddingSpace {
  int spaces;
};
std::ostream& operator<<(std::ostream& os, PaddingSpace padding);

struct AnalyzerBase {
  Zone* phase_zone;
  const Graph& graph;

  void Run() {}
  bool OpIsUsed(OpIndex i) const { return true; }

  explicit AnalyzerBase(const Graph& graph, Zone* phase_zone)
      : phase_zone(phase_zone), graph(graph) {}
};
struct LivenessAnalyzer : AnalyzerBase {
  using Base = AnalyzerBase;
  // Using `uint8_t` instead of `bool` prevents `std::vector` from using a
  // bitvector, which has worse performance.
  std::vector<uint8_t> op_used;

  LivenessAnalyzer(const Graph& graph, Zone* phase_zone)
      : AnalyzerBase(graph, phase_zone), op_used(graph.op_id_count(), false) {}

  bool OpIsUsed(OpIndex i) { return op_used[i.id()]; }

  void Run() {
    for (uint32_t unprocessed_count = graph.block_count();
         unprocessed_count > 0;) {
      BlockIndex block_index = static_cast<BlockIndex>(unprocessed_count - 1);
      --unprocessed_count;
      const Block& block = graph.Get(block_index);
      if (V8_UNLIKELY(block.IsLoop())) {
        ProcessBlock<true>(block, &unprocessed_count);
      } else {
        ProcessBlock<false>(block, &unprocessed_count);
      }
    }
  }

  template <bool is_loop>
  void ProcessBlock(const Block& block, uint32_t* unprocessed_count) {
    auto op_range = graph.operations(block);
    for (auto it = op_range.end(); it != op_range.begin();) {
      --it;
      OpIndex index = it.Index();
      const Operation& op = *it;
      if (op.properties().is_required_when_unused) {
        op_used[index.id()] = true;
      } else if (!OpIsUsed(index)) {
        continue;
      }
      if constexpr (is_loop) {
        if (op.Is<PhiOp>()) {
          const PhiOp& phi = op.Cast<PhiOp>();
          // Mark the loop backedge as used. Trigger a revisit if it wasn't
          // marked as used already.
          if (!OpIsUsed(phi.inputs()[PhiOp::kLoopPhiBackEdgeIndex])) {
            Block* backedge = block.LastPredecessor();
            // Revisit the loop by increasing the `unprocessed_count` to
            // include all blocks of the loop.
            *unprocessed_count =
                std::max(*unprocessed_count, backedge->index().id() + 1);
          }
        }
      }
      for (OpIndex input : op.inputs()) {
        op_used[input.id()] = true;
      }
    }
  }
};

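The reverse sweep visits blocks from the highest index down, which covers acyclic control flow in one pass because block indices are topologically ordered. Loops are the exception: a loop-header phi's backedge input only becomes known to be live after the loop body has been scanned, so when that happens `unprocessed_count` is bumped back up past the backedge block and the whole loop is scanned again. A toy model of just that revisit logic, with invented names and a hard-coded four-block CFG:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy model of LivenessAnalyzer::Run's revisit: blocks are numbered in
// topological order; a loop header re-queues its loop when its phi's
// backedge input was not yet marked live. Names are illustrative only.
struct ToyBlock {
  bool is_loop_header = false;
  uint32_t backedge_block = 0;  // highest block of the loop, for headers
};

int main() {
  // B0 -> B1 (loop header with a phi) -> B2 (body, backedge to B1) -> B3
  std::vector<ToyBlock> blocks(4);
  blocks[1] = {true, 2};

  bool backedge_input_marked = false;  // the op_used[] bit for the input
  int scans = 0;
  for (uint32_t unprocessed = blocks.size(); unprocessed > 0;) {
    uint32_t b = --unprocessed;
    ++scans;
    if (blocks[b].is_loop_header && !backedge_input_marked) {
      // The phi is live but its backedge input was never marked: mark it
      // and re-queue every block of the loop, exactly as Run() does via
      // *unprocessed_count = max(*unprocessed_count, backedge id + 1).
      backedge_input_marked = true;
      unprocessed = std::max(unprocessed, blocks[b].backedge_block + 1);
    }
  }
  std::printf("blocks scanned: %d\n", scans);  // 6: B3 B2 B1 B2 B1 B0
}
```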
template <class Analyzer, class Assembler>
class OptimizationPhase {
 private:
  struct Impl;

 public:
  static void Run(Graph* input, Zone* phase_zone) {
    Impl phase{*input, phase_zone};
    if (FLAG_turboshaft_trace_reduction) {
      phase.template Run<true>();
    } else {
      phase.template Run<false>();
    }
  }
  static void RunWithoutTracing(Graph* input, Zone* phase_zone) {
    Impl phase{*input, phase_zone};
    phase.template Run<false>();
  }
};

template <class Analyzer, class Assembler>
struct OptimizationPhase<Analyzer, Assembler>::Impl {
  Graph& input_graph;
  Zone* phase_zone;

  Analyzer analyzer{input_graph, phase_zone};
  Assembler assembler{&input_graph.GetOrCreateCompanion(), phase_zone};
  const Block* current_input_block = nullptr;
  // Mappings from the old graph to the new graph.
  std::vector<Block*> block_mapping{input_graph.block_count(), nullptr};
  std::vector<OpIndex> op_mapping{input_graph.op_id_count(),
                                  OpIndex::Invalid()};

  // `trace_reduction` is a template parameter to avoid paying for tracing at
  // runtime.
  template <bool trace_reduction>
  void Run() {
    analyzer.Run();

    for (const Block& input_block : input_graph.blocks()) {
      block_mapping[input_block.index().id()] =
          assembler.NewBlock(input_block.kind());
    }
    for (const Block& input_block : input_graph.blocks()) {
      current_input_block = &input_block;
      if constexpr (trace_reduction) {
        std::cout << PrintAsBlockHeader{input_block} << "\n";
      }
      if (!assembler.Bind(MapToNewGraph(input_block.index()))) {
        if constexpr (trace_reduction) TraceBlockUnreachable();
        continue;
      }
      assembler.current_block()->SetDeferred(input_block.IsDeferred());
      auto op_range = input_graph.operations(input_block);
      for (auto it = op_range.begin(); it != op_range.end(); ++it) {
        const Operation& op = *it;
        OpIndex index = it.Index();
        OpIndex first_output_index = assembler.graph().next_operation_index();
        if constexpr (trace_reduction) TraceReductionStart(index);
        if (!analyzer.OpIsUsed(index)) {
          if constexpr (trace_reduction) TraceOperationUnused();
          continue;
        }
        OpIndex new_index;
        if (input_block.IsLoop() && op.Is<PhiOp>()) {
          const PhiOp& phi = op.Cast<PhiOp>();
          new_index = assembler.PendingLoopPhi(MapToNewGraph(phi.inputs()[0]),
                                               phi.rep, phi.inputs()[1]);
          if constexpr (trace_reduction) {
            TraceReductionResult(first_output_index, new_index);
          }
        } else {
          switch (op.opcode) {
#define EMIT_INSTR_CASE(Name)                             \
  case Opcode::k##Name:                                   \
    new_index = this->Reduce##Name(op.Cast<Name##Op>());  \
    break;
            TURBOSHAFT_OPERATION_LIST(EMIT_INSTR_CASE)
#undef EMIT_INSTR_CASE
          }
          if constexpr (trace_reduction) {
            TraceReductionResult(first_output_index, new_index);
          }
        }
        op_mapping[index.id()] = new_index;
      }
      if constexpr (trace_reduction) TraceBlockFinished();
    }
    input_graph.SwapWithCompanion();
  }

  void TraceReductionStart(OpIndex index) {
    std::cout << "╭── o" << index.id() << ": "
              << PaddingSpace{5 - CountDecimalDigits(index.id())}
              << OperationPrintStyle{input_graph.Get(index), "#o"} << "\n";
  }
  void TraceOperationUnused() { std::cout << "╰─> unused\n\n"; }
  void TraceBlockUnreachable() { std::cout << "╰─> unreachable\n\n"; }
  void TraceReductionResult(OpIndex first_output_index, OpIndex new_index) {
    if (new_index < first_output_index) {
      // The operation was replaced with an already existing one.
      std::cout << "╰─> #n" << new_index.id() << "\n";
    }
    bool before_arrow = new_index >= first_output_index;
    for (const Operation& op : assembler.graph().operations(
             first_output_index, assembler.graph().next_operation_index())) {
      OpIndex index = assembler.graph().Index(op);
      const char* prefix;
      if (index == new_index) {
        prefix = "╰─>";
        before_arrow = false;
      } else if (before_arrow) {
        prefix = "│  ";
      } else {
        prefix = "   ";
      }
      std::cout << prefix << " n" << index.id() << ": "
                << PaddingSpace{5 - CountDecimalDigits(index.id())}
                << OperationPrintStyle{assembler.graph().Get(index), "#n"}
                << "\n";
    }
    std::cout << "\n";
  }
  void TraceBlockFinished() { std::cout << "\n"; }

  // These functions take an operation from the old graph and use the assembler
  // to emit a corresponding operation in the new graph, translating inputs and
  // blocks accordingly.

  V8_INLINE OpIndex ReduceGoto(const GotoOp& op) {
    Block* destination = MapToNewGraph(op.destination->index());
    if (destination->IsBound()) {
      DCHECK(destination->IsLoop());
      FixLoopPhis(destination);
    }
    assembler.current_block()->SetOrigin(current_input_block);
    return assembler.Goto(destination);
  }
  V8_INLINE OpIndex ReduceBranch(const BranchOp& op) {
    Block* if_true = MapToNewGraph(op.if_true->index());
    Block* if_false = MapToNewGraph(op.if_false->index());
    return assembler.Branch(MapToNewGraph(op.condition()), if_true, if_false);
  }
  OpIndex ReduceSwitch(const SwitchOp& op) {
    base::SmallVector<SwitchOp::Case, 16> cases;
    for (SwitchOp::Case c : op.cases) {
      cases.emplace_back(c.value, MapToNewGraph(c.destination->index()));
    }
    return assembler.Switch(
        MapToNewGraph(op.input()),
        assembler.graph_zone()->CloneVector(base::VectorOf(cases)),
        MapToNewGraph(op.default_case->index()));
  }
  OpIndex ReducePhi(const PhiOp& op) {
    base::Vector<const OpIndex> old_inputs = op.inputs();
    base::SmallVector<OpIndex, 8> new_inputs;
    Block* old_pred = current_input_block->LastPredecessor();
    Block* new_pred = assembler.current_block()->LastPredecessor();
    // Control predecessors might be missing after the optimization phase. So
    // we need to skip phi inputs that belong to control predecessors that
    // have no equivalent in the new graph. We do, however, assume that the
    // order of control predecessors did not change.
    for (OpIndex input : base::Reversed(old_inputs)) {
      if (new_pred->Origin() == old_pred) {
        new_inputs.push_back(MapToNewGraph(input));
        new_pred = new_pred->NeighboringPredecessor();
      }
      old_pred = old_pred->NeighboringPredecessor();
    }
    DCHECK_NULL(old_pred);
    DCHECK_NULL(new_pred);
    std::reverse(new_inputs.begin(), new_inputs.end());
    return assembler.Phi(base::VectorOf(new_inputs), op.rep);
  }
  OpIndex ReducePendingLoopPhi(const PendingLoopPhiOp& op) { UNREACHABLE(); }
  V8_INLINE OpIndex ReduceFrameState(const FrameStateOp& op) {
    auto inputs = MapToNewGraph<32>(op.inputs());
    return assembler.FrameState(base::VectorOf(inputs), op.inlined, op.data);
  }
  OpIndex ReduceCall(const CallOp& op) {
    OpIndex callee = MapToNewGraph(op.callee());
    auto arguments = MapToNewGraph<16>(op.arguments());
    return assembler.Call(callee, base::VectorOf(arguments), op.descriptor);
  }
  OpIndex ReduceReturn(const ReturnOp& op) {
    auto inputs = MapToNewGraph<4>(op.inputs());
    return assembler.Return(base::VectorOf(inputs), op.pop_count);
  }
  OpIndex ReduceOverflowCheckedBinop(const OverflowCheckedBinopOp& op) {
    return assembler.OverflowCheckedBinop(
        MapToNewGraph(op.left()), MapToNewGraph(op.right()), op.kind, op.rep);
  }
  OpIndex ReduceFloatUnary(const FloatUnaryOp& op) {
    return assembler.FloatUnary(MapToNewGraph(op.input()), op.kind, op.rep);
  }
  OpIndex ReduceShift(const ShiftOp& op) {
    return assembler.Shift(MapToNewGraph(op.left()), MapToNewGraph(op.right()),
                           op.kind, op.rep);
  }
  OpIndex ReduceEqual(const EqualOp& op) {
    return assembler.Equal(MapToNewGraph(op.left()), MapToNewGraph(op.right()),
                           op.rep);
  }
  OpIndex ReduceComparison(const ComparisonOp& op) {
    return assembler.Comparison(MapToNewGraph(op.left()),
                                MapToNewGraph(op.right()), op.kind, op.rep);
  }
  OpIndex ReduceChange(const ChangeOp& op) {
    return assembler.Change(MapToNewGraph(op.input()), op.kind, op.from, op.to);
  }
  OpIndex ReduceTaggedBitcast(const TaggedBitcastOp& op) {
    return assembler.TaggedBitcast(MapToNewGraph(op.input()), op.from, op.to);
  }
  OpIndex ReduceConstant(const ConstantOp& op) {
    return assembler.Constant(op.kind, op.storage);
  }
  OpIndex ReduceLoad(const LoadOp& op) {
    return assembler.Load(MapToNewGraph(op.base()), op.kind, op.loaded_rep,
                          op.offset);
  }
  OpIndex ReduceIndexedLoad(const IndexedLoadOp& op) {
    return assembler.IndexedLoad(
        MapToNewGraph(op.base()), MapToNewGraph(op.index()), op.kind,
        op.loaded_rep, op.offset, op.element_size_log2);
  }
  OpIndex ReduceStore(const StoreOp& op) {
    return assembler.Store(MapToNewGraph(op.base()), MapToNewGraph(op.value()),
                           op.kind, op.stored_rep, op.write_barrier, op.offset);
  }
  OpIndex ReduceIndexedStore(const IndexedStoreOp& op) {
    return assembler.IndexedStore(
        MapToNewGraph(op.base()), MapToNewGraph(op.index()),
        MapToNewGraph(op.value()), op.kind, op.stored_rep, op.write_barrier,
        op.offset, op.element_size_log2);
  }
  OpIndex ReduceParameter(const ParameterOp& op) {
    return assembler.Parameter(op.parameter_index, op.debug_name);
  }
  OpIndex ReduceStackPointerGreaterThan(const StackPointerGreaterThanOp& op) {
    return assembler.StackPointerGreaterThan(MapToNewGraph(op.stack_limit()),
                                             op.kind);
  }
  OpIndex ReduceLoadStackCheckOffset(const LoadStackCheckOffsetOp& op) {
    return assembler.LoadStackCheckOffset();
  }
  OpIndex ReduceCheckLazyDeopt(const CheckLazyDeoptOp& op) {
    return assembler.CheckLazyDeopt(MapToNewGraph(op.call()),
                                    MapToNewGraph(op.frame_state()));
  }
  OpIndex ReduceDeoptimize(const DeoptimizeOp& op) {
    return assembler.Deoptimize(MapToNewGraph(op.frame_state()), op.parameters);
  }
  OpIndex ReduceDeoptimizeIf(const DeoptimizeIfOp& op) {
    return assembler.DeoptimizeIf(MapToNewGraph(op.condition()),
                                  MapToNewGraph(op.frame_state()), op.negated,
                                  op.parameters);
  }
  OpIndex ReduceProjection(const ProjectionOp& op) {
    return assembler.Projection(MapToNewGraph(op.input()), op.kind);
  }
  OpIndex ReduceBinop(const BinopOp& op) {
    return assembler.Binop(MapToNewGraph(op.left()), MapToNewGraph(op.right()),
                           op.kind, op.rep);
  }
  OpIndex ReduceUnreachable(const UnreachableOp& op) {
    return assembler.Unreachable();
  }

  OpIndex MapToNewGraph(OpIndex old_index) {
    OpIndex result = op_mapping[old_index.id()];
    DCHECK(result.valid());
    return result;
  }

  template <size_t expected_size>
  base::SmallVector<OpIndex, expected_size> MapToNewGraph(
      base::Vector<const OpIndex> inputs) {
    base::SmallVector<OpIndex, expected_size> result;
    for (OpIndex input : inputs) {
      result.push_back(MapToNewGraph(input));
    }
    return result;
  }

  Block* MapToNewGraph(BlockIndex old_index) {
    Block* result = block_mapping[old_index.id()];
    DCHECK_NOT_NULL(result);
    return result;
  }

  void FixLoopPhis(Block* loop) {
    DCHECK(loop->IsLoop());
    for (Operation& op : assembler.graph().operations(*loop)) {
      if (auto* pending_phi = op.TryCast<PendingLoopPhiOp>()) {
        assembler.graph().template Replace<PhiOp>(
            assembler.graph().Index(*pending_phi),
            base::VectorOf({pending_phi->first(),
                            MapToNewGraph(pending_phi->old_backedge_index)}),
            pending_phi->rep);
      }
    }
  }
};

}  // namespace v8::internal::compiler::turboshaft

#endif  // V8_COMPILER_TURBOSHAFT_OPTIMIZATION_PHASE_H_
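The `switch` in `Impl::Run()` above is generated by the `TURBOSHAFT_OPERATION_LIST` X-macro; each opcode stamps out one case. For the `Goto` entry, for instance, `EMIT_INSTR_CASE(Goto)` expands to:

```cpp
case Opcode::kGoto:
  new_index = this->ReduceGoto(op.Cast<GotoOp>());
  break;
```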
src/compiler/turboshaft/recreate-schedule.cc
@@ -106,7 +106,7 @@ RecreateScheduleResult ScheduleBuilder::Run() {
   DCHECK_GE(input_graph.block_count(), 1);
   // The schedule needs to contain an dummy end block because the register
   // allocator expects this. This block is not actually reachable with control
-  // flow. It is added here because the TurboShaft grahp doesn't contain such a
+  // flow. It is added here because the Turboshaft grahp doesn't contain such a
   // block.
   blocks.reserve(input_graph.block_count() + 1);
   blocks.push_back(current_block);
src/flags/flag-definitions.h
@@ -966,7 +966,9 @@ DEFINE_FLOAT(script_delay_fraction, 0.0,
              "busy wait after each Script::Run by the given fraction of the "
              "run's duration")
 
-DEFINE_BOOL(turboshaft, false, "enable TurboFan's TurboShaft phases")
+DEFINE_BOOL(turboshaft, false, "enable TurboFan's Turboshaft phases")
+DEFINE_BOOL(turboshaft_trace_reduction, false,
+            "trace individual Turboshaft reduction steps")
 
 // Favor memory over execution speed.
 DEFINE_BOOL(optimize_for_size, false,
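With both flags defined, the new phase and its tracing can be exercised from the shell; V8 accepts flag names with dashes on the command line, so an invocation would look something like this (the script name is just a placeholder):

```sh
# Illustrative: run a script with the Turboshaft pipeline and per-step tracing.
d8 --turboshaft --turboshaft-trace-reduction test.js
```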
src/logging/runtime-call-stats.h
@@ -369,7 +369,8 @@ class RuntimeCallTimer final {
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, SimplifiedLowering)          \
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, StoreStoreElimination)      \
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, TraceScheduleAndVerify)     \
-  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, BuildTurboShaft)            \
+  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, BuildTurboshaft)            \
+  ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, OptimizeTurboshaft)         \
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, TurboshaftRecreateSchedule) \
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, TypeAssertions)             \
   ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, TypedLowering)              \