[wasm][turbofan] Improve inlining heuristics

This CL improves wasm inlining heuristics in TurboFan, for an average
8.5% performance improvement in selected benchmarks.

Changes:
- In WasmInliner::Reduce(), only collect inlining candidates into a
  priority queue, according to WasmInliner::LexicographicOrdering.
  Move actual inlining to Finalize().
- Remove the InlineFirstFew heuristic. Add two limits to inlining: a
  maximum relative size increase (inversely proportional to the function
  size) and a maximum absolute size increase (see the sketch below).
- Pass information about call frequency from Liftoff-collected feedback
  to the WasmInliner through the wasm module.
- Run wasm inlining alongside other optimizations in the pipeline.
- Split inlining and speculative inlining tests.
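
As an illustration (not part of the patch), the two new limits combine as
follows, using the default values from the flag-definitions.h hunk below:

  // Additional TF nodes allowed when inlining into a caller that starts
  // with {initial_graph_size} nodes:
  //   min(FLAG_wasm_inlining_max_size,                           // 1250
  //       FLAG_wasm_inlining_budget_factor / initial_graph_size)  // 100000
  // E.g. a 100-node caller may grow by min(1250, 1000) = 1000 nodes, while
  // a 1000-node caller may only grow by min(1250, 100) = 100 nodes.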

Bug: v8:7748, v8:12166
Change-Id: Iccee22093db765981889a24451fb458dfce1f1a6
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3222764
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Manos Koukoutos <manoskouk@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77428}
Manos Koukoutos 2021-10-15 13:51:34 +00:00 committed by V8 LUCI CQ
parent e4dba97006
commit bce4410837
10 changed files with 418 additions and 216 deletions

src/compiler/pipeline.cc

@@ -1685,28 +1685,6 @@ struct WasmLoopUnrollingPhase {
}
}
};
struct WasmInliningPhase {
DECL_PIPELINE_PHASE_CONSTANTS(WasmInlining)
void Run(PipelineData* data, Zone* temp_zone, wasm::CompilationEnv* env,
const wasm::WireBytesStorage* wire_bytes) {
GraphReducer graph_reducer(
temp_zone, data->graph(), &data->info()->tick_counter(), data->broker(),
data->jsgraph()->Dead(), data->observe_node_manager());
DeadCodeElimination dead(&graph_reducer, data->graph(),
data->mcgraph()->common(), temp_zone);
// For now, inline the first few functions;
InlineFirstFew heuristics(FLAG_wasm_inlining_budget);
WasmInliner inliner(&graph_reducer, env, data->source_positions(),
data->node_origins(), data->mcgraph(), wire_bytes,
&heuristics);
AddReducer(data, &graph_reducer, &dead);
AddReducer(data, &graph_reducer, &inliner);
graph_reducer.ReduceGraph();
}
};
#endif // V8_ENABLE_WEBASSEMBLY
struct LoopExitEliminationPhase {
@@ -2010,12 +1988,14 @@ struct ScheduledEffectControlLinearizationPhase {
struct WasmOptimizationPhase {
DECL_PIPELINE_PHASE_CONSTANTS(WasmOptimization)
void Run(PipelineData* data, Zone* temp_zone, bool allow_signalling_nan) {
void Run(PipelineData* data, Zone* temp_zone, bool allow_signalling_nan,
wasm::CompilationEnv* env, uint32_t function_index,
const wasm::WireBytesStorage* wire_bytes) {
// Run optimizations in two rounds: First one around load elimination and
// then one around branch elimination. This is because those two
// optimizations sometimes display quadratic complexity when run together.
// We only need load elimination for managed objects.
if (FLAG_experimental_wasm_gc) {
if (FLAG_experimental_wasm_gc || FLAG_wasm_inlining) {
GraphReducer graph_reducer(temp_zone, data->graph(),
&data->info()->tick_counter(), data->broker(),
data->jsgraph()->Dead(),
@@ -2030,11 +2010,20 @@ struct WasmOptimizationPhase {
ValueNumberingReducer value_numbering(temp_zone, data->graph()->zone());
CsaLoadElimination load_elimination(&graph_reducer, data->jsgraph(),
temp_zone);
WasmInliner inliner(&graph_reducer, env, function_index,
data->source_positions(), data->node_origins(),
data->mcgraph(), wire_bytes);
AddReducer(data, &graph_reducer, &machine_reducer);
AddReducer(data, &graph_reducer, &dead_code_elimination);
AddReducer(data, &graph_reducer, &common_reducer);
AddReducer(data, &graph_reducer, &value_numbering);
if (FLAG_experimental_wasm_gc) {
AddReducer(data, &graph_reducer, &load_elimination);
}
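// Register the inliner only when inlining is enabled and the size budget
// could fit at least a minimal function (see any_inlining_impossible()).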
if (FLAG_wasm_inlining &&
!WasmInliner::any_inlining_impossible(data->graph()->NodeCount())) {
AddReducer(data, &graph_reducer, &inliner);
}
graph_reducer.ReduceGraph();
}
{
@@ -3236,14 +3225,11 @@ void Pipeline::GenerateCodeForWasmFunction(
pipeline.Run<WasmLoopUnrollingPhase>(loop_info);
pipeline.RunPrintAndVerify(WasmLoopUnrollingPhase::phase_name(), true);
}
if (FLAG_wasm_inlining) {
pipeline.Run<WasmInliningPhase>(env, wire_bytes_storage);
pipeline.RunPrintAndVerify(WasmInliningPhase::phase_name(), true);
}
const bool is_asm_js = is_asmjs_module(module);
if (FLAG_wasm_opt || is_asm_js) {
pipeline.Run<WasmOptimizationPhase>(is_asm_js);
pipeline.Run<WasmOptimizationPhase>(is_asm_js, env, function_index,
wire_bytes_storage);
pipeline.RunPrintAndVerify(WasmOptimizationPhase::phase_name(), true);
} else {
pipeline.Run<WasmBaseOptimizationPhase>();

src/compiler/wasm-inliner.cc

@@ -28,24 +28,34 @@ Reduction WasmInliner::Reduce(Node* node) {
}
#define TRACE(...) \
if (FLAG_trace_wasm_speculative_inlining) { \
PrintF(__VA_ARGS__); \
}
if (FLAG_trace_wasm_inlining) PrintF(__VA_ARGS__);
// TODO(12166): Save inlined frames for trap/--trace-wasm purposes. Consider
// tail calls.
// TODO(12166): Inline indirect calls/call_ref.
Reduction WasmInliner::ReduceCall(Node* call) {
DCHECK(call->opcode() == IrOpcode::kCall ||
call->opcode() == IrOpcode::kTailCall);
if (seen_.find(call) != seen_.end()) {
TRACE("function %d: have already seen node %d, skipping\n", function_index_,
call->id());
return NoChange();
}
seen_.insert(call);
Node* callee = NodeProperties::GetValueInput(call, 0);
IrOpcode::Value reloc_opcode = mcgraph_->machine()->Is32()
? IrOpcode::kRelocatableInt32Constant
: IrOpcode::kRelocatableInt64Constant;
if (callee->opcode() != reloc_opcode) return NoChange();
if (callee->opcode() != reloc_opcode) {
TRACE("[function %d: considering node %d... not a relocatable constant]\n",
function_index_, call->id());
return NoChange();
}
auto info = OpParameter<RelocatablePtrConstantInfo>(callee->op());
uint32_t inlinee_index = static_cast<uint32_t>(info.value());
TRACE("[considering call to %d... ", inlinee_index)
TRACE("[function %d: considering node %d, call to %d... ", function_index_,
call->id(), inlinee_index)
if (info.rmode() != RelocInfo::WASM_CALL) {
TRACE("not a wasm call]\n")
return NoChange();
@@ -54,19 +64,64 @@ Reduction WasmInliner::ReduceCall(Node* call) {
TRACE("imported function]\n")
return NoChange();
}
if (!heuristics_->DoInline(source_positions_->GetSourcePosition(call),
inlinee_index)) {
TRACE("heuristics say no]\n")
if (inlinee_index == function_index_) {
TRACE("recursive call]\n")
return NoChange();
}
TRACE("inlining!]\n")
TRACE("adding to inlining candidates!]\n")
bool is_speculative_call_ref = false;
int call_count = 0;
if (FLAG_wasm_speculative_inlining) {
base::MutexGuard guard(&module()->type_feedback.mutex);
auto maybe_feedback =
module()->type_feedback.feedback_for_function.find(function_index_);
if (maybe_feedback != module()->type_feedback.feedback_for_function.end()) {
wasm::FunctionTypeFeedback feedback = maybe_feedback->second;
wasm::WasmCodePosition position =
source_positions_->GetSourcePosition(call).ScriptOffset();
DCHECK_NE(position, wasm::kNoCodePosition);
auto index_in_feedback_vector = feedback.positions.find(position);
if (index_in_feedback_vector != feedback.positions.end()) {
is_speculative_call_ref = true;
call_count = feedback.feedback_vector[index_in_feedback_vector->second]
.absolute_call_frequency;
}
}
}
CHECK_LT(inlinee_index, module()->functions.size());
const wasm::WasmFunction* inlinee = &module()->functions[inlinee_index];
base::Vector<const byte> function_bytes = wire_bytes_->GetCode(inlinee->code);
CandidateInfo candidate{call, inlinee_index, is_speculative_call_ref,
call_count, function_bytes.length()};
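// Only queue the candidate here; actual inlining is deferred to Finalize(),
// which pops candidates in priority order until the size budget runs out.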
inlining_candidates_.push(candidate);
return NoChange();
}
void WasmInliner::Finalize() {
TRACE("function %d: going though inlining candidates...\n", function_index_);
while (!inlining_candidates_.empty()) {
CandidateInfo candidate = inlining_candidates_.top();
inlining_candidates_.pop();
Node* call = candidate.node;
TRACE(
" [function %d: considering candidate {@%d, index=%d, type=%s, "
"count=%d, size=%d}... ",
function_index_, call->id(), candidate.inlinee_index,
candidate.is_speculative_call_ref ? "ref" : "direct",
candidate.call_count, candidate.wire_byte_size);
if (call->IsDead()) {
TRACE("dead node]\n");
continue;
}
const wasm::WasmFunction* inlinee =
&module()->functions[candidate.inlinee_index];
base::Vector<const byte> function_bytes =
wire_bytes_->GetCode(inlinee->code);
const wasm::FunctionBody inlinee_body(inlinee->sig, inlinee->code.offset(),
function_bytes.begin(),
function_bytes.end());
@@ -81,19 +136,40 @@ Reduction WasmInliner::ReduceCall(Node* call) {
Node* inlinee_end;
{
Graph::SubgraphScope scope(graph());
result = wasm::BuildTFGraph(zone()->allocator(), env_->enabled_features,
module(), &builder, &detected, inlinee_body,
&infos, node_origins_, inlinee_index,
wasm::kInlinedFunction);
result = wasm::BuildTFGraph(
zone()->allocator(), env_->enabled_features, module(), &builder,
&detected, inlinee_body, &infos, node_origins_,
candidate.inlinee_index, wasm::kInlinedFunction);
inlinee_start = graph()->start();
inlinee_end = graph()->end();
}
if (result.failed()) {
// This can happen if the inlinee has never been compiled before and is
// invalid. Return, as there is no point to keep optimizing.
TRACE("failed to compile]\n")
return;
}
if (result.failed()) return NoChange();
return call->opcode() == IrOpcode::kCall
? InlineCall(call, inlinee_start, inlinee_end, inlinee->sig,
subgraph_min_node_id)
: InlineTailCall(call, inlinee_start, inlinee_end);
size_t additional_nodes = graph()->NodeCount() - subgraph_min_node_id;
if (current_graph_size_ + additional_nodes >
size_limit(initial_graph_size_)) {
// This is not based on the accurate graph size, as it may have been
// shrunk by other optimizations. We could recompute the accurate size
// with a traversal, but it is most probably not worth the time.
TRACE("not enough inlining budget]\n");
continue;
}
TRACE("inlining!]\n");
current_graph_size_ += additional_nodes;
if (call->opcode() == IrOpcode::kCall) {
InlineCall(call, inlinee_start, inlinee_end, inlinee->sig,
subgraph_min_node_id);
} else {
InlineTailCall(call, inlinee_start, inlinee_end);
}
// Returning after only one inlining has been tried and found worse.
}
}
/* Rewire callee formal parameters to the call-site real parameters. Rewire
@@ -121,12 +197,13 @@ void WasmInliner::RewireFunctionEntry(Node* call, Node* callee_start) {
} else {
UNREACHABLE();
}
Revisit(edge.from());
break;
}
}
}
Reduction WasmInliner::InlineTailCall(Node* call, Node* callee_start,
void WasmInliner::InlineTailCall(Node* call, Node* callee_start,
Node* callee_end) {
DCHECK(call->opcode() == IrOpcode::kTailCall);
// 1) Rewire function entry.
@@ -136,14 +213,17 @@ Reduction WasmInliner::InlineTailCall(Node* call, Node* callee_start,
for (Node* const input : callee_end->inputs()) {
DCHECK(IrOpcode::IsGraphTerminator(input->opcode()));
NodeProperties::MergeControlToEnd(graph(), common(), input);
Revisit(graph()->end());
}
for (Edge edge_to_end : call->use_edges()) {
DCHECK_EQ(edge_to_end.from(), graph()->end());
edge_to_end.UpdateTo(mcgraph()->Dead());
}
callee_end->Kill();
return Replace(mcgraph()->Dead());
call->Kill();
Revisit(graph()->end());
}
Reduction WasmInliner::InlineCall(Node* call, Node* callee_start,
Node* callee_end,
void WasmInliner::InlineCall(Node* call, Node* callee_start, Node* callee_end,
const wasm::FunctionSig* inlinee_sig,
size_t subgraph_min_node_id) {
DCHECK(call->opcode() == IrOpcode::kCall);
@@ -312,17 +392,17 @@ Reduction WasmInliner::InlineCall(Node* call, Node* callee_start,
// Dead() as a dummy for value replacement.
ReplaceWithValue(call, mcgraph()->Dead(), effect_output, control_output);
}
return Replace(mcgraph()->Dead());
} else {
// The callee can never return. The call node and all its uses are dead.
ReplaceWithValue(call, mcgraph()->Dead(), mcgraph()->Dead(),
mcgraph()->Dead());
return Changed(call);
}
}
const wasm::WasmModule* WasmInliner::module() const { return env_->module; }
#undef TRACE
} // namespace compiler
} // namespace internal
} // namespace v8

src/compiler/wasm-inliner.h

@@ -30,23 +30,6 @@ namespace compiler {
class NodeOriginTable;
class SourcePositionTable;
// Parent class for classes that provide heuristics on how to inline in wasm.
class WasmInliningHeuristics {
public:
virtual bool DoInline(SourcePosition position, uint32_t function_index) = 0;
};
class InlineFirstFew : public WasmInliningHeuristics {
public:
explicit InlineFirstFew(int count) : count_(count) {}
bool DoInline(SourcePosition position, uint32_t function_index) override {
return count_-- > 0;
}
private:
int count_;
};
// The WasmInliner provides the core graph inlining machinery for WebAssembly
// graphs. Note that this class only deals with the mechanics of how to inline
// one graph into another; heuristics that decide what and how much to inline
@@ -54,43 +37,118 @@ class InlineFirstFew : public WasmInliningHeuristics {
class WasmInliner final : public AdvancedReducer {
public:
WasmInliner(Editor* editor, wasm::CompilationEnv* env,
SourcePositionTable* source_positions,
uint32_t function_index, SourcePositionTable* source_positions,
NodeOriginTable* node_origins, MachineGraph* mcgraph,
const wasm::WireBytesStorage* wire_bytes,
WasmInliningHeuristics* heuristics)
const wasm::WireBytesStorage* wire_bytes)
: AdvancedReducer(editor),
env_(env),
function_index_(function_index),
source_positions_(source_positions),
node_origins_(node_origins),
mcgraph_(mcgraph),
wire_bytes_(wire_bytes),
heuristics_(heuristics) {}
initial_graph_size_(mcgraph->graph()->NodeCount()),
current_graph_size_(initial_graph_size_),
inlining_candidates_() {}
const char* reducer_name() const override { return "WasmInliner"; }
Reduction Reduce(Node* node) final;
void Finalize() final;
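// Returns true if even the smallest viable function (kMinimumFunctionNodeCount
// nodes) cannot fit into the remaining size budget, in which case the pipeline
// skips registering the inliner altogether.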
static bool any_inlining_impossible(size_t initial_graph_size) {
return size_limit(initial_graph_size) - initial_graph_size <
kMinimumFunctionNodeCount;
}
private:
struct CandidateInfo {
Node* node;
uint32_t inlinee_index;
bool is_speculative_call_ref;
int call_count;
int wire_byte_size;
};
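// Pops candidates in the following priority order: speculative call_ref
// candidates before direct calls, then higher observed call counts first,
// then smaller functions (in wire bytes) first.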
struct LexicographicOrdering {
// Returns true if c1 should be prioritized lower than c2.
bool operator()(CandidateInfo& c1, CandidateInfo& c2) {
if (c1.is_speculative_call_ref && !c2.is_speculative_call_ref) {
return false;
}
if (c2.is_speculative_call_ref && !c1.is_speculative_call_ref) {
return true;
}
if (c1.call_count > c2.call_count) return false;
if (c2.call_count > c1.call_count) return true;
return c1.wire_byte_size > c2.wire_byte_size;
}
};
// TODO(manoskouk): This has not been found to be useful, but something
// similar may be tried again in the future.
// struct AdvancedOrdering {
// // Returns if c1 should be prioritized less than c2.
// bool operator()(CandidateInfo& c1, CandidateInfo& c2) {
// if (c1.is_speculative_call_ref && c2.is_speculative_call_ref) {
// if (c1.call_count > c2.call_count) return false;
// if (c2.call_count > c1.call_count) return true;
// return c1.wire_byte_size > c2.wire_byte_size;
// }
// if (!c1.is_speculative_call_ref && !c2.is_speculative_call_ref) {
// return c1.wire_byte_size > c2.wire_byte_size;
// }
//
// constexpr int kAssumedCallCountForDirectCalls = 3;
//
// int c1_call_count = c1.is_speculative_call_ref
// ? c1.call_count
// : kAssumedCallCountForDirectCalls;
// int c2_call_count = c2.is_speculative_call_ref
// ? c2.call_count
// : kAssumedCallCountForDirectCalls;
//
// return static_cast<float>(c1_call_count) / c1.wire_byte_size <
// static_cast<float>(c2_call_count) / c2.wire_byte_size;
// }
//};
Zone* zone() const { return mcgraph_->zone(); }
CommonOperatorBuilder* common() const { return mcgraph_->common(); }
Graph* graph() const { return mcgraph_->graph(); }
MachineGraph* mcgraph() const { return mcgraph_; }
const wasm::WasmModule* module() const;
const wasm::WasmFunction* inlinee() const;
// A limit to the size of the inlined graph as a function of its initial size.
static size_t size_limit(size_t initial_graph_size) {
return initial_graph_size +
std::min(FLAG_wasm_inlining_max_size,
FLAG_wasm_inlining_budget_factor / initial_graph_size);
}
// The smallest size in TF nodes any meaningful wasm function can have
// (start, instance parameter, end).
static constexpr size_t kMinimumFunctionNodeCount = 3;
Reduction ReduceCall(Node* call);
Reduction InlineCall(Node* call, Node* callee_start, Node* callee_end,
void InlineCall(Node* call, Node* callee_start, Node* callee_end,
const wasm::FunctionSig* inlinee_sig,
size_t subgraph_min_node_id);
Reduction InlineTailCall(Node* call, Node* callee_start, Node* callee_end);
void InlineTailCall(Node* call, Node* callee_start, Node* callee_end);
void RewireFunctionEntry(Node* call, Node* callee_start);
wasm::CompilationEnv* const env_;
uint32_t function_index_;
SourcePositionTable* const source_positions_;
NodeOriginTable* const node_origins_;
MachineGraph* const mcgraph_;
const wasm::WireBytesStorage* const wire_bytes_;
WasmInliningHeuristics* heuristics_;
const size_t initial_graph_size_;
size_t current_graph_size_;
std::priority_queue<CandidateInfo, std::vector<CandidateInfo>,
LexicographicOrdering>
inlining_candidates_;
std::unordered_set<Node*> seen_;
};
} // namespace compiler

src/flags/flag-definitions.h

@@ -1052,10 +1052,14 @@ DEFINE_BOOL(wasm_math_intrinsics, true,
DEFINE_BOOL(
wasm_inlining, false,
"enable inlining of wasm functions into wasm functions (experimental)")
DEFINE_INT(wasm_inlining_budget, 3,
"maximum number of call targets to inline into a Wasm function")
DEFINE_SIZE_T(
wasm_inlining_budget_factor, 100000,
"maximum allowed size to inline a function is given by {n / caller size}")
DEFINE_SIZE_T(wasm_inlining_max_size, 1250,
"maximum size of a function that can be inlined, in TF nodes")
DEFINE_BOOL(wasm_speculative_inlining, false,
"enable speculative inlining of call_ref targets (experimental)")
DEFINE_BOOL(trace_wasm_inlining, false, "trace wasm inlining")
DEFINE_BOOL(trace_wasm_speculative_inlining, false,
"trace wasm speculative inlining")
DEFINE_IMPLICATION(wasm_speculative_inlining, experimental_wasm_typed_funcref)
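
As an illustration (hypothetical invocation; test.js is a placeholder), the
new flags can be exercised and traced in d8 with:

  d8 --wasm-inlining --trace-wasm-inlining test.js
  d8 --wasm-speculative-inlining --trace-wasm-speculative-inlining test.js

--wasm-speculative-inlining implies --experimental-wasm-typed-funcref via
the DEFINE_IMPLICATION above.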

src/wasm/baseline/liftoff-compiler.cc

@@ -5945,6 +5945,12 @@ class LiftoffCompiler {
LiftoffAssembler::VarState vector_var(kPointerKind, vector, 0);
LiftoffRegister index = pinned.set(__ GetUnusedRegister(kGpReg, pinned));
uintptr_t vector_slot = num_call_ref_instructions_ * 2;
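// Remember which feedback-vector slot belongs to this call_ref's bytecode
// position, so the inliner can later map the call site back to its feedback.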
{
base::MutexGuard mutex_guard(&decoder->module_->type_feedback.mutex);
decoder->module_->type_feedback.feedback_for_function[func_index_]
.positions[decoder->position()] =
static_cast<int>(num_call_ref_instructions_);
}
num_call_ref_instructions_++;
__ LoadConstant(index, WasmValue::ForUintPtr(vector_slot));
LiftoffAssembler::VarState index_var(kIntPtrKind, index, 0);

src/wasm/graph-builder-interface.cc

@@ -127,13 +127,10 @@ class WasmGraphBuildingInterface {
base::MutexGuard mutex_guard(&feedbacks.mutex);
auto feedback = feedbacks.feedback_for_function.find(func_index_);
if (feedback != feedbacks.feedback_for_function.end()) {
type_feedback_ = std::move(feedback->second);
// Erasing the map entry means that if the same function later gets
// inlined, its inlined copy won't have any type feedback available.
// However, if we don't erase the entry now, we'll be stuck with it
// forever.
type_feedback_ = feedback->second.feedback_vector;
// We need to keep the feedback in the module to inline later. However,
// this means we are stuck with it forever.
// TODO(jkummerow): Reconsider our options here.
feedbacks.feedback_for_function.erase(func_index_);
}
}
// The first '+ 1' is needed by TF Start node, the second '+ 1' is for the
@@ -675,7 +672,8 @@ class WasmGraphBuildingInterface {
// we won't have any for inlined functions. Figure out how to change that.
if (FLAG_wasm_speculative_inlining && type_feedback_.size() > 0) {
DCHECK_LT(feedback_instruction_index_, type_feedback_.size());
maybe_feedback = type_feedback_[feedback_instruction_index_];
maybe_feedback =
type_feedback_[feedback_instruction_index_].function_index;
feedback_instruction_index_++;
}
if (maybe_feedback == -1) {
@@ -742,9 +740,10 @@
const FunctionSig* sig, uint32_t sig_index,
const Value args[]) {
int maybe_feedback = -1;
if (FLAG_wasm_speculative_inlining) {
DCHECK_LE(feedback_instruction_index_, type_feedback_.size());
maybe_feedback = type_feedback_[feedback_instruction_index_];
if (FLAG_wasm_speculative_inlining && type_feedback_.size() > 0) {
DCHECK_LT(feedback_instruction_index_, type_feedback_.size());
maybe_feedback =
type_feedback_[feedback_instruction_index_].function_index;
feedback_instruction_index_++;
}
if (maybe_feedback == -1) {
@@ -1291,7 +1290,7 @@
// The entries in {type_feedback_} are indexed by the position of feedback-
// consuming instructions (currently only call_ref).
int feedback_instruction_index_ = 0;
std::vector<int> type_feedback_;
std::vector<CallSiteFeedback> type_feedback_;
TFNode* effect() { return builder_->effect(); }

src/wasm/module-compiler.cc

@@ -1241,14 +1241,13 @@ bool CompileLazy(Isolate* isolate, Handle<WasmInstanceObject> instance,
return true;
}
std::vector<int> ProcessTypeFeedback(Isolate* isolate,
Handle<WasmInstanceObject> instance,
int func_index) {
std::vector<CallSiteFeedback> ProcessTypeFeedback(
Isolate* isolate, Handle<WasmInstanceObject> instance, int func_index) {
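// Each resulting entry pairs the chosen call target with its observed call
// count; {-1, -1} marks a call site that is not eligible for inlining.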
int which_vector = declared_function_index(instance->module(), func_index);
Object maybe_feedback = instance->feedback_vectors().get(which_vector);
if (!maybe_feedback.IsFixedArray()) return {};
FixedArray feedback = FixedArray::cast(maybe_feedback);
std::vector<int> result(feedback.length() / 2);
std::vector<CallSiteFeedback> result(feedback.length() / 2);
int imported_functions =
static_cast<int>(instance->module()->num_imported_functions);
for (int i = 0; i < feedback.length(); i += 2) {
@@ -1263,7 +1262,9 @@ std::vector<int> ProcessTypeFeedback(Isolate* isolate,
PrintF("[Function #%d call_ref #%d inlineable (monomorphic)]\n",
func_index, i / 2);
}
result[i / 2] = target.function_index();
CallRefData data = CallRefData::cast(feedback.get(i + 1));
result[i / 2] = {target.function_index(),
static_cast<int>(data.count())};
continue;
}
} else if (value.IsFixedArray()) {
@@ -1276,6 +1277,7 @@ std::vector<int> ProcessTypeFeedback(Isolate* isolate,
total_count += CallRefData::cast(polymorphic.get(j + 1)).count();
}
int found_target = -1;
int found_count = -1;
double best_frequency = 0;
for (int j = 0; j < polymorphic.length(); j += 2) {
uint32_t this_count = CallRefData::cast(polymorphic.get(j + 1)).count();
@@ -1293,6 +1295,7 @@ std::vector<int> ProcessTypeFeedback(Isolate* isolate,
continue;
}
found_target = target.function_index();
found_count = static_cast<int>(this_count);
if (FLAG_trace_wasm_speculative_inlining) {
PrintF("[Function #%d call_ref #%d inlineable (polymorphic %f)]\n",
func_index, i / 2, frequency);
@@ -1300,7 +1303,7 @@ std::vector<int> ProcessTypeFeedback(Isolate* isolate,
break;
}
if (found_target >= 0) {
result[i / 2] = found_target;
result[i / 2] = {found_target, found_count};
continue;
} else if (FLAG_trace_wasm_speculative_inlining) {
PrintF("[Function #%d call_ref #%d: best frequency %f]\n", func_index,
@@ -1310,7 +1313,7 @@ std::vector<int> ProcessTypeFeedback(Isolate* isolate,
// If we fall through to here, then this call isn't eligible for inlining.
// Possible reasons: uninitialized or megamorphic feedback; or monomorphic
// or polymorphic that didn't meet our requirements.
result[i / 2] = -1;
result[i / 2] = {-1, -1};
}
return result;
}
@@ -1329,7 +1332,7 @@ void TriggerTierUp(Isolate* isolate, NativeModule* native_module,
// TODO(jkummerow): we could have collisions here if two different instances
// of the same module schedule tier-ups of the same function at the same
// time. If that ever becomes a problem, figure out a solution.
module->type_feedback.feedback_for_function[func_index] =
module->type_feedback.feedback_for_function[func_index].feedback_vector =
std::move(feedback);
}

src/wasm/wasm-module.h

@@ -262,8 +262,16 @@ struct V8_EXPORT_PRIVATE WasmDebugSymbols {
WireBytesRef external_url;
};
struct CallSiteFeedback {
int function_index;
int absolute_call_frequency;
};
struct FunctionTypeFeedback {
std::vector<CallSiteFeedback> feedback_vector;
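// Maps a call_ref's position in the function body to its index in
// {feedback_vector}.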
std::map<WasmCodePosition, int> positions;
};
struct TypeFeedbackStorage {
std::map<uint32_t, std::vector<int>> feedback_for_function;
std::map<uint32_t, FunctionTypeFeedback> feedback_for_function;
// Accesses to {feedback_for_function} are guarded by this mutex.
base::Mutex mutex;
};

test/mjsunit/wasm/inlining.js

@@ -3,7 +3,6 @@
// found in the LICENSE file.
// Flags: --wasm-inlining --no-liftoff --experimental-wasm-return-call
// Flags: --experimental-wasm-typed-funcref
d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
@@ -11,6 +10,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
// output, or implementing testing infrastructure with --allow-natives-syntax.
(function SimpleInliningTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = x - 1
@@ -27,6 +27,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function MultiReturnTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = (x - 1, x + 1)
@@ -43,6 +44,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function NoReturnTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let global = builder.addGlobal(kWasmI32, true);
@@ -60,6 +62,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function InfiniteLoopTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let callee = builder.addFunction("callee", kSig_i_i)
@@ -78,6 +81,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function TailCallInCalleeTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = g(x - 1)
@@ -98,6 +102,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function MultipleCallAndReturnSitesTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = x >= 0 ? x - 1 : x + 1
@@ -121,6 +126,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function TailCallInCallerTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = x > 0 ? g(x) + 1: g(x - 1);
@@ -148,6 +154,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function HandledInHandledTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let tag = builder.addTag(kSig_v_i);
@@ -173,6 +180,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function HandledInUnhandledTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let tag = builder.addTag(kSig_v_i);
@@ -194,6 +202,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
})();
(function UnhandledInUnhandledTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let tag = builder.addTag(kSig_v_i);
@@ -213,6 +222,7 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
// the unhandled calls in the callee (including the 'throw' builtin) to the
// handler in the caller.
(function UnhandledInHandledTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
let tag = builder.addTag(kSig_v_i);
@@ -241,96 +251,9 @@ d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
assertEquals(20, instance.exports.main(10, 20));
})();
(function CallRefSpecSucceededTest() {
let builder = new WasmModuleBuilder();
// f(x) = x - 1
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(0), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5) + x
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprGlobalGet, global.index, kExprCallRef,
kExprLocalGet, 0, kExprI32Add])
.exportAs("main");
let instance = builder.instantiate();
assertEquals(14, instance.exports.main(10));
})();
(function CallRefSpecFailedTest() {
let builder = new WasmModuleBuilder();
// h(x) = x - 1
builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
// f(x) = x - 2
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 2, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5) + x
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprGlobalGet, global.index, kExprCallRef,
kExprLocalGet, 0, kExprI32Add])
.exportAs("main");
let instance = builder.instantiate();
assertEquals(13, instance.exports.main(10));
})();
(function CallReturnRefSpecSucceededTest() {
let builder = new WasmModuleBuilder();
// f(x) = x - 1
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(0), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5 + x)
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprLocalGet, 0, kExprI32Add,
kExprGlobalGet, global.index, kExprReturnCallRef])
.exportAs("main");
let instance = builder.instantiate();
assertEquals(14, instance.exports.main(10));
})();
(function CallReturnRefSpecFailedTest() {
let builder = new WasmModuleBuilder();
// h(x) = x - 1
builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
// f(x) = x - 2
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 2, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5 + x)
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprLocalGet, 0, kExprI32Add,
kExprGlobalGet, global.index, kExprReturnCallRef])
.exportAs("main");
let instance = builder.instantiate();
assertEquals(13, instance.exports.main(10));
})();
// Tests that no LoopExits are emitted in the inlined function.
(function LoopUnrollingTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x, y) = { do { y += 1; x -= 1; } while (x > 0); return y; }

test/mjsunit/wasm/speculative-inlining.js

@@ -0,0 +1,135 @@
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --wasm-speculative-inlining --experimental-wasm-return-call
// Flags: --experimental-wasm-typed-funcref
d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
(function CallRefSpecSucceededTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = x - 1
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(0), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5) + x
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprGlobalGet, global.index, kExprCallRef,
kExprLocalGet, 0, kExprI32Add])
.exportAs("main");
let instance = builder.instantiate();
// Run it 10 times to trigger tier-up.
for (var i = 0; i < 10; i++) assertEquals(14, instance.exports.main(10));
})();
(function CallRefSpecFailedTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// h(x) = x - 1
let callee0 = builder.addFunction("callee0", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
// f(x) = x - 2
let callee1 = builder.addFunction("callee1", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 2, kExprI32Sub]);
let global0 = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee0.index));
let global1 = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee1.index));
// g(x, y) = if (y) { h(5) + x } else { f(7) + x }
builder.addFunction("main", kSig_i_ii)
.addBody([
kExprLocalGet, 1,
kExprIf, kWasmI32,
kExprI32Const, 5, kExprGlobalGet, global0.index, kExprCallRef,
kExprLocalGet, 0, kExprI32Add,
kExprElse,
kExprI32Const, 7, kExprGlobalGet, global1.index, kExprCallRef,
kExprLocalGet, 0, kExprI32Add,
kExprEnd])
.exportAs("main");
let instance = builder.instantiate();
// Run main 10 times with the same function reference to trigger tier-up.
// This will speculatively inline a call to function {h}.
for (var i = 0; i < 10; i++) assertEquals(14, instance.exports.main(10, 1));
// If tier-up is done, "callee0" should be inlined in the trace.
assertEquals(14, instance.exports.main(10, 1));
// Now, run main with {f} instead. The correct reference should still be
// called, i.e., "callee1".
assertEquals(15, instance.exports.main(10, 0));
})();
// TODO(manoskouk): Fix the following tests.
(function CallReturnRefSpecSucceededTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// f(x) = x - 1
let callee = builder.addFunction("callee", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
let global = builder.addGlobal(wasmRefType(0), false,
WasmInitExpr.RefFunc(callee.index));
// g(x) = f(5 + x)
builder.addFunction("main", kSig_i_i)
.addBody([kExprI32Const, 5, kExprLocalGet, 0, kExprI32Add,
kExprGlobalGet, global.index, kExprReturnCallRef])
.exportAs("main");
let instance = builder.instantiate();
// Run it 10 times to trigger tier-up.
for (var i = 0; i < 10; i++) assertEquals(14, instance.exports.main(10));
})();
(function CallReturnRefSpecFailedTest() {
print(arguments.callee.name);
let builder = new WasmModuleBuilder();
// h(x) = x - 1
let callee0 = builder.addFunction("callee0", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 1, kExprI32Sub]);
// f(x) = x - 2
let callee1 = builder.addFunction("callee1", kSig_i_i)
.addBody([kExprLocalGet, 0, kExprI32Const, 2, kExprI32Sub]);
let global0 = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee0.index));
let global1 = builder.addGlobal(wasmRefType(1), false,
WasmInitExpr.RefFunc(callee1.index));
// g(x, y) = if (y) { h(x) } else { f(x) }
builder.addFunction("main", kSig_i_ii)
.addBody([
kExprLocalGet, 1,
kExprIf, kWasmI32,
kExprLocalGet, 0, kExprGlobalGet, global0.index, kExprReturnCallRef,
kExprElse,
kExprLocalGet, 0, kExprGlobalGet, global1.index, kExprReturnCallRef,
kExprEnd])
.exportAs("main");
let instance = builder.instantiate();
// Run main 10 times with the same function reference to trigger tier-up.
// This will speculatively inline a call to function {h}.
for (var i = 0; i < 10; i++) assertEquals(9, instance.exports.main(10, 1));
// If tier-up is done, "callee0" should be inlined in the trace.
assertEquals(9, instance.exports.main(10, 1));
// Now, run main with {f} instead. The correct reference should still be
// called, i.e., "callee1".
assertEquals(8, instance.exports.main(10, 0));
})();