diff --git a/BUILD.gn b/BUILD.gn
index 3ec0102866..e4ff9d0beb 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -144,6 +144,22 @@ declare_args() {
   # used builtins.
   v8_enable_builtins_profiling_verbose = false
 
+  # Provides the given V8 log file as an input to mksnapshot, where it can be
+  # used for profile-guided optimization of builtins.
+  #
+  # To do profile-guided optimization of builtins:
+  # 1. Build with v8_enable_builtins_profiling = true
+  # 2. Run your chosen workload with the --turbo-profiling-log-builtins flag.
+  #    For Chrome, the invocation might look like this:
+  #    chrome --no-sandbox --disable-extensions
+  #      --js-flags="--turbo-profiling-log-builtins --logfile=path/to/v8.log"
+  #      "http://localhost/test-suite"
+  # 3. Optionally repeat step 2 for additional workloads, and concatenate all of
+  #    the resulting log files into a single file.
+  # 4. Build again with v8_builtins_profiling_log_file set to the file created
+  #    in steps 2-3.
+  v8_builtins_profiling_log_file = ""
+
   # Enables various testing features.
   v8_enable_test_features = ""
 
@@ -1473,6 +1489,12 @@ template("run_mksnapshot") {
     if (v8_enable_builtins_profiling_verbose) {
       args += [ "--turbo-profiling-verbose" ]
     }
+    if (v8_builtins_profiling_log_file != "") {
+      args += [
+        "--turbo-profiling-log-file",
+        v8_builtins_profiling_log_file,
+      ]
+    }
 
     # This is needed to distinguish between generating code for the simulator
     # and cross-compiling. The latter may need to run code on the host with the
@@ -1728,6 +1750,8 @@ v8_source_set("v8_initializers") {
     "src/builtins/builtins-wasm-gen.h",
     "src/builtins/growable-fixed-array-gen.cc",
     "src/builtins/growable-fixed-array-gen.h",
+    "src/builtins/profile-data-reader.cc",
+    "src/builtins/profile-data-reader.h",
     "src/builtins/setup-builtins-internal.cc",
     "src/codegen/code-stub-assembler.cc",
     "src/codegen/code-stub-assembler.h",
@@ -1869,6 +1893,7 @@ v8_header_set("v8_shared_internal_headers") {
 
 v8_compiler_sources = [
   ### gcmole(all) ###
+  "src/builtins/profile-data-reader.h",
   "src/compiler/access-builder.cc",
   "src/compiler/access-builder.h",
   "src/compiler/access-info.cc",
@@ -2261,6 +2286,7 @@ v8_source_set("v8_base_without_compiler") {
     "src/builtins/builtins.h",
     "src/builtins/constants-table-builder.cc",
     "src/builtins/constants-table-builder.h",
+    "src/builtins/profile-data-reader.h",
     "src/codegen/assembler-arch.h",
     "src/codegen/assembler-inl.h",
     "src/codegen/assembler.cc",
diff --git a/src/builtins/profile-data-reader.cc b/src/builtins/profile-data-reader.cc
new file mode 100644
index 0000000000..b07c9c21f2
--- /dev/null
+++ b/src/builtins/profile-data-reader.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/builtins/profile-data-reader.h"
+
+#include <fstream>
+#include <iostream>
+#include <unordered_map>
+
+#include "src/base/lazy-instance.h"
+#include "src/flags/flags.h"
+#include "src/utils/utils.h"
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+class ProfileDataFromFileInternal : public ProfileDataFromFile {
+ public:
+  bool hash_has_value() const { return hash_has_value_; }
+
+  void set_hash(int hash) {
+    hash_ = hash;
+    hash_has_value_ = true;
+  }
+
+  void AddCountToBlock(size_t block_id, uint32_t count) {
+    if (block_counts_by_id_.size() <= block_id) {
+      // std::vector initializes new data to zero when resizing.
+      block_counts_by_id_.resize(block_id + 1);
+    }
+    block_counts_by_id_[block_id] += count;
+  }
+
+ private:
+  bool hash_has_value_ = false;
+};
+
+const std::unordered_map<std::string, ProfileDataFromFileInternal>&
+EnsureInitProfileData() {
+  static base::LeakyObject<
+      std::unordered_map<std::string, ProfileDataFromFileInternal>>
+      data;
+  static bool initialized = false;
+
+  if (initialized) return *data.get();
+  initialized = true;
+  const char* filename = FLAG_turbo_profiling_log_file;
+  if (filename == nullptr) return *data.get();
+  std::ifstream file(filename);
+  CHECK_WITH_MSG(file.good(), "Can't read log file");
+  for (std::string line; std::getline(file, line);) {
+    std::string token;
+    std::istringstream line_stream(line);
+    if (!std::getline(line_stream, token, ',')) continue;
+    if (token == ProfileDataFromFileConstants::kBlockCounterMarker) {
+      // Any line starting with kBlockCounterMarker is a block usage count.
+      // As defined by Logger::BasicBlockCounterEvent, the format is:
+      //   literal kBlockCounterMarker , builtin_name , block_id , usage_count
+      std::string builtin_name;
+      CHECK(std::getline(line_stream, builtin_name, ','));
+      CHECK(std::getline(line_stream, token, ','));
+      char* end = nullptr;
+      uint32_t id = static_cast<uint32_t>(strtoul(token.c_str(), &end, 0));
+      CHECK(errno == 0 && end != token.c_str());
+      std::getline(line_stream, token, ',');
+      CHECK(line_stream.eof());
+      uint32_t count = static_cast<uint32_t>(strtoul(token.c_str(), &end, 0));
+      CHECK(errno == 0 && end != token.c_str());
+      ProfileDataFromFileInternal& counters_and_hash =
+          (*data.get())[builtin_name];
+      // We allow concatenating data from several Isolates, so we might see the
+      // same block multiple times. Just sum them all.
+      counters_and_hash.AddCountToBlock(id, count);
+    } else if (token == ProfileDataFromFileConstants::kBuiltinHashMarker) {
+      // Any line starting with kBuiltinHashMarker is a function hash record.
+      // As defined by Logger::BuiltinHashEvent, the format is:
+      //   literal kBuiltinHashMarker , builtin_name , hash
+      std::string builtin_name;
+      CHECK(std::getline(line_stream, builtin_name, ','));
+      std::getline(line_stream, token, ',');
+      CHECK(line_stream.eof());
+      char* end = nullptr;
+      int hash = static_cast<int>(strtol(token.c_str(), &end, 0));
+      CHECK(errno == 0 && end != token.c_str());
+      ProfileDataFromFileInternal& counters_and_hash =
+          (*data.get())[builtin_name];
+      // We allow concatenating data from several Isolates, but expect them all
+      // to be running the same build. Any file with mismatched hashes for a
+      // function is considered ill-formed.
+      CHECK_IMPLIES(counters_and_hash.hash_has_value(),
+                    counters_and_hash.hash() == hash);
+      counters_and_hash.set_hash(hash);
+    }
+  }
+  for (const auto& pair : *data.get()) {
+    // Every function is required to have a hash in the log.
+    CHECK(pair.second.hash_has_value());
+  }
+  if (data.get()->size() == 0) {
+    PrintF(
+        "No basic block counters were found in log file.\n"
+        "Did you build with v8_enable_builtins_profiling=true\n"
+        "and run with --turbo-profiling-log-builtins?\n");
+  }
+
+  return *data.get();
+}
+
+}  // namespace
+
+const ProfileDataFromFile* ProfileDataFromFile::TryRead(const char* name) {
+  const auto& data = EnsureInitProfileData();
+  auto it = data.find(name);
+  return it == data.end() ? nullptr : &it->second;
+}
+
+}  // namespace internal
+}  // namespace v8
diff --git a/src/builtins/profile-data-reader.h b/src/builtins/profile-data-reader.h
new file mode 100644
index 0000000000..18490141d1
--- /dev/null
+++ b/src/builtins/profile-data-reader.h
@@ -0,0 +1,62 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_BUILTINS_PROFILE_DATA_READER_H_
+#define V8_BUILTINS_PROFILE_DATA_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace v8 {
+namespace internal {
+
+class ProfileDataFromFile {
+ public:
+  // A hash of the function's Graph before scheduling. Allows us to avoid using
+  // profiling data if the function has been changed.
+  int hash() const { return hash_; }
+
+  // Returns how many times the block with the given ID was executed during
+  // profiling.
+  uint32_t GetCounter(size_t block_id) const {
+    // The profile data is allowed to omit blocks which were never hit, so be
+    // careful to avoid out-of-bounds access.
+    return block_id < block_counts_by_id_.size() ? block_counts_by_id_[block_id]
+                                                 : 0;
+  }
+
+  // Load basic block profiling data for the builtin with the given name, if
+  // such data exists. The returned data, if any, records how many times each
+  // block was executed while profiling, indexed by block ID.
+  static const ProfileDataFromFile* TryRead(const char* name);
+
+ protected:
+  int hash_ = 0;
+
+  // How many times each block was executed, indexed by block ID. This vector
+  // may be shorter than the total number of blocks; any omitted block should
+  // be treated as a zero.
+  std::vector<uint32_t> block_counts_by_id_;
+};
+
+// The following strings can't be static members of ProfileDataFromFile until
+// C++17; see https://stackoverflow.com/q/8016780/839379 . So for now we use a
+// namespace.
+namespace ProfileDataFromFileConstants {
+
+// Any line in a v8.log beginning with this string represents a basic block
+// counter.
+static constexpr char kBlockCounterMarker[] = "block";
+
+// Any line in a v8.log beginning with this string represents the hash of the
+// function Graph for a builtin.
+static constexpr char kBuiltinHashMarker[] = "builtin_hash";
+
+}  // namespace ProfileDataFromFileConstants
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_BUILTINS_PROFILE_DATA_READER_H_
diff --git a/src/builtins/setup-builtins-internal.cc b/src/builtins/setup-builtins-internal.cc
index d094c3f2ad..7f654cf6ed 100644
--- a/src/builtins/setup-builtins-internal.cc
+++ b/src/builtins/setup-builtins-internal.cc
@@ -3,6 +3,7 @@
 // found in the LICENSE file.
#include "src/builtins/builtins.h" +#include "src/builtins/profile-data-reader.h" #include "src/codegen/assembler-inl.h" #include "src/codegen/interface-descriptors.h" #include "src/codegen/macro-assembler-inl.h" @@ -165,7 +166,8 @@ Code BuildWithCodeStubAssemblerJS(Isolate* isolate, int32_t builtin_index, PoisoningMitigationLevel::kDontPoison, builtin_index); generator(&state); Handle code = compiler::CodeAssembler::GenerateCode( - &state, BuiltinAssemblerOptions(isolate, builtin_index)); + &state, BuiltinAssemblerOptions(isolate, builtin_index), + ProfileDataFromFile::TryRead(name)); return *code; } @@ -189,7 +191,8 @@ Code BuildWithCodeStubAssemblerCS(Isolate* isolate, int32_t builtin_index, PoisoningMitigationLevel::kDontPoison, builtin_index); generator(&state); Handle code = compiler::CodeAssembler::GenerateCode( - &state, BuiltinAssemblerOptions(isolate, builtin_index)); + &state, BuiltinAssemblerOptions(isolate, builtin_index), + ProfileDataFromFile::TryRead(name)); return *code; } diff --git a/src/compiler/basic-block-instrumentor.cc b/src/compiler/basic-block-instrumentor.cc index ca6a60b782..20c916757a 100644 --- a/src/compiler/basic-block-instrumentor.cc +++ b/src/compiler/basic-block-instrumentor.cc @@ -98,7 +98,9 @@ BasicBlockProfilerData* BasicBlockInstrumentor::Instrument( for (BasicBlockVector::iterator it = blocks->begin(); block_number < n_blocks; ++it, ++block_number) { BasicBlock* block = (*it); - data->SetBlockRpoNumber(block_number, block->rpo_number()); + // Iteration is already in reverse post-order. + DCHECK_EQ(block->rpo_number(), block_number); + data->SetBlockId(block_number, block->id().ToInt()); // It is unnecessary to wire effect and control deps for load and store // since this happens after scheduling. // Construct increment operation. 
diff --git a/src/compiler/code-assembler.cc b/src/compiler/code-assembler.cc
index cda02a5c96..91ca9297ef 100644
--- a/src/compiler/code-assembler.cc
+++ b/src/compiler/code-assembler.cc
@@ -175,8 +175,9 @@ PoisoningMitigationLevel CodeAssembler::poisoning_level() const {
 }
 
 // static
-Handle<Code> CodeAssembler::GenerateCode(CodeAssemblerState* state,
-                                         const AssemblerOptions& options) {
+Handle<Code> CodeAssembler::GenerateCode(
+    CodeAssemblerState* state, const AssemblerOptions& options,
+    const ProfileDataFromFile* profile_data) {
   DCHECK(!state->code_generated_);
 
   RawMachineAssembler* rasm = state->raw_assembler_.get();
@@ -184,11 +185,12 @@ Handle<Code> CodeAssembler::GenerateCode(CodeAssemblerState* state,
   Handle<Code> code;
   Graph* graph = rasm->ExportForOptimization();
 
-  code = Pipeline::GenerateCodeForCodeStub(
-             rasm->isolate(), rasm->call_descriptor(), graph, state->jsgraph_,
-             rasm->source_positions(), state->kind_, state->name_,
-             state->builtin_index_, rasm->poisoning_level(), options)
-             .ToHandleChecked();
+  code =
+      Pipeline::GenerateCodeForCodeStub(
+          rasm->isolate(), rasm->call_descriptor(), graph, state->jsgraph_,
+          rasm->source_positions(), state->kind_, state->name_,
+          state->builtin_index_, rasm->poisoning_level(), options, profile_data)
+          .ToHandleChecked();
 
   state->code_generated_ = true;
   return code;
diff --git a/src/compiler/code-assembler.h b/src/compiler/code-assembler.h
index 7cbaf8087b..248dc08837 100644
--- a/src/compiler/code-assembler.h
+++ b/src/compiler/code-assembler.h
@@ -68,6 +68,7 @@ class JSFinalizationRegistry;
 class JSWeakMap;
 class JSWeakRef;
 class JSWeakSet;
+class ProfileDataFromFile;
 class PromiseCapability;
 class PromiseFulfillReactionJobTask;
 class PromiseReaction;
@@ -375,7 +376,8 @@ class V8_EXPORT_PRIVATE CodeAssembler {
   ~CodeAssembler();
 
   static Handle<Code> GenerateCode(CodeAssemblerState* state,
-                                   const AssemblerOptions& options);
+                                   const AssemblerOptions& options,
+                                   const ProfileDataFromFile* profile_data);
 
   bool Is64() const;
   bool Is32() const;
diff --git a/src/compiler/pipeline.cc b/src/compiler/pipeline.cc
index aab226291d..da2822f04d 100644
--- a/src/compiler/pipeline.cc
+++ b/src/compiler/pipeline.cc
@@ -11,6 +11,7 @@
 
 #include "src/base/optional.h"
 #include "src/base/platform/elapsed-timer.h"
+#include "src/builtins/profile-data-reader.h"
 #include "src/codegen/assembler-inl.h"
 #include "src/codegen/compiler.h"
 #include "src/codegen/optimized-compilation-info.h"
@@ -215,7 +216,8 @@ class PipelineData {
                JSGraph* jsgraph, Schedule* schedule,
                SourcePositionTable* source_positions,
                NodeOriginTable* node_origins, JumpOptimizationInfo* jump_opt,
-               const AssemblerOptions& assembler_options)
+               const AssemblerOptions& assembler_options,
+               const ProfileDataFromFile* profile_data)
       : isolate_(isolate),
         wasm_engine_(isolate_->wasm_engine()),
         allocator_(allocator),
@@ -236,7 +238,8 @@ class PipelineData {
                                         kRegisterAllocationZoneName),
         register_allocation_zone_(register_allocation_zone_scope_.zone()),
         jump_optimization_info_(jump_opt),
-        assembler_options_(assembler_options) {
+        assembler_options_(assembler_options),
+        profile_data_(profile_data) {
     if (jsgraph) {
       jsgraph_ = jsgraph;
       simplified_ = jsgraph->simplified();
@@ -533,6 +536,11 @@ class PipelineData {
     return roots_relative_addressing_enabled_;
   }
 
+  const ProfileDataFromFile* profile_data() const { return profile_data_; }
+  void set_profile_data(const ProfileDataFromFile* profile_data) {
+    profile_data_ = profile_data;
+  }
+
   // RuntimeCallStats that is only available during job execution but not
   // finalization.
   // TODO(delphick): Currently even during execution this can be nullptr, due to
@@ -614,6 +622,7 @@ class PipelineData {
   size_t max_pushed_argument_count_ = 0;
 
   RuntimeCallStats* runtime_call_stats_ = nullptr;
+  const ProfileDataFromFile* profile_data_ = nullptr;
 
   DISALLOW_COPY_AND_ASSIGN(PipelineData);
 };
@@ -1243,7 +1252,7 @@ class WasmHeapStubCompilationJob final : public OptimizedCompilationJob {
         graph_(graph),
         data_(&zone_stats_, &info_, isolate, wasm_engine->allocator(), graph_,
               nullptr, nullptr, source_positions,
-              zone_->New<NodeOriginTable>(graph_), nullptr, options),
+              zone_->New<NodeOriginTable>(graph_), nullptr, options, nullptr),
         pipeline_(&data_),
         wasm_engine_(wasm_engine) {}
 
@@ -1728,7 +1737,7 @@ struct EffectControlLinearizationPhase {
     //    'floating' allocation regions.)
     Schedule* schedule = Scheduler::ComputeSchedule(
         temp_zone, data->graph(), Scheduler::kTempSchedule,
-        &data->info()->tick_counter());
+        &data->info()->tick_counter(), data->profile_data());
     TraceScheduleAndVerify(data->info(), data, schedule,
                            "effect linearization schedule");
@@ -2028,7 +2037,7 @@ struct ComputeSchedulePhase {
         temp_zone, data->graph(),
         data->info()->splitting() ? Scheduler::kSplitNodes
                                   : Scheduler::kNoFlags,
-        &data->info()->tick_counter());
+        &data->info()->tick_counter(), data->profile_data());
     data->set_schedule(schedule);
   }
 };
@@ -2326,9 +2335,9 @@ struct PrintGraphPhase {
       AccountingAllocator allocator;
       Schedule* schedule = data->schedule();
       if (schedule == nullptr) {
-        schedule = Scheduler::ComputeSchedule(temp_zone, data->graph(),
-                                              Scheduler::kNoFlags,
-                                              &info->tick_counter());
+        schedule = Scheduler::ComputeSchedule(
+            temp_zone, data->graph(), Scheduler::kNoFlags,
+            &info->tick_counter(), data->profile_data());
       }
 
       AllowHandleDereference allow_deref;
@@ -2633,11 +2642,80 @@ bool PipelineImpl::OptimizeGraphForMidTier(Linkage* linkage) {
   return SelectInstructions(linkage);
 }
 
+namespace {
+
+// Compute a hash of the given graph, in a way that should provide the same
+// result in multiple runs of mksnapshot, meaning the hash cannot depend on any
+// external pointer values or uncompressed heap constants. This hash can be
+// used to reject profiling data if the builtin's current code doesn't match
+// the version that was profiled. Hash collisions are not catastrophic; in the
+// worst case, we just defer some blocks that ideally shouldn't be deferred.
+// The result value is in the valid Smi range.
+int HashGraphForPGO(Graph* graph) {
+  AccountingAllocator allocator;
+  Zone local_zone(&allocator, ZONE_NAME);
+
+  constexpr NodeId kUnassigned = static_cast<NodeId>(-1);
+
+  constexpr byte kUnvisited = 0;
+  constexpr byte kOnStack = 1;
+  constexpr byte kVisited = 2;
+
+  // Do a depth-first post-order traversal of the graph. For every node, hash:
+  //
+  //   - the node's traversal number
+  //   - the opcode
+  //   - the number of inputs
+  //   - each input node's traversal number
+  //
+  // What's a traversal number? We can't use node IDs because they're not
+  // stable build-to-build, so we assign a new number for each node as it is
+  // visited.
+
+  ZoneVector<byte> state(graph->NodeCount(), kUnvisited, &local_zone);
+  ZoneVector<NodeId> traversal_numbers(graph->NodeCount(), kUnassigned,
+                                       &local_zone);
+  ZoneStack<Node*> stack(&local_zone);
+
+  NodeId visited_count = 0;
+  size_t hash = 0;
+
+  stack.push(graph->end());
+  state[graph->end()->id()] = kOnStack;
+  traversal_numbers[graph->end()->id()] = visited_count++;
+  while (!stack.empty()) {
+    Node* n = stack.top();
+    bool pop = true;
+    for (Node* const i : n->inputs()) {
+      if (state[i->id()] == kUnvisited) {
+        state[i->id()] = kOnStack;
+        traversal_numbers[i->id()] = visited_count++;
+        stack.push(i);
+        pop = false;
+        break;
+      }
+    }
+    if (pop) {
+      state[n->id()] = kVisited;
+      stack.pop();
+      hash = base::hash_combine(hash, traversal_numbers[n->id()], n->opcode(),
+                                n->InputCount());
+      for (Node* const i : n->inputs()) {
+        DCHECK(traversal_numbers[i->id()] != kUnassigned);
+        hash = base::hash_combine(hash, traversal_numbers[i->id()]);
+      }
+    }
+  }
+  return Smi(IntToSmi(static_cast<int>(hash))).value();
+}
+
+}  // namespace
+
 MaybeHandle<Code> Pipeline::GenerateCodeForCodeStub(
     Isolate* isolate, CallDescriptor* call_descriptor, Graph* graph,
     JSGraph* jsgraph, SourcePositionTable* source_positions, Code::Kind kind,
     const char* debug_name, int32_t builtin_index,
-    PoisoningMitigationLevel poisoning_level, const AssemblerOptions& options) {
+    PoisoningMitigationLevel poisoning_level, const AssemblerOptions& options,
+    const ProfileDataFromFile* profile_data) {
   OptimizedCompilationInfo info(CStrVector(debug_name), graph->zone(), kind);
   info.set_builtin_index(builtin_index);
 
@@ -2654,7 +2732,8 @@ MaybeHandle<Code> Pipeline::GenerateCodeForCodeStub(
       !FLAG_turbo_profiling;
   PipelineData data(&zone_stats, &info, isolate, isolate->allocator(), graph,
                     jsgraph, nullptr, source_positions, &node_origins,
-                    should_optimize_jumps ? &jump_opt : nullptr, options);
+                    should_optimize_jumps ? &jump_opt : nullptr, options,
&jump_opt : nullptr, options, + profile_data); PipelineJobScope scope(&data, isolate->counters()->runtime_call_stats()); RuntimeCallTimerScope timer_scope(isolate, RuntimeCallCounterId::kOptimizeCode); @@ -2699,6 +2778,19 @@ MaybeHandle Pipeline::GenerateCodeForCodeStub( true); pipeline.Run(true); + + int graph_hash_before_scheduling = 0; + if (FLAG_turbo_profiling || profile_data != nullptr) { + graph_hash_before_scheduling = HashGraphForPGO(data.graph()); + } + + if (profile_data != nullptr && + profile_data->hash() != graph_hash_before_scheduling) { + PrintF("Rejected profile data for %s due to function change\n", debug_name); + profile_data = nullptr; + data.set_profile_data(profile_data); + } + pipeline.ComputeScheduledGraph(); DCHECK_NOT_NULL(data.schedule()); @@ -2708,13 +2800,18 @@ MaybeHandle Pipeline::GenerateCodeForCodeStub( PipelineData second_data(&zone_stats, &info, isolate, isolate->allocator(), data.graph(), data.jsgraph(), data.schedule(), data.source_positions(), data.node_origins(), - data.jump_optimization_info(), options); + data.jump_optimization_info(), options, + profile_data); PipelineJobScope second_scope(&second_data, isolate->counters()->runtime_call_stats()); second_data.set_verify_graph(FLAG_verify_csa); PipelineImpl second_pipeline(&second_data); second_pipeline.SelectInstructionsAndAssemble(call_descriptor); + if (FLAG_turbo_profiling) { + info.profiler_data()->SetHash(graph_hash_before_scheduling); + } + Handle code; if (jump_opt.is_optimizable()) { jump_opt.set_optimizing(); @@ -2882,7 +2979,7 @@ MaybeHandle Pipeline::GenerateCodeForTesting( NodeOriginTable* node_positions = info->zone()->New(graph); PipelineData data(&zone_stats, info, isolate, isolate->allocator(), graph, nullptr, schedule, nullptr, node_positions, nullptr, - options); + options, nullptr); std::unique_ptr pipeline_statistics; if (FLAG_turbo_stats || FLAG_turbo_stats_nvp) { pipeline_statistics.reset(new PipelineStatistics( diff --git a/src/compiler/pipeline.h b/src/compiler/pipeline.h index b964c131f9..da992f62a3 100644 --- a/src/compiler/pipeline.h +++ b/src/compiler/pipeline.h @@ -19,6 +19,7 @@ namespace internal { struct AssemblerOptions; class OptimizedCompilationInfo; class OptimizedCompilationJob; +class ProfileDataFromFile; class RegisterConfiguration; namespace wasm { @@ -78,8 +79,8 @@ class Pipeline : public AllStatic { Isolate* isolate, CallDescriptor* call_descriptor, Graph* graph, JSGraph* jsgraph, SourcePositionTable* source_positions, Code::Kind kind, const char* debug_name, int32_t builtin_index, - PoisoningMitigationLevel poisoning_level, - const AssemblerOptions& options); + PoisoningMitigationLevel poisoning_level, const AssemblerOptions& options, + const ProfileDataFromFile* profile_data); // --------------------------------------------------------------------------- // The following methods are for testing purposes only. Avoid production use. 
diff --git a/src/compiler/scheduler.cc b/src/compiler/scheduler.cc
index 8072eb3e67..11b7636127 100644
--- a/src/compiler/scheduler.cc
+++ b/src/compiler/scheduler.cc
@@ -7,6 +7,7 @@
 #include <iomanip>
 
 #include "src/base/iterator.h"
+#include "src/builtins/profile-data-reader.h"
 #include "src/codegen/tick-counter.h"
 #include "src/compiler/common-operator.h"
 #include "src/compiler/control-equivalence.h"
@@ -27,7 +28,8 @@ namespace compiler {
   } while (false)
 
 Scheduler::Scheduler(Zone* zone, Graph* graph, Schedule* schedule, Flags flags,
-                     size_t node_count_hint, TickCounter* tick_counter)
+                     size_t node_count_hint, TickCounter* tick_counter,
+                     const ProfileDataFromFile* profile_data)
     : zone_(zone),
       graph_(graph),
       schedule_(schedule),
@@ -36,13 +38,15 @@ Scheduler::Scheduler(Zone* zone, Graph* graph, Schedule* schedule, Flags flags,
       schedule_root_nodes_(zone),
       schedule_queue_(zone),
      node_data_(zone),
-      tick_counter_(tick_counter) {
+      tick_counter_(tick_counter),
+      profile_data_(profile_data) {
   node_data_.reserve(node_count_hint);
   node_data_.resize(graph->NodeCount(), DefaultSchedulerData());
 }
 
 Schedule* Scheduler::ComputeSchedule(Zone* zone, Graph* graph, Flags flags,
-                                     TickCounter* tick_counter) {
+                                     TickCounter* tick_counter,
+                                     const ProfileDataFromFile* profile_data) {
   Zone* schedule_zone =
       (flags & Scheduler::kTempSchedule) ? zone : graph->zone();
 
@@ -54,7 +58,7 @@ Schedule* Scheduler::ComputeSchedule(Zone* zone, Graph* graph, Flags flags,
   Schedule* schedule =
       schedule_zone->New<Schedule>(schedule_zone, node_count_hint);
   Scheduler scheduler(zone, graph, schedule, flags, node_count_hint,
-                      tick_counter);
+                      tick_counter, profile_data);
   scheduler.BuildCFG();
   scheduler.ComputeSpecialRPONumbering();
 
@@ -472,9 +476,38 @@ class CFGBuilder : public ZoneObject {
     CollectSuccessorBlocks(branch, successor_blocks,
                            arraysize(successor_blocks));
 
+    BranchHint hint_from_profile = BranchHint::kNone;
+    if (const ProfileDataFromFile* profile_data = scheduler_->profile_data()) {
+      uint32_t block_zero_count =
+          profile_data->GetCounter(successor_blocks[0]->id().ToSize());
+      uint32_t block_one_count =
+          profile_data->GetCounter(successor_blocks[1]->id().ToSize());
+      // If a branch is visited a non-trivial number of times and substantially
+      // more often than its alternative, then mark it as likely.
+      constexpr uint32_t kMinimumCount = 100000;
+      constexpr uint32_t kThresholdRatio = 4000;
+      if (block_zero_count > kMinimumCount &&
+          block_zero_count / kThresholdRatio > block_one_count) {
+        hint_from_profile = BranchHint::kTrue;
+      } else if (block_one_count > kMinimumCount &&
+                 block_one_count / kThresholdRatio > block_zero_count) {
+        hint_from_profile = BranchHint::kFalse;
+      }
+    }
+
     // Consider branch hints.
-    switch (BranchHintOf(branch->op())) {
+    switch (hint_from_profile) {
       case BranchHint::kNone:
+        switch (BranchHintOf(branch->op())) {
+          case BranchHint::kNone:
+            break;
+          case BranchHint::kTrue:
+            successor_blocks[1]->set_deferred(true);
+            break;
+          case BranchHint::kFalse:
+            successor_blocks[0]->set_deferred(true);
+            break;
+        }
         break;
       case BranchHint::kTrue:
         successor_blocks[1]->set_deferred(true);
@@ -484,6 +517,12 @@ class CFGBuilder : public ZoneObject {
         break;
     }
 
+    if (hint_from_profile != BranchHint::kNone &&
+        BranchHintOf(branch->op()) != BranchHint::kNone &&
+        hint_from_profile != BranchHintOf(branch->op())) {
+      PrintF("Warning: profiling data overrode manual branch hint.\n");
+    }
+
     if (branch == component_entry_) {
       TraceConnect(branch, component_start_, successor_blocks[0]);
      TraceConnect(branch, component_start_, successor_blocks[1]);
diff --git a/src/compiler/scheduler.h b/src/compiler/scheduler.h
index 291b53db67..d8934ec157 100644
--- a/src/compiler/scheduler.h
+++ b/src/compiler/scheduler.h
@@ -16,6 +16,7 @@
 namespace v8 {
 namespace internal {
 
+class ProfileDataFromFile;
 class TickCounter;
 
 namespace compiler {
@@ -37,7 +38,8 @@ class V8_EXPORT_PRIVATE Scheduler {
   // The complete scheduling algorithm. Creates a new schedule and places all
   // nodes from the graph into it.
   static Schedule* ComputeSchedule(Zone* temp_zone, Graph* graph, Flags flags,
-                                   TickCounter* tick_counter);
+                                   TickCounter* tick_counter,
+                                   const ProfileDataFromFile* profile_data);
 
   // Compute the RPO of blocks in an existing schedule.
   static BasicBlockVector* ComputeSpecialRPO(Zone* zone, Schedule* schedule);
@@ -45,6 +47,8 @@ class V8_EXPORT_PRIVATE Scheduler {
   // Computes the dominator tree on an existing schedule that has RPO computed.
   static void GenerateDominatorTree(Schedule* schedule);
 
+  const ProfileDataFromFile* profile_data() const { return profile_data_; }
+
  private:
   // Placement of a node changes during scheduling. The placement state
   // transitions over time while the scheduler is choosing a position:
@@ -85,9 +89,11 @@ class V8_EXPORT_PRIVATE Scheduler {
   SpecialRPONumberer* special_rpo_;  // Special RPO numbering of blocks.
   ControlEquivalence* equivalence_;  // Control dependence equivalence.
   TickCounter* const tick_counter_;
+  const ProfileDataFromFile* profile_data_;
 
   Scheduler(Zone* zone, Graph* graph, Schedule* schedule, Flags flags,
-            size_t node_count_hint_, TickCounter* tick_counter);
+            size_t node_count_hint_, TickCounter* tick_counter,
+            const ProfileDataFromFile* profile_data);
 
   inline SchedulerData DefaultSchedulerData();
   inline SchedulerData* GetData(Node* node);
diff --git a/src/diagnostics/basic-block-profiler.cc b/src/diagnostics/basic-block-profiler.cc
index cd3bbac50a..ddce03880e 100644
--- a/src/diagnostics/basic-block-profiler.cc
+++ b/src/diagnostics/basic-block-profiler.cc
@@ -18,7 +18,7 @@ namespace internal {
 DEFINE_LAZY_LEAKY_OBJECT_GETTER(BasicBlockProfiler, BasicBlockProfiler::Get)
 
 BasicBlockProfilerData::BasicBlockProfilerData(size_t n_blocks)
-    : block_rpo_numbers_(n_blocks), counts_(n_blocks, 0) {}
+    : block_ids_(n_blocks), counts_(n_blocks, 0) {}
 
 void BasicBlockProfilerData::SetCode(const std::ostringstream& os) {
   code_ = os.str();
@@ -32,12 +32,13 @@ void BasicBlockProfilerData::SetSchedule(const std::ostringstream& os) {
   schedule_ = os.str();
 }
 
-void BasicBlockProfilerData::SetBlockRpoNumber(size_t offset,
-                                               int32_t block_rpo) {
+void BasicBlockProfilerData::SetBlockId(size_t offset, int32_t id) {
   DCHECK(offset < n_blocks());
-  block_rpo_numbers_[offset] = block_rpo;
+  block_ids_[offset] = id;
 }
 
+void BasicBlockProfilerData::SetHash(int hash) { hash_ = hash; }
+
 void BasicBlockProfilerData::ResetCounts() {
   for (size_t i = 0; i < n_blocks(); ++i) {
     counts_[i] = 0;
@@ -58,7 +59,7 @@ Handle<String> CopyStringToJSHeap(const std::string& source, Isolate* isolate) {
                                              AllocationType::kOld);
 }
 
-// Size of entries in both block_rpo_numbers and counts.
+// Size of entries in both block_ids and counts.
 constexpr int kBasicBlockSlotSize = kInt32Size;
 }  // namespace
 
@@ -71,11 +72,12 @@ BasicBlockProfilerData::BasicBlockProfilerData(
   for (int i = 0; i < counts->length() / kBasicBlockSlotSize; ++i) {
     counts_.push_back(counts->get_uint32(i));
   }
-  Handle<ByteArray> rpo_numbers(js_heap_data->block_rpo_numbers(), isolate);
-  for (int i = 0; i < rpo_numbers->length() / kBasicBlockSlotSize; ++i) {
-    block_rpo_numbers_.push_back(rpo_numbers->get_int(i));
+  Handle<ByteArray> block_ids(js_heap_data->block_ids(), isolate);
+  for (int i = 0; i < block_ids->length() / kBasicBlockSlotSize; ++i) {
+    block_ids_.push_back(block_ids->get_int(i));
   }
-  CHECK_EQ(block_rpo_numbers_.size(), counts_.size());
+  CHECK_EQ(block_ids_.size(), counts_.size());
+  hash_ = js_heap_data->hash();
 }
 
 BasicBlockProfilerData::BasicBlockProfilerData(
@@ -87,11 +89,11 @@ BasicBlockProfilerData::BasicBlockProfilerData(
   for (int i = 0; i < counts.length() / kBasicBlockSlotSize; ++i) {
     counts_.push_back(counts.get_uint32(i));
   }
-  ByteArray rpo_numbers(js_heap_data.block_rpo_numbers());
-  for (int i = 0; i < rpo_numbers.length() / kBasicBlockSlotSize; ++i) {
-    block_rpo_numbers_.push_back(rpo_numbers.get_int(i));
+  ByteArray block_ids(js_heap_data.block_ids());
+  for (int i = 0; i < block_ids.length() / kBasicBlockSlotSize; ++i) {
+    block_ids_.push_back(block_ids.get_int(i));
   }
-  CHECK_EQ(block_rpo_numbers_.size(), counts_.size());
+  CHECK_EQ(block_ids_.size(), counts_.size());
 }
 
 Handle<OnHeapBasicBlockProfilerData> BasicBlockProfilerData::CopyToJSHeap(
@@ -100,10 +102,10 @@ Handle<OnHeapBasicBlockProfilerData> BasicBlockProfilerData::CopyToJSHeap(
   CHECK(array_size_in_bytes >= 0 &&
         static_cast<size_t>(array_size_in_bytes) / kBasicBlockSlotSize ==
             n_blocks());  // Overflow
-  Handle<ByteArray> block_rpo_numbers = isolate->factory()->NewByteArray(
+  Handle<ByteArray> block_ids = isolate->factory()->NewByteArray(
       array_size_in_bytes, AllocationType::kOld);
   for (int i = 0; i < static_cast<int>(n_blocks()); ++i) {
-    block_rpo_numbers->set_int(i, block_rpo_numbers_[i]);
+    block_ids->set_int(i, block_ids_[i]);
   }
   Handle<ByteArray> counts = isolate->factory()->NewByteArray(
       array_size_in_bytes, AllocationType::kOld);
@@ -115,7 +117,7 @@ Handle<OnHeapBasicBlockProfilerData> BasicBlockProfilerData::CopyToJSHeap(
   Handle<String> code = CopyStringToJSHeap(code_, isolate);
 
   return isolate->factory()->NewOnHeapBasicBlockProfilerData(
-      block_rpo_numbers, counts, name, schedule, code, AllocationType::kOld);
+      block_ids, counts, name, schedule, code, hash_, AllocationType::kOld);
 }
 
 void BasicBlockProfiler::ResetCounts(Isolate* isolate) {
@@ -147,11 +149,18 @@ void BasicBlockProfiler::Print(std::ostream& os, Isolate* isolate) {
   HandleScope scope(isolate);
   Handle<ArrayList> list(isolate->heap()->basic_block_profiling_data(),
                          isolate);
+  std::unordered_set<std::string> builtin_names;
   for (int i = 0; i < list->Length(); ++i) {
     BasicBlockProfilerData data(
         handle(OnHeapBasicBlockProfilerData::cast(list->Get(i)), isolate),
         isolate);
+    // Print data for builtins to both stdout and the log file, if logging is
+    // enabled.
     os << data;
+    data.Log(isolate);
+    // Ensure that all builtin names are unique; otherwise profile-guided
+    // optimization might get confused.
+    CHECK(builtin_names.insert(data.function_name_).second);
   }
   os << "---- End Profiling Data ----" << std::endl;
 }
@@ -171,6 +180,20 @@ std::vector<bool> BasicBlockProfiler::GetCoverageBitmap(Isolate* isolate) {
   return out;
 }
 
+void BasicBlockProfilerData::Log(Isolate* isolate) {
+  bool any_nonzero_counter = false;
+  for (size_t i = 0; i < n_blocks(); ++i) {
+    if (counts_[i] > 0) {
+      any_nonzero_counter = true;
+      isolate->logger()->BasicBlockCounterEvent(function_name_.c_str(),
+                                                block_ids_[i], counts_[i]);
+    }
+  }
+  if (any_nonzero_counter) {
+    isolate->logger()->BuiltinHashEvent(function_name_.c_str(), hash_);
+  }
+}
+
 std::ostream& operator<<(std::ostream& os, const BasicBlockProfilerData& d) {
   int block_count_sum = std::accumulate(d.counts_.begin(), d.counts_.end(), 0);
   if (block_count_sum == 0) return os;
@@ -184,17 +207,17 @@ std::ostream& operator<<(std::ostream& os, const BasicBlockProfilerData& d) {
     os << d.schedule_.c_str() << std::endl;
   }
   os << "block counts for " << name << ":" << std::endl;
-  std::vector<std::pair<int32_t, uint32_t>> pairs;
+  std::vector<std::pair<size_t, uint32_t>> pairs;
   pairs.reserve(d.n_blocks());
   for (size_t i = 0; i < d.n_blocks(); ++i) {
-    pairs.push_back(std::make_pair(d.block_rpo_numbers_[i], d.counts_[i]));
+    pairs.push_back(std::make_pair(i, d.counts_[i]));
   }
-  std::sort(pairs.begin(), pairs.end(),
-            [=](std::pair<int32_t, uint32_t> left,
-                std::pair<int32_t, uint32_t> right) {
-              if (right.second == left.second) return left.first < right.first;
-              return right.second < left.second;
-            });
+  std::sort(
+      pairs.begin(), pairs.end(),
+      [=](std::pair<size_t, uint32_t> left, std::pair<size_t, uint32_t> right) {
+        if (right.second == left.second) return left.first < right.first;
+        return right.second < left.second;
+      });
   for (auto it : pairs) {
     if (it.second == 0) break;
     os << "block B" << it.first << " : " << it.second << std::endl;
diff --git a/src/diagnostics/basic-block-profiler.h b/src/diagnostics/basic-block-profiler.h
index 49c66e7fa2..b0bf3543d9 100644
--- a/src/diagnostics/basic-block-profiler.h
+++ b/src/diagnostics/basic-block-profiler.h
@@ -28,21 +28,24 @@ class BasicBlockProfilerData {
       OnHeapBasicBlockProfilerData js_heap_data);
 
   size_t n_blocks() const {
-    DCHECK_EQ(block_rpo_numbers_.size(), counts_.size());
-    return block_rpo_numbers_.size();
+    DCHECK_EQ(block_ids_.size(), counts_.size());
+    return block_ids_.size();
   }
   const uint32_t* counts() const { return &counts_[0]; }
 
   void SetCode(const std::ostringstream& os);
   void SetFunctionName(std::unique_ptr<char[]> name);
   void SetSchedule(const std::ostringstream& os);
-  void SetBlockRpoNumber(size_t offset, int32_t block_rpo);
+  void SetBlockId(size_t offset, int32_t id);
+  void SetHash(int hash);
 
   // Copy the data from this object into an equivalent object stored on the JS
   // heap, so that it can survive snapshotting and relocation. This must
   // happen on the main thread during finalization of the compilation.
   Handle<OnHeapBasicBlockProfilerData> CopyToJSHeap(Isolate* isolate);
 
+  void Log(Isolate* isolate);
+
  private:
   friend class BasicBlockProfiler;
   friend std::ostream& operator<<(std::ostream& os,
                                   const BasicBlockProfilerData& s);
 
   V8_EXPORT_PRIVATE void ResetCounts();
 
-  std::vector<int32_t> block_rpo_numbers_;
+  // These vectors are indexed by reverse post-order block number.
+  std::vector<int32_t> block_ids_;
   std::vector<uint32_t> counts_;
   std::string function_name_;
   std::string schedule_;
   std::string code_;
+  int hash_;
 
   DISALLOW_COPY_AND_ASSIGN(BasicBlockProfilerData);
 };
diff --git a/src/flags/flag-definitions.h b/src/flags/flag-definitions.h
index 3f6bfdcaeb..ed8415a10c 100644
--- a/src/flags/flag-definitions.h
+++ b/src/flags/flag-definitions.h
@@ -640,6 +640,9 @@ DEFINE_BOOL(turbo_profiling_verbose, false,
             "enable basic block profiling in TurboFan, and include each "
             "function's schedule and disassembly in the output")
 DEFINE_IMPLICATION(turbo_profiling_verbose, turbo_profiling)
+DEFINE_BOOL(turbo_profiling_log_builtins, false,
+            "emit data about basic block usage in builtins to v8.log (requires "
+            "that V8 was built with v8_enable_builtins_profiling=true)")
 DEFINE_BOOL(turbo_verify_allocation, DEBUG_BOOL,
             "verify register allocation in TurboFan")
 DEFINE_BOOL(turbo_move_optimization, true, "optimize gap moves in TurboFan")
@@ -1449,6 +1452,9 @@ DEFINE_BOOL(target_is_simulator, false,
             "Instruct mksnapshot that the target is meant to run in the "
             "simulator and it can generate simulator-specific instructions. "
             "(mksnapshot only)")
+DEFINE_STRING(turbo_profiling_log_file, nullptr,
+              "Path of the input file containing basic block counters for "
+              "builtins. (mksnapshot only)")
 
 //
 // Minor mark compact collector flags.
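Taken together, the two flags above split the workflow cleanly: --turbo-profiling-log-builtins is a runtime flag used during the profiling run, while --turbo-profiling-log-file is passed to mksnapshot by the build system. The corresponding args.gn settings for the workflow in the BUILD.gn comment might look like this (the log path is a placeholder):

# args.gn for the instrumented build (step 1):
v8_enable_builtins_profiling = true

# args.gn for the optimized rebuild (step 4); run_mksnapshot forwards the
# file to mksnapshot via --turbo-profiling-log-file:
v8_builtins_profiling_log_file = "/path/to/v8.log"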
diff --git a/src/interpreter/interpreter-generator.cc b/src/interpreter/interpreter-generator.cc
index ce90905216..37ed677a82 100644
--- a/src/interpreter/interpreter-generator.cc
+++ b/src/interpreter/interpreter-generator.cc
@@ -9,6 +9,7 @@
 
 #include "src/builtins/builtins-constructor-gen.h"
 #include "src/builtins/builtins-iterator-gen.h"
+#include "src/builtins/profile-data-reader.h"
 #include "src/codegen/code-factory.h"
 #include "src/debug/debug.h"
 #include "src/ic/accessor-assembler.h"
@@ -3127,7 +3128,8 @@ Handle<Code> GenerateBytecodeHandler(Isolate* isolate, const char* debug_name,
 #undef CALL_GENERATOR
   }
 
-  Handle<Code> code = compiler::CodeAssembler::GenerateCode(&state, options);
+  Handle<Code> code = compiler::CodeAssembler::GenerateCode(
+      &state, options, ProfileDataFromFile::TryRead(debug_name));
 
 #ifdef ENABLE_DISASSEMBLER
   if (FLAG_trace_ignition_codegen) {
diff --git a/src/logging/log-utils.h b/src/logging/log-utils.h
index bae665cd45..14395de84e 100644
--- a/src/logging/log-utils.h
+++ b/src/logging/log-utils.h
@@ -37,7 +37,8 @@ class Log {
            FLAG_log_suspect || FLAG_ll_prof || FLAG_perf_basic_prof ||
            FLAG_perf_prof || FLAG_log_source_code || FLAG_gdbjit ||
            FLAG_log_internal_timer_events || FLAG_prof_cpp || FLAG_trace_ic ||
-           FLAG_log_function_events || FLAG_trace_zone_stats;
+           FLAG_log_function_events || FLAG_trace_zone_stats ||
+           FLAG_turbo_profiling_log_builtins;
   }
 
   // Frees all resources acquired in Initialize and Open... functions.
diff --git a/src/logging/log.cc b/src/logging/log.cc
index 22d4e32614..7d58ce0c89 100644
--- a/src/logging/log.cc
+++ b/src/logging/log.cc
@@ -10,6 +10,7 @@
 
 #include "src/api/api-inl.h"
 #include "src/base/platform/platform.h"
+#include "src/builtins/profile-data-reader.h"
 #include "src/codegen/bailout-reason.h"
 #include "src/codegen/macro-assembler.h"
 #include "src/codegen/source-position-table.h"
@@ -34,11 +35,10 @@
 #include "src/strings/unicode-inl.h"
 #include "src/tracing/tracing-category-observer.h"
 #include "src/utils/memcopy.h"
+#include "src/utils/version.h"
 #include "src/wasm/wasm-code-manager.h"
 #include "src/wasm/wasm-objects-inl.h"
 
-#include "src/utils/version.h"
-
 namespace v8 {
 namespace internal {
 
@@ -1072,6 +1072,23 @@ void Logger::TimerEvent(Logger::StartEnd se, const char* name) {
   msg.WriteToLogFile();
 }
 
+void Logger::BasicBlockCounterEvent(const char* name, int block_id,
+                                    uint32_t count) {
+  if (!log_->IsEnabled() || !FLAG_turbo_profiling_log_builtins) return;
+  Log::MessageBuilder msg(log_.get());
+  msg << ProfileDataFromFileConstants::kBlockCounterMarker << kNext << name
+      << kNext << block_id << kNext << count;
+  msg.WriteToLogFile();
+}
+
+void Logger::BuiltinHashEvent(const char* name, int hash) {
+  if (!log_->IsEnabled() || !FLAG_turbo_profiling_log_builtins) return;
+  Log::MessageBuilder msg(log_.get());
+  msg << ProfileDataFromFileConstants::kBuiltinHashMarker << kNext << name
+      << kNext << hash;
+  msg.WriteToLogFile();
+}
+
 bool Logger::is_logging() {
   // Disable logging while the CPU profiler is running.
   if (isolate_->is_profiling()) return false;
diff --git a/src/logging/log.h b/src/logging/log.h
index b1825ad69d..a6472dff11 100644
--- a/src/logging/log.h
+++ b/src/logging/log.h
@@ -246,6 +246,10 @@ class Logger : public CodeEventListener {
 
   V8_EXPORT_PRIVATE void TimerEvent(StartEnd se, const char* name);
 
+  void BasicBlockCounterEvent(const char* name, int block_id, uint32_t count);
+
+  void BuiltinHashEvent(const char* name, int hash);
+
   static void EnterExternal(Isolate* isolate);
   static void LeaveExternal(Isolate* isolate);
 
diff --git a/src/objects/shared-function-info.tq b/src/objects/shared-function-info.tq
index d467ff7284..1533d552cf 100644
--- a/src/objects/shared-function-info.tq
+++ b/src/objects/shared-function-info.tq
@@ -81,9 +81,10 @@ extern class UncompiledDataWithPreparseData extends UncompiledData {
 
 @export
 class OnHeapBasicBlockProfilerData extends HeapObject {
-  block_rpo_numbers: ByteArray;  // Stored as 4-byte ints
-  counts: ByteArray;             // Stored as 4-byte ints
+  block_ids: ByteArray;  // Stored as 4-byte ints
+  counts: ByteArray;     // Stored as 4-byte ints
   name: String;
   schedule: String;
   code: String;
+  hash: Smi;
 }
diff --git a/test/cctest/compiler/code-assembler-tester.h b/test/cctest/compiler/code-assembler-tester.h
index 68144987bd..aa80c1b34e 100644
--- a/test/cctest/compiler/code-assembler-tester.h
+++ b/test/cctest/compiler/code-assembler-tester.h
@@ -63,7 +63,7 @@ class CodeAssemblerTester {
     if (state_.InsideBlock()) {
       CodeAssembler(&state_).Unreachable();
     }
-    return CodeAssembler::GenerateCode(&state_, options);
+    return CodeAssembler::GenerateCode(&state_, options, nullptr);
   }
 
   Handle<Code> GenerateCodeCloseAndEscape() {
diff --git a/test/unittests/compiler/scheduler-unittest.cc b/test/unittests/compiler/scheduler-unittest.cc
index ee1c7997b3..2532fc3a2b 100644
--- a/test/unittests/compiler/scheduler-unittest.cc
+++ b/test/unittests/compiler/scheduler-unittest.cc
@@ -40,7 +40,7 @@ class SchedulerTest : public TestWithIsolateAndZone {
     }
 
     Schedule* schedule = Scheduler::ComputeSchedule(
-        zone(), graph(), Scheduler::kSplitNodes, tick_counter());
+        zone(), graph(), Scheduler::kSplitNodes, tick_counter(), nullptr);
 
     if (FLAG_trace_turbo_scheduler) {
       StdoutStream{} << *schedule << std::endl;
@@ -92,7 +92,7 @@ TEST_F(SchedulerTest, BuildScheduleEmpty) {
   graph()->SetStart(graph()->NewNode(common()->Start(0)));
   graph()->SetEnd(graph()->NewNode(common()->End(1), graph()->start()));
   USE(Scheduler::ComputeSchedule(zone(), graph(), Scheduler::kNoFlags,
-                                 tick_counter()));
+                                 tick_counter(), nullptr));
 }
 
 
@@ -107,7 +107,7 @@ TEST_F(SchedulerTest, BuildScheduleOneParameter) {
   graph()->SetEnd(graph()->NewNode(common()->End(1), ret));
 
   USE(Scheduler::ComputeSchedule(zone(), graph(), Scheduler::kNoFlags,
-                                 tick_counter()));
+                                 tick_counter(), nullptr));
 }
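Finally, to make the thresholds in the CFGBuilder change above concrete, here is a standalone restatement of the profile-based hint rule (not V8 code; the enum and function names are invented for this sketch):

#include <cstdint>

enum class Hint { kNone, kTrue, kFalse };

// Mirrors the rule in CFGBuilder: a successor is marked likely only if it was
// reached more than kMinimumCount times and its count, divided by
// kThresholdRatio, still exceeds the other successor's count.
Hint HintFromCounts(uint32_t block_zero_count, uint32_t block_one_count) {
  constexpr uint32_t kMinimumCount = 100000;
  constexpr uint32_t kThresholdRatio = 4000;
  if (block_zero_count > kMinimumCount &&
      block_zero_count / kThresholdRatio > block_one_count) {
    return Hint::kTrue;  // successor_blocks[1] would be deferred.
  }
  if (block_one_count > kMinimumCount &&
      block_one_count / kThresholdRatio > block_zero_count) {
    return Hint::kFalse;  // successor_blocks[0] would be deferred.
  }
  return Hint::kNone;
}

For example, HintFromCounts(800000, 150) yields Hint::kTrue: 800000 exceeds kMinimumCount, and 800000 / 4000 = 200 is still greater than 150, so the rarely-taken successor would be deferred.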