[fastcall] Add CPU profiler support for fast calls

This CL introduces a new fast_api_call_target field on the isolate,
which is set by Turbofan before making the fast call. It then uses
the field when creating a stack sample and stores it in the existing
external_callback_entry used for regular API callbacks. The CL also
adds a cctest with a simple usage scenario and introduces a minor
refactoring in test-api.cc.

Design doc:
https://docs.google.com/document/d/1r32qlPzGz0P7nieisJ5h2qfSnWOs40Cigt0LXPipejE/edit

Bug: chromium:1052746
Change-Id: I2dab1bc395ccab0c14088f7c354fb52b08df8d32
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2488683
Commit-Queue: Maya Lekova <mslekova@chromium.org>
Reviewed-by: Georg Neis <neis@chromium.org>
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71254}
This commit is contained in:
Maya Lekova 2020-11-18 12:19:56 +01:00 committed by Commit Bot
parent 18cbf05e8e
commit 7a62cceb72
13 changed files with 273 additions and 43 deletions

View File

@ -207,8 +207,10 @@ class Internals {
kNumIsolateDataSlots * kApiSystemPointerSize;
static const int kIsolateFastCCallCallerPcOffset =
kIsolateFastCCallCallerFpOffset + kApiSystemPointerSize;
static const int kIsolateStackGuardOffset =
static const int kIsolateFastApiCallTargetOffset =
kIsolateFastCCallCallerPcOffset + kApiSystemPointerSize;
static const int kIsolateStackGuardOffset =
kIsolateFastApiCallTargetOffset + kApiSystemPointerSize;
static const int kIsolateRootsOffset =
kIsolateStackGuardOffset + 7 * kApiSystemPointerSize;

View File

@ -826,6 +826,12 @@ ExternalReference ExternalReference::fast_c_call_caller_pc_address(
isolate->isolate_data()->fast_c_call_caller_pc_address());
}
// Builds an ExternalReference from the address that the isolate's
// IsolateData exposes via fast_api_call_target_address().
ExternalReference ExternalReference::fast_api_call_target_address(
    Isolate* isolate) {
  auto* const target_slot =
      isolate->isolate_data()->fast_api_call_target_address();
  return ExternalReference(target_slot);
}
ExternalReference ExternalReference::stack_is_iterable_address(
Isolate* isolate) {
return ExternalReference(

View File

@ -71,6 +71,7 @@ class StatsCounter;
"IsolateData::fast_c_call_caller_fp_address") \
V(fast_c_call_caller_pc_address, \
"IsolateData::fast_c_call_caller_pc_address") \
V(fast_api_call_target_address, "IsolateData::fast_api_call_target_address") \
V(stack_is_iterable_address, "IsolateData::stack_is_iterable_address") \
V(address_of_regexp_stack_limit_address, \
"RegExpStack::limit_address_address()") \

View File

@ -5085,9 +5085,16 @@ Node* EffectControlLinearizer::LowerFastApiCall(Node* node) {
call_descriptor->SetCFunctionInfo(c_signature);
// CPU profiler support
Node* target_address = __ ExternalConstant(
ExternalReference::fast_api_call_target_address(isolate()));
__ Store(StoreRepresentation(MachineType::PointerRepresentation(),
kNoWriteBarrier),
target_address, 0, n.target());
Node** const inputs = graph()->zone()->NewArray<Node*>(
c_arg_count + FastApiCallNode::kFastCallExtraInputCount);
inputs[0] = NodeProperties::GetValueInput(node, 0); // the target
inputs[0] = n.target();
for (int i = FastApiCallNode::kFastTargetInputCount;
i < c_arg_count + FastApiCallNode::kFastTargetInputCount; ++i) {
if (c_signature->ArgumentInfo(i - 1).GetType() ==
@ -5099,12 +5106,17 @@ Node* EffectControlLinearizer::LowerFastApiCall(Node* node) {
}
}
inputs[c_arg_count + 1] = fast_api_call_stack_slot_;
inputs[c_arg_count + 2] = __ effect();
inputs[c_arg_count + 3] = __ control();
__ Call(call_descriptor,
c_arg_count + FastApiCallNode::kFastCallExtraInputCount, inputs);
__ Store(StoreRepresentation(MachineType::PointerRepresentation(),
kNoWriteBarrier),
target_address, 0, __ IntPtrConstant(0));
// Generate the load from `fast_api_call_stack_slot_`.
Node* load = __ Load(MachineType::Int32(), fast_api_call_stack_slot_, 0);

View File

@ -301,13 +301,13 @@ SafeStackFrameIterator::SafeStackFrameIterator(Isolate* isolate, Address pc,
frame_ = nullptr;
return;
}
// 'Fast C calls' are a special type of C call where we call directly from JS
// to C without an exit frame inbetween. The CEntryStub is responsible for
// setting Isolate::c_entry_fp, meaning that it won't be set for fast C calls.
// To keep the stack iterable, we store the FP and PC of the caller of the
// fast C call on the isolate. This is guaranteed to be the topmost JS frame,
// because fast C calls cannot call back into JS. We start iterating the stack
// from this topmost JS frame.
// 'Fast C calls' are a special type of C call where we call directly from
// JS to C without an exit frame in between. The CEntryStub is responsible
// for setting Isolate::c_entry_fp, meaning that it won't be set for fast C
// calls. To keep the stack iterable, we store the FP and PC of the caller
// of the fast C call on the isolate. This is guaranteed to be the topmost
// JS frame, because fast C calls cannot call back into JS. We start
// iterating the stack from this topmost JS frame.
if (fast_c_fp) {
DCHECK_NE(kNullAddress, isolate->isolate_data()->fast_c_call_caller_pc());
type = StackFrame::Type::OPTIMIZED;
@ -402,6 +402,7 @@ void SafeStackFrameIterator::AdvanceOneFrame() {
DCHECK(!done());
StackFrame* last_frame = frame_;
Address last_sp = last_frame->sp(), last_fp = last_frame->fp();
// Before advancing to the next stack frame, perform pointer validity tests.
if (!IsValidFrame(last_frame) || !IsValidCaller(last_frame)) {
frame_ = nullptr;

View File

@ -75,6 +75,10 @@ class IsolateData final {
return kFastCCallCallerPCOffset - kIsolateRootBias;
}
// Offset of the fast API call target field (kFastApiCallTargetOffset) with
// kIsolateRootBias subtracted.
static constexpr int fast_api_call_target_offset() {
return kFastApiCallTargetOffset - kIsolateRootBias;
}
// Root-register-relative offset of the given builtin table entry.
// TODO(ishell): remove in favour of typified id version.
static int builtin_slot_offset(int builtin_index) {
@ -90,10 +94,14 @@ class IsolateData final {
// The FP and PC that are saved right before TurboAssembler::CallCFunction.
Address* fast_c_call_caller_fp_address() { return &fast_c_call_caller_fp_; }
Address* fast_c_call_caller_pc_address() { return &fast_c_call_caller_pc_; }
// The address of the fast API callback right before it's executed from
// generated code.
Address* fast_api_call_target_address() { return &fast_api_call_target_; }
StackGuard* stack_guard() { return &stack_guard_; }
uint8_t* stack_is_iterable_address() { return &stack_is_iterable_; }
Address fast_c_call_caller_fp() { return fast_c_call_caller_fp_; }
Address fast_c_call_caller_pc() { return fast_c_call_caller_pc_; }
Address fast_api_call_target() { return fast_api_call_target_; }
uint8_t stack_is_iterable() { return stack_is_iterable_; }
// Returns true if this address points to data stored in this instance.
@ -130,6 +138,7 @@ class IsolateData final {
V(kEmbedderDataOffset, Internals::kNumIsolateDataSlots* kSystemPointerSize) \
V(kFastCCallCallerFPOffset, kSystemPointerSize) \
V(kFastCCallCallerPCOffset, kSystemPointerSize) \
V(kFastApiCallTargetOffset, kSystemPointerSize) \
V(kStackGuardOffset, StackGuard::kSizeInBytes) \
V(kRootsTableOffset, RootsTable::kEntriesCount* kSystemPointerSize) \
V(kExternalReferenceTableOffset, ExternalReferenceTable::kSizeInBytes) \
@ -166,9 +175,10 @@ class IsolateData final {
// instruction in compiled code.
Address fast_c_call_caller_fp_ = kNullAddress;
Address fast_c_call_caller_pc_ = kNullAddress;
Address fast_api_call_target_ = kNullAddress;
// Fields related to the system and JS stack. In particular, this contains the
// stack limit used by stack checks in generated code.
// Fields related to the system and JS stack. In particular, this contains
// the stack limit used by stack checks in generated code.
StackGuard stack_guard_;
RootsTable roots_;
@ -232,6 +242,8 @@ void IsolateData::AssertPredictableLayout() {
kFastCCallCallerFPOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_pc_) ==
kFastCCallCallerPCOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_api_call_target_) ==
kFastApiCallTargetOffset);
STATIC_ASSERT(offsetof(IsolateData, stack_guard_) == kStackGuardOffset);
#ifdef V8_HEAP_SANDBOX
STATIC_ASSERT(offsetof(IsolateData, external_pointer_table_) ==

View File

@ -2299,8 +2299,14 @@ void ExistingCodeLogger::LogExistingFunction(
#if USES_FUNCTION_DESCRIPTORS
entry_point = *FUNCTION_ENTRYPOINT_ADDRESS(entry_point);
#endif
CALL_CODE_EVENT_HANDLER(
CallbackEvent(handle(shared->DebugName(), isolate_), entry_point))
Handle<String> fun_name(shared->DebugName(), isolate_);
CALL_CODE_EVENT_HANDLER(CallbackEvent(fun_name, entry_point))
// Fast API function.
Address c_function = v8::ToCData<Address>(fun_data.GetCFunction());
if (c_function != kNullAddress) {
CALL_CODE_EVENT_HANDLER(CallbackEvent(fun_name, c_function))
}
}
}
}

View File

@ -156,14 +156,17 @@ DISABLE_ASAN void TickSample::Init(Isolate* v8_isolate,
SampleInfo info;
RegisterState regs = reg_state;
if (!GetStackSample(v8_isolate, &regs, record_c_entry_frame, stack,
kMaxFramesCount, &info, use_simulator_reg_state)) {
kMaxFramesCount, &info, &state,
use_simulator_reg_state)) {
// It is executing JS but failed to collect a stack trace.
// Mark the sample as spoiled.
pc = nullptr;
return;
}
state = info.vm_state;
if (state != StateTag::EXTERNAL) {
state = info.vm_state;
}
pc = regs.pc;
frames_count = static_cast<unsigned>(info.frames_count);
has_external_callback = info.external_callback_entry != nullptr;
@ -193,6 +196,7 @@ bool TickSample::GetStackSample(Isolate* v8_isolate, RegisterState* regs,
RecordCEntryFrame record_c_entry_frame,
void** frames, size_t frames_limit,
v8::SampleInfo* sample_info,
StateTag* out_state,
bool use_simulator_reg_state) {
i::Isolate* isolate = reinterpret_cast<i::Isolate*>(v8_isolate);
sample_info->frames_count = 0;
@ -243,6 +247,23 @@ bool TickSample::GetStackSample(Isolate* v8_isolate, RegisterState* regs,
? nullptr
: reinterpret_cast<void*>(*external_callback_entry_ptr);
}
// 'Fast API calls' are similar to fast C calls (see frames.cc) in that
// they don't build an exit frame when entering C from JS. They have the
// added speciality of having separate "fast" and "default" callbacks, the
// latter being the regular API callback called before the JS function is
// optimized. When TurboFan optimizes the JS caller, the fast callback
// gets executed instead of the default one, therefore we need to store
// its address in the sample.
IsolateData* isolate_data = isolate->isolate_data();
Address fast_c_fp = isolate_data->fast_c_call_caller_fp();
if (fast_c_fp != kNullAddress &&
isolate_data->fast_api_call_target() != kNullAddress) {
sample_info->external_callback_entry =
reinterpret_cast<void*>(isolate_data->fast_api_call_target());
if (out_state) {
*out_state = StateTag::EXTERNAL;
}
}
i::SafeStackFrameIterator it(isolate, reinterpret_cast<i::Address>(regs->pc),
reinterpret_cast<i::Address>(regs->fp),

View File

@ -56,6 +56,13 @@ struct V8_EXPORT TickSample {
* \param sample_info The sample info is filled up by the function
* provides number of actual captured stack frames and
* the current VM state.
* \param out_state Output parameter. If a non-null pointer is provided
* and execution is currently in a fast API call,
* StateTag::EXTERNAL is written to it. The caller can then
* use this as a marker to disregard the actual
* VM state recorded in |sample_info|. In the case of fast
* API calls, the VM state must be EXTERNAL, as the callback
* is always an external C++ function.
* \param use_simulator_reg_state When set to true and V8 is running under a
* simulator, the method will use the simulator
* register state rather than the one provided
@ -69,6 +76,7 @@ struct V8_EXPORT TickSample {
RecordCEntryFrame record_c_entry_frame,
void** frames, size_t frames_limit,
v8::SampleInfo* sample_info,
StateTag* out_state = nullptr,
bool use_simulator_reg_state = true);
void print() const;

View File

@ -406,3 +406,11 @@ int main(int argc, char* argv[]) {
RegisterThreadedTest* RegisterThreadedTest::first_ = nullptr;
int RegisterThreadedTest::count_ = 0;
// Reads the instance type behind |object| and reports whether it is one of
// kJSObjectType, kJSApiObjectType or kJSSpecialApiObjectType.
bool IsValidUnwrapObject(v8::Object* object) {
  const i::Address raw_address = *reinterpret_cast<i::Address*>(object);
  const auto type = i::Internals::GetInstanceType(raw_address);
  if (type == i::Internals::kJSObjectType) return true;
  if (type == i::Internals::kJSApiObjectType) return true;
  return type == i::Internals::kJSSpecialApiObjectType;
}

View File

@ -812,4 +812,26 @@ class SimulatorHelper {
};
#endif // USE_SIMULATOR
// The following should correspond to Chromium's kV8DOMWrapperTypeIndex and
// kV8DOMWrapperObjectIndex.
static const int kV8WrapperTypeIndex = 0;
static const int kV8WrapperObjectIndex = 1;
// Bit flags describing which API callback variants have been called. The
// values are distinct bits so they can be combined in ApiCheckerResultFlags.
enum class ApiCheckerResult : uint8_t {
kNotCalled = 0,
kSlowCalled = 1 << 0,
kFastCalled = 1 << 1,
};
// Flag-set type over ApiCheckerResult; its operators are provided by the
// DEFINE_OPERATORS_FOR_FLAGS invocation that follows.
using ApiCheckerResultFlags = v8::base::Flags<ApiCheckerResult>;
DEFINE_OPERATORS_FOR_FLAGS(ApiCheckerResultFlags)
bool IsValidUnwrapObject(v8::Object* object);
// Returns the aligned pointer stored in internal field |offset| of |wrapper|,
// cast to T*. Asserts that |offset| is below the wrapper's internal field
// count.
template <typename T, int offset>
T* GetInternalField(v8::Object* wrapper) {
  assert(offset < wrapper->InternalFieldCount());
  void* const raw_field = wrapper->GetAlignedPointerFromInternalField(offset);
  return reinterpret_cast<T*>(raw_field);
}
#endif // ifndef CCTEST_H_

View File

@ -66,6 +66,7 @@
#include "src/objects/string-inl.h"
#include "src/objects/synthetic-module-inl.h"
#include "src/profiler/cpu-profiler.h"
#include "src/profiler/symbolizer.h"
#include "src/strings/unicode-inl.h"
#include "src/utils/utils.h"
#include "test/cctest/heap/heap-tester.h"
@ -27454,10 +27455,6 @@ UNINITIALIZED_TEST(NestedIsolates) {
#ifndef V8_LITE_MODE
namespace {
// The following should correspond to Chromium's kV8DOMWrapperObjectIndex.
static const int kV8WrapperTypeIndex = 0;
static const int kV8WrapperObjectIndex = 1;
template <typename T>
struct ConvertJSValue {
static Maybe<T> Get(v8::Local<v8::Value> value,
@ -27578,14 +27575,6 @@ struct ConvertJSValue<bool> {
}
};
enum class ApiCheckerResult : uint8_t {
kNotCalled = 0,
kSlowCalled = 1 << 0,
kFastCalled = 1 << 1,
};
using ApiCheckerResultFlags = v8::base::Flags<ApiCheckerResult>;
DEFINE_OPERATORS_FOR_FLAGS(ApiCheckerResultFlags)
template <typename Value, typename Impl>
struct BasicApiChecker {
static void FastCallback(v8::ApiObject receiver, Value argument,
@ -27606,22 +27595,6 @@ struct BasicApiChecker {
ApiCheckerResultFlags result_ = ApiCheckerResult::kNotCalled;
};
bool IsValidUnwrapObject(v8::Object* object) {
v8::internal::Address addr =
*reinterpret_cast<v8::internal::Address*>(object);
auto instance_type = v8::internal::Internals::GetInstanceType(addr);
return (instance_type == v8::internal::Internals::kJSObjectType ||
instance_type == v8::internal::Internals::kJSApiObjectType ||
instance_type == v8::internal::Internals::kJSSpecialApiObjectType);
}
template <typename T, int offset>
T* GetInternalField(v8::Object* wrapper) {
assert(offset < wrapper->InternalFieldCount());
return reinterpret_cast<T*>(
wrapper->GetAlignedPointerFromInternalField(offset));
}
enum class Behavior {
kNoException,
kException, // An exception should be thrown by the callback function.

View File

@ -31,6 +31,7 @@
#include <memory>
#include "include/libplatform/v8-tracing.h"
#include "include/v8-fast-api-calls.h"
#include "include/v8-profiler.h"
#include "src/api/api-inl.h"
#include "src/base/platform/platform.h"
@ -52,6 +53,7 @@
#include "test/cctest/cctest.h"
#include "test/cctest/heap/heap-utils.h"
#include "test/cctest/profiler-extension.h"
#include "test/common/flag-utils.h"
#ifdef V8_USE_PERFETTO
#include "protos/perfetto/trace/trace.pb.h"
@ -3875,6 +3877,162 @@ UNINITIALIZED_TEST(DetailedSourcePositionAPI_Inlining) {
isolate->Dispose();
}
namespace {
struct FastApiReceiver {
static void FastCallback(v8::ApiObject receiver, int argument,
int* fallback) {
v8::Object* receiver_obj = reinterpret_cast<v8::Object*>(&receiver);
if (!IsValidUnwrapObject(receiver_obj)) {
*fallback = 1;
return;
}
FastApiReceiver* receiver_ptr =
GetInternalField<FastApiReceiver, kV8WrapperObjectIndex>(receiver_obj);
receiver_ptr->result_ |= ApiCheckerResult::kFastCalled;
// Artificially slow down the callback with a predictable amount of time.
// This ensures the test has a relatively stable run time on various
// platforms and protects it from flakyness.
v8::base::OS::Sleep(v8::base::TimeDelta::FromMilliseconds(100));
}
static void SlowCallback(const v8::FunctionCallbackInfo<v8::Value>& info) {
v8::Object* receiver_obj = v8::Object::Cast(*info.Holder());
if (!IsValidUnwrapObject(receiver_obj)) {
info.GetIsolate()->ThrowException(v8_str("Called with a non-object."));
return;
}
FastApiReceiver* receiver =
GetInternalField<FastApiReceiver, kV8WrapperObjectIndex>(receiver_obj);
receiver->result_ |= ApiCheckerResult::kSlowCalled;
}
bool DidCallFast() const { return (result_ & ApiCheckerResult::kFastCalled); }
bool DidCallSlow() const { return (result_ & ApiCheckerResult::kSlowCalled); }
ApiCheckerResultFlags result_ = ApiCheckerResult::kNotCalled;
};
} // namespace
// Runs a script that defines foo(arg), which calls receiver.api_func(i) in a
// loop, warms foo up with two calls and marks it for optimization on the next
// call. Returns the function looked up under the name "foo".
v8::Local<v8::Function> CreateApiCode(LocalContext* env) {
  CompileRun(
      "function foo(arg) {"
      " for (let i = 0; i < arg; ++i) { receiver.api_func(i); }"
      "}"
      "%PrepareFunctionForOptimization(foo);"
      "foo(42); foo(42);"
      "%OptimizeFunctionOnNextCall(foo);");
  return GetFunction(env->local(), "foo");
}
// Profiles a JS function that repeatedly calls an API function with a fast
// callback, then checks that the profile contains a callback node under foo
// whose CodeEntry is the one registered at the fast callback's address and
// that this node received most of the ticks.
TEST(FastApiCPUProfiler) {
#if !defined(V8_LITE_MODE) && !defined(USE_SIMULATOR)
if (i::FLAG_jitless) return;
if (i::FLAG_turboprop) return;
FLAG_SCOPE_EXTERNAL(opt);
FLAG_SCOPE_EXTERNAL(turbo_fast_api_calls);
FLAG_SCOPE_EXTERNAL(allow_natives_syntax);
// Disable --always_opt, otherwise we haven't generated the necessary
// feedback to go down the "best optimization" path for the fast call.
UNFLAG_SCOPE_EXTERNAL(always_opt);
UNFLAG_SCOPE_EXTERNAL(prof_browser_mode);
CcTest::InitializeVM();
LocalContext env;
v8::Isolate* isolate = CcTest::isolate();
i::Isolate* i_isolate = reinterpret_cast<i::Isolate*>(isolate);
i_isolate->set_embedder_wrapper_type_index(kV8WrapperTypeIndex);
i_isolate->set_embedder_wrapper_object_index(kV8WrapperObjectIndex);
i::HandleScope scope(i_isolate);
// Set up the fast call: a function template with both a slow callback and a
// fast C function, installed as "api_func" on an object whose internal field
// points at |receiver|.
FastApiReceiver receiver;
v8::TryCatch try_catch(isolate);
v8::CFunction c_func =
v8::CFunction::MakeWithFallbackSupport(FastApiReceiver::FastCallback);
Local<v8::FunctionTemplate> receiver_templ = v8::FunctionTemplate::New(
isolate, FastApiReceiver::SlowCallback, v8::Local<v8::Value>(),
v8::Local<v8::Signature>(), 1, v8::ConstructorBehavior::kAllow,
v8::SideEffectType::kHasSideEffect, &c_func);
v8::Local<v8::ObjectTemplate> object_template =
v8::ObjectTemplate::New(isolate);
object_template->SetInternalFieldCount(kV8WrapperObjectIndex + 1);
const char* api_func_str = "api_func";
object_template->Set(isolate, api_func_str, receiver_templ);
v8::Local<v8::Object> object =
object_template->NewInstance(env.local()).ToLocalChecked();
object->SetAlignedPointerInInternalField(kV8WrapperObjectIndex,
reinterpret_cast<void*>(&receiver));
int num_runs_arg = 100;
env->Global()->Set(env.local(), v8_str("receiver"), object).Check();
// Prepare the code.
v8::Local<v8::Function> function = CreateApiCode(&env);
// Set up and start the CPU profiler.
v8::Local<v8::Value> args[] = {
v8::Integer::New(env->GetIsolate(), num_runs_arg)};
ProfilerHelper helper(env.local());
// TODO(mslekova): We could tweak the following count to reduce test
// runtime, while still keeping the test stable.
unsigned external_samples = 1000;
v8::CpuProfile* profile =
helper.Run(function, args, arraysize(args), 0, external_samples);
// Check that both the fast and the slow callbacks got executed and that no
// exception was caught.
CHECK(receiver.DidCallFast());
CHECK(receiver.DidCallSlow());
CHECK(!try_catch.HasCaught());
// Check that the generated profile has the expected structure:
// root -> foo -> api_func, with api_func being a callback node.
const v8::CpuProfileNode* root = profile->GetTopDownRoot();
const v8::CpuProfileNode* foo_node = GetChild(env.local(), root, "foo");
const v8::CpuProfileNode* api_func_node =
GetChild(env.local(), foo_node, api_func_str);
CHECK_NOT_NULL(api_func_node);
CHECK_EQ(api_func_node->GetSourceType(), CpuProfileNode::kCallback);
// Check that the CodeEntry is the expected one, i.e. the fast callback.
CodeEntry* code_entry =
reinterpret_cast<const ProfileNode*>(api_func_node)->entry();
CodeMap* code_map = reinterpret_cast<CpuProfile*>(profile)
->cpu_profiler()
->code_map_for_test();
CodeEntry* expected_code_entry =
code_map->FindEntry(reinterpret_cast<Address>(c_func.GetAddress()));
CHECK_EQ(code_entry, expected_code_entry);
int foo_ticks = foo_node->GetHitCount();
int api_func_ticks = api_func_node->GetHitCount();
// Check that the ticks recorded on foo itself are at most 20% of the ticks
// recorded on the fast callback, i.e. the callback dominates the samples.
CHECK_LE(foo_ticks, api_func_ticks * 0.2);
// The constant 800 in the CHECK below follows from expecting at least 1000
// samples with EXTERNAL type (see external_samples). Since the only thing
// that generates that kind of sample is the fast callback, its node is
// supposed to have close to 1000 ticks. Because the CPU profiler is
// nondeterministic, some slack is allowed; otherwise this could be 1000
// instead of 800.
CHECK_GE(api_func_ticks, 800);
profile->Delete();
#endif
}
} // namespace test_cpu_profiler
} // namespace internal
} // namespace v8