[cpu-profiler] Fix stack iterability for fast C calls with no exit frame

Before fast C calls, store the current FP and PC on the isolate. When
iterating frames in SafeStackFrameIterator, check if these fields are
set and start iterating at the calling frame's FP instead of the current
FP, which will be in C++ code. We need to do this because c_entry_fp is
not set on the Isolate for Fast-C-Calls because we don't build an exit
frame.

This change makes stack samples that occur within 'Fast-C-Calls'
iterable, meaning we can properly attribute ticks within the JS caller.

Fast-C-Calls can't call back into JS code, so we can only ever have one
such call on the stack at a time, allowing us to store the FP on the
isolate rather than the stack.

TBR=v8-mips-ports@googlegroups.com

Bug: v8:8464, v8:7202
Change-Id: I7bf39eba779dad34754d5759d741c421b362a406
Reviewed-on: https://chromium-review.googlesource.com/c/1340241
Commit-Queue: Peter Marshall <petermarshall@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Alexei Filippov <alph@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57896}
This commit is contained in:
Peter Marshall 2018-11-26 15:46:46 +01:00 committed by Commit Bot
parent e0766dbe5c
commit d5f4a33eb8
11 changed files with 225 additions and 4 deletions

View File

@ -2344,10 +2344,37 @@ void TurboAssembler::CallCFunctionHelper(Register function,
}
#endif
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Register scratch = r4;
Push(scratch);
Move(scratch, ExternalReference::fast_c_call_caller_pc_address(isolate()));
str(pc, MemOperand(scratch));
Move(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
str(fp, MemOperand(scratch));
Pop(scratch);
}
// Just call directly. The function called cannot cause a GC, or
// allow preemption, so the return address in the link register
// stays correct.
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
Register scratch1 = r4;
Register scratch2 = r5;
Push(scratch1);
Push(scratch2);
Move(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
mov(scratch2, Operand::Zero());
str(scratch2, MemOperand(scratch1));
Pop(scratch2);
Pop(scratch1);
}
int stack_passed_arguments = CalculateStackPassedWords(
num_reg_arguments, num_double_arguments);
if (ActivationFrameAlignment() > kPointerSize) {

View File

@ -1827,10 +1827,38 @@ void TurboAssembler::CallCFunction(Register function, int num_of_reg_args,
DCHECK_LE(num_of_double_args + num_of_reg_args, 2);
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Register scratch1 = x4;
Register scratch2 = x5;
Push(scratch1, scratch2);
Label get_pc;
Bind(&get_pc);
Adr(scratch2, &get_pc);
Mov(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
Str(scratch2, MemOperand(scratch1));
Mov(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Str(fp, MemOperand(scratch1));
Pop(scratch2, scratch1);
}
// Call directly. The function called cannot cause a GC, or allow preemption,
// so the return address in the link register stays correct.
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
Register scratch = x4;
Push(scratch, xzr);
Mov(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Str(xzr, MemOperand(scratch));
Pop(xzr, scratch);
}
if (num_of_reg_args > kRegisterPassedArguments) {
// Drop the register passed arguments.
int claim_slots = RoundUp(num_of_reg_args - kRegisterPassedArguments, 2);

View File

@ -832,6 +832,18 @@ ExternalReference ExternalReference::wasm_thread_in_wasm_flag_address_address(
&isolate->thread_local_top()->thread_in_wasm_flag_address_));
}
// Builds an external reference to the IsolateData slot that holds the caller's
// frame pointer for a fast C call.
ExternalReference ExternalReference::fast_c_call_caller_fp_address(
    Isolate* isolate) {
  auto* const caller_fp_slot =
      isolate->isolate_data()->fast_c_call_caller_fp_address();
  return ExternalReference(caller_fp_slot);
}
// Builds an external reference to the IsolateData slot that holds the caller's
// program counter for a fast C call.
ExternalReference ExternalReference::fast_c_call_caller_pc_address(
    Isolate* isolate) {
  auto* const caller_pc_slot =
      isolate->isolate_data()->fast_c_call_caller_pc_address();
  return ExternalReference(caller_pc_slot);
}
ExternalReference ExternalReference::fixed_typed_array_base_data_offset() {
return ExternalReference(reinterpret_cast<void*>(
FixedTypedArrayBase::kDataOffset - kHeapObjectTag));

View File

@ -72,6 +72,10 @@ class StatsCounter;
V(debug_restart_fp_address, "Debug::restart_fp_address()") \
V(wasm_thread_in_wasm_flag_address_address, \
"&Isolate::thread_in_wasm_flag_address") \
V(fast_c_call_caller_fp_address, \
"IsolateData::fast_c_call_caller_fp_address") \
V(fast_c_call_caller_pc_address, \
"IsolateData::fast_c_call_caller_pc_address") \
EXTERNAL_REFERENCE_LIST_NON_INTERPRETED_REGEXP(V)
#define EXTERNAL_REFERENCE_LIST(V) \

View File

@ -226,7 +226,24 @@ SafeStackFrameIterator::SafeStackFrameIterator(
StackFrame::Type type;
ThreadLocalTop* top = isolate->thread_local_top();
bool advance_frame = true;
if (IsValidTop(top)) {
Address fast_c_fp = isolate->isolate_data()->fast_c_call_caller_fp();
// 'Fast C calls' are a special type of C call where we call directly from JS
// to C without an exit frame in between. The CEntryStub is responsible for
// setting Isolate::c_entry_fp, meaning that it won't be set for fast C calls.
// To keep the stack iterable, we store the FP and PC of the caller of the
// fast C call on the isolate. This is guaranteed to be the topmost JS frame,
// because fast C calls cannot call back into JS. We start iterating the stack
// from this topmost JS frame.
if (fast_c_fp) {
DCHECK_NE(kNullAddress, isolate->isolate_data()->fast_c_call_caller_pc());
type = StackFrame::Type::OPTIMIZED;
top_frame_type_ = type;
state.fp = fast_c_fp;
state.sp = sp;
state.pc_address = isolate->isolate_data()->fast_c_call_caller_pc_address();
advance_frame = false;
} else if (IsValidTop(top)) {
type = ExitFrame::GetStateForFramePointer(Isolate::c_entry_fp(top), &state);
top_frame_type_ = type;
} else if (IsValidStackAddress(fp)) {

View File

@ -1830,7 +1830,39 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
CheckStackAlignment();
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
// Get the current PC via call, pop. This gets the return address pushed to
// the stack by call.
Label get_pc;
call(&get_pc);
bind(&get_pc);
// Find two caller-saved scratch registers.
Register scratch1 = eax;
Register scratch2 = ecx;
if (function == eax) scratch1 = edx;
if (function == ecx) scratch2 = edx;
pop(scratch1);
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_pc_address(isolate()),
scratch2),
scratch1);
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate()),
scratch2),
ebp);
}
call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate()), edx),
Immediate(0));
}
if (base::OS::ActivationFrameAlignment() != 0) {
mov(esp, Operand(esp, num_arguments * kPointerSize));
} else {

View File

@ -71,6 +71,12 @@ class IsolateData final {
return kVirtualCallTargetRegisterOffset - kIsolateRootBias;
}
// The FP and PC that are saved right before TurboAssembler::CallCFunction.
// The *_address() accessors hand out pointers to the storage slots themselves,
// so they stay non-const. The plain accessors only read the stored values and
// are const-qualified so they can be used through a const IsolateData.
Address* fast_c_call_caller_fp_address() { return &fast_c_call_caller_fp_; }
Address* fast_c_call_caller_pc_address() { return &fast_c_call_caller_pc_; }
Address fast_c_call_caller_fp() const { return fast_c_call_caller_fp_; }
Address fast_c_call_caller_pc() const { return fast_c_call_caller_pc_; }
// Returns true if this address points to data stored in this instance.
// If it's the case then the value can be accessed indirectly through the
// root register.
@ -100,6 +106,8 @@ class IsolateData final {
V(kExternalReferenceTableOffset, ExternalReferenceTable::SizeInBytes()) \
V(kBuiltinsTableOffset, Builtins::builtin_count* kPointerSize) \
V(kVirtualCallTargetRegisterOffset, kPointerSize) \
V(kFastCCallCallerFPOffset, kPointerSize) \
V(kFastCCallCallerPCOffset, kPointerSize) \
/* This padding aligns IsolateData size by 8 bytes. */ \
V(kPaddingOffset, \
8 + RoundUp<8>(static_cast<int>(kPaddingOffset)) - kPaddingOffset) \
@ -138,6 +146,13 @@ class IsolateData final {
// ia32 (otherwise the arguments adaptor call runs out of registers).
void* virtual_call_target_register_ = nullptr;
// Stores the state of the caller for TurboAssembler::CallCFunction so that
// the sampling CPU profiler can iterate the stack during such calls. These
// are stored on IsolateData so that they can be stored to with only one move
// instruction in compiled code.
Address fast_c_call_caller_fp_ = kNullAddress;
Address fast_c_call_caller_pc_ = kNullAddress;
// Ensure the size is 8-byte aligned in order to make alignment of the field
// following the IsolateData field predictable. This solves the issue with
// C++ compilers for 32-bit platforms which are not consistent at aligning
@ -177,6 +192,10 @@ void IsolateData::AssertPredictableLayout() {
kExternalMemoryLlimitOffset);
STATIC_ASSERT(offsetof(IsolateData, external_memory_at_last_mark_compact_) ==
kExternalMemoryAtLastMarkCompactOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_fp_) ==
kFastCCallCallerFPOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_pc_) ==
kFastCCallCallerPCOffset);
STATIC_ASSERT(sizeof(IsolateData) == IsolateData::kSize);
}

View File

@ -5399,7 +5399,38 @@ void TurboAssembler::CallCFunctionHelper(Register function_base,
function_offset = 0;
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
UseScratchRegisterScope temps(this);
Register scratch1 = temps.Acquire();
// 't' registers are caller-saved so this is safe as a scratch register.
Register scratch2 = t5;
DCHECK(!AreAliased(scratch1, scratch2, function_base));
Label get_pc;
mov(scratch1, ra);
Call(&get_pc);
bind(&get_pc);
mov(scratch2, ra);
mov(ra, scratch1);
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
sw(scratch2, MemOperand(scratch1));
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
sw(fp, MemOperand(scratch1));
}
Call(function_base, function_offset);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
UseScratchRegisterScope temps(this);
Register scratch = temps.Acquire();
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
sw(zero_reg, MemOperand(scratch));
}
}
int stack_passed_arguments = CalculateStackPassedWords(

View File

@ -5761,7 +5761,38 @@ void TurboAssembler::CallCFunctionHelper(Register function,
function = t9;
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
UseScratchRegisterScope temps(this);
Register scratch1 = temps.Acquire();
// 't' registers are caller-saved so this is safe as a scratch register.
Register scratch2 = t2;
DCHECK(!AreAliased(scratch1, scratch2, function));
Label get_pc;
mov(scratch1, ra);
Call(&get_pc);
bind(&get_pc);
mov(scratch2, ra);
mov(ra, scratch1);
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
Sd(scratch2, MemOperand(scratch1));
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Sd(fp, MemOperand(scratch1));
}
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
UseScratchRegisterScope temps(this);
Register scratch = temps.Acquire();
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Sd(zero_reg, MemOperand(scratch));
}
}
int stack_passed_arguments = CalculateStackPassedWords(

View File

@ -2652,7 +2652,30 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
CheckStackAlignment();
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Label get_pc;
DCHECK(!AreAliased(kScratchRegister, function));
leaq(kScratchRegister, Operand(&get_pc, 0));
bind(&get_pc);
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_pc_address(isolate())),
kScratchRegister);
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate())),
rbp);
}
call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate())),
Immediate(0));
}
DCHECK_NE(base::OS::ActivationFrameAlignment(), 0);
DCHECK_GE(num_arguments, 0);
int argument_slots_on_stack =

View File

@ -93,9 +93,6 @@
'test-cpu-profiler/TracingCpuProfiler': [SKIP],
'test-sampler/LibSamplerCollectSample': [SKIP],
# BUG(7202). The test is flaky.
'test-cpu-profiler/NativeFrameStackTrace': [SKIP],
# BUG(7054)
'test-cpu-profiler/StaticCollectSampleAPI': [SKIP],