Reland "[cpu-profiler] Fix stack iterability for fast C calls with no exit frame"
This is a reland of d5f4a33eb8
Original change's description:
> [cpu-profiler] Fix stack iterability for fast C calls with no exit frame
>
> Before fast C calls, store the current FP and PC on the isolate. When
> iterating frames in SafeStackFrameIterator, check if these fields are
> set and start iterating at the calling frame's FP instead of the current
> FP, which will be in C++ code. We need to do this because c_entry_fp is
> not set on the Isolate for Fast-C-Calls because we don't build an exit
> frame.
>
> This change makes stack samples that occur within 'Fast-C-Calls'
> iterable, meaning we can properly attribute ticks within the JS caller.
>
> Fast-C-Calls can't call back into JS code, so we can only ever have one
> such call on the stack at a time, allowing us to store the FP on the
> isolate rather than the stack.
>
> TBR=v8-mips-ports@googlegroups.com
>
> Bug: v8:8464, v8:7202
> Change-Id: I7bf39eba779dad34754d5759d741c421b362a406
> Reviewed-on: https://chromium-review.googlesource.com/c/1340241
> Commit-Queue: Peter Marshall <petermarshall@chromium.org>
> Reviewed-by: Jakob Gruber <jgruber@chromium.org>
> Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
> Reviewed-by: Alexei Filippov <alph@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#57896}
TBR=v8-mips-ports@googlegroups.com
TBR=jgruber@chromium.org
Bug: v8:8464, v8:7202
Change-Id: I5f37ded4ea572e8e9890ba186aa3d74a0dfc1274
Reviewed-on: https://chromium-review.googlesource.com/c/1354042
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Commit-Queue: Peter Marshall <petermarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57912}
This commit is contained in:
parent
781789c0f1
commit
ddaa1f0a0d
@ -2344,10 +2344,37 @@ void TurboAssembler::CallCFunctionHelper(Register function,
|
||||
}
|
||||
#endif
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
Register scratch = r4;
|
||||
Push(scratch);
|
||||
|
||||
Move(scratch, ExternalReference::fast_c_call_caller_pc_address(isolate()));
|
||||
str(pc, MemOperand(scratch));
|
||||
Move(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
str(fp, MemOperand(scratch));
|
||||
Pop(scratch);
|
||||
}
|
||||
|
||||
// Just call directly. The function called cannot cause a GC, or
|
||||
// allow preemption, so the return address in the link register
|
||||
// stays correct.
|
||||
Call(function);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
Register scratch1 = r4;
|
||||
Register scratch2 = r5;
|
||||
Push(scratch1);
|
||||
Push(scratch2);
|
||||
Move(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
mov(scratch2, Operand::Zero());
|
||||
str(scratch2, MemOperand(scratch1));
|
||||
Pop(scratch2);
|
||||
Pop(scratch1);
|
||||
}
|
||||
|
||||
int stack_passed_arguments = CalculateStackPassedWords(
|
||||
num_reg_arguments, num_double_arguments);
|
||||
if (ActivationFrameAlignment() > kPointerSize) {
|
||||
|
@ -1827,10 +1827,38 @@ void TurboAssembler::CallCFunction(Register function, int num_of_reg_args,
|
||||
DCHECK_LE(num_of_double_args + num_of_reg_args, 2);
|
||||
}
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
Register scratch1 = x4;
|
||||
Register scratch2 = x5;
|
||||
Push(scratch1, scratch2);
|
||||
|
||||
Label get_pc;
|
||||
Bind(&get_pc);
|
||||
Adr(scratch2, &get_pc);
|
||||
|
||||
Mov(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
|
||||
Str(scratch2, MemOperand(scratch1));
|
||||
Mov(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
Str(fp, MemOperand(scratch1));
|
||||
|
||||
Pop(scratch2, scratch1);
|
||||
}
|
||||
|
||||
// Call directly. The function called cannot cause a GC, or allow preemption,
|
||||
// so the return address in the link register stays correct.
|
||||
Call(function);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
Register scratch = x4;
|
||||
Push(scratch, xzr);
|
||||
Mov(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
Str(xzr, MemOperand(scratch));
|
||||
Pop(xzr, scratch);
|
||||
}
|
||||
|
||||
if (num_of_reg_args > kRegisterPassedArguments) {
|
||||
// Drop the register passed arguments.
|
||||
int claim_slots = RoundUp(num_of_reg_args - kRegisterPassedArguments, 2);
|
||||
|
@ -832,6 +832,18 @@ ExternalReference ExternalReference::wasm_thread_in_wasm_flag_address_address(
|
||||
&isolate->thread_local_top()->thread_in_wasm_flag_address_));
|
||||
}
|
||||
|
||||
ExternalReference ExternalReference::fast_c_call_caller_fp_address(
|
||||
Isolate* isolate) {
|
||||
return ExternalReference(
|
||||
isolate->isolate_data()->fast_c_call_caller_fp_address());
|
||||
}
|
||||
|
||||
ExternalReference ExternalReference::fast_c_call_caller_pc_address(
|
||||
Isolate* isolate) {
|
||||
return ExternalReference(
|
||||
isolate->isolate_data()->fast_c_call_caller_pc_address());
|
||||
}
|
||||
|
||||
ExternalReference ExternalReference::fixed_typed_array_base_data_offset() {
|
||||
return ExternalReference(reinterpret_cast<void*>(
|
||||
FixedTypedArrayBase::kDataOffset - kHeapObjectTag));
|
||||
|
@ -72,6 +72,10 @@ class StatsCounter;
|
||||
V(debug_restart_fp_address, "Debug::restart_fp_address()") \
|
||||
V(wasm_thread_in_wasm_flag_address_address, \
|
||||
"&Isolate::thread_in_wasm_flag_address") \
|
||||
V(fast_c_call_caller_fp_address, \
|
||||
"IsolateData::fast_c_call_caller_fp_address") \
|
||||
V(fast_c_call_caller_pc_address, \
|
||||
"IsolateData::fast_c_call_caller_pc_address") \
|
||||
EXTERNAL_REFERENCE_LIST_NON_INTERPRETED_REGEXP(V)
|
||||
|
||||
#define EXTERNAL_REFERENCE_LIST(V) \
|
||||
|
@ -226,7 +226,24 @@ SafeStackFrameIterator::SafeStackFrameIterator(
|
||||
StackFrame::Type type;
|
||||
ThreadLocalTop* top = isolate->thread_local_top();
|
||||
bool advance_frame = true;
|
||||
if (IsValidTop(top)) {
|
||||
|
||||
Address fast_c_fp = isolate->isolate_data()->fast_c_call_caller_fp();
|
||||
// 'Fast C calls' are a special type of C call where we call directly from JS
|
||||
// to C without an exit frame inbetween. The CEntryStub is responsible for
|
||||
// setting Isolate::c_entry_fp, meaning that it won't be set for fast C calls.
|
||||
// To keep the stack iterable, we store the FP and PC of the caller of the
|
||||
// fast C call on the isolate. This is guaranteed to be the topmost JS frame,
|
||||
// because fast C calls cannot call back into JS. We start iterating the stack
|
||||
// from this topmost JS frame.
|
||||
if (fast_c_fp) {
|
||||
DCHECK_NE(kNullAddress, isolate->isolate_data()->fast_c_call_caller_pc());
|
||||
type = StackFrame::Type::OPTIMIZED;
|
||||
top_frame_type_ = type;
|
||||
state.fp = fast_c_fp;
|
||||
state.sp = sp;
|
||||
state.pc_address = isolate->isolate_data()->fast_c_call_caller_pc_address();
|
||||
advance_frame = false;
|
||||
} else if (IsValidTop(top)) {
|
||||
type = ExitFrame::GetStateForFramePointer(Isolate::c_entry_fp(top), &state);
|
||||
top_frame_type_ = type;
|
||||
} else if (IsValidStackAddress(fp)) {
|
||||
|
@ -1830,7 +1830,39 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
|
||||
CheckStackAlignment();
|
||||
}
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
// Get the current PC via call, pop. This gets the return address pushed to
|
||||
// the stack by call.
|
||||
Label get_pc;
|
||||
call(&get_pc);
|
||||
bind(&get_pc);
|
||||
// Find two caller-saved scratch registers.
|
||||
Register scratch1 = eax;
|
||||
Register scratch2 = ecx;
|
||||
if (function == eax) scratch1 = edx;
|
||||
if (function == ecx) scratch2 = edx;
|
||||
pop(scratch1);
|
||||
mov(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_pc_address(isolate()),
|
||||
scratch2),
|
||||
scratch1);
|
||||
mov(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_fp_address(isolate()),
|
||||
scratch2),
|
||||
ebp);
|
||||
}
|
||||
|
||||
call(function);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
mov(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_fp_address(isolate()), edx),
|
||||
Immediate(0));
|
||||
}
|
||||
|
||||
if (base::OS::ActivationFrameAlignment() != 0) {
|
||||
mov(esp, Operand(esp, num_arguments * kPointerSize));
|
||||
} else {
|
||||
|
@ -71,6 +71,12 @@ class IsolateData final {
|
||||
return kVirtualCallTargetRegisterOffset - kIsolateRootBias;
|
||||
}
|
||||
|
||||
// The FP and PC that are saved right before TurboAssembler::CallCFunction.
|
||||
Address* fast_c_call_caller_fp_address() { return &fast_c_call_caller_fp_; }
|
||||
Address* fast_c_call_caller_pc_address() { return &fast_c_call_caller_pc_; }
|
||||
Address fast_c_call_caller_fp() { return fast_c_call_caller_fp_; }
|
||||
Address fast_c_call_caller_pc() { return fast_c_call_caller_pc_; }
|
||||
|
||||
// Returns true if this address points to data stored in this instance.
|
||||
// If it's the case then the value can be accessed indirectly through the
|
||||
// root register.
|
||||
@ -100,6 +106,8 @@ class IsolateData final {
|
||||
V(kExternalReferenceTableOffset, ExternalReferenceTable::SizeInBytes()) \
|
||||
V(kBuiltinsTableOffset, Builtins::builtin_count* kPointerSize) \
|
||||
V(kVirtualCallTargetRegisterOffset, kPointerSize) \
|
||||
V(kFastCCallCallerFPOffset, kPointerSize) \
|
||||
V(kFastCCallCallerPCOffset, kPointerSize) \
|
||||
/* This padding aligns IsolateData size by 8 bytes. */ \
|
||||
V(kPaddingOffset, \
|
||||
8 + RoundUp<8>(static_cast<int>(kPaddingOffset)) - kPaddingOffset) \
|
||||
@ -138,6 +146,13 @@ class IsolateData final {
|
||||
// ia32 (otherwise the arguments adaptor call runs out of registers).
|
||||
void* virtual_call_target_register_ = nullptr;
|
||||
|
||||
// Stores the state of the caller for TurboAssembler::CallCFunction so that
|
||||
// the sampling CPU profiler can iterate the stack during such calls. These
|
||||
// are stored on IsolateData so that they can be stored to with only one move
|
||||
// instruction in compiled code.
|
||||
Address fast_c_call_caller_fp_ = kNullAddress;
|
||||
Address fast_c_call_caller_pc_ = kNullAddress;
|
||||
|
||||
// Ensure the size is 8-byte aligned in order to make alignment of the field
|
||||
// following the IsolateData field predictable. This solves the issue with
|
||||
// C++ compilers for 32-bit platforms which are not consistent at aligning
|
||||
@ -177,6 +192,10 @@ void IsolateData::AssertPredictableLayout() {
|
||||
kExternalMemoryLlimitOffset);
|
||||
STATIC_ASSERT(offsetof(IsolateData, external_memory_at_last_mark_compact_) ==
|
||||
kExternalMemoryAtLastMarkCompactOffset);
|
||||
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_fp_) ==
|
||||
kFastCCallCallerFPOffset);
|
||||
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_pc_) ==
|
||||
kFastCCallCallerPCOffset);
|
||||
STATIC_ASSERT(sizeof(IsolateData) == IsolateData::kSize);
|
||||
}
|
||||
|
||||
|
@ -5399,7 +5399,38 @@ void TurboAssembler::CallCFunctionHelper(Register function_base,
|
||||
function_offset = 0;
|
||||
}
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
UseScratchRegisterScope temps(this);
|
||||
Register scratch1 = temps.Acquire();
|
||||
// 't' registers are caller-saved so this is safe as a scratch register.
|
||||
Register scratch2 = t5;
|
||||
DCHECK(!AreAliased(scratch1, scratch2, function_base));
|
||||
|
||||
Label get_pc;
|
||||
mov(scratch1, ra);
|
||||
Call(&get_pc);
|
||||
|
||||
bind(&get_pc);
|
||||
mov(scratch2, ra);
|
||||
mov(ra, scratch1);
|
||||
|
||||
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
|
||||
sw(scratch2, MemOperand(scratch1));
|
||||
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
sw(fp, MemOperand(scratch1));
|
||||
}
|
||||
|
||||
Call(function_base, function_offset);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
UseScratchRegisterScope temps(this);
|
||||
Register scratch = temps.Acquire();
|
||||
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
sw(zero_reg, MemOperand(scratch));
|
||||
}
|
||||
}
|
||||
|
||||
int stack_passed_arguments = CalculateStackPassedWords(
|
||||
|
@ -5761,7 +5761,38 @@ void TurboAssembler::CallCFunctionHelper(Register function,
|
||||
function = t9;
|
||||
}
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
UseScratchRegisterScope temps(this);
|
||||
Register scratch1 = temps.Acquire();
|
||||
// 't' registers are caller-saved so this is safe as a scratch register.
|
||||
Register scratch2 = t2;
|
||||
DCHECK(!AreAliased(scratch1, scratch2, function));
|
||||
|
||||
Label get_pc;
|
||||
mov(scratch1, ra);
|
||||
Call(&get_pc);
|
||||
|
||||
bind(&get_pc);
|
||||
mov(scratch2, ra);
|
||||
mov(ra, scratch1);
|
||||
|
||||
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
|
||||
Sd(scratch2, MemOperand(scratch1));
|
||||
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
Sd(fp, MemOperand(scratch1));
|
||||
}
|
||||
|
||||
Call(function);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
UseScratchRegisterScope temps(this);
|
||||
Register scratch = temps.Acquire();
|
||||
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
|
||||
Sd(zero_reg, MemOperand(scratch));
|
||||
}
|
||||
}
|
||||
|
||||
int stack_passed_arguments = CalculateStackPassedWords(
|
||||
|
@ -2652,7 +2652,30 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
|
||||
CheckStackAlignment();
|
||||
}
|
||||
|
||||
// Save the frame pointer and PC so that the stack layout remains iterable,
|
||||
// even without an ExitFrame which normally exists between JS and C frames.
|
||||
if (isolate() != nullptr) {
|
||||
Label get_pc;
|
||||
DCHECK(!AreAliased(kScratchRegister, function));
|
||||
leaq(kScratchRegister, Operand(&get_pc, 0));
|
||||
bind(&get_pc);
|
||||
movp(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_pc_address(isolate())),
|
||||
kScratchRegister);
|
||||
movp(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_fp_address(isolate())),
|
||||
rbp);
|
||||
}
|
||||
|
||||
call(function);
|
||||
|
||||
if (isolate() != nullptr) {
|
||||
// We don't unset the PC; the FP is the source of truth.
|
||||
movp(ExternalReferenceAsOperand(
|
||||
ExternalReference::fast_c_call_caller_fp_address(isolate())),
|
||||
Immediate(0));
|
||||
}
|
||||
|
||||
DCHECK_NE(base::OS::ActivationFrameAlignment(), 0);
|
||||
DCHECK_GE(num_arguments, 0);
|
||||
int argument_slots_on_stack =
|
||||
|
@ -93,9 +93,6 @@
|
||||
'test-cpu-profiler/TracingCpuProfiler': [SKIP],
|
||||
'test-sampler/LibSamplerCollectSample': [SKIP],
|
||||
|
||||
# BUG(7202). The test is flaky.
|
||||
'test-cpu-profiler/NativeFrameStackTrace': [SKIP],
|
||||
|
||||
# BUG(7054)
|
||||
'test-cpu-profiler/StaticCollectSampleAPI': [SKIP],
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user