[builtins,x64] pc-relative builtin-to-builtin calls
This addresses one of the major remaining slowdowns with embedded builtins on x64.

When generating code for a call to a builtin callee from a builtin caller, we'd look up the Code target object from the builtins constant list, calculate the location of the first instruction, and jump to it. Note that for embedded builtin callees, the Code object is itself only a trampoline to the off-heap code and thus an additional indirection. An example of the call sequence in pseudo-asm:

// Load from the constants list.
mov reg, [kRootPointer, kBuiltinsConstantListOffset]
mov reg, [reg, offset_of_the_code_constant]
// Calculate first instruction and call it.
add reg, Code::kHeaderOffset
call reg
// The trampoline forwards to the off-heap area.
mov kOffHeapTrampolineRegister, <off-heap instruction_start>
jmp kOffHeapTrampolineRegister

This CL changes calls to embedded builtin targets to use pc-relative addressing. This reduces the above instruction sequence to:

call <pc-relative offset to target instruction_start>

Embedded-to-embedded calls jump directly to the embedded instruction stream, bypassing the trampoline. Heap-to-embedded calls (and all calls to heap builtins) use pc-relative addressing targeting the on-heap Code object.

Other relevant platforms (arm, arm64, mips, mips64) do not use pc-relative calls. For these, we'll need a different solution, e.g. a table of embedded builtin addresses reachable from the root pointer, similar to the external reference table.

Bug: v8:6666
Change-Id: Ic0317d454e2da37d74eaecebcdfcbc0d5f5041ad
Reviewed-on: https://chromium-review.googlesource.com/1068732
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53349}
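As a rough illustration of the root-pointer-table alternative mentioned above for the non-x64 ports, a call through such a table might look as follows in the same pseudo-asm. This is only a sketch of one possible design; kBuiltinsEntryTableOffset and the per-builtin entry table are hypothetical and are not part of this CL:

// Hypothetical: load the target's off-heap entry point from a per-builtin
// table reachable via the root pointer (similar to the external reference
// table), then call through a register.
mov reg, [kRootPointer, kBuiltinsEntryTableOffset + builtin_index * kPointerSize]
call reg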
This commit is contained in:
parent 2f2bf24660
commit e5630ea97b
@@ -43,6 +43,7 @@
#include "src/ostreams.h"
#include "src/simulator.h"  // For flushing instruction cache.
#include "src/snapshot/serializer-common.h"
#include "src/snapshot/snapshot.h"

namespace v8 {
namespace internal {
@@ -461,6 +462,16 @@ RelocIterator::RelocIterator(const CodeReference code_reference, int mode_mask)
                    code_reference.relocation_end(),
                    code_reference.relocation_start(), mode_mask) {}

#ifdef V8_EMBEDDED_BUILTINS
RelocIterator::RelocIterator(EmbeddedData* embedded_data, Code* code,
                             int mode_mask)
    : RelocIterator(
          code, embedded_data->InstructionStartOfBuiltin(code->builtin_index()),
          code->constant_pool(),
          code->relocation_start() + code->relocation_size(),
          code->relocation_start(), mode_mask) {}
#endif  // V8_EMBEDDED_BUILTINS

RelocIterator::RelocIterator(const CodeDesc& desc, int mode_mask)
    : RelocIterator(nullptr, reinterpret_cast<Address>(desc.buffer), 0,
                    desc.buffer + desc.buffer_size,
@@ -60,6 +60,7 @@ class ApiFunction;
namespace internal {

// Forward declarations.
class EmbeddedData;
class InstructionStream;
class Isolate;
class SCTableReference;
@@ -679,6 +680,10 @@ class RelocIterator: public Malloced {
  // Relocation information with mode k is included in the
  // iteration iff bit k of mode_mask is set.
  explicit RelocIterator(Code* code, int mode_mask = -1);
#ifdef V8_EMBEDDED_BUILTINS
  explicit RelocIterator(EmbeddedData* embedded_data, Code* code,
                         int mode_mask);
#endif  // V8_EMBEDDED_BUILTINS
  explicit RelocIterator(const CodeDesc& desc, int mode_mask = -1);
  explicit RelocIterator(const CodeReference code_reference,
                         int mode_mask = -1);
@@ -302,19 +302,30 @@ bool Builtins::IsLazy(int index) {
// static
bool Builtins::IsIsolateIndependent(int index) {
  DCHECK(IsBuiltinId(index));
  // TODO(jgruber): There's currently two blockers for moving
  // InterpreterEntryTrampoline into the binary:
  // 1. InterpreterEnterBytecode calculates a pointer into the middle of
  //    InterpreterEntryTrampoline (see interpreter_entry_return_pc_offset).
  //    When the builtin is embedded, the pointer would need to be calculated
  //    at an offset from the embedded instruction stream (instead of the
  //    trampoline code object).
  // 2. We create distinct copies of the trampoline to make it possible to
  //    attribute ticks in the interpreter to individual JS functions.
  //    See https://crrev.com/c/959081 and InstallBytecodeArray. When the
  //    trampoline is embedded, we need to ensure that CopyCode creates a copy
  //    of the builtin itself (and not just the trampoline).
  return index != kInterpreterEntryTrampoline;
  switch (index) {
    // TODO(jgruber): There's currently two blockers for moving
    // InterpreterEntryTrampoline into the binary:
    // 1. InterpreterEnterBytecode calculates a pointer into the middle of
    //    InterpreterEntryTrampoline (see interpreter_entry_return_pc_offset).
    //    When the builtin is embedded, the pointer would need to be calculated
    //    at an offset from the embedded instruction stream (instead of the
    //    trampoline code object).
    // 2. We create distinct copies of the trampoline to make it possible to
    //    attribute ticks in the interpreter to individual JS functions.
    //    See https://crrev.com/c/959081 and InstallBytecodeArray. When the
    //    trampoline is embedded, we need to ensure that CopyCode creates a copy
    //    of the builtin itself (and not just the trampoline).
    case kInterpreterEntryTrampoline:
      return false;
    // TODO(jgruber): WasmCompileLazy is copied off the heap during module
    // compilation, which breaks pc-relative calls. It can be marked
    // isolate-independent once copies are no longer generated for wasm.
    case kWasmCompileLazy:
      return false;
    default:
      return true;
  }
  UNREACHABLE();
}

#ifdef V8_EMBEDDED_BUILTINS
@@ -33,6 +33,10 @@ uint32_t BuiltinsConstantsTableBuilder::AddObject(Handle<Object> object) {
  DCHECK_EQ(isolate_->heap()->empty_fixed_array(),
            isolate_->heap()->builtins_constants_table());

  // Must be on the main thread.
  DCHECK(ThreadId::Current().Equals(isolate_->thread_id()));

  // Must be serializing.
  DCHECK(isolate_->serializer_enabled());
#endif
@@ -2599,7 +2599,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
        if (IsMaterializableFromRoot(src_object, &index)) {
          __ LoadRoot(dst, index);
        } else {
          __ Mov(dst, src_object);
          __ Move(dst, src_object);
        }
      } else {
        __ Mov(dst, g.ToImmediate(source));
@@ -260,31 +260,6 @@ void CodeAssembler::GenerateCheckMaybeObjectIsObject(Node* node,
#endif

#ifdef V8_EMBEDDED_BUILTINS
TNode<HeapObject> CodeAssembler::LookupConstant(Handle<HeapObject> object) {
  DCHECK(isolate()->ShouldLoadConstantsFromRootList());

  // Ensure the given object is in the builtins constants table and fetch its
  // index.
  BuiltinsConstantsTableBuilder* builder =
      isolate()->builtins_constants_table_builder();
  uint32_t index = builder->AddObject(object);

  // The builtins constants table is loaded through the root register on all
  // supported platforms. This is checked by the
  // VerifyBuiltinsIsolateIndependence cctest, which disallows embedded objects
  // in isolate-independent builtins.
  DCHECK(isolate()->heap()->RootCanBeTreatedAsConstant(
      Heap::kBuiltinsConstantsTableRootIndex));
  TNode<FixedArray> builtins_constants_table = UncheckedCast<FixedArray>(
      LoadRoot(Heap::kBuiltinsConstantsTableRootIndex));

  // Generate the lookup.
  const int32_t header_size = FixedArray::kHeaderSize - kHeapObjectTag;
  TNode<IntPtrT> offset = IntPtrConstant(header_size + kPointerSize * index);
  return UncheckedCast<HeapObject>(
      Load(MachineType::AnyTagged(), builtins_constants_table, offset));
}

// External references are stored in the external reference table.
TNode<ExternalReference> CodeAssembler::LookupExternalReference(
    ExternalReference reference) {
@@ -349,16 +324,6 @@ TNode<Smi> CodeAssembler::SmiConstant(int value) {

TNode<HeapObject> CodeAssembler::UntypedHeapConstant(
    Handle<HeapObject> object) {
#ifdef V8_EMBEDDED_BUILTINS
  // Root constants are simply loaded from the root list, while non-root
  // constants must be looked up from the builtins constants table.
  if (isolate()->ShouldLoadConstantsFromRootList()) {
    Heap::RootListIndex root_index;
    if (!isolate()->heap()->IsRootHandle(object, &root_index)) {
      return LookupConstant(object);
    }
  }
#endif  // V8_EMBEDDED_BUILTINS
  return UncheckedCast<HeapObject>(raw_assembler()->HeapConstant(object));
}
@@ -691,7 +691,6 @@ class V8_EXPORT_PRIVATE CodeAssembler {
#endif

#ifdef V8_EMBEDDED_BUILTINS
  TNode<HeapObject> LookupConstant(Handle<HeapObject> object);
  TNode<ExternalReference> LookupExternalReference(ExternalReference reference);
#endif
@@ -736,7 +736,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      }
      if (HasImmediateInput(instr, 0)) {
        Handle<Code> code = i.InputCode(0);
        __ jmp(code, RelocInfo::CODE_TARGET);
        __ Jump(code, RelocInfo::CODE_TARGET);
      } else {
        Register reg = i.InputRegister(0);
        __ addp(reg, Immediate(Code::kHeaderSize - kHeapObjectTag));
@@ -14250,7 +14250,7 @@ const char* AbstractCode::Kind2String(Kind kind) {
}

#ifdef V8_EMBEDDED_BUILTINS
bool Code::IsProcessIndependent() {
bool Code::IsProcessIndependent(Isolate* isolate) {
  constexpr int all_real_modes_mask =
      (1 << (RelocInfo::LAST_REAL_RELOC_MODE + 1)) - 1;
  constexpr int mode_mask =
@@ -14273,8 +14273,22 @@ bool Code::IsProcessIndependent() {
                 RelocInfo::ModeMask(RelocInfo::RUNTIME_ENTRY) |
                 RelocInfo::ModeMask(RelocInfo::EXTERNAL_REFERENCE)));

  RelocIterator it(this, mode_mask);
  return it.done();
  bool is_process_independent = true;
  for (RelocIterator it(this, mode_mask); !it.done(); it.next()) {
    if (RelocInfo::IsCodeTarget(it.rinfo()->rmode())) {
      // Off-heap code targets are later rewritten as pc-relative jumps to the
      // off-heap instruction stream and are thus process-independent.
      Address target_address = it.rinfo()->target_address();
      if (InstructionStream::PcIsOffHeap(isolate, target_address)) continue;

      Code* target = Code::GetCodeFromTargetAddress(target_address);
      CHECK(target->IsCode());
      if (Builtins::IsEmbeddedBuiltin(target)) continue;
    }
    is_process_independent = false;
  }

  return is_process_independent;
}
#endif
@@ -344,7 +344,7 @@ class Code : public HeapObject {
#endif  // DEBUG

#ifdef V8_EMBEDDED_BUILTINS
  bool IsProcessIndependent();
  bool IsProcessIndependent(Isolate* isolate);
#endif

  inline bool CanContainWeakObjects();
@@ -7,6 +7,7 @@
#include "src/snapshot/snapshot.h"

#include "src/api.h"
#include "src/assembler-inl.h"
#include "src/base/platform/platform.h"
#include "src/callable.h"
#include "src/interface-descriptors.h"
@@ -325,6 +326,42 @@ bool BuiltinAliasesOffHeapTrampolineRegister(Isolate* isolate, Code* code) {

  return false;
}

void FinalizeEmbeddedCodeTargets(Isolate* isolate, EmbeddedData* blob) {
  static const int kRelocMask = RelocInfo::ModeMask(RelocInfo::CODE_TARGET);

  for (int i = 0; i < Builtins::builtin_count; i++) {
    if (!Builtins::IsIsolateIndependent(i)) continue;

    Code* code = isolate->builtins()->builtin(i);
    RelocIterator on_heap_it(code, kRelocMask);
    RelocIterator off_heap_it(blob, code, kRelocMask);

#ifdef V8_TARGET_ARCH_X64
    while (!on_heap_it.done()) {
      DCHECK(!off_heap_it.done());

      RelocInfo* rinfo = on_heap_it.rinfo();
      DCHECK(RelocInfo::IsCodeTarget(rinfo->rmode()));
      Code* target = Code::GetCodeFromTargetAddress(rinfo->target_address());
      CHECK(Builtins::IsEmbeddedBuiltin(target));

      off_heap_it.rinfo()->set_target_address(
          blob->InstructionStartOfBuiltin(target->builtin_index()));

      on_heap_it.next();
      off_heap_it.next();
    }
    DCHECK(off_heap_it.done());
#else
    // Architectures other than x64 do not use pc-relative calls and thus must
    // not contain embedded code targets. Instead, we use an indirection through
    // the root register.
    CHECK(on_heap_it.done());
    CHECK(off_heap_it.done());
#endif  // V8_TARGET_ARCH_X64
  }
}
}  // namespace

// static
@@ -345,7 +382,7 @@ EmbeddedData EmbeddedData::FromIsolate(Isolate* isolate) {

    // Sanity-check that the given builtin is isolate-independent and does not
    // use the trampoline register in its calling convention.
    if (!code->IsProcessIndependent()) {
    if (!code->IsProcessIndependent(isolate)) {
      saw_unsafe_builtin = true;
      fprintf(stderr, "%s is not isolate-independent.\n", Builtins::name(i));
    }
@@ -399,6 +436,9 @@ EmbeddedData EmbeddedData::FromIsolate(Isolate* isolate) {

  EmbeddedData d(blob, blob_size);

  // Fix up call targets that point to other embedded builtins.
  FinalizeEmbeddedCodeTargets(isolate, &d);

  // Hash the blob and store the result.
  STATIC_ASSERT(HashSize() == kSizetSize);
  const size_t hash = d.CreateHash();
@@ -1527,13 +1527,12 @@ void MacroAssembler::PopQuad(Operand dst) {
  }
}


void MacroAssembler::Jump(ExternalReference ext) {
void TurboAssembler::Jump(ExternalReference ext) {
  LoadAddress(kScratchRegister, ext);
  jmp(kScratchRegister);
}

void MacroAssembler::Jump(Operand op) {
void TurboAssembler::Jump(Operand op) {
  if (kPointerSize == kInt64Size) {
    jmp(op);
  } else {
@@ -1542,17 +1541,22 @@ void MacroAssembler::Jump(Operand op) {
  }
}


void MacroAssembler::Jump(Address destination, RelocInfo::Mode rmode) {
void TurboAssembler::Jump(Address destination, RelocInfo::Mode rmode) {
  Move(kScratchRegister, destination, rmode);
  jmp(kScratchRegister);
}

void MacroAssembler::Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
void TurboAssembler::Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
                          Condition cc) {
  // TODO(X64): Inline this
#ifdef V8_EMBEDDED_BUILTINS
  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList()) {
  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList() &&
      !Builtins::IsEmbeddedBuiltin(*code_object)) {
    // Calls to embedded targets are initially generated as standard
    // pc-relative calls below. When creating the embedded blob, call offsets
    // are patched up to point directly to the off-heap instruction start.
    // Note: It is safe to dereference code_object above since code generation
    // for builtins and code stubs happens on the main thread.
    Label skip;
    if (cc != always) {
      if (cc == never) return;
@@ -1608,7 +1612,13 @@ void TurboAssembler::Call(Address destination, RelocInfo::Mode rmode) {

void TurboAssembler::Call(Handle<Code> code_object, RelocInfo::Mode rmode) {
#ifdef V8_EMBEDDED_BUILTINS
  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList()) {
  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList() &&
      !Builtins::IsEmbeddedBuiltin(*code_object)) {
    // Calls to embedded targets are initially generated as standard
    // pc-relative calls below. When creating the embedded blob, call offsets
    // are patched up to point directly to the off-heap instruction start.
    // Note: It is safe to dereference code_object above since code generation
    // for builtins and code stubs happens on the main thread.
    LookupConstant(kScratchRegister, code_object);
    leap(kScratchRegister, FieldOperand(kScratchRegister, Code::kHeaderSize));
    call(kScratchRegister);
@@ -391,6 +391,12 @@ class TurboAssembler : public Assembler {
  void RetpolineCall(Register reg);
  void RetpolineCall(Address destination, RelocInfo::Mode rmode);

  void Jump(Address destination, RelocInfo::Mode rmode);
  void Jump(ExternalReference ext);
  void Jump(Operand op);
  void Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
            Condition cc = always);

  void RetpolineJump(Register reg);

  void CallForDeoptimization(Address target, RelocInfo::Mode rmode) {
@@ -760,14 +766,6 @@ class MacroAssembler : public TurboAssembler {
  void Negps(XMMRegister dst);
  void Abspd(XMMRegister dst);
  void Negpd(XMMRegister dst);

  // Control Flow
  void Jump(Address destination, RelocInfo::Mode rmode);
  void Jump(ExternalReference ext);
  void Jump(Operand op);
  void Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
            Condition cc = always);

  // Generates a trampoline to jump to the off-heap instruction stream.
  void JumpToInstructionStream(Address entry);