[wasm] Implement patching of far jump table
If the jump is too large for a near jump, we patch the far jump table
instead, and patch the (near) jump table to jump to the far jump table
slot.

R=mstarzinger@chromium.org

Bug: v8:9477
Change-Id: Ic9a929b405492c1cfe744738e0807ad4357c53ff
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1799543
Commit-Queue: Clemens Hammacher <clemensh@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63754}
parent fe674753fa, commit 0a8ddb134c
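The scheme the message describes: patching first tries to write a near (rel32) jump directly into the jump table slot; only if the target is out of 32-bit range is the target stored into the corresponding far jump table slot, and the near slot redirected there. Below is a minimal, self-contained C++ sketch of that fallback logic, not V8 code; the helper names (FitsNearJump, FarSlot, EmitNearJump, PatchSlot) are illustrative and do not appear in the patch.

// Standalone sketch of the near/far fallback described above; not V8 code.
#include <atomic>
#include <cassert>
#include <cstdint>

using Address = uintptr_t;

// A rel32 jump can only reach targets within a signed 32-bit displacement.
bool FitsNearJump(Address from, Address to) {
  int64_t disp = static_cast<int64_t>(to) - static_cast<int64_t>(from);
  return disp >= INT32_MIN && disp <= INT32_MAX;
}

struct FarSlot {
  // In the real far jump slot this word sits right after an indirect jump
  // instruction; it is pointer-aligned, so the store below is atomic.
  std::atomic<Address> target{0};
};

// Stand-in for writing a rel32 jump into the (near) jump table slot.
void EmitNearJump(Address slot, Address target) {
  (void)slot;
  (void)target;  // the real code emits machine code here
}

void PatchSlot(Address near_slot, FarSlot* far_slot, Address target) {
  if (FitsNearJump(near_slot, target)) {
    EmitNearJump(near_slot, target);  // common case: one near jump
  } else {
    // Out of range: publish the target via the far slot's data word, then
    // point the near slot at the far slot (which is always in range).
    far_slot->target.store(target, std::memory_order_relaxed);
    EmitNearJump(near_slot, reinterpret_cast<Address>(far_slot));
  }
}

int main() {
  FarSlot far;
  PatchSlot(0x1000, &far, 0x2000);            // fits: near jump only
  PatchSlot(0x1000, &far, Address{1} << 40);  // too far: goes via far slot
  assert(far.target.load() == (Address{1} << 40));
}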
@@ -218,6 +218,7 @@ Address Assembler::target_address_at(Address pc, Address constant_pool) {
 void Assembler::set_target_address_at(Address pc, Address constant_pool,
                                       Address target,
                                       ICacheFlushMode icache_flush_mode) {
+  DCHECK(is_int32(target - pc - 4));
   WriteUnalignedValue(pc, static_cast<int32_t>(target - pc - 4));
   if (icache_flush_mode != SKIP_ICACHE_FLUSH) {
     FlushInstructionCache(pc, sizeof(int32_t));
@@ -21,13 +21,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   EmitJumpSlot(lazy_compile_target);  // 5 bytes
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
-  // On x64, all code is allocated within a single code section, so we can use
-  // relative jumps.
-  static_assert(kMaxWasmCodeMemory <= size_t{2} * GB, "can use relative jump");
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   intptr_t displacement = static_cast<intptr_t>(
       reinterpret_cast<byte*>(target) - pc_ - kNearJmpInstrSize);
-  near_jmp(displacement, RelocInfo::NONE);
+  if (!is_int32(displacement)) return false;
+  near_jmp(displacement, RelocInfo::NONE);  // 5 bytes
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -35,13 +34,26 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   int start_offset = pc_offset();
   jmp(Operand(&data));  // 6 bytes
   Nop(2);               // 2 bytes
-  // The data must be properly aligned, so it can be patched atomically.
-  DCHECK_EQ(start_offset + 8, pc_offset());
+  // The data must be properly aligned, so it can be patched atomically (see
+  // {PatchFarJumpSlot}).
+  DCHECK_EQ(start_offset + kSystemPointerSize, pc_offset());
   USE(start_offset);
   bind(&data);
   dq(target);  // 8 bytes
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  // The slot needs to be pointer-size aligned so we can atomically update it.
+  DCHECK(IsAligned(slot, kSystemPointerSize));
+  // Offset of the target is at 8 bytes, see {EmitFarJumpSlot}.
+  reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
+      ->store(target, std::memory_order_relaxed);
+  // The update is atomic because the address is properly aligned.
+  // Because of cache coherence, the data update will eventually be seen by all
+  // cores. It's ok if they temporarily jump to the old target.
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   Nop(bytes);
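On x64 the far jump slot emitted above is 16 bytes: a 6-byte RIP-relative jmp through the data word, 2 bytes of padding, and the 8-byte target itself, which therefore lands at offset kSystemPointerSize and stays pointer-aligned. A sketch of that layout follows; the offsets are inferred from the byte counts in the hunk, and the constant names below are illustrative, not V8 constants.

// x64 far jump table slot, as emitted by EmitFarJumpSlot above:
//   +0   jmp [rip + 2]   ; 6 bytes, jumps through the data word at +8
//   +6   nop; nop        ; 2 bytes of padding
//   +8   <target>        ; 8-byte absolute address, pointer-aligned
// PatchFarJumpSlot rewrites only the word at +8 with one relaxed atomic
// store; the jmp instruction itself is never modified.
constexpr int kIndirectJmpSize = 6;  // illustrative names, not from the patch
constexpr int kPadSize = 2;
constexpr int kTargetOffset = kIndirectJmpSize + kPadSize;  // == 8
static_assert(kTargetOffset == 8, "target word must stay pointer-aligned");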
@@ -54,14 +66,20 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   jmp(lazy_compile_target, RelocInfo::NONE);  // 5 bytes
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   jmp(target, RelocInfo::NONE);
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   jmp(target, RelocInfo::NONE);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  UNREACHABLE();
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   Nop(bytes);
@@ -82,11 +100,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   EmitJumpSlot(lazy_compile_target);
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   // Note that {Move32BitImmediate} emits [ldr, constant] for the relocation
   // mode used below, we need this to allow concurrent patching of this slot.
   Move32BitImmediate(pc, Operand(target, RelocInfo::WASM_CALL));
   CheckConstPool(true, false);  // force emit of const pool
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -98,6 +117,11 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   STATIC_ASSERT(kFarJumpTableSlotSize == 2 * kInstrSize);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  UNREACHABLE();
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   DCHECK_EQ(0, bytes % kInstrSize);
@@ -117,13 +141,14 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   if (nop_bytes) nop();
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
-  // TODO(wasm): Currently this is guaranteed to be a {near_call} and hence is
-  // patchable concurrently. Once {kMaxWasmCodeMemory} is raised on ARM64, make
-  // sure concurrent patching is still supported.
-  DCHECK(TurboAssembler::IsNearCallOffset(
-      (reinterpret_cast<byte*>(target) - pc_) / kInstrSize));
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
+  if (!TurboAssembler::IsNearCallOffset(
+          (reinterpret_cast<byte*>(target) - pc_) / kInstrSize)) {
+    return false;
+  }
+
   Jump(target, RelocInfo::NONE);
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -138,10 +163,23 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   ldr_pcrel(kTmpReg, 2);  // 1 instruction
   br(kTmpReg);            // 1 instruction
   dq(target);             // 8 bytes (== 2 instructions)
   STATIC_ASSERT(kInstrSize == kInt32Size);
+  STATIC_ASSERT(2 * kInstrSize == kSystemPointerSize);
   STATIC_ASSERT(kFarJumpTableSlotSize == 4 * kInstrSize);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  // The slot needs to be pointer-size aligned so we can atomically update it.
+  DCHECK(IsAligned(slot, kSystemPointerSize));
+  // Offset of the target is at 8 bytes, see {EmitFarJumpSlot}.
+  reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
+      ->store(target, std::memory_order_relaxed);
+  // The data update is guaranteed to be atomic since it is properly aligned
+  // and stores a single machine word. This update will eventually be observed
+  // by any concurrent [ldr] on the same address because of the data cache
+  // coherence. It's ok if other cores temporarily jump to the old target.
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   DCHECK_EQ(0, bytes % kInstrSize);
@@ -93,8 +93,8 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   // Determine the size of a far jump table containing the given number of
   // slots.
   static constexpr uint32_t SizeForNumberOfFarJumpSlots(
-      int num_stubs, int num_function_slots) {
-    int num_entries = num_stubs + num_function_slots;
+      int num_runtime_slots, int num_function_slots) {
+    int num_entries = num_runtime_slots + num_function_slots;
     return num_entries * kFarJumpTableSlotSize;
   }
@@ -124,17 +124,20 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   }
 
   static void GenerateFarJumpTable(Address base, Address* stub_targets,
-                                   int num_stubs, int num_function_slots) {
+                                   int num_runtime_slots,
+                                   int num_function_slots) {
     uint32_t table_size =
-        SizeForNumberOfFarJumpSlots(num_stubs, num_function_slots);
+        SizeForNumberOfFarJumpSlots(num_runtime_slots, num_function_slots);
     // Assume enough space, so the Assembler does not try to grow the buffer.
     JumpTableAssembler jtasm(base, table_size + 256);
     int offset = 0;
-    for (int index = 0; index < num_stubs + num_function_slots; ++index) {
+    for (int index = 0; index < num_runtime_slots + num_function_slots;
+         ++index) {
      DCHECK_EQ(offset, FarJumpSlotIndexToOffset(index));
       // Functions slots initially jump to themselves. They are patched before
       // being used.
-      Address target = index < num_stubs ? stub_targets[index] : base + offset;
+      Address target =
+          index < num_runtime_slots ? stub_targets[index] : base + offset;
       jtasm.EmitFarJumpSlot(target);
       offset += kFarJumpTableSlotSize;
       DCHECK_EQ(offset, jtasm.pc_offset());
@@ -142,13 +145,19 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
     FlushInstructionCache(base, table_size);
   }
 
-  static void PatchJumpTableSlot(Address base, uint32_t slot_index,
-                                 Address new_target) {
-    Address slot = base + JumpSlotIndexToOffset(slot_index);
-    JumpTableAssembler jtasm(slot);
-    jtasm.EmitJumpSlot(new_target);
+  static void PatchJumpTableSlot(Address jump_table_slot,
+                                 Address far_jump_table_slot, Address target) {
+    // First, try to patch the jump table slot.
+    JumpTableAssembler jtasm(jump_table_slot);
+    if (!jtasm.EmitJumpSlot(target)) {
+      // If that fails, we need to patch the far jump table slot, and then
+      // update the jump table slot to jump to this far jump table slot.
+      DCHECK_NE(kNullAddress, far_jump_table_slot);
+      JumpTableAssembler::PatchFarJumpSlot(far_jump_table_slot, target);
+      CHECK(jtasm.EmitJumpSlot(far_jump_table_slot));
+    }
     jtasm.NopBytes(kJumpTableSlotSize - jtasm.pc_offset());
-    FlushInstructionCache(slot, kJumpTableSlotSize);
+    FlushInstructionCache(jump_table_slot, kJumpTableSlotSize);
   }
 
  private:
@@ -223,10 +232,16 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   void EmitLazyCompileJumpSlot(uint32_t func_index,
                                Address lazy_compile_target);
 
-  void EmitJumpSlot(Address target);
+  // Returns {true} if the jump fits in the jump table slot, {false} otherwise.
+  bool EmitJumpSlot(Address target);
 
+  // Initially emit a far jump slot.
   void EmitFarJumpSlot(Address target);
 
+  // Patch an existing far jump slot, and make sure that this update eventually
+  // becomes available to all execution units that might execute this code.
+  static void PatchFarJumpSlot(Address slot, Address target);
+
   void NopBytes(int bytes);
 };
@@ -1143,11 +1143,26 @@ void NativeModule::PatchJumpTablesLocked(uint32_t func_index, Address target) {
   // The caller must hold the {allocation_mutex_}, thus we fail to lock it here.
   DCHECK(!allocation_mutex_.TryLock());
 
-  uint32_t slot_index = func_index - module_->num_imported_functions;
   for (auto& code_space_data : code_space_data_) {
     DCHECK_IMPLIES(code_space_data.jump_table, code_space_data.far_jump_table);
     if (!code_space_data.jump_table) continue;
-    Address jump_table_base = code_space_data.jump_table->instruction_start();
-    JumpTableAssembler::PatchJumpTableSlot(jump_table_base, slot_index, target);
+    uint32_t slot_index = func_index - module_->num_imported_functions;
+    Address jump_table_slot =
+        code_space_data.jump_table->instruction_start() +
+        JumpTableAssembler::JumpSlotIndexToOffset(slot_index);
+    uint32_t far_jump_table_offset =
+        JumpTableAssembler::FarJumpSlotIndexToOffset(
+            WasmCode::kRuntimeStubCount + slot_index);
+    // Only pass the far jump table start if the far jump table actually has a
+    // slot for this function index (i.e. does not only contain runtime stubs).
+    Address far_jump_table_slot =
+        far_jump_table_offset <
+                code_space_data.far_jump_table->instructions().size()
+            ? code_space_data.far_jump_table->instruction_start() +
+                  far_jump_table_offset
+            : kNullAddress;
+    JumpTableAssembler::PatchJumpTableSlot(jump_table_slot, far_jump_table_slot,
+                                           target);
   }
 }
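A worked example of the offset computation above, with illustrative numbers rather than the real constants: the far jump table lays out one slot per runtime stub first, then one slot per declared function, so function slot_index lives at (WasmCode::kRuntimeStubCount + slot_index) * kFarJumpTableSlotSize. Assuming, say, kRuntimeStubCount = 4, kFarJumpTableSlotSize = 16 and slot_index = 3, the far slot sits at (4 + 3) * 16 = 112 bytes from the far jump table start; if that offset is not smaller than the far jump table's code size, the table contains no function slots and kNullAddress is passed instead.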
@@ -184,8 +184,9 @@ class JumpTablePatcher : public v8::base::Thread {
       TRACE("  patcher %p patch slot " V8PRIxPTR_FMT " to thunk #%d\n", this,
             slot_address, i % 2);
       base::MutexGuard jump_table_guard(jump_table_mutex_);
-      JumpTableAssembler::PatchJumpTableSlot(slot_start_, slot_index_,
-                                             thunks_[i % 2]);
+      JumpTableAssembler::PatchJumpTableSlot(
+          slot_start_ + JumpTableAssembler::JumpSlotIndexToOffset(slot_index_),
+          kNullAddress, thunks_[i % 2]);
     }
     TRACE("Patcher %p is stopping ...\n", this);
   }
@@ -242,8 +243,9 @@ TEST(JumpTablePatchingStress) {
     std::vector<std::unique_ptr<TestingAssemblerBuffer>> thunk_buffers;
     // Patch the jump table slot to jump to itself. This will later be patched
    // by the patchers.
-    JumpTableAssembler::PatchJumpTableSlot(slot_start, slot,
-                                           slot_start + slot_offset);
+    Address slot_addr =
+        slot_start + JumpTableAssembler::JumpSlotIndexToOffset(slot);
+    JumpTableAssembler::PatchJumpTableSlot(slot_addr, kNullAddress, slot_addr);
     // For each patcher, generate two thunks where this patcher can emit code
     // which finally jumps back to {slot} in the jump table.
     std::vector<Address> patcher_thunks;