[wasm] Implement patching of far jump table

If the jump is too large for a near jump, we patch the far jump table
instead, and patch the (near) jump table to jump to the far jump table
slot.

R=mstarzinger@chromium.org

Bug: v8:9477
Change-Id: Ic9a929b405492c1cfe744738e0807ad4357c53ff
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1799543
Commit-Queue: Clemens Hammacher <clemensh@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63754}
Author: Clemens Hammacher <clemensh@chromium.org> (2019-09-13 13:43:41 +02:00), committed by Commit Bot
Parent: fe674753fa
Commit: 0a8ddb134c
5 changed files with 107 additions and 36 deletions
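In outline, the new patching path works as sketched below. This is an illustration of the control flow only; {PatchSlot} and {TryPatchNearJump} are hypothetical names standing in for the bool-returning {EmitJumpSlot} and the reworked {PatchJumpTableSlot} that this CL introduces further down.

// Sketch only (not the actual V8 code): patch a jump table slot, falling back
// to the far jump table when the target is out of near-jump range.
void PatchSlot(Address jump_table_slot, Address far_jump_table_slot,
               Address target) {
  if (TryPatchNearJump(jump_table_slot, target)) return;  // near jump fits
  // Out of range: write the target into the far jump table slot, then make the
  // near slot jump to that far slot, which is expected to be reachable (hence
  // the CHECK).
  PatchFarJumpSlot(far_jump_table_slot, target);
  CHECK(TryPatchNearJump(jump_table_slot, far_jump_table_slot));
}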


@@ -218,6 +218,7 @@ Address Assembler::target_address_at(Address pc, Address constant_pool) {
void Assembler::set_target_address_at(Address pc, Address constant_pool,
Address target,
ICacheFlushMode icache_flush_mode) {
DCHECK(is_int32(target - pc - 4));
WriteUnalignedValue(pc, static_cast<int32_t>(target - pc - 4));
if (icache_flush_mode != SKIP_ICACHE_FLUSH) {
FlushInstructionCache(pc, sizeof(int32_t));
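The rel32 written here is relative to the end of the 4-byte displacement field, so a near jump only reaches targets within roughly ±2GB of the patch site; the new DCHECK spells this out now that the static {kMaxWasmCodeMemory <= 2GB} guarantee (removed from the x64 {EmitJumpSlot} below) no longer covers it. A standalone sketch of the range condition, with Address standing in for V8's uintptr_t-based type:

#include <cstdint>
#include <limits>

using Address = uintptr_t;

// True if {target} is encodable as a rel32 displacement at {pc}, where {pc}
// points at the 4-byte displacement field of the near jump (so the following
// instruction starts at pc + 4). Mirrors the is_int32 DCHECK above.
bool FitsRel32(Address pc, Address target) {
  intptr_t displacement = static_cast<intptr_t>(target - pc - 4);
  return displacement >= std::numeric_limits<int32_t>::min() &&
         displacement <= std::numeric_limits<int32_t>::max();
}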


@@ -21,13 +21,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
EmitJumpSlot(lazy_compile_target); // 5 bytes
}
void JumpTableAssembler::EmitJumpSlot(Address target) {
// On x64, all code is allocated within a single code section, so we can use
// relative jumps.
static_assert(kMaxWasmCodeMemory <= size_t{2} * GB, "can use relative jump");
bool JumpTableAssembler::EmitJumpSlot(Address target) {
intptr_t displacement = static_cast<intptr_t>(
reinterpret_cast<byte*>(target) - pc_ - kNearJmpInstrSize);
near_jmp(displacement, RelocInfo::NONE);
if (!is_int32(displacement)) return false;
near_jmp(displacement, RelocInfo::NONE); // 5 bytes
return true;
}
void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -35,13 +34,26 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
int start_offset = pc_offset();
jmp(Operand(&data)); // 6 bytes
Nop(2); // 2 bytes
// The data must be properly aligned, so it can be patched atomically.
DCHECK_EQ(start_offset + 8, pc_offset());
// The data must be properly aligned, so it can be patched atomically (see
// {PatchFarJumpSlot}).
DCHECK_EQ(start_offset + kSystemPointerSize, pc_offset());
USE(start_offset);
bind(&data);
dq(target); // 8 bytes
}
// static
void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
// The slot needs to be pointer-size aligned so we can atomically update it.
DCHECK(IsAligned(slot, kSystemPointerSize));
// The target address is stored at offset 8, see {EmitFarJumpSlot}.
reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
->store(target, std::memory_order_relaxed);
// The update is atomic because the address is properly aligned.
// Because of cache coherence, the data update will eventually be seen by all
// cores. It's ok if they temporarily jump to the old target.
}
void JumpTableAssembler::NopBytes(int bytes) {
DCHECK_LE(0, bytes);
Nop(bytes);
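For reference, the x64 far jump slot emitted above has the layout sketched below, which is what makes {PatchFarJumpSlot} a pure data store: the indirect jmp reads its target as data, so only the 8-byte word is ever rewritten. A hedged sketch (offsets follow the DCHECKs above; Address stands in for V8's type):

#include <atomic>
#include <cstdint>

using Address = uintptr_t;

// x64 far jump slot, 16 bytes:
//   [0, 6)   jmp [rip+disp]  ; indirect jump loading its target from below
//   [6, 8)   2-byte nop      ; pads the target word to pointer alignment
//   [8, 16)  target address  ; the only bytes that are ever patched
void PatchFarJumpSlotSketch(Address slot, Address new_target) {
  // The word is pointer-size aligned, so a relaxed atomic store suffices; data
  // cache coherence makes the new target eventually visible to all cores, and
  // briefly jumping to the old target is harmless.
  reinterpret_cast<std::atomic<Address>*>(slot + sizeof(Address))
      ->store(new_target, std::memory_order_relaxed);
}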
@@ -54,14 +66,20 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
jmp(lazy_compile_target, RelocInfo::NONE); // 5 bytes
}
void JumpTableAssembler::EmitJumpSlot(Address target) {
bool JumpTableAssembler::EmitJumpSlot(Address target) {
jmp(target, RelocInfo::NONE);
return true;
}
void JumpTableAssembler::EmitFarJumpSlot(Address target) {
jmp(target, RelocInfo::NONE);
}
// static
void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
UNREACHABLE();
}
void JumpTableAssembler::NopBytes(int bytes) {
DCHECK_LE(0, bytes);
Nop(bytes);
@@ -82,11 +100,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
EmitJumpSlot(lazy_compile_target);
}
void JumpTableAssembler::EmitJumpSlot(Address target) {
bool JumpTableAssembler::EmitJumpSlot(Address target) {
// Note that {Move32BitImmediate} emits [ldr, constant] for the relocation
// mode used below; we need this to allow concurrent patching of this slot.
Move32BitImmediate(pc, Operand(target, RelocInfo::WASM_CALL));
CheckConstPool(true, false); // force emit of const pool
return true;
}
void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -98,6 +117,11 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
STATIC_ASSERT(kFarJumpTableSlotSize == 2 * kInstrSize);
}
// static
void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
UNREACHABLE();
}
void JumpTableAssembler::NopBytes(int bytes) {
DCHECK_LE(0, bytes);
DCHECK_EQ(0, bytes % kInstrSize);
@@ -117,13 +141,14 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
if (nop_bytes) nop();
}
void JumpTableAssembler::EmitJumpSlot(Address target) {
// TODO(wasm): Currently this is guaranteed to be a {near_call} and hence is
// patchable concurrently. Once {kMaxWasmCodeMemory} is raised on ARM64, make
// sure concurrent patching is still supported.
DCHECK(TurboAssembler::IsNearCallOffset(
(reinterpret_cast<byte*>(target) - pc_) / kInstrSize));
bool JumpTableAssembler::EmitJumpSlot(Address target) {
if (!TurboAssembler::IsNearCallOffset(
(reinterpret_cast<byte*>(target) - pc_) / kInstrSize)) {
return false;
}
Jump(target, RelocInfo::NONE);
return true;
}
void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -138,10 +163,23 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
ldr_pcrel(kTmpReg, 2); // 1 instruction
br(kTmpReg); // 1 instruction
dq(target); // 8 bytes (== 2 instructions)
STATIC_ASSERT(kInstrSize == kInt32Size);
STATIC_ASSERT(2 * kInstrSize == kSystemPointerSize);
STATIC_ASSERT(kFarJumpTableSlotSize == 4 * kInstrSize);
}
// static
void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
// The slot needs to be pointer-size aligned so we can atomically update it.
DCHECK(IsAligned(slot, kSystemPointerSize));
// The target address is stored at offset 8, see {EmitFarJumpSlot}.
reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
->store(target, std::memory_order_relaxed);
// The data update is guaranteed to be atomic since it is properly aligned
// and stores a single machine word. This update will eventually be observed
// by any concurrent [ldr] on the same address because of data cache
// coherence. It's ok if other cores temporarily jump to the old target.
}
void JumpTableAssembler::NopBytes(int bytes) {
DCHECK_LE(0, bytes);
DCHECK_EQ(0, bytes % kInstrSize);
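On arm64 the far jump slot relies on the same data-patching idea: {ldr_pcrel} loads the target as a pc-relative literal, so only the 8-byte literal word is rewritten by {PatchFarJumpSlot}. What triggers the fallback in the first place is the limited reach of a direct branch; below is a sketch of that range (constants per the ARMv8 B/BL encoding, not a copy of V8's {IsNearCallOffset}):

#include <cstdint>

// arm64 far jump slot, 4 instructions == 16 bytes (see EmitFarJumpSlot above):
//   [0, 4)   ldr xTmp, <literal at offset 8>  ; pc-relative load of the target
//   [4, 8)   br  xTmp
//   [8, 16)  target address                   ; the only part that is patched
//
// A direct B/BL encodes a signed 26-bit instruction offset, i.e. roughly
// +-128MB; jumps beyond that must go through the far jump table.
bool IsNearBranchOffset(int64_t offset_in_instructions) {
  return offset_in_instructions >= -(int64_t{1} << 25) &&
         offset_in_instructions < (int64_t{1} << 25);
}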


@@ -93,8 +93,8 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
// Determine the size of a far jump table containing the given number of
// slots.
static constexpr uint32_t SizeForNumberOfFarJumpSlots(
int num_stubs, int num_function_slots) {
int num_entries = num_stubs + num_function_slots;
int num_runtime_slots, int num_function_slots) {
int num_entries = num_runtime_slots + num_function_slots;
return num_entries * kFarJumpTableSlotSize;
}
@@ -124,17 +124,20 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
}
static void GenerateFarJumpTable(Address base, Address* stub_targets,
int num_stubs, int num_function_slots) {
int num_runtime_slots,
int num_function_slots) {
uint32_t table_size =
SizeForNumberOfFarJumpSlots(num_stubs, num_function_slots);
SizeForNumberOfFarJumpSlots(num_runtime_slots, num_function_slots);
// Assume enough space, so the Assembler does not try to grow the buffer.
JumpTableAssembler jtasm(base, table_size + 256);
int offset = 0;
for (int index = 0; index < num_stubs + num_function_slots; ++index) {
for (int index = 0; index < num_runtime_slots + num_function_slots;
++index) {
DCHECK_EQ(offset, FarJumpSlotIndexToOffset(index));
// Function slots initially jump to themselves. They are patched before
// being used.
Address target = index < num_stubs ? stub_targets[index] : base + offset;
Address target =
index < num_runtime_slots ? stub_targets[index] : base + offset;
jtasm.EmitFarJumpSlot(target);
offset += kFarJumpTableSlotSize;
DCHECK_EQ(offset, jtasm.pc_offset());
@@ -142,13 +145,19 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
FlushInstructionCache(base, table_size);
}
static void PatchJumpTableSlot(Address base, uint32_t slot_index,
Address new_target) {
Address slot = base + JumpSlotIndexToOffset(slot_index);
JumpTableAssembler jtasm(slot);
jtasm.EmitJumpSlot(new_target);
static void PatchJumpTableSlot(Address jump_table_slot,
Address far_jump_table_slot, Address target) {
// First, try to patch the jump table slot.
JumpTableAssembler jtasm(jump_table_slot);
if (!jtasm.EmitJumpSlot(target)) {
// If that fails, we need to patch the far jump table slot, and then
// update the jump table slot to jump to this far jump table slot.
DCHECK_NE(kNullAddress, far_jump_table_slot);
JumpTableAssembler::PatchFarJumpSlot(far_jump_table_slot, target);
CHECK(jtasm.EmitJumpSlot(far_jump_table_slot));
}
jtasm.NopBytes(kJumpTableSlotSize - jtasm.pc_offset());
FlushInstructionCache(slot, kJumpTableSlotSize);
FlushInstructionCache(jump_table_slot, kJumpTableSlotSize);
}
private:
@@ -223,10 +232,16 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
void EmitLazyCompileJumpSlot(uint32_t func_index,
Address lazy_compile_target);
void EmitJumpSlot(Address target);
// Returns {true} if the jump fits in the jump table slot, {false} otherwise.
bool EmitJumpSlot(Address target);
// Initially emit a far jump slot.
void EmitFarJumpSlot(Address target);
// Patch an existing far jump slot, and make sure that this update eventually
// becomes available to all execution units that might execute this code.
static void PatchFarJumpSlot(Address slot, Address target);
void NopBytes(int bytes);
};
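A hedged usage sketch of the new interface, wrapped in a hypothetical helper: callers pass both slot addresses, and kNullAddress is only legal for the far slot when the near jump is known to reach the target (the cctest below relies on this for targets it keeps within near-jump range). The real callers are in wasm-code-manager.cc and the jump-table cctest further down.

// Illustrative caller; {PatchFunctionSlot} and its parameters are not V8 API.
void PatchFunctionSlot(Address jump_table_base, Address far_jump_table_base,
                       uint32_t slot_index, Address target) {
  Address jump_table_slot =
      jump_table_base + JumpTableAssembler::JumpSlotIndexToOffset(slot_index);
  Address far_jump_table_slot =
      far_jump_table_base +
      JumpTableAssembler::FarJumpSlotIndexToOffset(
          WasmCode::kRuntimeStubCount + slot_index);
  // Passing kNullAddress instead of a far slot is only safe when the near jump
  // is guaranteed to reach {target}; otherwise the DCHECK above fires.
  JumpTableAssembler::PatchJumpTableSlot(jump_table_slot, far_jump_table_slot,
                                         target);
}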


@@ -1143,11 +1143,26 @@ void NativeModule::PatchJumpTablesLocked(uint32_t func_index, Address target) {
// The caller must hold the {allocation_mutex_}, thus we fail to lock it here.
DCHECK(!allocation_mutex_.TryLock());
uint32_t slot_index = func_index - module_->num_imported_functions;
for (auto& code_space_data : code_space_data_) {
DCHECK_IMPLIES(code_space_data.jump_table, code_space_data.far_jump_table);
if (!code_space_data.jump_table) continue;
Address jump_table_base = code_space_data.jump_table->instruction_start();
JumpTableAssembler::PatchJumpTableSlot(jump_table_base, slot_index, target);
uint32_t slot_index = func_index - module_->num_imported_functions;
Address jump_table_slot =
code_space_data.jump_table->instruction_start() +
JumpTableAssembler::JumpSlotIndexToOffset(slot_index);
uint32_t far_jump_table_offset =
JumpTableAssembler::FarJumpSlotIndexToOffset(
WasmCode::kRuntimeStubCount + slot_index);
// Only pass the far jump table start if the far jump table actually has a
// slot for this function index (i.e. does not only contain runtime stubs).
Address far_jump_table_slot =
far_jump_table_offset <
code_space_data.far_jump_table->instructions().size()
? code_space_data.far_jump_table->instruction_start() +
far_jump_table_offset
: kNullAddress;
JumpTableAssembler::PatchJumpTableSlot(jump_table_slot, far_jump_table_slot,
target);
}
}
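The offset above encodes the far jump table layout: runtime stub slots come first, followed by one slot per declared function, so a function's far slot index is kRuntimeStubCount + slot_index. The bounds check against instructions().size() then detects far jump tables that have no slot for this function (e.g. ones containing only runtime stubs), in which case kNullAddress is passed. A minimal sketch of the arithmetic (the helper name and parameters are illustrative, not V8 API):

#include <cstdint>

// Far jump table layout: [0, num_runtime_slots) runtime stubs, then one slot
// per declared function; mirrors FarJumpSlotIndexToOffset combined with the
// index computation above.
constexpr uint32_t FarSlotOffsetForFunction(uint32_t num_runtime_slots,
                                            uint32_t declared_function_index,
                                            uint32_t far_slot_size_bytes) {
  return (num_runtime_slots + declared_function_index) * far_slot_size_bytes;
}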


@@ -184,8 +184,9 @@ class JumpTablePatcher : public v8::base::Thread {
TRACE(" patcher %p patch slot " V8PRIxPTR_FMT " to thunk #%d\n", this,
slot_address, i % 2);
base::MutexGuard jump_table_guard(jump_table_mutex_);
JumpTableAssembler::PatchJumpTableSlot(slot_start_, slot_index_,
thunks_[i % 2]);
JumpTableAssembler::PatchJumpTableSlot(
slot_start_ + JumpTableAssembler::JumpSlotIndexToOffset(slot_index_),
kNullAddress, thunks_[i % 2]);
}
TRACE("Patcher %p is stopping ...\n", this);
}
@@ -242,8 +243,9 @@ TEST(JumpTablePatchingStress) {
std::vector<std::unique_ptr<TestingAssemblerBuffer>> thunk_buffers;
// Patch the jump table slot to jump to itself. This will later be patched
// by the patchers.
JumpTableAssembler::PatchJumpTableSlot(slot_start, slot,
slot_start + slot_offset);
Address slot_addr =
slot_start + JumpTableAssembler::JumpSlotIndexToOffset(slot);
JumpTableAssembler::PatchJumpTableSlot(slot_addr, kNullAddress, slot_addr);
// For each patcher, generate two thunks where this patcher can emit code
// which finally jumps back to {slot} in the jump table.
std::vector<Address> patcher_thunks;