[wasm] Implement patching of far jump table
If the jump is too large for a near jump, we patch the far jump table
instead, and patch the (near) jump table to jump to the far jump table
slot.

R=mstarzinger@chromium.org

Bug: v8:9477
Change-Id: Ic9a929b405492c1cfe744738e0807ad4357c53ff
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1799543
Commit-Queue: Clemens Hammacher <clemensh@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63754}
parent fe674753fa, commit 0a8ddb134c
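The scheme the message describes: patching first tries to write a near (rel32) jump directly into the jump table slot; only if the target is out of 32-bit range is the target stored into the corresponding far jump table slot, and the near slot redirected there. Below is a minimal, self-contained C++ sketch of that fallback logic, not V8 code; the helper names (FitsNearJump, FarSlot, EmitNearJump, PatchSlot) are illustrative and do not appear in the patch.

// Standalone sketch of the near/far fallback described above; not V8 code.
#include <atomic>
#include <cassert>
#include <cstdint>

using Address = uintptr_t;

// A rel32 jump can only reach targets within a signed 32-bit displacement.
bool FitsNearJump(Address from, Address to) {
  int64_t disp = static_cast<int64_t>(to) - static_cast<int64_t>(from);
  return disp >= INT32_MIN && disp <= INT32_MAX;
}

struct FarSlot {
  // In the real far jump slot this word sits right after an indirect jump
  // instruction; it is pointer-aligned, so the store below is atomic.
  std::atomic<Address> target{0};
};

// Stand-in for writing a rel32 jump into the (near) jump table slot.
void EmitNearJump(Address slot, Address target) {
  (void)slot;
  (void)target;  // the real code emits machine code here
}

void PatchSlot(Address near_slot, FarSlot* far_slot, Address target) {
  if (FitsNearJump(near_slot, target)) {
    EmitNearJump(near_slot, target);  // common case: one near jump
  } else {
    // Out of range: publish the target via the far slot's data word, then
    // point the near slot at the far slot (which is always in range).
    far_slot->target.store(target, std::memory_order_relaxed);
    EmitNearJump(near_slot, reinterpret_cast<Address>(far_slot));
  }
}

int main() {
  FarSlot far;
  PatchSlot(0x1000, &far, 0x2000);            // fits: near jump only
  PatchSlot(0x1000, &far, Address{1} << 40);  // too far: goes via far slot
  assert(far.target.load() == (Address{1} << 40));
}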
@@ -218,6 +218,7 @@ Address Assembler::target_address_at(Address pc, Address constant_pool) {
 void Assembler::set_target_address_at(Address pc, Address constant_pool,
                                       Address target,
                                       ICacheFlushMode icache_flush_mode) {
+  DCHECK(is_int32(target - pc - 4));
   WriteUnalignedValue(pc, static_cast<int32_t>(target - pc - 4));
   if (icache_flush_mode != SKIP_ICACHE_FLUSH) {
     FlushInstructionCache(pc, sizeof(int32_t));
@@ -21,13 +21,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   EmitJumpSlot(lazy_compile_target);  // 5 bytes
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
-  // On x64, all code is allocated within a single code section, so we can use
-  // relative jumps.
-  static_assert(kMaxWasmCodeMemory <= size_t{2} * GB, "can use relative jump");
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   intptr_t displacement = static_cast<intptr_t>(
       reinterpret_cast<byte*>(target) - pc_ - kNearJmpInstrSize);
-  near_jmp(displacement, RelocInfo::NONE);
+  if (!is_int32(displacement)) return false;
+  near_jmp(displacement, RelocInfo::NONE);  // 5 bytes
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -35,13 +34,26 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   int start_offset = pc_offset();
   jmp(Operand(&data));  // 6 bytes
   Nop(2);               // 2 bytes
-  // The data must be properly aligned, so it can be patched atomically.
-  DCHECK_EQ(start_offset + 8, pc_offset());
+  // The data must be properly aligned, so it can be patched atomically (see
+  // {PatchFarJumpSlot}).
+  DCHECK_EQ(start_offset + kSystemPointerSize, pc_offset());
   USE(start_offset);
   bind(&data);
   dq(target);  // 8 bytes
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  // The slot needs to be pointer-size aligned so we can atomically update it.
+  DCHECK(IsAligned(slot, kSystemPointerSize));
+  // Offset of the target is at 8 bytes, see {EmitFarJumpSlot}.
+  reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
+      ->store(target, std::memory_order_relaxed);
+  // The update is atomic because the address is properly aligned.
+  // Because of cache coherence, the data update will eventually be seen by all
+  // cores. It's ok if they temporarily jump to the old target.
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   Nop(bytes);
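On x64 the far jump slot emitted above is 16 bytes: a 6-byte RIP-relative jmp through the data word, 2 bytes of padding, and the 8-byte target itself, which therefore lands at offset kSystemPointerSize and stays pointer-aligned. A sketch of that layout follows; the offsets are inferred from the byte counts in the hunk, and the constant names below are illustrative, not V8 constants.

// x64 far jump table slot, as emitted by EmitFarJumpSlot above:
//   +0   jmp [rip + 2]   ; 6 bytes, jumps through the data word at +8
//   +6   nop; nop        ; 2 bytes of padding
//   +8   <target>        ; 8-byte absolute address, pointer-aligned
// PatchFarJumpSlot rewrites only the word at +8 with one relaxed atomic
// store; the jmp instruction itself is never modified.
constexpr int kIndirectJmpSize = 6;  // illustrative names, not from the patch
constexpr int kPadSize = 2;
constexpr int kTargetOffset = kIndirectJmpSize + kPadSize;  // == 8
static_assert(kTargetOffset == 8, "target word must stay pointer-aligned");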
@@ -54,14 +66,20 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   jmp(lazy_compile_target, RelocInfo::NONE);  // 5 bytes
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   jmp(target, RelocInfo::NONE);
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   jmp(target, RelocInfo::NONE);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  UNREACHABLE();
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   Nop(bytes);
@@ -82,11 +100,12 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   EmitJumpSlot(lazy_compile_target);
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
   // Note that {Move32BitImmediate} emits [ldr, constant] for the relocation
   // mode used below, we need this to allow concurrent patching of this slot.
   Move32BitImmediate(pc, Operand(target, RelocInfo::WASM_CALL));
   CheckConstPool(true, false);  // force emit of const pool
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -98,6 +117,11 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   STATIC_ASSERT(kFarJumpTableSlotSize == 2 * kInstrSize);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  UNREACHABLE();
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   DCHECK_EQ(0, bytes % kInstrSize);
@@ -117,13 +141,14 @@ void JumpTableAssembler::EmitLazyCompileJumpSlot(uint32_t func_index,
   if (nop_bytes) nop();
 }
 
-void JumpTableAssembler::EmitJumpSlot(Address target) {
-  // TODO(wasm): Currently this is guaranteed to be a {near_call} and hence is
-  // patchable concurrently. Once {kMaxWasmCodeMemory} is raised on ARM64, make
-  // sure concurrent patching is still supported.
-  DCHECK(TurboAssembler::IsNearCallOffset(
-      (reinterpret_cast<byte*>(target) - pc_) / kInstrSize));
+bool JumpTableAssembler::EmitJumpSlot(Address target) {
+  if (!TurboAssembler::IsNearCallOffset(
+          (reinterpret_cast<byte*>(target) - pc_) / kInstrSize)) {
+    return false;
+  }
+
   Jump(target, RelocInfo::NONE);
+  return true;
 }
 
 void JumpTableAssembler::EmitFarJumpSlot(Address target) {
@@ -138,10 +163,23 @@ void JumpTableAssembler::EmitFarJumpSlot(Address target) {
   ldr_pcrel(kTmpReg, 2);  // 1 instruction
   br(kTmpReg);            // 1 instruction
   dq(target);             // 8 bytes (== 2 instructions)
   STATIC_ASSERT(kInstrSize == kInt32Size);
+  STATIC_ASSERT(2 * kInstrSize == kSystemPointerSize);
   STATIC_ASSERT(kFarJumpTableSlotSize == 4 * kInstrSize);
 }
 
+// static
+void JumpTableAssembler::PatchFarJumpSlot(Address slot, Address target) {
+  // The slot needs to be pointer-size aligned so we can atomically update it.
+  DCHECK(IsAligned(slot, kSystemPointerSize));
+  // Offset of the target is at 8 bytes, see {EmitFarJumpSlot}.
+  reinterpret_cast<std::atomic<Address>*>(slot + kSystemPointerSize)
+      ->store(target, std::memory_order_relaxed);
+  // The data update is guaranteed to be atomic since it is properly aligned
+  // and stores a single machine word. This update will eventually be observed
+  // by any concurrent [ldr] on the same address because of the data cache
+  // coherence. It's ok if other cores temporarily jump to the old target.
+}
+
 void JumpTableAssembler::NopBytes(int bytes) {
   DCHECK_LE(0, bytes);
   DCHECK_EQ(0, bytes % kInstrSize);
@@ -93,8 +93,8 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   // Determine the size of a far jump table containing the given number of
   // slots.
   static constexpr uint32_t SizeForNumberOfFarJumpSlots(
-      int num_stubs, int num_function_slots) {
-    int num_entries = num_stubs + num_function_slots;
+      int num_runtime_slots, int num_function_slots) {
+    int num_entries = num_runtime_slots + num_function_slots;
     return num_entries * kFarJumpTableSlotSize;
   }
@@ -124,17 +124,20 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   }
 
   static void GenerateFarJumpTable(Address base, Address* stub_targets,
-                                   int num_stubs, int num_function_slots) {
+                                   int num_runtime_slots,
+                                   int num_function_slots) {
     uint32_t table_size =
-        SizeForNumberOfFarJumpSlots(num_stubs, num_function_slots);
+        SizeForNumberOfFarJumpSlots(num_runtime_slots, num_function_slots);
     // Assume enough space, so the Assembler does not try to grow the buffer.
     JumpTableAssembler jtasm(base, table_size + 256);
     int offset = 0;
-    for (int index = 0; index < num_stubs + num_function_slots; ++index) {
+    for (int index = 0; index < num_runtime_slots + num_function_slots;
+         ++index) {
      DCHECK_EQ(offset, FarJumpSlotIndexToOffset(index));
       // Functions slots initially jump to themselves. They are patched before
       // being used.
-      Address target = index < num_stubs ? stub_targets[index] : base + offset;
+      Address target =
+          index < num_runtime_slots ? stub_targets[index] : base + offset;
       jtasm.EmitFarJumpSlot(target);
       offset += kFarJumpTableSlotSize;
       DCHECK_EQ(offset, jtasm.pc_offset());
@@ -142,13 +145,19 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
     FlushInstructionCache(base, table_size);
   }
 
-  static void PatchJumpTableSlot(Address base, uint32_t slot_index,
-                                 Address new_target) {
-    Address slot = base + JumpSlotIndexToOffset(slot_index);
-    JumpTableAssembler jtasm(slot);
-    jtasm.EmitJumpSlot(new_target);
+  static void PatchJumpTableSlot(Address jump_table_slot,
+                                 Address far_jump_table_slot, Address target) {
+    // First, try to patch the jump table slot.
+    JumpTableAssembler jtasm(jump_table_slot);
+    if (!jtasm.EmitJumpSlot(target)) {
+      // If that fails, we need to patch the far jump table slot, and then
+      // update the jump table slot to jump to this far jump table slot.
+      DCHECK_NE(kNullAddress, far_jump_table_slot);
+      JumpTableAssembler::PatchFarJumpSlot(far_jump_table_slot, target);
+      CHECK(jtasm.EmitJumpSlot(far_jump_table_slot));
+    }
     jtasm.NopBytes(kJumpTableSlotSize - jtasm.pc_offset());
-    FlushInstructionCache(slot, kJumpTableSlotSize);
+    FlushInstructionCache(jump_table_slot, kJumpTableSlotSize);
   }
 
  private:
@@ -223,10 +232,16 @@ class V8_EXPORT_PRIVATE JumpTableAssembler : public MacroAssembler {
   void EmitLazyCompileJumpSlot(uint32_t func_index,
                                Address lazy_compile_target);
 
-  void EmitJumpSlot(Address target);
+  // Returns {true} if the jump fits in the jump table slot, {false} otherwise.
+  bool EmitJumpSlot(Address target);
 
+  // Initially emit a far jump slot.
   void EmitFarJumpSlot(Address target);
 
+  // Patch an existing far jump slot, and make sure that this update eventually
+  // becomes available to all execution units that might execute this code.
+  static void PatchFarJumpSlot(Address slot, Address target);
+
   void NopBytes(int bytes);
 };
@@ -1143,11 +1143,26 @@ void NativeModule::PatchJumpTablesLocked(uint32_t func_index, Address target) {
   // The caller must hold the {allocation_mutex_}, thus we fail to lock it here.
   DCHECK(!allocation_mutex_.TryLock());
 
-  uint32_t slot_index = func_index - module_->num_imported_functions;
   for (auto& code_space_data : code_space_data_) {
     DCHECK_IMPLIES(code_space_data.jump_table, code_space_data.far_jump_table);
     if (!code_space_data.jump_table) continue;
-    Address jump_table_base = code_space_data.jump_table->instruction_start();
-    JumpTableAssembler::PatchJumpTableSlot(jump_table_base, slot_index, target);
+    uint32_t slot_index = func_index - module_->num_imported_functions;
+    Address jump_table_slot =
+        code_space_data.jump_table->instruction_start() +
+        JumpTableAssembler::JumpSlotIndexToOffset(slot_index);
+    uint32_t far_jump_table_offset =
+        JumpTableAssembler::FarJumpSlotIndexToOffset(
+            WasmCode::kRuntimeStubCount + slot_index);
+    // Only pass the far jump table start if the far jump table actually has a
+    // slot for this function index (i.e. does not only contain runtime stubs).
+    Address far_jump_table_slot =
+        far_jump_table_offset <
+                code_space_data.far_jump_table->instructions().size()
+            ? code_space_data.far_jump_table->instruction_start() +
+                  far_jump_table_offset
+            : kNullAddress;
+    JumpTableAssembler::PatchJumpTableSlot(jump_table_slot, far_jump_table_slot,
+                                           target);
   }
 }
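A worked example of the offset computation above, with illustrative numbers rather than the real constants: the far jump table lays out one slot per runtime stub first, then one slot per declared function, so function slot_index lives at (WasmCode::kRuntimeStubCount + slot_index) * kFarJumpTableSlotSize. Assuming, say, kRuntimeStubCount = 4, kFarJumpTableSlotSize = 16 and slot_index = 3, the far slot sits at (4 + 3) * 16 = 112 bytes from the far jump table start; if that offset is not smaller than the far jump table's code size, the table contains no function slots and kNullAddress is passed instead.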
@@ -184,8 +184,9 @@ class JumpTablePatcher : public v8::base::Thread {
       TRACE("  patcher %p patch slot " V8PRIxPTR_FMT " to thunk #%d\n", this,
             slot_address, i % 2);
       base::MutexGuard jump_table_guard(jump_table_mutex_);
-      JumpTableAssembler::PatchJumpTableSlot(slot_start_, slot_index_,
-                                             thunks_[i % 2]);
+      JumpTableAssembler::PatchJumpTableSlot(
+          slot_start_ + JumpTableAssembler::JumpSlotIndexToOffset(slot_index_),
+          kNullAddress, thunks_[i % 2]);
     }
     TRACE("Patcher %p is stopping ...\n", this);
   }
@@ -242,8 +243,9 @@ TEST(JumpTablePatchingStress) {
     std::vector<std::unique_ptr<TestingAssemblerBuffer>> thunk_buffers;
     // Patch the jump table slot to jump to itself. This will later be patched
    // by the patchers.
-    JumpTableAssembler::PatchJumpTableSlot(slot_start, slot,
-                                           slot_start + slot_offset);
+    Address slot_addr =
+        slot_start + JumpTableAssembler::JumpSlotIndexToOffset(slot);
+    JumpTableAssembler::PatchJumpTableSlot(slot_addr, kNullAddress, slot_addr);
     // For each patcher, generate two thunks where this patcher can emit code
     // which finally jumps back to {slot} in the jump table.
     std::vector<Address> patcher_thunks;