v8/test/cctest/wasm/test-jump-table-assembler.cc
Igor Sheludko 449ece383b Reland "[rwx][mac] Support fast W^X permission switching on Apple Silicon (M1)"
This is a reland of commit 9d31f8663a
There were issues with --future flag implications on M1.

Original change's description:
> [rwx][mac] Support fast W^X permission switching on Apple Silicon (M1)
>
> ... for V8 code space. The feature is currently disabled.
>
> In order to use fast W^X permission switching we must allocate
> executable pages with readable writable executable permissions (RWX).
> However, MacOS on ARM64 ("Apple M1"/Apple Silicon) prohibits further
> permission changing of RWX memory pages. This means that the code page
> headers must be allocated with RWX permissions too because otherwise
> it wouldn't be possible to allocate a large code page over the freed
> regular code page and vice versa.
>
> When enabled, the new machinery works as follows:
>
> 1) when a memory region is reserved for allocating executable pages, the
>    whole region is committed with RWX permissions and then decommitted,
> 2) since reconfiguration of RWX page permissions is not allowed on
>    MacOS on ARM64 ("Apple M1"/Apple Silicon), there must be no attempts
>    to change them,
> 3) the request to set RWX permissions in the executable page region
>    just recommits the pages without changing permissions (see (1), they
>    were already allocated as RWX and then discarded),
> 4) in order to make executable pages inaccessible one must use
>    OS::DiscardSystemPages() instead of OS::DecommitPages() or
>    setting permissions to kNoAccess because the latter two are not
>    allowed by the MacOS (see (2)).
> 5) since code space page headers are allocated as RWX pages it's also
>    necessary to switch between W^X modes when updating the data in the
>    page headers (i.e. when marking, updating stats, wiring pages in
>    lists, etc.). The new CodePageHeaderModificationScope class is used
>    in the respective places. On unrelated configurations it's a no-op.
>
> The fast permission switching can't be used for V8 configuration with
> enabled pointer compression and disabled external code space because
> a) the pointer compression cage has to be reserved with MAP_JIT flag
>    which is too expensive,
> b) in the case of a shared pointer compression cage, if the code range is
>    deleted while the cage is still alive, then any attempt to configure
>    permissions of pages that were previously set to RWX will fail.
>
> This CL also extends the unmapper unit tests with permissions tracking
> for discarded pages.
>
> Bug: v8:12797
> Change-Id: Idb28cbc481306477589eee9962d2e75167d87c61
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3579303
> Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
> Reviewed-by: Clemens Backes <clemensb@chromium.org>
> Reviewed-by: Michael Lippautz <mlippautz@chromium.org>
> Commit-Queue: Igor Sheludko <ishell@chromium.org>
> Cr-Commit-Position: refs/heads/main@{#80238}

Bug: v8:12797
Change-Id: I0fe86666f31bad37d7074e217555c95900d2afba
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3610433
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Reviewed-by: Michael Lippautz <mlippautz@chromium.org>
Commit-Queue: Igor Sheludko <ishell@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80259}
2022-04-28 14:08:11 +00:00

317 lines
12 KiB
C++

// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <bitset>
#include "src/base/utils/random-number-generator.h"
#include "src/codegen/assembler-inl.h"
#include "src/codegen/macro-assembler-inl.h"
#include "src/execution/simulator.h"
#include "src/utils/utils.h"
#include "src/wasm/code-space-access.h"
#include "src/wasm/jump-table-assembler.h"
#include "test/cctest/cctest.h"
#include "test/common/assembler-tester.h"
namespace v8 {
namespace internal {
namespace wasm {
// Debug tracing for this test. Change the `#if 0` below to `#if 1` to forward
// TRACE(...) to PrintF; as written, TRACE(...) expands to nothing.
#if 0
#define TRACE(...) PrintF(__VA_ARGS__)
#else
#define TRACE(...)
#endif
// Shorthand for emitting instructions: `__ Foo(...)` expands to
// `masm.Foo(...)`, so it requires a local assembler variable named {masm}.
#define __ masm.
namespace {
// Stop flag read by the generated thunk code (CompileJumpTableThunk embeds the
// address of this variable). The test writes 0 before starting the runners and
// -1 once they should return.
static volatile int global_stop_bit = 0;
// Number of jump-table slots in the table under test; the test hammers on each
// of them in turn.
constexpr int kJumpTableSlotCount = 128;
// Size in bytes of a jump table holding {kJumpTableSlotCount} slots.
constexpr uint32_t kJumpTableSize =
JumpTableAssembler::SizeForNumberOfSlots(kJumpTableSlotCount);
// This must be a safe commit page size so we pick the largest OS page size that
// V8 is known to support. Arm64 linux can support up to 64k at runtime.
constexpr size_t kThunkBufferSize = 64 * KB;
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_X64
// We need the branches (from CompileJumpTableThunk) to be within near-call
// range of the jump table slots. The address hint to AllocateAssemblerBuffer
// is not reliable enough to guarantee that we can always achieve this with
// separate allocations, so we generate all code in a single
// WasmCodeAllocator::kMaxCodeSpaceSize-sized chunk.
constexpr size_t kAssemblerBufferSize = WasmCodeAllocator::kMaxCodeSpaceSize;
// Number of {kThunkBufferSize}-sized thunk slots that fit in that chunk next to
// the jump table.
constexpr uint32_t kAvailableBufferSlots =
(WasmCodeAllocator::kMaxCodeSpaceSize - kJumpTableSize) / kThunkBufferSize;
// Offset of the first thunk slot within the chunk: the jump table size rounded
// up to a multiple of {kThunkBufferSize}.
constexpr uint32_t kBufferSlotStartOffset =
RoundUp<kThunkBufferSize>(kJumpTableSize);
#else
// On other architectures the main buffer only holds the jump table; every
// thunk gets its own buffer (see AllocateJumpTableThunk), so there are no
// in-buffer thunk slots.
constexpr size_t kAssemblerBufferSize = kJumpTableSize;
constexpr uint32_t kAvailableBufferSlots = 0;
constexpr uint32_t kBufferSlotStartOffset = 0;
#endif
// Returns the start address of a {kThunkBufferSize}-sized memory region into
// which a thunk can subsequently be assembled.
//
// arm64 / x64: picks a random slot of {thunk_slot_buffer} that is not yet
// marked in {used_slots}, marks it, and returns its address. {thunk_buffers}
// is not touched.
// Other architectures: allocates a separate assembler buffer at a random mmap
// hint, appends it to {thunk_buffers} (which then owns it), and returns its
// start. {thunk_slot_buffer} and {used_slots} are ignored.
//
// {jump_target} is accepted for the caller's convenience but is not read here.
Address AllocateJumpTableThunk(
    Address jump_target, byte* thunk_slot_buffer,
    std::bitset<kAvailableBufferSlots>* used_slots,
    std::vector<std::unique_ptr<TestingAssemblerBuffer>>* thunk_buffers) {
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_X64
  // Thunks live in the same WasmCodeAllocator::kMaxCodeSpaceSize-sized buffer
  // as the jump table, so that branches between the two stay within near-call
  // range.
  //
  // Every call must hand out a slot that was never handed out before: callers
  // expect distinct addresses, and a slot is no longer writable once a thunk
  // has been compiled into it.
  base::RandomNumberGenerator* const random =
      CcTest::i_isolate()->random_number_generator();
  // With fewer than half of the slots taken, random probing finishes quickly.
  DCHECK(used_slots->count() < (used_slots->size() / 2));
  int slot_index = random->NextInt(kAvailableBufferSlots);
  while (used_slots->test(slot_index)) {
    slot_index = random->NextInt(kAvailableBufferSlots);
  }
  used_slots->set(slot_index);
  byte* const slot_begin = thunk_slot_buffer + slot_index * kThunkBufferSize;
  return reinterpret_cast<Address>(slot_begin);
#else
  USE(thunk_slot_buffer);
  USE(used_slots);
  thunk_buffers->emplace_back(
      AllocateAssemblerBuffer(kThunkBufferSize, GetRandomMmapAddr()));
  TestingAssemblerBuffer* const fresh_buffer = thunk_buffers->back().get();
  return reinterpret_cast<Address>(fresh_buffer->start());
#endif
}
// Assembles a small routine into the {kThunkBufferSize}-sized region starting
// at {thunk}. The emitted code reads {global_stop_bit}: if the stop condition
// holds it returns to its caller, otherwise it jumps to {jump_target}.
//
// The stop condition is "bit 0 is set" on x64, ia32, arm and arm64, and "the
// loaded value is non-zero" on the remaining architectures.
//
// After emitting, the instruction cache is flushed for the region and, except
// on Darwin/arm64 hosts, the region's permissions are set to read+execute.
void CompileJumpTableThunk(Address thunk, Address jump_target) {
// Assemble straight into the caller-provided memory at {thunk}.
MacroAssembler masm(nullptr, AssemblerOptions{}, CodeObjectRequired::kNo,
ExternalAssemblerBuffer(reinterpret_cast<void*>(thunk),
kThunkBufferSize));
// Bound after the architecture-specific code; target of the "stop" branch.
Label exit;
Register scratch = kReturnRegister0;
// The address of the stop flag is embedded into the generated code as an
// immediate.
Address stop_bit_address = reinterpret_cast<Address>(&global_stop_bit);
#if V8_TARGET_ARCH_X64
__ Move(scratch, stop_bit_address, RelocInfo::NO_INFO);
__ testl(MemOperand(scratch, 0), Immediate(1));
__ j(not_zero, &exit);
__ Jump(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_IA32
__ Move(scratch, Immediate(stop_bit_address, RelocInfo::NO_INFO));
__ test(MemOperand(scratch, 0), Immediate(1));
__ j(not_zero, &exit);
__ jmp(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_ARM
__ mov(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
__ ldr(scratch, MemOperand(scratch, 0));
__ tst(scratch, Operand(1));
__ b(ne, &exit);
__ Jump(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_ARM64
// x16 replaces the default scratch register here. It is first excluded from
// the assembler's scratch-register scope (presumably so that macro
// instructions do not pick it as a temporary -- TODO confirm).
UseScratchRegisterScope temps(&masm);
temps.Exclude(x16);
scratch = x16;
__ Mov(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
// NOTE(review): this looks like a 64-bit load (x-register destination) from
// the address of {global_stop_bit}, which is declared as `int`. Only bit 0 is
// tested afterwards -- confirm that reading past the int is harmless.
__ Ldr(scratch, MemOperand(scratch, 0));
__ Tbnz(scratch, 0, &exit);
__ Mov(scratch, Immediate(jump_target, RelocInfo::NO_INFO));
__ Br(scratch);
#elif V8_TARGET_ARCH_PPC64
__ mov(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
// NOTE(review): LoadU64 is by its name a 64-bit load, but {global_stop_bit}
// is declared as `int`, and the whole loaded value is compared against zero
// below -- confirm that the bytes following the int cannot affect the result.
__ LoadU64(scratch, MemOperand(scratch));
__ cmpi(scratch, Operand::Zero());
__ bne(&exit);
__ mov(scratch, Operand(jump_target, RelocInfo::NO_INFO));
__ Jump(scratch);
#elif V8_TARGET_ARCH_S390X
__ mov(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
// NOTE(review): same 64-bit-load-from-`int` pattern as on PPC64 -- confirm.
__ LoadU64(scratch, MemOperand(scratch));
__ CmpP(scratch, Operand(0));
__ bne(&exit);
__ mov(scratch, Operand(jump_target, RelocInfo::NO_INFO));
__ Jump(scratch);
#elif V8_TARGET_ARCH_MIPS64
__ li(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
__ Lw(scratch, MemOperand(scratch, 0));
__ Branch(&exit, ne, scratch, Operand(zero_reg));
__ Jump(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_LOONG64
__ li(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
__ Ld_w(scratch, MemOperand(scratch, 0));
__ Branch(&exit, ne, scratch, Operand(zero_reg));
__ Jump(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_MIPS
__ li(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
__ lw(scratch, MemOperand(scratch, 0));
__ Branch(&exit, ne, scratch, Operand(zero_reg));
__ Jump(jump_target, RelocInfo::NO_INFO);
#elif V8_TARGET_ARCH_RISCV64
__ li(scratch, Operand(stop_bit_address, RelocInfo::NO_INFO));
__ Lw(scratch, MemOperand(scratch, 0));
__ Branch(&exit, ne, scratch, Operand(zero_reg));
__ Jump(jump_target, RelocInfo::NO_INFO);
#else
#error Unsupported architecture
#endif
// Stop path: return to whoever entered the generated code.
__ bind(&exit);
__ Ret();
// Flush the instruction cache for the freshly written region before it is
// executed.
FlushInstructionCache(thunk, kThunkBufferSize);
#if defined(V8_OS_DARWIN) && defined(V8_HOST_ARCH_ARM64)
// MacOS on arm64 refuses {mprotect} calls to toggle permissions of RWX
// memory. Simply do nothing here, as the space will by default be executable
// and non-writable for the JumpTableRunner.
#else
// Everywhere else, make the thunk region read+execute; the CHECK fails the
// test if the permission change does not succeed.
CHECK(SetPermissions(GetPlatformPageAllocator(), thunk, kThunkBufferSize,
v8::PageAllocator::kReadExecute));
#endif
}
// Thread that calls into generated code at a fixed address (a jump-table slot
// in this test) and finishes once that call returns.
class JumpTableRunner : public v8::base::Thread {
 public:
  // {slot_address} is the entry point to call; {runner_id} only labels the
  // trace output.
  JumpTableRunner(Address slot_address, int runner_id)
      : Thread(Options("JumpTableRunner")),
        slot_address_(slot_address),
        runner_id_(runner_id) {}

  void Run() override {
    // {runner_id_} is otherwise only referenced from TRACE, which expands to
    // nothing unless tracing is enabled.
    USE(runner_id_);
    TRACE("Runner #%d is starting ...\n", runner_id_);
    auto entry_point =
        GeneratedCode<void>::FromAddress(CcTest::i_isolate(), slot_address_);
    entry_point.Call();
    TRACE("Runner #%d is stopping ...\n", runner_id_);
  }

 private:
  Address slot_address_;
  int runner_id_;
};
class JumpTablePatcher : public v8::base::Thread {
public:
JumpTablePatcher(Address slot_start, uint32_t slot_index, Address thunk1,
Address thunk2, base::Mutex* jump_table_mutex)
: Thread(Options("JumpTablePatcher")),
slot_start_(slot_start),
slot_index_(slot_index),
thunks_{thunk1, thunk2},
jump_table_mutex_(jump_table_mutex) {}
void Run() override {
TRACE("Patcher %p is starting ...\n", this);
RwxMemoryWriteScopeForTesting rwx_write_scope;
Address slot_address =
slot_start_ + JumpTableAssembler::JumpSlotIndexToOffset(slot_index_);
// First, emit code to the two thunks.
for (Address thunk : thunks_) {
CompileJumpTableThunk(thunk, slot_address);
}
// Then, repeatedly patch the jump table to jump to one of the two thunks.
constexpr int kNumberOfPatchIterations = 64;
for (int i = 0; i < kNumberOfPatchIterations; ++i) {
TRACE(" patcher %p patch slot " V8PRIxPTR_FMT
" to thunk #%d (" V8PRIxPTR_FMT ")\n",
this, slot_address, i % 2, thunks_[i % 2]);
base::MutexGuard jump_table_guard(jump_table_mutex_);
JumpTableAssembler::PatchJumpTableSlot(
slot_start_ + JumpTableAssembler::JumpSlotIndexToOffset(slot_index_),
kNullAddress, thunks_[i % 2]);
}
TRACE("Patcher %p is stopping ...\n", this);
}
private:
Address slot_start_;
uint32_t slot_index_;
Address thunks_[2];
base::Mutex* jump_table_mutex_;
};
} // namespace
// Stress test for concurrent patching of jump-table slots. For every slot of
// the jump table (so that different offset alignments get exercised):
//   1) The slot is patched to jump to itself, and two thunk regions per
//      patcher are reserved.
//   2) Several runner threads call into the slot. Once the slot points at a
//      thunk, the thunk jumps back into the same slot until
//      {global_stop_bit} signals the end of the round.
//   3) Several patcher threads compile their thunks and repeatedly patch the
//      slot back and forth between them. If patching is racy, chances are
//      high that some runner is executing the slot at that moment.
TEST(JumpTablePatchingStress) {
  constexpr int kNumberOfRunnerThreads = 5;
  constexpr int kNumberOfPatcherThreads = 3;
  constexpr int kNumberOfThunks = 2 * kNumberOfPatcherThreads;
  STATIC_ASSERT(kAssemblerBufferSize >= kJumpTableSize);

  auto buffer = AllocateAssemblerBuffer(kAssemblerBufferSize, nullptr,
                                        JitPermission::kMapAsJittable);
  byte* thunk_slot_buffer = buffer->start() + kBufferSlotStartOffset;
  std::bitset<kAvailableBufferSlots> used_thunk_slots;
  buffer->MakeWritableAndExecutable();
  RwxMemoryWriteScopeForTesting rwx_write_scope;

  // Walking over all slots hits different alignments within the jump table,
  // which adds stress on variable-length ISAs.
  const Address table_start = reinterpret_cast<Address>(buffer->start());
  for (int slot = 0; slot < kJumpTableSlotCount; ++slot) {
    TRACE("Hammering on jump table slot #%d ...\n", slot);
    const Address slot_address =
        table_start + JumpTableAssembler::JumpSlotIndexToOffset(slot);
    std::vector<std::unique_ptr<TestingAssemblerBuffer>> thunk_buffers;
    std::vector<Address> patcher_thunks;

    // Initially the slot jumps to itself; the patchers redirect it later.
    JumpTableAssembler::PatchJumpTableSlot(slot_address, kNullAddress,
                                           slot_address);

    // Reserve two thunk regions per patcher. The patcher emits code into them
    // that eventually jumps back to {slot} in the jump table.
    for (int thunk_index = 0; thunk_index < kNumberOfThunks; ++thunk_index) {
      Address thunk = AllocateJumpTableThunk(
          slot_address, thunk_slot_buffer, &used_thunk_slots, &thunk_buffers);
      ZapCode(thunk, kThunkBufferSize);
      patcher_thunks.push_back(thunk);
      TRACE(" generated jump thunk: " V8PRIxPTR_FMT "\n",
            patcher_thunks.back());
    }

    // Runner threads that will execute the jump table slot concurrently.
    std::list<JumpTableRunner> runners;
    for (int runner_id = 0; runner_id < kNumberOfRunnerThreads; ++runner_id) {
      runners.emplace_back(slot_address, runner_id);
    }

    // Patcher threads that will concurrently generate code and redirect the
    // jump table slot to it.
    std::list<JumpTablePatcher> patchers;
    // Only one patcher should modify the jump table at a time.
    base::Mutex jump_table_mutex;
    for (int patcher_id = 0; patcher_id < kNumberOfPatcherThreads;
         ++patcher_id) {
      const Address first_thunk = patcher_thunks[2 * patcher_id];
      const Address second_thunk = patcher_thunks[2 * patcher_id + 1];
      patchers.emplace_back(table_start, slot, first_thunk, second_thunk,
                            &jump_table_mutex);
    }

    global_stop_bit = 0;  // Signal runners to keep going.
    for (auto& runner : runners) {
      CHECK(runner.Start());
    }
    for (auto& patcher : patchers) {
      CHECK(patcher.Start());
    }
    for (auto& patcher : patchers) {
      patcher.Join();
    }
    global_stop_bit = -1;  // Signal runners to stop.
    for (auto& runner : runners) {
      runner.Join();
    }
  }
}
#undef __
#undef TRACE
} // namespace wasm
} // namespace internal
} // namespace v8