linux: Add support for getrandom vDSO

Linux 6.11 provides getrandom() in the vDSO. It operates on a
thread-local opaque state allocated with mmap, using flags specified by
the vDSO.
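
For reference, the vDSO calling convention looks roughly like the sketch
below. The parameter-query call (NULL buffer, state_len of ~0UL) and the
struct layout appear in the diff further down; the helper name and the
one-shot use of mmap here are illustrative only.

  #include <stddef.h>
  #include <stdint.h>
  #include <sys/mman.h>
  #include <sys/types.h>

  /* Mirrors the kernel's vgetrandom_opaque_params (linux/random.h).  */
  struct vgetrandom_opaque_params
  {
    uint32_t size_of_opaque_state;
    uint32_t mmap_prot;
    uint32_t mmap_flags;
    uint32_t reserved[13];
  };

  /* Pointer to __vdso_getrandom (or __kernel_getrandom) from the vDSO.  */
  typedef ssize_t (*vgetrandom_fn) (void *, size_t, unsigned int,
                                    void *, size_t);

  /* Hypothetical helper: query the allocation parameters, map one opaque
     state, and use it for a single call.  glibc instead caches one state
     per thread, as the code in this commit shows.  */
  static ssize_t
  vgetrandom_once (vgetrandom_fn vgetrandom, void *buf, size_t len)
  {
    struct vgetrandom_opaque_params params;
    /* A NULL buffer and a state_len of ~0UL ask the vDSO for the mmap
       parameters instead of producing random bytes.  */
    if (vgetrandom (NULL, 0, 0, &params, ~0UL) != 0)
      return -1;
    void *state = mmap (NULL, params.size_of_opaque_state, params.mmap_prot,
                        params.mmap_flags, -1, 0);
    if (state == MAP_FAILED)
      return -1;
    return vgetrandom (buf, len, 0, state, params.size_of_opaque_state);
  }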

Multiple states are allocated at once, as many as fit into a page, and
these are held in an array of available states to be doled out to each
thread upon first use, and recycled when a thread terminates. As these
states run low, more are allocated.
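
The sizing arithmetic amounts to roughly the following (a simplified
sketch of vgetrandom_get_state_alloc() in this commit, assuming the
state size reported by the vDSO never exceeds a page):

  #include <stddef.h>

  /* How many opaque states a freshly mmap'ed block provides: one state
     per CPU as the starting heuristic, rounded up to whole pages, with
     states never straddling a page boundary.  */
  static size_t
  states_per_block (size_t page_size, size_t state_size, size_t nprocs)
  {
    size_t block_size = (nprocs * state_size + page_size - 1)
                        & ~(page_size - 1);
    return (page_size / state_size) * (block_size / page_size);
  }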

To make this procedure async-signal-safe, a simple guard is used in the
LSB of the opaque state address, falling back to the syscall if there's
reentrancy contention.
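
The guard amounts to tagging the least-significant bit of the saved
pointer, roughly as sketched below. The RESERVE/RELEASE/IS_RESERVED
macros appear verbatim in the implementation; the helper wrapping them
is illustrative.

  #include <stdbool.h>
  #include <stdint.h>

  #define RESERVE_PTR(p)     ((void *) ((uintptr_t) (p) | 1UL))
  #define RELEASE_PTR(p)     ((void *) ((uintptr_t) (p) & ~1UL))
  #define IS_RESERVED_PTR(p) (!!((uintptr_t) (p) & 1UL))

  /* Illustrative helper: try to claim the thread's opaque state slot.
     If the low bit is already set, a getrandom call on this thread is in
     progress (we were re-entered from a signal handler), so the caller
     must fall back to the syscall.  The real code operates on
     self->getrandom_buf with READ_ONCE/WRITE_ONCE.  */
  static bool
  try_reserve_state (void **slot, void **state)
  {
    void *cur = *slot;
    if (IS_RESERVED_PTR (cur))
      return false;
    *slot = RESERVE_PTR (cur);
    *state = cur;
    return true;
  }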

Also, _Fork() is handled by blocking signals during opaque state
allocation (so _Fork() always sees a consistent state even if it
interrupts a getrandom() call) and by iterating over the thread stack
cache in reclaim_stacks. Each opaque state is either in the free states
list (grnd_alloc.states) or allocated to a running thread.

Cancellation is handled by always calling the vDSO with GRND_NONBLOCK
and falling back to the cancellable syscall if the kernel returns EAGAIN
(would block). Since getrandom is not defined by POSIX and cancellation
is supported as an extension, cancellation is handled as 'may occur'
instead of 'shall occur' [1], meaning that if the vDSO does not block
(the expected behavior) getrandom will not act as a cancellation entry
point. This avoids a pthread_testcancel call on the fast path (unlike
'shall occur' functions such as sem_wait()).
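
Put differently, the fast path behaves roughly like the sketch below.
This is hedged: the real logic lives in getrandom_vdso() in this commit
and uses SYSCALL_CANCEL rather than plain syscall(); the vDSO reports
errors as negative errno values, like a raw syscall.

  #include <errno.h>
  #include <sys/random.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  typedef ssize_t (*vgetrandom_fn) (void *, size_t, unsigned int,
                                    void *, size_t);

  static ssize_t
  getrandom_may_cancel (vgetrandom_fn vgetrandom, void *state,
                        size_t state_size, void *buf, size_t len,
                        unsigned int flags)
  {
    /* Never block inside the vDSO: always pass GRND_NONBLOCK.  */
    ssize_t ret = vgetrandom (buf, len, flags | GRND_NONBLOCK,
                              state, state_size);
    if (ret == -EAGAIN && !(flags & GRND_NONBLOCK))
      /* The RNG is not yet initialized (early boot); only this slow path
         may block, and only it acts as a cancellation point.  */
      return syscall (SYS_getrandom, buf, len, flags);
    if (ret < 0)
      {
        errno = (int) -ret;
        return -1;
      }
    return ret;
  }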

It is currently enabled for x86_64, which is available in Linux 6.11,
and aarch64, powerpc32, powerpc64, loongarch64, and s390x, which are
available in Linux 6.12.

Link: https://pubs.opengroup.org/onlinepubs/9799919799/nframe.html [1]
Co-developed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Tested-by: Xi Ruoyao <xry111@xry111.site>
Author:    Jason A. Donenfeld  2024-09-18 16:01:22 +02:00
Committer: Adhemerval Zanella
Parent:    5f62cf88c4
Commit:    24d2a0a474
20 changed files with 376 additions and 7 deletions

View File

@ -1,8 +1,12 @@
#ifndef _SYS_RANDOM_H
#include <stdlib/sys/random.h>
#include_next <sys/random.h>
# ifndef _ISOMAC
# include <stdbool.h>
extern ssize_t __getrandom (void *__buffer, size_t __length,
unsigned int __flags) __wur;
libc_hidden_proto (__getrandom)

View File

@ -3140,8 +3140,8 @@ static void
tcache_key_initialize (void)
{
/* We need to use the _nostatus version here, see BZ 29624. */
if (__getrandom_nocancel_nostatus (&tcache_key, sizeof(tcache_key),
GRND_NONBLOCK)
if (__getrandom_nocancel_nostatus_direct (&tcache_key, sizeof(tcache_key),
GRND_NONBLOCK)
!= sizeof (tcache_key))
{
tcache_key = random_bits ();

View File

@ -132,6 +132,8 @@ get_cached_stack (size_t *sizep, void **memp)
__libc_lock_init (result->exit_lock);
memset (&result->tls_state, 0, sizeof result->tls_state);
result->getrandom_buf = NULL;
/* Clear the DTV. */
dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)

View File

@ -404,6 +404,9 @@ struct pthread
/* Used on strsignal. */
struct tls_internal_t tls_state;
/* getrandom vDSO per-thread opaque state. */
void *getrandom_buf;
/* rseq area registered with the kernel. Use a custom definition
here to isolate from kernel struct rseq changes. The
implementation of sched_getcpu needs access to the cpu_id field;

View File

@ -38,6 +38,7 @@
#include <version.h>
#include <clone_internal.h>
#include <futex-internal.h>
#include <sys/random.h>
#include <shlib-compat.h>
@ -549,6 +550,10 @@ start_thread (void *arg)
}
#endif
/* Release the vDSO getrandom per-thread buffer with all signals blocked,
to avoid creating a new free-state block during thread release. */
__getrandom_vdso_release (pd);
if (!pd->user_stack)
advise_stack_range (pd->stackblock, pd->stackblock_size, (uintptr_t) pd,
pd->guardsize);

View File

@ -51,7 +51,9 @@
__fcntl64 (fd, cmd, __VA_ARGS__)
#define __getrandom_nocancel(buf, size, flags) \
__getrandom (buf, size, flags)
#define __getrandom_nocancel_nostatus(buf, size, flags) \
#define __getrandom_nocancel_direct(buf, size, flags) \
__getrandom (buf, size, flags)
#define __getrandom_nocancel_nostatus_direct(buf, size, flags) \
__getrandom (buf, size, flags)
#define __poll_infinity_nocancel(fds, nfds) \
__poll (fds, nfds, -1)

View File

@ -79,7 +79,7 @@ __typeof (__fcntl) __fcntl_nocancel;
/* Non cancellable getrandom syscall that does not also set errno in case of
failure. */
static inline ssize_t
__getrandom_nocancel_nostatus (void *buf, size_t buflen, unsigned int flags)
__getrandom_nocancel_nostatus_direct (void *buf, size_t buflen, unsigned int flags)
{
int save_errno = errno;
ssize_t r = __getrandom (buf, buflen, flags);
@ -90,6 +90,8 @@ __getrandom_nocancel_nostatus (void *buf, size_t buflen, unsigned int flags)
#define __getrandom_nocancel(buf, size, flags) \
__getrandom (buf, size, flags)
#define __getrandom_nocancel_direct(buf, size, flags) \
__getrandom (buf, size, flags)
#define __poll_infinity_nocancel(fds, nfds) \
__poll (fds, nfds, -1)

View File

@ -18,6 +18,7 @@
#include <arch-fork.h>
#include <pthreadP.h>
#include <sys/random.h>
pid_t
_Fork (void)
@ -43,6 +44,7 @@ _Fork (void)
self->robust_head.list = &self->robust_head;
INTERNAL_SYSCALL_CALL (set_robust_list, &self->robust_head,
sizeof (struct robust_list_head));
call_function_static_weak (__getrandom_fork_subprocess);
}
return pid;
}

View File

@ -26,6 +26,7 @@
#include <mqueue.h>
#include <pthreadP.h>
#include <sysdep.h>
#include <sys/random.h>
static inline void
fork_system_setup (void)
@ -46,6 +47,7 @@ fork_system_setup_after_fork (void)
call_function_static_weak (__mq_notify_fork_subprocess);
call_function_static_weak (__timer_fork_subprocess);
call_function_static_weak (__getrandom_fork_subprocess);
}
/* In case of a fork() call the memory allocation in the child will be
@ -128,9 +130,19 @@ reclaim_stacks (void)
curp->specific_used = true;
}
}
call_function_static_weak (__getrandom_reset_state, curp);
}
}
/* Also reset stale getrandom states for user stack threads. */
list_for_each (runp, &GL (dl_stack_user))
{
struct pthread *curp = list_entry (runp, struct pthread, list);
if (curp != self)
call_function_static_weak (__getrandom_reset_state, curp);
}
/* Add the stack of all running threads to the cache. */
list_splice (&GL (dl_stack_used), &GL (dl_stack_cache));

View File

@ -164,6 +164,7 @@
# define HAVE_CLOCK_GETRES64_VSYSCALL "__kernel_clock_getres"
# define HAVE_CLOCK_GETTIME64_VSYSCALL "__kernel_clock_gettime"
# define HAVE_GETTIMEOFDAY_VSYSCALL "__kernel_gettimeofday"
# define HAVE_GETRANDOM_VSYSCALL "__kernel_getrandom"
# define HAVE_CLONE3_WRAPPER 1

View File

@ -66,6 +66,18 @@ PROCINFO_CLASS int (*_dl_vdso_clock_getres) (clockid_t,
PROCINFO_CLASS int (*_dl_vdso_clock_getres_time64) (clockid_t,
struct __timespec64 *) RELRO;
# endif
# ifdef HAVE_GETRANDOM_VSYSCALL
PROCINFO_CLASS ssize_t (*_dl_vdso_getrandom) (void *buffer, size_t len,
unsigned int flags, void *state,
size_t state_len) RELRO;
/* These values are initialized at load time by calling
_dl_vdso_getrandom with a special argument. 'state_size' is the size of
the per-thread opaque state, allocated with mmap using the 'mmap_prot'
and 'mmap_flags' values. */
PROCINFO_CLASS uint32_t _dl_vdso_getrandom_state_size RELRO;
PROCINFO_CLASS uint32_t _dl_vdso_getrandom_mmap_prot RELRO;
PROCINFO_CLASS uint32_t _dl_vdso_getrandom_mmap_flags RELRO;
# endif
/* PowerPC specific ones. */
# ifdef HAVE_GET_TBFREQ

View File

@ -19,6 +19,10 @@
#ifndef _DL_VDSO_INIT_H
#define _DL_VDSO_INIT_H
#ifdef HAVE_GETRANDOM_VSYSCALL
# include <getrandom_vdso.h>
#endif
/* Initialize the VDSO functions pointers. */
static inline void __attribute__ ((always_inline))
setup_vdso_pointers (void)
@ -50,6 +54,19 @@ setup_vdso_pointers (void)
#ifdef HAVE_RISCV_HWPROBE
GLRO(dl_vdso_riscv_hwprobe) = dl_vdso_vsym (HAVE_RISCV_HWPROBE);
#endif
#ifdef HAVE_GETRANDOM_VSYSCALL
GLRO(dl_vdso_getrandom) = dl_vdso_vsym (HAVE_GETRANDOM_VSYSCALL);
if (GLRO(dl_vdso_getrandom) != NULL)
{
struct vgetrandom_opaque_params params;
if (GLRO(dl_vdso_getrandom) (NULL, 0, 0, &params, ~0UL) == 0)
{
GLRO(dl_vdso_getrandom_state_size) = params.size_of_opaque_state;
GLRO(dl_vdso_getrandom_mmap_prot) = params.mmap_prot;
GLRO(dl_vdso_getrandom_mmap_flags) = params.mmap_flags;
}
}
#endif
}
#endif

View File

@ -21,12 +21,247 @@
#include <unistd.h>
#include <sysdep-cancel.h>
static inline ssize_t
getrandom_syscall (void *buffer, size_t length, unsigned int flags,
bool cancel)
{
return cancel
? SYSCALL_CANCEL (getrandom, buffer, length, flags)
: INLINE_SYSCALL_CALL (getrandom, buffer, length, flags);
}
#ifdef HAVE_GETRANDOM_VSYSCALL
# include <getrandom_vdso.h>
# include <ldsodefs.h>
# include <libc-lock.h>
# include <list.h>
# include <setvmaname.h>
# include <sys/mman.h>
# include <sys/sysinfo.h>
# include <tls-internal.h>
# define ALIGN_PAGE(p) PTR_ALIGN_UP (p, GLRO (dl_pagesize))
# define READ_ONCE(p) (*((volatile typeof (p) *) (&(p))))
# define WRITE_ONCE(p, v) (*((volatile typeof (p) *) (&(p))) = (v))
# define RESERVE_PTR(p) ((void *) ((uintptr_t) (p) | 1UL))
# define RELEASE_PTR(p) ((void *) ((uintptr_t) (p) & ~1UL))
# define IS_RESERVED_PTR(p) (!!((uintptr_t) (p) & 1UL))
static struct
{
__libc_lock_define (, lock);
void **states; /* Queue of opaque states allocated with the
kernel-provided flags and used in the getrandom vDSO call. */
size_t len; /* Number of available free states in the queue. */
size_t total; /* Number of states allocated from the kernel. */
size_t cap; /* Total number of states that 'states' can hold before
needing to be resized. */
} grnd_alloc = {
.lock = LLL_LOCK_INITIALIZER
};
static bool
vgetrandom_get_state_alloc (void)
{
size_t num = __get_nprocs (); /* Just a decent heuristic. */
size_t block_size = ALIGN_PAGE (num * GLRO(dl_vdso_getrandom_state_size));
num = (GLRO (dl_pagesize) / GLRO(dl_vdso_getrandom_state_size)) *
(block_size / GLRO (dl_pagesize));
void *block = __mmap (NULL, block_size, GLRO(dl_vdso_getrandom_mmap_prot),
GLRO(dl_vdso_getrandom_mmap_flags), -1, 0);
if (block == MAP_FAILED)
return false;
__set_vma_name (block, block_size, " glibc: getrandom");
if (grnd_alloc.total + num > grnd_alloc.cap)
{
/* Use a new mmap instead of trying to mremap. This avoids a
potential multithreaded fork issue where fork is called just after
mremap returns but before the result is assigned to grnd_alloc.states,
thus making its value invalid in the child. */
void *old_states = grnd_alloc.states;
size_t old_states_size = ALIGN_PAGE (sizeof (*grnd_alloc.states) *
grnd_alloc.total + num);
size_t states_size;
if (grnd_alloc.states == NULL)
states_size = old_states_size;
else
states_size = ALIGN_PAGE (sizeof (*grnd_alloc.states)
* grnd_alloc.cap);
void **states = __mmap (NULL, states_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (states == MAP_FAILED)
{
__munmap (block, block_size);
return false;
}
/* Atomically replace the old state, so if a fork happens the child
process will see a consistent free state buffer. The size might
not be updated, but it does not really matter since the buffer is
always increased. */
atomic_store_relaxed (&grnd_alloc.states, states);
if (old_states != NULL)
__munmap (old_states, old_states_size);
__set_vma_name (states, states_size, " glibc: getrandom states");
grnd_alloc.cap = states_size / sizeof (*grnd_alloc.states);
}
for (size_t i = 0; i < num; ++i)
{
/* States should not straddle a page. */
if (((uintptr_t) block & (GLRO (dl_pagesize) - 1)) +
GLRO(dl_vdso_getrandom_state_size) > GLRO (dl_pagesize))
block = ALIGN_PAGE (block);
grnd_alloc.states[i] = block;
block += GLRO(dl_vdso_getrandom_state_size);
}
grnd_alloc.len = num;
grnd_alloc.total += num;
return true;
}
/* Allocate an opaque state for vgetrandom. If the grnd_alloc does not have
any, mmap() another page of them using the vgetrandom parameters. */
static void *
vgetrandom_get_state (void)
{
void *state = NULL;
/* The signal blocking avoids the potential issue of _Fork() (which is
async-signal-safe) being called with the lock taken. The function is
called only once during a thread's lifetime, so the overhead should be
minimal. */
internal_sigset_t set;
internal_signal_block_all (&set);
__libc_lock_lock (grnd_alloc.lock);
if (grnd_alloc.len > 0 || vgetrandom_get_state_alloc ())
state = grnd_alloc.states[--grnd_alloc.len];
__libc_lock_unlock (grnd_alloc.lock);
internal_signal_restore_set (&set);
return state;
}
/* Call getrandom via the vDSO. Fall back to the syscall if the vDSO is
not present, if the call is reentrant, if the opaque state cannot be
allocated, or if the kernel would block and the caller did not pass
GRND_NONBLOCK. */
static ssize_t
getrandom_vdso (void *buffer, size_t length, unsigned int flags, bool cancel)
{
if (GLRO (dl_vdso_getrandom_state_size) == 0)
return getrandom_syscall (buffer, length, flags, cancel);
struct pthread *self = THREAD_SELF;
/* If the LSB of getrandom_buf is set, then this function is already being
called, and we have a reentrant call from a signal handler. In this case
fall back to the syscall. */
void *state = READ_ONCE (self->getrandom_buf);
if (IS_RESERVED_PTR (state))
return getrandom_syscall (buffer, length, flags, cancel);
WRITE_ONCE (self->getrandom_buf, RESERVE_PTR (state));
bool r = false;
if (state == NULL)
{
state = vgetrandom_get_state ();
if (state == NULL)
goto out;
}
/* Since the vDSO call does not go through the cancellation bridge
(__syscall_cancel_arch), use GRND_NONBLOCK so there is no potential
unbounded blocking in the kernel. Blocking should be rare, happening
only at system startup before the RNG is initialized. */
ssize_t ret = GLRO (dl_vdso_getrandom) (buffer,
length,
flags | GRND_NONBLOCK,
state,
GLRO(dl_vdso_getrandom_state_size));
if (INTERNAL_SYSCALL_ERROR_P (ret))
{
/* Fall back to the syscall if the kernel would block. */
int err = INTERNAL_SYSCALL_ERRNO (ret);
if (err == EAGAIN && !(flags & GRND_NONBLOCK))
goto out;
__set_errno (err);
ret = -1;
}
r = true;
out:
WRITE_ONCE (self->getrandom_buf, state);
return r ? ret : getrandom_syscall (buffer, length, flags, cancel);
}
#endif
/* Re-add the state from CURP to the free list. */
void
__getrandom_reset_state (struct pthread *curp)
{
#ifdef HAVE_GETRANDOM_VSYSCALL
if (grnd_alloc.states == NULL || curp->getrandom_buf == NULL)
return;
grnd_alloc.states[grnd_alloc.len++] = RELEASE_PTR (curp->getrandom_buf);
curp->getrandom_buf = NULL;
#endif
}
/* Called when a thread terminates, and adds its random buffer back into the
allocator pool for use in a future thread. */
void
__getrandom_vdso_release (struct pthread *curp)
{
#ifdef HAVE_GETRANDOM_VSYSCALL
if (curp->getrandom_buf == NULL)
return;
__libc_lock_lock (grnd_alloc.lock);
grnd_alloc.states[grnd_alloc.len++] = curp->getrandom_buf;
__libc_lock_unlock (grnd_alloc.lock);
#endif
}
/* Reset the internal lock state in case another thread was holding the
lock while this thread called fork. Stale thread states are handled by
reclaim_stacks, which calls __getrandom_reset_state on each thread. */
void
__getrandom_fork_subprocess (void)
{
#ifdef HAVE_GETRANDOM_VSYSCALL
grnd_alloc.lock = LLL_LOCK_INITIALIZER;
#endif
}
ssize_t
__getrandom_nocancel (void *buffer, size_t length, unsigned int flags)
{
#ifdef HAVE_GETRANDOM_VSYSCALL
return getrandom_vdso (buffer, length, flags, false);
#else
return getrandom_syscall (buffer, length, flags, false);
#endif
}
/* Write up to LENGTH bytes of randomness starting at BUFFER.
Return the number of bytes written, or -1 on error. */
ssize_t
__getrandom (void *buffer, size_t length, unsigned int flags)
{
return SYSCALL_CANCEL (getrandom, buffer, length, flags);
#ifdef HAVE_GETRANDOM_VSYSCALL
return getrandom_vdso (buffer, length, flags, true);
#else
return getrandom_syscall (buffer, length, flags, true);
#endif
}
libc_hidden_def (__getrandom)
weak_alias (__getrandom, getrandom)

View File

@ -0,0 +1,36 @@
/* Linux getrandom vDSO support.
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _GETRANDOM_VDSO_H
#define _GETRANDOM_VDSO_H
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
/* Used to query the vDSO for the required mmap flags and the opaque
per-thread state size. Defined by linux/random.h. */
struct vgetrandom_opaque_params
{
uint32_t size_of_opaque_state;
uint32_t mmap_prot;
uint32_t mmap_flags;
uint32_t reserved[13];
};
#endif

View File

@ -0,0 +1,29 @@
/* Internal definitions for Linux getrandom implementation.
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _LINUX_SYS_RANDOM_H
#define _LINUX_SYS_RANDOM_H
# ifndef _ISOMAC
# include <pthreadP.h>
extern void __getrandom_fork_subprocess (void) attribute_hidden;
extern void __getrandom_vdso_release (struct pthread *curp) attribute_hidden;
extern void __getrandom_reset_state (struct pthread *curp) attribute_hidden;
# endif
#endif

View File

@ -119,6 +119,7 @@
#define HAVE_CLOCK_GETTIME64_VSYSCALL "__vdso_clock_gettime"
#define HAVE_GETTIMEOFDAY_VSYSCALL "__vdso_gettimeofday"
#define HAVE_GETCPU_VSYSCALL "__vdso_getcpu"
#define HAVE_GETRANDOM_VSYSCALL "__vdso_getrandom"
#define HAVE_CLONE3_WRAPPER 1

View File

@ -27,6 +27,7 @@
#include <sys/syscall.h>
#include <sys/wait.h>
#include <time.h>
#include <sys/random.h>
/* Non cancellable open syscall. */
__typeof (open) __open_nocancel;
@ -84,15 +85,17 @@ __writev_nocancel_nostatus (int fd, const struct iovec *iov, int iovcnt)
}
static inline ssize_t
__getrandom_nocancel (void *buf, size_t buflen, unsigned int flags)
__getrandom_nocancel_direct (void *buf, size_t buflen, unsigned int flags)
{
return INLINE_SYSCALL_CALL (getrandom, buf, buflen, flags);
}
__typeof (getrandom) __getrandom_nocancel attribute_hidden;
/* Non cancellable getrandom syscall that does not also set errno in case of
failure. */
static inline ssize_t
__getrandom_nocancel_nostatus (void *buf, size_t buflen, unsigned int flags)
__getrandom_nocancel_nostatus_direct (void *buf, size_t buflen, unsigned int flags)
{
return INTERNAL_SYSCALL_CALL (getrandom, buf, buflen, flags);
}

View File

@ -223,5 +223,6 @@
#define HAVE_TIME_VSYSCALL "__kernel_time"
#define HAVE_GETTIMEOFDAY_VSYSCALL "__kernel_gettimeofday"
#define HAVE_GET_TBFREQ "__kernel_get_tbfreq"
#define HAVE_GETRANDOM_VSYSCALL "__kernel_getrandom"
#endif /* _LINUX_POWERPC_SYSDEP_H */

View File

@ -72,6 +72,7 @@
#ifdef __s390x__
#define HAVE_CLOCK_GETRES64_VSYSCALL "__kernel_clock_getres"
#define HAVE_CLOCK_GETTIME64_VSYSCALL "__kernel_clock_gettime"
#define HAVE_GETRANDOM_VSYSCALL "__kernel_getrandom"
#else
#define HAVE_CLOCK_GETRES_VSYSCALL "__kernel_clock_getres"
#define HAVE_CLOCK_GETTIME_VSYSCALL "__kernel_clock_gettime"

View File

@ -376,6 +376,7 @@
# define HAVE_TIME_VSYSCALL "__vdso_time"
# define HAVE_GETCPU_VSYSCALL "__vdso_getcpu"
# define HAVE_CLOCK_GETRES64_VSYSCALL "__vdso_clock_getres"
# define HAVE_GETRANDOM_VSYSCALL "__vdso_getrandom"
# define HAVE_CLONE3_WRAPPER 1