forkfd: implement vfork(2)-like support on Linux

fork() works by implementing Copy-On-Write for all pages that either the
parent or the child process write to. So if the parent process continues
running while the child is between fork(2) and execve(2), then it will
keep causing page faults and requiring the OS to duplicate those pages,
which may be expensive (page table updates, TLB flushes, etc.). This
problem is aggravated if the parent process is multithreaded, as the
simple act of running in the parent makes those threads write to their
stacks, and each such write triggers a page fault.

The BSD solution for that was vfork(), which has two differences in
behavior: (1) it blocks the parent from running and (2) it shares memory
with it. But it's always been tricky, so POSIX.1-2001 deprecated it and
2008 removed its definition completely. Still, it is available somewhat
widely, and on Linux that can be achieved with clone(2) and the
CLONE_VFORK and CLONE_VM flags, for those two behaviors respectively.

Because of (2), we can't return from the forkfd() function in the child
(as that would trash the stack in the parent process), so to implement
this functionality vforkfd() adds a callback of the same signature as
glibc's clone(2) wrapper (something that hadn't occurred to me when we
attempted to use CLONE_VFORK last time).

On Linux, (1) is no problem, as clone(2) has native forkfd support. But
on other OSes, forkfd() requires the parent to run before the child
execve()s, in order to save the child PID in the list of children we're
going to handle SIGCHLD for in a non-racy way. Investigating if it is
possible to use vfork() anyway is left as an exercise for the reader.

Matching OpenDCDiag pull request:
https://github.com/opendcdiag/opendcdiag/pull/94

Pick-to: 6.4
Fixes: QTBUG-104493
Change-Id: Id0fb9ab0089845ee8843fffd16fa63c7c6f7dd1c
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Reviewed-by: Fabian Kosmale <fabian.kosmale@qt.io>
Reviewed-by: Milian Wolff <milian.wolff@kdab.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Thiago Macieira 2022-06-20 10:18:42 -07:00
parent a7e187cf16
commit e1a787a76e
5 changed files with 180 additions and 67 deletions

View File

@ -99,6 +99,7 @@
static int system_has_forkfd(void);
static int system_forkfd(int flags, pid_t *ppid, int *system);
static int system_vforkfd(int flags, pid_t *ppid, int (*)(void *), void *, int *system);
static int system_forkfd_wait(int ffd, struct forkfd_info *info, int ffdwoptions, struct rusage *rusage);
static int disable_fork_fallback(void)
@ -595,46 +596,7 @@ static int create_pipe(int filedes[], int flags)
}
#ifndef FORKFD_NO_FORKFD
/**
* @brief forkfd returns a file descriptor representing a child process
* @return a file descriptor, or -1 in case of failure
*
* forkfd() creates a file descriptor that can be used to be notified of when a
* child process exits. This file descriptor can be monitored using select(2),
* poll(2) or similar mechanisms.
*
* The @a flags parameter can contain the following values ORed to change the
* behaviour of forkfd():
*
* @li @c FFD_NONBLOCK Set the O_NONBLOCK file status flag on the new open file
* descriptor. Using this flag saves extra calls to fcntl(2) to achieve the same
* result.
*
* @li @c FFD_CLOEXEC Set the close-on-exec (FD_CLOEXEC) flag on the new file
* descriptor. You probably want to set this flag, since forkfd() does not work
* if the original parent process dies.
*
* @li @c FFD_USE_FORK Tell forkfd() to actually call fork() instead of a
* different system implementation that may be available. On systems where a
* different implementation is available, its behavior may differ from that of
* fork(), such as not calling the functions registered with pthread_atfork().
* If that's necessary, pass this flag.
*
* The file descriptor returned by forkfd() supports the following operations:
*
* @li read(2) When the child process exits, then the buffer supplied to
* read(2) is used to return information about the status of the child in the
* form of one @c siginfo_t structure. The buffer must be at least
* sizeof(siginfo_t) bytes. The return value of read(2) is the total number of
* bytes read.
*
* @li poll(2), select(2) (and similar) The file descriptor is readable (the
* select(2) readfds argument; the poll(2) POLLIN flag) if the child has exited
* or signalled via SIGCHLD.
*
* @li close(2) When the file descriptor is no longer required it should be closed.
*/
int forkfd(int flags, pid_t *ppid)
static int forkfd_fork_fallback(int flags, pid_t *ppid)
{
Header *header;
ProcessInfo *info;
@ -647,15 +609,6 @@ int forkfd(int flags, pid_t *ppid)
int efd;
#endif
if (disable_fork_fallback())
flags &= ~FFD_USE_FORK;
if ((flags & FFD_USE_FORK) == 0) {
fd = system_forkfd(flags, ppid, &ret);
if (ret || disable_fork_fallback())
return fd;
}
(void) pthread_once(&forkfd_initialization, forkfd_initialize);
info = allocateInfo(&header);
@ -764,6 +717,112 @@ err_free:
freeInfo(header, info);
return -1;
}
/**
* @brief forkfd returns a file descriptor representing a child process
* @return a file descriptor, or -1 in case of failure
*
* forkfd() creates a file descriptor that can be used to be notified of when a
* child process exits. This file descriptor can be monitored using select(2),
* poll(2) or similar mechanisms.
*
* The @a flags parameter can contain the following values ORed to change the
* behaviour of forkfd():
*
* @li @c FFD_NONBLOCK Set the O_NONBLOCK file status flag on the new open file
* descriptor. Using this flag saves extra calls to fcntl(2) to achieve the same
* result.
*
* @li @c FFD_CLOEXEC Set the close-on-exec (FD_CLOEXEC) flag on the new file
* descriptor. You probably want to set this flag, since forkfd() does not work
* if the original parent process dies.
*
* @li @c FFD_USE_FORK Tell forkfd() to actually call fork() instead of a
* different system implementation that may be available. On systems where a
* different implementation is available, its behavior may differ from that of
* fork(), such as not calling the functions registered with pthread_atfork().
* If that's necessary, pass this flag.
*
* The file descriptor returned by forkfd() supports the following operations:
*
* @li read(2) When the child process exits, then the buffer supplied to
* read(2) is used to return information about the status of the child in the
* form of one @c siginfo_t structure. The buffer must be at least
* sizeof(siginfo_t) bytes. The return value of read(2) is the total number of
* bytes read.
*
* @li poll(2), select(2) (and similar) The file descriptor is readable (the
* select(2) readfds argument; the poll(2) POLLIN flag) if the child has exited
* or signalled via SIGCHLD.
*
* @li close(2) When the file descriptor is no longer required it should be closed.
*/
int forkfd(int flags, pid_t *ppid)
{
    /* When disable_fork_fallback() reports true, a request for plain
     * fork() is dropped so that the system implementation gets used. */
    if (disable_fork_fallback())
        flags &= ~FFD_USE_FORK;

    if (!(flags & FFD_USE_FORK)) {
        int handled_by_system;
        int ffd = system_forkfd(flags, ppid, &handled_by_system);

        /* Hand back the system result, success or failure alike, when the
         * system implementation was used or when falling back is not
         * permitted. */
        if (handled_by_system)
            return ffd;
        if (disable_fork_fallback())
            return ffd;
    }

    return forkfd_fork_fallback(flags, ppid);
}
/**
* @brief vforkfd returns a file descriptor representing a child process
* @return a file descriptor, or -1 in case of failure
*
* vforkfd() operates in the same way as forkfd() and the @a flags and @a ppid
* arguments are the same. See the forkfd() documentation for details on the
* possible values and information on the returned file descriptor.
*
* This function does not return @c FFD_CHILD_PROCESS. Instead, the function @a
* childFn is called in the child process with the @a token parameter as
* argument. If that function returns, its return value will be passed to
* _exit(2).
*
* This function differs from forkfd() the same way that vfork() differs from
* fork(): the parent process may be suspended while the child has not yet
* called _exit(2) or execve(2). Additionally, on some systems, the child
* process may share memory with the parent process the same way an auxiliary
* thread would, so extreme care should be employed on what functions the child
* process uses before termination.
*
* The @c FFD_USE_FORK flag retains its behavior as described in the forkfd()
* documentation, including that of actually using fork(2) and no other
* implementation.
*
* Currently, only on Linux will this function have any behavior different from
* forkfd(). In all other systems, it is equivalent to the following code:
*
* @code
* int ffd = forkfd(flags, &pid);
* if (ffd == FFD_CHILD_PROCESS)
* _exit(childFn(token));
* @endcode
*/
int vforkfd(int flags, pid_t *ppid, int (*childFn)(void *), void *token)
{
    int ffd;

    /* FFD_USE_FORK skips the system implementation entirely and goes
     * straight to the fork()-based path below. */
    if (!(flags & FFD_USE_FORK)) {
        int handled_by_system;

        ffd = system_vforkfd(flags, ppid, childFn, token, &handled_by_system);
        if (handled_by_system)
            return ffd;
        if (disable_fork_fallback())
            return ffd;
    }

    /* fork()-based path: honour the callback contract by running childFn
     * in the child and exiting with whatever it returns. */
    ffd = forkfd_fork_fallback(flags, ppid);
    if (ffd == FFD_CHILD_PROCESS)
        _exit(childFn(token));      /* child process */

    return ffd;
}
#endif // FORKFD_NO_FORKFD
#if _POSIX_SPAWN > 0 && !defined(FORKFD_NO_SPAWNFD)
@ -889,3 +948,16 @@ int system_forkfd_wait(int ffd, struct forkfd_info *info, int options, struct ru
return -1;
}
#endif
#ifndef SYSTEM_FORKFD_CAN_VFORK
int system_vforkfd(int flags, pid_t *ppid, int (*childFn)(void *), void *token, int *system)
{
    /* No vfork()-like primitive on this system: do a regular
     * system_forkfd() and invoke the callback ourselves in the child. */
    const int ffd = system_forkfd(flags, ppid, system);

    if (ffd == FFD_CHILD_PROCESS)
        _exit(childFn(token));      /* child process */

    return ffd;
}
#endif
#undef SYSTEM_FORKFD_CAN_VFORK

View File

@ -53,6 +53,7 @@ struct forkfd_info {
};
int forkfd(int flags, pid_t *ppid);
int vforkfd(int flags, pid_t *ppid, int (*childFn)(void *), void *token);
int forkfd_wait4(int ffd, struct forkfd_info *info, int options, struct rusage *rusage);
static inline int forkfd_wait(int ffd, struct forkfd_info *info, struct rusage *rusage)
{

View File

@ -29,6 +29,8 @@
#include "forkfd_atomic.h"
#undef SYSTEM_FORKFD_CAN_VFORK
// in forkfd.c
static int convertForkfdWaitFlagsToWaitFlags(int ffdoptions);
static void convertStatusToForkfdInfo(int status, struct forkfd_info *info);

View File

@ -51,6 +51,8 @@
# define P_PIDFD 3
#endif
#define SYSTEM_FORKFD_CAN_VFORK
// in forkfd.c
static int convertForkfdWaitFlagsToWaitFlags(int ffdoptions);
static void convertStatusToForkfdInfo(int status, struct forkfd_info *info);
@ -131,16 +133,55 @@ int system_has_forkfd()
return ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED) > 0;
}
int system_forkfd(int flags, pid_t *ppid, int *system)
/*
 * Returns the cached state of the system forkfd support, probing for it
 * first if the cached value is still 0 (not yet determined).
 *
 * Callers in this file treat a negative result as "not available";
 * system_has_forkfd() reports support when the stored value is positive.
 * The probe result is written back to system_forkfd_state with relaxed
 * ordering.
 */
static int system_forkfd_availability(void)
{
    int state = ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED);
    if (state == 0) {
        /* not probed yet: detect and remember the answer */
        state = detect_clone_pidfd_support();
        ffd_atomic_store(&system_forkfd_state, state, FFD_ATOMIC_RELAXED);
    }
    return state;
}
/*
 * Applies the caller's FFD_CLOEXEC / FFD_NONBLOCK choices to a pidfd and
 * returns that same descriptor. The fcntl() results are not checked.
 */
static int system_forkfd_pidfd_set_flags(int pidfd, int flags)
{
    const int keep_cloexec = (flags & FFD_CLOEXEC) != 0;
    const int want_nonblock = (flags & FFD_NONBLOCK) != 0;

    /* pidfd defaults to O_CLOEXEC, so only act when it must be cleared */
    if (!keep_cloexec)
        fcntl(pidfd, F_SETFD, 0);

    if (want_nonblock) {
        int status_flags = fcntl(pidfd, F_GETFL);
        fcntl(pidfd, F_SETFL, status_flags | O_NONBLOCK);
    }

    return pidfd;
}
/*
 * Linux implementation of vforkfd(): starts childFn(token) in a new process
 * through clone(2) and returns a pidfd for it.
 *
 * *system is set to 0 when the system forkfd support is unavailable (the
 * negative availability state is then returned), and to 1 otherwise. On
 * clone() failure the negative clone() result is returned. On success the
 * child PID is stored in *ppid (if non-NULL) and the pidfd is returned after
 * the FFD_* flags have been applied to it.
 */
int system_vforkfd(int flags, pid_t *ppid, int (*childFn)(void *), void *token, int *system)
{
    /* Scratch stack handed to clone() for the child to run childFn on */
    __attribute__((aligned(64))) char stack[4096];
    /* CLONE_VFORK | CLONE_VM give the vfork()-like behaviour; SIGCHLD is
     * the termination signal; CLONE_PIDFD asks for the pidfd. */
    const unsigned long clone_flags = CLONE_PIDFD | CLONE_VFORK | CLONE_VM | SIGCHLD;
    int pidfd;
    pid_t child;

    const int availability = system_forkfd_availability();
    *system = availability >= 0;
    if (availability < 0)
        return availability;

    /* The high end of the buffer is passed as the stack pointer; see clone(2) */
    child = clone(childFn, stack + sizeof(stack), clone_flags, token, &pidfd, NULL, NULL);
    if (child < 0)
        return child;

    if (ppid)
        *ppid = child;

    return system_forkfd_pidfd_set_flags(pidfd, flags);
}
int system_forkfd(int flags, pid_t *ppid, int *system)
{
pid_t pid;
int pidfd;
int state = system_forkfd_availability();
if (state < 0) {
*system = 0;
return state;
@ -160,13 +201,7 @@ int system_forkfd(int flags, pid_t *ppid, int *system)
}
/* parent process */
if ((flags & FFD_CLOEXEC) == 0) {
/* pidfd defaults to O_CLOEXEC */
fcntl(pidfd, F_SETFD, 0);
}
if (flags & FFD_NONBLOCK)
fcntl(pidfd, F_SETFL, fcntl(pidfd, F_GETFL) | O_NONBLOCK);
return pidfd;
return system_forkfd_pidfd_set_flags(pidfd, flags);
}
int system_forkfd_wait(int ffd, struct forkfd_info *info, int ffdoptions, struct rusage *rusage)

View File

@ -452,6 +452,15 @@ void QProcessPrivate::startProcess()
workingDirPtr = encodedWorkingDirectory.constData();
}
// Start the child.
auto execChild1 = [this, workingDirPtr, &argv, &envp]() {
execChild(workingDirPtr, argv.pointers.get(), envp.pointers.get());
};
auto execChild2 = [](void *lambda) {
static_cast<decltype(execChild1) *>(lambda)->operator()();
return -1;
};
int ffdflags = FFD_CLOEXEC;
// QTBUG-86285
@ -460,7 +469,7 @@ void QProcessPrivate::startProcess()
#endif
pid_t childPid;
forkfd = ::forkfd(ffdflags , &childPid);
forkfd = ::vforkfd(ffdflags , &childPid, execChild2, &execChild1);
int lastForkErrno = errno;
if (forkfd == -1) {
@ -475,12 +484,6 @@ void QProcessPrivate::startProcess()
return;
}
// Start the child.
if (forkfd == FFD_CHILD_PROCESS) {
execChild(workingDirPtr, argv.pointers.get(), envp.pointers.get());
::_exit(-1);
}
pid = qint64(childPid);
Q_ASSERT(pid > 0);