mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-02 17:50:20 +00:00
1a044511a3
This addresses an issue that is present mainly on SMP machines running threaded code. In a typical indirect call or PLT import stub, the target address is loaded first. Then the global pointer is loaded into the PIC register in the delay slot of a branch to the target address. During lazy binding, the target address is a trampoline which transfers to _dl_runtime_resolve(). _dl_runtime_resolve() uses the relocation offset stored in the global pointer and the linkage map stored in the trampoline to find the relocation. Then, the function descriptor is updated. In a multi-threaded application, it is possible for the global pointer to be updated between the load of the target address and the load of the global pointer. When this happens, the relocation offset has been replaced by the new global pointer. The function pointer has probably been updated as well but there is no way to find the address of the function descriptor and to transfer to the target. So, _dl_runtime_resolve() typically crashes. HP-UX addressed this problem by adding an extra pc-relative branch to the trampoline. The descriptor is initially set up to point to the branch. The branch then transfers to the trampoline. This allowed the trampoline code to figure out which descriptor was being used without any modification to user code. I didn't use this approach as it is more complex and changes function pointer canonicalization. The order of loading the target address and global pointer in indirect calls was not consistent with the order used in import stubs. In particular, $$dyncall and some inline versions of it loaded the global pointer first. This was inconsistent with the global pointer being updated first in dl-machine.h. Assuming the accesses are ordered, we want elf_machine_fixup_plt() to store the global pointer first and calls to load it last. Then, the global pointer will be correct when the target function is entered.
However, just to make things more fun, HP added support for out-of-order execution of accesses in PA 2.0. The accesses used by calls are weakly ordered. So, it's possible under some circumstances that a function might be entered with the wrong global pointer. However, HP uses weakly ordered accesses in 64-bit HP-UX, so I assume that loading the global pointer in the delay slot of the branch must work consistently. The basic fix for the race is a combination of modifying user code to preserve the address of the function descriptor in register %r22 and setting the least-significant bit in the relocation offset. The latter was suggested by Carlos as a way to distinguish relocation offsets from global pointer values. Conventionally, %r22 is used as the address of the function descriptor in calls to $$dyncall. So, it wasn't hard to preserve the address in %r22. I have updated gcc trunk and gcc-9 branch to not clobber %r22 in $$dyncall and inline indirect calls. I have also modified the import stubs in binutils trunk and the 2.33 branch to preserve %r22. This required making the stubs one instruction longer but we save one relocation. I also modified binutils to align the .plt section on an 8-byte boundary. This allows descriptors to be updated atomically with a floating-point store. With these changes, _dl_runtime_resolve() can fall back to an alternate mechanism to find the relocation offset when it has been clobbered. There's just one additional instruction in the fast path. I tested the fallback function, _dl_fix_reloc_arg(), by changing the branch to always use the fallback. Old code still runs as it did before. Fixes bug 23296. Reviewed-by: Carlos O'Donell <carlos@redhat.com>
123 lines
4.5 KiB
C
123 lines
4.5 KiB
C
/* Copyright (C) 2003-2020 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
Contributed by Carlos O'Donell <carlos@baldric.uwo.ca>, 2005.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library. If not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include <stdint.h> /* Required for type definitions e.g. uint8_t. */
|
|
|
|
#ifndef _ATOMIC_MACHINE_H
|
|
#define _ATOMIC_MACHINE_H 1
|
|
|
|
/* 8-bit atomic types: exact-width and "fast" variants, signed and
   unsigned, all taken from <stdint.h>.  */
typedef int8_t atomic8_t;
typedef uint8_t uatomic8_t;
typedef int_fast8_t atomic_fast8_t;
typedef uint_fast8_t uatomic_fast8_t;

/* 32-bit atomic types: exact-width and "fast" variants.  */
typedef int32_t atomic32_t;
typedef uint32_t uatomic32_t;
typedef int_fast32_t atomic_fast32_t;
typedef uint_fast32_t uatomic_fast32_t;

/* Pointer-sized atomic types.  */
typedef intptr_t atomicptr_t;
typedef uintptr_t uatomicptr_t;

/* Widest integer atomic types.  */
typedef intmax_t atomic_max_t;
typedef uintmax_t uatomic_max_t;
|
|
|
|
/* Full memory barrier, expressed with the compiler's __sync_synchronize
   builtin.  */
#define atomic_full_barrier() __sync_synchronize ()
|
|
|
|
/* NOTE(review): presumably tells the generic atomic code that 64-bit
   atomic operations are not available on this target -- confirm against
   the generic atomic header.  The size check used by the load/store
   macros in this file accepts only 1, 2 and 4 byte objects.  */
#define __HAVE_64B_ATOMICS 0
|
|
/* NOTE(review): presumably keeps the generic atomic code from mapping
   every operation onto the compiler's __atomic builtins -- confirm
   against the generic atomic header.  This file still uses the
   __atomic_load_n/__atomic_store_n builtins for its own loads and
   stores.  */
#define USE_ATOMIC_COMPILER_BUILTINS 0
|
|
|
|
/* We use the compiler atomic load and store builtins as the generic
   defines are not atomic.  In particular, we need to use compare and
   exchange for stores as the implementation is synthesized.  */
|
|
/* Declared here but not defined in this file.  NOTE(review): presumably
   left undefined on purpose so that a call which survives constant
   folding shows up as a link failure -- confirm.  */
void __atomic_link_error (void);

/* Reject atomic loads and stores on objects whose size is not 1, 2 or
   4 bytes by calling __atomic_link_error.  MEM is a pointer to the
   object; it is only used inside sizeof and is therefore not
   evaluated.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement: it can be used as the body of an if/else
   without stealing the else, and the caller supplies the trailing
   semicolon.  */
#define __atomic_check_size_ls(mem) \
  do \
    { \
      if (sizeof (*(mem)) != 1 \
          && sizeof (*(mem)) != 2 \
          && sizeof (*(mem)) != 4) \
        __atomic_link_error (); \
    } \
  while (0)
|
|
|
|
/* Common body of the atomic_load_* macros below: run the operand size
   check on MEM, then read *MEM with the __atomic_load_n builtin using
   memory order ORDER.  The statement expression yields the loaded
   value.  */
#define __atomic_load_ls(mem, order) \
  ({ \
     __atomic_check_size_ls ((mem)); \
     __atomic_load_n ((mem), (order)); \
  })

/* Atomically load *MEM with relaxed memory order.  */
#define atomic_load_relaxed(mem) \
  __atomic_load_ls ((mem), __ATOMIC_RELAXED)

/* Atomically load *MEM with acquire memory order.  */
#define atomic_load_acquire(mem) \
  __atomic_load_ls ((mem), __ATOMIC_ACQUIRE)
|
|
|
|
/* Common body of the atomic_store_* macros below: run the operand size
   check on MEM, then write VAL to *MEM with the __atomic_store_n
   builtin using memory order ORDER.  Expands to a single statement.  */
#define __atomic_store_ls(mem, val, order) \
  do \
    { \
      __atomic_check_size_ls ((mem)); \
      __atomic_store_n ((mem), (val), (order)); \
    } \
  while (0)

/* Atomically store VAL to *MEM with relaxed memory order.  */
#define atomic_store_relaxed(mem, val) \
  __atomic_store_ls ((mem), (val), __ATOMIC_RELAXED)

/* Atomically store VAL to *MEM with release memory order.  */
#define atomic_store_release(mem, val) \
  __atomic_store_ls ((mem), (val), __ATOMIC_RELEASE)
|
|
|
|
/* XXX Is this actually correct? */
|
|
/* NOTE(review): presumably tells generic code that an atomic exchange
   on this target is built from compare-and-swap; the only atomic
   read-modify-write primitive this file defines is compare and
   exchange.  Confirm against the users of this macro.  */
#define ATOMIC_EXCHANGE_USES_CAS 1
|
|
|
|
/* prev = *addr;
   if (prev == old)
     *addr = new;
   return prev; */
|
|
|
|
/* Atomic operations on hppa go through the kernel's light weight
   syscall (LWS) interface.  The values below are string constants
   because they are pasted directly into inline assembly.  */

/* Displacement used by the branch that enters the LWS code.  */
#define _LWS "0xb0"

/* Value loaded into r20 to select the compare-and-swap operation.  */
#define _LWS_CAS "0"

/* Clobber list for an LWS call.  Note r31 is the link register.  */
#define _LWS_CLOBBER "r1", "r23", "r22", "r20", "r31", "memory"

/* -EAGAIN as an assembler immediate.  */
#define _ASM_EAGAIN "-11"

/* -EDEADLOCK as an assembler immediate.  */
#define _ASM_EDEADLOCK "-45"
|
|
|
|
/* The only basic operation needed is compare and exchange.  It is
   implemented by branching into the kernel light weight syscall (LWS)
   code with _LWS_CAS selected in r20.

   MEM    pointer to the word to operate on; passed in r26.  The mem
          pointer must be word aligned.
   NEWVAL replacement value; passed in r24.
   OLDVAL value *MEM is expected to hold; passed in r25.

   All three arguments are converted to unsigned long before the call.
   The macro evaluates to the contents of r28 after the call, cast to
   the type of OLDVAL.  NOTE(review): presumably r28 holds the previous
   contents of *MEM -- confirm against the kernel LWS interface.

   The call is repeated for as long as r21 comes back equal to -EAGAIN.
   We no longer loop on deadlock.  */
|
|
#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
|
|
({ \
|
|
register long lws_errno asm("r21"); /* error code, read from r21 */ \
|
|
register unsigned long lws_ret asm("r28"); /* result, read from r28 */ \
|
|
register unsigned long lws_mem asm("r26") = (unsigned long)(mem); \
|
|
register unsigned long lws_old asm("r25") = (unsigned long)(oldval);\
|
|
register unsigned long lws_new asm("r24") = (unsigned long)(newval);\
|
|
__asm__ __volatile__( \
|
|
"0: \n\t" \
|
|
"ble " _LWS "(%%sr2, %%r0) \n\t" /* enter the LWS code */ \
|
|
"ldi " _LWS_CAS ", %%r20 \n\t" /* select the CAS operation in r20 */ \
|
|
"cmpiclr,<> " _ASM_EAGAIN ", %%r21, %%r0\n\t" /* r21 != -EAGAIN: skip the retry branch */ \
|
|
"b,n 0b \n\t" /* r21 == -EAGAIN: try again */ \
|
|
"cmpclr,= %%r0, %%r21, %%r0 \n\t" /* r21 == 0: skip the next instruction */ \
|
|
"iitlbp %%r0,(%%sr0, %%r0) \n\t" /* NOTE(review): reached only with a non-zero error; presumably meant to trap -- confirm */ \
|
|
: "=r" (lws_ret), "=r" (lws_errno) \
|
|
: "r" (lws_mem), "r" (lws_old), "r" (lws_new) \
|
|
: _LWS_CLOBBER \
|
|
); \
|
|
\
|
|
(__typeof (oldval)) lws_ret; \
|
|
})
|
|
|
|
/* Compare and exchange returning a success flag instead of a value.

   MEM    pointer to the object to operate on.
   NEWVAL value to store when *MEM equals OLDVAL.
   OLDVAL value *MEM is expected to hold.

   Evaluates to 0 when atomic_compare_and_exchange_val_acq reports
   OLDVAL as the value it found (the exchange was performed) and to 1
   otherwise.

   OLDVAL is captured in a local exactly once so that an argument with
   side effects is not evaluated a second time by the final comparison.
   The locals use reserved-style names so that they cannot capture a
   caller variable that appears in one of the arguments.  */
#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
  ({ \
     __typeof__ (oldval) __cas_bool_old = (oldval); \
     __typeof__ (*(mem)) __cas_bool_prev \
       = atomic_compare_and_exchange_val_acq ((mem), (newval), \
                                              __cas_bool_old); \
     /* Return 1 if it was already acquired.  */ \
     (__cas_bool_prev != __cas_bool_old); \
  })
|
|
|
|
#endif
|
|
/* _ATOMIC_MACHINE_H */
|