mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-12 12:10:16 +00:00
d7e4c642ef
This patch optimizes both __lll_lock_wait_private and __lll_lock_wait by issuing only one lll_futex_wait. Since it is defined as an inlined syscall and inlined syscalls are defined using inlined assembly the compiler usually can not see both calls are equal and optimize accordingly. On aarch64 the resulting binary is change from: 0000000000000060 <__lll_lock_wait>: 60: 2a0103e5 mov w5, w1 64: b9400001 ldr w1, [x0] 68: aa0003e4 mov x4, x0 6c: 7100083f cmp w1, #0x2 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any 74: 521900a1 eor w1, w5, #0x80 78: d2800042 mov x2, #0x2 // #2 7c: 93407c21 sxtw x1, w1 80: d2800003 mov x3, #0x0 // #0 84: d2800c48 mov x8, #0x62 // #98 88: d4000001 svc #0x0 8c: 521900a5 eor w5, w5, #0x80 90: 52800046 mov w6, #0x2 // #2 94: 93407ca5 sxtw x5, w5 98: 14000008 b b8 <__lll_lock_wait+0x58> 9c: d503201f nop a0: aa0403e0 mov x0, x4 a4: aa0503e1 mov x1, x5 a8: d2800042 mov x2, #0x2 // #2 ac: d2800003 mov x3, #0x0 // #0 b0: d2800c48 mov x8, #0x62 // #98 b4: d4000001 svc #0x0 b8: 885ffc80 ldaxr w0, [x4] bc: 88017c86 stxr w1, w6, [x4] c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40> c8: d65f03c0 ret To: 0000000000000048 <__lll_lock_wait>: 48: aa0003e4 mov x4, x0 4c: 2a0103e5 mov w5, w1 50: b9400000 ldr w0, [x0] 54: 7100081f cmp w0, #0x2 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none 5c: 52800041 mov w1, #0x2 // #2 60: 885ffc80 ldaxr w0, [x4] 64: 88027c81 stxr w2, w1, [x4] 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48> 70: 521900a1 eor w1, w5, #0x80 74: aa0403e0 mov x0, x4 78: 93407c21 sxtw x1, w1 7c: d2800042 mov x2, #0x2 // #2 80: d2800003 mov x3, #0x0 // #0 84: d2800c48 mov x8, #0x62 // #98 88: d4000001 svc #0x0 8c: 17fffff4 b 5c <__lll_lock_wait+0x14> 90: d65f03c0 ret I see similar changes on powerpc and other architectures. It also aligns with x86_64 implementation by adding the systemtap probes. Checker on aarch64-linux-gnu. * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private): Optimize futex call and add systemtap probe.
56 lines
1.7 KiB
C
56 lines
1.7 KiB
C
/* low level locking for pthread library. Generic futex-using version.
|
|
Copyright (C) 2003-2019 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
Contributed by Paul Mackerras <paulus@au.ibm.com>, 2003.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <sysdep.h>
|
|
#include <lowlevellock.h>
|
|
#include <atomic.h>
|
|
#include <stap-probe.h>
|
|
|
|
void
|
|
__lll_lock_wait_private (int *futex)
|
|
{
|
|
if (atomic_load_relaxed (futex) == 2)
|
|
goto futex;
|
|
|
|
while (atomic_exchange_acquire (futex, 2) != 0)
|
|
{
|
|
futex:
|
|
LIBC_PROBE (lll_lock_wait_private, 1, futex);
|
|
lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
|
|
}
|
|
}
|
|
|
|
|
|
/* This function doesn't get included in libc. */
|
|
#if IS_IN (libpthread)
|
|
void
|
|
__lll_lock_wait (int *futex, int private)
|
|
{
|
|
if (atomic_load_relaxed (futex) == 2)
|
|
goto futex;
|
|
|
|
while (atomic_exchange_acquire (futex, 2) != 0)
|
|
{
|
|
futex:
|
|
LIBC_PROBE (lll_lock_wait, 1, futex);
|
|
lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */
|
|
}
|
|
}
|
|
#endif
|