Make qYieldCpu() public API

Rewritten to be a bit simpler, added a few more yield/YieldProcessor
alternatives, added RISC-V support.

[ChangeLog][QtCore] Added qYieldCpu() function.

Fixes: QTBUG-103014
Change-Id: I53335f845a1345299031fffd176f59032e7400f5
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2023-07-06 10:57:35 -07:00
parent aaa8c38353
commit a7f227f56c
9 changed files with 134 additions and 47 deletions

View File

@ -3,8 +3,6 @@
#include "qtconcurrentthreadengine.h"
#include <QtCore/private/qsimd_p.h>
#if !defined(QT_NO_CONCURRENT) || defined(Q_QDOC)
QT_BEGIN_NAMESPACE

View File

@ -251,6 +251,7 @@ qt_internal_add_module(Core
thread/qthreadstorage.h
thread/qtsan_impl.h
thread/qwaitcondition.h thread/qwaitcondition_p.h
thread/qyieldcpu.h
time/qcalendar.cpp time/qcalendar.h
time/qcalendarbackend_p.h
time/qcalendarmath_p.h

View File

@ -14,7 +14,6 @@
#include "qdebug.h"
#include "qmutex.h"
#include <QtCore/private/qlocking_p.h>
#include <QtCore/private/qsimd_p.h>
#include "qloggingcategory.h"
#ifndef QT_BOOTSTRAPPED
#include "qelapsedtimer.h"

View File

@ -378,49 +378,6 @@ static inline uint64_t qCpuFeatures()
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
|| ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
/*
Small wrapper around x86's PAUSE and ARM's YIELD instructions.
This is completely different from QThread::yieldCurrentThread(), which is
an OS-level operation that takes the whole thread off the CPU.
This is just preventing one SMT thread from filling a core's pipeline with
speculated further loop iterations (which need to be expensively flushed on
final success) when it could just give those pipeline slots to a second SMT
thread that can do something useful with the core, such as unblocking this
SMT thread :)
So, instead of
while (!condition)
;
it's better to use
while (!condition)
qYieldCpu();
*/
static inline void qYieldCpu()
{
    /*
       Emit the spin-wait hint instruction for the processor we are being
       compiled for. On targets not listed below, this function is empty.
    */
#ifdef Q_PROCESSOR_X86
    _mm_pause();
#elif defined(Q_PROCESSOR_ARM) && (Q_PROCESSOR_ARM >= 7)
    /* the yield instruction only exists from ARMv7 onwards */
#  if __has_builtin(__builtin_arm_yield)
    /* compilers that expose the builtin, e.g. Clang */
    __builtin_arm_yield();
#  elif !defined(Q_OS_INTEGRITY) && !defined(Q_CC_GNU_ONLY)
    /* the arm_acle.h spelling: this is what should work everywhere */
    __yield();
#  else
    /*
       Inline assembly works everywhere, and is needed here because:
       - Integrity is missing the arm_acle.h header
       - GCC doesn't have __yield() in arm_acle.h
       https://stackoverflow.com/a/70076751/134841
       https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416
    */
    asm volatile("yield");
#  endif
#endif
}
#ifdef __cplusplus
} // extern "C"

View File

@ -6,6 +6,7 @@
#define QATOMIC_CXX11_H
#include <QtCore/qgenericatomic.h>
#include <QtCore/qyieldcpu.h>
#include <atomic>
QT_BEGIN_NAMESPACE

View File

@ -9,7 +9,6 @@
#include <QtCore/qcoreapplication.h>
#include <QtCore/qthread.h>
#include <QtCore/qvarlengtharray.h>
#include <QtCore/private/qsimd_p.h> // for qYieldCpu()
#include <private/qthreadpool_p.h>
#include <private/qobject_p.h>

View File

@ -0,0 +1,66 @@
// Copyright (C) 2023 The Qt Company Ltd.
// Copyright (C) 2023 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#ifndef QYIELDCPU_H
#define QYIELDCPU_H
#include <QtCore/qcompilerdetection.h>
#include <QtCore/qprocessordetection.h>
#include <QtCore/qtconfigmacros.h>
// Prelude for MSVC only: make the names used by qYieldCpu() below visible.
#ifdef Q_CC_MSVC_ONLY
// MSVC defines _YIELD_PROCESSOR() in <xatomic.h>, but as that is a private
// header, we include the public ones
# ifdef __cplusplus
# include <atomic>
// No semicolon after extern "C": in C++ it applies to the _mm_pause()
// declaration that follows the #endif. In C the declaration stands alone.
extern "C"
# endif
void _mm_pause(void); // the compiler recognizes as intrinsic
#endif
QT_BEGIN_NAMESPACE
// Spelling of the inline-assembly keyword used by the fallbacks below.
// GNU-style compilers do not recognize plain "asm" when compiling C in a
// strict ISO mode (-ansi, -std=c11, ...), but they always accept "__asm__".
// Every other compiler keeps the plain spelling.
#ifdef Q_CC_GNU
#  define QYIELDCPU_ASM __asm__
#else
#  define QYIELDCPU_ASM asm
#endif

#ifdef Q_CC_GNU
__attribute__((artificial))
#endif
Q_ALWAYS_INLINE void qYieldCpu(void) Q_DECL_NOEXCEPT;

/*
    Emits the spin-wait hint of the processor being compiled for, using the
    first of the alternatives below that the compiler and target provide.
    Takes no arguments and returns nothing. If none of the alternatives
    applies, the body is empty and calling the function has no effect.

    Usable from both C and C++.
*/
void qYieldCpu(void)
#ifdef __cplusplus
    noexcept
#endif
{
#if __has_builtin(__yield)
    __yield();              // Generic
#elif defined(_YIELD_PROCESSOR) && defined(Q_CC_MSVC)
    _YIELD_PROCESSOR();     // Generic; MSVC's <atomic>

#elif __has_builtin(__builtin_ia32_pause)
    __builtin_ia32_pause();
#elif defined(Q_PROCESSOR_X86) && defined(Q_CC_GNU)
    // GCC < 10 didn't have __has_builtin()
    __builtin_ia32_pause();
#elif defined(Q_PROCESSOR_X86) && defined(Q_CC_MSVC)
    _mm_pause();
#elif defined(Q_PROCESSOR_X86)
    // Only reached by compilers that are neither GNU-style nor MSVC, so this
    // is always the plain asm spelling; hopefully it works in this compiler
    QYIELDCPU_ASM("pause");

#elif __has_builtin(__builtin_arm_yield)
    __builtin_arm_yield();
#elif defined(Q_PROCESSOR_ARM) && Q_PROCESSOR_ARM >= 7
    QYIELDCPU_ASM("yield");         // this works everywhere

#elif __has_builtin(__builtin_riscv_pause)
    __builtin_riscv_pause();        // Zihintpause extension
#elif defined(Q_PROCESSOR_RISCV)
    QYIELDCPU_ASM("fence w, 0");    // a.k.a. "pause"

#elif defined(_YIELD_PROCESSOR) && defined(Q_CC_GHS)
    _YIELD_PROCESSOR;       // Green Hills (INTEGRITY), but only on ARM
#endif
}

#undef QYIELDCPU_ASM
QT_END_NAMESPACE
#endif // QYIELDCPU_H

View File

@ -0,0 +1,59 @@
// Copyright (C) 2023 The Qt Company Ltd.
// Copyright (C) 2023 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
/*!
\fn qYieldCpu()
\inmodule QtCore
\ingroup thread
\relates QAtomicInteger
\relatesalso QAtomicPointer
\since 6.7
Pauses the execution of the current thread for an unspecified time, using
hardware instructions, without de-scheduling this thread. This function is
meant to be used in high-throughput loops where the code expects another
thread to modify an atomic variable. This is completely different from
QThread::yieldCurrentThread(), which is an OS-level operation that may take
the whole thread off the CPU and allow other threads (possibly belonging to
other processes) to run.
So, instead of
\code
while (!condition)
;
\endcode
one should write
\code
while (!condition)
qYieldCpu();
\endcode
This is useful both with and without hardware multithreading on the same
core. In the case of hardware threads, it serves to prevent further
speculative execution filling up the pipeline, which could starve the
sibling thread of resources. Across cores and higher levels of separation,
it allows the cache coherency protocol to allocate the cache line being
modified and inspected to the logical processor whose result this code is
expecting.
It is also recommended to spin on code that only reads the shared variable,
and to attempt the modification only once that read indicates it may
succeed. This avoids the contention caused by repeatedly requesting
exclusive access to the memory location. Therefore, an atomic modification
loop such as a spinlock acquisition should be:
\code
while (true) {
while (!readOnlyCondition(atomic))
qYieldCpu();
if (modify(atomic))
break;
}
\endcode
On x86 processors and on RISC-V processors with the \c{Zihintpause}
extension, this will emit the \c PAUSE instruction, which is ignored on
processors that don't support it; on ARMv7 or later ARM processors, it will
emit the \c{YIELD} instruction.
*/

View File

@ -3,6 +3,7 @@
#include <QtCore/qglobal.h>
#include <QtCore/qtversion.h>
#include <QtCore/qyieldcpu.h>
#ifdef Q_COMPILER_THREAD_LOCAL
# include <threads.h>
@ -62,6 +63,12 @@ const char *tst_qVersion()
#endif
}
/*
   Wrapper whose only statement is a call to qYieldCpu(), the inline function
   from <QtCore/qyieldcpu.h>. Having it here means this file only builds if
   that inline function can be compiled in this translation unit.
   NOTE(review): presumably this file is built as C rather than C++ - confirm.
*/
void tst_qYieldCpu(void) Q_DECL_NOEXCEPT;
void tst_qYieldCpu(void)
{
qYieldCpu();
}
/* Static assertion */
Q_STATIC_ASSERT(true);
Q_STATIC_ASSERT(1);