[x64] Add support for "cold calls" in hot paths

This makes (specially annotated) calls to "cold functions" in hot paths
more efficient by hiding from the compiler the fact that we are actually
calling a function. Clang would otherwise unconditionally spill and reload
any registers that might be clobbered by the call, which would slow down
the fast path.

This CL allows us to reverse the priorities here: the fast path stays fast
(no spills and reloads), but the slow path gets even slower. The inline
assembly that implements the cold call spills and reloads *all* registers,
because we do not know which registers are in use in the scope where the
cold call is emitted.

I.e. this behaves like a custom calling convention with no caller-saved
registers.
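
To illustrate the intended shape of a call site (a sketch, not code from
this CL; Buffer and its methods are made-up names):

  void EmitByte(Buffer* buf, uint8_t b) {
    if (V8_UNLIKELY(buf->full())) {
      // Compiles down to a short instruction sequence with no visible
      // call, so registers stay live across it on the fast path.
      v8::base::call_cold([](Buffer* b) { b->Grow(); }, buf);
    }
    buf->push(b);  // hot path: no spills caused by the branch above
  }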

The `preserve_all` attribute (experimental in clang, and incomplete for
C++) would also solve this, but it is not production-ready yet: it can
crash both clang itself and the code it generates.
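
For reference, that alternative would look roughly like this (a sketch;
GrowBufferCold is a made-up name): the experimental calling convention
makes all registers callee-saved at the definition, so call sites need
no spills:

  __attribute__((preserve_all)) void GrowBufferCold(Assembler* assembler);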

R=leszeks@chromium.org
CC=dlehmann@chromium.org

Bug: v8:13565, v8:13570
Change-Id: I2b54a480da1c689113a67c601c29d73239b0ff2b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4116584
Commit-Queue: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Anton Bikineev <bikineev@chromium.org>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/main@{#85127}
Clemens Backes <clemensb@chromium.org>, 2023-01-05 17:37:44 +01:00, committed by V8 LUCI CQ
parent 922fa2f9ee
commit 31ccfed461
6 changed files with 174 additions and 2 deletions


@@ -595,6 +595,8 @@ filegroup(
         "src/base/bounded-page-allocator.h",
         "src/base/bounds.h",
         "src/base/build_config.h",
+        "src/base/call_cold.cc",
+        "src/base/call_cold.h",
         "src/base/compiler-specific.h",
         "src/base/container-utils.h",
         "src/base/cpu.cc",


@@ -5491,6 +5491,8 @@ v8_component("v8_libbase") {
     "src/base/bounded-page-allocator.h",
     "src/base/bounds.h",
     "src/base/build_config.h",
+    "src/base/call_cold.cc",
+    "src/base/call_cold.h",
     "src/base/compiler-specific.h",
     "src/base/container-utils.h",
     "src/base/cpu.cc",

src/base/call_cold.cc (new file)

@@ -0,0 +1,58 @@
// Copyright 2023 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/base/call_cold.h"
namespace v8::base {
#if V8_HOST_ARCH_X64 && (defined(__clang__) || defined(__GNUC__))
asm(".globl v8_base_call_cold\n"
"v8_base_call_cold:\n"
" push %rbp\n"
" mov %rsp, %rbp\n"
// Push all registers that the call might clobber, except for callee-saved
// ones (the callee preserves those anyway). The compiler does not even know
// that it is executing a call, so we cannot clobber any register, not even
// the ones holding the function address or the arguments.
" push %rax\n"
" push %rcx\n"
" push %rdx\n"
#ifndef V8_OS_WIN
// %rsi and %rdi are callee-saved on Windows.
" push %rsi\n"
" push %rdi\n"
#endif // !V8_OS_WIN
" push %r8\n"
" push %r9\n"
" push %r10\n"
" push %r11\n"
// Save %rsp to %r15 (after pushing it) and align %rsp to 16 bytes.
// %r15 is callee-saved, so the value will still be there after the call.
" push %r15\n"
" mov %rsp, %r15\n"
" and $-16, %rsp\n"
// Now execute the actual call.
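// (The function address is in %rax because the inline assembly in
// call_cold.h pins the function pointer to REG_FN, which is "rax".)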
" call *%rax\n"
// Restore the potentially unaligned %rsp.
" mov %r15, %rsp\n"
// Pop the previously pushed registers. We have no return value, so we do
// not need to preserve %rax.
" pop %r15\n"
" pop %r11\n"
" pop %r10\n"
" pop %r9\n"
" pop %r8\n"
#ifndef V8_OS_WIN
" pop %rdi\n"
" pop %rsi\n"
#endif // !V8_OS_WIN
" pop %rdx\n"
" pop %rcx\n"
" pop %rax\n"
// Leave the frame and return.
" pop %rbp\n"
" ret");
#endif  // V8_HOST_ARCH_X64 && (defined(__clang__) || defined(__GNUC__))
} // namespace v8::base

src/base/call_cold.h (new file)

@@ -0,0 +1,103 @@
// Copyright 2023 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_BASE_CALL_COLD_H_
#define V8_BASE_CALL_COLD_H_

#include <type_traits>

#include "include/v8config.h"

namespace v8::base {
// Use {call_cold} for calls in hot paths that are unlikely to be executed. The
// compiler will not know that this executes a call, so it will not clobber any
// registers (i.e. this behaves like a custom calling convention where all
// registers are callee-saved).
// Executing the call will be significantly slower than a direct call, as all
// registers have to be spilled and an indirect call is executed.
// As a start, we added support for GCC and clang on x64. Other platforms can
// be added later, as needed.
template <typename Fn, typename... Ps>
constexpr bool IsValidForCallCold =
// The callable object must be convertible to a function pointer (e.g. a
// capture-less lambda).
std::is_convertible_v<Fn, void (*)(Ps...)> &&
// All parameters must be integral or pointer types (support for
// floating-point arguments is not implemented).
(... && (std::is_integral_v<Ps> || std::is_pointer_v<Ps>));
// Do not use V8_CC_GNU, as this is not defined for clang on Windows. Explicitly
// check for GCC or clang.
#if V8_HOST_ARCH_X64 && (defined(__clang__) || defined(__GNUC__))
// Define the parameter registers. Windows uses a different calling convention
// than other OSes.
#define REG_FN "rax"
#ifdef V8_OS_WIN
#define REG_P1 "rcx"
#define REG_P2 "rdx"
#define REG_P3 "r8"
#else
#define REG_P1 "rdi"
#define REG_P2 "rsi"
#define REG_P3 "rdx"
#endif
// List all FP registers as clobbers, so that the trampoline does not have to
// spill and reload them; the compiler saves exactly the ones that are live.
#define CLOBBER \
  "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", \
      "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", \
      "xmm15", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "st", \
      "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#define V8_CALL_COLD_ASM \
"sub $128, %%rsp\n" /* Bump %rsp by 128, beyond the red zone. */ \
"call v8_base_call_cold\n" /* Call our trampoline. */ \
"add $128, %%rsp" /* Restore previous %rsp. */
// 1 Parameter, no result.
template <typename P1, typename Fn>
V8_INLINE void call_cold(const Fn& fn, P1 p1) {
static_assert(IsValidForCallCold<Fn, P1>);
using FnPtr = void (*)(P1);
register FnPtr fn_reg asm(REG_FN) = fn;
register P1 p1_reg asm(REG_P1) = p1;
asm(V8_CALL_COLD_ASM : : "r"(fn_reg), "r"(p1_reg) : CLOBBER);
}
// 3 Parameters, no result.
template <typename P1, typename P2, typename P3, typename Fn>
V8_INLINE void call_cold(const Fn& fn, P1 p1, P2 p2, P3 p3) {
static_assert(IsValidForCallCold<Fn, P1, P2, P3>);
using FnPtr = void (*)(P1, P2, P3);
register FnPtr fn_reg asm(REG_FN) = fn;
register P1 p1_reg asm(REG_P1) = p1;
register P2 p2_reg asm(REG_P2) = p2;
register P3 p3_reg asm(REG_P3) = p3;
asm(V8_CALL_COLD_ASM
:
: "r"(fn_reg), "r"(p1_reg), "r"(p2_reg), "r"(p3_reg)
: CLOBBER);
}
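// (A separate overload per arity is needed because each argument must be
// pinned to its specific parameter register via the `register ... asm`
// extension, which cannot be expressed for a variadic pack.)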
#else
// Architectures without special support just execute the call directly.
template <typename... Ps, typename Fn>
V8_INLINE void call_cold(const Fn& fn, Ps... ps) {
static_assert(IsValidForCallCold<Fn, Ps...>);
fn(ps...);
}
#endif
#undef REG_FN
#undef REG_P1
#undef REG_P2
#undef REG_P3
#undef CLOBBER
#undef V8_CALL_COLD_ASM
} // namespace v8::base
#endif // V8_BASE_CALL_COLD_H_


@@ -42,6 +42,7 @@
 #include <memory>
 #include <vector>
 
+#include "src/base/call_cold.h"
 #include "src/base/export-template.h"
 #include "src/codegen/assembler.h"
 #include "src/codegen/cpu-features.h"
@@ -2640,7 +2641,10 @@ void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
 class EnsureSpace {
  public:
   explicit V8_INLINE EnsureSpace(Assembler* assembler) : assembler_(assembler) {
-    if (V8_UNLIKELY(assembler_->buffer_overflow())) assembler_->GrowBuffer();
+    if (V8_UNLIKELY(assembler_->buffer_overflow())) {
+      base::call_cold([](Assembler* assembler) { assembler->GrowBuffer(); },
+                      assembler_);
+    }
 #ifdef DEBUG
     space_before_ = assembler_->available_space();
 #endif
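
Note the shape of the call site: the lambda must stay capture-less so that
it converts to a plain function pointer (enforced by IsValidForCallCold),
which is why assembler_ is passed as an explicit argument instead of being
captured.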


@@ -16,6 +16,7 @@
 #include <optional>
 
+#include "src/base/call_cold.h"
 #include "src/base/small-vector.h"
 #include "src/base/strings.h"
 #include "src/base/v8-fallthrough.h"
@@ -1256,7 +1257,9 @@ class FastZoneVector {
   V8_INLINE void EnsureMoreCapacity(int slots_needed, Zone* zone) {
     if (V8_LIKELY(capacity_end_ - end_ >= slots_needed)) return;
-    Grow(slots_needed, zone);
+    base::call_cold([](FastZoneVector* vec, int slots_needed,
+                       Zone* zone) { vec->Grow(slots_needed, zone); },
+                    this, slots_needed, zone);
   }
 
  private: