x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers

_dl_tlsdesc_dynamic should also preserve AMX registers which are
caller-saved.  Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID
to x86-64 TLSDESC_CALL_STATE_SAVE_MASK.  Compute the AMX state size
and save it in xsave_state_full_size which is only used by
_dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec.  This fixes
the AMX part of BZ #31372.  Tested on AMX processor.

AMX test is enabled only for compilers with the fix for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098

GCC 14 and GCC 11/12/13 branches have the bug fix.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
This commit is contained in:
H.J. Lu 2024-02-28 12:08:03 -08:00
parent a1735e0aa8
commit 9b7091415a
14 changed files with 299 additions and 6 deletions

View File

@ -66,6 +66,33 @@ $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is
$(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so
cp $< $@
endif
# AMX variant of the tst-gnu2-tls2 test.  It is only added when configure
# set have-mamx-tile to yes.
ifeq (yes,$(have-mamx-tile))
tests += \
tst-gnu2-tls2-amx \
# tests
modules-names += \
tst-gnu2-tls2-amx-mod0 \
tst-gnu2-tls2-amx-mod1 \
tst-gnu2-tls2-amx-mod2 \
# modules-names
# The test program links against the thread library; its .out target
# depends on the three modules, and each module depends on libsupport.
$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library)
$(objpfx)tst-gnu2-tls2-amx.out: \
$(objpfx)tst-gnu2-tls2-amx-mod0.so \
$(objpfx)tst-gnu2-tls2-amx-mod1.so \
$(objpfx)tst-gnu2-tls2-amx-mod2.so
$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport)
$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport)
$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport)
# Everything is built with -mamx-tile; the modules additionally use
# -mtls-dialect=gnu2.
CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile
CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2
CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2
CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2
endif
endif # $(subdir) == elf
ifneq ($(enable-cet),no)

View File

@ -20,3 +20,8 @@
# define ARCH_SHSTK_SHSTK 0x1
# define ARCH_SHSTK_WRSS 0x2
#endif
/* Fallback values for the arch_prctl request codes that read
   (ARCH_GET_XCOMP_PERM) and request (ARCH_REQ_XCOMP_PERM) XSTATE
   component permissions.  Only defined when nothing earlier has
   provided ARCH_GET_XCOMP_PERM.  */
#ifndef ARCH_GET_XCOMP_PERM
# define ARCH_GET_XCOMP_PERM 0x1022
# define ARCH_REQ_XCOMP_PERM 0x1023
#endif

View File

@ -0,0 +1,2 @@
#include "tst-gnu2-tls2-amx.h"
#include <tst-gnu2-tls2mod0.c>

View File

@ -0,0 +1,2 @@
#include "tst-gnu2-tls2-amx.h"
#include <tst-gnu2-tls2mod1.c>

View File

@ -0,0 +1,2 @@
#include "tst-gnu2-tls2-amx.h"
#include <tst-gnu2-tls2mod2.c>

View File

@ -0,0 +1,83 @@
/* Test TLSDESC relocation with AMX.
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <stdbool.h>
#include <asm/prctl.h>
#include <support/check.h>
#include "tst-gnu2-tls2-amx.h"
/* Declared locally.  The first argument is the request code; the
   remaining argument depends on the request (see enable_amx).  */
extern int arch_prctl (int, ...);
/* XSTATE component numbers of the AMX tile configuration and tile data.
   enable_amx uses the first as a bit position in the permission bitmask
   and passes the second as the feature number to request.  */
#define X86_XSTATE_TILECFG_ID 17
#define X86_XSTATE_TILEDATA_ID 18
/* Fill in *TILEINFO for palette 1 with tiles 0-3 configured and load it
   with _tile_loadconfig.  Every configured tile has MAX_ROWS rows; tiles
   1-3 are MAX_COLS bytes wide.
   NOTE(review): tile 0 is given a width of MAX_ROWS rather than MAX_COLS.
   Only tile 2 is used by the code in this test, so this looks harmless,
   but confirm it is intentional.  */
__attribute__ ((noinline, noclone))
static void
init_tile_config (__tilecfg *tileinfo)
{
  tileinfo->palette_id = 1;
  tileinfo->start_row = 0;

  for (int tile = 0; tile < 4; ++tile)
    {
      tileinfo->rows[tile] = MAX_ROWS;
      tileinfo->colsb[tile] = tile == 0 ? MAX_ROWS : MAX_COLS;
    }

  _tile_loadconfig (tileinfo);
}
/* Try to make AMX usable for this test.  Return false when the
   ARCH_GET_XCOMP_PERM query fails, when the TILECFG bit is not set in
   the returned bitmask, or when the ARCH_REQ_XCOMP_PERM request for
   TILEDATA fails.  Otherwise load a tile configuration and return
   true.  */
static bool
enable_amx (void)
{
  uint64_t permitted;

  /* The checks are evaluated left to right and stop at the first
     failure, so the TILEDATA request is only made once TILECFG is known
     to be permitted.  */
  if (arch_prctl (ARCH_GET_XCOMP_PERM, &permitted) != 0
      || (permitted & (1 << X86_XSTATE_TILECFG_ID)) == 0
      || arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0)
    return false;

  /* Start from an all-zero configuration and let init_tile_config fill
     in and load the tiles.  */
  __tilecfg tile_cfg = { 0 };
  init_tile_config (&tile_cfg);

  return true;
}
/* Zero tile register 2, the register that BEFORE_TLSDESC_CALL loads and
   AFTER_TLSDESC_CALL stores.  It is installed as PREPARE_MALLOC below so
   that malloc in the generic test clobbers this caller-saved register;
   the test then verifies that the implicit TLSDESC call does not change
   caller-saved registers.  */
static void
clear_tile_register (void)
{
_tile_zero (2);
}
/* Hooks for the generic test included below (its body is not visible
   here; presumably it expands these).  MOD (i) names the AMX build of
   test module I, IS_SUPPORTED () calls enable_amx, and
   PREPARE_MALLOC () zeroes tile register 2.  */
#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so"
#define IS_SUPPORTED() enable_amx ()
#define PREPARE_MALLOC() clear_tile_register ()
#include <elf/tst-gnu2-tls2.c>

View File

@ -0,0 +1,63 @@
/* Test TLSDESC relocation with AMX.
Copyright (C) 2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <stdint.h>
#include <string.h>
#include <x86intrin.h>
#include <support/check.h>
/* Tile geometry used by the test: rows per tile and bytes per row.  */
#define MAX_ROWS 16
#define MAX_COLS 64
/* Size in bytes of the src/res buffers; equals MAX_ROWS * MAX_COLS.  */
#define MAX 1024
/* Stride argument passed to _tile_loadd and _tile_stored.  */
#define STRIDE 64
/* Tile configuration block that init_tile_config fills in and passes to
   _tile_loadconfig.  The members add up to 64 bytes and the type is
   64-byte aligned.  */
typedef struct __tile_config
{
/* Palette selector; the test uses 1.  */
uint8_t palette_id;
/* The test always sets this to 0.  */
uint8_t start_row;
/* Padding; left zero by the "= { 0 }" initializer in enable_amx.  */
uint8_t reserved_0[14];
/* Per-tile width in bytes, indexed by tile number.  */
uint16_t colsb[16];
/* Per-tile row count, indexed by tile number.  */
uint8_t rows[16];
} __tilecfg __attribute__ ((aligned (64)));
/* Set every element of the MAX_ROWS x MAX_COLS int8_t matrix at BUF to
   VALUE.  BUF must have room for MAX_ROWS * MAX_COLS bytes.  */
static inline void
init_buffer (int8_t *buf, int8_t value)
{
  /* The matrix is stored row by row without padding, so the elements
     form one contiguous run of bytes that memset can fill directly.  */
  memset (buf, value, (size_t) MAX_ROWS * MAX_COLS);
}
/* Declare the SRC and RES buffers in the enclosing scope, fill SRC with
   the value 2 and load it into tile register 2.  AFTER_TLSDESC_CALL
   refers to SRC and RES, so both macros must be expanded in the same
   scope, this one first.  */
#define BEFORE_TLSDESC_CALL() \
int8_t src[MAX]; \
int8_t res[MAX]; \
/* Initialize src with data */ \
init_buffer (src, 2); \
/* Load tile rows from memory. */ \
_tile_loadd (2, src, STRIDE);
/* Store tile register 2 into RES, release the tile state, and fail the
   test via TEST_VERIFY_EXIT unless RES is byte-for-byte equal to SRC,
   i.e. unless the tile contents are unchanged since
   BEFORE_TLSDESC_CALL.  */
#define AFTER_TLSDESC_CALL() \
/* Store the tile data to memory. */ \
_tile_stored (2, res, STRIDE); \
_tile_release (); \
TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0);

View File

@ -3,3 +3,4 @@
#include <ldsodefs.h>
XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size)
XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size)

View File

@ -308,6 +308,8 @@ update_active (struct cpu_features *cpu_features)
__cpuid_count (0xd, 0, eax, ebx, ecx, edx);
if (ebx != 0)
{
/* NB: On AMX capable processors, ebx always includes AMX
states. */
unsigned int xsave_state_full_size
= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
@ -321,6 +323,11 @@ update_active (struct cpu_features *cpu_features)
{
unsigned int xstate_comp_offsets[32];
unsigned int xstate_comp_sizes[32];
#ifdef __x86_64__
unsigned int xstate_amx_comp_offsets[32];
unsigned int xstate_amx_comp_sizes[32];
unsigned int amx_ecx;
#endif
unsigned int i;
xstate_comp_offsets[0] = 0;
@ -328,16 +335,39 @@ update_active (struct cpu_features *cpu_features)
xstate_comp_offsets[2] = 576;
xstate_comp_sizes[0] = 160;
xstate_comp_sizes[1] = 256;
#ifdef __x86_64__
xstate_amx_comp_offsets[0] = 0;
xstate_amx_comp_offsets[1] = 160;
xstate_amx_comp_offsets[2] = 576;
xstate_amx_comp_sizes[0] = 160;
xstate_amx_comp_sizes[1] = 256;
#endif
for (i = 2; i < 32; i++)
{
if ((STATE_SAVE_MASK & (1 << i)) != 0)
if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
{
__cpuid_count (0xd, i, eax, ebx, ecx, edx);
xstate_comp_sizes[i] = eax;
#ifdef __x86_64__
/* Include this in xsave_state_full_size. */
amx_ecx = ecx;
xstate_amx_comp_sizes[i] = eax;
if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
{
/* Exclude this from xsave_state_size. */
ecx = 0;
xstate_comp_sizes[i] = 0;
}
else
#endif
xstate_comp_sizes[i] = eax;
}
else
{
#ifdef __x86_64__
amx_ecx = 0;
xstate_amx_comp_sizes[i] = 0;
#endif
ecx = 0;
xstate_comp_sizes[i] = 0;
}
@ -350,6 +380,15 @@ update_active (struct cpu_features *cpu_features)
if ((ecx & (1 << 1)) != 0)
xstate_comp_offsets[i]
= ALIGN_UP (xstate_comp_offsets[i], 64);
#ifdef __x86_64__
xstate_amx_comp_offsets[i]
= (xstate_amx_comp_offsets[i - 1]
+ xstate_amx_comp_sizes[i - 1]);
if ((amx_ecx & (1 << 1)) != 0)
xstate_amx_comp_offsets[i]
= ALIGN_UP (xstate_amx_comp_offsets[i],
64);
#endif
}
}
@ -358,6 +397,18 @@ update_active (struct cpu_features *cpu_features)
= xstate_comp_offsets[31] + xstate_comp_sizes[31];
if (size)
{
#ifdef __x86_64__
unsigned int amx_size
= (xstate_amx_comp_offsets[31]
+ xstate_amx_comp_sizes[31]);
amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
64);
/* Set xsave_state_full_size to the compact AMX
state size for XSAVEC. NB: xsave_state_full_size
is only used in _dl_tlsdesc_dynamic_xsave and
_dl_tlsdesc_dynamic_xsavec. */
cpu_features->xsave_state_full_size = amx_size;
#endif
cpu_features->xsave_state_size
= ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
CPU_FEATURE_SET (cpu_features, XSAVEC);

View File

@ -934,6 +934,8 @@ struct cpu_features
/* The full state size for XSAVE when XSAVEC is disabled by
GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
and the AMX state size when XSAVEC is available.
*/
unsigned int xsave_state_full_size;
/* Data cache size for use in memory and string routines, typically

View File

@ -56,6 +56,14 @@
| (1 << X86_XSTATE_ZMM_H_ID) \
| (1 << X86_XSTATE_ZMM_ID) \
| (1 << X86_XSTATE_APX_F_ID))
/* AMX state mask: the TILECFG and TILEDATA XSAVE state components.  */
# define AMX_STATE_SAVE_MASK \
((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
/* States to be included in xsave_state_full_size: everything in
   STATE_SAVE_MASK plus the AMX states.  */
# define FULL_STATE_SAVE_MASK \
(STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
#else
/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386
doesn't have red-zone, use 0 here. */
@ -68,13 +76,17 @@
| (1 << X86_XSTATE_BNDREGS_ID) \
| (1 << X86_XSTATE_K_ID) \
| (1 << X86_XSTATE_ZMM_H_ID))
/* States to be included in xsave_state_size. */
# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK
#endif
/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
Compiler assumes that all registers, including x87 FPU stack registers,
are unchanged after CALL, except for EFLAGS and RAX/EAX. */
Compiler assumes that all registers, including AMX and x87 FPU
stack registers, are unchanged after CALL, except for EFLAGS and
RAX/EAX. */
#define TLSDESC_CALL_STATE_SAVE_MASK \
(STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
(FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
/* Constants for bits in __x86_string_control: */

View File

@ -134,6 +134,34 @@ fi
config_vars="$config_vars
enable-cet = $enable_cet"
# Check if -mamx-tile works properly.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5
printf %s "checking whether -mamx-tile works properly... " >&6; }
if test ${libc_cv_x86_have_amx_tile+y}
then :
printf %s "(cached) " >&6
else $as_nop
cat > conftest.c <<EOF
#include <x86intrin.h>
EOF
libc_cv_x86_have_amx_tile=no
if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i'
{ { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
(eval $ac_try) 2>&5
ac_status=$?
printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; }; then
if grep -q __builtin_ia32_ldtilecfg conftest.i; then
libc_cv_x86_have_amx_tile=yes
fi
fi
rm -rf conftest*
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5
printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
config_vars="$config_vars
have-mamx-tile = $libc_cv_x86_have_amx_tile"
test -n "$critic_missing" && as_fn_error $? "
*** $critic_missing" "$LINENO" 5

View File

@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then
fi
LIBC_CONFIG_VAR([enable-cet], [$enable_cet])
# Check if -mamx-tile works properly.
# The check preprocesses <x86intrin.h> with -mamx-tile and answers yes
# only when the output mentions __builtin_ia32_ldtilecfg.  The result is
# exported to the makefiles as have-mamx-tile.
AC_CACHE_CHECK(whether -mamx-tile works properly,
libc_cv_x86_have_amx_tile, [dnl
cat > conftest.c <<EOF
#include <x86intrin.h>
EOF
libc_cv_x86_have_amx_tile=no
if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then
if grep -q __builtin_ia32_ldtilecfg conftest.i; then
libc_cv_x86_have_amx_tile=yes
fi
fi
rm -rf conftest*])
LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
test -n "$critic_missing" && AC_MSG_ERROR([
*** $critic_missing])

View File

@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
# endif
#else
/* Allocate stack space of the required size to save the state. */
sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
#endif
/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
r10 and r11. */