Merge branch release/2.32/master into ibm/2.32/master

Raoni Fassina Firmino 2022-04-01 17:05:01 -03:00
commit b81d2ac6b4
21 changed files with 718 additions and 325 deletions

NEWS
View File

@ -24,9 +24,12 @@ The following bugs are resolved with this release:
[27130] "rep movsb" performance issue
[27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work
[27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts
[27974] Overflow bug in some implementation of wcsnlen, wmemchr, and wcsncat
[28524] Conversion from ISO-2022-JP-3 with iconv may emit spurious NULs
[28607] Masked signals are delivered on thread exit
[28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex
[28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm
variants when avoiding overflow
Version 2.32
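
A number of these fixes share one failure mode: a size_t length close to SIZE_MAX makes pointer or length arithmetic wrap. As an illustrative reproducer shape for [28755] (not taken from the patch), any conforming wcsncmp must stop at the terminating nulls no matter how large n is:

#include <assert.h>
#include <stdint.h>
#include <wchar.h>

int
main (void)
{
  /* n far exceeds both string lengths; the comparison must behave
     like wcscmp instead of overflowing its internal length math.  */
  assert (wcsncmp (L"abc", L"abc", SIZE_MAX) == 0);
  assert (wcsncmp (L"abc", L"abd", SIZE_MAX) < 0);
  return 0;
}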

View File

@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
CHAR *res = CALL (impl, s, c, n);
if (res != exp_res)
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
res, exp_res);
error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
impl->name, s, c, n, res, exp_res);
ret = 1;
return;
}
@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
}
buf[align + len] = 0;
if (pos < len)
if (pos < MIN(n, len))
{
buf[align + pos] = seek_char;
buf[align + len] = -seek_char;
@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
}
static void
do_overflow_tests (void)
{
size_t i, j, len;
const size_t one = 1;
uintptr_t buf_addr = (uintptr_t) buf1;
for (i = 0; i < 750; ++i)
{
do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
do_test (0, i, 751, i - buf_addr, BIG_CHAR);
do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
len = 0;
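/* Walk lengths whose high bits fill in from the top (2^63, then
   2^63|2^62, ..., down to ~1); together with the ~len cases below,
   this probes power-of-two boundaries near both ends of the size_t
   range.  */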
for (j = 8 * sizeof(size_t) - 1; j ; --j)
{
len |= one << j;
do_test (0, i, 751, len - i, BIG_CHAR);
do_test (0, i, 751, len + i, BIG_CHAR);
do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
do_test (0, i, 751, ~len - i, BIG_CHAR);
do_test (0, i, 751, ~len + i, BIG_CHAR);
do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
}
}
}
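
The arguments generated above are chosen so that buf + n wraps the address space in as many distinct ways as possible: lengths near SIZE_MAX, lengths that cancel against the buffer address, and the high-bit patterns produced by the inner loop. A minimal sketch of the invariant a memchr implementation must preserve (clamp_len is a hypothetical helper, not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* Clamp a caller-supplied length to the bytes that actually exist
   between S and the top of the address space.  An implementation
   that instead computes an end pointer as S + N can wrap past zero
   and terminate its scan too early.  */
static inline size_t
clamp_len (const void *s, size_t n)
{
  size_t room = (size_t) -(uintptr_t) s;  /* bytes until wraparound */
  return (room != 0 && n > room) ? room : n;
}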
static void
do_random_tests (void)
{
@ -221,6 +253,7 @@ test_main (void)
do_test (page_size / 2 - i, i, i, 1, 0x9B);
do_random_tests ();
do_overflow_tests ();
return ret;
}

View File

@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
}
}
static void
do_overflow_tests (void)
{
size_t i, j, len;
const size_t one = 1;
CHAR *s1, *s2;
uintptr_t s1_addr;
s1 = (CHAR *) buf1;
s2 = (CHAR *) buf2;
s1_addr = (uintptr_t)s1;
for (j = 0; j < 200; ++j)
s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
s2[200] = 0;
for (i = 0; i < 750; ++i) {
for (j = 0; j < i; ++j)
s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
s1[i] = '\0';
FOR_EACH_IMPL (impl, 0)
{
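/* Each strncat call appends s1 at s2 + 200; restoring the NUL
   terminator between calls keeps every call appending at the same
   offset.  */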
s2[200] = '\0';
do_one_test (impl, s2, s1, SIZE_MAX - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, i - s1_addr);
s2[200] = '\0';
do_one_test (impl, s2, s1, -s1_addr - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
}
len = 0;
for (j = 8 * sizeof(size_t) - 1; j ; --j)
{
len |= one << j;
FOR_EACH_IMPL (impl, 0)
{
s2[200] = '\0';
do_one_test (impl, s2, s1, len - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, len + i);
s2[200] = '\0';
do_one_test (impl, s2, s1, len - s1_addr - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, len - s1_addr + i);
s2[200] = '\0';
do_one_test (impl, s2, s1, ~len - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, ~len + i);
s2[200] = '\0';
do_one_test (impl, s2, s1, ~len - s1_addr - i);
s2[200] = '\0';
do_one_test (impl, s2, s1, ~len - s1_addr + i);
}
}
}
}
static void
do_random_tests (void)
{
@ -316,6 +376,7 @@ test_main (void)
}
do_random_tests ();
do_overflow_tests ();
return ret;
}

View File

@ -403,6 +403,18 @@ check2 (void)
free (s2);
}
static void
check3 (void)
{
const CHAR *s1 = L ("abc");
CHAR *s2 = STRDUP (s1);
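/* With a maximum length of SIZE_MAX the bounded comparison must
   behave exactly like its unbounded counterpart, so a string
   compared with its own copy yields 0.  */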
FOR_EACH_IMPL (impl, 0)
check_result (impl, s1, s2, SIZE_MAX, 0);
free (s2);
}
int
test_main (void)
{
@ -412,6 +424,7 @@ test_main (void)
check1 ();
check2 ();
check3 ();
printf ("%23s", "");
FOR_EACH_IMPL (impl, 0)

View File

@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
}
static void
do_overflow_tests (void)
{
size_t i, j, len;
const size_t one = 1;
uintptr_t buf_addr = (uintptr_t) buf1;
for (i = 0; i < 750; ++i)
{
do_test (0, i, SIZE_MAX - i, BIG_CHAR);
do_test (0, i, i - buf_addr, BIG_CHAR);
do_test (0, i, -buf_addr - i, BIG_CHAR);
do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
len = 0;
for (j = 8 * sizeof(size_t) - 1; j ; --j)
{
len |= one << j;
do_test (0, i, len - i, BIG_CHAR);
do_test (0, i, len + i, BIG_CHAR);
do_test (0, i, len - buf_addr - i, BIG_CHAR);
do_test (0, i, len - buf_addr + i, BIG_CHAR);
do_test (0, i, ~len - i, BIG_CHAR);
do_test (0, i, ~len + i, BIG_CHAR);
do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
}
}
}
static void
do_random_tests (void)
{
@ -274,6 +306,7 @@ test_main (void)
do_random_tests ();
do_page_tests ();
do_page_2_tests ();
do_overflow_tests ();
return ret;
}

View File

@ -33,7 +33,9 @@ tests += \
tst-strcpy-rtm \
tst-strlen-rtm \
tst-strncmp-rtm \
tst-strrchr-rtm
tst-strrchr-rtm \
tst-wcsncmp-rtm \
# tests
CFLAGS-tst-memchr-rtm.c += -mrtm
CFLAGS-tst-memcmp-rtm.c += -mrtm
@ -43,8 +45,9 @@ CFLAGS-tst-memset-rtm.c += -mrtm
CFLAGS-tst-strchr-rtm.c += -mrtm
CFLAGS-tst-strcpy-rtm.c += -mrtm
CFLAGS-tst-strlen-rtm.c += -mrtm
CFLAGS-tst-strncmp-rtm.c += -mrtm
CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strrchr-rtm.c += -mrtm
CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
endif
ifneq ($(enable-cet),no)
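
(A plausible reason for the new -Wno-error flags, not stated in the patch: the overflow variants call strncmp/wcsncmp on fixed-size arrays with a SIZE_MAX bound, which modern GCC flags with -Wstringop-overread, so warnings must not be fatal for these two tests.)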

View File

@ -71,7 +71,6 @@ update_usable (struct cpu_features *cpu_features)
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_6);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_7);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_9);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_11);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_12);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_13);
CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_17);
@ -318,6 +317,9 @@ update_usable (struct cpu_features *cpu_features)
/* Determine if PKU is usable. */
if (CPU_FEATURES_CPU_P (cpu_features, OSPKE))
CPU_FEATURE_SET (cpu_features, PKU);
if (CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
CPU_FEATURE_UNSET (cpu_features, RTM);
}
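
RTM_ALWAYS_ABORT is CPUID.(EAX=7,ECX=0):EDX bit 11, the slot the header change below renames from INDEX_7_EDX_11. A standalone probe for the bit might look like this sketch built on GCC's <cpuid.h> (illustrative, not glibc's internal mechanism):

#include <cpuid.h>

/* Return nonzero when CPUID leaf 7, subleaf 0, EDX bit 11
   (RTM_ALWAYS_ABORT) is set, i.e. every RTM transaction on this CPU
   aborts and RTM should be treated as unusable.  */
static int
rtm_always_abort_p (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (edx >> 11) & 1;
}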
static void
@ -516,11 +518,39 @@ init_cpu_features (struct cpu_features *cpu_features)
break;
}
/* Disable TSX on some Haswell processors to avoid TSX on kernels that
weren't updated with the latest microcode package (which disables
broken feature by default). */
/* Disable TSX on some processors to avoid TSX on kernels that
weren't updated with the latest microcode package (which
disables broken feature by default). */
switch (model)
{
case 0x55:
if (stepping <= 5)
goto disable_tsx;
break;
case 0x8e:
/* NB: Although the errata documents that for model == 0x8e,
only 0xb stepping or lower are impacted, the intention of
the errata was to disable TSX on all client processors on
all steppings. Include 0xc stepping which is an Intel
Core i7-8665U, a client mobile processor. */
case 0x9e:
if (stepping > 0xc)
break;
/* Fall through. */
case 0x4e:
case 0x5e:
{
/* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
processors listed in:
https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
*/
disable_tsx:
CPU_FEATURE_UNSET (cpu_features, HLE);
CPU_FEATURE_UNSET (cpu_features, RTM);
CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
}
break;
case 0x3f:
/* Xeon E7 v3 with stepping >= 4 has working TSX. */
if (stepping >= 4)

View File

@ -295,7 +295,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
#define bit_cpu_INDEX_7_EDX_9 (1u << 9)
#define bit_cpu_MD_CLEAR (1u << 10)
#define bit_cpu_INDEX_7_EDX_11 (1u << 11)
#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
#define bit_cpu_INDEX_7_EDX_12 (1u << 12)
#define bit_cpu_INDEX_7_EDX_13 (1u << 13)
#define bit_cpu_SERIALIZE (1u << 14)
@ -508,7 +508,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7
#define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
#define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7
@ -721,7 +721,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define reg_AVX512_VP2INTERSECT edx
#define reg_INDEX_7_EDX_9 edx
#define reg_MD_CLEAR edx
#define reg_INDEX_7_EDX_11 edx
#define reg_RTM_ALWAYS_ABORT edx
#define reg_INDEX_7_EDX_12 edx
#define reg_INDEX_7_EDX_13 edx
#define reg_SERIALIZE edx

View File

@ -183,6 +183,7 @@ do_test (void)
CHECK_CPU_FEATURE (FSRM);
CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
CHECK_CPU_FEATURE (MD_CLEAR);
CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
CHECK_CPU_FEATURE (SERIALIZE);
CHECK_CPU_FEATURE (HYBRID);
CHECK_CPU_FEATURE (TSXLDTRK);
@ -336,6 +337,7 @@ do_test (void)
CHECK_CPU_FEATURE_USABLE (FSRM);
CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
CHECK_CPU_FEATURE_USABLE (SERIALIZE);
CHECK_CPU_FEATURE_USABLE (HYBRID);
CHECK_CPU_FEATURE_USABLE (TSXLDTRK);

View File

@ -16,20 +16,35 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <stdint.h>
#include <tst-string-rtm.h>
#ifdef WIDE
# define CHAR wchar_t
# define MEMSET wmemset
# define STRNCMP wcsncmp
# define TEST_NAME "wcsncmp"
#else /* !WIDE */
# define CHAR char
# define MEMSET memset
# define STRNCMP strncmp
# define TEST_NAME "strncmp"
#endif /* !WIDE */
#define LOOP 3000
#define STRING_SIZE 1024
char string1[STRING_SIZE];
char string2[STRING_SIZE];
CHAR string1[STRING_SIZE];
CHAR string2[STRING_SIZE];
__attribute__ ((noinline, noclone))
static int
prepare (void)
{
memset (string1, 'a', STRING_SIZE - 1);
memset (string2, 'a', STRING_SIZE - 1);
if (strncmp (string1, string2, STRING_SIZE) == 0)
MEMSET (string1, 'a', STRING_SIZE - 1);
MEMSET (string2, 'a', STRING_SIZE - 1);
if (STRNCMP (string1, string2, STRING_SIZE) == 0)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
@ -39,7 +54,17 @@ __attribute__ ((noinline, noclone))
static int
function (void)
{
if (strncmp (string1, string2, STRING_SIZE) == 0)
if (STRNCMP (string1, string2, STRING_SIZE) == 0)
return 0;
else
return 1;
}
__attribute__ ((noinline, noclone))
static int
function_overflow (void)
{
if (STRNCMP (string1, string2, SIZE_MAX) == 0)
return 0;
else
return 1;
@ -48,5 +73,9 @@ function (void)
static int
do_test (void)
{
return do_test_1 ("strncmp", LOOP, prepare, function);
int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
if (status != EXIT_SUCCESS)
return status;
status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
return status;
}
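
do_test_1 comes from <tst-string-rtm.h> and runs the given function inside hardware transactions, failing the test when the string routine itself causes aborts (for example via vzeroupper, BZ #27457). A minimal sketch of the loop's core, assuming only the RTM intrinsics from <immintrin.h> and a CPU with working TSX (the real harness additionally verifies RTM support and aggregates results over LOOP iterations):

#include <immintrin.h>

/* Run FN transactionally; return its result if the transaction
   commits, or -1 if it aborts.  Must be compiled with -mrtm.  */
static int
run_transactionally (int (*fn) (void))
{
  unsigned int status = _xbegin ();
  if (status == _XBEGIN_STARTED)
    {
      int result = fn ();
      _xend ();   /* commit the transaction */
      return result;
    }
  return -1;      /* transaction aborted */
}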

View File

@ -0,0 +1,21 @@
/* Test case for wcsncmp inside a transactionally executing RTM region.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define WIDE 1
#include <wchar.h>
#include "tst-strncmp-rtm.c"

View File

@ -20,6 +20,8 @@ endif
ifeq ($(subdir),string)
sysdep_routines += strcasecmp_l-nonascii strncase_l-nonascii
gen-as-const-headers += locale-defines.sym
tests += \
tst-rsi-strlen
endif
ifeq ($(subdir),elf)
@ -150,6 +152,11 @@ ifeq ($(subdir),csu)
gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
endif
ifeq ($(subdir),wcsmbs)
tests += \
tst-rsi-wcslen
endif
$(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os
$(make-target-directory)
rm -f $@

View File

@ -295,7 +295,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
@ -312,7 +313,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
@ -655,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
IFUNC_IMPL_ADD (array, i, wcsnlen,
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
__wcslen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */

View File

@ -95,7 +95,7 @@ ENTRY (STRCMP)
length to bound a valid memory region. In these cases just use
'wcscmp'. */
shrq $56, %rcx
jnz __wcscmp_avx2
jnz OVERFLOW_STRCMP
# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
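
In C terms, the guard reads roughly as the sketch below (a hypothetical wrapper, assuming 64-bit size_t): for wcsncmp the incoming count n is in characters and is about to be scaled to bytes with shl $2, so any n with one of its top 8 bits set cannot describe a valid object, and the unbounded comparison is both equivalent and safe from the shift overflow.

#include <stddef.h>
#include <wchar.h>

int
wcsncmp_guarded (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  /* shrq $56, %rcx; jnz OVERFLOW_STRCMP  */
  if (n >> 56)
    return wcscmp (s1, s2);
  return wcsncmp (s1, s2, n);
}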

View File

@ -29,11 +29,13 @@
# ifdef USE_AS_WCSLEN
# define VPCMP vpcmpd
# define VPMINU vpminud
# define SHIFT_REG r9d
# define SHIFT_REG ecx
# define CHAR_SIZE 4
# else
# define VPCMP vpcmpb
# define VPMINU vpminub
# define SHIFT_REG ecx
# define SHIFT_REG edx
# define CHAR_SIZE 1
# endif
# define XMMZERO xmm16
@ -46,132 +48,165 @@
# define YMM6 ymm22
# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check for zero length. */
/* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
mov %RSI_LP, %R8_LP
# endif
movl %edi, %ecx
movq %rdi, %rdx
movl %edi, %eax
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
/* Clear high bits from edi. Only keeping bits relevant to page
cross check. */
andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
andl $(2 * VEC_SIZE - 1), %ecx
cmpl $VEC_SIZE, %ecx
ja L(cros_page_boundary)
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
null byte. */
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
# ifdef USE_AS_STRNLEN
jnz L(first_vec_x0_check)
/* Adjust length and check the end of data. */
subq $VEC_SIZE, %rsi
jbe L(max)
# else
jnz L(first_vec_x0)
/* If length < CHAR_PER_VEC handle special. */
cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* Align data for aligned loads in the loop. */
addq $VEC_SIZE, %rdi
andl $(VEC_SIZE - 1), %ecx
andq $-VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
/* Adjust length. */
addq %rcx, %rsi
subq $(VEC_SIZE * 4), %rsi
jbe L(last_4x_vec_or_less)
# endif
jmp L(more_4x_vec)
.p2align 4
L(cros_page_boundary):
andl $(VEC_SIZE - 1), %ecx
andq $-VEC_SIZE, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide shift count by 4 since each bit in K0 represents 4
bytes. */
movl %ecx, %SHIFT_REG
sarl $2, %SHIFT_REG
# endif
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
/* Remove the leading bytes. */
sarxl %SHIFT_REG, %eax, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
ret
# ifdef USE_AS_STRNLEN
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
L(zero):
xorl %eax, %eax
ret
.p2align 4
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
btsq %rsi, %rax
tzcntl %eax, %eax
ret
# endif
addq %rdi, %rax
addq %rcx, %rax
subq %rdx, %rax
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
/* Safe to use 32 bit instructions as these are only called for
size = [1, 159]. */
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
subl %edx, %edi
# ifdef USE_AS_WCSLEN
shrq $2, %rax
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %edi
# endif
leal CHAR_PER_VEC(%rdi, %rax), %eax
# endif
ret
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
/* Safe to use 32 bit instructions as these are only called for
size = [1, 159]. */
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
subl %edx, %edi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %edi
# endif
leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
ret
.p2align 4
L(first_vec_x3):
tzcntl %eax, %eax
/* Safe to use 32 bit instructions as these are only called for
size = [1, 159]. */
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
subl %edx, %edi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %edi
# endif
leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
ret
.p2align 4
L(first_vec_x4):
tzcntl %eax, %eax
/* Safe to use 32 bit instructions as these are only called for
size = [1, 159]. */
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
subl %edx, %edi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %edi
# endif
leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
ret
.p2align 5
L(aligned_more):
# ifdef USE_AS_STRNLEN
/* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
to avoid possible addition overflow. */
negq %rcx
addq $VEC_SIZE, %rcx
/* Check the end of data. */
subq %rcx, %rsi
jbe L(max)
# endif
addq $VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
subq $(VEC_SIZE * 4), %rsi
jbe L(last_4x_vec_or_less)
# endif
L(more_4x_vec):
movq %rdi, %rdx
/* Align data to VEC_SIZE. */
andq $-(VEC_SIZE), %rdi
L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x0)
# ifdef USE_AS_STRNLEN
/* + CHAR_SIZE because it simplifies the logic in
last_4x_vec_or_less. */
leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
subq %rdx, %rcx
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %ecx
# endif
# endif
/* Load first VEC regardless. */
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
/* Adjust length. If near end handle specially. */
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
test %eax, %eax
jnz L(first_vec_x2)
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
@ -179,258 +214,276 @@ L(more_4x_vec):
testl %eax, %eax
jnz L(first_vec_x3)
addq $(VEC_SIZE * 4), %rdi
VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x4)
addq $VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
subq $(VEC_SIZE * 4), %rsi
jbe L(last_4x_vec_or_less)
/* Check if at last VEC_SIZE * 4 length. */
cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
movl %edi, %ecx
andl $(VEC_SIZE * 4 - 1), %ecx
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %ecx
# endif
/* Align data to 4 * VEC_SIZE. */
movq %rdi, %rcx
andl $(4 * VEC_SIZE - 1), %ecx
andq $-(4 * VEC_SIZE), %rdi
# ifdef USE_AS_STRNLEN
/* Adjust length. */
/* Readjust length. */
addq %rcx, %rsi
# endif
/* Align data to VEC_SIZE * 4. */
andq $-(VEC_SIZE * 4), %rdi
/* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
VMOVA (%rdi), %YMM1
VMOVA VEC_SIZE(%rdi), %YMM2
VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
/* Load first VEC regardless. */
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
/* Save some code size by microfusing VPMINU with the load. Since
the matches in ymm2/ymm4 can only be returned if there were no
matches in ymm1/ymm3 respectively there is no issue with overlap.
*/
VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
VPMINU %YMM1, %YMM2, %YMM5
VPMINU %YMM3, %YMM4, %YMM6
VPCMP $0, %YMM2, %YMMZERO, %k0
VPCMP $0, %YMM4, %YMMZERO, %k1
subq $-(VEC_SIZE * 4), %rdi
kortestd %k0, %k1
jz L(loop_4x_vec)
VPMINU %YMM5, %YMM6, %YMM5
VPCMP $0, %YMM5, %YMMZERO, %k0
ktestd %k0, %k0
jnz L(4x_vec_end)
/* Check if end was in first half. */
kmovd %k0, %eax
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
shrq $2, %rdi
# endif
testl %eax, %eax
jz L(second_vec_return)
addq $(VEC_SIZE * 4), %rdi
# ifndef USE_AS_STRNLEN
jmp L(loop_4x_vec)
VPCMP $0, %YMM1, %YMMZERO, %k2
kmovd %k2, %edx
/* Combine VEC1 matches (edx) with VEC2 matches (eax). */
# ifdef USE_AS_WCSLEN
sall $CHAR_PER_VEC, %eax
orl %edx, %eax
tzcntl %eax, %eax
# else
subq $(VEC_SIZE * 4), %rsi
ja L(loop_4x_vec)
salq $CHAR_PER_VEC, %rax
orq %rdx, %rax
tzcntq %rax, %rax
# endif
addq %rdi, %rax
ret
# ifdef USE_AS_STRNLEN
L(last_4x_vec_or_less_load):
/* Depending on entry adjust rdi / prepare first VEC in YMM1. */
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
VPCMP $0, %YMM1, %YMMZERO, %k0
addq $(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
/* Less than 4 * VEC and aligned to VEC_SIZE. */
addl $(VEC_SIZE * 2), %esi
jle L(last_2x_vec)
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x0)
/* If remaining length > VEC_SIZE * 2. This works if esi is off by
VEC_SIZE * 4. */
testl $(CHAR_PER_VEC * 2), %esi
jnz L(last_4x_vec)
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
/* length may have been negative or positive by an offset of
CHAR_PER_VEC * 4 depending on where this was called from. This
fixes that. */
andl $(CHAR_PER_VEC * 4 - 1), %esi
testl %eax, %eax
jnz L(first_vec_x1)
jnz L(last_vec_x1_check)
/* Check the end of data. */
subl $CHAR_PER_VEC, %esi
jb L(max)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
jb L(max)
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
ret
L(max):
movq %r8, %rax
ret
# endif
/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
in the 4x VEC loop can use 2 byte encoding. */
.p2align 4
L(second_vec_return):
VPCMP $0, %YMM3, %YMMZERO, %k0
/* Combine YMM3 matches (k0) with YMM4 matches (k1). */
# ifdef USE_AS_WCSLEN
kunpckbw %k0, %k1, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
# else
kunpckdq %k0, %k1, %k0
kmovq %k0, %rax
tzcntq %rax, %rax
# endif
leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
ret
# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
jb L(max)
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
L(last_4x_vec):
/* Test first 2x VEC normally. */
testl %eax, %eax
jnz L(last_vec_x1)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2_check)
subl $VEC_SIZE, %esi
jle L(max)
jnz L(last_vec_x2)
/* Normalize length. */
andl $(CHAR_PER_VEC * 4 - 1), %esi
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3_check)
movq %r8, %rax
jnz L(last_vec_x3)
/* Check the end of data. */
subl $(CHAR_PER_VEC * 3), %esi
jb L(max)
VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
jb L(max_end)
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
shrq $2, %rax
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
ret
.p2align 4
L(last_2x_vec):
addl $(VEC_SIZE * 2), %esi
L(last_vec_x1):
tzcntl %eax, %eax
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
L(last_vec_x2):
tzcntl %eax, %eax
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
ret
.p2align 4
L(last_vec_x3):
tzcntl %eax, %eax
subl $(CHAR_PER_VEC * 2), %esi
/* Check the end of data. */
cmpl %eax, %esi
jb L(max_end)
subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarq $2, %rdi
# endif
leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
ret
L(max_end):
movq %r8, %rax
ret
# endif
/* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
movq %rdi, %rdx
/* Align data to VEC_SIZE. */
andq $-VEC_SIZE, %rdi
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x0_check)
subl $VEC_SIZE, %esi
jle L(max)
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1_check)
movq %r8, %rax
/* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
shrq $2, %rax
/* NB: Divide shift count by 4 since each bit in K0 represents 4
bytes. */
movl %edx, %ecx
shrl $2, %ecx
andl $(CHAR_PER_VEC - 1), %ecx
# endif
ret
.p2align 4
L(first_vec_x0_check):
/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
sarxl %SHIFT_REG, %eax, %eax
testl %eax, %eax
# ifndef USE_AS_STRNLEN
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
ret
# else
jnz L(cross_page_less_vec)
# ifndef USE_AS_WCSLEN
movl %edx, %ecx
andl $(CHAR_PER_VEC - 1), %ecx
# endif
movl $CHAR_PER_VEC, %eax
subl %ecx, %eax
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ja L(cross_page_continue)
movl %esi, %eax
ret
.p2align 4
L(first_vec_x1_check):
L(cross_page_less_vec):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
/* Check the end of data. */
/* Select min of length and position of first null. */
cmpq %rax, %rsi
jbe L(max)
addq $VEC_SIZE, %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(first_vec_x2_check):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(first_vec_x3_check):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(max):
movq %r8, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(zero):
xorl %eax, %eax
cmovb %esi, %eax
ret
# endif
.p2align 4
L(first_vec_x0):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
addq $VEC_SIZE, %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
.p2align 4
L(4x_vec_end):
VPCMP $0, %YMM1, %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x0)
VPCMP $0, %YMM2, %YMMZERO, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMP $0, %YMM3, %YMMZERO, %k2
kmovd %k2, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMP $0, %YMM4, %YMMZERO, %k3
kmovd %k3, %eax
L(first_vec_x3):
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
# endif
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
ret
END (STRLEN)
#endif
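
A noteworthy idiom in the rewritten prologue sits at L(first_vec_x0): planting a bit at position maxlen in the null-match mask before tzcnt makes one instruction return min (first null, maxlen). A C rendering of the trick (sketch; assumes maxlen < 64, which the cmpq $CHAR_PER_VEC guard above establishes):

#include <stddef.h>

static size_t
first_null_or_max (unsigned long long mask, size_t maxlen)
{
  mask |= 1ULL << maxlen;          /* btsq %rsi, %rax */
  return __builtin_ctzll (mask);   /* tzcntl: min of first null and maxlen */
}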

View File

@ -1,3 +1,4 @@
#define STRCMP __strncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#define OVERFLOW_STRCMP __strcmp_avx2_rtm
#include "strcmp-avx2-rtm.S"

View File

@ -1,3 +1,4 @@
#define STRCMP __strncmp_avx2
#define USE_AS_STRNCMP 1
#define OVERFLOW_STRCMP __strcmp_avx2
#include "strcmp-avx2.S"

View File

@ -1,5 +1,5 @@
#define STRCMP __wcsncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
#define OVERFLOW_STRCMP __wcscmp_avx2_rtm
#include "strcmp-avx2-rtm.S"

View File

@ -1,5 +1,5 @@
#define STRCMP __wcsncmp_avx2
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
#define OVERFLOW_STRCMP __wcscmp_avx2
#include "strcmp-avx2.S"

View File

@ -0,0 +1,81 @@
/* Test strlen with 0 in the RSI register.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifdef WIDE
# define TEST_NAME "wcslen"
#else
# define TEST_NAME "strlen"
#endif /* WIDE */
#define TEST_MAIN
#include <string/test-string.h>
#ifdef WIDE
# include <wchar.h>
# define STRLEN wcslen
# define CHAR wchar_t
#else
# define STRLEN strlen
# define CHAR char
#endif /* WIDE */
IMPL (STRLEN, 1)
typedef size_t (*proto_t) (const CHAR *);
typedef struct
{
void (*fn) (void);
} parameter_t;
size_t
__attribute__ ((weak, noinline, noclone))
do_strlen (parameter_t *a, int zero, const CHAR *str)
{
return CALL (a, str);
}
static int
test_main (void)
{
test_init ();
size_t size = page_size / sizeof (CHAR) - 1;
CHAR *buf = (CHAR *) buf2;
buf[size] = 0;
parameter_t a;
int ret = 0;
FOR_EACH_IMPL (impl, 0)
{
a.fn = impl->fn;
/* NB: Pass 0 in RSI. */
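/* Under the x86-64 SysV ABI the second argument of do_strlen lands
   in RSI, so RSI is 0 at the indirect call; a strlen variant that
   shares code with strnlen and wrongly consults RSI as a length
   bound will return early.  */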
size_t res = do_strlen (&a, 0, buf);
if (res != size)
{
error (0, 0, "Wrong result in function %s: %zu != %zu",
impl->name, res, size);
ret = 1;
}
}
return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}
#include <support/test-driver.c>

View File

@ -0,0 +1,20 @@
/* Test wcslen with 0 in the RSI register.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define WIDE 1
#include "tst-rsi-strlen.c"