glibc/sysdeps/generic/dl-new-hash.h

110 lines
3.2 KiB
C
Raw Normal View History

elf: Optimize _dl_new_hash in dl-new-hash.h Unroll slightly and enforce good instruction scheduling. This improves performance on out-of-order machines. The unrolling allows for pipelined multiplies. As well, as an optional sysdep, reorder the operations and prevent reassosiation for better scheduling and higher ILP. This commit only adds the barrier for x86, although it should be either no change or a win for any architecture. Unrolling further started to induce slowdowns for sizes [0, 4] but can help the loop so if larger sizes are the target further unrolling can be beneficial. Results for _dl_new_hash Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz Time as Geometric Mean of N=30 runs Geometric of all benchmark New / Old: 0.674 type, length, New Time, Old Time, New Time / Old Time fixed, 0, 2.865, 2.72, 1.053 fixed, 1, 3.567, 2.489, 1.433 fixed, 2, 2.577, 3.649, 0.706 fixed, 3, 3.644, 5.983, 0.609 fixed, 4, 4.211, 6.833, 0.616 fixed, 5, 4.741, 9.372, 0.506 fixed, 6, 5.415, 9.561, 0.566 fixed, 7, 6.649, 10.789, 0.616 fixed, 8, 8.081, 11.808, 0.684 fixed, 9, 8.427, 12.935, 0.651 fixed, 10, 8.673, 14.134, 0.614 fixed, 11, 10.69, 15.408, 0.694 fixed, 12, 10.789, 16.982, 0.635 fixed, 13, 12.169, 18.411, 0.661 fixed, 14, 12.659, 19.914, 0.636 fixed, 15, 13.526, 21.541, 0.628 fixed, 16, 14.211, 23.088, 0.616 fixed, 32, 29.412, 52.722, 0.558 fixed, 64, 65.41, 142.351, 0.459 fixed, 128, 138.505, 295.625, 0.469 fixed, 256, 291.707, 601.983, 0.485 random, 2, 12.698, 12.849, 0.988 random, 4, 16.065, 15.857, 1.013 random, 8, 19.564, 21.105, 0.927 random, 16, 23.919, 26.823, 0.892 random, 32, 31.987, 39.591, 0.808 random, 64, 49.282, 71.487, 0.689 random, 128, 82.23, 145.364, 0.566 random, 256, 152.209, 298.434, 0.51 Co-authored-by: Alexander Monakov <amonakov@ispras.ru> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
2022-05-19 22:18:03 +00:00
/* _dl_new_hash for elf symbol lookup
Copyright (C) 2022-2024 Free Software Foundation, Inc.
elf: Optimize _dl_new_hash in dl-new-hash.h Unroll slightly and enforce good instruction scheduling. This improves performance on out-of-order machines. The unrolling allows for pipelined multiplies. As well, as an optional sysdep, reorder the operations and prevent reassosiation for better scheduling and higher ILP. This commit only adds the barrier for x86, although it should be either no change or a win for any architecture. Unrolling further started to induce slowdowns for sizes [0, 4] but can help the loop so if larger sizes are the target further unrolling can be beneficial. Results for _dl_new_hash Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz Time as Geometric Mean of N=30 runs Geometric of all benchmark New / Old: 0.674 type, length, New Time, Old Time, New Time / Old Time fixed, 0, 2.865, 2.72, 1.053 fixed, 1, 3.567, 2.489, 1.433 fixed, 2, 2.577, 3.649, 0.706 fixed, 3, 3.644, 5.983, 0.609 fixed, 4, 4.211, 6.833, 0.616 fixed, 5, 4.741, 9.372, 0.506 fixed, 6, 5.415, 9.561, 0.566 fixed, 7, 6.649, 10.789, 0.616 fixed, 8, 8.081, 11.808, 0.684 fixed, 9, 8.427, 12.935, 0.651 fixed, 10, 8.673, 14.134, 0.614 fixed, 11, 10.69, 15.408, 0.694 fixed, 12, 10.789, 16.982, 0.635 fixed, 13, 12.169, 18.411, 0.661 fixed, 14, 12.659, 19.914, 0.636 fixed, 15, 13.526, 21.541, 0.628 fixed, 16, 14.211, 23.088, 0.616 fixed, 32, 29.412, 52.722, 0.558 fixed, 64, 65.41, 142.351, 0.459 fixed, 128, 138.505, 295.625, 0.469 fixed, 256, 291.707, 601.983, 0.485 random, 2, 12.698, 12.849, 0.988 random, 4, 16.065, 15.857, 1.013 random, 8, 19.564, 21.105, 0.927 random, 16, 23.919, 26.823, 0.892 random, 32, 31.987, 39.591, 0.808 random, 64, 49.282, 71.487, 0.689 random, 128, 82.23, 145.364, 0.566 random, 256, 152.209, 298.434, 0.51 Co-authored-by: Alexander Monakov <amonakov@ispras.ru> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
2022-05-19 22:18:03 +00:00
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _DL_NEW_HASH_H
#define _DL_NEW_HASH_H 1
#include <stdint.h>
/* For __always_inline and __glibc_unlikely. */
#include <sys/cdefs.h>
/* The simplest implementation of _dl_new_hash is:
_dl_new_hash (const char *s)
{
uint32_t h = 5381;
for (unsigned char c = *s; c != '\0'; c = *++s)
h = h * 33 + c;
return h;
}
We can get better performance by slightly unrolling the loop to
pipeline the multiples, which gcc cannot easily do due to
dependencies across iterations.
As well, as an architecture specific option we add asm statements
to explicitly specify order of operations and prevent reassociation
of instructions that lengthens the loop carried dependency. This
may have no affect as the compiler may have ordered instructions
the same way without it but in testing this has not been the case
for GCC. Improving GCC to reliably schedule instructions ideally
cannot be easily done.
Architecture(s) that use the reassociation barriers are:
x86
Note it is very unlikely the reassociation barriers would
de-optimize performance on any architecture and with an imperfect
compiler it may help performance, especially on out-of-order cpus,
so it is suggested that the respective maintainers add them.
Architecture maintainers are encouraged to benchmark this with
__asm_reassociation_barrier defined to __asm__ like it is in x86.
*/
#ifndef __asm_reassociation_barrier
# define __asm_reassociation_barrier(...)
#endif
static __always_inline uint32_t
__attribute__ ((unused))
_dl_new_hash (const char *str)
{
const unsigned char *s = (const unsigned char *) str;
unsigned int h = 5381;
unsigned int c0, c1;
for (;;)
{
c0 = s[0];
/* Since hashed string is normally not empty, this is unlikely on the
first iteration of the loop. */
if (__glibc_unlikely (c0 == 0))
return h;
c1 = s[1];
if (c1 == 0)
{
/* Ideal computational order is:
c0 += h;
h *= 32;
h += c0; */
c0 += h;
__asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
h = h * 32 + c0;
return h;
}
/* Ideal computational order is:
c1 += c0;
h *= 33 * 33;
c0 *= 32;
c1 += c0;
h += c1; */
c1 += c0;
__asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
h *= 33 * 33;
c1 += c0 * 32;
__asm_reassociation_barrier("" : "+r"(c1));
h += c1;
s += 2;
}
}
#endif /* dl-new-hash.h */