Small tcache improvements

Change the tcache->counts[] entries to uint16_t - this removes
the limit set by char and allows a larger tcache.  Remove a few
redundant asserts.

bench-malloc-thread with 4 threads is ~15% faster on Cortex-A72.

Reviewed-by: DJ Delorie <dj@redhat.com>

	* malloc/malloc.c (MAX_TCACHE_COUNT): Increase to UINT16_MAX.
	(tcache_put): Remove redundant assert.
	(tcache_get): Remove redundant asserts.
	(__libc_malloc): Check tcache count is not zero.
	* manual/tunables.texi (glibc.malloc.tcache_count): Update maximum.
This commit is contained in:
Wilco Dijkstra 2019-05-17 18:16:20 +01:00
parent fef7c63cd5
commit 1f50f2ad85
3 changed files with 15 additions and 9 deletions

View File

@ -1,3 +1,11 @@
2019-05-17 Wilco Dijkstra <wdijkstr@arm.com>
* malloc/malloc.c (MAX_TCACHE_COUNT): Increase to UINT16_MAX.
(tcache_put): Remove redundant assert.
(tcache_get): Remove redundant asserts.
(__libc_malloc): Check tcache count is not zero.
* manual/tunables.texi (glibc.malloc.tcache_count): Update maximum.
2019-05-17 Florian Weimer <fweimer@redhat.com>
* manual/llio.texi (Open-time Flags): Document O_DIRECTORY.

View File

@ -321,6 +321,10 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
/* This is another arbitrary limit, which tunables can change. Each /* This is another arbitrary limit, which tunables can change. Each
tcache bin will hold at most this number of chunks. */ tcache bin will hold at most this number of chunks. */
# define TCACHE_FILL_COUNT 7 # define TCACHE_FILL_COUNT 7
/* Maximum chunks in tcache bins for tunables. This value must fit the range
of tcache->counts[] entries, else they may overflow. */
# define MAX_TCACHE_COUNT UINT16_MAX
#endif #endif
@ -2901,12 +2905,10 @@ typedef struct tcache_entry
time), this is for performance reasons. */ time), this is for performance reasons. */
typedef struct tcache_perthread_struct typedef struct tcache_perthread_struct
{ {
char counts[TCACHE_MAX_BINS]; uint16_t counts[TCACHE_MAX_BINS];
tcache_entry *entries[TCACHE_MAX_BINS]; tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct; } tcache_perthread_struct;
#define MAX_TCACHE_COUNT 127 /* Maximum value of counts[] entries. */
static __thread bool tcache_shutting_down = false; static __thread bool tcache_shutting_down = false;
static __thread tcache_perthread_struct *tcache = NULL; static __thread tcache_perthread_struct *tcache = NULL;
@ -2916,7 +2918,6 @@ static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx) tcache_put (mchunkptr chunk, size_t tc_idx)
{ {
tcache_entry *e = (tcache_entry *) chunk2mem (chunk); tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
assert (tc_idx < TCACHE_MAX_BINS);
/* Mark this chunk as "in the tcache" so the test in _int_free will /* Mark this chunk as "in the tcache" so the test in _int_free will
detect a double free. */ detect a double free. */
@ -2933,8 +2934,6 @@ static __always_inline void *
tcache_get (size_t tc_idx) tcache_get (size_t tc_idx)
{ {
tcache_entry *e = tcache->entries[tc_idx]; tcache_entry *e = tcache->entries[tc_idx];
assert (tc_idx < TCACHE_MAX_BINS);
assert (tcache->counts[tc_idx] > 0);
tcache->entries[tc_idx] = e->next; tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]); --(tcache->counts[tc_idx]);
e->key = NULL; e->key = NULL;
@ -3046,9 +3045,8 @@ __libc_malloc (size_t bytes)
DIAG_PUSH_NEEDS_COMMENT; DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins if (tc_idx < mp_.tcache_bins
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache && tcache
&& tcache->entries[tc_idx] != NULL) && tcache->counts[tc_idx] > 0)
{ {
return tcache_get (tc_idx); return tcache_get (tc_idx);
} }

View File

@ -189,7 +189,7 @@ per-thread cache. The default (and maximum) value is 1032 bytes on
@deftp Tunable glibc.malloc.tcache_count
The maximum number of chunks of each size to cache. The default is 7.
The upper limit is 127. If set to zero, the per-thread cache is effectively The upper limit is 65535. If set to zero, the per-thread cache is effectively
disabled.
The approximate maximum overhead of the per-thread cache is thus equal