From 189ad0f81dc3b029c05a3b7cf19d8ba78c4e0429 Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 22 Jul 2019 20:51:12 -0700
Subject: [PATCH] small optimizations, use bitwise alignment

---
 CMakeLists.txt              |  1 +
 include/mimalloc-internal.h | 26 +++++++++++++++++++++++++-
 include/mimalloc-types.h    | 11 ++++++-----
 include/mimalloc.h          |  6 +++---
 src/alloc.c                 |  6 +++---
 src/init.c                  |  8 +++++---
 src/os.c                    |  7 -------
 src/page.c                  | 14 ++++++++++----
 src/segment.c               | 12 ++++++------
 9 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d86d096..ec0fd99 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,6 +87,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
   if(CMAKE_C_COMPILER_ID MATCHES "GNU")
     list(APPEND mi_cflags -Wno-invalid-memory-model)
     list(APPEND mi_cflags -fvisibility=hidden)
+    list(APPEND mi_cflags -fbranch-target-load-optimize )
   endif()
 endif()

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index cbed590..e261dba 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -39,7 +39,6 @@ bool _mi_preloading();  // true while the C runtime is not ready

 // os.c
 size_t _mi_os_page_size(void);
-uintptr_t _mi_align_up(uintptr_t sz, size_t alignment);
 void  _mi_os_init(void);                                     // called from process init
 void* _mi_os_alloc(size_t size, mi_stats_t* stats);          // to allocate thread local data
 void  _mi_os_free(void* p, size_t size, mi_stats_t* stats);  // to free thread local data
@@ -165,6 +164,20 @@ static inline bool mi_mul_overflow(size_t size, size_t count, size_t* total) {
 #endif
 }

+// Align upwards
+static inline uintptr_t _mi_is_power_of_two(uintptr_t x) {
+  return ((x & (x - 1)) == 0);
+}
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) {  // power of two?
+    return ((sz + mask) & ~mask);
+  }
+  else {
+    return (((sz + mask)/alignment)*alignment);
+  }
+}
+
 // Align a byte size to a size in _machine words_,
 // i.e. byte size == `wsize*sizeof(void*)`.
 static inline size_t _mi_wsize_from_size(size_t size) {
@@ -324,12 +337,23 @@ static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, mi_bl
 }

 static inline mi_block_t* mi_block_next(mi_page_t* page, mi_block_t* block) {
+  #if MI_SECURE
   return mi_block_nextx(page->cookie,block);
+  #else
+  UNUSED(page);
+  return mi_block_nextx(0, block);
+  #endif
 }

 static inline void mi_block_set_next(mi_page_t* page, mi_block_t* block, mi_block_t* next) {
+  #if MI_SECURE
   mi_block_set_nextx(page->cookie,block,next);
+  #else
+  UNUSED(page);
+  mi_block_set_nextx(0, block, next);
+  #endif
 }
+
 // -------------------------------------------------------------------
 // Getting the thread id should be performant
 // as it is called in the fast path of `_mi_free`,
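Note: the new inline `_mi_align_up` takes a single AND when the alignment is a power of two, and falls back to the old divide-and-multiply otherwise; unlike the os.c version it replaces (removed further below), it no longer returns 0 on overflow of `sz + mask`. A minimal standalone sketch of the behavior (hypothetical test harness, not part of this patch):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

// Copy of the patched helper, for illustration only.
static inline uintptr_t align_up(uintptr_t sz, size_t alignment) {
  uintptr_t mask = alignment - 1;
  if ((alignment & mask) == 0) {          // power of two: one add, one AND
    return ((sz + mask) & ~mask);
  }
  else {                                  // general case: divide and multiply
    return (((sz + mask)/alignment)*alignment);
  }
}

int main(void) {
  assert(align_up(0, 8)   == 0);     // already aligned stays put
  assert(align_up(1, 8)   == 8);     // rounds up to the next multiple
  assert(align_up(17, 16) == 32);
  assert(align_up(10, 12) == 12);    // non power-of-two still works via division
  return 0;
}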
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 7221f5b..5c14ffd 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -132,10 +132,9 @@ typedef union mi_page_flags_u {
 } mi_page_flags_t;

 // Thread free list.
-// We use bottom 2 bits of the pointer for mi_delayed_t flags
+// We use the bottom 2 bits of the pointer for mi_delayed_t flags
 typedef uintptr_t mi_thread_free_t;

-
 // A page contains blocks of one specific size (`block_size`).
 // Each page has three list of free blocks:
 // `free` for blocks that can be allocated,
@@ -165,9 +164,11 @@ typedef struct mi_page_s {
   mi_page_flags_t       flags;
   uint16_t              capacity;   // number of blocks committed
   uint16_t              reserved;   // number of blocks reserved in memory
-
+
   mi_block_t*           free;       // list of available free blocks (`malloc` allocates from this list)
+  #if MI_SECURE
   uintptr_t             cookie;     // random cookie to encode the free lists
+  #endif
   size_t                used;       // number of blocks in use (including blocks in `local_free` and `thread_free`)

   mi_block_t*           local_free; // list of deferred free blocks by this thread (migrates to `free`)
@@ -182,9 +183,9 @@ typedef struct mi_page_s {

   // improve page index calculation
 #if MI_INTPTR_SIZE==8
-  //void* padding[1];   // 10 words on 64-bit
+  //void* padding[1];   // 12 words on 64-bit
 #elif MI_INTPTR_SIZE==4
-  void* padding[1];     // 12 words on 32-bit
+  void* padding[1];   // 12 words on 32-bit
 #endif
 } mi_page_t;
diff --git a/include/mimalloc.h b/include/mimalloc.h
index e7e8379..c6b7b5f 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -52,8 +52,8 @@ terms of the MIT license. A copy of the license can be found in the file
   #define mi_attr_alloc_size2(s1,s2)
 #else
   #define mi_attr_alloc_size(s)        __attribute__((alloc_size(s)))
-  #define mi_attr_alloc_size2(s1,s2)   __attribute__((alloc_size(s1,s2)))
-  #define mi_cdecl      // leads to warnings... __attribute__((cdecl))
+  #define mi_attr_alloc_size2(s1,s2)   __attribute__((alloc_size(s1,s2)))
+  #define mi_cdecl      // leads to warnings... __attribute__((cdecl))
 #endif
#else
 #define mi_decl_thread              __thread
@@ -62,7 +62,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_attr_malloc
 #define mi_attr_alloc_size(s)
 #define mi_attr_alloc_size2(s1,s2)
-#define mi_cdecl
+#define mi_cdecl
 #endif

 // ------------------------------------------------------
diff --git a/src/alloc.c b/src/alloc.c
index 649b6e9..6a91c0a 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -237,9 +237,9 @@ void mi_free(void* p) mi_attr_noexcept
 #endif

   // adjust if it might be an un-aligned block
-  if (mi_likely(page->flags.value==0)) {  // note: merging both tests (local | value) does not matter for performance
+  if (mi_likely(page->flags.value==0)) {  // not full or aligned
     mi_block_t* block = (mi_block_t*)p;
-    if (mi_likely(local)) {
+    if (mi_likely(local)) {  // note: merging both tests (local | value) does not matter for performance
       // owning thread can free a block directly
       mi_block_set_next(page, block, page->local_free);  // note: moving this write earlier does not matter for performance
       page->local_free = block;
@@ -248,7 +248,7 @@ void mi_free(void* p) mi_attr_noexcept
     }
     else {
       // use atomic operations for a multi-threaded free
-      _mi_free_block_mt(page, block);
+      _mi_free_block_mt(page, block);
     }
   }
   else {
diff --git a/src/init.c b/src/init.c
index 152e906..44e3c9c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -12,9 +12,11 @@ terms of the MIT license. A copy of the license can be found in the file

 // Empty page used to initialize the small free pages array
 const mi_page_t _mi_page_empty = {
-  0, false, false, false, {0},
-  0, 0,
-  NULL, 0, 0, // free, used, cookie
+  0, false, false, false, {0}, 0, 0,
+  NULL, 0,   // free, used
+  #if MI_SECURE
+  0,
+  #endif
   NULL, 0, 0,
   0, NULL, NULL, NULL
   #if (MI_INTPTR_SIZE==4)
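Note: making the page `cookie` MI_SECURE-only works because `mi_block_nextx(0, block)` is an identity transform. A sketch of the idea, assuming the XOR encoding used by `mi_block_nextx`/`mi_block_set_nextx` (simplified types, not the exact mimalloc definitions):

#include <stdint.h>

typedef struct block_s { uintptr_t next; } block_t;

// XOR-encode the next pointer with a per-page random cookie; with a
// zero cookie (MI_SECURE off) this degenerates to a plain pointer store.
static inline block_t* block_nextx(uintptr_t cookie, const block_t* b) {
  return (block_t*)(b->next ^ cookie);
}
static inline void block_set_nextx(uintptr_t cookie, block_t* b, const block_t* next) {
  b->next = ((uintptr_t)next ^ cookie);
}

With MI_SECURE enabled, `mi_page_init` (further below) seeds the cookie with `_mi_heap_random(heap) | 1`, so it is always odd and never 0, and corrupted free-list pointers decode to garbage; the default build now skips the XOR on every free-list link and drops one word per page.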
diff --git a/src/os.c b/src/os.c
index f7b3625..f635891 100644
--- a/src/os.c
+++ b/src/os.c
@@ -34,13 +34,6 @@ terms of the MIT license. A copy of the license can be found in the file
 ----------------------------------------------------------- */

 bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
-uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
-  uintptr_t x = (sz / alignment) * alignment;
-  if (x < sz) x += alignment;
-  if (x < sz) return 0; // overflow
-  return x;
-}
-
 static void* mi_align_up_ptr(void* p, size_t alignment) {
   return (void*)_mi_align_up((uintptr_t)p, alignment);
 }
diff --git a/src/page.c b/src/page.c
index b0c0b38..d46a5aa 100644
--- a/src/page.c
+++ b/src/page.c
@@ -93,7 +93,9 @@ static bool mi_page_is_valid_init(mi_page_t* page) {

 bool _mi_page_is_valid(mi_page_t* page) {
   mi_assert_internal(mi_page_is_valid_init(page));
+  #if MI_SECURE
   mi_assert_internal(page->cookie != 0);
+  #endif
   if (page->heap!=NULL) {
     mi_segment_t* segment = _mi_page_segment(page);
     mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == page->heap->thread_id);
@@ -119,7 +121,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay ) {
     else if (mi_unlikely(mi_tf_delayed(tfree) == MI_DELAYED_FREEING)) {
       mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
       continue;          // and try again
-    }
+    }
   }
   while((mi_tf_delayed(tfreex) != mi_tf_delayed(tfree)) &&  // avoid atomic operation if already equal
         !mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree));
@@ -258,7 +260,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
     mi_block_t* next = mi_block_nextx(heap->cookie,block);
     // use internal free instead of regular one to keep stats etc correct
     if (!_mi_free_delayed_block(block)) {
-      // we might already start delayed freeing while another thread has not yet
+      // we might already start delayed freeing while another thread has not yet
       // reset the delayed_freeing flag; in that case delay it further by reinserting.
       mi_block_t* dfree;
       do {
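Note: the `mi_tf_delayed(tfree)` tests in `_mi_page_use_delayed_free` above rely on the `mi_thread_free_t` encoding described in mimalloc-types.h: blocks are pointer-aligned, so the bottom 2 bits of a block pointer are free to carry the `mi_delayed_t` state. A rough sketch with simplified stand-in names (the real accessors live in mimalloc-internal.h):

#include <stdint.h>

// Simplified stand-ins for mi_thread_free_t and mi_delayed_t.
typedef uintptr_t thread_free_t;
typedef enum { NO_DELAYED_FREE = 0, USE_DELAYED_FREE = 1, DELAYED_FREEING = 2 } delayed_t;

// The delayed-free state is packed into the low 2 bits of the pointer.
static inline delayed_t     tf_delayed(thread_free_t tf) { return (delayed_t)(tf & 0x03); }
static inline void*         tf_block(thread_free_t tf)   { return (void*)(tf & ~(uintptr_t)0x03); }
static inline thread_free_t tf_make(void* block, delayed_t delay) {
  return ((uintptr_t)block | (uintptr_t)delay);
}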
@@ -498,7 +500,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* st
   if (page->capacity >= page->reserved) return;

   size_t page_size;
-  _mi_page_start(_mi_page_segment(page), page, &page_size);
+  _mi_page_start(_mi_page_segment(page), page, &page_size);
   _mi_stat_increase(&stats->pages_extended, 1);

   // calculate the extend count
@@ -533,7 +535,9 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   page->block_size = block_size;
   mi_assert_internal(page_size / block_size < (1L<<16));
   page->reserved = (uint16_t)(page_size / block_size);
+  #if MI_SECURE
   page->cookie = _mi_heap_random(heap) | 1;
+  #endif

   mi_assert_internal(page->capacity == 0);
   mi_assert_internal(page->free == NULL);
@@ -543,7 +547,9 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_assert_internal(page->next == NULL);
   mi_assert_internal(page->prev == NULL);
   mi_assert_internal(page->flags.has_aligned == false);
+  #if MI_SECURE
   mi_assert_internal(page->cookie != 0);
+  #endif
   mi_assert_expensive(mi_page_is_valid_init(page));

   // initialize an initial free list
@@ -683,7 +689,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
     mi_assert_internal(mi_page_immediate_available(page));
     mi_assert_internal(page->block_size == block_size);
     mi_heap_stat_increase( heap, huge, block_size);
-  }
+  }
   return page;
 }

diff --git a/src/segment.c b/src/segment.c
index 7f7bedd..8f254a2 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -235,8 +235,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se

 // The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use,
-// and no more than 4.
-#define MI_SEGMENT_CACHE_MAX (4)
+// and no more than 2.
+#define MI_SEGMENT_CACHE_MAX (2)
 #define MI_SEGMENT_CACHE_FRACTION (8)

 // note: returned segment may be partially reset
@@ -252,7 +252,7 @@ static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t
 }

 static bool mi_segment_cache_full(mi_segments_tld_t* tld) {
-  if (tld->cache_count < MI_SEGMENT_CACHE_MAX &&
+  if (tld->cache_count < MI_SEGMENT_CACHE_MAX &&
       tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { // always allow 1 element cache
     return false;
   }
@@ -318,7 +318,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
   size_t page_size = (page_kind == MI_PAGE_HUGE ? segment_size : (size_t)1 << page_shift);

   // Try to get it from our thread local cache first
-  bool commit = mi_option_is_enabled(mi_option_eager_commit) || (page_kind > MI_PAGE_MEDIUM);
+  bool commit = mi_option_is_enabled(mi_option_eager_commit) || (page_kind > MI_PAGE_MEDIUM);
   bool protection_still_good = false;
   mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld);
   if (segment != NULL) {
@@ -702,10 +702,10 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld

 mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
   mi_page_t* page;
-  if (block_size <= (MI_SMALL_PAGE_SIZE/16)*3) {
+  if (block_size <= (MI_SMALL_PAGE_SIZE/4)) {
     page = mi_segment_small_page_alloc(tld,os_tld);
   }
-  else if (block_size <= (MI_MEDIUM_PAGE_SIZE/16)*3) {
+  else if (block_size <= (MI_MEDIUM_PAGE_SIZE/4)) {
     page = mi_segment_medium_page_alloc(tld, os_tld);
   }
   else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t))) {
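Note: the last hunk raises the page-kind cutoffs from 3/16 of a page to 1/4, so somewhat larger blocks still land in small and medium pages (any admitted block size now fits at least 4 blocks per page). Quick arithmetic, assuming 64 KiB small and 512 KiB medium pages (illustrative values; the real constants are in mimalloc-types.h):

#include <stdio.h>

// Assumed page sizes, for illustration only.
#define SMALL_PAGE_SIZE   (64*1024)
#define MEDIUM_PAGE_SIZE  (512*1024)

int main(void) {
  // old cutoff: 3/16 of a page; new cutoff: 1/4 of a page
  printf("small : %d -> %d bytes\n", (SMALL_PAGE_SIZE/16)*3,  SMALL_PAGE_SIZE/4);   // 12288 -> 16384
  printf("medium: %d -> %d bytes\n", (MEDIUM_PAGE_SIZE/16)*3, MEDIUM_PAGE_SIZE/4);  // 98304 -> 131072
  return 0;
}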