aarch64: optimize _dl_tlsdesc_dynamic fast path

Remove some load/store instructions from the dynamic tlsdesc resolver
fast path.  This gives around 20% faster tls access in dlopened shared
libraries (assuming glibc ran out of static tls space).

	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_dynamic): Optimize.
This commit is contained in:
Szabolcs Nagy 2017-10-24 17:49:14 +01:00
parent 94d2f0af15
commit 659ca26736
2 changed files with 55 additions and 54 deletions

View File

@@ -1,3 +1,7 @@
+2017-11-03  Szabolcs Nagy  <szabolcs.nagy@arm.com>
+
+	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_dynamic): Optimize.
+
 2017-11-03  Szabolcs Nagy  <szabolcs.nagy@arm.com>

 	* sysdeps/arm/dl-machine.h (elf_machine_runtime_setup): Remove

View File

@@ -142,23 +142,17 @@ _dl_tlsdesc_undefweak:
 	cfi_startproc
 	.align 2
 _dl_tlsdesc_dynamic:
-# define NSAVEXREGPAIRS 2
-	stp	x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]!
-	cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
-	cfi_rel_offset (x29, 0)
-	cfi_rel_offset (x30, 8)
-	mov	x29, sp
 	DELOUSE (0)
 	/* Save just enough registers to support fast path, if we fall
 	   into slow path we will save additional registers.  */
-	stp	x1, x2, [sp, #32+16*0]
-	stp	x3, x4, [sp, #32+16*1]
-	cfi_rel_offset (x1, 32)
-	cfi_rel_offset (x2, 32+8)
-	cfi_rel_offset (x3, 32+16)
-	cfi_rel_offset (x4, 32+24)
+	stp	x1, x2, [sp, #-32]!
+	stp	x3, x4, [sp, #16]
+	cfi_adjust_cfa_offset (32)
+	cfi_rel_offset (x1, 0)
+	cfi_rel_offset (x2, 8)
+	cfi_rel_offset (x3, 16)
+	cfi_rel_offset (x4, 24)

 	mrs	x4, tpidr_el0
 	ldr	PTR_REG (1), [x0,#TLSDESC_ARG]
@@ -167,23 +161,18 @@ _dl_tlsdesc_dynamic:
 	ldr	PTR_REG (2), [x0,#DTV_COUNTER]
 	cmp	PTR_REG (3), PTR_REG (2)
 	b.hi	2f
-	ldr	PTR_REG (2), [x1,#TLSDESC_MODID]
+	/* Load r2 = td->tlsinfo.ti_module and r3 = td->tlsinfo.ti_offset.  */
+	ldp	PTR_REG (2), PTR_REG (3), [x1,#TLSDESC_MODID]
 	add	PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1)
 	ldr	PTR_REG (0), [x0]	/* Load val member of DTV entry.  */
 	cmp	PTR_REG (0), #TLS_DTV_UNALLOCATED
 	b.eq	2f
-	ldr	PTR_REG (1), [x1,#TLSDESC_MODOFF]
-	add	PTR_REG (0), PTR_REG (0), PTR_REG (1)
-	sub	PTR_REG (0), PTR_REG (0), PTR_REG (4)
+	sub	PTR_REG (3), PTR_REG (3), PTR_REG (4)
+	add	PTR_REG (0), PTR_REG (0), PTR_REG (3)
1:
-	ldp	x1, x2, [sp, #32+16*0]
-	ldp	x3, x4, [sp, #32+16*1]
-	ldp	x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
-	cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
-	cfi_restore (x29)
-	cfi_restore (x30)
-# undef NSAVEXREGPAIRS
+	ldp	x3, x4, [sp, #16]
+	ldp	x1, x2, [sp], #32
+	cfi_adjust_cfa_offset (-32)
 	RET
2:
 	/* This is the slow path.  We need to call __tls_get_addr() which
@@ -191,29 +180,33 @@ _dl_tlsdesc_dynamic:
 	   callee will trash.  */

 	/* Save the remaining registers that we must treat as caller save.  */
-# define NSAVEXREGPAIRS 7
-	stp	x5, x6, [sp, #-16*NSAVEXREGPAIRS]!
+# define NSAVEXREGPAIRS 8
+	stp	x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
 	cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
-	stp	x7, x8, [sp, #16*1]
-	stp	x9, x10, [sp, #16*2]
-	stp	x11, x12, [sp, #16*3]
-	stp	x13, x14, [sp, #16*4]
-	stp	x15, x16, [sp, #16*5]
-	stp	x17, x18, [sp, #16*6]
-	cfi_rel_offset (x5, 0)
-	cfi_rel_offset (x6, 8)
-	cfi_rel_offset (x7, 16)
-	cfi_rel_offset (x8, 16+8)
-	cfi_rel_offset (x9, 16*2)
-	cfi_rel_offset (x10, 16*2+8)
-	cfi_rel_offset (x11, 16*3)
-	cfi_rel_offset (x12, 16*3+8)
-	cfi_rel_offset (x13, 16*4)
-	cfi_rel_offset (x14, 16*4+8)
-	cfi_rel_offset (x15, 16*5)
-	cfi_rel_offset (x16, 16*5+8)
-	cfi_rel_offset (x17, 16*6)
-	cfi_rel_offset (x18, 16*6+8)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+	mov	x29, sp
+	stp	x5, x6, [sp, #16*1]
+	stp	x7, x8, [sp, #16*2]
+	stp	x9, x10, [sp, #16*3]
+	stp	x11, x12, [sp, #16*4]
+	stp	x13, x14, [sp, #16*5]
+	stp	x15, x16, [sp, #16*6]
+	stp	x17, x18, [sp, #16*7]
+	cfi_rel_offset (x5, 16*1)
+	cfi_rel_offset (x6, 16*1+8)
+	cfi_rel_offset (x7, 16*2)
+	cfi_rel_offset (x8, 16*2+8)
+	cfi_rel_offset (x9, 16*3)
+	cfi_rel_offset (x10, 16*3+8)
+	cfi_rel_offset (x11, 16*4)
+	cfi_rel_offset (x12, 16*4+8)
+	cfi_rel_offset (x13, 16*5)
+	cfi_rel_offset (x14, 16*5+8)
+	cfi_rel_offset (x15, 16*6)
+	cfi_rel_offset (x16, 16*6+8)
+	cfi_rel_offset (x17, 16*7)
+	cfi_rel_offset (x18, 16*7+8)

 	SAVE_Q_REGISTERS
@@ -225,14 +218,18 @@ _dl_tlsdesc_dynamic:
 	RESTORE_Q_REGISTERS

-	ldp	x7, x8, [sp, #16*1]
-	ldp	x9, x10, [sp, #16*2]
-	ldp	x11, x12, [sp, #16*3]
-	ldp	x13, x14, [sp, #16*4]
-	ldp	x15, x16, [sp, #16*5]
-	ldp	x17, x18, [sp, #16*6]
-	ldp	x5, x6, [sp], #16*NSAVEXREGPAIRS
+	ldp	x5, x6, [sp, #16*1]
+	ldp	x7, x8, [sp, #16*2]
+	ldp	x9, x10, [sp, #16*3]
+	ldp	x11, x12, [sp, #16*4]
+	ldp	x13, x14, [sp, #16*5]
+	ldp	x15, x16, [sp, #16*6]
+	ldp	x17, x18, [sp, #16*7]
+	ldp	x29, x30, [sp], #16*NSAVEXREGPAIRS
 	cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
+	cfi_restore (x29)
+	cfi_restore (x30)
 	b	1b
 	cfi_endproc
 	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic