aarch64: Optimize __libc_mtag_tag_region

This is a target hook for memory tagging; the original was a naive
loop that tagged one 16-byte granule per iteration. The optimized
version relies on "dc gva" to tag 64 bytes at a time for large
allocations and handles small sizes without adding too many branches.
This was not benchmarked on a real CPU, but it is expected to be
faster than the naive implementation.
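
For reference, the dispatch corresponds roughly to the C sketch below
(illustrative only: stg16, st2g32 and dc_gva64 are hypothetical
stand-ins for the STG, ST2G and "dc gva" instructions, and the
DCZID_EL0 ZVA-size check is elided as a comment):

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical stand-ins for the tag-store instructions.  */
  static void stg16 (char *p)    { (void) p; /* STG: tag 16 bytes.  */ }
  static void st2g32 (char *p)   { (void) p; /* ST2G: tag 32 bytes.  */ }
  static void dc_gva64 (char *p) { (void) p; /* DC GVA: tag 64 bytes.  */ }

  /* P is 16-byte aligned and N is a multiple of 16 (interface contract).  */
  void *
  tag_region_sketch (char *p, size_t n)
  {
    char *end = p + n;

    if (n <= 96)
      {
        if (n & 64)
          {
            /* 64..96 bytes: 64 from the start, 32 from the end.  */
            st2g32 (p);
            st2g32 (p + 32);
            st2g32 (end - 32);
            return p;
          }
        if (n == 0)
          return p;
        /* 16, 32 or 48 bytes: three possibly overlapping stores.  */
        stg16 (p);
        stg16 (p + (n >> 5) * 16);
        stg16 (end - 16);
        return p;
      }

    if (n >= 160 /* && the ZVA block size is 64 bytes */)
      {
        /* Tag aligned 64-byte blocks with DC GVA; the ST2G pairs at
           the ends make any overlap with the loop harmless.  */
        st2g32 (p);
        st2g32 (p + 32);
        char *d = (char *) ((uintptr_t) p & ~(uintptr_t) 63);
        ptrdiff_t c = (end - d) - 128;
        do
          {
            d += 64;
            dc_gva64 (d);
          }
        while ((c -= 64) > 0);
      }
    else
      {
        /* No usable ZVA: tag 64 bytes per iteration with ST2G pairs.  */
        char *d = p;
        ptrdiff_t c = n - 64;
        do
          {
            st2g32 (d);
            st2g32 (d + 32);
            d += 64;
          }
        while ((c -= 64) > 0);
      }
    st2g32 (end - 64);
    st2g32 (end - 32);
    return p;
  }
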
Author: Szabolcs Nagy
Date:   2021-02-09 17:56:02 +00:00
parent  383bc24028
commit  23fd760add

@@ -20,32 +20,94 @@
#ifdef USE_MTAG
/* Use the same register names and assignments as memset. */
/* Assumptions:
*
* ARMv8-a, AArch64, MTE, LP64 ABI.
*
* Interface contract:
* Address is 16 byte aligned and size is multiple of 16.
* Returns the passed pointer.
* The memory region may remain untagged if tagging is not enabled.
*/
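/* In C terms the hook is: void *__libc_mtag_tag_region (void *, size_t).  */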
.arch armv8.5-a
.arch_extension memtag
#define dstin x0
#define count x1
#define dst x2
#define dstend x3
#define tmp x4
#define zva_val x4
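/* Note: tmp and zva_val share x4; they are never live at the same time.  */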
ENTRY (__libc_mtag_tag_region)
PTR_ARG (0)
SIZE_ARG (1)
add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
tbnz count, 6, L(set96)
/* Set 0, 16, 32, or 48 bytes. */
lsr tmp, count, 5
add tmp, dstin, tmp, lsl 4
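/* Tmp is dstin for a count of 16 and dstin + 16 for 32 or 48, so the
   three possibly overlapping STGs below cover exactly count bytes
   with no further branches.  */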
cbz count, L(end)
stg dstin, [dstin]
stg dstin, [tmp]
stg dstin, [dstend, -16]
L(end):
ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
L(set96):
st2g dstin, [dstin]
st2g dstin, [dstin, 32]
st2g dstin, [dstend, -32]
ret
.p2align 4
/* Size is > 96 bytes. */
L(set_long):
cmp count, 160
b.lo L(no_zva)
#ifndef SKIP_ZVA_CHECK
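/* DCZID_EL0[3:0] holds log2 of the ZVA block size in 4-byte words,
   so the value 4 means 64 bytes.  The DZP bit (bit 4) is kept by the
   mask, so a prohibited DC ZVA/GVA also fails the compare and takes
   the no_zva path.  */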
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
st2g dstin, [dstin]
st2g dstin, [dstin, 32]
bic dst, dstin, 63
sub count, dstend, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */
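/* The ST2Gs above cover [dstin, dstin + 64) and the trailing pair
   covers [dstend - 64, dstend), so the loop only has to tag the
   aligned 64-byte blocks in between; overlapping them is harmless.  */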
.p2align 4
L(zva_loop):
add dst, dst, 64
dc gva, dst
subs count, count, 64
b.hi L(zva_loop)
st2g dstin, [dstend, -64]
st2g dstin, [dstend, -32]
ret
L(no_zva):
sub dst, dstin, 32 /* Dst is biased by -32. */
sub count, count, 64 /* Adjust count for loop. */
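/* With dst biased by -32, each iteration tags 64 bytes using one
   offset ST2G and one pre-index ST2G with writeback, advancing dst
   a single time per iteration.  */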
L(no_zva_loop):
st2g dstin, [dst, 32]
st2g dstin, [dst, 64]!
subs count, count, 64
b.hi L(no_zva_loop)
st2g dstin, [dstend, -64]
st2g dstin, [dstend, -32]
ret
END (__libc_mtag_tag_region)
#endif /* USE_MTAG */